diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,64839 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 9258, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003240440699935191, + "grad_norm": 4.436971187591553, + "learning_rate": 3.597122302158274e-08, + "loss": 0.7408, + "step": 1 + }, + { + "epoch": 0.0006480881399870382, + "grad_norm": 4.44703483581543, + "learning_rate": 7.194244604316547e-08, + "loss": 0.7573, + "step": 2 + }, + { + "epoch": 0.0009721322099805574, + "grad_norm": 4.3990912437438965, + "learning_rate": 1.0791366906474822e-07, + "loss": 0.7465, + "step": 3 + }, + { + "epoch": 0.0012961762799740765, + "grad_norm": 4.6868085861206055, + "learning_rate": 1.4388489208633095e-07, + "loss": 0.7651, + "step": 4 + }, + { + "epoch": 0.0016202203499675956, + "grad_norm": 4.419036865234375, + "learning_rate": 1.7985611510791368e-07, + "loss": 0.74, + "step": 5 + }, + { + "epoch": 0.0019442644199611147, + "grad_norm": 4.579814910888672, + "learning_rate": 2.1582733812949643e-07, + "loss": 0.7446, + "step": 6 + }, + { + "epoch": 0.002268308489954634, + "grad_norm": 4.31318998336792, + "learning_rate": 2.5179856115107916e-07, + "loss": 0.7432, + "step": 7 + }, + { + "epoch": 0.002592352559948153, + "grad_norm": 4.498172760009766, + "learning_rate": 2.877697841726619e-07, + "loss": 0.7532, + "step": 8 + }, + { + "epoch": 0.002916396629941672, + "grad_norm": 4.535513877868652, + "learning_rate": 3.237410071942446e-07, + "loss": 0.7587, + "step": 9 + }, + { + "epoch": 0.0032404406999351912, + "grad_norm": 4.351844787597656, + "learning_rate": 3.5971223021582736e-07, + "loss": 0.7507, + "step": 10 + }, + { + "epoch": 0.0035644847699287103, + "grad_norm": 4.505545616149902, + "learning_rate": 3.956834532374101e-07, + "loss": 0.7368, + "step": 11 + }, + { + "epoch": 0.0038885288399222295, + "grad_norm": 4.367278575897217, + "learning_rate": 4.3165467625899287e-07, + "loss": 0.7298, + "step": 12 + }, + { + "epoch": 0.004212572909915748, + "grad_norm": 4.497063636779785, + "learning_rate": 4.676258992805756e-07, + "loss": 0.7495, + "step": 13 + }, + { + "epoch": 0.004536616979909268, + "grad_norm": 4.694037914276123, + "learning_rate": 5.035971223021583e-07, + "loss": 0.755, + "step": 14 + }, + { + "epoch": 0.004860661049902786, + "grad_norm": 4.476596832275391, + "learning_rate": 5.39568345323741e-07, + "loss": 0.7404, + "step": 15 + }, + { + "epoch": 0.005184705119896306, + "grad_norm": 4.339330673217773, + "learning_rate": 5.755395683453238e-07, + "loss": 0.7447, + "step": 16 + }, + { + "epoch": 0.005508749189889825, + "grad_norm": 4.3112335205078125, + "learning_rate": 6.115107913669066e-07, + "loss": 0.7217, + "step": 17 + }, + { + "epoch": 0.005832793259883344, + "grad_norm": 4.041561603546143, + "learning_rate": 6.474820143884893e-07, + "loss": 0.6987, + "step": 18 + }, + { + "epoch": 0.006156837329876863, + "grad_norm": 4.28841495513916, + "learning_rate": 6.83453237410072e-07, + "loss": 0.7106, + "step": 19 + }, + { + "epoch": 0.0064808813998703824, + "grad_norm": 4.189178943634033, + "learning_rate": 7.194244604316547e-07, + "loss": 0.7135, + "step": 20 + }, + { + "epoch": 0.006804925469863901, + "grad_norm": 4.380631446838379, + "learning_rate": 7.553956834532375e-07, + "loss": 0.7017, + "step": 21 + }, + { + "epoch": 0.007128969539857421, + "grad_norm": 4.28890323638916, + "learning_rate": 7.913669064748202e-07, + "loss": 0.6724, + "step": 22 + }, + { + "epoch": 0.007453013609850939, + "grad_norm": 4.130223274230957, + "learning_rate": 8.27338129496403e-07, + "loss": 0.587, + "step": 23 + }, + { + "epoch": 0.007777057679844459, + "grad_norm": 4.008565902709961, + "learning_rate": 8.633093525179857e-07, + "loss": 0.589, + "step": 24 + }, + { + "epoch": 0.008101101749837978, + "grad_norm": 4.349413871765137, + "learning_rate": 8.992805755395684e-07, + "loss": 0.6014, + "step": 25 + }, + { + "epoch": 0.008425145819831496, + "grad_norm": 3.9135501384735107, + "learning_rate": 9.352517985611512e-07, + "loss": 0.5901, + "step": 26 + }, + { + "epoch": 0.008749189889825017, + "grad_norm": 3.899057149887085, + "learning_rate": 9.71223021582734e-07, + "loss": 0.5715, + "step": 27 + }, + { + "epoch": 0.009073233959818535, + "grad_norm": 3.7998855113983154, + "learning_rate": 1.0071942446043167e-06, + "loss": 0.5493, + "step": 28 + }, + { + "epoch": 0.009397278029812054, + "grad_norm": 3.450526475906372, + "learning_rate": 1.0431654676258993e-06, + "loss": 0.5471, + "step": 29 + }, + { + "epoch": 0.009721322099805573, + "grad_norm": 3.6078693866729736, + "learning_rate": 1.079136690647482e-06, + "loss": 0.528, + "step": 30 + }, + { + "epoch": 0.010045366169799093, + "grad_norm": 2.785630941390991, + "learning_rate": 1.115107913669065e-06, + "loss": 0.4635, + "step": 31 + }, + { + "epoch": 0.010369410239792612, + "grad_norm": 2.324575424194336, + "learning_rate": 1.1510791366906476e-06, + "loss": 0.4333, + "step": 32 + }, + { + "epoch": 0.01069345430978613, + "grad_norm": 2.026649236679077, + "learning_rate": 1.1870503597122303e-06, + "loss": 0.4412, + "step": 33 + }, + { + "epoch": 0.01101749837977965, + "grad_norm": 1.9035625457763672, + "learning_rate": 1.2230215827338131e-06, + "loss": 0.4399, + "step": 34 + }, + { + "epoch": 0.01134154244977317, + "grad_norm": 1.7783927917480469, + "learning_rate": 1.2589928057553958e-06, + "loss": 0.443, + "step": 35 + }, + { + "epoch": 0.011665586519766688, + "grad_norm": 1.6150323152542114, + "learning_rate": 1.2949640287769785e-06, + "loss": 0.4127, + "step": 36 + }, + { + "epoch": 0.011989630589760207, + "grad_norm": 1.469312310218811, + "learning_rate": 1.3309352517985614e-06, + "loss": 0.3816, + "step": 37 + }, + { + "epoch": 0.012313674659753726, + "grad_norm": 1.3809667825698853, + "learning_rate": 1.366906474820144e-06, + "loss": 0.3966, + "step": 38 + }, + { + "epoch": 0.012637718729747246, + "grad_norm": 1.4537124633789062, + "learning_rate": 1.4028776978417265e-06, + "loss": 0.4121, + "step": 39 + }, + { + "epoch": 0.012961762799740765, + "grad_norm": 1.2365580797195435, + "learning_rate": 1.4388489208633094e-06, + "loss": 0.3753, + "step": 40 + }, + { + "epoch": 0.013285806869734284, + "grad_norm": 1.2684414386749268, + "learning_rate": 1.474820143884892e-06, + "loss": 0.3963, + "step": 41 + }, + { + "epoch": 0.013609850939727802, + "grad_norm": 1.1836400032043457, + "learning_rate": 1.510791366906475e-06, + "loss": 0.3849, + "step": 42 + }, + { + "epoch": 0.013933895009721323, + "grad_norm": 1.1855645179748535, + "learning_rate": 1.5467625899280579e-06, + "loss": 0.3775, + "step": 43 + }, + { + "epoch": 0.014257939079714841, + "grad_norm": 1.2605303525924683, + "learning_rate": 1.5827338129496403e-06, + "loss": 0.3701, + "step": 44 + }, + { + "epoch": 0.01458198314970836, + "grad_norm": 1.2494381666183472, + "learning_rate": 1.618705035971223e-06, + "loss": 0.3669, + "step": 45 + }, + { + "epoch": 0.014906027219701879, + "grad_norm": 1.0984803438186646, + "learning_rate": 1.654676258992806e-06, + "loss": 0.3553, + "step": 46 + }, + { + "epoch": 0.0152300712896954, + "grad_norm": 1.0629351139068604, + "learning_rate": 1.6906474820143886e-06, + "loss": 0.3355, + "step": 47 + }, + { + "epoch": 0.015554115359688918, + "grad_norm": 1.001863718032837, + "learning_rate": 1.7266187050359715e-06, + "loss": 0.342, + "step": 48 + }, + { + "epoch": 0.015878159429682438, + "grad_norm": 1.0362595319747925, + "learning_rate": 1.762589928057554e-06, + "loss": 0.3459, + "step": 49 + }, + { + "epoch": 0.016202203499675955, + "grad_norm": 1.2314352989196777, + "learning_rate": 1.7985611510791368e-06, + "loss": 0.3801, + "step": 50 + }, + { + "epoch": 0.016526247569669476, + "grad_norm": 1.0542044639587402, + "learning_rate": 1.8345323741007195e-06, + "loss": 0.3256, + "step": 51 + }, + { + "epoch": 0.016850291639662993, + "grad_norm": 1.041985034942627, + "learning_rate": 1.8705035971223024e-06, + "loss": 0.3332, + "step": 52 + }, + { + "epoch": 0.017174335709656513, + "grad_norm": 1.0842797756195068, + "learning_rate": 1.906474820143885e-06, + "loss": 0.3541, + "step": 53 + }, + { + "epoch": 0.017498379779650033, + "grad_norm": 1.0450321435928345, + "learning_rate": 1.942446043165468e-06, + "loss": 0.3626, + "step": 54 + }, + { + "epoch": 0.01782242384964355, + "grad_norm": 0.9607313871383667, + "learning_rate": 1.9784172661870504e-06, + "loss": 0.3229, + "step": 55 + }, + { + "epoch": 0.01814646791963707, + "grad_norm": 1.0011342763900757, + "learning_rate": 2.0143884892086333e-06, + "loss": 0.335, + "step": 56 + }, + { + "epoch": 0.01847051198963059, + "grad_norm": 1.0107240676879883, + "learning_rate": 2.050359712230216e-06, + "loss": 0.3531, + "step": 57 + }, + { + "epoch": 0.018794556059624108, + "grad_norm": 0.9726276397705078, + "learning_rate": 2.0863309352517987e-06, + "loss": 0.326, + "step": 58 + }, + { + "epoch": 0.01911860012961763, + "grad_norm": 0.9930094480514526, + "learning_rate": 2.1223021582733816e-06, + "loss": 0.3047, + "step": 59 + }, + { + "epoch": 0.019442644199611146, + "grad_norm": 1.0234602689743042, + "learning_rate": 2.158273381294964e-06, + "loss": 0.3608, + "step": 60 + }, + { + "epoch": 0.019766688269604666, + "grad_norm": 0.9958673715591431, + "learning_rate": 2.194244604316547e-06, + "loss": 0.3374, + "step": 61 + }, + { + "epoch": 0.020090732339598186, + "grad_norm": 0.8935235738754272, + "learning_rate": 2.23021582733813e-06, + "loss": 0.3058, + "step": 62 + }, + { + "epoch": 0.020414776409591703, + "grad_norm": 0.9323809146881104, + "learning_rate": 2.2661870503597123e-06, + "loss": 0.3105, + "step": 63 + }, + { + "epoch": 0.020738820479585224, + "grad_norm": 0.9058836102485657, + "learning_rate": 2.302158273381295e-06, + "loss": 0.2958, + "step": 64 + }, + { + "epoch": 0.021062864549578744, + "grad_norm": 0.9206125736236572, + "learning_rate": 2.3381294964028776e-06, + "loss": 0.3013, + "step": 65 + }, + { + "epoch": 0.02138690861957226, + "grad_norm": 0.9556636214256287, + "learning_rate": 2.3741007194244605e-06, + "loss": 0.3155, + "step": 66 + }, + { + "epoch": 0.02171095268956578, + "grad_norm": 0.9843814373016357, + "learning_rate": 2.4100719424460434e-06, + "loss": 0.3254, + "step": 67 + }, + { + "epoch": 0.0220349967595593, + "grad_norm": 0.9352341890335083, + "learning_rate": 2.4460431654676263e-06, + "loss": 0.281, + "step": 68 + }, + { + "epoch": 0.02235904082955282, + "grad_norm": 0.905490517616272, + "learning_rate": 2.4820143884892088e-06, + "loss": 0.2842, + "step": 69 + }, + { + "epoch": 0.02268308489954634, + "grad_norm": 0.9274157285690308, + "learning_rate": 2.5179856115107916e-06, + "loss": 0.3019, + "step": 70 + }, + { + "epoch": 0.023007128969539856, + "grad_norm": 0.9611228704452515, + "learning_rate": 2.5539568345323745e-06, + "loss": 0.2737, + "step": 71 + }, + { + "epoch": 0.023331173039533377, + "grad_norm": 0.9289448857307434, + "learning_rate": 2.589928057553957e-06, + "loss": 0.2978, + "step": 72 + }, + { + "epoch": 0.023655217109526897, + "grad_norm": 0.9755799174308777, + "learning_rate": 2.6258992805755395e-06, + "loss": 0.3216, + "step": 73 + }, + { + "epoch": 0.023979261179520414, + "grad_norm": 0.8912398219108582, + "learning_rate": 2.6618705035971228e-06, + "loss": 0.3141, + "step": 74 + }, + { + "epoch": 0.024303305249513935, + "grad_norm": 0.9067403674125671, + "learning_rate": 2.6978417266187052e-06, + "loss": 0.3239, + "step": 75 + }, + { + "epoch": 0.02462734931950745, + "grad_norm": 1.0051820278167725, + "learning_rate": 2.733812949640288e-06, + "loss": 0.3297, + "step": 76 + }, + { + "epoch": 0.024951393389500972, + "grad_norm": 0.9007921814918518, + "learning_rate": 2.7697841726618706e-06, + "loss": 0.2907, + "step": 77 + }, + { + "epoch": 0.025275437459494492, + "grad_norm": 0.9421604871749878, + "learning_rate": 2.805755395683453e-06, + "loss": 0.3225, + "step": 78 + }, + { + "epoch": 0.02559948152948801, + "grad_norm": 0.9415566921234131, + "learning_rate": 2.8417266187050364e-06, + "loss": 0.3117, + "step": 79 + }, + { + "epoch": 0.02592352559948153, + "grad_norm": 0.8771030902862549, + "learning_rate": 2.877697841726619e-06, + "loss": 0.2962, + "step": 80 + }, + { + "epoch": 0.02624756966947505, + "grad_norm": 0.9552968740463257, + "learning_rate": 2.9136690647482017e-06, + "loss": 0.3261, + "step": 81 + }, + { + "epoch": 0.026571613739468567, + "grad_norm": 0.875261127948761, + "learning_rate": 2.949640287769784e-06, + "loss": 0.2574, + "step": 82 + }, + { + "epoch": 0.026895657809462088, + "grad_norm": 0.9335787892341614, + "learning_rate": 2.985611510791367e-06, + "loss": 0.3017, + "step": 83 + }, + { + "epoch": 0.027219701879455604, + "grad_norm": 0.9566580653190613, + "learning_rate": 3.02158273381295e-06, + "loss": 0.2518, + "step": 84 + }, + { + "epoch": 0.027543745949449125, + "grad_norm": 1.029236078262329, + "learning_rate": 3.0575539568345324e-06, + "loss": 0.3018, + "step": 85 + }, + { + "epoch": 0.027867790019442645, + "grad_norm": 0.9539006352424622, + "learning_rate": 3.0935251798561158e-06, + "loss": 0.2737, + "step": 86 + }, + { + "epoch": 0.028191834089436162, + "grad_norm": 0.8749252557754517, + "learning_rate": 3.1294964028776982e-06, + "loss": 0.2658, + "step": 87 + }, + { + "epoch": 0.028515878159429683, + "grad_norm": 0.9875462055206299, + "learning_rate": 3.1654676258992807e-06, + "loss": 0.2728, + "step": 88 + }, + { + "epoch": 0.028839922229423203, + "grad_norm": 1.0440022945404053, + "learning_rate": 3.2014388489208636e-06, + "loss": 0.2765, + "step": 89 + }, + { + "epoch": 0.02916396629941672, + "grad_norm": 0.9891248345375061, + "learning_rate": 3.237410071942446e-06, + "loss": 0.2938, + "step": 90 + }, + { + "epoch": 0.02948801036941024, + "grad_norm": 0.9206273555755615, + "learning_rate": 3.2733812949640294e-06, + "loss": 0.292, + "step": 91 + }, + { + "epoch": 0.029812054439403757, + "grad_norm": 0.8831924200057983, + "learning_rate": 3.309352517985612e-06, + "loss": 0.2889, + "step": 92 + }, + { + "epoch": 0.030136098509397278, + "grad_norm": 0.934688150882721, + "learning_rate": 3.3453237410071943e-06, + "loss": 0.2814, + "step": 93 + }, + { + "epoch": 0.0304601425793908, + "grad_norm": 0.9249264001846313, + "learning_rate": 3.381294964028777e-06, + "loss": 0.2681, + "step": 94 + }, + { + "epoch": 0.030784186649384315, + "grad_norm": 0.9179894328117371, + "learning_rate": 3.4172661870503596e-06, + "loss": 0.2711, + "step": 95 + }, + { + "epoch": 0.031108230719377836, + "grad_norm": 0.8751144409179688, + "learning_rate": 3.453237410071943e-06, + "loss": 0.2876, + "step": 96 + }, + { + "epoch": 0.031432274789371356, + "grad_norm": 0.8831710815429688, + "learning_rate": 3.4892086330935254e-06, + "loss": 0.2742, + "step": 97 + }, + { + "epoch": 0.031756318859364877, + "grad_norm": 0.876036524772644, + "learning_rate": 3.525179856115108e-06, + "loss": 0.2656, + "step": 98 + }, + { + "epoch": 0.03208036292935839, + "grad_norm": 0.906802773475647, + "learning_rate": 3.561151079136691e-06, + "loss": 0.2778, + "step": 99 + }, + { + "epoch": 0.03240440699935191, + "grad_norm": 0.9074854254722595, + "learning_rate": 3.5971223021582737e-06, + "loss": 0.2745, + "step": 100 + }, + { + "epoch": 0.03272845106934543, + "grad_norm": 0.8964940309524536, + "learning_rate": 3.6330935251798566e-06, + "loss": 0.2771, + "step": 101 + }, + { + "epoch": 0.03305249513933895, + "grad_norm": 0.9870682954788208, + "learning_rate": 3.669064748201439e-06, + "loss": 0.2511, + "step": 102 + }, + { + "epoch": 0.03337653920933247, + "grad_norm": 0.8685485124588013, + "learning_rate": 3.7050359712230215e-06, + "loss": 0.2774, + "step": 103 + }, + { + "epoch": 0.033700583279325985, + "grad_norm": 0.8977862000465393, + "learning_rate": 3.741007194244605e-06, + "loss": 0.2451, + "step": 104 + }, + { + "epoch": 0.034024627349319506, + "grad_norm": 0.9542827606201172, + "learning_rate": 3.7769784172661873e-06, + "loss": 0.2577, + "step": 105 + }, + { + "epoch": 0.034348671419313026, + "grad_norm": 0.9061799049377441, + "learning_rate": 3.81294964028777e-06, + "loss": 0.2587, + "step": 106 + }, + { + "epoch": 0.034672715489306546, + "grad_norm": 0.9147353172302246, + "learning_rate": 3.848920863309353e-06, + "loss": 0.2791, + "step": 107 + }, + { + "epoch": 0.03499675955930007, + "grad_norm": 0.9400584101676941, + "learning_rate": 3.884892086330936e-06, + "loss": 0.2735, + "step": 108 + }, + { + "epoch": 0.03532080362929359, + "grad_norm": 0.9575251936912537, + "learning_rate": 3.920863309352518e-06, + "loss": 0.2476, + "step": 109 + }, + { + "epoch": 0.0356448476992871, + "grad_norm": 0.9304046034812927, + "learning_rate": 3.956834532374101e-06, + "loss": 0.2667, + "step": 110 + }, + { + "epoch": 0.03596889176928062, + "grad_norm": 0.9660559296607971, + "learning_rate": 3.992805755395684e-06, + "loss": 0.2579, + "step": 111 + }, + { + "epoch": 0.03629293583927414, + "grad_norm": 0.8823477029800415, + "learning_rate": 4.028776978417267e-06, + "loss": 0.2525, + "step": 112 + }, + { + "epoch": 0.03661697990926766, + "grad_norm": 0.9394170641899109, + "learning_rate": 4.0647482014388495e-06, + "loss": 0.2739, + "step": 113 + }, + { + "epoch": 0.03694102397926118, + "grad_norm": 0.8887931704521179, + "learning_rate": 4.100719424460432e-06, + "loss": 0.2813, + "step": 114 + }, + { + "epoch": 0.037265068049254696, + "grad_norm": 0.9069398045539856, + "learning_rate": 4.1366906474820145e-06, + "loss": 0.2868, + "step": 115 + }, + { + "epoch": 0.037589112119248216, + "grad_norm": 0.9058663249015808, + "learning_rate": 4.172661870503597e-06, + "loss": 0.2669, + "step": 116 + }, + { + "epoch": 0.03791315618924174, + "grad_norm": 0.8527618646621704, + "learning_rate": 4.20863309352518e-06, + "loss": 0.2499, + "step": 117 + }, + { + "epoch": 0.03823720025923526, + "grad_norm": 0.8536428809165955, + "learning_rate": 4.244604316546763e-06, + "loss": 0.2398, + "step": 118 + }, + { + "epoch": 0.03856124432922878, + "grad_norm": 0.8949153423309326, + "learning_rate": 4.280575539568346e-06, + "loss": 0.28, + "step": 119 + }, + { + "epoch": 0.03888528839922229, + "grad_norm": 0.9180740714073181, + "learning_rate": 4.316546762589928e-06, + "loss": 0.2479, + "step": 120 + }, + { + "epoch": 0.03920933246921581, + "grad_norm": 0.9510776996612549, + "learning_rate": 4.352517985611511e-06, + "loss": 0.2592, + "step": 121 + }, + { + "epoch": 0.03953337653920933, + "grad_norm": 0.8663166165351868, + "learning_rate": 4.388489208633094e-06, + "loss": 0.2733, + "step": 122 + }, + { + "epoch": 0.03985742060920285, + "grad_norm": 0.8462815284729004, + "learning_rate": 4.424460431654677e-06, + "loss": 0.2434, + "step": 123 + }, + { + "epoch": 0.04018146467919637, + "grad_norm": 0.8777222037315369, + "learning_rate": 4.46043165467626e-06, + "loss": 0.2746, + "step": 124 + }, + { + "epoch": 0.04050550874918989, + "grad_norm": 0.8853914737701416, + "learning_rate": 4.496402877697842e-06, + "loss": 0.2502, + "step": 125 + }, + { + "epoch": 0.04082955281918341, + "grad_norm": 0.8713170886039734, + "learning_rate": 4.5323741007194245e-06, + "loss": 0.2437, + "step": 126 + }, + { + "epoch": 0.04115359688917693, + "grad_norm": 0.9855924248695374, + "learning_rate": 4.5683453237410074e-06, + "loss": 0.2915, + "step": 127 + }, + { + "epoch": 0.04147764095917045, + "grad_norm": 0.8738729953765869, + "learning_rate": 4.60431654676259e-06, + "loss": 0.2375, + "step": 128 + }, + { + "epoch": 0.04180168502916397, + "grad_norm": 1.1465152502059937, + "learning_rate": 4.640287769784173e-06, + "loss": 0.2789, + "step": 129 + }, + { + "epoch": 0.04212572909915749, + "grad_norm": 0.9107539057731628, + "learning_rate": 4.676258992805755e-06, + "loss": 0.2642, + "step": 130 + }, + { + "epoch": 0.042449773169151, + "grad_norm": 0.8694542050361633, + "learning_rate": 4.712230215827339e-06, + "loss": 0.2586, + "step": 131 + }, + { + "epoch": 0.04277381723914452, + "grad_norm": 0.86564040184021, + "learning_rate": 4.748201438848921e-06, + "loss": 0.2501, + "step": 132 + }, + { + "epoch": 0.04309786130913804, + "grad_norm": 0.8690810799598694, + "learning_rate": 4.784172661870504e-06, + "loss": 0.2424, + "step": 133 + }, + { + "epoch": 0.04342190537913156, + "grad_norm": 0.8725466728210449, + "learning_rate": 4.820143884892087e-06, + "loss": 0.2496, + "step": 134 + }, + { + "epoch": 0.043745949449125084, + "grad_norm": 0.9374155402183533, + "learning_rate": 4.856115107913669e-06, + "loss": 0.2509, + "step": 135 + }, + { + "epoch": 0.0440699935191186, + "grad_norm": 0.9191246628761292, + "learning_rate": 4.892086330935253e-06, + "loss": 0.2552, + "step": 136 + }, + { + "epoch": 0.04439403758911212, + "grad_norm": 0.9410930871963501, + "learning_rate": 4.928057553956835e-06, + "loss": 0.2721, + "step": 137 + }, + { + "epoch": 0.04471808165910564, + "grad_norm": 0.8686294555664062, + "learning_rate": 4.9640287769784175e-06, + "loss": 0.2457, + "step": 138 + }, + { + "epoch": 0.04504212572909916, + "grad_norm": 0.8420104384422302, + "learning_rate": 5e-06, + "loss": 0.2469, + "step": 139 + }, + { + "epoch": 0.04536616979909268, + "grad_norm": 0.8580688834190369, + "learning_rate": 5.035971223021583e-06, + "loss": 0.2439, + "step": 140 + }, + { + "epoch": 0.04569021386908619, + "grad_norm": 0.9308591485023499, + "learning_rate": 5.071942446043165e-06, + "loss": 0.276, + "step": 141 + }, + { + "epoch": 0.04601425793907971, + "grad_norm": 0.9194615483283997, + "learning_rate": 5.107913669064749e-06, + "loss": 0.2731, + "step": 142 + }, + { + "epoch": 0.04633830200907323, + "grad_norm": 0.9039499759674072, + "learning_rate": 5.143884892086332e-06, + "loss": 0.2535, + "step": 143 + }, + { + "epoch": 0.046662346079066754, + "grad_norm": 0.8582255840301514, + "learning_rate": 5.179856115107914e-06, + "loss": 0.2396, + "step": 144 + }, + { + "epoch": 0.046986390149060274, + "grad_norm": 0.8596925735473633, + "learning_rate": 5.215827338129497e-06, + "loss": 0.253, + "step": 145 + }, + { + "epoch": 0.047310434219053794, + "grad_norm": 0.895989716053009, + "learning_rate": 5.251798561151079e-06, + "loss": 0.235, + "step": 146 + }, + { + "epoch": 0.04763447828904731, + "grad_norm": 0.8556272983551025, + "learning_rate": 5.287769784172663e-06, + "loss": 0.2552, + "step": 147 + }, + { + "epoch": 0.04795852235904083, + "grad_norm": 0.9312075972557068, + "learning_rate": 5.3237410071942456e-06, + "loss": 0.2618, + "step": 148 + }, + { + "epoch": 0.04828256642903435, + "grad_norm": 0.791953444480896, + "learning_rate": 5.359712230215828e-06, + "loss": 0.2401, + "step": 149 + }, + { + "epoch": 0.04860661049902787, + "grad_norm": 0.8349822163581848, + "learning_rate": 5.3956834532374105e-06, + "loss": 0.2266, + "step": 150 + }, + { + "epoch": 0.04893065456902139, + "grad_norm": 0.8655434250831604, + "learning_rate": 5.4316546762589925e-06, + "loss": 0.2502, + "step": 151 + }, + { + "epoch": 0.0492546986390149, + "grad_norm": 0.9085103869438171, + "learning_rate": 5.467625899280576e-06, + "loss": 0.2561, + "step": 152 + }, + { + "epoch": 0.04957874270900842, + "grad_norm": 0.9157651662826538, + "learning_rate": 5.503597122302159e-06, + "loss": 0.248, + "step": 153 + }, + { + "epoch": 0.049902786779001944, + "grad_norm": 0.895823061466217, + "learning_rate": 5.539568345323741e-06, + "loss": 0.2582, + "step": 154 + }, + { + "epoch": 0.050226830848995464, + "grad_norm": 0.8751013278961182, + "learning_rate": 5.575539568345324e-06, + "loss": 0.2537, + "step": 155 + }, + { + "epoch": 0.050550874918988985, + "grad_norm": 0.9522944092750549, + "learning_rate": 5.611510791366906e-06, + "loss": 0.2764, + "step": 156 + }, + { + "epoch": 0.0508749189889825, + "grad_norm": 0.9214081168174744, + "learning_rate": 5.64748201438849e-06, + "loss": 0.2592, + "step": 157 + }, + { + "epoch": 0.05119896305897602, + "grad_norm": 0.8131060004234314, + "learning_rate": 5.683453237410073e-06, + "loss": 0.2302, + "step": 158 + }, + { + "epoch": 0.05152300712896954, + "grad_norm": 1.0334426164627075, + "learning_rate": 5.719424460431655e-06, + "loss": 0.253, + "step": 159 + }, + { + "epoch": 0.05184705119896306, + "grad_norm": 0.876676082611084, + "learning_rate": 5.755395683453238e-06, + "loss": 0.2428, + "step": 160 + }, + { + "epoch": 0.05217109526895658, + "grad_norm": 0.892617404460907, + "learning_rate": 5.79136690647482e-06, + "loss": 0.2324, + "step": 161 + }, + { + "epoch": 0.0524951393389501, + "grad_norm": 0.8631848692893982, + "learning_rate": 5.8273381294964035e-06, + "loss": 0.2433, + "step": 162 + }, + { + "epoch": 0.052819183408943614, + "grad_norm": 0.9174634218215942, + "learning_rate": 5.863309352517986e-06, + "loss": 0.2717, + "step": 163 + }, + { + "epoch": 0.053143227478937134, + "grad_norm": 0.890937864780426, + "learning_rate": 5.899280575539568e-06, + "loss": 0.2683, + "step": 164 + }, + { + "epoch": 0.053467271548930655, + "grad_norm": 0.8314776420593262, + "learning_rate": 5.935251798561151e-06, + "loss": 0.2417, + "step": 165 + }, + { + "epoch": 0.053791315618924175, + "grad_norm": 0.8143745064735413, + "learning_rate": 5.971223021582734e-06, + "loss": 0.2451, + "step": 166 + }, + { + "epoch": 0.054115359688917695, + "grad_norm": 0.8573601245880127, + "learning_rate": 6.007194244604317e-06, + "loss": 0.2611, + "step": 167 + }, + { + "epoch": 0.05443940375891121, + "grad_norm": 0.8676037788391113, + "learning_rate": 6.0431654676259e-06, + "loss": 0.249, + "step": 168 + }, + { + "epoch": 0.05476344782890473, + "grad_norm": 0.9017927646636963, + "learning_rate": 6.079136690647483e-06, + "loss": 0.2486, + "step": 169 + }, + { + "epoch": 0.05508749189889825, + "grad_norm": 0.8411036729812622, + "learning_rate": 6.115107913669065e-06, + "loss": 0.2378, + "step": 170 + }, + { + "epoch": 0.05541153596889177, + "grad_norm": 0.8910001516342163, + "learning_rate": 6.151079136690648e-06, + "loss": 0.2714, + "step": 171 + }, + { + "epoch": 0.05573558003888529, + "grad_norm": 0.8422628045082092, + "learning_rate": 6.1870503597122315e-06, + "loss": 0.2484, + "step": 172 + }, + { + "epoch": 0.056059624108878804, + "grad_norm": 0.8160513639450073, + "learning_rate": 6.2230215827338136e-06, + "loss": 0.2442, + "step": 173 + }, + { + "epoch": 0.056383668178872325, + "grad_norm": 0.9058359265327454, + "learning_rate": 6.2589928057553964e-06, + "loss": 0.272, + "step": 174 + }, + { + "epoch": 0.056707712248865845, + "grad_norm": 0.8583076000213623, + "learning_rate": 6.2949640287769785e-06, + "loss": 0.2545, + "step": 175 + }, + { + "epoch": 0.057031756318859365, + "grad_norm": 0.8205615282058716, + "learning_rate": 6.330935251798561e-06, + "loss": 0.2359, + "step": 176 + }, + { + "epoch": 0.057355800388852886, + "grad_norm": 0.8605383038520813, + "learning_rate": 6.366906474820145e-06, + "loss": 0.2571, + "step": 177 + }, + { + "epoch": 0.057679844458846406, + "grad_norm": 0.9030138254165649, + "learning_rate": 6.402877697841727e-06, + "loss": 0.237, + "step": 178 + }, + { + "epoch": 0.05800388852883992, + "grad_norm": 0.8796730637550354, + "learning_rate": 6.43884892086331e-06, + "loss": 0.2538, + "step": 179 + }, + { + "epoch": 0.05832793259883344, + "grad_norm": 0.8664329648017883, + "learning_rate": 6.474820143884892e-06, + "loss": 0.2429, + "step": 180 + }, + { + "epoch": 0.05865197666882696, + "grad_norm": 0.8648440837860107, + "learning_rate": 6.510791366906475e-06, + "loss": 0.2111, + "step": 181 + }, + { + "epoch": 0.05897602073882048, + "grad_norm": 0.8421782851219177, + "learning_rate": 6.546762589928059e-06, + "loss": 0.2377, + "step": 182 + }, + { + "epoch": 0.059300064808814, + "grad_norm": 1.0346119403839111, + "learning_rate": 6.582733812949641e-06, + "loss": 0.2555, + "step": 183 + }, + { + "epoch": 0.059624108878807515, + "grad_norm": 0.9008530378341675, + "learning_rate": 6.618705035971224e-06, + "loss": 0.2621, + "step": 184 + }, + { + "epoch": 0.059948152948801035, + "grad_norm": 0.8390885591506958, + "learning_rate": 6.654676258992806e-06, + "loss": 0.2286, + "step": 185 + }, + { + "epoch": 0.060272197018794556, + "grad_norm": 0.8984460830688477, + "learning_rate": 6.6906474820143886e-06, + "loss": 0.2778, + "step": 186 + }, + { + "epoch": 0.060596241088788076, + "grad_norm": 0.8588024973869324, + "learning_rate": 6.726618705035972e-06, + "loss": 0.247, + "step": 187 + }, + { + "epoch": 0.0609202851587816, + "grad_norm": 0.7865144610404968, + "learning_rate": 6.762589928057554e-06, + "loss": 0.2186, + "step": 188 + }, + { + "epoch": 0.06124432922877511, + "grad_norm": 1.1462135314941406, + "learning_rate": 6.798561151079137e-06, + "loss": 0.225, + "step": 189 + }, + { + "epoch": 0.06156837329876863, + "grad_norm": 0.8443127870559692, + "learning_rate": 6.834532374100719e-06, + "loss": 0.2369, + "step": 190 + }, + { + "epoch": 0.06189241736876215, + "grad_norm": 0.8886812925338745, + "learning_rate": 6.870503597122302e-06, + "loss": 0.2253, + "step": 191 + }, + { + "epoch": 0.06221646143875567, + "grad_norm": 0.8370912671089172, + "learning_rate": 6.906474820143886e-06, + "loss": 0.2419, + "step": 192 + }, + { + "epoch": 0.06254050550874919, + "grad_norm": 0.8914476633071899, + "learning_rate": 6.942446043165468e-06, + "loss": 0.2395, + "step": 193 + }, + { + "epoch": 0.06286454957874271, + "grad_norm": 0.7934096455574036, + "learning_rate": 6.978417266187051e-06, + "loss": 0.2276, + "step": 194 + }, + { + "epoch": 0.06318859364873623, + "grad_norm": 0.8254308700561523, + "learning_rate": 7.014388489208634e-06, + "loss": 0.2287, + "step": 195 + }, + { + "epoch": 0.06351263771872975, + "grad_norm": 0.8662794828414917, + "learning_rate": 7.050359712230216e-06, + "loss": 0.2424, + "step": 196 + }, + { + "epoch": 0.06383668178872326, + "grad_norm": 0.8179991841316223, + "learning_rate": 7.0863309352517995e-06, + "loss": 0.245, + "step": 197 + }, + { + "epoch": 0.06416072585871678, + "grad_norm": 0.849475622177124, + "learning_rate": 7.122302158273382e-06, + "loss": 0.2559, + "step": 198 + }, + { + "epoch": 0.0644847699287103, + "grad_norm": 0.8367174863815308, + "learning_rate": 7.1582733812949644e-06, + "loss": 0.2409, + "step": 199 + }, + { + "epoch": 0.06480881399870382, + "grad_norm": 0.8685291409492493, + "learning_rate": 7.194244604316547e-06, + "loss": 0.2552, + "step": 200 + }, + { + "epoch": 0.06513285806869734, + "grad_norm": 0.8459415435791016, + "learning_rate": 7.230215827338129e-06, + "loss": 0.2281, + "step": 201 + }, + { + "epoch": 0.06545690213869086, + "grad_norm": 0.8228902816772461, + "learning_rate": 7.266187050359713e-06, + "loss": 0.2223, + "step": 202 + }, + { + "epoch": 0.06578094620868438, + "grad_norm": 0.920698881149292, + "learning_rate": 7.302158273381296e-06, + "loss": 0.2479, + "step": 203 + }, + { + "epoch": 0.0661049902786779, + "grad_norm": 0.8067958354949951, + "learning_rate": 7.338129496402878e-06, + "loss": 0.2158, + "step": 204 + }, + { + "epoch": 0.06642903434867142, + "grad_norm": 0.8358391523361206, + "learning_rate": 7.374100719424461e-06, + "loss": 0.2352, + "step": 205 + }, + { + "epoch": 0.06675307841866494, + "grad_norm": 0.8111941814422607, + "learning_rate": 7.410071942446043e-06, + "loss": 0.2319, + "step": 206 + }, + { + "epoch": 0.06707712248865846, + "grad_norm": 0.8654391169548035, + "learning_rate": 7.446043165467627e-06, + "loss": 0.251, + "step": 207 + }, + { + "epoch": 0.06740116655865197, + "grad_norm": 0.8868721127510071, + "learning_rate": 7.48201438848921e-06, + "loss": 0.2259, + "step": 208 + }, + { + "epoch": 0.06772521062864549, + "grad_norm": 0.828054666519165, + "learning_rate": 7.517985611510792e-06, + "loss": 0.2236, + "step": 209 + }, + { + "epoch": 0.06804925469863901, + "grad_norm": 0.8440101146697998, + "learning_rate": 7.5539568345323745e-06, + "loss": 0.2405, + "step": 210 + }, + { + "epoch": 0.06837329876863253, + "grad_norm": 0.8946020603179932, + "learning_rate": 7.589928057553958e-06, + "loss": 0.2357, + "step": 211 + }, + { + "epoch": 0.06869734283862605, + "grad_norm": 0.8750216364860535, + "learning_rate": 7.62589928057554e-06, + "loss": 0.2351, + "step": 212 + }, + { + "epoch": 0.06902138690861957, + "grad_norm": 0.8706597685813904, + "learning_rate": 7.661870503597123e-06, + "loss": 0.2213, + "step": 213 + }, + { + "epoch": 0.06934543097861309, + "grad_norm": 0.8614869713783264, + "learning_rate": 7.697841726618706e-06, + "loss": 0.2317, + "step": 214 + }, + { + "epoch": 0.06966947504860661, + "grad_norm": 0.8689197897911072, + "learning_rate": 7.733812949640287e-06, + "loss": 0.234, + "step": 215 + }, + { + "epoch": 0.06999351911860013, + "grad_norm": 0.8240856528282166, + "learning_rate": 7.769784172661872e-06, + "loss": 0.2291, + "step": 216 + }, + { + "epoch": 0.07031756318859365, + "grad_norm": 0.7766245603561401, + "learning_rate": 7.805755395683455e-06, + "loss": 0.2302, + "step": 217 + }, + { + "epoch": 0.07064160725858717, + "grad_norm": 0.8634687066078186, + "learning_rate": 7.841726618705036e-06, + "loss": 0.2436, + "step": 218 + }, + { + "epoch": 0.07096565132858068, + "grad_norm": 0.9289287328720093, + "learning_rate": 7.877697841726619e-06, + "loss": 0.2374, + "step": 219 + }, + { + "epoch": 0.0712896953985742, + "grad_norm": 0.8757550716400146, + "learning_rate": 7.913669064748202e-06, + "loss": 0.2577, + "step": 220 + }, + { + "epoch": 0.07161373946856772, + "grad_norm": 0.8134459853172302, + "learning_rate": 7.949640287769785e-06, + "loss": 0.2292, + "step": 221 + }, + { + "epoch": 0.07193778353856124, + "grad_norm": 0.8444440960884094, + "learning_rate": 7.985611510791367e-06, + "loss": 0.2299, + "step": 222 + }, + { + "epoch": 0.07226182760855476, + "grad_norm": 0.8103510737419128, + "learning_rate": 8.02158273381295e-06, + "loss": 0.231, + "step": 223 + }, + { + "epoch": 0.07258587167854828, + "grad_norm": 0.900696873664856, + "learning_rate": 8.057553956834533e-06, + "loss": 0.2424, + "step": 224 + }, + { + "epoch": 0.0729099157485418, + "grad_norm": 0.8289245367050171, + "learning_rate": 8.093525179856116e-06, + "loss": 0.2419, + "step": 225 + }, + { + "epoch": 0.07323395981853532, + "grad_norm": 0.7995141744613647, + "learning_rate": 8.129496402877699e-06, + "loss": 0.2033, + "step": 226 + }, + { + "epoch": 0.07355800388852884, + "grad_norm": 0.8330203890800476, + "learning_rate": 8.165467625899282e-06, + "loss": 0.2265, + "step": 227 + }, + { + "epoch": 0.07388204795852236, + "grad_norm": 0.8520117402076721, + "learning_rate": 8.201438848920865e-06, + "loss": 0.2273, + "step": 228 + }, + { + "epoch": 0.07420609202851587, + "grad_norm": 0.8868409991264343, + "learning_rate": 8.237410071942446e-06, + "loss": 0.2531, + "step": 229 + }, + { + "epoch": 0.07453013609850939, + "grad_norm": 0.8631070256233215, + "learning_rate": 8.273381294964029e-06, + "loss": 0.2416, + "step": 230 + }, + { + "epoch": 0.07485418016850291, + "grad_norm": 0.8252522349357605, + "learning_rate": 8.309352517985614e-06, + "loss": 0.2352, + "step": 231 + }, + { + "epoch": 0.07517822423849643, + "grad_norm": 0.8407851457595825, + "learning_rate": 8.345323741007195e-06, + "loss": 0.2537, + "step": 232 + }, + { + "epoch": 0.07550226830848995, + "grad_norm": 0.8433776497840881, + "learning_rate": 8.381294964028778e-06, + "loss": 0.2518, + "step": 233 + }, + { + "epoch": 0.07582631237848347, + "grad_norm": 0.8409492373466492, + "learning_rate": 8.41726618705036e-06, + "loss": 0.2292, + "step": 234 + }, + { + "epoch": 0.076150356448477, + "grad_norm": 0.8289313912391663, + "learning_rate": 8.453237410071943e-06, + "loss": 0.244, + "step": 235 + }, + { + "epoch": 0.07647440051847051, + "grad_norm": 0.8278692960739136, + "learning_rate": 8.489208633093526e-06, + "loss": 0.2642, + "step": 236 + }, + { + "epoch": 0.07679844458846403, + "grad_norm": 0.868140697479248, + "learning_rate": 8.525179856115109e-06, + "loss": 0.2503, + "step": 237 + }, + { + "epoch": 0.07712248865845756, + "grad_norm": 0.933914303779602, + "learning_rate": 8.561151079136692e-06, + "loss": 0.2429, + "step": 238 + }, + { + "epoch": 0.07744653272845108, + "grad_norm": 0.8058993816375732, + "learning_rate": 8.597122302158273e-06, + "loss": 0.228, + "step": 239 + }, + { + "epoch": 0.07777057679844458, + "grad_norm": 0.9699552059173584, + "learning_rate": 8.633093525179856e-06, + "loss": 0.2513, + "step": 240 + }, + { + "epoch": 0.0780946208684381, + "grad_norm": 0.874172031879425, + "learning_rate": 8.66906474820144e-06, + "loss": 0.26, + "step": 241 + }, + { + "epoch": 0.07841866493843162, + "grad_norm": 0.8918880820274353, + "learning_rate": 8.705035971223022e-06, + "loss": 0.2304, + "step": 242 + }, + { + "epoch": 0.07874270900842514, + "grad_norm": 0.7745374441146851, + "learning_rate": 8.741007194244605e-06, + "loss": 0.2204, + "step": 243 + }, + { + "epoch": 0.07906675307841866, + "grad_norm": 0.8315483331680298, + "learning_rate": 8.776978417266188e-06, + "loss": 0.2322, + "step": 244 + }, + { + "epoch": 0.07939079714841218, + "grad_norm": 0.8538967370986938, + "learning_rate": 8.81294964028777e-06, + "loss": 0.2491, + "step": 245 + }, + { + "epoch": 0.0797148412184057, + "grad_norm": 0.8524037003517151, + "learning_rate": 8.848920863309353e-06, + "loss": 0.2322, + "step": 246 + }, + { + "epoch": 0.08003888528839923, + "grad_norm": 0.8282663226127625, + "learning_rate": 8.884892086330936e-06, + "loss": 0.2338, + "step": 247 + }, + { + "epoch": 0.08036292935839275, + "grad_norm": 0.8466488122940063, + "learning_rate": 8.92086330935252e-06, + "loss": 0.2177, + "step": 248 + }, + { + "epoch": 0.08068697342838627, + "grad_norm": 0.7749168276786804, + "learning_rate": 8.956834532374102e-06, + "loss": 0.2156, + "step": 249 + }, + { + "epoch": 0.08101101749837979, + "grad_norm": 0.7421470880508423, + "learning_rate": 8.992805755395683e-06, + "loss": 0.2064, + "step": 250 + }, + { + "epoch": 0.08133506156837329, + "grad_norm": 0.8627637028694153, + "learning_rate": 9.028776978417268e-06, + "loss": 0.2348, + "step": 251 + }, + { + "epoch": 0.08165910563836681, + "grad_norm": 0.8397968411445618, + "learning_rate": 9.064748201438849e-06, + "loss": 0.2108, + "step": 252 + }, + { + "epoch": 0.08198314970836033, + "grad_norm": 0.8088095784187317, + "learning_rate": 9.100719424460432e-06, + "loss": 0.2269, + "step": 253 + }, + { + "epoch": 0.08230719377835385, + "grad_norm": 0.8072569966316223, + "learning_rate": 9.136690647482015e-06, + "loss": 0.23, + "step": 254 + }, + { + "epoch": 0.08263123784834737, + "grad_norm": 0.9000716209411621, + "learning_rate": 9.172661870503598e-06, + "loss": 0.2442, + "step": 255 + }, + { + "epoch": 0.0829552819183409, + "grad_norm": 0.7782520651817322, + "learning_rate": 9.20863309352518e-06, + "loss": 0.2184, + "step": 256 + }, + { + "epoch": 0.08327932598833442, + "grad_norm": 0.8208126425743103, + "learning_rate": 9.244604316546764e-06, + "loss": 0.2235, + "step": 257 + }, + { + "epoch": 0.08360337005832794, + "grad_norm": 0.7606107592582703, + "learning_rate": 9.280575539568346e-06, + "loss": 0.2232, + "step": 258 + }, + { + "epoch": 0.08392741412832146, + "grad_norm": 0.7701852321624756, + "learning_rate": 9.31654676258993e-06, + "loss": 0.2252, + "step": 259 + }, + { + "epoch": 0.08425145819831498, + "grad_norm": 0.8124762177467346, + "learning_rate": 9.35251798561151e-06, + "loss": 0.2281, + "step": 260 + }, + { + "epoch": 0.08457550226830848, + "grad_norm": 0.8278365135192871, + "learning_rate": 9.388489208633095e-06, + "loss": 0.2354, + "step": 261 + }, + { + "epoch": 0.084899546338302, + "grad_norm": 0.7792116403579712, + "learning_rate": 9.424460431654678e-06, + "loss": 0.2214, + "step": 262 + }, + { + "epoch": 0.08522359040829552, + "grad_norm": 0.819582998752594, + "learning_rate": 9.46043165467626e-06, + "loss": 0.2218, + "step": 263 + }, + { + "epoch": 0.08554763447828904, + "grad_norm": 0.7760310173034668, + "learning_rate": 9.496402877697842e-06, + "loss": 0.2066, + "step": 264 + }, + { + "epoch": 0.08587167854828257, + "grad_norm": 0.8136816024780273, + "learning_rate": 9.532374100719425e-06, + "loss": 0.2381, + "step": 265 + }, + { + "epoch": 0.08619572261827609, + "grad_norm": 0.8015646934509277, + "learning_rate": 9.568345323741008e-06, + "loss": 0.2275, + "step": 266 + }, + { + "epoch": 0.0865197666882696, + "grad_norm": 0.7706541419029236, + "learning_rate": 9.60431654676259e-06, + "loss": 0.2259, + "step": 267 + }, + { + "epoch": 0.08684381075826313, + "grad_norm": 0.7953770160675049, + "learning_rate": 9.640287769784174e-06, + "loss": 0.2233, + "step": 268 + }, + { + "epoch": 0.08716785482825665, + "grad_norm": 0.8387556672096252, + "learning_rate": 9.676258992805757e-06, + "loss": 0.2349, + "step": 269 + }, + { + "epoch": 0.08749189889825017, + "grad_norm": 0.8319289684295654, + "learning_rate": 9.712230215827338e-06, + "loss": 0.2326, + "step": 270 + }, + { + "epoch": 0.08781594296824369, + "grad_norm": 0.864242672920227, + "learning_rate": 9.748201438848922e-06, + "loss": 0.211, + "step": 271 + }, + { + "epoch": 0.0881399870382372, + "grad_norm": 0.7795359492301941, + "learning_rate": 9.784172661870505e-06, + "loss": 0.2206, + "step": 272 + }, + { + "epoch": 0.08846403110823071, + "grad_norm": 0.8175662159919739, + "learning_rate": 9.820143884892086e-06, + "loss": 0.2325, + "step": 273 + }, + { + "epoch": 0.08878807517822424, + "grad_norm": 0.755852460861206, + "learning_rate": 9.85611510791367e-06, + "loss": 0.226, + "step": 274 + }, + { + "epoch": 0.08911211924821776, + "grad_norm": 0.8484703898429871, + "learning_rate": 9.892086330935252e-06, + "loss": 0.2396, + "step": 275 + }, + { + "epoch": 0.08943616331821128, + "grad_norm": 0.8286850452423096, + "learning_rate": 9.928057553956835e-06, + "loss": 0.2586, + "step": 276 + }, + { + "epoch": 0.0897602073882048, + "grad_norm": 0.7314445972442627, + "learning_rate": 9.964028776978418e-06, + "loss": 0.2392, + "step": 277 + }, + { + "epoch": 0.09008425145819832, + "grad_norm": 0.7226484417915344, + "learning_rate": 1e-05, + "loss": 0.1949, + "step": 278 + }, + { + "epoch": 0.09040829552819184, + "grad_norm": 0.8189452290534973, + "learning_rate": 9.999999694024202e-06, + "loss": 0.2115, + "step": 279 + }, + { + "epoch": 0.09073233959818536, + "grad_norm": 0.7656586170196533, + "learning_rate": 9.999998776096847e-06, + "loss": 0.2323, + "step": 280 + }, + { + "epoch": 0.09105638366817888, + "grad_norm": 0.7828707695007324, + "learning_rate": 9.999997246218044e-06, + "loss": 0.2014, + "step": 281 + }, + { + "epoch": 0.09138042773817238, + "grad_norm": 0.8037205338478088, + "learning_rate": 9.99999510438798e-06, + "loss": 0.231, + "step": 282 + }, + { + "epoch": 0.0917044718081659, + "grad_norm": 0.8756486177444458, + "learning_rate": 9.999992350606919e-06, + "loss": 0.2363, + "step": 283 + }, + { + "epoch": 0.09202851587815943, + "grad_norm": 0.8126011490821838, + "learning_rate": 9.9999889848752e-06, + "loss": 0.2334, + "step": 284 + }, + { + "epoch": 0.09235255994815295, + "grad_norm": 0.7885010242462158, + "learning_rate": 9.999985007193232e-06, + "loss": 0.2207, + "step": 285 + }, + { + "epoch": 0.09267660401814647, + "grad_norm": 0.702421247959137, + "learning_rate": 9.999980417561503e-06, + "loss": 0.2074, + "step": 286 + }, + { + "epoch": 0.09300064808813999, + "grad_norm": 0.778176486492157, + "learning_rate": 9.999975215980575e-06, + "loss": 0.2389, + "step": 287 + }, + { + "epoch": 0.09332469215813351, + "grad_norm": 0.7950177192687988, + "learning_rate": 9.999969402451084e-06, + "loss": 0.2336, + "step": 288 + }, + { + "epoch": 0.09364873622812703, + "grad_norm": 0.7845720052719116, + "learning_rate": 9.999962976973741e-06, + "loss": 0.2375, + "step": 289 + }, + { + "epoch": 0.09397278029812055, + "grad_norm": 0.8304017186164856, + "learning_rate": 9.999955939549333e-06, + "loss": 0.2314, + "step": 290 + }, + { + "epoch": 0.09429682436811407, + "grad_norm": 0.7669353485107422, + "learning_rate": 9.99994829017872e-06, + "loss": 0.2178, + "step": 291 + }, + { + "epoch": 0.09462086843810759, + "grad_norm": 0.7424461841583252, + "learning_rate": 9.999940028862843e-06, + "loss": 0.2304, + "step": 292 + }, + { + "epoch": 0.0949449125081011, + "grad_norm": 0.7105286121368408, + "learning_rate": 9.999931155602707e-06, + "loss": 0.2165, + "step": 293 + }, + { + "epoch": 0.09526895657809462, + "grad_norm": 0.7596817016601562, + "learning_rate": 9.999921670399401e-06, + "loss": 0.2198, + "step": 294 + }, + { + "epoch": 0.09559300064808814, + "grad_norm": 0.7864949703216553, + "learning_rate": 9.999911573254085e-06, + "loss": 0.2224, + "step": 295 + }, + { + "epoch": 0.09591704471808166, + "grad_norm": 0.7551132440567017, + "learning_rate": 9.999900864167996e-06, + "loss": 0.2074, + "step": 296 + }, + { + "epoch": 0.09624108878807518, + "grad_norm": 0.7826792597770691, + "learning_rate": 9.999889543142444e-06, + "loss": 0.2213, + "step": 297 + }, + { + "epoch": 0.0965651328580687, + "grad_norm": 0.7614756226539612, + "learning_rate": 9.999877610178814e-06, + "loss": 0.2182, + "step": 298 + }, + { + "epoch": 0.09688917692806222, + "grad_norm": 0.820511519908905, + "learning_rate": 9.99986506527857e-06, + "loss": 0.224, + "step": 299 + }, + { + "epoch": 0.09721322099805574, + "grad_norm": 0.7976818680763245, + "learning_rate": 9.99985190844324e-06, + "loss": 0.2322, + "step": 300 + }, + { + "epoch": 0.09753726506804926, + "grad_norm": 0.8125640749931335, + "learning_rate": 9.999838139674443e-06, + "loss": 0.2515, + "step": 301 + }, + { + "epoch": 0.09786130913804278, + "grad_norm": 0.7696121335029602, + "learning_rate": 9.999823758973857e-06, + "loss": 0.2177, + "step": 302 + }, + { + "epoch": 0.0981853532080363, + "grad_norm": 0.750672459602356, + "learning_rate": 9.999808766343246e-06, + "loss": 0.2245, + "step": 303 + }, + { + "epoch": 0.0985093972780298, + "grad_norm": 0.7182970643043518, + "learning_rate": 9.999793161784443e-06, + "loss": 0.1997, + "step": 304 + }, + { + "epoch": 0.09883344134802333, + "grad_norm": 0.7711732387542725, + "learning_rate": 9.99977694529936e-06, + "loss": 0.2175, + "step": 305 + }, + { + "epoch": 0.09915748541801685, + "grad_norm": 0.7251968383789062, + "learning_rate": 9.99976011688998e-06, + "loss": 0.1967, + "step": 306 + }, + { + "epoch": 0.09948152948801037, + "grad_norm": 0.7583543062210083, + "learning_rate": 9.999742676558363e-06, + "loss": 0.214, + "step": 307 + }, + { + "epoch": 0.09980557355800389, + "grad_norm": 0.7657378911972046, + "learning_rate": 9.999724624306644e-06, + "loss": 0.2081, + "step": 308 + }, + { + "epoch": 0.10012961762799741, + "grad_norm": 0.7284070253372192, + "learning_rate": 9.999705960137032e-06, + "loss": 0.1964, + "step": 309 + }, + { + "epoch": 0.10045366169799093, + "grad_norm": 0.7148271799087524, + "learning_rate": 9.999686684051811e-06, + "loss": 0.2017, + "step": 310 + }, + { + "epoch": 0.10077770576798445, + "grad_norm": 0.7276371121406555, + "learning_rate": 9.99966679605334e-06, + "loss": 0.2083, + "step": 311 + }, + { + "epoch": 0.10110174983797797, + "grad_norm": 0.791381299495697, + "learning_rate": 9.999646296144054e-06, + "loss": 0.2403, + "step": 312 + }, + { + "epoch": 0.10142579390797149, + "grad_norm": 0.7297627925872803, + "learning_rate": 9.999625184326463e-06, + "loss": 0.2132, + "step": 313 + }, + { + "epoch": 0.101749837977965, + "grad_norm": 0.7267171144485474, + "learning_rate": 9.999603460603147e-06, + "loss": 0.2011, + "step": 314 + }, + { + "epoch": 0.10207388204795852, + "grad_norm": 0.7363854646682739, + "learning_rate": 9.99958112497677e-06, + "loss": 0.225, + "step": 315 + }, + { + "epoch": 0.10239792611795204, + "grad_norm": 0.7794610857963562, + "learning_rate": 9.999558177450062e-06, + "loss": 0.203, + "step": 316 + }, + { + "epoch": 0.10272197018794556, + "grad_norm": 0.7780491709709167, + "learning_rate": 9.999534618025833e-06, + "loss": 0.2095, + "step": 317 + }, + { + "epoch": 0.10304601425793908, + "grad_norm": 0.8182850480079651, + "learning_rate": 9.999510446706966e-06, + "loss": 0.2327, + "step": 318 + }, + { + "epoch": 0.1033700583279326, + "grad_norm": 0.779887855052948, + "learning_rate": 9.999485663496417e-06, + "loss": 0.2045, + "step": 319 + }, + { + "epoch": 0.10369410239792612, + "grad_norm": 0.735507071018219, + "learning_rate": 9.999460268397225e-06, + "loss": 0.2075, + "step": 320 + }, + { + "epoch": 0.10401814646791964, + "grad_norm": 0.7778931856155396, + "learning_rate": 9.999434261412493e-06, + "loss": 0.2161, + "step": 321 + }, + { + "epoch": 0.10434219053791316, + "grad_norm": 0.7456550598144531, + "learning_rate": 9.999407642545404e-06, + "loss": 0.2109, + "step": 322 + }, + { + "epoch": 0.10466623460790668, + "grad_norm": 0.8102906942367554, + "learning_rate": 9.99938041179922e-06, + "loss": 0.2417, + "step": 323 + }, + { + "epoch": 0.1049902786779002, + "grad_norm": 0.7794447541236877, + "learning_rate": 9.99935256917727e-06, + "loss": 0.2354, + "step": 324 + }, + { + "epoch": 0.10531432274789371, + "grad_norm": 0.8024318814277649, + "learning_rate": 9.999324114682964e-06, + "loss": 0.2345, + "step": 325 + }, + { + "epoch": 0.10563836681788723, + "grad_norm": 0.7224960327148438, + "learning_rate": 9.999295048319785e-06, + "loss": 0.2146, + "step": 326 + }, + { + "epoch": 0.10596241088788075, + "grad_norm": 0.7515241503715515, + "learning_rate": 9.999265370091287e-06, + "loss": 0.207, + "step": 327 + }, + { + "epoch": 0.10628645495787427, + "grad_norm": 0.7258550524711609, + "learning_rate": 9.999235080001105e-06, + "loss": 0.1973, + "step": 328 + }, + { + "epoch": 0.10661049902786779, + "grad_norm": 0.7604589462280273, + "learning_rate": 9.999204178052944e-06, + "loss": 0.1959, + "step": 329 + }, + { + "epoch": 0.10693454309786131, + "grad_norm": 0.8284873366355896, + "learning_rate": 9.999172664250589e-06, + "loss": 0.2337, + "step": 330 + }, + { + "epoch": 0.10725858716785483, + "grad_norm": 0.765424370765686, + "learning_rate": 9.999140538597895e-06, + "loss": 0.2385, + "step": 331 + }, + { + "epoch": 0.10758263123784835, + "grad_norm": 0.7512611150741577, + "learning_rate": 9.999107801098796e-06, + "loss": 0.2078, + "step": 332 + }, + { + "epoch": 0.10790667530784187, + "grad_norm": 0.6939177513122559, + "learning_rate": 9.999074451757295e-06, + "loss": 0.2072, + "step": 333 + }, + { + "epoch": 0.10823071937783539, + "grad_norm": 0.7670846581459045, + "learning_rate": 9.999040490577478e-06, + "loss": 0.2033, + "step": 334 + }, + { + "epoch": 0.10855476344782891, + "grad_norm": 0.7212173342704773, + "learning_rate": 9.999005917563498e-06, + "loss": 0.2049, + "step": 335 + }, + { + "epoch": 0.10887880751782242, + "grad_norm": 0.7780339121818542, + "learning_rate": 9.998970732719588e-06, + "loss": 0.2321, + "step": 336 + }, + { + "epoch": 0.10920285158781594, + "grad_norm": 0.7524231672286987, + "learning_rate": 9.998934936050055e-06, + "loss": 0.2285, + "step": 337 + }, + { + "epoch": 0.10952689565780946, + "grad_norm": 0.7963482141494751, + "learning_rate": 9.99889852755928e-06, + "loss": 0.2369, + "step": 338 + }, + { + "epoch": 0.10985093972780298, + "grad_norm": 0.7501022219657898, + "learning_rate": 9.998861507251717e-06, + "loss": 0.2064, + "step": 339 + }, + { + "epoch": 0.1101749837977965, + "grad_norm": 0.7451834678649902, + "learning_rate": 9.998823875131898e-06, + "loss": 0.2087, + "step": 340 + }, + { + "epoch": 0.11049902786779002, + "grad_norm": 0.787020206451416, + "learning_rate": 9.998785631204428e-06, + "loss": 0.2261, + "step": 341 + }, + { + "epoch": 0.11082307193778354, + "grad_norm": 0.717600405216217, + "learning_rate": 9.998746775473992e-06, + "loss": 0.2163, + "step": 342 + }, + { + "epoch": 0.11114711600777706, + "grad_norm": 0.7501165270805359, + "learning_rate": 9.998707307945339e-06, + "loss": 0.2046, + "step": 343 + }, + { + "epoch": 0.11147116007777058, + "grad_norm": 0.7228868007659912, + "learning_rate": 9.998667228623304e-06, + "loss": 0.2002, + "step": 344 + }, + { + "epoch": 0.1117952041477641, + "grad_norm": 0.7500896453857422, + "learning_rate": 9.998626537512792e-06, + "loss": 0.1977, + "step": 345 + }, + { + "epoch": 0.11211924821775761, + "grad_norm": 0.7988274693489075, + "learning_rate": 9.99858523461878e-06, + "loss": 0.2194, + "step": 346 + }, + { + "epoch": 0.11244329228775113, + "grad_norm": 0.7718932628631592, + "learning_rate": 9.998543319946328e-06, + "loss": 0.2284, + "step": 347 + }, + { + "epoch": 0.11276733635774465, + "grad_norm": 0.7662320137023926, + "learning_rate": 9.998500793500562e-06, + "loss": 0.2156, + "step": 348 + }, + { + "epoch": 0.11309138042773817, + "grad_norm": 0.7333149909973145, + "learning_rate": 9.998457655286689e-06, + "loss": 0.2173, + "step": 349 + }, + { + "epoch": 0.11341542449773169, + "grad_norm": 0.762129008769989, + "learning_rate": 9.998413905309986e-06, + "loss": 0.2351, + "step": 350 + }, + { + "epoch": 0.11373946856772521, + "grad_norm": 0.7451883554458618, + "learning_rate": 9.99836954357581e-06, + "loss": 0.2135, + "step": 351 + }, + { + "epoch": 0.11406351263771873, + "grad_norm": 0.7003828883171082, + "learning_rate": 9.99832457008959e-06, + "loss": 0.1902, + "step": 352 + }, + { + "epoch": 0.11438755670771225, + "grad_norm": 0.7704355716705322, + "learning_rate": 9.998278984856831e-06, + "loss": 0.2068, + "step": 353 + }, + { + "epoch": 0.11471160077770577, + "grad_norm": 0.780831515789032, + "learning_rate": 9.998232787883111e-06, + "loss": 0.2059, + "step": 354 + }, + { + "epoch": 0.11503564484769929, + "grad_norm": 0.7990095019340515, + "learning_rate": 9.998185979174084e-06, + "loss": 0.2142, + "step": 355 + }, + { + "epoch": 0.11535968891769281, + "grad_norm": 0.7470694184303284, + "learning_rate": 9.998138558735479e-06, + "loss": 0.1885, + "step": 356 + }, + { + "epoch": 0.11568373298768632, + "grad_norm": 0.7477388978004456, + "learning_rate": 9.998090526573101e-06, + "loss": 0.1947, + "step": 357 + }, + { + "epoch": 0.11600777705767984, + "grad_norm": 0.7454971671104431, + "learning_rate": 9.998041882692828e-06, + "loss": 0.203, + "step": 358 + }, + { + "epoch": 0.11633182112767336, + "grad_norm": 0.7200604677200317, + "learning_rate": 9.997992627100612e-06, + "loss": 0.1936, + "step": 359 + }, + { + "epoch": 0.11665586519766688, + "grad_norm": 0.7129573225975037, + "learning_rate": 9.997942759802483e-06, + "loss": 0.2073, + "step": 360 + }, + { + "epoch": 0.1169799092676604, + "grad_norm": 0.7242521047592163, + "learning_rate": 9.997892280804545e-06, + "loss": 0.2211, + "step": 361 + }, + { + "epoch": 0.11730395333765392, + "grad_norm": 0.7934162020683289, + "learning_rate": 9.997841190112975e-06, + "loss": 0.2302, + "step": 362 + }, + { + "epoch": 0.11762799740764744, + "grad_norm": 0.7240173816680908, + "learning_rate": 9.997789487734027e-06, + "loss": 0.2145, + "step": 363 + }, + { + "epoch": 0.11795204147764096, + "grad_norm": 0.7518720626831055, + "learning_rate": 9.997737173674027e-06, + "loss": 0.2165, + "step": 364 + }, + { + "epoch": 0.11827608554763448, + "grad_norm": 0.7862987518310547, + "learning_rate": 9.997684247939378e-06, + "loss": 0.1998, + "step": 365 + }, + { + "epoch": 0.118600129617628, + "grad_norm": 0.73781418800354, + "learning_rate": 9.997630710536559e-06, + "loss": 0.2104, + "step": 366 + }, + { + "epoch": 0.11892417368762152, + "grad_norm": 0.7208497524261475, + "learning_rate": 9.997576561472122e-06, + "loss": 0.2169, + "step": 367 + }, + { + "epoch": 0.11924821775761503, + "grad_norm": 0.7701983451843262, + "learning_rate": 9.997521800752695e-06, + "loss": 0.2275, + "step": 368 + }, + { + "epoch": 0.11957226182760855, + "grad_norm": 0.7541593313217163, + "learning_rate": 9.99746642838498e-06, + "loss": 0.1995, + "step": 369 + }, + { + "epoch": 0.11989630589760207, + "grad_norm": 0.6978981494903564, + "learning_rate": 9.99741044437575e-06, + "loss": 0.2075, + "step": 370 + }, + { + "epoch": 0.12022034996759559, + "grad_norm": 0.6774824857711792, + "learning_rate": 9.997353848731862e-06, + "loss": 0.205, + "step": 371 + }, + { + "epoch": 0.12054439403758911, + "grad_norm": 0.736003041267395, + "learning_rate": 9.997296641460242e-06, + "loss": 0.193, + "step": 372 + }, + { + "epoch": 0.12086843810758263, + "grad_norm": 0.7393019795417786, + "learning_rate": 9.99723882256789e-06, + "loss": 0.2138, + "step": 373 + }, + { + "epoch": 0.12119248217757615, + "grad_norm": 0.765889048576355, + "learning_rate": 9.997180392061883e-06, + "loss": 0.2211, + "step": 374 + }, + { + "epoch": 0.12151652624756967, + "grad_norm": 0.733389675617218, + "learning_rate": 9.997121349949372e-06, + "loss": 0.2199, + "step": 375 + }, + { + "epoch": 0.1218405703175632, + "grad_norm": 0.7976531386375427, + "learning_rate": 9.997061696237584e-06, + "loss": 0.2237, + "step": 376 + }, + { + "epoch": 0.12216461438755671, + "grad_norm": 0.6957002282142639, + "learning_rate": 9.99700143093382e-06, + "loss": 0.224, + "step": 377 + }, + { + "epoch": 0.12248865845755022, + "grad_norm": 0.6930292844772339, + "learning_rate": 9.996940554045455e-06, + "loss": 0.2044, + "step": 378 + }, + { + "epoch": 0.12281270252754374, + "grad_norm": 0.8116925954818726, + "learning_rate": 9.99687906557994e-06, + "loss": 0.2241, + "step": 379 + }, + { + "epoch": 0.12313674659753726, + "grad_norm": 0.7239817380905151, + "learning_rate": 9.996816965544802e-06, + "loss": 0.214, + "step": 380 + }, + { + "epoch": 0.12346079066753078, + "grad_norm": 0.7183677554130554, + "learning_rate": 9.99675425394764e-06, + "loss": 0.2092, + "step": 381 + }, + { + "epoch": 0.1237848347375243, + "grad_norm": 0.6525163650512695, + "learning_rate": 9.996690930796132e-06, + "loss": 0.1909, + "step": 382 + }, + { + "epoch": 0.12410887880751782, + "grad_norm": 0.7569447159767151, + "learning_rate": 9.996626996098021e-06, + "loss": 0.2257, + "step": 383 + }, + { + "epoch": 0.12443292287751134, + "grad_norm": 0.7216403484344482, + "learning_rate": 9.996562449861141e-06, + "loss": 0.2057, + "step": 384 + }, + { + "epoch": 0.12475696694750486, + "grad_norm": 0.7046488523483276, + "learning_rate": 9.996497292093386e-06, + "loss": 0.197, + "step": 385 + }, + { + "epoch": 0.12508101101749838, + "grad_norm": 0.7048548460006714, + "learning_rate": 9.996431522802733e-06, + "loss": 0.2103, + "step": 386 + }, + { + "epoch": 0.1254050550874919, + "grad_norm": 0.7337021231651306, + "learning_rate": 9.996365141997229e-06, + "loss": 0.218, + "step": 387 + }, + { + "epoch": 0.12572909915748542, + "grad_norm": 0.7375513911247253, + "learning_rate": 9.996298149685e-06, + "loss": 0.2033, + "step": 388 + }, + { + "epoch": 0.12605314322747893, + "grad_norm": 0.7464216947555542, + "learning_rate": 9.996230545874247e-06, + "loss": 0.2115, + "step": 389 + }, + { + "epoch": 0.12637718729747247, + "grad_norm": 0.7280207276344299, + "learning_rate": 9.996162330573243e-06, + "loss": 0.206, + "step": 390 + }, + { + "epoch": 0.12670123136746597, + "grad_norm": 0.7100143432617188, + "learning_rate": 9.996093503790333e-06, + "loss": 0.2193, + "step": 391 + }, + { + "epoch": 0.1270252754374595, + "grad_norm": 0.721896767616272, + "learning_rate": 9.996024065533948e-06, + "loss": 0.2073, + "step": 392 + }, + { + "epoch": 0.127349319507453, + "grad_norm": 0.7025097608566284, + "learning_rate": 9.995954015812582e-06, + "loss": 0.2016, + "step": 393 + }, + { + "epoch": 0.12767336357744652, + "grad_norm": 0.7675989866256714, + "learning_rate": 9.995883354634806e-06, + "loss": 0.2212, + "step": 394 + }, + { + "epoch": 0.12799740764744005, + "grad_norm": 0.7080046534538269, + "learning_rate": 9.995812082009274e-06, + "loss": 0.2008, + "step": 395 + }, + { + "epoch": 0.12832145171743356, + "grad_norm": 0.6863510608673096, + "learning_rate": 9.995740197944706e-06, + "loss": 0.2137, + "step": 396 + }, + { + "epoch": 0.1286454957874271, + "grad_norm": 0.7432951331138611, + "learning_rate": 9.9956677024499e-06, + "loss": 0.2193, + "step": 397 + }, + { + "epoch": 0.1289695398574206, + "grad_norm": 0.6628539562225342, + "learning_rate": 9.995594595533729e-06, + "loss": 0.1771, + "step": 398 + }, + { + "epoch": 0.12929358392741414, + "grad_norm": 0.7454772591590881, + "learning_rate": 9.995520877205141e-06, + "loss": 0.2032, + "step": 399 + }, + { + "epoch": 0.12961762799740764, + "grad_norm": 0.7162988781929016, + "learning_rate": 9.995446547473157e-06, + "loss": 0.2083, + "step": 400 + }, + { + "epoch": 0.12994167206740118, + "grad_norm": 0.7362358570098877, + "learning_rate": 9.995371606346875e-06, + "loss": 0.1914, + "step": 401 + }, + { + "epoch": 0.13026571613739468, + "grad_norm": 0.6585822701454163, + "learning_rate": 9.995296053835469e-06, + "loss": 0.1903, + "step": 402 + }, + { + "epoch": 0.13058976020738822, + "grad_norm": 0.6886841654777527, + "learning_rate": 9.995219889948184e-06, + "loss": 0.1908, + "step": 403 + }, + { + "epoch": 0.13091380427738172, + "grad_norm": 0.718048095703125, + "learning_rate": 9.99514311469434e-06, + "loss": 0.2034, + "step": 404 + }, + { + "epoch": 0.13123784834737523, + "grad_norm": 0.7566750049591064, + "learning_rate": 9.995065728083337e-06, + "loss": 0.2011, + "step": 405 + }, + { + "epoch": 0.13156189241736876, + "grad_norm": 0.7040491700172424, + "learning_rate": 9.994987730124646e-06, + "loss": 0.2081, + "step": 406 + }, + { + "epoch": 0.13188593648736227, + "grad_norm": 0.7720775604248047, + "learning_rate": 9.994909120827811e-06, + "loss": 0.2185, + "step": 407 + }, + { + "epoch": 0.1322099805573558, + "grad_norm": 0.725797176361084, + "learning_rate": 9.994829900202454e-06, + "loss": 0.2071, + "step": 408 + }, + { + "epoch": 0.1325340246273493, + "grad_norm": 0.7542802095413208, + "learning_rate": 9.99475006825827e-06, + "loss": 0.2148, + "step": 409 + }, + { + "epoch": 0.13285806869734285, + "grad_norm": 0.7492075562477112, + "learning_rate": 9.994669625005032e-06, + "loss": 0.2237, + "step": 410 + }, + { + "epoch": 0.13318211276733635, + "grad_norm": 0.7159206867218018, + "learning_rate": 9.994588570452583e-06, + "loss": 0.2014, + "step": 411 + }, + { + "epoch": 0.1335061568373299, + "grad_norm": 0.7251393795013428, + "learning_rate": 9.994506904610846e-06, + "loss": 0.204, + "step": 412 + }, + { + "epoch": 0.1338302009073234, + "grad_norm": 0.697099506855011, + "learning_rate": 9.994424627489813e-06, + "loss": 0.2048, + "step": 413 + }, + { + "epoch": 0.13415424497731693, + "grad_norm": 0.7721277475357056, + "learning_rate": 9.994341739099556e-06, + "loss": 0.2188, + "step": 414 + }, + { + "epoch": 0.13447828904731043, + "grad_norm": 0.704058051109314, + "learning_rate": 9.99425823945022e-06, + "loss": 0.2005, + "step": 415 + }, + { + "epoch": 0.13480233311730394, + "grad_norm": 0.6567177772521973, + "learning_rate": 9.99417412855202e-06, + "loss": 0.185, + "step": 416 + }, + { + "epoch": 0.13512637718729748, + "grad_norm": 0.7193306088447571, + "learning_rate": 9.994089406415258e-06, + "loss": 0.1941, + "step": 417 + }, + { + "epoch": 0.13545042125729098, + "grad_norm": 0.7080915570259094, + "learning_rate": 9.994004073050297e-06, + "loss": 0.2167, + "step": 418 + }, + { + "epoch": 0.13577446532728452, + "grad_norm": 0.7174451351165771, + "learning_rate": 9.993918128467583e-06, + "loss": 0.2102, + "step": 419 + }, + { + "epoch": 0.13609850939727802, + "grad_norm": 0.7323854565620422, + "learning_rate": 9.993831572677636e-06, + "loss": 0.1976, + "step": 420 + }, + { + "epoch": 0.13642255346727156, + "grad_norm": 0.7107775211334229, + "learning_rate": 9.993744405691049e-06, + "loss": 0.2114, + "step": 421 + }, + { + "epoch": 0.13674659753726506, + "grad_norm": 0.7212236523628235, + "learning_rate": 9.99365662751849e-06, + "loss": 0.2044, + "step": 422 + }, + { + "epoch": 0.1370706416072586, + "grad_norm": 0.7694576978683472, + "learning_rate": 9.9935682381707e-06, + "loss": 0.2152, + "step": 423 + }, + { + "epoch": 0.1373946856772521, + "grad_norm": 0.6720986366271973, + "learning_rate": 9.993479237658501e-06, + "loss": 0.1975, + "step": 424 + }, + { + "epoch": 0.13771872974724564, + "grad_norm": 0.7393246293067932, + "learning_rate": 9.993389625992783e-06, + "loss": 0.2103, + "step": 425 + }, + { + "epoch": 0.13804277381723914, + "grad_norm": 0.7491670846939087, + "learning_rate": 9.993299403184515e-06, + "loss": 0.193, + "step": 426 + }, + { + "epoch": 0.13836681788723265, + "grad_norm": 0.780389130115509, + "learning_rate": 9.99320856924474e-06, + "loss": 0.2256, + "step": 427 + }, + { + "epoch": 0.13869086195722619, + "grad_norm": 0.6445964574813843, + "learning_rate": 9.993117124184572e-06, + "loss": 0.1863, + "step": 428 + }, + { + "epoch": 0.1390149060272197, + "grad_norm": 0.6863457560539246, + "learning_rate": 9.993025068015205e-06, + "loss": 0.1811, + "step": 429 + }, + { + "epoch": 0.13933895009721323, + "grad_norm": 0.7053923010826111, + "learning_rate": 9.992932400747908e-06, + "loss": 0.2131, + "step": 430 + }, + { + "epoch": 0.13966299416720673, + "grad_norm": 0.7096285223960876, + "learning_rate": 9.99283912239402e-06, + "loss": 0.2006, + "step": 431 + }, + { + "epoch": 0.13998703823720027, + "grad_norm": 0.7141770720481873, + "learning_rate": 9.992745232964957e-06, + "loss": 0.2034, + "step": 432 + }, + { + "epoch": 0.14031108230719377, + "grad_norm": 0.7291443943977356, + "learning_rate": 9.99265073247221e-06, + "loss": 0.1955, + "step": 433 + }, + { + "epoch": 0.1406351263771873, + "grad_norm": 0.7909582257270813, + "learning_rate": 9.99255562092735e-06, + "loss": 0.2202, + "step": 434 + }, + { + "epoch": 0.14095917044718081, + "grad_norm": 0.7880704998970032, + "learning_rate": 9.992459898342008e-06, + "loss": 0.2242, + "step": 435 + }, + { + "epoch": 0.14128321451717435, + "grad_norm": 0.7351171970367432, + "learning_rate": 9.99236356472791e-06, + "loss": 0.212, + "step": 436 + }, + { + "epoch": 0.14160725858716786, + "grad_norm": 0.7005438208580017, + "learning_rate": 9.99226662009684e-06, + "loss": 0.204, + "step": 437 + }, + { + "epoch": 0.14193130265716136, + "grad_norm": 0.6722733378410339, + "learning_rate": 9.992169064460663e-06, + "loss": 0.2046, + "step": 438 + }, + { + "epoch": 0.1422553467271549, + "grad_norm": 0.6662759184837341, + "learning_rate": 9.992070897831322e-06, + "loss": 0.2081, + "step": 439 + }, + { + "epoch": 0.1425793907971484, + "grad_norm": 0.7334291338920593, + "learning_rate": 9.99197212022083e-06, + "loss": 0.2198, + "step": 440 + }, + { + "epoch": 0.14290343486714194, + "grad_norm": 0.7043226361274719, + "learning_rate": 9.991872731641276e-06, + "loss": 0.2086, + "step": 441 + }, + { + "epoch": 0.14322747893713544, + "grad_norm": 0.6686563491821289, + "learning_rate": 9.991772732104825e-06, + "loss": 0.2065, + "step": 442 + }, + { + "epoch": 0.14355152300712898, + "grad_norm": 0.609669029712677, + "learning_rate": 9.991672121623717e-06, + "loss": 0.1653, + "step": 443 + }, + { + "epoch": 0.14387556707712248, + "grad_norm": 0.690898597240448, + "learning_rate": 9.991570900210262e-06, + "loss": 0.2024, + "step": 444 + }, + { + "epoch": 0.14419961114711602, + "grad_norm": 0.6984000205993652, + "learning_rate": 9.991469067876854e-06, + "loss": 0.1927, + "step": 445 + }, + { + "epoch": 0.14452365521710953, + "grad_norm": 0.6699127554893494, + "learning_rate": 9.99136662463595e-06, + "loss": 0.1903, + "step": 446 + }, + { + "epoch": 0.14484769928710303, + "grad_norm": 0.6831761598587036, + "learning_rate": 9.991263570500093e-06, + "loss": 0.2032, + "step": 447 + }, + { + "epoch": 0.14517174335709657, + "grad_norm": 0.6796483993530273, + "learning_rate": 9.991159905481893e-06, + "loss": 0.191, + "step": 448 + }, + { + "epoch": 0.14549578742709007, + "grad_norm": 0.7104837894439697, + "learning_rate": 9.99105562959404e-06, + "loss": 0.2004, + "step": 449 + }, + { + "epoch": 0.1458198314970836, + "grad_norm": 0.7294518351554871, + "learning_rate": 9.990950742849295e-06, + "loss": 0.1984, + "step": 450 + }, + { + "epoch": 0.14614387556707711, + "grad_norm": 0.6870996952056885, + "learning_rate": 9.990845245260495e-06, + "loss": 0.196, + "step": 451 + }, + { + "epoch": 0.14646791963707065, + "grad_norm": 0.6710522770881653, + "learning_rate": 9.990739136840552e-06, + "loss": 0.1906, + "step": 452 + }, + { + "epoch": 0.14679196370706415, + "grad_norm": 0.6412180662155151, + "learning_rate": 9.990632417602452e-06, + "loss": 0.1865, + "step": 453 + }, + { + "epoch": 0.1471160077770577, + "grad_norm": 0.7567189931869507, + "learning_rate": 9.99052508755926e-06, + "loss": 0.209, + "step": 454 + }, + { + "epoch": 0.1474400518470512, + "grad_norm": 0.7240652441978455, + "learning_rate": 9.990417146724106e-06, + "loss": 0.2081, + "step": 455 + }, + { + "epoch": 0.14776409591704473, + "grad_norm": 0.6918161511421204, + "learning_rate": 9.990308595110206e-06, + "loss": 0.2005, + "step": 456 + }, + { + "epoch": 0.14808813998703824, + "grad_norm": 0.7122319936752319, + "learning_rate": 9.990199432730842e-06, + "loss": 0.1936, + "step": 457 + }, + { + "epoch": 0.14841218405703174, + "grad_norm": 0.6836199164390564, + "learning_rate": 9.990089659599378e-06, + "loss": 0.2088, + "step": 458 + }, + { + "epoch": 0.14873622812702528, + "grad_norm": 0.6745359301567078, + "learning_rate": 9.989979275729248e-06, + "loss": 0.2043, + "step": 459 + }, + { + "epoch": 0.14906027219701878, + "grad_norm": 0.743945300579071, + "learning_rate": 9.98986828113396e-06, + "loss": 0.1958, + "step": 460 + }, + { + "epoch": 0.14938431626701232, + "grad_norm": 0.7083187103271484, + "learning_rate": 9.9897566758271e-06, + "loss": 0.1946, + "step": 461 + }, + { + "epoch": 0.14970836033700582, + "grad_norm": 0.7363801598548889, + "learning_rate": 9.989644459822329e-06, + "loss": 0.1964, + "step": 462 + }, + { + "epoch": 0.15003240440699936, + "grad_norm": 0.7157705426216125, + "learning_rate": 9.989531633133379e-06, + "loss": 0.1995, + "step": 463 + }, + { + "epoch": 0.15035644847699287, + "grad_norm": 0.6821460127830505, + "learning_rate": 9.989418195774058e-06, + "loss": 0.2009, + "step": 464 + }, + { + "epoch": 0.1506804925469864, + "grad_norm": 0.8093439340591431, + "learning_rate": 9.989304147758254e-06, + "loss": 0.2163, + "step": 465 + }, + { + "epoch": 0.1510045366169799, + "grad_norm": 0.7202951908111572, + "learning_rate": 9.989189489099921e-06, + "loss": 0.2036, + "step": 466 + }, + { + "epoch": 0.15132858068697344, + "grad_norm": 0.6870996356010437, + "learning_rate": 9.989074219813093e-06, + "loss": 0.1963, + "step": 467 + }, + { + "epoch": 0.15165262475696695, + "grad_norm": 0.6940014362335205, + "learning_rate": 9.98895833991188e-06, + "loss": 0.1812, + "step": 468 + }, + { + "epoch": 0.15197666882696045, + "grad_norm": 0.6710031628608704, + "learning_rate": 9.98884184941046e-06, + "loss": 0.182, + "step": 469 + }, + { + "epoch": 0.152300712896954, + "grad_norm": 0.637710690498352, + "learning_rate": 9.988724748323096e-06, + "loss": 0.1757, + "step": 470 + }, + { + "epoch": 0.1526247569669475, + "grad_norm": 0.6982388496398926, + "learning_rate": 9.988607036664118e-06, + "loss": 0.2097, + "step": 471 + }, + { + "epoch": 0.15294880103694103, + "grad_norm": 0.6776319742202759, + "learning_rate": 9.98848871444793e-06, + "loss": 0.2038, + "step": 472 + }, + { + "epoch": 0.15327284510693454, + "grad_norm": 0.7191901803016663, + "learning_rate": 9.988369781689018e-06, + "loss": 0.1931, + "step": 473 + }, + { + "epoch": 0.15359688917692807, + "grad_norm": 0.6711001992225647, + "learning_rate": 9.988250238401933e-06, + "loss": 0.1883, + "step": 474 + }, + { + "epoch": 0.15392093324692158, + "grad_norm": 0.7274355888366699, + "learning_rate": 9.98813008460131e-06, + "loss": 0.2202, + "step": 475 + }, + { + "epoch": 0.1542449773169151, + "grad_norm": 0.7254967093467712, + "learning_rate": 9.988009320301854e-06, + "loss": 0.22, + "step": 476 + }, + { + "epoch": 0.15456902138690862, + "grad_norm": 0.6807737350463867, + "learning_rate": 9.987887945518346e-06, + "loss": 0.2028, + "step": 477 + }, + { + "epoch": 0.15489306545690215, + "grad_norm": 0.6453770399093628, + "learning_rate": 9.987765960265639e-06, + "loss": 0.1875, + "step": 478 + }, + { + "epoch": 0.15521710952689566, + "grad_norm": 0.6426407098770142, + "learning_rate": 9.987643364558664e-06, + "loss": 0.1741, + "step": 479 + }, + { + "epoch": 0.15554115359688916, + "grad_norm": 0.673290491104126, + "learning_rate": 9.987520158412424e-06, + "loss": 0.1934, + "step": 480 + }, + { + "epoch": 0.1558651976668827, + "grad_norm": 0.7302583456039429, + "learning_rate": 9.987396341841999e-06, + "loss": 0.2002, + "step": 481 + }, + { + "epoch": 0.1561892417368762, + "grad_norm": 0.7037186026573181, + "learning_rate": 9.987271914862547e-06, + "loss": 0.2137, + "step": 482 + }, + { + "epoch": 0.15651328580686974, + "grad_norm": 0.6902836561203003, + "learning_rate": 9.98714687748929e-06, + "loss": 0.1804, + "step": 483 + }, + { + "epoch": 0.15683732987686325, + "grad_norm": 0.6849866509437561, + "learning_rate": 9.987021229737535e-06, + "loss": 0.2028, + "step": 484 + }, + { + "epoch": 0.15716137394685678, + "grad_norm": 0.645197331905365, + "learning_rate": 9.98689497162266e-06, + "loss": 0.1771, + "step": 485 + }, + { + "epoch": 0.1574854180168503, + "grad_norm": 0.7027577757835388, + "learning_rate": 9.986768103160119e-06, + "loss": 0.1971, + "step": 486 + }, + { + "epoch": 0.15780946208684382, + "grad_norm": 0.7298611998558044, + "learning_rate": 9.986640624365436e-06, + "loss": 0.2195, + "step": 487 + }, + { + "epoch": 0.15813350615683733, + "grad_norm": 0.6794098615646362, + "learning_rate": 9.986512535254215e-06, + "loss": 0.2028, + "step": 488 + }, + { + "epoch": 0.15845755022683086, + "grad_norm": 0.7098777294158936, + "learning_rate": 9.986383835842133e-06, + "loss": 0.2034, + "step": 489 + }, + { + "epoch": 0.15878159429682437, + "grad_norm": 0.6703172326087952, + "learning_rate": 9.986254526144941e-06, + "loss": 0.1852, + "step": 490 + }, + { + "epoch": 0.15910563836681788, + "grad_norm": 0.6819800138473511, + "learning_rate": 9.986124606178466e-06, + "loss": 0.19, + "step": 491 + }, + { + "epoch": 0.1594296824368114, + "grad_norm": 0.6795838475227356, + "learning_rate": 9.985994075958609e-06, + "loss": 0.2027, + "step": 492 + }, + { + "epoch": 0.15975372650680492, + "grad_norm": 0.7014299631118774, + "learning_rate": 9.985862935501346e-06, + "loss": 0.2033, + "step": 493 + }, + { + "epoch": 0.16007777057679845, + "grad_norm": 0.7198210954666138, + "learning_rate": 9.985731184822724e-06, + "loss": 0.1794, + "step": 494 + }, + { + "epoch": 0.16040181464679196, + "grad_norm": 0.6685338616371155, + "learning_rate": 9.985598823938873e-06, + "loss": 0.1948, + "step": 495 + }, + { + "epoch": 0.1607258587167855, + "grad_norm": 0.6871777772903442, + "learning_rate": 9.98546585286599e-06, + "loss": 0.1727, + "step": 496 + }, + { + "epoch": 0.161049902786779, + "grad_norm": 0.7162487506866455, + "learning_rate": 9.985332271620349e-06, + "loss": 0.2239, + "step": 497 + }, + { + "epoch": 0.16137394685677253, + "grad_norm": 0.6949173808097839, + "learning_rate": 9.985198080218301e-06, + "loss": 0.2019, + "step": 498 + }, + { + "epoch": 0.16169799092676604, + "grad_norm": 0.690800130367279, + "learning_rate": 9.985063278676266e-06, + "loss": 0.1974, + "step": 499 + }, + { + "epoch": 0.16202203499675957, + "grad_norm": 0.7009753584861755, + "learning_rate": 9.984927867010748e-06, + "loss": 0.2018, + "step": 500 + }, + { + "epoch": 0.16234607906675308, + "grad_norm": 0.6902207136154175, + "learning_rate": 9.984791845238315e-06, + "loss": 0.1825, + "step": 501 + }, + { + "epoch": 0.16267012313674659, + "grad_norm": 0.6592951416969299, + "learning_rate": 9.984655213375615e-06, + "loss": 0.2048, + "step": 502 + }, + { + "epoch": 0.16299416720674012, + "grad_norm": 0.7007604241371155, + "learning_rate": 9.984517971439375e-06, + "loss": 0.2017, + "step": 503 + }, + { + "epoch": 0.16331821127673363, + "grad_norm": 0.6989747881889343, + "learning_rate": 9.984380119446388e-06, + "loss": 0.2144, + "step": 504 + }, + { + "epoch": 0.16364225534672716, + "grad_norm": 0.6398130059242249, + "learning_rate": 9.984241657413526e-06, + "loss": 0.1808, + "step": 505 + }, + { + "epoch": 0.16396629941672067, + "grad_norm": 0.6692783236503601, + "learning_rate": 9.98410258535774e-06, + "loss": 0.1968, + "step": 506 + }, + { + "epoch": 0.1642903434867142, + "grad_norm": 0.6372814774513245, + "learning_rate": 9.983962903296044e-06, + "loss": 0.186, + "step": 507 + }, + { + "epoch": 0.1646143875567077, + "grad_norm": 0.6585953235626221, + "learning_rate": 9.983822611245537e-06, + "loss": 0.1843, + "step": 508 + }, + { + "epoch": 0.16493843162670124, + "grad_norm": 0.6987937688827515, + "learning_rate": 9.983681709223392e-06, + "loss": 0.1974, + "step": 509 + }, + { + "epoch": 0.16526247569669475, + "grad_norm": 0.6531558632850647, + "learning_rate": 9.98354019724685e-06, + "loss": 0.1742, + "step": 510 + }, + { + "epoch": 0.16558651976668826, + "grad_norm": 0.6775031089782715, + "learning_rate": 9.983398075333231e-06, + "loss": 0.1885, + "step": 511 + }, + { + "epoch": 0.1659105638366818, + "grad_norm": 0.6820382475852966, + "learning_rate": 9.983255343499932e-06, + "loss": 0.1743, + "step": 512 + }, + { + "epoch": 0.1662346079066753, + "grad_norm": 0.672417402267456, + "learning_rate": 9.983112001764421e-06, + "loss": 0.1927, + "step": 513 + }, + { + "epoch": 0.16655865197666883, + "grad_norm": 0.6865410208702087, + "learning_rate": 9.98296805014424e-06, + "loss": 0.1925, + "step": 514 + }, + { + "epoch": 0.16688269604666234, + "grad_norm": 0.6832920908927917, + "learning_rate": 9.98282348865701e-06, + "loss": 0.2008, + "step": 515 + }, + { + "epoch": 0.16720674011665587, + "grad_norm": 0.6926618814468384, + "learning_rate": 9.982678317320423e-06, + "loss": 0.2011, + "step": 516 + }, + { + "epoch": 0.16753078418664938, + "grad_norm": 0.6344230771064758, + "learning_rate": 9.982532536152242e-06, + "loss": 0.19, + "step": 517 + }, + { + "epoch": 0.1678548282566429, + "grad_norm": 0.6745365858078003, + "learning_rate": 9.982386145170317e-06, + "loss": 0.2045, + "step": 518 + }, + { + "epoch": 0.16817887232663642, + "grad_norm": 0.6516968011856079, + "learning_rate": 9.98223914439256e-06, + "loss": 0.196, + "step": 519 + }, + { + "epoch": 0.16850291639662995, + "grad_norm": 0.6410155892372131, + "learning_rate": 9.982091533836964e-06, + "loss": 0.1841, + "step": 520 + }, + { + "epoch": 0.16882696046662346, + "grad_norm": 0.6878783702850342, + "learning_rate": 9.981943313521594e-06, + "loss": 0.1866, + "step": 521 + }, + { + "epoch": 0.16915100453661697, + "grad_norm": 0.7064847946166992, + "learning_rate": 9.981794483464592e-06, + "loss": 0.1985, + "step": 522 + }, + { + "epoch": 0.1694750486066105, + "grad_norm": 0.700658917427063, + "learning_rate": 9.981645043684172e-06, + "loss": 0.2151, + "step": 523 + }, + { + "epoch": 0.169799092676604, + "grad_norm": 0.663432776927948, + "learning_rate": 9.981494994198624e-06, + "loss": 0.216, + "step": 524 + }, + { + "epoch": 0.17012313674659754, + "grad_norm": 0.6575750708580017, + "learning_rate": 9.981344335026316e-06, + "loss": 0.1759, + "step": 525 + }, + { + "epoch": 0.17044718081659105, + "grad_norm": 0.7086677551269531, + "learning_rate": 9.981193066185682e-06, + "loss": 0.2133, + "step": 526 + }, + { + "epoch": 0.17077122488658458, + "grad_norm": 0.6817197799682617, + "learning_rate": 9.981041187695239e-06, + "loss": 0.1996, + "step": 527 + }, + { + "epoch": 0.1710952689565781, + "grad_norm": 0.6739192605018616, + "learning_rate": 9.980888699573576e-06, + "loss": 0.2071, + "step": 528 + }, + { + "epoch": 0.17141931302657162, + "grad_norm": 0.7066790461540222, + "learning_rate": 9.980735601839354e-06, + "loss": 0.1997, + "step": 529 + }, + { + "epoch": 0.17174335709656513, + "grad_norm": 0.5890233516693115, + "learning_rate": 9.980581894511313e-06, + "loss": 0.1704, + "step": 530 + }, + { + "epoch": 0.17206740116655866, + "grad_norm": 0.7064293026924133, + "learning_rate": 9.980427577608261e-06, + "loss": 0.2072, + "step": 531 + }, + { + "epoch": 0.17239144523655217, + "grad_norm": 0.6045970916748047, + "learning_rate": 9.98027265114909e-06, + "loss": 0.1747, + "step": 532 + }, + { + "epoch": 0.17271548930654568, + "grad_norm": 0.6477690935134888, + "learning_rate": 9.980117115152758e-06, + "loss": 0.166, + "step": 533 + }, + { + "epoch": 0.1730395333765392, + "grad_norm": 0.6647665500640869, + "learning_rate": 9.979960969638303e-06, + "loss": 0.1954, + "step": 534 + }, + { + "epoch": 0.17336357744653272, + "grad_norm": 0.6770102977752686, + "learning_rate": 9.979804214624835e-06, + "loss": 0.2032, + "step": 535 + }, + { + "epoch": 0.17368762151652625, + "grad_norm": 0.695477306842804, + "learning_rate": 9.979646850131539e-06, + "loss": 0.2023, + "step": 536 + }, + { + "epoch": 0.17401166558651976, + "grad_norm": 0.6723573207855225, + "learning_rate": 9.979488876177676e-06, + "loss": 0.1991, + "step": 537 + }, + { + "epoch": 0.1743357096565133, + "grad_norm": 0.6520310044288635, + "learning_rate": 9.97933029278258e-06, + "loss": 0.1836, + "step": 538 + }, + { + "epoch": 0.1746597537265068, + "grad_norm": 0.6617633700370789, + "learning_rate": 9.97917109996566e-06, + "loss": 0.2047, + "step": 539 + }, + { + "epoch": 0.17498379779650033, + "grad_norm": 0.6775153279304504, + "learning_rate": 9.979011297746396e-06, + "loss": 0.1946, + "step": 540 + }, + { + "epoch": 0.17530784186649384, + "grad_norm": 0.7150312066078186, + "learning_rate": 9.978850886144353e-06, + "loss": 0.2092, + "step": 541 + }, + { + "epoch": 0.17563188593648738, + "grad_norm": 0.6678467988967896, + "learning_rate": 9.978689865179161e-06, + "loss": 0.1901, + "step": 542 + }, + { + "epoch": 0.17595593000648088, + "grad_norm": 0.6910053491592407, + "learning_rate": 9.978528234870526e-06, + "loss": 0.2106, + "step": 543 + }, + { + "epoch": 0.1762799740764744, + "grad_norm": 0.6965245604515076, + "learning_rate": 9.978365995238231e-06, + "loss": 0.195, + "step": 544 + }, + { + "epoch": 0.17660401814646792, + "grad_norm": 0.7062773108482361, + "learning_rate": 9.978203146302133e-06, + "loss": 0.2061, + "step": 545 + }, + { + "epoch": 0.17692806221646143, + "grad_norm": 0.6020629405975342, + "learning_rate": 9.978039688082161e-06, + "loss": 0.1698, + "step": 546 + }, + { + "epoch": 0.17725210628645496, + "grad_norm": 0.6427762508392334, + "learning_rate": 9.977875620598323e-06, + "loss": 0.1747, + "step": 547 + }, + { + "epoch": 0.17757615035644847, + "grad_norm": 0.7219037413597107, + "learning_rate": 9.9777109438707e-06, + "loss": 0.1879, + "step": 548 + }, + { + "epoch": 0.177900194426442, + "grad_norm": 0.6616566777229309, + "learning_rate": 9.977545657919444e-06, + "loss": 0.1941, + "step": 549 + }, + { + "epoch": 0.1782242384964355, + "grad_norm": 0.6537032723426819, + "learning_rate": 9.977379762764785e-06, + "loss": 0.181, + "step": 550 + }, + { + "epoch": 0.17854828256642905, + "grad_norm": 0.6982765793800354, + "learning_rate": 9.97721325842703e-06, + "loss": 0.2023, + "step": 551 + }, + { + "epoch": 0.17887232663642255, + "grad_norm": 0.6358122229576111, + "learning_rate": 9.977046144926555e-06, + "loss": 0.1618, + "step": 552 + }, + { + "epoch": 0.17919637070641609, + "grad_norm": 0.6477236151695251, + "learning_rate": 9.976878422283811e-06, + "loss": 0.1782, + "step": 553 + }, + { + "epoch": 0.1795204147764096, + "grad_norm": 0.6369034647941589, + "learning_rate": 9.97671009051933e-06, + "loss": 0.1872, + "step": 554 + }, + { + "epoch": 0.1798444588464031, + "grad_norm": 0.679492175579071, + "learning_rate": 9.976541149653714e-06, + "loss": 0.185, + "step": 555 + }, + { + "epoch": 0.18016850291639663, + "grad_norm": 0.6448071599006653, + "learning_rate": 9.976371599707635e-06, + "loss": 0.1879, + "step": 556 + }, + { + "epoch": 0.18049254698639014, + "grad_norm": 0.6247570514678955, + "learning_rate": 9.976201440701848e-06, + "loss": 0.1812, + "step": 557 + }, + { + "epoch": 0.18081659105638367, + "grad_norm": 0.6881939172744751, + "learning_rate": 9.976030672657177e-06, + "loss": 0.1984, + "step": 558 + }, + { + "epoch": 0.18114063512637718, + "grad_norm": 0.6799702048301697, + "learning_rate": 9.975859295594526e-06, + "loss": 0.1981, + "step": 559 + }, + { + "epoch": 0.18146467919637072, + "grad_norm": 0.6491541862487793, + "learning_rate": 9.975687309534865e-06, + "loss": 0.1663, + "step": 560 + }, + { + "epoch": 0.18178872326636422, + "grad_norm": 0.6550642251968384, + "learning_rate": 9.975514714499247e-06, + "loss": 0.1861, + "step": 561 + }, + { + "epoch": 0.18211276733635776, + "grad_norm": 0.6994182467460632, + "learning_rate": 9.975341510508793e-06, + "loss": 0.2076, + "step": 562 + }, + { + "epoch": 0.18243681140635126, + "grad_norm": 0.692759096622467, + "learning_rate": 9.975167697584706e-06, + "loss": 0.2181, + "step": 563 + }, + { + "epoch": 0.18276085547634477, + "grad_norm": 0.6435503363609314, + "learning_rate": 9.974993275748253e-06, + "loss": 0.18, + "step": 564 + }, + { + "epoch": 0.1830848995463383, + "grad_norm": 0.6955358982086182, + "learning_rate": 9.974818245020788e-06, + "loss": 0.2005, + "step": 565 + }, + { + "epoch": 0.1834089436163318, + "grad_norm": 0.6618521213531494, + "learning_rate": 9.974642605423727e-06, + "loss": 0.2049, + "step": 566 + }, + { + "epoch": 0.18373298768632534, + "grad_norm": 0.6213456392288208, + "learning_rate": 9.97446635697857e-06, + "loss": 0.1781, + "step": 567 + }, + { + "epoch": 0.18405703175631885, + "grad_norm": 0.6700173616409302, + "learning_rate": 9.974289499706888e-06, + "loss": 0.1862, + "step": 568 + }, + { + "epoch": 0.18438107582631239, + "grad_norm": 0.6925329566001892, + "learning_rate": 9.974112033630326e-06, + "loss": 0.2066, + "step": 569 + }, + { + "epoch": 0.1847051198963059, + "grad_norm": 0.6212252378463745, + "learning_rate": 9.973933958770604e-06, + "loss": 0.1817, + "step": 570 + }, + { + "epoch": 0.18502916396629943, + "grad_norm": 0.6753741502761841, + "learning_rate": 9.973755275149516e-06, + "loss": 0.2056, + "step": 571 + }, + { + "epoch": 0.18535320803629293, + "grad_norm": 0.687791645526886, + "learning_rate": 9.973575982788934e-06, + "loss": 0.1862, + "step": 572 + }, + { + "epoch": 0.18567725210628647, + "grad_norm": 0.6519484519958496, + "learning_rate": 9.9733960817108e-06, + "loss": 0.1567, + "step": 573 + }, + { + "epoch": 0.18600129617627997, + "grad_norm": 0.700171709060669, + "learning_rate": 9.97321557193713e-06, + "loss": 0.1883, + "step": 574 + }, + { + "epoch": 0.18632534024627348, + "grad_norm": 0.6499386429786682, + "learning_rate": 9.973034453490017e-06, + "loss": 0.1906, + "step": 575 + }, + { + "epoch": 0.18664938431626701, + "grad_norm": 0.6470776200294495, + "learning_rate": 9.972852726391633e-06, + "loss": 0.196, + "step": 576 + }, + { + "epoch": 0.18697342838626052, + "grad_norm": 0.6618337631225586, + "learning_rate": 9.972670390664214e-06, + "loss": 0.1812, + "step": 577 + }, + { + "epoch": 0.18729747245625405, + "grad_norm": 0.6731171607971191, + "learning_rate": 9.972487446330079e-06, + "loss": 0.1869, + "step": 578 + }, + { + "epoch": 0.18762151652624756, + "grad_norm": 0.6692624092102051, + "learning_rate": 9.972303893411619e-06, + "loss": 0.1912, + "step": 579 + }, + { + "epoch": 0.1879455605962411, + "grad_norm": 0.6499179601669312, + "learning_rate": 9.972119731931298e-06, + "loss": 0.2035, + "step": 580 + }, + { + "epoch": 0.1882696046662346, + "grad_norm": 0.6587321162223816, + "learning_rate": 9.971934961911653e-06, + "loss": 0.1904, + "step": 581 + }, + { + "epoch": 0.18859364873622814, + "grad_norm": 0.657788872718811, + "learning_rate": 9.971749583375303e-06, + "loss": 0.1948, + "step": 582 + }, + { + "epoch": 0.18891769280622164, + "grad_norm": 0.6337559819221497, + "learning_rate": 9.971563596344934e-06, + "loss": 0.1925, + "step": 583 + }, + { + "epoch": 0.18924173687621518, + "grad_norm": 0.6866735219955444, + "learning_rate": 9.971377000843309e-06, + "loss": 0.1981, + "step": 584 + }, + { + "epoch": 0.18956578094620868, + "grad_norm": 0.7488305568695068, + "learning_rate": 9.971189796893266e-06, + "loss": 0.1973, + "step": 585 + }, + { + "epoch": 0.1898898250162022, + "grad_norm": 0.7035614848136902, + "learning_rate": 9.971001984517717e-06, + "loss": 0.1853, + "step": 586 + }, + { + "epoch": 0.19021386908619572, + "grad_norm": 0.6761707663536072, + "learning_rate": 9.970813563739647e-06, + "loss": 0.1879, + "step": 587 + }, + { + "epoch": 0.19053791315618923, + "grad_norm": 0.6215554475784302, + "learning_rate": 9.97062453458212e-06, + "loss": 0.1721, + "step": 588 + }, + { + "epoch": 0.19086195722618277, + "grad_norm": 0.6934155225753784, + "learning_rate": 9.970434897068268e-06, + "loss": 0.2102, + "step": 589 + }, + { + "epoch": 0.19118600129617627, + "grad_norm": 0.6098140478134155, + "learning_rate": 9.970244651221302e-06, + "loss": 0.1685, + "step": 590 + }, + { + "epoch": 0.1915100453661698, + "grad_norm": 0.7065131068229675, + "learning_rate": 9.970053797064506e-06, + "loss": 0.2038, + "step": 591 + }, + { + "epoch": 0.1918340894361633, + "grad_norm": 0.6565381288528442, + "learning_rate": 9.96986233462124e-06, + "loss": 0.1781, + "step": 592 + }, + { + "epoch": 0.19215813350615685, + "grad_norm": 0.642196774482727, + "learning_rate": 9.969670263914936e-06, + "loss": 0.1783, + "step": 593 + }, + { + "epoch": 0.19248217757615035, + "grad_norm": 0.6986721158027649, + "learning_rate": 9.9694775849691e-06, + "loss": 0.2, + "step": 594 + }, + { + "epoch": 0.1928062216461439, + "grad_norm": 0.6566448211669922, + "learning_rate": 9.969284297807319e-06, + "loss": 0.1821, + "step": 595 + }, + { + "epoch": 0.1931302657161374, + "grad_norm": 0.7404976487159729, + "learning_rate": 9.969090402453246e-06, + "loss": 0.2051, + "step": 596 + }, + { + "epoch": 0.1934543097861309, + "grad_norm": 0.7691987156867981, + "learning_rate": 9.96889589893061e-06, + "loss": 0.1989, + "step": 597 + }, + { + "epoch": 0.19377835385612444, + "grad_norm": 0.6925591230392456, + "learning_rate": 9.96870078726322e-06, + "loss": 0.1992, + "step": 598 + }, + { + "epoch": 0.19410239792611794, + "grad_norm": 0.6497508883476257, + "learning_rate": 9.968505067474954e-06, + "loss": 0.1855, + "step": 599 + }, + { + "epoch": 0.19442644199611148, + "grad_norm": 0.6990604996681213, + "learning_rate": 9.968308739589767e-06, + "loss": 0.1962, + "step": 600 + }, + { + "epoch": 0.19475048606610498, + "grad_norm": 0.6696603894233704, + "learning_rate": 9.968111803631688e-06, + "loss": 0.1942, + "step": 601 + }, + { + "epoch": 0.19507453013609852, + "grad_norm": 0.6557125449180603, + "learning_rate": 9.967914259624817e-06, + "loss": 0.1787, + "step": 602 + }, + { + "epoch": 0.19539857420609202, + "grad_norm": 0.6662872433662415, + "learning_rate": 9.967716107593335e-06, + "loss": 0.1853, + "step": 603 + }, + { + "epoch": 0.19572261827608556, + "grad_norm": 0.6978006958961487, + "learning_rate": 9.967517347561493e-06, + "loss": 0.2104, + "step": 604 + }, + { + "epoch": 0.19604666234607906, + "grad_norm": 0.6399781107902527, + "learning_rate": 9.967317979553617e-06, + "loss": 0.1894, + "step": 605 + }, + { + "epoch": 0.1963707064160726, + "grad_norm": 0.6516302824020386, + "learning_rate": 9.967118003594107e-06, + "loss": 0.1792, + "step": 606 + }, + { + "epoch": 0.1966947504860661, + "grad_norm": 0.6929931044578552, + "learning_rate": 9.966917419707439e-06, + "loss": 0.2056, + "step": 607 + }, + { + "epoch": 0.1970187945560596, + "grad_norm": 0.6575304269790649, + "learning_rate": 9.966716227918163e-06, + "loss": 0.1844, + "step": 608 + }, + { + "epoch": 0.19734283862605315, + "grad_norm": 0.6778562664985657, + "learning_rate": 9.966514428250902e-06, + "loss": 0.1927, + "step": 609 + }, + { + "epoch": 0.19766688269604665, + "grad_norm": 0.6473074555397034, + "learning_rate": 9.966312020730353e-06, + "loss": 0.2007, + "step": 610 + }, + { + "epoch": 0.1979909267660402, + "grad_norm": 0.6930627226829529, + "learning_rate": 9.966109005381292e-06, + "loss": 0.2148, + "step": 611 + }, + { + "epoch": 0.1983149708360337, + "grad_norm": 0.7073266506195068, + "learning_rate": 9.965905382228565e-06, + "loss": 0.176, + "step": 612 + }, + { + "epoch": 0.19863901490602723, + "grad_norm": 0.6246760487556458, + "learning_rate": 9.96570115129709e-06, + "loss": 0.1628, + "step": 613 + }, + { + "epoch": 0.19896305897602073, + "grad_norm": 0.6829102039337158, + "learning_rate": 9.965496312611869e-06, + "loss": 0.1851, + "step": 614 + }, + { + "epoch": 0.19928710304601427, + "grad_norm": 0.647975504398346, + "learning_rate": 9.965290866197967e-06, + "loss": 0.1891, + "step": 615 + }, + { + "epoch": 0.19961114711600778, + "grad_norm": 0.6556454300880432, + "learning_rate": 9.965084812080533e-06, + "loss": 0.1792, + "step": 616 + }, + { + "epoch": 0.1999351911860013, + "grad_norm": 0.6431975960731506, + "learning_rate": 9.96487815028478e-06, + "loss": 0.1978, + "step": 617 + }, + { + "epoch": 0.20025923525599482, + "grad_norm": 0.62309730052948, + "learning_rate": 9.964670880836009e-06, + "loss": 0.1811, + "step": 618 + }, + { + "epoch": 0.20058327932598832, + "grad_norm": 0.705092191696167, + "learning_rate": 9.96446300375958e-06, + "loss": 0.1945, + "step": 619 + }, + { + "epoch": 0.20090732339598186, + "grad_norm": 0.6346619725227356, + "learning_rate": 9.964254519080943e-06, + "loss": 0.1764, + "step": 620 + }, + { + "epoch": 0.20123136746597536, + "grad_norm": 0.6662610769271851, + "learning_rate": 9.964045426825609e-06, + "loss": 0.1881, + "step": 621 + }, + { + "epoch": 0.2015554115359689, + "grad_norm": 0.7057406306266785, + "learning_rate": 9.96383572701917e-06, + "loss": 0.2075, + "step": 622 + }, + { + "epoch": 0.2018794556059624, + "grad_norm": 0.6177229285240173, + "learning_rate": 9.963625419687292e-06, + "loss": 0.1937, + "step": 623 + }, + { + "epoch": 0.20220349967595594, + "grad_norm": 0.6827666759490967, + "learning_rate": 9.963414504855714e-06, + "loss": 0.1938, + "step": 624 + }, + { + "epoch": 0.20252754374594945, + "grad_norm": 0.6624761819839478, + "learning_rate": 9.96320298255025e-06, + "loss": 0.1962, + "step": 625 + }, + { + "epoch": 0.20285158781594298, + "grad_norm": 0.6548133492469788, + "learning_rate": 9.96299085279679e-06, + "loss": 0.1857, + "step": 626 + }, + { + "epoch": 0.2031756318859365, + "grad_norm": 0.6573795080184937, + "learning_rate": 9.962778115621294e-06, + "loss": 0.1887, + "step": 627 + }, + { + "epoch": 0.20349967595593, + "grad_norm": 0.6657042503356934, + "learning_rate": 9.962564771049799e-06, + "loss": 0.1888, + "step": 628 + }, + { + "epoch": 0.20382372002592353, + "grad_norm": 0.691179096698761, + "learning_rate": 9.96235081910842e-06, + "loss": 0.1902, + "step": 629 + }, + { + "epoch": 0.20414776409591703, + "grad_norm": 0.6811088919639587, + "learning_rate": 9.962136259823337e-06, + "loss": 0.1788, + "step": 630 + }, + { + "epoch": 0.20447180816591057, + "grad_norm": 0.6676942110061646, + "learning_rate": 9.961921093220815e-06, + "loss": 0.1877, + "step": 631 + }, + { + "epoch": 0.20479585223590407, + "grad_norm": 0.6651305556297302, + "learning_rate": 9.961705319327186e-06, + "loss": 0.1868, + "step": 632 + }, + { + "epoch": 0.2051198963058976, + "grad_norm": 0.6629343628883362, + "learning_rate": 9.961488938168859e-06, + "loss": 0.1705, + "step": 633 + }, + { + "epoch": 0.20544394037589112, + "grad_norm": 0.6603417992591858, + "learning_rate": 9.961271949772316e-06, + "loss": 0.1831, + "step": 634 + }, + { + "epoch": 0.20576798444588465, + "grad_norm": 0.7644823789596558, + "learning_rate": 9.961054354164116e-06, + "loss": 0.2072, + "step": 635 + }, + { + "epoch": 0.20609202851587816, + "grad_norm": 0.690443217754364, + "learning_rate": 9.96083615137089e-06, + "loss": 0.2103, + "step": 636 + }, + { + "epoch": 0.2064160725858717, + "grad_norm": 0.6906901597976685, + "learning_rate": 9.960617341419342e-06, + "loss": 0.2025, + "step": 637 + }, + { + "epoch": 0.2067401166558652, + "grad_norm": 0.6573625206947327, + "learning_rate": 9.960397924336256e-06, + "loss": 0.1952, + "step": 638 + }, + { + "epoch": 0.2070641607258587, + "grad_norm": 0.6751140356063843, + "learning_rate": 9.960177900148483e-06, + "loss": 0.2044, + "step": 639 + }, + { + "epoch": 0.20738820479585224, + "grad_norm": 0.6589855551719666, + "learning_rate": 9.959957268882954e-06, + "loss": 0.2033, + "step": 640 + }, + { + "epoch": 0.20771224886584574, + "grad_norm": 0.6459863781929016, + "learning_rate": 9.959736030566672e-06, + "loss": 0.177, + "step": 641 + }, + { + "epoch": 0.20803629293583928, + "grad_norm": 0.6876140236854553, + "learning_rate": 9.959514185226714e-06, + "loss": 0.2077, + "step": 642 + }, + { + "epoch": 0.20836033700583279, + "grad_norm": 0.6192476749420166, + "learning_rate": 9.959291732890228e-06, + "loss": 0.186, + "step": 643 + }, + { + "epoch": 0.20868438107582632, + "grad_norm": 0.683696985244751, + "learning_rate": 9.959068673584447e-06, + "loss": 0.2169, + "step": 644 + }, + { + "epoch": 0.20900842514581983, + "grad_norm": 0.6164054870605469, + "learning_rate": 9.958845007336667e-06, + "loss": 0.1758, + "step": 645 + }, + { + "epoch": 0.20933246921581336, + "grad_norm": 0.6763623356819153, + "learning_rate": 9.958620734174263e-06, + "loss": 0.1731, + "step": 646 + }, + { + "epoch": 0.20965651328580687, + "grad_norm": 0.6871886849403381, + "learning_rate": 9.958395854124686e-06, + "loss": 0.1931, + "step": 647 + }, + { + "epoch": 0.2099805573558004, + "grad_norm": 0.6257924437522888, + "learning_rate": 9.958170367215456e-06, + "loss": 0.1886, + "step": 648 + }, + { + "epoch": 0.2103046014257939, + "grad_norm": 0.6464048027992249, + "learning_rate": 9.957944273474172e-06, + "loss": 0.1945, + "step": 649 + }, + { + "epoch": 0.21062864549578741, + "grad_norm": 0.6848668456077576, + "learning_rate": 9.957717572928504e-06, + "loss": 0.2027, + "step": 650 + }, + { + "epoch": 0.21095268956578095, + "grad_norm": 0.6994589567184448, + "learning_rate": 9.957490265606202e-06, + "loss": 0.1994, + "step": 651 + }, + { + "epoch": 0.21127673363577446, + "grad_norm": 0.7209458947181702, + "learning_rate": 9.957262351535085e-06, + "loss": 0.2014, + "step": 652 + }, + { + "epoch": 0.211600777705768, + "grad_norm": 0.6271886825561523, + "learning_rate": 9.957033830743043e-06, + "loss": 0.171, + "step": 653 + }, + { + "epoch": 0.2119248217757615, + "grad_norm": 0.6906067728996277, + "learning_rate": 9.956804703258048e-06, + "loss": 0.1983, + "step": 654 + }, + { + "epoch": 0.21224886584575503, + "grad_norm": 0.6128396987915039, + "learning_rate": 9.956574969108143e-06, + "loss": 0.1756, + "step": 655 + }, + { + "epoch": 0.21257290991574854, + "grad_norm": 0.695341169834137, + "learning_rate": 9.956344628321448e-06, + "loss": 0.2226, + "step": 656 + }, + { + "epoch": 0.21289695398574207, + "grad_norm": 0.6387926936149597, + "learning_rate": 9.956113680926149e-06, + "loss": 0.1929, + "step": 657 + }, + { + "epoch": 0.21322099805573558, + "grad_norm": 0.6557097434997559, + "learning_rate": 9.955882126950516e-06, + "loss": 0.1995, + "step": 658 + }, + { + "epoch": 0.2135450421257291, + "grad_norm": 0.6979703903198242, + "learning_rate": 9.955649966422886e-06, + "loss": 0.192, + "step": 659 + }, + { + "epoch": 0.21386908619572262, + "grad_norm": 0.6957211494445801, + "learning_rate": 9.955417199371674e-06, + "loss": 0.2079, + "step": 660 + }, + { + "epoch": 0.21419313026571613, + "grad_norm": 0.6489683389663696, + "learning_rate": 9.95518382582537e-06, + "loss": 0.1781, + "step": 661 + }, + { + "epoch": 0.21451717433570966, + "grad_norm": 0.6822352409362793, + "learning_rate": 9.954949845812536e-06, + "loss": 0.1976, + "step": 662 + }, + { + "epoch": 0.21484121840570317, + "grad_norm": 0.7062362432479858, + "learning_rate": 9.954715259361806e-06, + "loss": 0.1958, + "step": 663 + }, + { + "epoch": 0.2151652624756967, + "grad_norm": 0.6700916886329651, + "learning_rate": 9.954480066501896e-06, + "loss": 0.2005, + "step": 664 + }, + { + "epoch": 0.2154893065456902, + "grad_norm": 0.5978602170944214, + "learning_rate": 9.95424426726159e-06, + "loss": 0.173, + "step": 665 + }, + { + "epoch": 0.21581335061568374, + "grad_norm": 0.6139827966690063, + "learning_rate": 9.954007861669745e-06, + "loss": 0.1801, + "step": 666 + }, + { + "epoch": 0.21613739468567725, + "grad_norm": 0.6229221224784851, + "learning_rate": 9.953770849755295e-06, + "loss": 0.1825, + "step": 667 + }, + { + "epoch": 0.21646143875567078, + "grad_norm": 0.6633325815200806, + "learning_rate": 9.953533231547251e-06, + "loss": 0.1875, + "step": 668 + }, + { + "epoch": 0.2167854828256643, + "grad_norm": 0.666135847568512, + "learning_rate": 9.953295007074693e-06, + "loss": 0.1946, + "step": 669 + }, + { + "epoch": 0.21710952689565782, + "grad_norm": 0.6448487639427185, + "learning_rate": 9.953056176366777e-06, + "loss": 0.1795, + "step": 670 + }, + { + "epoch": 0.21743357096565133, + "grad_norm": 0.6257104873657227, + "learning_rate": 9.952816739452735e-06, + "loss": 0.1689, + "step": 671 + }, + { + "epoch": 0.21775761503564484, + "grad_norm": 0.7119507193565369, + "learning_rate": 9.95257669636187e-06, + "loss": 0.1887, + "step": 672 + }, + { + "epoch": 0.21808165910563837, + "grad_norm": 0.6453043818473816, + "learning_rate": 9.952336047123565e-06, + "loss": 0.1945, + "step": 673 + }, + { + "epoch": 0.21840570317563188, + "grad_norm": 0.6554058790206909, + "learning_rate": 9.952094791767267e-06, + "loss": 0.1848, + "step": 674 + }, + { + "epoch": 0.2187297472456254, + "grad_norm": 0.678406298160553, + "learning_rate": 9.951852930322507e-06, + "loss": 0.1989, + "step": 675 + }, + { + "epoch": 0.21905379131561892, + "grad_norm": 0.641863226890564, + "learning_rate": 9.951610462818888e-06, + "loss": 0.1936, + "step": 676 + }, + { + "epoch": 0.21937783538561245, + "grad_norm": 0.6582136154174805, + "learning_rate": 9.951367389286082e-06, + "loss": 0.1813, + "step": 677 + }, + { + "epoch": 0.21970187945560596, + "grad_norm": 0.622146725654602, + "learning_rate": 9.95112370975384e-06, + "loss": 0.1771, + "step": 678 + }, + { + "epoch": 0.2200259235255995, + "grad_norm": 0.6097822189331055, + "learning_rate": 9.950879424251987e-06, + "loss": 0.1697, + "step": 679 + }, + { + "epoch": 0.220349967595593, + "grad_norm": 0.6120715141296387, + "learning_rate": 9.950634532810421e-06, + "loss": 0.1921, + "step": 680 + }, + { + "epoch": 0.22067401166558653, + "grad_norm": 0.672498881816864, + "learning_rate": 9.950389035459114e-06, + "loss": 0.1979, + "step": 681 + }, + { + "epoch": 0.22099805573558004, + "grad_norm": 0.6239397525787354, + "learning_rate": 9.950142932228114e-06, + "loss": 0.1761, + "step": 682 + }, + { + "epoch": 0.22132209980557355, + "grad_norm": 0.647966742515564, + "learning_rate": 9.949896223147537e-06, + "loss": 0.1774, + "step": 683 + }, + { + "epoch": 0.22164614387556708, + "grad_norm": 0.6499220728874207, + "learning_rate": 9.949648908247583e-06, + "loss": 0.1943, + "step": 684 + }, + { + "epoch": 0.2219701879455606, + "grad_norm": 0.6279643774032593, + "learning_rate": 9.94940098755852e-06, + "loss": 0.1695, + "step": 685 + }, + { + "epoch": 0.22229423201555412, + "grad_norm": 0.6042583584785461, + "learning_rate": 9.949152461110688e-06, + "loss": 0.1764, + "step": 686 + }, + { + "epoch": 0.22261827608554763, + "grad_norm": 0.6498985886573792, + "learning_rate": 9.948903328934507e-06, + "loss": 0.1863, + "step": 687 + }, + { + "epoch": 0.22294232015554116, + "grad_norm": 0.6190524101257324, + "learning_rate": 9.948653591060468e-06, + "loss": 0.1696, + "step": 688 + }, + { + "epoch": 0.22326636422553467, + "grad_norm": 0.7013121247291565, + "learning_rate": 9.948403247519135e-06, + "loss": 0.1951, + "step": 689 + }, + { + "epoch": 0.2235904082955282, + "grad_norm": 0.6156609654426575, + "learning_rate": 9.94815229834115e-06, + "loss": 0.1852, + "step": 690 + }, + { + "epoch": 0.2239144523655217, + "grad_norm": 0.6260215640068054, + "learning_rate": 9.947900743557224e-06, + "loss": 0.1715, + "step": 691 + }, + { + "epoch": 0.22423849643551522, + "grad_norm": 0.6351551413536072, + "learning_rate": 9.947648583198148e-06, + "loss": 0.1885, + "step": 692 + }, + { + "epoch": 0.22456254050550875, + "grad_norm": 0.6523791551589966, + "learning_rate": 9.947395817294781e-06, + "loss": 0.1877, + "step": 693 + }, + { + "epoch": 0.22488658457550226, + "grad_norm": 0.6115183234214783, + "learning_rate": 9.947142445878062e-06, + "loss": 0.1754, + "step": 694 + }, + { + "epoch": 0.2252106286454958, + "grad_norm": 0.6498876810073853, + "learning_rate": 9.946888468978999e-06, + "loss": 0.1869, + "step": 695 + }, + { + "epoch": 0.2255346727154893, + "grad_norm": 0.6169878244400024, + "learning_rate": 9.946633886628676e-06, + "loss": 0.1929, + "step": 696 + }, + { + "epoch": 0.22585871678548283, + "grad_norm": 0.6811516880989075, + "learning_rate": 9.946378698858255e-06, + "loss": 0.2001, + "step": 697 + }, + { + "epoch": 0.22618276085547634, + "grad_norm": 0.6628179550170898, + "learning_rate": 9.946122905698962e-06, + "loss": 0.1813, + "step": 698 + }, + { + "epoch": 0.22650680492546987, + "grad_norm": 0.6483691334724426, + "learning_rate": 9.945866507182112e-06, + "loss": 0.1712, + "step": 699 + }, + { + "epoch": 0.22683084899546338, + "grad_norm": 0.6645535230636597, + "learning_rate": 9.945609503339078e-06, + "loss": 0.184, + "step": 700 + }, + { + "epoch": 0.22715489306545691, + "grad_norm": 0.6483830213546753, + "learning_rate": 9.945351894201318e-06, + "loss": 0.1954, + "step": 701 + }, + { + "epoch": 0.22747893713545042, + "grad_norm": 0.6874852180480957, + "learning_rate": 9.945093679800363e-06, + "loss": 0.1936, + "step": 702 + }, + { + "epoch": 0.22780298120544393, + "grad_norm": 0.6744926571846008, + "learning_rate": 9.944834860167812e-06, + "loss": 0.2008, + "step": 703 + }, + { + "epoch": 0.22812702527543746, + "grad_norm": 0.7001661062240601, + "learning_rate": 9.944575435335344e-06, + "loss": 0.1946, + "step": 704 + }, + { + "epoch": 0.22845106934543097, + "grad_norm": 0.6359503865242004, + "learning_rate": 9.944315405334712e-06, + "loss": 0.1725, + "step": 705 + }, + { + "epoch": 0.2287751134154245, + "grad_norm": 0.628150224685669, + "learning_rate": 9.944054770197736e-06, + "loss": 0.1774, + "step": 706 + }, + { + "epoch": 0.229099157485418, + "grad_norm": 0.6892385482788086, + "learning_rate": 9.94379352995632e-06, + "loss": 0.1859, + "step": 707 + }, + { + "epoch": 0.22942320155541154, + "grad_norm": 0.6501577496528625, + "learning_rate": 9.943531684642435e-06, + "loss": 0.1847, + "step": 708 + }, + { + "epoch": 0.22974724562540505, + "grad_norm": 0.626654863357544, + "learning_rate": 9.94326923428813e-06, + "loss": 0.1867, + "step": 709 + }, + { + "epoch": 0.23007128969539858, + "grad_norm": 0.6670397520065308, + "learning_rate": 9.943006178925525e-06, + "loss": 0.1871, + "step": 710 + }, + { + "epoch": 0.2303953337653921, + "grad_norm": 0.6162786483764648, + "learning_rate": 9.942742518586815e-06, + "loss": 0.1661, + "step": 711 + }, + { + "epoch": 0.23071937783538563, + "grad_norm": 0.65184485912323, + "learning_rate": 9.94247825330427e-06, + "loss": 0.1869, + "step": 712 + }, + { + "epoch": 0.23104342190537913, + "grad_norm": 0.6438986659049988, + "learning_rate": 9.942213383110232e-06, + "loss": 0.1847, + "step": 713 + }, + { + "epoch": 0.23136746597537264, + "grad_norm": 0.6406044960021973, + "learning_rate": 9.941947908037123e-06, + "loss": 0.1969, + "step": 714 + }, + { + "epoch": 0.23169151004536617, + "grad_norm": 0.6293928027153015, + "learning_rate": 9.941681828117432e-06, + "loss": 0.1732, + "step": 715 + }, + { + "epoch": 0.23201555411535968, + "grad_norm": 0.5930849313735962, + "learning_rate": 9.941415143383723e-06, + "loss": 0.1592, + "step": 716 + }, + { + "epoch": 0.2323395981853532, + "grad_norm": 0.6804683208465576, + "learning_rate": 9.941147853868638e-06, + "loss": 0.1954, + "step": 717 + }, + { + "epoch": 0.23266364225534672, + "grad_norm": 0.6410490274429321, + "learning_rate": 9.94087995960489e-06, + "loss": 0.1986, + "step": 718 + }, + { + "epoch": 0.23298768632534025, + "grad_norm": 0.6373780965805054, + "learning_rate": 9.940611460625264e-06, + "loss": 0.185, + "step": 719 + }, + { + "epoch": 0.23331173039533376, + "grad_norm": 0.6732212901115417, + "learning_rate": 9.940342356962627e-06, + "loss": 0.1935, + "step": 720 + }, + { + "epoch": 0.2336357744653273, + "grad_norm": 0.6203884482383728, + "learning_rate": 9.94007264864991e-06, + "loss": 0.1661, + "step": 721 + }, + { + "epoch": 0.2339598185353208, + "grad_norm": 0.6237133145332336, + "learning_rate": 9.939802335720126e-06, + "loss": 0.1768, + "step": 722 + }, + { + "epoch": 0.23428386260531434, + "grad_norm": 0.6512324213981628, + "learning_rate": 9.939531418206355e-06, + "loss": 0.1828, + "step": 723 + }, + { + "epoch": 0.23460790667530784, + "grad_norm": 0.6099259853363037, + "learning_rate": 9.939259896141757e-06, + "loss": 0.1684, + "step": 724 + }, + { + "epoch": 0.23493195074530135, + "grad_norm": 0.6380852460861206, + "learning_rate": 9.938987769559565e-06, + "loss": 0.1808, + "step": 725 + }, + { + "epoch": 0.23525599481529488, + "grad_norm": 0.6275023818016052, + "learning_rate": 9.938715038493083e-06, + "loss": 0.1911, + "step": 726 + }, + { + "epoch": 0.2355800388852884, + "grad_norm": 0.6486274600028992, + "learning_rate": 9.938441702975689e-06, + "loss": 0.1885, + "step": 727 + }, + { + "epoch": 0.23590408295528192, + "grad_norm": 0.6302095055580139, + "learning_rate": 9.93816776304084e-06, + "loss": 0.1861, + "step": 728 + }, + { + "epoch": 0.23622812702527543, + "grad_norm": 0.600139319896698, + "learning_rate": 9.937893218722062e-06, + "loss": 0.1736, + "step": 729 + }, + { + "epoch": 0.23655217109526896, + "grad_norm": 0.5886365175247192, + "learning_rate": 9.937618070052954e-06, + "loss": 0.1683, + "step": 730 + }, + { + "epoch": 0.23687621516526247, + "grad_norm": 0.68076491355896, + "learning_rate": 9.937342317067197e-06, + "loss": 0.1949, + "step": 731 + }, + { + "epoch": 0.237200259235256, + "grad_norm": 0.5942913293838501, + "learning_rate": 9.937065959798538e-06, + "loss": 0.1755, + "step": 732 + }, + { + "epoch": 0.2375243033052495, + "grad_norm": 0.6711556315422058, + "learning_rate": 9.936788998280797e-06, + "loss": 0.1861, + "step": 733 + }, + { + "epoch": 0.23784834737524305, + "grad_norm": 0.6712557077407837, + "learning_rate": 9.936511432547877e-06, + "loss": 0.1827, + "step": 734 + }, + { + "epoch": 0.23817239144523655, + "grad_norm": 0.6791927814483643, + "learning_rate": 9.936233262633746e-06, + "loss": 0.1791, + "step": 735 + }, + { + "epoch": 0.23849643551523006, + "grad_norm": 0.6469034552574158, + "learning_rate": 9.93595448857245e-06, + "loss": 0.1856, + "step": 736 + }, + { + "epoch": 0.2388204795852236, + "grad_norm": 0.6158778667449951, + "learning_rate": 9.935675110398107e-06, + "loss": 0.1594, + "step": 737 + }, + { + "epoch": 0.2391445236552171, + "grad_norm": 0.6263102293014526, + "learning_rate": 9.935395128144914e-06, + "loss": 0.1907, + "step": 738 + }, + { + "epoch": 0.23946856772521063, + "grad_norm": 0.6166470050811768, + "learning_rate": 9.935114541847133e-06, + "loss": 0.1925, + "step": 739 + }, + { + "epoch": 0.23979261179520414, + "grad_norm": 0.6434534192085266, + "learning_rate": 9.93483335153911e-06, + "loss": 0.1972, + "step": 740 + }, + { + "epoch": 0.24011665586519768, + "grad_norm": 0.6516762971878052, + "learning_rate": 9.934551557255257e-06, + "loss": 0.1889, + "step": 741 + }, + { + "epoch": 0.24044069993519118, + "grad_norm": 0.6424468755722046, + "learning_rate": 9.934269159030064e-06, + "loss": 0.2006, + "step": 742 + }, + { + "epoch": 0.24076474400518472, + "grad_norm": 0.6505478620529175, + "learning_rate": 9.933986156898092e-06, + "loss": 0.1934, + "step": 743 + }, + { + "epoch": 0.24108878807517822, + "grad_norm": 0.628738522529602, + "learning_rate": 9.93370255089398e-06, + "loss": 0.1779, + "step": 744 + }, + { + "epoch": 0.24141283214517173, + "grad_norm": 0.6392041444778442, + "learning_rate": 9.933418341052437e-06, + "loss": 0.1792, + "step": 745 + }, + { + "epoch": 0.24173687621516526, + "grad_norm": 0.6310383081436157, + "learning_rate": 9.933133527408248e-06, + "loss": 0.1881, + "step": 746 + }, + { + "epoch": 0.24206092028515877, + "grad_norm": 0.6338793635368347, + "learning_rate": 9.932848109996273e-06, + "loss": 0.2015, + "step": 747 + }, + { + "epoch": 0.2423849643551523, + "grad_norm": 0.6490474343299866, + "learning_rate": 9.932562088851444e-06, + "loss": 0.1904, + "step": 748 + }, + { + "epoch": 0.2427090084251458, + "grad_norm": 0.6636795997619629, + "learning_rate": 9.932275464008763e-06, + "loss": 0.1894, + "step": 749 + }, + { + "epoch": 0.24303305249513935, + "grad_norm": 0.589769184589386, + "learning_rate": 9.931988235503316e-06, + "loss": 0.1749, + "step": 750 + }, + { + "epoch": 0.24335709656513285, + "grad_norm": 0.6744698882102966, + "learning_rate": 9.931700403370253e-06, + "loss": 0.2061, + "step": 751 + }, + { + "epoch": 0.2436811406351264, + "grad_norm": 0.648323118686676, + "learning_rate": 9.931411967644802e-06, + "loss": 0.1867, + "step": 752 + }, + { + "epoch": 0.2440051847051199, + "grad_norm": 0.6055197715759277, + "learning_rate": 9.931122928362268e-06, + "loss": 0.1821, + "step": 753 + }, + { + "epoch": 0.24432922877511343, + "grad_norm": 0.596246600151062, + "learning_rate": 9.930833285558024e-06, + "loss": 0.1658, + "step": 754 + }, + { + "epoch": 0.24465327284510693, + "grad_norm": 0.6292993426322937, + "learning_rate": 9.93054303926752e-06, + "loss": 0.1836, + "step": 755 + }, + { + "epoch": 0.24497731691510044, + "grad_norm": 0.6322280764579773, + "learning_rate": 9.930252189526279e-06, + "loss": 0.1798, + "step": 756 + }, + { + "epoch": 0.24530136098509397, + "grad_norm": 0.5868021249771118, + "learning_rate": 9.929960736369899e-06, + "loss": 0.1647, + "step": 757 + }, + { + "epoch": 0.24562540505508748, + "grad_norm": 0.6106928586959839, + "learning_rate": 9.929668679834051e-06, + "loss": 0.1824, + "step": 758 + }, + { + "epoch": 0.24594944912508102, + "grad_norm": 0.6683313250541687, + "learning_rate": 9.929376019954478e-06, + "loss": 0.1863, + "step": 759 + }, + { + "epoch": 0.24627349319507452, + "grad_norm": 0.6021707653999329, + "learning_rate": 9.929082756767001e-06, + "loss": 0.1621, + "step": 760 + }, + { + "epoch": 0.24659753726506806, + "grad_norm": 0.6000505685806274, + "learning_rate": 9.928788890307513e-06, + "loss": 0.1626, + "step": 761 + }, + { + "epoch": 0.24692158133506156, + "grad_norm": 0.6177734732627869, + "learning_rate": 9.928494420611977e-06, + "loss": 0.1775, + "step": 762 + }, + { + "epoch": 0.2472456254050551, + "grad_norm": 0.6518813371658325, + "learning_rate": 9.928199347716439e-06, + "loss": 0.1734, + "step": 763 + }, + { + "epoch": 0.2475696694750486, + "grad_norm": 0.6121587157249451, + "learning_rate": 9.927903671657007e-06, + "loss": 0.1739, + "step": 764 + }, + { + "epoch": 0.24789371354504214, + "grad_norm": 0.6971595883369446, + "learning_rate": 9.92760739246987e-06, + "loss": 0.1897, + "step": 765 + }, + { + "epoch": 0.24821775761503564, + "grad_norm": 0.6398004293441772, + "learning_rate": 9.927310510191293e-06, + "loss": 0.1878, + "step": 766 + }, + { + "epoch": 0.24854180168502915, + "grad_norm": 0.6389423608779907, + "learning_rate": 9.927013024857609e-06, + "loss": 0.1824, + "step": 767 + }, + { + "epoch": 0.24886584575502269, + "grad_norm": 0.6484429836273193, + "learning_rate": 9.926714936505228e-06, + "loss": 0.196, + "step": 768 + }, + { + "epoch": 0.2491898898250162, + "grad_norm": 0.6396540403366089, + "learning_rate": 9.926416245170633e-06, + "loss": 0.1971, + "step": 769 + }, + { + "epoch": 0.24951393389500973, + "grad_norm": 0.6225264668464661, + "learning_rate": 9.926116950890381e-06, + "loss": 0.2028, + "step": 770 + }, + { + "epoch": 0.24983797796500323, + "grad_norm": 0.6579481363296509, + "learning_rate": 9.925817053701103e-06, + "loss": 0.189, + "step": 771 + }, + { + "epoch": 0.25016202203499677, + "grad_norm": 0.6142616271972656, + "learning_rate": 9.925516553639503e-06, + "loss": 0.177, + "step": 772 + }, + { + "epoch": 0.2504860661049903, + "grad_norm": 0.625681459903717, + "learning_rate": 9.925215450742361e-06, + "loss": 0.1819, + "step": 773 + }, + { + "epoch": 0.2508101101749838, + "grad_norm": 0.62339186668396, + "learning_rate": 9.924913745046526e-06, + "loss": 0.1753, + "step": 774 + }, + { + "epoch": 0.2511341542449773, + "grad_norm": 0.5808535814285278, + "learning_rate": 9.924611436588924e-06, + "loss": 0.1787, + "step": 775 + }, + { + "epoch": 0.25145819831497085, + "grad_norm": 0.6875622868537903, + "learning_rate": 9.92430852540656e-06, + "loss": 0.1908, + "step": 776 + }, + { + "epoch": 0.2517822423849643, + "grad_norm": 0.6806918978691101, + "learning_rate": 9.924005011536501e-06, + "loss": 0.1945, + "step": 777 + }, + { + "epoch": 0.25210628645495786, + "grad_norm": 0.6135112643241882, + "learning_rate": 9.923700895015896e-06, + "loss": 0.1734, + "step": 778 + }, + { + "epoch": 0.2524303305249514, + "grad_norm": 0.6002721786499023, + "learning_rate": 9.923396175881968e-06, + "loss": 0.1703, + "step": 779 + }, + { + "epoch": 0.25275437459494493, + "grad_norm": 0.6776991486549377, + "learning_rate": 9.923090854172011e-06, + "loss": 0.1878, + "step": 780 + }, + { + "epoch": 0.2530784186649384, + "grad_norm": 0.6653709411621094, + "learning_rate": 9.92278492992339e-06, + "loss": 0.1758, + "step": 781 + }, + { + "epoch": 0.25340246273493194, + "grad_norm": 0.5887860059738159, + "learning_rate": 9.922478403173553e-06, + "loss": 0.179, + "step": 782 + }, + { + "epoch": 0.2537265068049255, + "grad_norm": 0.6444964408874512, + "learning_rate": 9.92217127396001e-06, + "loss": 0.2074, + "step": 783 + }, + { + "epoch": 0.254050550874919, + "grad_norm": 0.6349043250083923, + "learning_rate": 9.921863542320354e-06, + "loss": 0.188, + "step": 784 + }, + { + "epoch": 0.2543745949449125, + "grad_norm": 0.6593102216720581, + "learning_rate": 9.921555208292248e-06, + "loss": 0.1977, + "step": 785 + }, + { + "epoch": 0.254698639014906, + "grad_norm": 0.6344327926635742, + "learning_rate": 9.921246271913429e-06, + "loss": 0.1729, + "step": 786 + }, + { + "epoch": 0.25502268308489956, + "grad_norm": 0.6412277221679688, + "learning_rate": 9.920936733221708e-06, + "loss": 0.186, + "step": 787 + }, + { + "epoch": 0.25534672715489304, + "grad_norm": 0.6043070554733276, + "learning_rate": 9.92062659225497e-06, + "loss": 0.1722, + "step": 788 + }, + { + "epoch": 0.2556707712248866, + "grad_norm": 0.5798800587654114, + "learning_rate": 9.92031584905117e-06, + "loss": 0.1584, + "step": 789 + }, + { + "epoch": 0.2559948152948801, + "grad_norm": 0.5982307195663452, + "learning_rate": 9.920004503648344e-06, + "loss": 0.1774, + "step": 790 + }, + { + "epoch": 0.25631885936487364, + "grad_norm": 0.6444849967956543, + "learning_rate": 9.919692556084596e-06, + "loss": 0.1786, + "step": 791 + }, + { + "epoch": 0.2566429034348671, + "grad_norm": 0.6478809118270874, + "learning_rate": 9.919380006398105e-06, + "loss": 0.2069, + "step": 792 + }, + { + "epoch": 0.25696694750486065, + "grad_norm": 0.6714723110198975, + "learning_rate": 9.919066854627124e-06, + "loss": 0.1943, + "step": 793 + }, + { + "epoch": 0.2572909915748542, + "grad_norm": 0.5991042256355286, + "learning_rate": 9.91875310080998e-06, + "loss": 0.1685, + "step": 794 + }, + { + "epoch": 0.2576150356448477, + "grad_norm": 0.6321060657501221, + "learning_rate": 9.918438744985078e-06, + "loss": 0.1737, + "step": 795 + }, + { + "epoch": 0.2579390797148412, + "grad_norm": 0.618497371673584, + "learning_rate": 9.918123787190883e-06, + "loss": 0.1788, + "step": 796 + }, + { + "epoch": 0.25826312378483474, + "grad_norm": 0.6653442978858948, + "learning_rate": 9.91780822746595e-06, + "loss": 0.2012, + "step": 797 + }, + { + "epoch": 0.25858716785482827, + "grad_norm": 0.6287248730659485, + "learning_rate": 9.917492065848898e-06, + "loss": 0.1664, + "step": 798 + }, + { + "epoch": 0.25891121192482175, + "grad_norm": 0.6177788972854614, + "learning_rate": 9.91717530237842e-06, + "loss": 0.1842, + "step": 799 + }, + { + "epoch": 0.2592352559948153, + "grad_norm": 0.6276010274887085, + "learning_rate": 9.916857937093289e-06, + "loss": 0.1832, + "step": 800 + }, + { + "epoch": 0.2595593000648088, + "grad_norm": 0.6513774991035461, + "learning_rate": 9.916539970032344e-06, + "loss": 0.1848, + "step": 801 + }, + { + "epoch": 0.25988334413480235, + "grad_norm": 0.5932650566101074, + "learning_rate": 9.916221401234502e-06, + "loss": 0.1633, + "step": 802 + }, + { + "epoch": 0.26020738820479583, + "grad_norm": 0.66648930311203, + "learning_rate": 9.915902230738756e-06, + "loss": 0.1735, + "step": 803 + }, + { + "epoch": 0.26053143227478937, + "grad_norm": 0.5852811932563782, + "learning_rate": 9.915582458584164e-06, + "loss": 0.1667, + "step": 804 + }, + { + "epoch": 0.2608554763447829, + "grad_norm": 0.6674695611000061, + "learning_rate": 9.915262084809868e-06, + "loss": 0.184, + "step": 805 + }, + { + "epoch": 0.26117952041477643, + "grad_norm": 0.6292879581451416, + "learning_rate": 9.914941109455072e-06, + "loss": 0.1807, + "step": 806 + }, + { + "epoch": 0.2615035644847699, + "grad_norm": 0.6502755880355835, + "learning_rate": 9.914619532559069e-06, + "loss": 0.1969, + "step": 807 + }, + { + "epoch": 0.26182760855476345, + "grad_norm": 0.6338673830032349, + "learning_rate": 9.91429735416121e-06, + "loss": 0.1726, + "step": 808 + }, + { + "epoch": 0.262151652624757, + "grad_norm": 0.6485314965248108, + "learning_rate": 9.91397457430093e-06, + "loss": 0.1916, + "step": 809 + }, + { + "epoch": 0.26247569669475046, + "grad_norm": 0.624607264995575, + "learning_rate": 9.913651193017733e-06, + "loss": 0.1833, + "step": 810 + }, + { + "epoch": 0.262799740764744, + "grad_norm": 0.5967383980751038, + "learning_rate": 9.913327210351197e-06, + "loss": 0.1801, + "step": 811 + }, + { + "epoch": 0.26312378483473753, + "grad_norm": 0.638460636138916, + "learning_rate": 9.913002626340975e-06, + "loss": 0.198, + "step": 812 + }, + { + "epoch": 0.26344782890473106, + "grad_norm": 0.6392395496368408, + "learning_rate": 9.912677441026794e-06, + "loss": 0.1975, + "step": 813 + }, + { + "epoch": 0.26377187297472454, + "grad_norm": 0.6315566301345825, + "learning_rate": 9.912351654448453e-06, + "loss": 0.1935, + "step": 814 + }, + { + "epoch": 0.2640959170447181, + "grad_norm": 0.6048063635826111, + "learning_rate": 9.912025266645824e-06, + "loss": 0.1761, + "step": 815 + }, + { + "epoch": 0.2644199611147116, + "grad_norm": 0.5906796455383301, + "learning_rate": 9.911698277658855e-06, + "loss": 0.1623, + "step": 816 + }, + { + "epoch": 0.26474400518470514, + "grad_norm": 0.5543292760848999, + "learning_rate": 9.911370687527564e-06, + "loss": 0.1599, + "step": 817 + }, + { + "epoch": 0.2650680492546986, + "grad_norm": 0.5940440893173218, + "learning_rate": 9.911042496292049e-06, + "loss": 0.1726, + "step": 818 + }, + { + "epoch": 0.26539209332469216, + "grad_norm": 0.5984013080596924, + "learning_rate": 9.910713703992473e-06, + "loss": 0.177, + "step": 819 + }, + { + "epoch": 0.2657161373946857, + "grad_norm": 0.6281025409698486, + "learning_rate": 9.910384310669078e-06, + "loss": 0.1788, + "step": 820 + }, + { + "epoch": 0.26604018146467917, + "grad_norm": 0.6289580464363098, + "learning_rate": 9.910054316362184e-06, + "loss": 0.1905, + "step": 821 + }, + { + "epoch": 0.2663642255346727, + "grad_norm": 0.6344680786132812, + "learning_rate": 9.909723721112171e-06, + "loss": 0.19, + "step": 822 + }, + { + "epoch": 0.26668826960466624, + "grad_norm": 0.6040329337120056, + "learning_rate": 9.909392524959506e-06, + "loss": 0.1877, + "step": 823 + }, + { + "epoch": 0.2670123136746598, + "grad_norm": 0.5945601463317871, + "learning_rate": 9.909060727944721e-06, + "loss": 0.167, + "step": 824 + }, + { + "epoch": 0.26733635774465325, + "grad_norm": 0.6815995573997498, + "learning_rate": 9.908728330108428e-06, + "loss": 0.1922, + "step": 825 + }, + { + "epoch": 0.2676604018146468, + "grad_norm": 0.641444742679596, + "learning_rate": 9.908395331491307e-06, + "loss": 0.1734, + "step": 826 + }, + { + "epoch": 0.2679844458846403, + "grad_norm": 0.6499939560890198, + "learning_rate": 9.908061732134115e-06, + "loss": 0.168, + "step": 827 + }, + { + "epoch": 0.26830848995463386, + "grad_norm": 0.682487428188324, + "learning_rate": 9.90772753207768e-06, + "loss": 0.1719, + "step": 828 + }, + { + "epoch": 0.26863253402462733, + "grad_norm": 0.672627329826355, + "learning_rate": 9.907392731362905e-06, + "loss": 0.1951, + "step": 829 + }, + { + "epoch": 0.26895657809462087, + "grad_norm": 0.6161004900932312, + "learning_rate": 9.907057330030766e-06, + "loss": 0.1712, + "step": 830 + }, + { + "epoch": 0.2692806221646144, + "grad_norm": 0.6189830303192139, + "learning_rate": 9.906721328122317e-06, + "loss": 0.173, + "step": 831 + }, + { + "epoch": 0.2696046662346079, + "grad_norm": 0.6291322708129883, + "learning_rate": 9.906384725678676e-06, + "loss": 0.1867, + "step": 832 + }, + { + "epoch": 0.2699287103046014, + "grad_norm": 0.6070644855499268, + "learning_rate": 9.906047522741042e-06, + "loss": 0.1755, + "step": 833 + }, + { + "epoch": 0.27025275437459495, + "grad_norm": 0.6049726605415344, + "learning_rate": 9.905709719350686e-06, + "loss": 0.1635, + "step": 834 + }, + { + "epoch": 0.2705767984445885, + "grad_norm": 0.6292316913604736, + "learning_rate": 9.905371315548952e-06, + "loss": 0.1946, + "step": 835 + }, + { + "epoch": 0.27090084251458196, + "grad_norm": 0.6292069554328918, + "learning_rate": 9.905032311377257e-06, + "loss": 0.1828, + "step": 836 + }, + { + "epoch": 0.2712248865845755, + "grad_norm": 0.6271639466285706, + "learning_rate": 9.904692706877089e-06, + "loss": 0.1862, + "step": 837 + }, + { + "epoch": 0.27154893065456903, + "grad_norm": 0.6449607014656067, + "learning_rate": 9.904352502090016e-06, + "loss": 0.1785, + "step": 838 + }, + { + "epoch": 0.27187297472456257, + "grad_norm": 0.6190186738967896, + "learning_rate": 9.904011697057675e-06, + "loss": 0.1756, + "step": 839 + }, + { + "epoch": 0.27219701879455604, + "grad_norm": 0.5763127207756042, + "learning_rate": 9.903670291821776e-06, + "loss": 0.1665, + "step": 840 + }, + { + "epoch": 0.2725210628645496, + "grad_norm": 0.6554551124572754, + "learning_rate": 9.903328286424105e-06, + "loss": 0.1853, + "step": 841 + }, + { + "epoch": 0.2728451069345431, + "grad_norm": 0.6263623833656311, + "learning_rate": 9.90298568090652e-06, + "loss": 0.185, + "step": 842 + }, + { + "epoch": 0.2731691510045366, + "grad_norm": 0.6207384467124939, + "learning_rate": 9.902642475310953e-06, + "loss": 0.161, + "step": 843 + }, + { + "epoch": 0.2734931950745301, + "grad_norm": 0.6042211651802063, + "learning_rate": 9.902298669679406e-06, + "loss": 0.1887, + "step": 844 + }, + { + "epoch": 0.27381723914452366, + "grad_norm": 0.6446455717086792, + "learning_rate": 9.901954264053961e-06, + "loss": 0.1849, + "step": 845 + }, + { + "epoch": 0.2741412832145172, + "grad_norm": 0.6734024882316589, + "learning_rate": 9.901609258476769e-06, + "loss": 0.1997, + "step": 846 + }, + { + "epoch": 0.2744653272845107, + "grad_norm": 0.6023538708686829, + "learning_rate": 9.901263652990054e-06, + "loss": 0.1664, + "step": 847 + }, + { + "epoch": 0.2747893713545042, + "grad_norm": 0.6193968653678894, + "learning_rate": 9.900917447636116e-06, + "loss": 0.175, + "step": 848 + }, + { + "epoch": 0.27511341542449774, + "grad_norm": 0.6214480996131897, + "learning_rate": 9.900570642457327e-06, + "loss": 0.1776, + "step": 849 + }, + { + "epoch": 0.2754374594944913, + "grad_norm": 0.5912626385688782, + "learning_rate": 9.900223237496134e-06, + "loss": 0.1607, + "step": 850 + }, + { + "epoch": 0.27576150356448476, + "grad_norm": 0.6071320176124573, + "learning_rate": 9.899875232795054e-06, + "loss": 0.1859, + "step": 851 + }, + { + "epoch": 0.2760855476344783, + "grad_norm": 0.5959563851356506, + "learning_rate": 9.899526628396678e-06, + "loss": 0.1746, + "step": 852 + }, + { + "epoch": 0.2764095917044718, + "grad_norm": 0.6067162752151489, + "learning_rate": 9.899177424343676e-06, + "loss": 0.1836, + "step": 853 + }, + { + "epoch": 0.2767336357744653, + "grad_norm": 0.6264017224311829, + "learning_rate": 9.898827620678784e-06, + "loss": 0.1792, + "step": 854 + }, + { + "epoch": 0.27705767984445884, + "grad_norm": 0.6678988933563232, + "learning_rate": 9.898477217444817e-06, + "loss": 0.1834, + "step": 855 + }, + { + "epoch": 0.27738172391445237, + "grad_norm": 0.6405592560768127, + "learning_rate": 9.898126214684658e-06, + "loss": 0.2035, + "step": 856 + }, + { + "epoch": 0.2777057679844459, + "grad_norm": 0.6418700814247131, + "learning_rate": 9.897774612441268e-06, + "loss": 0.204, + "step": 857 + }, + { + "epoch": 0.2780298120544394, + "grad_norm": 0.6275181174278259, + "learning_rate": 9.89742241075768e-06, + "loss": 0.169, + "step": 858 + }, + { + "epoch": 0.2783538561244329, + "grad_norm": 0.5936182141304016, + "learning_rate": 9.897069609677e-06, + "loss": 0.1709, + "step": 859 + }, + { + "epoch": 0.27867790019442645, + "grad_norm": 0.5788288712501526, + "learning_rate": 9.896716209242405e-06, + "loss": 0.1734, + "step": 860 + }, + { + "epoch": 0.27900194426442, + "grad_norm": 0.6212310791015625, + "learning_rate": 9.896362209497152e-06, + "loss": 0.1745, + "step": 861 + }, + { + "epoch": 0.27932598833441347, + "grad_norm": 0.6616557240486145, + "learning_rate": 9.896007610484564e-06, + "loss": 0.1905, + "step": 862 + }, + { + "epoch": 0.279650032404407, + "grad_norm": 0.629759669303894, + "learning_rate": 9.895652412248043e-06, + "loss": 0.1706, + "step": 863 + }, + { + "epoch": 0.27997407647440054, + "grad_norm": 0.5903453826904297, + "learning_rate": 9.895296614831058e-06, + "loss": 0.1826, + "step": 864 + }, + { + "epoch": 0.280298120544394, + "grad_norm": 0.6063392758369446, + "learning_rate": 9.894940218277158e-06, + "loss": 0.1534, + "step": 865 + }, + { + "epoch": 0.28062216461438755, + "grad_norm": 0.6047586798667908, + "learning_rate": 9.894583222629963e-06, + "loss": 0.1533, + "step": 866 + }, + { + "epoch": 0.2809462086843811, + "grad_norm": 0.6632272005081177, + "learning_rate": 9.894225627933165e-06, + "loss": 0.1935, + "step": 867 + }, + { + "epoch": 0.2812702527543746, + "grad_norm": 0.5961798429489136, + "learning_rate": 9.893867434230529e-06, + "loss": 0.1763, + "step": 868 + }, + { + "epoch": 0.2815942968243681, + "grad_norm": 0.6355636715888977, + "learning_rate": 9.893508641565896e-06, + "loss": 0.1849, + "step": 869 + }, + { + "epoch": 0.28191834089436163, + "grad_norm": 0.6150272488594055, + "learning_rate": 9.893149249983178e-06, + "loss": 0.1868, + "step": 870 + }, + { + "epoch": 0.28224238496435516, + "grad_norm": 0.7121578454971313, + "learning_rate": 9.892789259526361e-06, + "loss": 0.1916, + "step": 871 + }, + { + "epoch": 0.2825664290343487, + "grad_norm": 0.5670008063316345, + "learning_rate": 9.892428670239504e-06, + "loss": 0.1643, + "step": 872 + }, + { + "epoch": 0.2828904731043422, + "grad_norm": 0.5904400944709778, + "learning_rate": 9.89206748216674e-06, + "loss": 0.168, + "step": 873 + }, + { + "epoch": 0.2832145171743357, + "grad_norm": 0.6115680932998657, + "learning_rate": 9.891705695352276e-06, + "loss": 0.1887, + "step": 874 + }, + { + "epoch": 0.28353856124432925, + "grad_norm": 0.6273742914199829, + "learning_rate": 9.89134330984039e-06, + "loss": 0.1884, + "step": 875 + }, + { + "epoch": 0.2838626053143227, + "grad_norm": 0.6091495156288147, + "learning_rate": 9.890980325675436e-06, + "loss": 0.1782, + "step": 876 + }, + { + "epoch": 0.28418664938431626, + "grad_norm": 0.6677464246749878, + "learning_rate": 9.890616742901837e-06, + "loss": 0.1734, + "step": 877 + }, + { + "epoch": 0.2845106934543098, + "grad_norm": 0.7175049781799316, + "learning_rate": 9.890252561564094e-06, + "loss": 0.1725, + "step": 878 + }, + { + "epoch": 0.2848347375243033, + "grad_norm": 0.6149059534072876, + "learning_rate": 9.889887781706777e-06, + "loss": 0.1907, + "step": 879 + }, + { + "epoch": 0.2851587815942968, + "grad_norm": 0.6019971370697021, + "learning_rate": 9.889522403374536e-06, + "loss": 0.1764, + "step": 880 + }, + { + "epoch": 0.28548282566429034, + "grad_norm": 0.6241403222084045, + "learning_rate": 9.889156426612086e-06, + "loss": 0.1646, + "step": 881 + }, + { + "epoch": 0.2858068697342839, + "grad_norm": 0.6610787510871887, + "learning_rate": 9.88878985146422e-06, + "loss": 0.199, + "step": 882 + }, + { + "epoch": 0.2861309138042774, + "grad_norm": 0.6264455318450928, + "learning_rate": 9.888422677975801e-06, + "loss": 0.1943, + "step": 883 + }, + { + "epoch": 0.2864549578742709, + "grad_norm": 0.6266207098960876, + "learning_rate": 9.888054906191773e-06, + "loss": 0.1757, + "step": 884 + }, + { + "epoch": 0.2867790019442644, + "grad_norm": 0.5767809748649597, + "learning_rate": 9.887686536157145e-06, + "loss": 0.1659, + "step": 885 + }, + { + "epoch": 0.28710304601425796, + "grad_norm": 0.6064203977584839, + "learning_rate": 9.887317567917e-06, + "loss": 0.1764, + "step": 886 + }, + { + "epoch": 0.28742709008425144, + "grad_norm": 0.6159385442733765, + "learning_rate": 9.886948001516497e-06, + "loss": 0.1706, + "step": 887 + }, + { + "epoch": 0.28775113415424497, + "grad_norm": 0.6020685434341431, + "learning_rate": 9.886577837000869e-06, + "loss": 0.1836, + "step": 888 + }, + { + "epoch": 0.2880751782242385, + "grad_norm": 0.6020832657814026, + "learning_rate": 9.88620707441542e-06, + "loss": 0.1691, + "step": 889 + }, + { + "epoch": 0.28839922229423204, + "grad_norm": 0.5680634379386902, + "learning_rate": 9.885835713805526e-06, + "loss": 0.1566, + "step": 890 + }, + { + "epoch": 0.2887232663642255, + "grad_norm": 0.649798572063446, + "learning_rate": 9.885463755216638e-06, + "loss": 0.188, + "step": 891 + }, + { + "epoch": 0.28904731043421905, + "grad_norm": 0.5846118927001953, + "learning_rate": 9.885091198694283e-06, + "loss": 0.1695, + "step": 892 + }, + { + "epoch": 0.2893713545042126, + "grad_norm": 0.6667026877403259, + "learning_rate": 9.884718044284056e-06, + "loss": 0.185, + "step": 893 + }, + { + "epoch": 0.28969539857420606, + "grad_norm": 0.5462652444839478, + "learning_rate": 9.884344292031629e-06, + "loss": 0.1551, + "step": 894 + }, + { + "epoch": 0.2900194426441996, + "grad_norm": 0.6042917966842651, + "learning_rate": 9.883969941982744e-06, + "loss": 0.1671, + "step": 895 + }, + { + "epoch": 0.29034348671419313, + "grad_norm": 0.6363921165466309, + "learning_rate": 9.883594994183219e-06, + "loss": 0.1793, + "step": 896 + }, + { + "epoch": 0.29066753078418667, + "grad_norm": 0.6049078106880188, + "learning_rate": 9.883219448678945e-06, + "loss": 0.168, + "step": 897 + }, + { + "epoch": 0.29099157485418015, + "grad_norm": 0.6222366094589233, + "learning_rate": 9.88284330551588e-06, + "loss": 0.1908, + "step": 898 + }, + { + "epoch": 0.2913156189241737, + "grad_norm": 0.5962700843811035, + "learning_rate": 9.882466564740067e-06, + "loss": 0.1711, + "step": 899 + }, + { + "epoch": 0.2916396629941672, + "grad_norm": 0.6368878483772278, + "learning_rate": 9.882089226397614e-06, + "loss": 0.1841, + "step": 900 + }, + { + "epoch": 0.29196370706416075, + "grad_norm": 0.6085599660873413, + "learning_rate": 9.881711290534699e-06, + "loss": 0.1715, + "step": 901 + }, + { + "epoch": 0.29228775113415423, + "grad_norm": 0.567490816116333, + "learning_rate": 9.88133275719758e-06, + "loss": 0.1601, + "step": 902 + }, + { + "epoch": 0.29261179520414776, + "grad_norm": 0.6248665452003479, + "learning_rate": 9.880953626432588e-06, + "loss": 0.1835, + "step": 903 + }, + { + "epoch": 0.2929358392741413, + "grad_norm": 0.581226110458374, + "learning_rate": 9.880573898286123e-06, + "loss": 0.1779, + "step": 904 + }, + { + "epoch": 0.2932598833441348, + "grad_norm": 0.6184927225112915, + "learning_rate": 9.880193572804662e-06, + "loss": 0.1796, + "step": 905 + }, + { + "epoch": 0.2935839274141283, + "grad_norm": 0.6253811120986938, + "learning_rate": 9.879812650034748e-06, + "loss": 0.1709, + "step": 906 + }, + { + "epoch": 0.29390797148412184, + "grad_norm": 0.6019658446311951, + "learning_rate": 9.879431130023009e-06, + "loss": 0.1761, + "step": 907 + }, + { + "epoch": 0.2942320155541154, + "grad_norm": 0.597907304763794, + "learning_rate": 9.879049012816136e-06, + "loss": 0.1744, + "step": 908 + }, + { + "epoch": 0.29455605962410886, + "grad_norm": 0.5939311385154724, + "learning_rate": 9.878666298460895e-06, + "loss": 0.179, + "step": 909 + }, + { + "epoch": 0.2948801036941024, + "grad_norm": 0.5980244278907776, + "learning_rate": 9.878282987004131e-06, + "loss": 0.19, + "step": 910 + }, + { + "epoch": 0.2952041477640959, + "grad_norm": 0.6104245781898499, + "learning_rate": 9.877899078492752e-06, + "loss": 0.1696, + "step": 911 + }, + { + "epoch": 0.29552819183408946, + "grad_norm": 0.5415026545524597, + "learning_rate": 9.877514572973748e-06, + "loss": 0.1523, + "step": 912 + }, + { + "epoch": 0.29585223590408294, + "grad_norm": 0.6522943377494812, + "learning_rate": 9.87712947049418e-06, + "loss": 0.1712, + "step": 913 + }, + { + "epoch": 0.2961762799740765, + "grad_norm": 0.6318775415420532, + "learning_rate": 9.876743771101178e-06, + "loss": 0.1744, + "step": 914 + }, + { + "epoch": 0.29650032404407, + "grad_norm": 0.5871029496192932, + "learning_rate": 9.876357474841949e-06, + "loss": 0.1605, + "step": 915 + }, + { + "epoch": 0.2968243681140635, + "grad_norm": 0.5952814817428589, + "learning_rate": 9.875970581763771e-06, + "loss": 0.1761, + "step": 916 + }, + { + "epoch": 0.297148412184057, + "grad_norm": 0.5859237313270569, + "learning_rate": 9.875583091913999e-06, + "loss": 0.1583, + "step": 917 + }, + { + "epoch": 0.29747245625405055, + "grad_norm": 0.58506178855896, + "learning_rate": 9.875195005340054e-06, + "loss": 0.1576, + "step": 918 + }, + { + "epoch": 0.2977965003240441, + "grad_norm": 0.6198616027832031, + "learning_rate": 9.874806322089437e-06, + "loss": 0.1822, + "step": 919 + }, + { + "epoch": 0.29812054439403757, + "grad_norm": 0.5580076575279236, + "learning_rate": 9.874417042209717e-06, + "loss": 0.1594, + "step": 920 + }, + { + "epoch": 0.2984445884640311, + "grad_norm": 0.610212504863739, + "learning_rate": 9.874027165748538e-06, + "loss": 0.1668, + "step": 921 + }, + { + "epoch": 0.29876863253402464, + "grad_norm": 0.6204732656478882, + "learning_rate": 9.87363669275362e-06, + "loss": 0.179, + "step": 922 + }, + { + "epoch": 0.29909267660401817, + "grad_norm": 0.6051819920539856, + "learning_rate": 9.873245623272752e-06, + "loss": 0.1844, + "step": 923 + }, + { + "epoch": 0.29941672067401165, + "grad_norm": 0.6267421841621399, + "learning_rate": 9.872853957353794e-06, + "loss": 0.1676, + "step": 924 + }, + { + "epoch": 0.2997407647440052, + "grad_norm": 0.5782517790794373, + "learning_rate": 9.872461695044686e-06, + "loss": 0.1619, + "step": 925 + }, + { + "epoch": 0.3000648088139987, + "grad_norm": 0.6506045460700989, + "learning_rate": 9.872068836393437e-06, + "loss": 0.1812, + "step": 926 + }, + { + "epoch": 0.3003888528839922, + "grad_norm": 0.6031262874603271, + "learning_rate": 9.871675381448126e-06, + "loss": 0.1848, + "step": 927 + }, + { + "epoch": 0.30071289695398573, + "grad_norm": 0.6256006360054016, + "learning_rate": 9.871281330256907e-06, + "loss": 0.1712, + "step": 928 + }, + { + "epoch": 0.30103694102397927, + "grad_norm": 0.5751093626022339, + "learning_rate": 9.870886682868015e-06, + "loss": 0.1631, + "step": 929 + }, + { + "epoch": 0.3013609850939728, + "grad_norm": 0.6084175705909729, + "learning_rate": 9.870491439329745e-06, + "loss": 0.1729, + "step": 930 + }, + { + "epoch": 0.3016850291639663, + "grad_norm": 0.5739203691482544, + "learning_rate": 9.870095599690475e-06, + "loss": 0.1723, + "step": 931 + }, + { + "epoch": 0.3020090732339598, + "grad_norm": 0.6068074107170105, + "learning_rate": 9.869699163998647e-06, + "loss": 0.1681, + "step": 932 + }, + { + "epoch": 0.30233311730395335, + "grad_norm": 0.6015316843986511, + "learning_rate": 9.869302132302785e-06, + "loss": 0.1656, + "step": 933 + }, + { + "epoch": 0.3026571613739469, + "grad_norm": 0.624249279499054, + "learning_rate": 9.86890450465148e-06, + "loss": 0.1888, + "step": 934 + }, + { + "epoch": 0.30298120544394036, + "grad_norm": 0.6235883831977844, + "learning_rate": 9.8685062810934e-06, + "loss": 0.1834, + "step": 935 + }, + { + "epoch": 0.3033052495139339, + "grad_norm": 0.6266400218009949, + "learning_rate": 9.868107461677279e-06, + "loss": 0.1821, + "step": 936 + }, + { + "epoch": 0.30362929358392743, + "grad_norm": 0.6055128574371338, + "learning_rate": 9.867708046451933e-06, + "loss": 0.1687, + "step": 937 + }, + { + "epoch": 0.3039533376539209, + "grad_norm": 0.6110880970954895, + "learning_rate": 9.867308035466245e-06, + "loss": 0.1732, + "step": 938 + }, + { + "epoch": 0.30427738172391444, + "grad_norm": 0.5936911106109619, + "learning_rate": 9.866907428769175e-06, + "loss": 0.1693, + "step": 939 + }, + { + "epoch": 0.304601425793908, + "grad_norm": 0.5937286019325256, + "learning_rate": 9.866506226409748e-06, + "loss": 0.1866, + "step": 940 + }, + { + "epoch": 0.3049254698639015, + "grad_norm": 0.5901283621788025, + "learning_rate": 9.86610442843707e-06, + "loss": 0.1759, + "step": 941 + }, + { + "epoch": 0.305249513933895, + "grad_norm": 0.6732702255249023, + "learning_rate": 9.86570203490032e-06, + "loss": 0.1953, + "step": 942 + }, + { + "epoch": 0.3055735580038885, + "grad_norm": 0.6250772476196289, + "learning_rate": 9.865299045848744e-06, + "loss": 0.1822, + "step": 943 + }, + { + "epoch": 0.30589760207388206, + "grad_norm": 0.6017111539840698, + "learning_rate": 9.864895461331664e-06, + "loss": 0.1741, + "step": 944 + }, + { + "epoch": 0.3062216461438756, + "grad_norm": 0.5704911947250366, + "learning_rate": 9.864491281398476e-06, + "loss": 0.1534, + "step": 945 + }, + { + "epoch": 0.30654569021386907, + "grad_norm": 0.5726393461227417, + "learning_rate": 9.864086506098646e-06, + "loss": 0.1738, + "step": 946 + }, + { + "epoch": 0.3068697342838626, + "grad_norm": 0.6365552544593811, + "learning_rate": 9.863681135481715e-06, + "loss": 0.1867, + "step": 947 + }, + { + "epoch": 0.30719377835385614, + "grad_norm": 0.5838772654533386, + "learning_rate": 9.8632751695973e-06, + "loss": 0.1651, + "step": 948 + }, + { + "epoch": 0.3075178224238496, + "grad_norm": 0.6326878070831299, + "learning_rate": 9.862868608495082e-06, + "loss": 0.2011, + "step": 949 + }, + { + "epoch": 0.30784186649384315, + "grad_norm": 0.6173710823059082, + "learning_rate": 9.862461452224823e-06, + "loss": 0.1722, + "step": 950 + }, + { + "epoch": 0.3081659105638367, + "grad_norm": 0.5816516280174255, + "learning_rate": 9.862053700836353e-06, + "loss": 0.1808, + "step": 951 + }, + { + "epoch": 0.3084899546338302, + "grad_norm": 0.613852322101593, + "learning_rate": 9.861645354379581e-06, + "loss": 0.183, + "step": 952 + }, + { + "epoch": 0.3088139987038237, + "grad_norm": 0.5804192423820496, + "learning_rate": 9.86123641290448e-06, + "loss": 0.1585, + "step": 953 + }, + { + "epoch": 0.30913804277381723, + "grad_norm": 0.580578625202179, + "learning_rate": 9.860826876461105e-06, + "loss": 0.1616, + "step": 954 + }, + { + "epoch": 0.30946208684381077, + "grad_norm": 0.5804625153541565, + "learning_rate": 9.860416745099574e-06, + "loss": 0.167, + "step": 955 + }, + { + "epoch": 0.3097861309138043, + "grad_norm": 0.5484138131141663, + "learning_rate": 9.860006018870087e-06, + "loss": 0.1573, + "step": 956 + }, + { + "epoch": 0.3101101749837978, + "grad_norm": 0.6087974309921265, + "learning_rate": 9.859594697822913e-06, + "loss": 0.1733, + "step": 957 + }, + { + "epoch": 0.3104342190537913, + "grad_norm": 0.5645603537559509, + "learning_rate": 9.859182782008392e-06, + "loss": 0.162, + "step": 958 + }, + { + "epoch": 0.31075826312378485, + "grad_norm": 0.7392151355743408, + "learning_rate": 9.858770271476939e-06, + "loss": 0.187, + "step": 959 + }, + { + "epoch": 0.31108230719377833, + "grad_norm": 0.552918553352356, + "learning_rate": 9.85835716627904e-06, + "loss": 0.1602, + "step": 960 + }, + { + "epoch": 0.31140635126377186, + "grad_norm": 0.6616688966751099, + "learning_rate": 9.857943466465256e-06, + "loss": 0.1907, + "step": 961 + }, + { + "epoch": 0.3117303953337654, + "grad_norm": 0.5790378451347351, + "learning_rate": 9.857529172086222e-06, + "loss": 0.166, + "step": 962 + }, + { + "epoch": 0.31205443940375893, + "grad_norm": 0.6137049198150635, + "learning_rate": 9.857114283192641e-06, + "loss": 0.1772, + "step": 963 + }, + { + "epoch": 0.3123784834737524, + "grad_norm": 0.6005215644836426, + "learning_rate": 9.856698799835292e-06, + "loss": 0.173, + "step": 964 + }, + { + "epoch": 0.31270252754374595, + "grad_norm": 0.6044658422470093, + "learning_rate": 9.856282722065026e-06, + "loss": 0.1819, + "step": 965 + }, + { + "epoch": 0.3130265716137395, + "grad_norm": 0.5882754921913147, + "learning_rate": 9.85586604993277e-06, + "loss": 0.1643, + "step": 966 + }, + { + "epoch": 0.313350615683733, + "grad_norm": 0.5900786519050598, + "learning_rate": 9.855448783489517e-06, + "loss": 0.1663, + "step": 967 + }, + { + "epoch": 0.3136746597537265, + "grad_norm": 0.5808376669883728, + "learning_rate": 9.855030922786334e-06, + "loss": 0.1672, + "step": 968 + }, + { + "epoch": 0.31399870382372, + "grad_norm": 0.5942047834396362, + "learning_rate": 9.85461246787437e-06, + "loss": 0.1779, + "step": 969 + }, + { + "epoch": 0.31432274789371356, + "grad_norm": 0.587271511554718, + "learning_rate": 9.854193418804835e-06, + "loss": 0.1588, + "step": 970 + }, + { + "epoch": 0.31464679196370704, + "grad_norm": 0.6117364764213562, + "learning_rate": 9.853773775629018e-06, + "loss": 0.1686, + "step": 971 + }, + { + "epoch": 0.3149708360337006, + "grad_norm": 0.5825042724609375, + "learning_rate": 9.853353538398278e-06, + "loss": 0.1714, + "step": 972 + }, + { + "epoch": 0.3152948801036941, + "grad_norm": 0.5845495462417603, + "learning_rate": 9.852932707164051e-06, + "loss": 0.1677, + "step": 973 + }, + { + "epoch": 0.31561892417368764, + "grad_norm": 0.6072530746459961, + "learning_rate": 9.852511281977838e-06, + "loss": 0.1763, + "step": 974 + }, + { + "epoch": 0.3159429682436811, + "grad_norm": 0.5852010846138, + "learning_rate": 9.852089262891222e-06, + "loss": 0.1764, + "step": 975 + }, + { + "epoch": 0.31626701231367466, + "grad_norm": 0.5684648752212524, + "learning_rate": 9.851666649955853e-06, + "loss": 0.1727, + "step": 976 + }, + { + "epoch": 0.3165910563836682, + "grad_norm": 0.5730621218681335, + "learning_rate": 9.851243443223451e-06, + "loss": 0.1789, + "step": 977 + }, + { + "epoch": 0.3169151004536617, + "grad_norm": 0.5981029272079468, + "learning_rate": 9.850819642745816e-06, + "loss": 0.1709, + "step": 978 + }, + { + "epoch": 0.3172391445236552, + "grad_norm": 0.6033071279525757, + "learning_rate": 9.850395248574818e-06, + "loss": 0.1869, + "step": 979 + }, + { + "epoch": 0.31756318859364874, + "grad_norm": 0.632879912853241, + "learning_rate": 9.849970260762396e-06, + "loss": 0.1974, + "step": 980 + }, + { + "epoch": 0.31788723266364227, + "grad_norm": 0.6106759309768677, + "learning_rate": 9.849544679360566e-06, + "loss": 0.1572, + "step": 981 + }, + { + "epoch": 0.31821127673363575, + "grad_norm": 0.6135556697845459, + "learning_rate": 9.849118504421413e-06, + "loss": 0.188, + "step": 982 + }, + { + "epoch": 0.3185353208036293, + "grad_norm": 0.5774173140525818, + "learning_rate": 9.8486917359971e-06, + "loss": 0.1806, + "step": 983 + }, + { + "epoch": 0.3188593648736228, + "grad_norm": 0.5769929885864258, + "learning_rate": 9.848264374139855e-06, + "loss": 0.1675, + "step": 984 + }, + { + "epoch": 0.31918340894361635, + "grad_norm": 0.5752699971199036, + "learning_rate": 9.847836418901988e-06, + "loss": 0.1764, + "step": 985 + }, + { + "epoch": 0.31950745301360983, + "grad_norm": 0.5664514899253845, + "learning_rate": 9.847407870335873e-06, + "loss": 0.1677, + "step": 986 + }, + { + "epoch": 0.31983149708360337, + "grad_norm": 0.6190240979194641, + "learning_rate": 9.846978728493961e-06, + "loss": 0.1871, + "step": 987 + }, + { + "epoch": 0.3201555411535969, + "grad_norm": 0.592129111289978, + "learning_rate": 9.846548993428775e-06, + "loss": 0.1748, + "step": 988 + }, + { + "epoch": 0.32047958522359044, + "grad_norm": 0.6201058030128479, + "learning_rate": 9.846118665192912e-06, + "loss": 0.1767, + "step": 989 + }, + { + "epoch": 0.3208036292935839, + "grad_norm": 0.6250030994415283, + "learning_rate": 9.845687743839036e-06, + "loss": 0.1727, + "step": 990 + }, + { + "epoch": 0.32112767336357745, + "grad_norm": 0.5911518931388855, + "learning_rate": 9.84525622941989e-06, + "loss": 0.1792, + "step": 991 + }, + { + "epoch": 0.321451717433571, + "grad_norm": 0.5523138642311096, + "learning_rate": 9.84482412198829e-06, + "loss": 0.1637, + "step": 992 + }, + { + "epoch": 0.32177576150356446, + "grad_norm": 0.6329092383384705, + "learning_rate": 9.844391421597118e-06, + "loss": 0.1785, + "step": 993 + }, + { + "epoch": 0.322099805573558, + "grad_norm": 0.6084178686141968, + "learning_rate": 9.843958128299331e-06, + "loss": 0.1555, + "step": 994 + }, + { + "epoch": 0.32242384964355153, + "grad_norm": 0.6443354487419128, + "learning_rate": 9.843524242147965e-06, + "loss": 0.1761, + "step": 995 + }, + { + "epoch": 0.32274789371354506, + "grad_norm": 0.5739508271217346, + "learning_rate": 9.843089763196119e-06, + "loss": 0.1739, + "step": 996 + }, + { + "epoch": 0.32307193778353854, + "grad_norm": 0.5695315599441528, + "learning_rate": 9.842654691496972e-06, + "loss": 0.1785, + "step": 997 + }, + { + "epoch": 0.3233959818535321, + "grad_norm": 0.6078848838806152, + "learning_rate": 9.84221902710377e-06, + "loss": 0.1765, + "step": 998 + }, + { + "epoch": 0.3237200259235256, + "grad_norm": 0.5366340279579163, + "learning_rate": 9.841782770069837e-06, + "loss": 0.1437, + "step": 999 + }, + { + "epoch": 0.32404406999351915, + "grad_norm": 0.5579332709312439, + "learning_rate": 9.841345920448564e-06, + "loss": 0.1588, + "step": 1000 + }, + { + "epoch": 0.3243681140635126, + "grad_norm": 0.5986455082893372, + "learning_rate": 9.840908478293418e-06, + "loss": 0.1844, + "step": 1001 + }, + { + "epoch": 0.32469215813350616, + "grad_norm": 0.5817307233810425, + "learning_rate": 9.84047044365794e-06, + "loss": 0.1741, + "step": 1002 + }, + { + "epoch": 0.3250162022034997, + "grad_norm": 0.5661544799804688, + "learning_rate": 9.840031816595737e-06, + "loss": 0.1472, + "step": 1003 + }, + { + "epoch": 0.32534024627349317, + "grad_norm": 0.6002435684204102, + "learning_rate": 9.839592597160494e-06, + "loss": 0.1773, + "step": 1004 + }, + { + "epoch": 0.3256642903434867, + "grad_norm": 0.5806878209114075, + "learning_rate": 9.839152785405968e-06, + "loss": 0.1658, + "step": 1005 + }, + { + "epoch": 0.32598833441348024, + "grad_norm": 0.5673524141311646, + "learning_rate": 9.838712381385989e-06, + "loss": 0.1786, + "step": 1006 + }, + { + "epoch": 0.3263123784834738, + "grad_norm": 0.6064449548721313, + "learning_rate": 9.838271385154457e-06, + "loss": 0.1831, + "step": 1007 + }, + { + "epoch": 0.32663642255346725, + "grad_norm": 0.566112756729126, + "learning_rate": 9.837829796765345e-06, + "loss": 0.1513, + "step": 1008 + }, + { + "epoch": 0.3269604666234608, + "grad_norm": 0.547829270362854, + "learning_rate": 9.8373876162727e-06, + "loss": 0.1711, + "step": 1009 + }, + { + "epoch": 0.3272845106934543, + "grad_norm": 0.5666298270225525, + "learning_rate": 9.83694484373064e-06, + "loss": 0.1784, + "step": 1010 + }, + { + "epoch": 0.3276085547634478, + "grad_norm": 0.6046202778816223, + "learning_rate": 9.836501479193356e-06, + "loss": 0.1555, + "step": 1011 + }, + { + "epoch": 0.32793259883344134, + "grad_norm": 0.5862842202186584, + "learning_rate": 9.836057522715114e-06, + "loss": 0.1766, + "step": 1012 + }, + { + "epoch": 0.32825664290343487, + "grad_norm": 0.5938090085983276, + "learning_rate": 9.835612974350247e-06, + "loss": 0.1635, + "step": 1013 + }, + { + "epoch": 0.3285806869734284, + "grad_norm": 0.5398661494255066, + "learning_rate": 9.835167834153162e-06, + "loss": 0.1575, + "step": 1014 + }, + { + "epoch": 0.3289047310434219, + "grad_norm": 0.5675678253173828, + "learning_rate": 9.834722102178344e-06, + "loss": 0.1592, + "step": 1015 + }, + { + "epoch": 0.3292287751134154, + "grad_norm": 0.6059688329696655, + "learning_rate": 9.834275778480345e-06, + "loss": 0.1849, + "step": 1016 + }, + { + "epoch": 0.32955281918340895, + "grad_norm": 0.5854890942573547, + "learning_rate": 9.83382886311379e-06, + "loss": 0.1751, + "step": 1017 + }, + { + "epoch": 0.3298768632534025, + "grad_norm": 0.592852771282196, + "learning_rate": 9.833381356133376e-06, + "loss": 0.1771, + "step": 1018 + }, + { + "epoch": 0.33020090732339596, + "grad_norm": 0.6050461530685425, + "learning_rate": 9.832933257593875e-06, + "loss": 0.1778, + "step": 1019 + }, + { + "epoch": 0.3305249513933895, + "grad_norm": 0.5728069543838501, + "learning_rate": 9.832484567550131e-06, + "loss": 0.1707, + "step": 1020 + }, + { + "epoch": 0.33084899546338303, + "grad_norm": 0.5393828749656677, + "learning_rate": 9.832035286057057e-06, + "loss": 0.1545, + "step": 1021 + }, + { + "epoch": 0.3311730395333765, + "grad_norm": 0.6079297661781311, + "learning_rate": 9.831585413169642e-06, + "loss": 0.1579, + "step": 1022 + }, + { + "epoch": 0.33149708360337005, + "grad_norm": 0.5757361054420471, + "learning_rate": 9.831134948942945e-06, + "loss": 0.1565, + "step": 1023 + }, + { + "epoch": 0.3318211276733636, + "grad_norm": 0.5705800652503967, + "learning_rate": 9.8306838934321e-06, + "loss": 0.1531, + "step": 1024 + }, + { + "epoch": 0.3321451717433571, + "grad_norm": 0.6327515244483948, + "learning_rate": 9.83023224669231e-06, + "loss": 0.1742, + "step": 1025 + }, + { + "epoch": 0.3324692158133506, + "grad_norm": 0.6158990859985352, + "learning_rate": 9.829780008778855e-06, + "loss": 0.189, + "step": 1026 + }, + { + "epoch": 0.33279325988334413, + "grad_norm": 0.563691258430481, + "learning_rate": 9.829327179747083e-06, + "loss": 0.1844, + "step": 1027 + }, + { + "epoch": 0.33311730395333766, + "grad_norm": 0.5974298715591431, + "learning_rate": 9.828873759652415e-06, + "loss": 0.1708, + "step": 1028 + }, + { + "epoch": 0.3334413480233312, + "grad_norm": 0.5656067728996277, + "learning_rate": 9.828419748550345e-06, + "loss": 0.1706, + "step": 1029 + }, + { + "epoch": 0.3337653920933247, + "grad_norm": 0.6119730472564697, + "learning_rate": 9.827965146496441e-06, + "loss": 0.168, + "step": 1030 + }, + { + "epoch": 0.3340894361633182, + "grad_norm": 0.5695846080780029, + "learning_rate": 9.82750995354634e-06, + "loss": 0.1653, + "step": 1031 + }, + { + "epoch": 0.33441348023331174, + "grad_norm": 0.6336842775344849, + "learning_rate": 9.827054169755759e-06, + "loss": 0.1961, + "step": 1032 + }, + { + "epoch": 0.3347375243033052, + "grad_norm": 0.5704305768013, + "learning_rate": 9.826597795180474e-06, + "loss": 0.174, + "step": 1033 + }, + { + "epoch": 0.33506156837329876, + "grad_norm": 0.6015166640281677, + "learning_rate": 9.826140829876344e-06, + "loss": 0.1776, + "step": 1034 + }, + { + "epoch": 0.3353856124432923, + "grad_norm": 0.5965490341186523, + "learning_rate": 9.825683273899298e-06, + "loss": 0.1764, + "step": 1035 + }, + { + "epoch": 0.3357096565132858, + "grad_norm": 0.6270467638969421, + "learning_rate": 9.825225127305334e-06, + "loss": 0.1738, + "step": 1036 + }, + { + "epoch": 0.3360337005832793, + "grad_norm": 0.5696086287498474, + "learning_rate": 9.824766390150528e-06, + "loss": 0.17, + "step": 1037 + }, + { + "epoch": 0.33635774465327284, + "grad_norm": 0.5731741786003113, + "learning_rate": 9.824307062491022e-06, + "loss": 0.1727, + "step": 1038 + }, + { + "epoch": 0.3366817887232664, + "grad_norm": 0.5837844014167786, + "learning_rate": 9.823847144383035e-06, + "loss": 0.1857, + "step": 1039 + }, + { + "epoch": 0.3370058327932599, + "grad_norm": 0.6010358929634094, + "learning_rate": 9.823386635882856e-06, + "loss": 0.1534, + "step": 1040 + }, + { + "epoch": 0.3373298768632534, + "grad_norm": 0.5789628028869629, + "learning_rate": 9.822925537046846e-06, + "loss": 0.166, + "step": 1041 + }, + { + "epoch": 0.3376539209332469, + "grad_norm": 0.575428307056427, + "learning_rate": 9.822463847931441e-06, + "loss": 0.1704, + "step": 1042 + }, + { + "epoch": 0.33797796500324045, + "grad_norm": 0.5323755145072937, + "learning_rate": 9.822001568593144e-06, + "loss": 0.1743, + "step": 1043 + }, + { + "epoch": 0.33830200907323393, + "grad_norm": 0.6056416630744934, + "learning_rate": 9.821538699088538e-06, + "loss": 0.1722, + "step": 1044 + }, + { + "epoch": 0.33862605314322747, + "grad_norm": 0.5860357880592346, + "learning_rate": 9.821075239474271e-06, + "loss": 0.1607, + "step": 1045 + }, + { + "epoch": 0.338950097213221, + "grad_norm": 0.59328293800354, + "learning_rate": 9.820611189807065e-06, + "loss": 0.1749, + "step": 1046 + }, + { + "epoch": 0.33927414128321454, + "grad_norm": 0.5994810461997986, + "learning_rate": 9.820146550143717e-06, + "loss": 0.1937, + "step": 1047 + }, + { + "epoch": 0.339598185353208, + "grad_norm": 0.6299530863761902, + "learning_rate": 9.819681320541094e-06, + "loss": 0.1882, + "step": 1048 + }, + { + "epoch": 0.33992222942320155, + "grad_norm": 0.5794624090194702, + "learning_rate": 9.819215501056136e-06, + "loss": 0.1664, + "step": 1049 + }, + { + "epoch": 0.3402462734931951, + "grad_norm": 0.5705100297927856, + "learning_rate": 9.818749091745854e-06, + "loss": 0.1631, + "step": 1050 + }, + { + "epoch": 0.3405703175631886, + "grad_norm": 0.5567938685417175, + "learning_rate": 9.818282092667332e-06, + "loss": 0.1465, + "step": 1051 + }, + { + "epoch": 0.3408943616331821, + "grad_norm": 0.6096282005310059, + "learning_rate": 9.817814503877728e-06, + "loss": 0.1684, + "step": 1052 + }, + { + "epoch": 0.34121840570317563, + "grad_norm": 0.6146951913833618, + "learning_rate": 9.817346325434266e-06, + "loss": 0.1739, + "step": 1053 + }, + { + "epoch": 0.34154244977316917, + "grad_norm": 0.5440871715545654, + "learning_rate": 9.81687755739425e-06, + "loss": 0.1598, + "step": 1054 + }, + { + "epoch": 0.34186649384316264, + "grad_norm": 0.6106154918670654, + "learning_rate": 9.816408199815054e-06, + "loss": 0.1765, + "step": 1055 + }, + { + "epoch": 0.3421905379131562, + "grad_norm": 0.6298217177391052, + "learning_rate": 9.815938252754117e-06, + "loss": 0.1849, + "step": 1056 + }, + { + "epoch": 0.3425145819831497, + "grad_norm": 0.5941401720046997, + "learning_rate": 9.815467716268964e-06, + "loss": 0.1741, + "step": 1057 + }, + { + "epoch": 0.34283862605314325, + "grad_norm": 0.5596537590026855, + "learning_rate": 9.814996590417178e-06, + "loss": 0.1651, + "step": 1058 + }, + { + "epoch": 0.3431626701231367, + "grad_norm": 0.5766615867614746, + "learning_rate": 9.814524875256422e-06, + "loss": 0.1774, + "step": 1059 + }, + { + "epoch": 0.34348671419313026, + "grad_norm": 0.5682839155197144, + "learning_rate": 9.81405257084443e-06, + "loss": 0.174, + "step": 1060 + }, + { + "epoch": 0.3438107582631238, + "grad_norm": 0.6307297348976135, + "learning_rate": 9.813579677239008e-06, + "loss": 0.1799, + "step": 1061 + }, + { + "epoch": 0.34413480233311733, + "grad_norm": 0.5764597058296204, + "learning_rate": 9.81310619449803e-06, + "loss": 0.1824, + "step": 1062 + }, + { + "epoch": 0.3444588464031108, + "grad_norm": 0.5531014204025269, + "learning_rate": 9.812632122679448e-06, + "loss": 0.157, + "step": 1063 + }, + { + "epoch": 0.34478289047310434, + "grad_norm": 0.5836609601974487, + "learning_rate": 9.812157461841287e-06, + "loss": 0.1764, + "step": 1064 + }, + { + "epoch": 0.3451069345430979, + "grad_norm": 0.5902723073959351, + "learning_rate": 9.811682212041636e-06, + "loss": 0.1738, + "step": 1065 + }, + { + "epoch": 0.34543097861309136, + "grad_norm": 0.629046618938446, + "learning_rate": 9.811206373338664e-06, + "loss": 0.1966, + "step": 1066 + }, + { + "epoch": 0.3457550226830849, + "grad_norm": 0.63542640209198, + "learning_rate": 9.810729945790607e-06, + "loss": 0.1631, + "step": 1067 + }, + { + "epoch": 0.3460790667530784, + "grad_norm": 0.6045624613761902, + "learning_rate": 9.810252929455777e-06, + "loss": 0.1834, + "step": 1068 + }, + { + "epoch": 0.34640311082307196, + "grad_norm": 0.5919462442398071, + "learning_rate": 9.809775324392554e-06, + "loss": 0.1691, + "step": 1069 + }, + { + "epoch": 0.34672715489306544, + "grad_norm": 0.5924056172370911, + "learning_rate": 9.809297130659394e-06, + "loss": 0.1504, + "step": 1070 + }, + { + "epoch": 0.34705119896305897, + "grad_norm": 0.5948939323425293, + "learning_rate": 9.808818348314824e-06, + "loss": 0.1856, + "step": 1071 + }, + { + "epoch": 0.3473752430330525, + "grad_norm": 0.575698733329773, + "learning_rate": 9.80833897741744e-06, + "loss": 0.1736, + "step": 1072 + }, + { + "epoch": 0.34769928710304604, + "grad_norm": 0.5729328393936157, + "learning_rate": 9.807859018025914e-06, + "loss": 0.169, + "step": 1073 + }, + { + "epoch": 0.3480233311730395, + "grad_norm": 0.5551473498344421, + "learning_rate": 9.807378470198987e-06, + "loss": 0.1544, + "step": 1074 + }, + { + "epoch": 0.34834737524303305, + "grad_norm": 0.5853785276412964, + "learning_rate": 9.806897333995475e-06, + "loss": 0.1683, + "step": 1075 + }, + { + "epoch": 0.3486714193130266, + "grad_norm": 0.5476290583610535, + "learning_rate": 9.806415609474264e-06, + "loss": 0.1627, + "step": 1076 + }, + { + "epoch": 0.34899546338302007, + "grad_norm": 0.6016427278518677, + "learning_rate": 9.805933296694312e-06, + "loss": 0.172, + "step": 1077 + }, + { + "epoch": 0.3493195074530136, + "grad_norm": 0.5918509364128113, + "learning_rate": 9.80545039571465e-06, + "loss": 0.1643, + "step": 1078 + }, + { + "epoch": 0.34964355152300713, + "grad_norm": 0.6145140528678894, + "learning_rate": 9.804966906594378e-06, + "loss": 0.1733, + "step": 1079 + }, + { + "epoch": 0.34996759559300067, + "grad_norm": 0.5936920642852783, + "learning_rate": 9.804482829392674e-06, + "loss": 0.1788, + "step": 1080 + }, + { + "epoch": 0.35029163966299415, + "grad_norm": 0.5736120343208313, + "learning_rate": 9.803998164168783e-06, + "loss": 0.1561, + "step": 1081 + }, + { + "epoch": 0.3506156837329877, + "grad_norm": 0.5619922280311584, + "learning_rate": 9.803512910982022e-06, + "loss": 0.1539, + "step": 1082 + }, + { + "epoch": 0.3509397278029812, + "grad_norm": 0.5910178422927856, + "learning_rate": 9.803027069891782e-06, + "loss": 0.1886, + "step": 1083 + }, + { + "epoch": 0.35126377187297475, + "grad_norm": 0.5958241820335388, + "learning_rate": 9.802540640957526e-06, + "loss": 0.1782, + "step": 1084 + }, + { + "epoch": 0.35158781594296823, + "grad_norm": 0.5879762768745422, + "learning_rate": 9.802053624238788e-06, + "loss": 0.168, + "step": 1085 + }, + { + "epoch": 0.35191186001296176, + "grad_norm": 0.6157310009002686, + "learning_rate": 9.801566019795175e-06, + "loss": 0.1748, + "step": 1086 + }, + { + "epoch": 0.3522359040829553, + "grad_norm": 0.6026695370674133, + "learning_rate": 9.801077827686361e-06, + "loss": 0.173, + "step": 1087 + }, + { + "epoch": 0.3525599481529488, + "grad_norm": 0.5551148056983948, + "learning_rate": 9.800589047972101e-06, + "loss": 0.1671, + "step": 1088 + }, + { + "epoch": 0.3528839922229423, + "grad_norm": 0.6182596683502197, + "learning_rate": 9.800099680712214e-06, + "loss": 0.1984, + "step": 1089 + }, + { + "epoch": 0.35320803629293585, + "grad_norm": 0.6011072397232056, + "learning_rate": 9.799609725966596e-06, + "loss": 0.186, + "step": 1090 + }, + { + "epoch": 0.3535320803629294, + "grad_norm": 0.5565938353538513, + "learning_rate": 9.799119183795209e-06, + "loss": 0.1531, + "step": 1091 + }, + { + "epoch": 0.35385612443292286, + "grad_norm": 0.5287664532661438, + "learning_rate": 9.798628054258094e-06, + "loss": 0.1679, + "step": 1092 + }, + { + "epoch": 0.3541801685029164, + "grad_norm": 0.5842703580856323, + "learning_rate": 9.79813633741536e-06, + "loss": 0.173, + "step": 1093 + }, + { + "epoch": 0.3545042125729099, + "grad_norm": 0.5793323516845703, + "learning_rate": 9.79764403332719e-06, + "loss": 0.1769, + "step": 1094 + }, + { + "epoch": 0.35482825664290346, + "grad_norm": 0.5644661784172058, + "learning_rate": 9.797151142053831e-06, + "loss": 0.1684, + "step": 1095 + }, + { + "epoch": 0.35515230071289694, + "grad_norm": 0.6257332563400269, + "learning_rate": 9.796657663655616e-06, + "loss": 0.1931, + "step": 1096 + }, + { + "epoch": 0.3554763447828905, + "grad_norm": 0.5619518756866455, + "learning_rate": 9.796163598192934e-06, + "loss": 0.1623, + "step": 1097 + }, + { + "epoch": 0.355800388852884, + "grad_norm": 0.5887840986251831, + "learning_rate": 9.795668945726263e-06, + "loss": 0.1617, + "step": 1098 + }, + { + "epoch": 0.3561244329228775, + "grad_norm": 0.6297451853752136, + "learning_rate": 9.795173706316135e-06, + "loss": 0.1737, + "step": 1099 + }, + { + "epoch": 0.356448476992871, + "grad_norm": 0.6247619390487671, + "learning_rate": 9.794677880023166e-06, + "loss": 0.1679, + "step": 1100 + }, + { + "epoch": 0.35677252106286456, + "grad_norm": 0.586454451084137, + "learning_rate": 9.794181466908042e-06, + "loss": 0.1497, + "step": 1101 + }, + { + "epoch": 0.3570965651328581, + "grad_norm": 0.6023052930831909, + "learning_rate": 9.793684467031518e-06, + "loss": 0.187, + "step": 1102 + }, + { + "epoch": 0.35742060920285157, + "grad_norm": 0.6439177393913269, + "learning_rate": 9.79318688045442e-06, + "loss": 0.1871, + "step": 1103 + }, + { + "epoch": 0.3577446532728451, + "grad_norm": 0.5750577449798584, + "learning_rate": 9.792688707237651e-06, + "loss": 0.1741, + "step": 1104 + }, + { + "epoch": 0.35806869734283864, + "grad_norm": 0.574123740196228, + "learning_rate": 9.792189947442182e-06, + "loss": 0.1666, + "step": 1105 + }, + { + "epoch": 0.35839274141283217, + "grad_norm": 0.5401131510734558, + "learning_rate": 9.791690601129053e-06, + "loss": 0.1553, + "step": 1106 + }, + { + "epoch": 0.35871678548282565, + "grad_norm": 0.5300427675247192, + "learning_rate": 9.791190668359383e-06, + "loss": 0.1447, + "step": 1107 + }, + { + "epoch": 0.3590408295528192, + "grad_norm": 0.6172694563865662, + "learning_rate": 9.79069014919436e-06, + "loss": 0.1956, + "step": 1108 + }, + { + "epoch": 0.3593648736228127, + "grad_norm": 0.6034216284751892, + "learning_rate": 9.790189043695235e-06, + "loss": 0.1796, + "step": 1109 + }, + { + "epoch": 0.3596889176928062, + "grad_norm": 0.5867305397987366, + "learning_rate": 9.789687351923346e-06, + "loss": 0.1755, + "step": 1110 + }, + { + "epoch": 0.36001296176279973, + "grad_norm": 0.5331410765647888, + "learning_rate": 9.789185073940095e-06, + "loss": 0.1517, + "step": 1111 + }, + { + "epoch": 0.36033700583279327, + "grad_norm": 0.5819225311279297, + "learning_rate": 9.788682209806951e-06, + "loss": 0.1544, + "step": 1112 + }, + { + "epoch": 0.3606610499027868, + "grad_norm": 0.5827124118804932, + "learning_rate": 9.788178759585463e-06, + "loss": 0.1726, + "step": 1113 + }, + { + "epoch": 0.3609850939727803, + "grad_norm": 0.5797318816184998, + "learning_rate": 9.787674723337248e-06, + "loss": 0.1692, + "step": 1114 + }, + { + "epoch": 0.3613091380427738, + "grad_norm": 0.6464574933052063, + "learning_rate": 9.787170101123997e-06, + "loss": 0.1816, + "step": 1115 + }, + { + "epoch": 0.36163318211276735, + "grad_norm": 0.5869221091270447, + "learning_rate": 9.786664893007467e-06, + "loss": 0.1788, + "step": 1116 + }, + { + "epoch": 0.3619572261827609, + "grad_norm": 0.5861597061157227, + "learning_rate": 9.786159099049494e-06, + "loss": 0.1766, + "step": 1117 + }, + { + "epoch": 0.36228127025275436, + "grad_norm": 0.584846019744873, + "learning_rate": 9.78565271931198e-06, + "loss": 0.165, + "step": 1118 + }, + { + "epoch": 0.3626053143227479, + "grad_norm": 0.5789152979850769, + "learning_rate": 9.785145753856901e-06, + "loss": 0.1602, + "step": 1119 + }, + { + "epoch": 0.36292935839274143, + "grad_norm": 0.5569080114364624, + "learning_rate": 9.784638202746308e-06, + "loss": 0.1514, + "step": 1120 + }, + { + "epoch": 0.3632534024627349, + "grad_norm": 0.6112030744552612, + "learning_rate": 9.784130066042316e-06, + "loss": 0.1727, + "step": 1121 + }, + { + "epoch": 0.36357744653272844, + "grad_norm": 0.5604220628738403, + "learning_rate": 9.783621343807119e-06, + "loss": 0.1753, + "step": 1122 + }, + { + "epoch": 0.363901490602722, + "grad_norm": 0.543032169342041, + "learning_rate": 9.783112036102978e-06, + "loss": 0.1671, + "step": 1123 + }, + { + "epoch": 0.3642255346727155, + "grad_norm": 0.6327988505363464, + "learning_rate": 9.782602142992227e-06, + "loss": 0.1836, + "step": 1124 + }, + { + "epoch": 0.364549578742709, + "grad_norm": 0.5535159707069397, + "learning_rate": 9.782091664537274e-06, + "loss": 0.1622, + "step": 1125 + }, + { + "epoch": 0.3648736228127025, + "grad_norm": 0.6538394689559937, + "learning_rate": 9.781580600800595e-06, + "loss": 0.1913, + "step": 1126 + }, + { + "epoch": 0.36519766688269606, + "grad_norm": 0.568351149559021, + "learning_rate": 9.781068951844739e-06, + "loss": 0.1839, + "step": 1127 + }, + { + "epoch": 0.36552171095268954, + "grad_norm": 0.5809906125068665, + "learning_rate": 9.78055671773233e-06, + "loss": 0.1791, + "step": 1128 + }, + { + "epoch": 0.3658457550226831, + "grad_norm": 0.5744894742965698, + "learning_rate": 9.780043898526054e-06, + "loss": 0.1679, + "step": 1129 + }, + { + "epoch": 0.3661697990926766, + "grad_norm": 0.5658349394798279, + "learning_rate": 9.779530494288682e-06, + "loss": 0.1688, + "step": 1130 + }, + { + "epoch": 0.36649384316267014, + "grad_norm": 0.5783803462982178, + "learning_rate": 9.779016505083047e-06, + "loss": 0.1649, + "step": 1131 + }, + { + "epoch": 0.3668178872326636, + "grad_norm": 0.6075659394264221, + "learning_rate": 9.778501930972058e-06, + "loss": 0.1846, + "step": 1132 + }, + { + "epoch": 0.36714193130265715, + "grad_norm": 0.5325374603271484, + "learning_rate": 9.77798677201869e-06, + "loss": 0.1511, + "step": 1133 + }, + { + "epoch": 0.3674659753726507, + "grad_norm": 0.5618516802787781, + "learning_rate": 9.777471028285996e-06, + "loss": 0.1561, + "step": 1134 + }, + { + "epoch": 0.3677900194426442, + "grad_norm": 0.5520266890525818, + "learning_rate": 9.776954699837097e-06, + "loss": 0.1556, + "step": 1135 + }, + { + "epoch": 0.3681140635126377, + "grad_norm": 0.56380695104599, + "learning_rate": 9.776437786735187e-06, + "loss": 0.1573, + "step": 1136 + }, + { + "epoch": 0.36843810758263124, + "grad_norm": 0.5827215909957886, + "learning_rate": 9.775920289043532e-06, + "loss": 0.1581, + "step": 1137 + }, + { + "epoch": 0.36876215165262477, + "grad_norm": 0.5803936719894409, + "learning_rate": 9.775402206825468e-06, + "loss": 0.1735, + "step": 1138 + }, + { + "epoch": 0.36908619572261825, + "grad_norm": 0.5602681040763855, + "learning_rate": 9.774883540144405e-06, + "loss": 0.1567, + "step": 1139 + }, + { + "epoch": 0.3694102397926118, + "grad_norm": 0.5713690519332886, + "learning_rate": 9.774364289063817e-06, + "loss": 0.1643, + "step": 1140 + }, + { + "epoch": 0.3697342838626053, + "grad_norm": 0.5246069431304932, + "learning_rate": 9.773844453647263e-06, + "loss": 0.1551, + "step": 1141 + }, + { + "epoch": 0.37005832793259885, + "grad_norm": 0.5214044451713562, + "learning_rate": 9.773324033958363e-06, + "loss": 0.1547, + "step": 1142 + }, + { + "epoch": 0.37038237200259233, + "grad_norm": 0.5714074373245239, + "learning_rate": 9.772803030060809e-06, + "loss": 0.176, + "step": 1143 + }, + { + "epoch": 0.37070641607258586, + "grad_norm": 0.5851354598999023, + "learning_rate": 9.772281442018368e-06, + "loss": 0.1579, + "step": 1144 + }, + { + "epoch": 0.3710304601425794, + "grad_norm": 0.5684588551521301, + "learning_rate": 9.771759269894878e-06, + "loss": 0.1753, + "step": 1145 + }, + { + "epoch": 0.37135450421257293, + "grad_norm": 0.5431103706359863, + "learning_rate": 9.771236513754247e-06, + "loss": 0.1529, + "step": 1146 + }, + { + "epoch": 0.3716785482825664, + "grad_norm": 0.6564229726791382, + "learning_rate": 9.770713173660458e-06, + "loss": 0.1872, + "step": 1147 + }, + { + "epoch": 0.37200259235255995, + "grad_norm": 0.5576322674751282, + "learning_rate": 9.770189249677559e-06, + "loss": 0.1544, + "step": 1148 + }, + { + "epoch": 0.3723266364225535, + "grad_norm": 0.5599045753479004, + "learning_rate": 9.769664741869676e-06, + "loss": 0.1513, + "step": 1149 + }, + { + "epoch": 0.37265068049254696, + "grad_norm": 0.5974574685096741, + "learning_rate": 9.769139650301e-06, + "loss": 0.1633, + "step": 1150 + }, + { + "epoch": 0.3729747245625405, + "grad_norm": 0.5352615118026733, + "learning_rate": 9.768613975035801e-06, + "loss": 0.1458, + "step": 1151 + }, + { + "epoch": 0.37329876863253403, + "grad_norm": 0.5834352374076843, + "learning_rate": 9.768087716138417e-06, + "loss": 0.1803, + "step": 1152 + }, + { + "epoch": 0.37362281270252756, + "grad_norm": 0.5821603536605835, + "learning_rate": 9.767560873673253e-06, + "loss": 0.1678, + "step": 1153 + }, + { + "epoch": 0.37394685677252104, + "grad_norm": 0.5908502340316772, + "learning_rate": 9.767033447704792e-06, + "loss": 0.1715, + "step": 1154 + }, + { + "epoch": 0.3742709008425146, + "grad_norm": 0.5501680970191956, + "learning_rate": 9.766505438297587e-06, + "loss": 0.1656, + "step": 1155 + }, + { + "epoch": 0.3745949449125081, + "grad_norm": 0.551476240158081, + "learning_rate": 9.765976845516259e-06, + "loss": 0.1651, + "step": 1156 + }, + { + "epoch": 0.37491898898250164, + "grad_norm": 0.6020034551620483, + "learning_rate": 9.7654476694255e-06, + "loss": 0.1722, + "step": 1157 + }, + { + "epoch": 0.3752430330524951, + "grad_norm": 0.5460115075111389, + "learning_rate": 9.764917910090083e-06, + "loss": 0.1485, + "step": 1158 + }, + { + "epoch": 0.37556707712248866, + "grad_norm": 0.5414779782295227, + "learning_rate": 9.764387567574842e-06, + "loss": 0.1508, + "step": 1159 + }, + { + "epoch": 0.3758911211924822, + "grad_norm": 0.5620276927947998, + "learning_rate": 9.763856641944684e-06, + "loss": 0.1762, + "step": 1160 + }, + { + "epoch": 0.37621516526247567, + "grad_norm": 0.5907630920410156, + "learning_rate": 9.763325133264592e-06, + "loss": 0.1813, + "step": 1161 + }, + { + "epoch": 0.3765392093324692, + "grad_norm": 0.626511812210083, + "learning_rate": 9.762793041599616e-06, + "loss": 0.1792, + "step": 1162 + }, + { + "epoch": 0.37686325340246274, + "grad_norm": 0.5377705693244934, + "learning_rate": 9.762260367014877e-06, + "loss": 0.1559, + "step": 1163 + }, + { + "epoch": 0.3771872974724563, + "grad_norm": 0.6175006628036499, + "learning_rate": 9.761727109575574e-06, + "loss": 0.1824, + "step": 1164 + }, + { + "epoch": 0.37751134154244975, + "grad_norm": 0.5645531415939331, + "learning_rate": 9.761193269346968e-06, + "loss": 0.1671, + "step": 1165 + }, + { + "epoch": 0.3778353856124433, + "grad_norm": 0.5879248380661011, + "learning_rate": 9.7606588463944e-06, + "loss": 0.1832, + "step": 1166 + }, + { + "epoch": 0.3781594296824368, + "grad_norm": 0.6032773852348328, + "learning_rate": 9.760123840783275e-06, + "loss": 0.1725, + "step": 1167 + }, + { + "epoch": 0.37848347375243035, + "grad_norm": 0.6185351014137268, + "learning_rate": 9.759588252579073e-06, + "loss": 0.1988, + "step": 1168 + }, + { + "epoch": 0.37880751782242383, + "grad_norm": 0.5803781151771545, + "learning_rate": 9.759052081847345e-06, + "loss": 0.182, + "step": 1169 + }, + { + "epoch": 0.37913156189241737, + "grad_norm": 0.5212180018424988, + "learning_rate": 9.758515328653712e-06, + "loss": 0.1479, + "step": 1170 + }, + { + "epoch": 0.3794556059624109, + "grad_norm": 0.5354920625686646, + "learning_rate": 9.757977993063871e-06, + "loss": 0.1608, + "step": 1171 + }, + { + "epoch": 0.3797796500324044, + "grad_norm": 0.6095584034919739, + "learning_rate": 9.757440075143585e-06, + "loss": 0.1858, + "step": 1172 + }, + { + "epoch": 0.3801036941023979, + "grad_norm": 0.5753458142280579, + "learning_rate": 9.756901574958688e-06, + "loss": 0.1669, + "step": 1173 + }, + { + "epoch": 0.38042773817239145, + "grad_norm": 0.582177996635437, + "learning_rate": 9.756362492575088e-06, + "loss": 0.1722, + "step": 1174 + }, + { + "epoch": 0.380751782242385, + "grad_norm": 0.5849246382713318, + "learning_rate": 9.755822828058765e-06, + "loss": 0.1683, + "step": 1175 + }, + { + "epoch": 0.38107582631237846, + "grad_norm": 0.550369918346405, + "learning_rate": 9.755282581475769e-06, + "loss": 0.1622, + "step": 1176 + }, + { + "epoch": 0.381399870382372, + "grad_norm": 0.5477217435836792, + "learning_rate": 9.754741752892219e-06, + "loss": 0.1587, + "step": 1177 + }, + { + "epoch": 0.38172391445236553, + "grad_norm": 0.5430402159690857, + "learning_rate": 9.754200342374308e-06, + "loss": 0.1674, + "step": 1178 + }, + { + "epoch": 0.38204795852235907, + "grad_norm": 0.5677325129508972, + "learning_rate": 9.753658349988298e-06, + "loss": 0.1744, + "step": 1179 + }, + { + "epoch": 0.38237200259235254, + "grad_norm": 0.5597923994064331, + "learning_rate": 9.753115775800525e-06, + "loss": 0.1639, + "step": 1180 + }, + { + "epoch": 0.3826960466623461, + "grad_norm": 0.5822575688362122, + "learning_rate": 9.752572619877397e-06, + "loss": 0.1478, + "step": 1181 + }, + { + "epoch": 0.3830200907323396, + "grad_norm": 0.5696035623550415, + "learning_rate": 9.752028882285385e-06, + "loss": 0.166, + "step": 1182 + }, + { + "epoch": 0.3833441348023331, + "grad_norm": 0.5821458697319031, + "learning_rate": 9.751484563091043e-06, + "loss": 0.1703, + "step": 1183 + }, + { + "epoch": 0.3836681788723266, + "grad_norm": 0.5574982762336731, + "learning_rate": 9.75093966236099e-06, + "loss": 0.167, + "step": 1184 + }, + { + "epoch": 0.38399222294232016, + "grad_norm": 0.5701562762260437, + "learning_rate": 9.750394180161913e-06, + "loss": 0.1589, + "step": 1185 + }, + { + "epoch": 0.3843162670123137, + "grad_norm": 0.583560049533844, + "learning_rate": 9.749848116560576e-06, + "loss": 0.1691, + "step": 1186 + }, + { + "epoch": 0.3846403110823072, + "grad_norm": 0.613233745098114, + "learning_rate": 9.749301471623813e-06, + "loss": 0.1868, + "step": 1187 + }, + { + "epoch": 0.3849643551523007, + "grad_norm": 0.5886712074279785, + "learning_rate": 9.748754245418526e-06, + "loss": 0.1795, + "step": 1188 + }, + { + "epoch": 0.38528839922229424, + "grad_norm": 0.5409032106399536, + "learning_rate": 9.74820643801169e-06, + "loss": 0.1618, + "step": 1189 + }, + { + "epoch": 0.3856124432922878, + "grad_norm": 0.5986441373825073, + "learning_rate": 9.747658049470353e-06, + "loss": 0.1643, + "step": 1190 + }, + { + "epoch": 0.38593648736228126, + "grad_norm": 0.5554662346839905, + "learning_rate": 9.74710907986163e-06, + "loss": 0.1652, + "step": 1191 + }, + { + "epoch": 0.3862605314322748, + "grad_norm": 0.578743040561676, + "learning_rate": 9.746559529252713e-06, + "loss": 0.1698, + "step": 1192 + }, + { + "epoch": 0.3865845755022683, + "grad_norm": 0.6013315916061401, + "learning_rate": 9.74600939771086e-06, + "loss": 0.1797, + "step": 1193 + }, + { + "epoch": 0.3869086195722618, + "grad_norm": 0.5931842923164368, + "learning_rate": 9.745458685303402e-06, + "loss": 0.1741, + "step": 1194 + }, + { + "epoch": 0.38723266364225534, + "grad_norm": 0.6375784873962402, + "learning_rate": 9.74490739209774e-06, + "loss": 0.1929, + "step": 1195 + }, + { + "epoch": 0.38755670771224887, + "grad_norm": 0.6102494597434998, + "learning_rate": 9.744355518161346e-06, + "loss": 0.1677, + "step": 1196 + }, + { + "epoch": 0.3878807517822424, + "grad_norm": 0.5642244219779968, + "learning_rate": 9.743803063561767e-06, + "loss": 0.176, + "step": 1197 + }, + { + "epoch": 0.3882047958522359, + "grad_norm": 0.5792483687400818, + "learning_rate": 9.743250028366615e-06, + "loss": 0.1693, + "step": 1198 + }, + { + "epoch": 0.3885288399222294, + "grad_norm": 0.5659141540527344, + "learning_rate": 9.742696412643579e-06, + "loss": 0.1578, + "step": 1199 + }, + { + "epoch": 0.38885288399222295, + "grad_norm": 0.5681812763214111, + "learning_rate": 9.742142216460416e-06, + "loss": 0.1593, + "step": 1200 + }, + { + "epoch": 0.3891769280622165, + "grad_norm": 0.5868738889694214, + "learning_rate": 9.741587439884951e-06, + "loss": 0.1765, + "step": 1201 + }, + { + "epoch": 0.38950097213220997, + "grad_norm": 0.5223450064659119, + "learning_rate": 9.741032082985086e-06, + "loss": 0.1612, + "step": 1202 + }, + { + "epoch": 0.3898250162022035, + "grad_norm": 0.6357806921005249, + "learning_rate": 9.740476145828792e-06, + "loss": 0.1856, + "step": 1203 + }, + { + "epoch": 0.39014906027219703, + "grad_norm": 0.5512497425079346, + "learning_rate": 9.739919628484108e-06, + "loss": 0.1589, + "step": 1204 + }, + { + "epoch": 0.3904731043421905, + "grad_norm": 0.5599070191383362, + "learning_rate": 9.739362531019149e-06, + "loss": 0.1663, + "step": 1205 + }, + { + "epoch": 0.39079714841218405, + "grad_norm": 0.5793858766555786, + "learning_rate": 9.738804853502095e-06, + "loss": 0.1649, + "step": 1206 + }, + { + "epoch": 0.3911211924821776, + "grad_norm": 0.5827155709266663, + "learning_rate": 9.738246596001203e-06, + "loss": 0.1746, + "step": 1207 + }, + { + "epoch": 0.3914452365521711, + "grad_norm": 0.6015954613685608, + "learning_rate": 9.737687758584797e-06, + "loss": 0.1794, + "step": 1208 + }, + { + "epoch": 0.3917692806221646, + "grad_norm": 0.5880870819091797, + "learning_rate": 9.737128341321274e-06, + "loss": 0.183, + "step": 1209 + }, + { + "epoch": 0.39209332469215813, + "grad_norm": 0.5348117351531982, + "learning_rate": 9.736568344279101e-06, + "loss": 0.1472, + "step": 1210 + }, + { + "epoch": 0.39241736876215166, + "grad_norm": 0.5760438442230225, + "learning_rate": 9.736007767526817e-06, + "loss": 0.16, + "step": 1211 + }, + { + "epoch": 0.3927414128321452, + "grad_norm": 0.5510878562927246, + "learning_rate": 9.735446611133029e-06, + "loss": 0.1674, + "step": 1212 + }, + { + "epoch": 0.3930654569021387, + "grad_norm": 0.609878420829773, + "learning_rate": 9.73488487516642e-06, + "loss": 0.1895, + "step": 1213 + }, + { + "epoch": 0.3933895009721322, + "grad_norm": 0.6492441296577454, + "learning_rate": 9.734322559695737e-06, + "loss": 0.1774, + "step": 1214 + }, + { + "epoch": 0.39371354504212575, + "grad_norm": 0.5391560792922974, + "learning_rate": 9.733759664789807e-06, + "loss": 0.1611, + "step": 1215 + }, + { + "epoch": 0.3940375891121192, + "grad_norm": 0.5455146431922913, + "learning_rate": 9.73319619051752e-06, + "loss": 0.1722, + "step": 1216 + }, + { + "epoch": 0.39436163318211276, + "grad_norm": 0.595698356628418, + "learning_rate": 9.732632136947838e-06, + "loss": 0.1833, + "step": 1217 + }, + { + "epoch": 0.3946856772521063, + "grad_norm": 0.563563346862793, + "learning_rate": 9.7320675041498e-06, + "loss": 0.159, + "step": 1218 + }, + { + "epoch": 0.3950097213220998, + "grad_norm": 0.5537634491920471, + "learning_rate": 9.73150229219251e-06, + "loss": 0.1598, + "step": 1219 + }, + { + "epoch": 0.3953337653920933, + "grad_norm": 0.5871871113777161, + "learning_rate": 9.730936501145141e-06, + "loss": 0.1868, + "step": 1220 + }, + { + "epoch": 0.39565780946208684, + "grad_norm": 0.526485800743103, + "learning_rate": 9.730370131076945e-06, + "loss": 0.146, + "step": 1221 + }, + { + "epoch": 0.3959818535320804, + "grad_norm": 0.5196928381919861, + "learning_rate": 9.72980318205724e-06, + "loss": 0.1515, + "step": 1222 + }, + { + "epoch": 0.3963058976020739, + "grad_norm": 0.5631489157676697, + "learning_rate": 9.729235654155411e-06, + "loss": 0.1723, + "step": 1223 + }, + { + "epoch": 0.3966299416720674, + "grad_norm": 0.5770860910415649, + "learning_rate": 9.728667547440923e-06, + "loss": 0.1733, + "step": 1224 + }, + { + "epoch": 0.3969539857420609, + "grad_norm": 0.5769538879394531, + "learning_rate": 9.728098861983301e-06, + "loss": 0.1568, + "step": 1225 + }, + { + "epoch": 0.39727802981205446, + "grad_norm": 0.5829085111618042, + "learning_rate": 9.727529597852152e-06, + "loss": 0.166, + "step": 1226 + }, + { + "epoch": 0.39760207388204793, + "grad_norm": 0.6667234897613525, + "learning_rate": 9.726959755117146e-06, + "loss": 0.191, + "step": 1227 + }, + { + "epoch": 0.39792611795204147, + "grad_norm": 0.5731337666511536, + "learning_rate": 9.726389333848026e-06, + "loss": 0.1598, + "step": 1228 + }, + { + "epoch": 0.398250162022035, + "grad_norm": 0.5425414443016052, + "learning_rate": 9.725818334114608e-06, + "loss": 0.1528, + "step": 1229 + }, + { + "epoch": 0.39857420609202854, + "grad_norm": 0.5781131982803345, + "learning_rate": 9.725246755986774e-06, + "loss": 0.1803, + "step": 1230 + }, + { + "epoch": 0.398898250162022, + "grad_norm": 0.5605859756469727, + "learning_rate": 9.724674599534481e-06, + "loss": 0.1525, + "step": 1231 + }, + { + "epoch": 0.39922229423201555, + "grad_norm": 0.5850882530212402, + "learning_rate": 9.724101864827756e-06, + "loss": 0.1627, + "step": 1232 + }, + { + "epoch": 0.3995463383020091, + "grad_norm": 0.5748516321182251, + "learning_rate": 9.723528551936695e-06, + "loss": 0.1667, + "step": 1233 + }, + { + "epoch": 0.3998703823720026, + "grad_norm": 0.5731942653656006, + "learning_rate": 9.722954660931468e-06, + "loss": 0.1682, + "step": 1234 + }, + { + "epoch": 0.4001944264419961, + "grad_norm": 0.560596227645874, + "learning_rate": 9.72238019188231e-06, + "loss": 0.1582, + "step": 1235 + }, + { + "epoch": 0.40051847051198963, + "grad_norm": 0.6035265922546387, + "learning_rate": 9.721805144859533e-06, + "loss": 0.1714, + "step": 1236 + }, + { + "epoch": 0.40084251458198317, + "grad_norm": 0.5419968962669373, + "learning_rate": 9.72122951993352e-06, + "loss": 0.1735, + "step": 1237 + }, + { + "epoch": 0.40116655865197665, + "grad_norm": 0.56743985414505, + "learning_rate": 9.720653317174716e-06, + "loss": 0.1717, + "step": 1238 + }, + { + "epoch": 0.4014906027219702, + "grad_norm": 0.5820173025131226, + "learning_rate": 9.720076536653646e-06, + "loss": 0.1817, + "step": 1239 + }, + { + "epoch": 0.4018146467919637, + "grad_norm": 0.5778538584709167, + "learning_rate": 9.7194991784409e-06, + "loss": 0.1694, + "step": 1240 + }, + { + "epoch": 0.40213869086195725, + "grad_norm": 0.58612060546875, + "learning_rate": 9.718921242607145e-06, + "loss": 0.1799, + "step": 1241 + }, + { + "epoch": 0.4024627349319507, + "grad_norm": 0.5183675289154053, + "learning_rate": 9.718342729223112e-06, + "loss": 0.1576, + "step": 1242 + }, + { + "epoch": 0.40278677900194426, + "grad_norm": 0.5352326035499573, + "learning_rate": 9.717763638359607e-06, + "loss": 0.1736, + "step": 1243 + }, + { + "epoch": 0.4031108230719378, + "grad_norm": 0.5465376973152161, + "learning_rate": 9.717183970087503e-06, + "loss": 0.1596, + "step": 1244 + }, + { + "epoch": 0.40343486714193133, + "grad_norm": 0.5443048477172852, + "learning_rate": 9.716603724477748e-06, + "loss": 0.1672, + "step": 1245 + }, + { + "epoch": 0.4037589112119248, + "grad_norm": 0.5582151412963867, + "learning_rate": 9.716022901601356e-06, + "loss": 0.1662, + "step": 1246 + }, + { + "epoch": 0.40408295528191834, + "grad_norm": 0.5699895620346069, + "learning_rate": 9.715441501529417e-06, + "loss": 0.1691, + "step": 1247 + }, + { + "epoch": 0.4044069993519119, + "grad_norm": 0.5725207924842834, + "learning_rate": 9.714859524333086e-06, + "loss": 0.1641, + "step": 1248 + }, + { + "epoch": 0.40473104342190536, + "grad_norm": 0.5575113892555237, + "learning_rate": 9.714276970083594e-06, + "loss": 0.1668, + "step": 1249 + }, + { + "epoch": 0.4050550874918989, + "grad_norm": 0.5504116415977478, + "learning_rate": 9.713693838852236e-06, + "loss": 0.1576, + "step": 1250 + }, + { + "epoch": 0.4053791315618924, + "grad_norm": 0.606964111328125, + "learning_rate": 9.713110130710387e-06, + "loss": 0.1731, + "step": 1251 + }, + { + "epoch": 0.40570317563188596, + "grad_norm": 0.5411376953125, + "learning_rate": 9.712525845729483e-06, + "loss": 0.1648, + "step": 1252 + }, + { + "epoch": 0.40602721970187944, + "grad_norm": 0.576723575592041, + "learning_rate": 9.711940983981036e-06, + "loss": 0.1628, + "step": 1253 + }, + { + "epoch": 0.406351263771873, + "grad_norm": 0.5311942100524902, + "learning_rate": 9.71135554553663e-06, + "loss": 0.1584, + "step": 1254 + }, + { + "epoch": 0.4066753078418665, + "grad_norm": 0.5140619277954102, + "learning_rate": 9.710769530467912e-06, + "loss": 0.1626, + "step": 1255 + }, + { + "epoch": 0.40699935191186, + "grad_norm": 0.5483884215354919, + "learning_rate": 9.710182938846609e-06, + "loss": 0.1579, + "step": 1256 + }, + { + "epoch": 0.4073233959818535, + "grad_norm": 0.5419959425926208, + "learning_rate": 9.70959577074451e-06, + "loss": 0.1613, + "step": 1257 + }, + { + "epoch": 0.40764744005184705, + "grad_norm": 0.6026223301887512, + "learning_rate": 9.709008026233483e-06, + "loss": 0.1801, + "step": 1258 + }, + { + "epoch": 0.4079714841218406, + "grad_norm": 0.5318058133125305, + "learning_rate": 9.708419705385461e-06, + "loss": 0.1456, + "step": 1259 + }, + { + "epoch": 0.40829552819183407, + "grad_norm": 0.5593992471694946, + "learning_rate": 9.707830808272446e-06, + "loss": 0.1605, + "step": 1260 + }, + { + "epoch": 0.4086195722618276, + "grad_norm": 0.5445616841316223, + "learning_rate": 9.707241334966517e-06, + "loss": 0.1635, + "step": 1261 + }, + { + "epoch": 0.40894361633182114, + "grad_norm": 0.5666195750236511, + "learning_rate": 9.706651285539817e-06, + "loss": 0.1587, + "step": 1262 + }, + { + "epoch": 0.40926766040181467, + "grad_norm": 0.6224789619445801, + "learning_rate": 9.706060660064565e-06, + "loss": 0.1945, + "step": 1263 + }, + { + "epoch": 0.40959170447180815, + "grad_norm": 0.5528307557106018, + "learning_rate": 9.705469458613046e-06, + "loss": 0.1605, + "step": 1264 + }, + { + "epoch": 0.4099157485418017, + "grad_norm": 0.582804799079895, + "learning_rate": 9.704877681257616e-06, + "loss": 0.1705, + "step": 1265 + }, + { + "epoch": 0.4102397926117952, + "grad_norm": 0.5233073234558105, + "learning_rate": 9.704285328070706e-06, + "loss": 0.1484, + "step": 1266 + }, + { + "epoch": 0.4105638366817887, + "grad_norm": 0.546563982963562, + "learning_rate": 9.703692399124813e-06, + "loss": 0.1607, + "step": 1267 + }, + { + "epoch": 0.41088788075178223, + "grad_norm": 0.5636500716209412, + "learning_rate": 9.703098894492506e-06, + "loss": 0.1601, + "step": 1268 + }, + { + "epoch": 0.41121192482177576, + "grad_norm": 0.5653511881828308, + "learning_rate": 9.702504814246423e-06, + "loss": 0.174, + "step": 1269 + }, + { + "epoch": 0.4115359688917693, + "grad_norm": 0.555579423904419, + "learning_rate": 9.701910158459275e-06, + "loss": 0.1686, + "step": 1270 + }, + { + "epoch": 0.4118600129617628, + "grad_norm": 0.5712722539901733, + "learning_rate": 9.701314927203841e-06, + "loss": 0.1605, + "step": 1271 + }, + { + "epoch": 0.4121840570317563, + "grad_norm": 0.5150348544120789, + "learning_rate": 9.700719120552972e-06, + "loss": 0.1507, + "step": 1272 + }, + { + "epoch": 0.41250810110174985, + "grad_norm": 0.5115662217140198, + "learning_rate": 9.70012273857959e-06, + "loss": 0.1543, + "step": 1273 + }, + { + "epoch": 0.4128321451717434, + "grad_norm": 0.5579255223274231, + "learning_rate": 9.699525781356685e-06, + "loss": 0.1775, + "step": 1274 + }, + { + "epoch": 0.41315618924173686, + "grad_norm": 0.5776585936546326, + "learning_rate": 9.69892824895732e-06, + "loss": 0.1683, + "step": 1275 + }, + { + "epoch": 0.4134802333117304, + "grad_norm": 0.6621776819229126, + "learning_rate": 9.698330141454626e-06, + "loss": 0.1793, + "step": 1276 + }, + { + "epoch": 0.41380427738172393, + "grad_norm": 0.5133375525474548, + "learning_rate": 9.697731458921806e-06, + "loss": 0.1442, + "step": 1277 + }, + { + "epoch": 0.4141283214517174, + "grad_norm": 0.5586656332015991, + "learning_rate": 9.697132201432133e-06, + "loss": 0.147, + "step": 1278 + }, + { + "epoch": 0.41445236552171094, + "grad_norm": 0.5529789328575134, + "learning_rate": 9.69653236905895e-06, + "loss": 0.1671, + "step": 1279 + }, + { + "epoch": 0.4147764095917045, + "grad_norm": 0.6018351912498474, + "learning_rate": 9.695931961875673e-06, + "loss": 0.1849, + "step": 1280 + }, + { + "epoch": 0.415100453661698, + "grad_norm": 0.5406911373138428, + "learning_rate": 9.695330979955782e-06, + "loss": 0.1519, + "step": 1281 + }, + { + "epoch": 0.4154244977316915, + "grad_norm": 0.5286436676979065, + "learning_rate": 9.694729423372834e-06, + "loss": 0.1592, + "step": 1282 + }, + { + "epoch": 0.415748541801685, + "grad_norm": 0.5377227067947388, + "learning_rate": 9.694127292200452e-06, + "loss": 0.1658, + "step": 1283 + }, + { + "epoch": 0.41607258587167856, + "grad_norm": 0.5881343483924866, + "learning_rate": 9.693524586512333e-06, + "loss": 0.1728, + "step": 1284 + }, + { + "epoch": 0.4163966299416721, + "grad_norm": 0.5579766035079956, + "learning_rate": 9.692921306382241e-06, + "loss": 0.1643, + "step": 1285 + }, + { + "epoch": 0.41672067401166557, + "grad_norm": 0.5597804188728333, + "learning_rate": 9.692317451884012e-06, + "loss": 0.1746, + "step": 1286 + }, + { + "epoch": 0.4170447180816591, + "grad_norm": 0.5385024547576904, + "learning_rate": 9.691713023091554e-06, + "loss": 0.1738, + "step": 1287 + }, + { + "epoch": 0.41736876215165264, + "grad_norm": 0.5481756329536438, + "learning_rate": 9.69110802007884e-06, + "loss": 0.1706, + "step": 1288 + }, + { + "epoch": 0.4176928062216461, + "grad_norm": 0.5710784196853638, + "learning_rate": 9.690502442919917e-06, + "loss": 0.1636, + "step": 1289 + }, + { + "epoch": 0.41801685029163965, + "grad_norm": 0.5947467088699341, + "learning_rate": 9.689896291688903e-06, + "loss": 0.1819, + "step": 1290 + }, + { + "epoch": 0.4183408943616332, + "grad_norm": 0.5301339626312256, + "learning_rate": 9.689289566459986e-06, + "loss": 0.1426, + "step": 1291 + }, + { + "epoch": 0.4186649384316267, + "grad_norm": 0.606670081615448, + "learning_rate": 9.688682267307418e-06, + "loss": 0.1724, + "step": 1292 + }, + { + "epoch": 0.4189889825016202, + "grad_norm": 0.5852813124656677, + "learning_rate": 9.688074394305535e-06, + "loss": 0.1906, + "step": 1293 + }, + { + "epoch": 0.41931302657161373, + "grad_norm": 0.5888477563858032, + "learning_rate": 9.687465947528727e-06, + "loss": 0.1713, + "step": 1294 + }, + { + "epoch": 0.41963707064160727, + "grad_norm": 0.5683367252349854, + "learning_rate": 9.686856927051467e-06, + "loss": 0.1661, + "step": 1295 + }, + { + "epoch": 0.4199611147116008, + "grad_norm": 0.5632486939430237, + "learning_rate": 9.686247332948291e-06, + "loss": 0.168, + "step": 1296 + }, + { + "epoch": 0.4202851587815943, + "grad_norm": 0.5726707577705383, + "learning_rate": 9.685637165293808e-06, + "loss": 0.1651, + "step": 1297 + }, + { + "epoch": 0.4206092028515878, + "grad_norm": 0.5684093832969666, + "learning_rate": 9.685026424162696e-06, + "loss": 0.1693, + "step": 1298 + }, + { + "epoch": 0.42093324692158135, + "grad_norm": 0.6308449506759644, + "learning_rate": 9.684415109629705e-06, + "loss": 0.1767, + "step": 1299 + }, + { + "epoch": 0.42125729099157483, + "grad_norm": 0.5411693453788757, + "learning_rate": 9.683803221769651e-06, + "loss": 0.1509, + "step": 1300 + }, + { + "epoch": 0.42158133506156836, + "grad_norm": 0.5787219405174255, + "learning_rate": 9.683190760657428e-06, + "loss": 0.1796, + "step": 1301 + }, + { + "epoch": 0.4219053791315619, + "grad_norm": 0.5470903515815735, + "learning_rate": 9.682577726367993e-06, + "loss": 0.1657, + "step": 1302 + }, + { + "epoch": 0.42222942320155543, + "grad_norm": 0.547219455242157, + "learning_rate": 9.681964118976372e-06, + "loss": 0.1653, + "step": 1303 + }, + { + "epoch": 0.4225534672715489, + "grad_norm": 0.5840909481048584, + "learning_rate": 9.681349938557672e-06, + "loss": 0.1736, + "step": 1304 + }, + { + "epoch": 0.42287751134154244, + "grad_norm": 0.5541393756866455, + "learning_rate": 9.680735185187055e-06, + "loss": 0.1623, + "step": 1305 + }, + { + "epoch": 0.423201555411536, + "grad_norm": 0.5564125776290894, + "learning_rate": 9.680119858939765e-06, + "loss": 0.1664, + "step": 1306 + }, + { + "epoch": 0.4235255994815295, + "grad_norm": 0.5307673215866089, + "learning_rate": 9.679503959891112e-06, + "loss": 0.1521, + "step": 1307 + }, + { + "epoch": 0.423849643551523, + "grad_norm": 0.5410294532775879, + "learning_rate": 9.678887488116476e-06, + "loss": 0.1498, + "step": 1308 + }, + { + "epoch": 0.4241736876215165, + "grad_norm": 0.5974996089935303, + "learning_rate": 9.678270443691307e-06, + "loss": 0.1701, + "step": 1309 + }, + { + "epoch": 0.42449773169151006, + "grad_norm": 0.5369963049888611, + "learning_rate": 9.677652826691122e-06, + "loss": 0.1584, + "step": 1310 + }, + { + "epoch": 0.42482177576150354, + "grad_norm": 0.583291232585907, + "learning_rate": 9.677034637191516e-06, + "loss": 0.1696, + "step": 1311 + }, + { + "epoch": 0.4251458198314971, + "grad_norm": 0.6203243732452393, + "learning_rate": 9.676415875268147e-06, + "loss": 0.1773, + "step": 1312 + }, + { + "epoch": 0.4254698639014906, + "grad_norm": 0.6253352165222168, + "learning_rate": 9.675796540996747e-06, + "loss": 0.165, + "step": 1313 + }, + { + "epoch": 0.42579390797148414, + "grad_norm": 0.5516331791877747, + "learning_rate": 9.675176634453117e-06, + "loss": 0.1637, + "step": 1314 + }, + { + "epoch": 0.4261179520414776, + "grad_norm": 0.5999361872673035, + "learning_rate": 9.674556155713125e-06, + "loss": 0.1729, + "step": 1315 + }, + { + "epoch": 0.42644199611147116, + "grad_norm": 0.589998185634613, + "learning_rate": 9.67393510485271e-06, + "loss": 0.1827, + "step": 1316 + }, + { + "epoch": 0.4267660401814647, + "grad_norm": 0.5959166884422302, + "learning_rate": 9.673313481947888e-06, + "loss": 0.1542, + "step": 1317 + }, + { + "epoch": 0.4270900842514582, + "grad_norm": 0.5973237752914429, + "learning_rate": 9.672691287074736e-06, + "loss": 0.165, + "step": 1318 + }, + { + "epoch": 0.4274141283214517, + "grad_norm": 0.5623382329940796, + "learning_rate": 9.672068520309408e-06, + "loss": 0.1741, + "step": 1319 + }, + { + "epoch": 0.42773817239144524, + "grad_norm": 0.5273089408874512, + "learning_rate": 9.67144518172812e-06, + "loss": 0.1554, + "step": 1320 + }, + { + "epoch": 0.42806221646143877, + "grad_norm": 0.5496593713760376, + "learning_rate": 9.670821271407164e-06, + "loss": 0.1631, + "step": 1321 + }, + { + "epoch": 0.42838626053143225, + "grad_norm": 0.5944601893424988, + "learning_rate": 9.670196789422903e-06, + "loss": 0.164, + "step": 1322 + }, + { + "epoch": 0.4287103046014258, + "grad_norm": 0.5003495812416077, + "learning_rate": 9.669571735851766e-06, + "loss": 0.146, + "step": 1323 + }, + { + "epoch": 0.4290343486714193, + "grad_norm": 0.5393743515014648, + "learning_rate": 9.668946110770255e-06, + "loss": 0.164, + "step": 1324 + }, + { + "epoch": 0.42935839274141285, + "grad_norm": 0.5703218579292297, + "learning_rate": 9.668319914254936e-06, + "loss": 0.1682, + "step": 1325 + }, + { + "epoch": 0.42968243681140633, + "grad_norm": 0.5486487746238708, + "learning_rate": 9.667693146382453e-06, + "loss": 0.1702, + "step": 1326 + }, + { + "epoch": 0.43000648088139987, + "grad_norm": 0.5758382081985474, + "learning_rate": 9.667065807229516e-06, + "loss": 0.1617, + "step": 1327 + }, + { + "epoch": 0.4303305249513934, + "grad_norm": 0.5864109992980957, + "learning_rate": 9.666437896872905e-06, + "loss": 0.1724, + "step": 1328 + }, + { + "epoch": 0.43065456902138693, + "grad_norm": 0.5836167931556702, + "learning_rate": 9.665809415389471e-06, + "loss": 0.1737, + "step": 1329 + }, + { + "epoch": 0.4309786130913804, + "grad_norm": 0.5557619333267212, + "learning_rate": 9.665180362856132e-06, + "loss": 0.1607, + "step": 1330 + }, + { + "epoch": 0.43130265716137395, + "grad_norm": 0.5266228914260864, + "learning_rate": 9.664550739349878e-06, + "loss": 0.1708, + "step": 1331 + }, + { + "epoch": 0.4316267012313675, + "grad_norm": 0.5815237164497375, + "learning_rate": 9.663920544947771e-06, + "loss": 0.1395, + "step": 1332 + }, + { + "epoch": 0.43195074530136096, + "grad_norm": 0.561438262462616, + "learning_rate": 9.663289779726941e-06, + "loss": 0.161, + "step": 1333 + }, + { + "epoch": 0.4322747893713545, + "grad_norm": 0.6023616790771484, + "learning_rate": 9.662658443764583e-06, + "loss": 0.1806, + "step": 1334 + }, + { + "epoch": 0.43259883344134803, + "grad_norm": 0.5528768301010132, + "learning_rate": 9.662026537137972e-06, + "loss": 0.1591, + "step": 1335 + }, + { + "epoch": 0.43292287751134156, + "grad_norm": 0.5628527998924255, + "learning_rate": 9.661394059924444e-06, + "loss": 0.1647, + "step": 1336 + }, + { + "epoch": 0.43324692158133504, + "grad_norm": 0.6160968542098999, + "learning_rate": 9.660761012201409e-06, + "loss": 0.1623, + "step": 1337 + }, + { + "epoch": 0.4335709656513286, + "grad_norm": 0.5395432114601135, + "learning_rate": 9.660127394046346e-06, + "loss": 0.1612, + "step": 1338 + }, + { + "epoch": 0.4338950097213221, + "grad_norm": 0.5769019722938538, + "learning_rate": 9.659493205536802e-06, + "loss": 0.1741, + "step": 1339 + }, + { + "epoch": 0.43421905379131565, + "grad_norm": 0.5266570448875427, + "learning_rate": 9.6588584467504e-06, + "loss": 0.1574, + "step": 1340 + }, + { + "epoch": 0.4345430978613091, + "grad_norm": 0.5223969221115112, + "learning_rate": 9.658223117764822e-06, + "loss": 0.1443, + "step": 1341 + }, + { + "epoch": 0.43486714193130266, + "grad_norm": 0.5220791101455688, + "learning_rate": 9.657587218657832e-06, + "loss": 0.1583, + "step": 1342 + }, + { + "epoch": 0.4351911860012962, + "grad_norm": 0.5109962224960327, + "learning_rate": 9.656950749507253e-06, + "loss": 0.1545, + "step": 1343 + }, + { + "epoch": 0.43551523007128967, + "grad_norm": 0.5817949771881104, + "learning_rate": 9.656313710390986e-06, + "loss": 0.1683, + "step": 1344 + }, + { + "epoch": 0.4358392741412832, + "grad_norm": 0.5956231355667114, + "learning_rate": 9.655676101386999e-06, + "loss": 0.1827, + "step": 1345 + }, + { + "epoch": 0.43616331821127674, + "grad_norm": 0.552972137928009, + "learning_rate": 9.655037922573325e-06, + "loss": 0.1603, + "step": 1346 + }, + { + "epoch": 0.4364873622812703, + "grad_norm": 0.5825382471084595, + "learning_rate": 9.654399174028077e-06, + "loss": 0.1782, + "step": 1347 + }, + { + "epoch": 0.43681140635126375, + "grad_norm": 0.564332902431488, + "learning_rate": 9.653759855829425e-06, + "loss": 0.1629, + "step": 1348 + }, + { + "epoch": 0.4371354504212573, + "grad_norm": 0.6328169703483582, + "learning_rate": 9.65311996805562e-06, + "loss": 0.1889, + "step": 1349 + }, + { + "epoch": 0.4374594944912508, + "grad_norm": 0.5259972214698792, + "learning_rate": 9.652479510784976e-06, + "loss": 0.1588, + "step": 1350 + }, + { + "epoch": 0.43778353856124436, + "grad_norm": 0.5787382125854492, + "learning_rate": 9.651838484095879e-06, + "loss": 0.1721, + "step": 1351 + }, + { + "epoch": 0.43810758263123784, + "grad_norm": 0.5727863907814026, + "learning_rate": 9.651196888066787e-06, + "loss": 0.1837, + "step": 1352 + }, + { + "epoch": 0.43843162670123137, + "grad_norm": 0.5699387192726135, + "learning_rate": 9.65055472277622e-06, + "loss": 0.1824, + "step": 1353 + }, + { + "epoch": 0.4387556707712249, + "grad_norm": 0.5503055453300476, + "learning_rate": 9.649911988302778e-06, + "loss": 0.1701, + "step": 1354 + }, + { + "epoch": 0.4390797148412184, + "grad_norm": 0.5899806022644043, + "learning_rate": 9.649268684725122e-06, + "loss": 0.1668, + "step": 1355 + }, + { + "epoch": 0.4394037589112119, + "grad_norm": 0.5935372710227966, + "learning_rate": 9.64862481212199e-06, + "loss": 0.1881, + "step": 1356 + }, + { + "epoch": 0.43972780298120545, + "grad_norm": 0.5293922424316406, + "learning_rate": 9.64798037057218e-06, + "loss": 0.1607, + "step": 1357 + }, + { + "epoch": 0.440051847051199, + "grad_norm": 0.5702993273735046, + "learning_rate": 9.64733536015457e-06, + "loss": 0.1626, + "step": 1358 + }, + { + "epoch": 0.44037589112119246, + "grad_norm": 0.5957580804824829, + "learning_rate": 9.646689780948101e-06, + "loss": 0.1715, + "step": 1359 + }, + { + "epoch": 0.440699935191186, + "grad_norm": 0.6047354936599731, + "learning_rate": 9.646043633031786e-06, + "loss": 0.185, + "step": 1360 + }, + { + "epoch": 0.44102397926117953, + "grad_norm": 0.5889037251472473, + "learning_rate": 9.645396916484709e-06, + "loss": 0.1695, + "step": 1361 + }, + { + "epoch": 0.44134802333117307, + "grad_norm": 0.5471667051315308, + "learning_rate": 9.64474963138602e-06, + "loss": 0.1498, + "step": 1362 + }, + { + "epoch": 0.44167206740116655, + "grad_norm": 0.5360984206199646, + "learning_rate": 9.644101777814939e-06, + "loss": 0.1655, + "step": 1363 + }, + { + "epoch": 0.4419961114711601, + "grad_norm": 0.563440203666687, + "learning_rate": 9.64345335585076e-06, + "loss": 0.1502, + "step": 1364 + }, + { + "epoch": 0.4423201555411536, + "grad_norm": 0.5638374090194702, + "learning_rate": 9.642804365572841e-06, + "loss": 0.1567, + "step": 1365 + }, + { + "epoch": 0.4426441996111471, + "grad_norm": 0.5967015027999878, + "learning_rate": 9.642154807060617e-06, + "loss": 0.1721, + "step": 1366 + }, + { + "epoch": 0.4429682436811406, + "grad_norm": 0.5432306528091431, + "learning_rate": 9.64150468039358e-06, + "loss": 0.1659, + "step": 1367 + }, + { + "epoch": 0.44329228775113416, + "grad_norm": 0.5840265154838562, + "learning_rate": 9.640853985651306e-06, + "loss": 0.1726, + "step": 1368 + }, + { + "epoch": 0.4436163318211277, + "grad_norm": 0.5982809066772461, + "learning_rate": 9.64020272291343e-06, + "loss": 0.1846, + "step": 1369 + }, + { + "epoch": 0.4439403758911212, + "grad_norm": 0.5622856616973877, + "learning_rate": 9.639550892259663e-06, + "loss": 0.1789, + "step": 1370 + }, + { + "epoch": 0.4442644199611147, + "grad_norm": 0.580660343170166, + "learning_rate": 9.638898493769779e-06, + "loss": 0.1592, + "step": 1371 + }, + { + "epoch": 0.44458846403110824, + "grad_norm": 0.5534035563468933, + "learning_rate": 9.638245527523629e-06, + "loss": 0.153, + "step": 1372 + }, + { + "epoch": 0.4449125081011017, + "grad_norm": 0.5400548577308655, + "learning_rate": 9.637591993601127e-06, + "loss": 0.1607, + "step": 1373 + }, + { + "epoch": 0.44523655217109526, + "grad_norm": 0.5466241836547852, + "learning_rate": 9.636937892082261e-06, + "loss": 0.1647, + "step": 1374 + }, + { + "epoch": 0.4455605962410888, + "grad_norm": 0.5731942057609558, + "learning_rate": 9.636283223047087e-06, + "loss": 0.1773, + "step": 1375 + }, + { + "epoch": 0.4458846403110823, + "grad_norm": 0.5324536561965942, + "learning_rate": 9.635627986575727e-06, + "loss": 0.1694, + "step": 1376 + }, + { + "epoch": 0.4462086843810758, + "grad_norm": 0.5857275128364563, + "learning_rate": 9.634972182748378e-06, + "loss": 0.16, + "step": 1377 + }, + { + "epoch": 0.44653272845106934, + "grad_norm": 0.5256370306015015, + "learning_rate": 9.634315811645305e-06, + "loss": 0.148, + "step": 1378 + }, + { + "epoch": 0.4468567725210629, + "grad_norm": 0.5678802132606506, + "learning_rate": 9.633658873346841e-06, + "loss": 0.1726, + "step": 1379 + }, + { + "epoch": 0.4471808165910564, + "grad_norm": 0.5549466609954834, + "learning_rate": 9.633001367933387e-06, + "loss": 0.1682, + "step": 1380 + }, + { + "epoch": 0.4475048606610499, + "grad_norm": 0.5725024342536926, + "learning_rate": 9.632343295485416e-06, + "loss": 0.1621, + "step": 1381 + }, + { + "epoch": 0.4478289047310434, + "grad_norm": 0.5248334407806396, + "learning_rate": 9.631684656083472e-06, + "loss": 0.1604, + "step": 1382 + }, + { + "epoch": 0.44815294880103695, + "grad_norm": 0.5629571676254272, + "learning_rate": 9.631025449808163e-06, + "loss": 0.1697, + "step": 1383 + }, + { + "epoch": 0.44847699287103043, + "grad_norm": 0.5652890205383301, + "learning_rate": 9.63036567674017e-06, + "loss": 0.1517, + "step": 1384 + }, + { + "epoch": 0.44880103694102397, + "grad_norm": 0.6064323782920837, + "learning_rate": 9.629705336960244e-06, + "loss": 0.1673, + "step": 1385 + }, + { + "epoch": 0.4491250810110175, + "grad_norm": 0.55648273229599, + "learning_rate": 9.629044430549206e-06, + "loss": 0.1584, + "step": 1386 + }, + { + "epoch": 0.44944912508101104, + "grad_norm": 0.533288300037384, + "learning_rate": 9.62838295758794e-06, + "loss": 0.1635, + "step": 1387 + }, + { + "epoch": 0.4497731691510045, + "grad_norm": 0.5898700952529907, + "learning_rate": 9.627720918157407e-06, + "loss": 0.1837, + "step": 1388 + }, + { + "epoch": 0.45009721322099805, + "grad_norm": 0.56904536485672, + "learning_rate": 9.627058312338634e-06, + "loss": 0.1591, + "step": 1389 + }, + { + "epoch": 0.4504212572909916, + "grad_norm": 0.5584230422973633, + "learning_rate": 9.626395140212714e-06, + "loss": 0.1632, + "step": 1390 + }, + { + "epoch": 0.4507453013609851, + "grad_norm": 0.5398111939430237, + "learning_rate": 9.625731401860819e-06, + "loss": 0.1592, + "step": 1391 + }, + { + "epoch": 0.4510693454309786, + "grad_norm": 0.5425682663917542, + "learning_rate": 9.625067097364181e-06, + "loss": 0.1528, + "step": 1392 + }, + { + "epoch": 0.45139338950097213, + "grad_norm": 0.5614835619926453, + "learning_rate": 9.624402226804101e-06, + "loss": 0.1521, + "step": 1393 + }, + { + "epoch": 0.45171743357096567, + "grad_norm": 0.5337006449699402, + "learning_rate": 9.623736790261959e-06, + "loss": 0.1518, + "step": 1394 + }, + { + "epoch": 0.45204147764095914, + "grad_norm": 0.5660865902900696, + "learning_rate": 9.623070787819195e-06, + "loss": 0.1563, + "step": 1395 + }, + { + "epoch": 0.4523655217109527, + "grad_norm": 0.586026132106781, + "learning_rate": 9.622404219557322e-06, + "loss": 0.1907, + "step": 1396 + }, + { + "epoch": 0.4526895657809462, + "grad_norm": 0.5866766571998596, + "learning_rate": 9.621737085557918e-06, + "loss": 0.1645, + "step": 1397 + }, + { + "epoch": 0.45301360985093975, + "grad_norm": 0.5219502449035645, + "learning_rate": 9.62106938590264e-06, + "loss": 0.1652, + "step": 1398 + }, + { + "epoch": 0.4533376539209332, + "grad_norm": 0.5699257254600525, + "learning_rate": 9.620401120673202e-06, + "loss": 0.166, + "step": 1399 + }, + { + "epoch": 0.45366169799092676, + "grad_norm": 0.543327808380127, + "learning_rate": 9.619732289951399e-06, + "loss": 0.1655, + "step": 1400 + }, + { + "epoch": 0.4539857420609203, + "grad_norm": 0.5347340703010559, + "learning_rate": 9.619062893819082e-06, + "loss": 0.1605, + "step": 1401 + }, + { + "epoch": 0.45430978613091383, + "grad_norm": 0.571562647819519, + "learning_rate": 9.618392932358185e-06, + "loss": 0.184, + "step": 1402 + }, + { + "epoch": 0.4546338302009073, + "grad_norm": 0.5329867005348206, + "learning_rate": 9.617722405650702e-06, + "loss": 0.1566, + "step": 1403 + }, + { + "epoch": 0.45495787427090084, + "grad_norm": 0.5062045454978943, + "learning_rate": 9.6170513137787e-06, + "loss": 0.1566, + "step": 1404 + }, + { + "epoch": 0.4552819183408944, + "grad_norm": 0.5401381254196167, + "learning_rate": 9.616379656824314e-06, + "loss": 0.1468, + "step": 1405 + }, + { + "epoch": 0.45560596241088785, + "grad_norm": 0.5379137992858887, + "learning_rate": 9.615707434869748e-06, + "loss": 0.1692, + "step": 1406 + }, + { + "epoch": 0.4559300064808814, + "grad_norm": 0.6190140843391418, + "learning_rate": 9.615034647997274e-06, + "loss": 0.1794, + "step": 1407 + }, + { + "epoch": 0.4562540505508749, + "grad_norm": 0.5723645091056824, + "learning_rate": 9.614361296289239e-06, + "loss": 0.1594, + "step": 1408 + }, + { + "epoch": 0.45657809462086846, + "grad_norm": 0.5665557384490967, + "learning_rate": 9.61368737982805e-06, + "loss": 0.1563, + "step": 1409 + }, + { + "epoch": 0.45690213869086194, + "grad_norm": 0.571121335029602, + "learning_rate": 9.613012898696187e-06, + "loss": 0.167, + "step": 1410 + }, + { + "epoch": 0.45722618276085547, + "grad_norm": 0.5885666012763977, + "learning_rate": 9.612337852976207e-06, + "loss": 0.1704, + "step": 1411 + }, + { + "epoch": 0.457550226830849, + "grad_norm": 0.5780050754547119, + "learning_rate": 9.611662242750723e-06, + "loss": 0.1686, + "step": 1412 + }, + { + "epoch": 0.45787427090084254, + "grad_norm": 0.5515772700309753, + "learning_rate": 9.610986068102425e-06, + "loss": 0.1682, + "step": 1413 + }, + { + "epoch": 0.458198314970836, + "grad_norm": 0.5495187640190125, + "learning_rate": 9.610309329114069e-06, + "loss": 0.1633, + "step": 1414 + }, + { + "epoch": 0.45852235904082955, + "grad_norm": 0.5032498240470886, + "learning_rate": 9.609632025868484e-06, + "loss": 0.1419, + "step": 1415 + }, + { + "epoch": 0.4588464031108231, + "grad_norm": 0.5348735451698303, + "learning_rate": 9.608954158448563e-06, + "loss": 0.1721, + "step": 1416 + }, + { + "epoch": 0.45917044718081657, + "grad_norm": 0.4963846802711487, + "learning_rate": 9.608275726937271e-06, + "loss": 0.1469, + "step": 1417 + }, + { + "epoch": 0.4594944912508101, + "grad_norm": 0.6138606071472168, + "learning_rate": 9.607596731417643e-06, + "loss": 0.1589, + "step": 1418 + }, + { + "epoch": 0.45981853532080363, + "grad_norm": 0.5531137585639954, + "learning_rate": 9.606917171972778e-06, + "loss": 0.16, + "step": 1419 + }, + { + "epoch": 0.46014257939079717, + "grad_norm": 0.605338454246521, + "learning_rate": 9.60623704868585e-06, + "loss": 0.1865, + "step": 1420 + }, + { + "epoch": 0.46046662346079065, + "grad_norm": 0.5759372711181641, + "learning_rate": 9.6055563616401e-06, + "loss": 0.1586, + "step": 1421 + }, + { + "epoch": 0.4607906675307842, + "grad_norm": 0.5326551795005798, + "learning_rate": 9.604875110918836e-06, + "loss": 0.1545, + "step": 1422 + }, + { + "epoch": 0.4611147116007777, + "grad_norm": 0.5569169521331787, + "learning_rate": 9.604193296605437e-06, + "loss": 0.165, + "step": 1423 + }, + { + "epoch": 0.46143875567077125, + "grad_norm": 0.6162808537483215, + "learning_rate": 9.60351091878335e-06, + "loss": 0.2078, + "step": 1424 + }, + { + "epoch": 0.46176279974076473, + "grad_norm": 0.6384009122848511, + "learning_rate": 9.602827977536094e-06, + "loss": 0.1826, + "step": 1425 + }, + { + "epoch": 0.46208684381075826, + "grad_norm": 0.5420453548431396, + "learning_rate": 9.60214447294725e-06, + "loss": 0.163, + "step": 1426 + }, + { + "epoch": 0.4624108878807518, + "grad_norm": 0.5329408645629883, + "learning_rate": 9.601460405100475e-06, + "loss": 0.1625, + "step": 1427 + }, + { + "epoch": 0.4627349319507453, + "grad_norm": 0.5379124283790588, + "learning_rate": 9.600775774079493e-06, + "loss": 0.1621, + "step": 1428 + }, + { + "epoch": 0.4630589760207388, + "grad_norm": 0.5374495387077332, + "learning_rate": 9.600090579968095e-06, + "loss": 0.1557, + "step": 1429 + }, + { + "epoch": 0.46338302009073234, + "grad_norm": 0.5596601366996765, + "learning_rate": 9.599404822850142e-06, + "loss": 0.1518, + "step": 1430 + }, + { + "epoch": 0.4637070641607259, + "grad_norm": 0.6118326783180237, + "learning_rate": 9.598718502809565e-06, + "loss": 0.168, + "step": 1431 + }, + { + "epoch": 0.46403110823071936, + "grad_norm": 0.5268542766571045, + "learning_rate": 9.598031619930363e-06, + "loss": 0.1548, + "step": 1432 + }, + { + "epoch": 0.4643551523007129, + "grad_norm": 0.5452706813812256, + "learning_rate": 9.597344174296601e-06, + "loss": 0.1634, + "step": 1433 + }, + { + "epoch": 0.4646791963707064, + "grad_norm": 0.5382551550865173, + "learning_rate": 9.59665616599242e-06, + "loss": 0.1695, + "step": 1434 + }, + { + "epoch": 0.46500324044069996, + "grad_norm": 0.4947247803211212, + "learning_rate": 9.595967595102022e-06, + "loss": 0.1436, + "step": 1435 + }, + { + "epoch": 0.46532728451069344, + "grad_norm": 0.5326928496360779, + "learning_rate": 9.595278461709683e-06, + "loss": 0.1647, + "step": 1436 + }, + { + "epoch": 0.465651328580687, + "grad_norm": 0.5893187522888184, + "learning_rate": 9.594588765899746e-06, + "loss": 0.174, + "step": 1437 + }, + { + "epoch": 0.4659753726506805, + "grad_norm": 0.5314544439315796, + "learning_rate": 9.593898507756622e-06, + "loss": 0.1577, + "step": 1438 + }, + { + "epoch": 0.466299416720674, + "grad_norm": 0.5155757069587708, + "learning_rate": 9.593207687364795e-06, + "loss": 0.1537, + "step": 1439 + }, + { + "epoch": 0.4666234607906675, + "grad_norm": 0.5500592589378357, + "learning_rate": 9.592516304808811e-06, + "loss": 0.1613, + "step": 1440 + }, + { + "epoch": 0.46694750486066106, + "grad_norm": 0.5493935942649841, + "learning_rate": 9.591824360173292e-06, + "loss": 0.1682, + "step": 1441 + }, + { + "epoch": 0.4672715489306546, + "grad_norm": 0.5465030074119568, + "learning_rate": 9.591131853542922e-06, + "loss": 0.1592, + "step": 1442 + }, + { + "epoch": 0.46759559300064807, + "grad_norm": 0.5221512317657471, + "learning_rate": 9.59043878500246e-06, + "loss": 0.1636, + "step": 1443 + }, + { + "epoch": 0.4679196370706416, + "grad_norm": 0.5554530620574951, + "learning_rate": 9.589745154636729e-06, + "loss": 0.1619, + "step": 1444 + }, + { + "epoch": 0.46824368114063514, + "grad_norm": 0.5898672938346863, + "learning_rate": 9.589050962530624e-06, + "loss": 0.1744, + "step": 1445 + }, + { + "epoch": 0.46856772521062867, + "grad_norm": 0.5853576064109802, + "learning_rate": 9.588356208769108e-06, + "loss": 0.1782, + "step": 1446 + }, + { + "epoch": 0.46889176928062215, + "grad_norm": 0.5318002700805664, + "learning_rate": 9.587660893437207e-06, + "loss": 0.1556, + "step": 1447 + }, + { + "epoch": 0.4692158133506157, + "grad_norm": 0.5409642457962036, + "learning_rate": 9.586965016620027e-06, + "loss": 0.1627, + "step": 1448 + }, + { + "epoch": 0.4695398574206092, + "grad_norm": 0.49974918365478516, + "learning_rate": 9.586268578402734e-06, + "loss": 0.1451, + "step": 1449 + }, + { + "epoch": 0.4698639014906027, + "grad_norm": 0.5798755884170532, + "learning_rate": 9.585571578870565e-06, + "loss": 0.1572, + "step": 1450 + }, + { + "epoch": 0.47018794556059623, + "grad_norm": 0.4880210757255554, + "learning_rate": 9.584874018108827e-06, + "loss": 0.1434, + "step": 1451 + }, + { + "epoch": 0.47051198963058977, + "grad_norm": 0.5324985384941101, + "learning_rate": 9.584175896202893e-06, + "loss": 0.1514, + "step": 1452 + }, + { + "epoch": 0.4708360337005833, + "grad_norm": 0.5340385437011719, + "learning_rate": 9.58347721323821e-06, + "loss": 0.1642, + "step": 1453 + }, + { + "epoch": 0.4711600777705768, + "grad_norm": 0.5225486159324646, + "learning_rate": 9.582777969300286e-06, + "loss": 0.1619, + "step": 1454 + }, + { + "epoch": 0.4714841218405703, + "grad_norm": 0.5330610871315002, + "learning_rate": 9.582078164474704e-06, + "loss": 0.1476, + "step": 1455 + }, + { + "epoch": 0.47180816591056385, + "grad_norm": 0.5599969625473022, + "learning_rate": 9.581377798847111e-06, + "loss": 0.1815, + "step": 1456 + }, + { + "epoch": 0.4721322099805574, + "grad_norm": 0.4793117940425873, + "learning_rate": 9.580676872503227e-06, + "loss": 0.1414, + "step": 1457 + }, + { + "epoch": 0.47245625405055086, + "grad_norm": 0.5731749534606934, + "learning_rate": 9.57997538552884e-06, + "loss": 0.1748, + "step": 1458 + }, + { + "epoch": 0.4727802981205444, + "grad_norm": 0.5908327698707581, + "learning_rate": 9.579273338009803e-06, + "loss": 0.1724, + "step": 1459 + }, + { + "epoch": 0.47310434219053793, + "grad_norm": 0.5376626253128052, + "learning_rate": 9.578570730032039e-06, + "loss": 0.1558, + "step": 1460 + }, + { + "epoch": 0.4734283862605314, + "grad_norm": 0.532558023929596, + "learning_rate": 9.577867561681542e-06, + "loss": 0.1537, + "step": 1461 + }, + { + "epoch": 0.47375243033052494, + "grad_norm": 0.5878148078918457, + "learning_rate": 9.577163833044372e-06, + "loss": 0.1555, + "step": 1462 + }, + { + "epoch": 0.4740764744005185, + "grad_norm": 0.5399042367935181, + "learning_rate": 9.57645954420666e-06, + "loss": 0.165, + "step": 1463 + }, + { + "epoch": 0.474400518470512, + "grad_norm": 0.5977577567100525, + "learning_rate": 9.575754695254604e-06, + "loss": 0.1713, + "step": 1464 + }, + { + "epoch": 0.4747245625405055, + "grad_norm": 0.5240918397903442, + "learning_rate": 9.57504928627447e-06, + "loss": 0.1517, + "step": 1465 + }, + { + "epoch": 0.475048606610499, + "grad_norm": 0.5246391296386719, + "learning_rate": 9.574343317352593e-06, + "loss": 0.1582, + "step": 1466 + }, + { + "epoch": 0.47537265068049256, + "grad_norm": 0.5721025466918945, + "learning_rate": 9.573636788575376e-06, + "loss": 0.1815, + "step": 1467 + }, + { + "epoch": 0.4756966947504861, + "grad_norm": 0.5447560548782349, + "learning_rate": 9.572929700029292e-06, + "loss": 0.1665, + "step": 1468 + }, + { + "epoch": 0.47602073882047957, + "grad_norm": 0.5570774078369141, + "learning_rate": 9.572222051800884e-06, + "loss": 0.1615, + "step": 1469 + }, + { + "epoch": 0.4763447828904731, + "grad_norm": 0.5612460374832153, + "learning_rate": 9.571513843976758e-06, + "loss": 0.1724, + "step": 1470 + }, + { + "epoch": 0.47666882696046664, + "grad_norm": 0.5506044030189514, + "learning_rate": 9.570805076643595e-06, + "loss": 0.1629, + "step": 1471 + }, + { + "epoch": 0.4769928710304601, + "grad_norm": 0.5385878682136536, + "learning_rate": 9.570095749888138e-06, + "loss": 0.1643, + "step": 1472 + }, + { + "epoch": 0.47731691510045365, + "grad_norm": 0.5353468060493469, + "learning_rate": 9.569385863797202e-06, + "loss": 0.156, + "step": 1473 + }, + { + "epoch": 0.4776409591704472, + "grad_norm": 0.5617091655731201, + "learning_rate": 9.568675418457673e-06, + "loss": 0.1461, + "step": 1474 + }, + { + "epoch": 0.4779650032404407, + "grad_norm": 0.5673995018005371, + "learning_rate": 9.567964413956501e-06, + "loss": 0.1619, + "step": 1475 + }, + { + "epoch": 0.4782890473104342, + "grad_norm": 0.5591347217559814, + "learning_rate": 9.567252850380705e-06, + "loss": 0.1635, + "step": 1476 + }, + { + "epoch": 0.47861309138042774, + "grad_norm": 0.5915917754173279, + "learning_rate": 9.566540727817375e-06, + "loss": 0.1651, + "step": 1477 + }, + { + "epoch": 0.47893713545042127, + "grad_norm": 0.6004805564880371, + "learning_rate": 9.565828046353669e-06, + "loss": 0.1689, + "step": 1478 + }, + { + "epoch": 0.4792611795204148, + "grad_norm": 0.6274728775024414, + "learning_rate": 9.565114806076808e-06, + "loss": 0.1821, + "step": 1479 + }, + { + "epoch": 0.4795852235904083, + "grad_norm": 0.5600235462188721, + "learning_rate": 9.564401007074091e-06, + "loss": 0.1638, + "step": 1480 + }, + { + "epoch": 0.4799092676604018, + "grad_norm": 0.5907046794891357, + "learning_rate": 9.563686649432874e-06, + "loss": 0.16, + "step": 1481 + }, + { + "epoch": 0.48023331173039535, + "grad_norm": 0.5883347988128662, + "learning_rate": 9.562971733240595e-06, + "loss": 0.1844, + "step": 1482 + }, + { + "epoch": 0.48055735580038883, + "grad_norm": 0.5312012434005737, + "learning_rate": 9.562256258584749e-06, + "loss": 0.1545, + "step": 1483 + }, + { + "epoch": 0.48088139987038236, + "grad_norm": 0.520148754119873, + "learning_rate": 9.561540225552901e-06, + "loss": 0.1646, + "step": 1484 + }, + { + "epoch": 0.4812054439403759, + "grad_norm": 0.5252670049667358, + "learning_rate": 9.560823634232688e-06, + "loss": 0.1583, + "step": 1485 + }, + { + "epoch": 0.48152948801036943, + "grad_norm": 0.5525287985801697, + "learning_rate": 9.560106484711816e-06, + "loss": 0.178, + "step": 1486 + }, + { + "epoch": 0.4818535320803629, + "grad_norm": 0.5581156015396118, + "learning_rate": 9.559388777078054e-06, + "loss": 0.1762, + "step": 1487 + }, + { + "epoch": 0.48217757615035645, + "grad_norm": 0.5156373977661133, + "learning_rate": 9.558670511419246e-06, + "loss": 0.1664, + "step": 1488 + }, + { + "epoch": 0.48250162022035, + "grad_norm": 0.5569294095039368, + "learning_rate": 9.557951687823298e-06, + "loss": 0.1593, + "step": 1489 + }, + { + "epoch": 0.48282566429034346, + "grad_norm": 0.5495864152908325, + "learning_rate": 9.557232306378186e-06, + "loss": 0.1563, + "step": 1490 + }, + { + "epoch": 0.483149708360337, + "grad_norm": 0.5372532606124878, + "learning_rate": 9.556512367171959e-06, + "loss": 0.1602, + "step": 1491 + }, + { + "epoch": 0.48347375243033053, + "grad_norm": 0.5491994023323059, + "learning_rate": 9.555791870292727e-06, + "loss": 0.1668, + "step": 1492 + }, + { + "epoch": 0.48379779650032406, + "grad_norm": 0.5422798991203308, + "learning_rate": 9.555070815828676e-06, + "loss": 0.165, + "step": 1493 + }, + { + "epoch": 0.48412184057031754, + "grad_norm": 0.5528333187103271, + "learning_rate": 9.554349203868052e-06, + "loss": 0.1559, + "step": 1494 + }, + { + "epoch": 0.4844458846403111, + "grad_norm": 0.5231409668922424, + "learning_rate": 9.553627034499176e-06, + "loss": 0.1637, + "step": 1495 + }, + { + "epoch": 0.4847699287103046, + "grad_norm": 0.561600387096405, + "learning_rate": 9.552904307810432e-06, + "loss": 0.1691, + "step": 1496 + }, + { + "epoch": 0.48509397278029814, + "grad_norm": 0.6043953895568848, + "learning_rate": 9.552181023890277e-06, + "loss": 0.1861, + "step": 1497 + }, + { + "epoch": 0.4854180168502916, + "grad_norm": 0.5333933234214783, + "learning_rate": 9.551457182827233e-06, + "loss": 0.158, + "step": 1498 + }, + { + "epoch": 0.48574206092028516, + "grad_norm": 0.5546240210533142, + "learning_rate": 9.550732784709892e-06, + "loss": 0.1602, + "step": 1499 + }, + { + "epoch": 0.4860661049902787, + "grad_norm": 0.6027463674545288, + "learning_rate": 9.550007829626912e-06, + "loss": 0.1604, + "step": 1500 + }, + { + "epoch": 0.48639014906027217, + "grad_norm": 0.5604615807533264, + "learning_rate": 9.549282317667021e-06, + "loss": 0.1626, + "step": 1501 + }, + { + "epoch": 0.4867141931302657, + "grad_norm": 0.5694551467895508, + "learning_rate": 9.548556248919017e-06, + "loss": 0.1754, + "step": 1502 + }, + { + "epoch": 0.48703823720025924, + "grad_norm": 0.5441313982009888, + "learning_rate": 9.547829623471758e-06, + "loss": 0.1655, + "step": 1503 + }, + { + "epoch": 0.4873622812702528, + "grad_norm": 0.5215234756469727, + "learning_rate": 9.547102441414182e-06, + "loss": 0.1541, + "step": 1504 + }, + { + "epoch": 0.48768632534024625, + "grad_norm": 0.5377838611602783, + "learning_rate": 9.546374702835286e-06, + "loss": 0.1584, + "step": 1505 + }, + { + "epoch": 0.4880103694102398, + "grad_norm": 0.5404870510101318, + "learning_rate": 9.545646407824138e-06, + "loss": 0.1674, + "step": 1506 + }, + { + "epoch": 0.4883344134802333, + "grad_norm": 0.5669370889663696, + "learning_rate": 9.544917556469876e-06, + "loss": 0.1681, + "step": 1507 + }, + { + "epoch": 0.48865845755022685, + "grad_norm": 0.5905188918113708, + "learning_rate": 9.544188148861703e-06, + "loss": 0.1789, + "step": 1508 + }, + { + "epoch": 0.48898250162022033, + "grad_norm": 0.5530914068222046, + "learning_rate": 9.543458185088892e-06, + "loss": 0.169, + "step": 1509 + }, + { + "epoch": 0.48930654569021387, + "grad_norm": 0.5152124762535095, + "learning_rate": 9.542727665240783e-06, + "loss": 0.1535, + "step": 1510 + }, + { + "epoch": 0.4896305897602074, + "grad_norm": 0.5777947902679443, + "learning_rate": 9.541996589406784e-06, + "loss": 0.1825, + "step": 1511 + }, + { + "epoch": 0.4899546338302009, + "grad_norm": 0.5618809461593628, + "learning_rate": 9.541264957676373e-06, + "loss": 0.1692, + "step": 1512 + }, + { + "epoch": 0.4902786779001944, + "grad_norm": 0.5675665140151978, + "learning_rate": 9.540532770139093e-06, + "loss": 0.1727, + "step": 1513 + }, + { + "epoch": 0.49060272197018795, + "grad_norm": 0.5648574233055115, + "learning_rate": 9.539800026884558e-06, + "loss": 0.17, + "step": 1514 + }, + { + "epoch": 0.4909267660401815, + "grad_norm": 0.5508841872215271, + "learning_rate": 9.53906672800245e-06, + "loss": 0.1628, + "step": 1515 + }, + { + "epoch": 0.49125081011017496, + "grad_norm": 0.614479660987854, + "learning_rate": 9.538332873582515e-06, + "loss": 0.1751, + "step": 1516 + }, + { + "epoch": 0.4915748541801685, + "grad_norm": 0.5082097053527832, + "learning_rate": 9.53759846371457e-06, + "loss": 0.1466, + "step": 1517 + }, + { + "epoch": 0.49189889825016203, + "grad_norm": 0.5089106559753418, + "learning_rate": 9.536863498488502e-06, + "loss": 0.1395, + "step": 1518 + }, + { + "epoch": 0.49222294232015557, + "grad_norm": 0.5683304667472839, + "learning_rate": 9.53612797799426e-06, + "loss": 0.1702, + "step": 1519 + }, + { + "epoch": 0.49254698639014904, + "grad_norm": 0.5439754724502563, + "learning_rate": 9.535391902321868e-06, + "loss": 0.1634, + "step": 1520 + }, + { + "epoch": 0.4928710304601426, + "grad_norm": 0.5556536912918091, + "learning_rate": 9.534655271561415e-06, + "loss": 0.1678, + "step": 1521 + }, + { + "epoch": 0.4931950745301361, + "grad_norm": 0.5609848499298096, + "learning_rate": 9.533918085803053e-06, + "loss": 0.1695, + "step": 1522 + }, + { + "epoch": 0.4935191186001296, + "grad_norm": 0.5703451037406921, + "learning_rate": 9.533180345137009e-06, + "loss": 0.157, + "step": 1523 + }, + { + "epoch": 0.4938431626701231, + "grad_norm": 0.568932056427002, + "learning_rate": 9.532442049653577e-06, + "loss": 0.1619, + "step": 1524 + }, + { + "epoch": 0.49416720674011666, + "grad_norm": 0.5227696299552917, + "learning_rate": 9.531703199443113e-06, + "loss": 0.1626, + "step": 1525 + }, + { + "epoch": 0.4944912508101102, + "grad_norm": 0.5611757636070251, + "learning_rate": 9.53096379459605e-06, + "loss": 0.1662, + "step": 1526 + }, + { + "epoch": 0.4948152948801037, + "grad_norm": 0.5827354788780212, + "learning_rate": 9.53022383520288e-06, + "loss": 0.1674, + "step": 1527 + }, + { + "epoch": 0.4951393389500972, + "grad_norm": 0.5978224277496338, + "learning_rate": 9.52948332135417e-06, + "loss": 0.1508, + "step": 1528 + }, + { + "epoch": 0.49546338302009074, + "grad_norm": 0.5594303011894226, + "learning_rate": 9.52874225314055e-06, + "loss": 0.1821, + "step": 1529 + }, + { + "epoch": 0.4957874270900843, + "grad_norm": 0.5278100967407227, + "learning_rate": 9.52800063065272e-06, + "loss": 0.1539, + "step": 1530 + }, + { + "epoch": 0.49611147116007775, + "grad_norm": 0.5450807213783264, + "learning_rate": 9.527258453981448e-06, + "loss": 0.1571, + "step": 1531 + }, + { + "epoch": 0.4964355152300713, + "grad_norm": 0.5510877370834351, + "learning_rate": 9.526515723217566e-06, + "loss": 0.1718, + "step": 1532 + }, + { + "epoch": 0.4967595593000648, + "grad_norm": 0.6029154658317566, + "learning_rate": 9.525772438451982e-06, + "loss": 0.157, + "step": 1533 + }, + { + "epoch": 0.4970836033700583, + "grad_norm": 0.5283669233322144, + "learning_rate": 9.525028599775662e-06, + "loss": 0.1463, + "step": 1534 + }, + { + "epoch": 0.49740764744005184, + "grad_norm": 0.5479215979576111, + "learning_rate": 9.524284207279648e-06, + "loss": 0.1605, + "step": 1535 + }, + { + "epoch": 0.49773169151004537, + "grad_norm": 0.6442349553108215, + "learning_rate": 9.523539261055046e-06, + "loss": 0.18, + "step": 1536 + }, + { + "epoch": 0.4980557355800389, + "grad_norm": 0.5268247723579407, + "learning_rate": 9.52279376119303e-06, + "loss": 0.1546, + "step": 1537 + }, + { + "epoch": 0.4983797796500324, + "grad_norm": 0.5354442000389099, + "learning_rate": 9.52204770778484e-06, + "loss": 0.1525, + "step": 1538 + }, + { + "epoch": 0.4987038237200259, + "grad_norm": 0.5068316459655762, + "learning_rate": 9.52130110092179e-06, + "loss": 0.149, + "step": 1539 + }, + { + "epoch": 0.49902786779001945, + "grad_norm": 0.5335785746574402, + "learning_rate": 9.520553940695253e-06, + "loss": 0.1541, + "step": 1540 + }, + { + "epoch": 0.499351911860013, + "grad_norm": 0.5971035361289978, + "learning_rate": 9.519806227196676e-06, + "loss": 0.159, + "step": 1541 + }, + { + "epoch": 0.49967595593000647, + "grad_norm": 0.5317016243934631, + "learning_rate": 9.519057960517572e-06, + "loss": 0.1726, + "step": 1542 + }, + { + "epoch": 0.5, + "grad_norm": 0.5037248730659485, + "learning_rate": 9.518309140749521e-06, + "loss": 0.1377, + "step": 1543 + }, + { + "epoch": 0.5003240440699935, + "grad_norm": 0.5816386938095093, + "learning_rate": 9.517559767984175e-06, + "loss": 0.1796, + "step": 1544 + }, + { + "epoch": 0.5006480881399871, + "grad_norm": 0.5142626762390137, + "learning_rate": 9.516809842313244e-06, + "loss": 0.1576, + "step": 1545 + }, + { + "epoch": 0.5009721322099806, + "grad_norm": 0.5820870399475098, + "learning_rate": 9.516059363828513e-06, + "loss": 0.1785, + "step": 1546 + }, + { + "epoch": 0.501296176279974, + "grad_norm": 0.5120531320571899, + "learning_rate": 9.515308332621838e-06, + "loss": 0.1459, + "step": 1547 + }, + { + "epoch": 0.5016202203499676, + "grad_norm": 0.5186384320259094, + "learning_rate": 9.514556748785133e-06, + "loss": 0.1523, + "step": 1548 + }, + { + "epoch": 0.5019442644199611, + "grad_norm": 0.569932222366333, + "learning_rate": 9.513804612410387e-06, + "loss": 0.168, + "step": 1549 + }, + { + "epoch": 0.5022683084899546, + "grad_norm": 0.544551432132721, + "learning_rate": 9.513051923589652e-06, + "loss": 0.1585, + "step": 1550 + }, + { + "epoch": 0.5025923525599482, + "grad_norm": 0.5181910991668701, + "learning_rate": 9.512298682415052e-06, + "loss": 0.1476, + "step": 1551 + }, + { + "epoch": 0.5029163966299417, + "grad_norm": 0.5322809219360352, + "learning_rate": 9.511544888978777e-06, + "loss": 0.1629, + "step": 1552 + }, + { + "epoch": 0.5032404406999352, + "grad_norm": 0.5483154654502869, + "learning_rate": 9.51079054337308e-06, + "loss": 0.1617, + "step": 1553 + }, + { + "epoch": 0.5035644847699287, + "grad_norm": 0.5534182786941528, + "learning_rate": 9.51003564569029e-06, + "loss": 0.1747, + "step": 1554 + }, + { + "epoch": 0.5038885288399222, + "grad_norm": 0.6017401218414307, + "learning_rate": 9.509280196022798e-06, + "loss": 0.1766, + "step": 1555 + }, + { + "epoch": 0.5042125729099157, + "grad_norm": 0.5111786723136902, + "learning_rate": 9.508524194463062e-06, + "loss": 0.1473, + "step": 1556 + }, + { + "epoch": 0.5045366169799093, + "grad_norm": 0.5252443552017212, + "learning_rate": 9.507767641103612e-06, + "loss": 0.153, + "step": 1557 + }, + { + "epoch": 0.5048606610499028, + "grad_norm": 0.5776534676551819, + "learning_rate": 9.50701053603704e-06, + "loss": 0.1591, + "step": 1558 + }, + { + "epoch": 0.5051847051198963, + "grad_norm": 0.5355959534645081, + "learning_rate": 9.50625287935601e-06, + "loss": 0.1664, + "step": 1559 + }, + { + "epoch": 0.5055087491898899, + "grad_norm": 0.5488954782485962, + "learning_rate": 9.505494671153252e-06, + "loss": 0.1632, + "step": 1560 + }, + { + "epoch": 0.5058327932598834, + "grad_norm": 0.5629811882972717, + "learning_rate": 9.504735911521562e-06, + "loss": 0.1622, + "step": 1561 + }, + { + "epoch": 0.5061568373298768, + "grad_norm": 0.5401865243911743, + "learning_rate": 9.503976600553805e-06, + "loss": 0.1624, + "step": 1562 + }, + { + "epoch": 0.5064808813998704, + "grad_norm": 0.5585532784461975, + "learning_rate": 9.503216738342916e-06, + "loss": 0.1675, + "step": 1563 + }, + { + "epoch": 0.5068049254698639, + "grad_norm": 0.5027323365211487, + "learning_rate": 9.502456324981892e-06, + "loss": 0.1515, + "step": 1564 + }, + { + "epoch": 0.5071289695398574, + "grad_norm": 0.5550998449325562, + "learning_rate": 9.501695360563801e-06, + "loss": 0.1596, + "step": 1565 + }, + { + "epoch": 0.507453013609851, + "grad_norm": 0.5721212029457092, + "learning_rate": 9.500933845181776e-06, + "loss": 0.1602, + "step": 1566 + }, + { + "epoch": 0.5077770576798445, + "grad_norm": 0.5230451226234436, + "learning_rate": 9.500171778929023e-06, + "loss": 0.1496, + "step": 1567 + }, + { + "epoch": 0.508101101749838, + "grad_norm": 0.5238634943962097, + "learning_rate": 9.499409161898808e-06, + "loss": 0.1646, + "step": 1568 + }, + { + "epoch": 0.5084251458198314, + "grad_norm": 0.5461115837097168, + "learning_rate": 9.49864599418447e-06, + "loss": 0.1654, + "step": 1569 + }, + { + "epoch": 0.508749189889825, + "grad_norm": 0.5428388714790344, + "learning_rate": 9.497882275879412e-06, + "loss": 0.1532, + "step": 1570 + }, + { + "epoch": 0.5090732339598185, + "grad_norm": 0.6161945462226868, + "learning_rate": 9.497118007077106e-06, + "loss": 0.1675, + "step": 1571 + }, + { + "epoch": 0.509397278029812, + "grad_norm": 0.541619598865509, + "learning_rate": 9.496353187871092e-06, + "loss": 0.1468, + "step": 1572 + }, + { + "epoch": 0.5097213220998056, + "grad_norm": 0.5606821179389954, + "learning_rate": 9.495587818354977e-06, + "loss": 0.1734, + "step": 1573 + }, + { + "epoch": 0.5100453661697991, + "grad_norm": 0.5038211941719055, + "learning_rate": 9.494821898622431e-06, + "loss": 0.1453, + "step": 1574 + }, + { + "epoch": 0.5103694102397927, + "grad_norm": 0.5432626605033875, + "learning_rate": 9.4940554287672e-06, + "loss": 0.1581, + "step": 1575 + }, + { + "epoch": 0.5106934543097861, + "grad_norm": 0.5432832837104797, + "learning_rate": 9.49328840888309e-06, + "loss": 0.1696, + "step": 1576 + }, + { + "epoch": 0.5110174983797796, + "grad_norm": 0.5713906288146973, + "learning_rate": 9.492520839063977e-06, + "loss": 0.1759, + "step": 1577 + }, + { + "epoch": 0.5113415424497731, + "grad_norm": 0.5782821178436279, + "learning_rate": 9.491752719403805e-06, + "loss": 0.1735, + "step": 1578 + }, + { + "epoch": 0.5116655865197667, + "grad_norm": 0.5207881331443787, + "learning_rate": 9.490984049996581e-06, + "loss": 0.1621, + "step": 1579 + }, + { + "epoch": 0.5119896305897602, + "grad_norm": 0.5409039258956909, + "learning_rate": 9.490214830936388e-06, + "loss": 0.1696, + "step": 1580 + }, + { + "epoch": 0.5123136746597537, + "grad_norm": 0.5568174719810486, + "learning_rate": 9.489445062317367e-06, + "loss": 0.1652, + "step": 1581 + }, + { + "epoch": 0.5126377187297473, + "grad_norm": 0.5258395671844482, + "learning_rate": 9.488674744233732e-06, + "loss": 0.1556, + "step": 1582 + }, + { + "epoch": 0.5129617627997408, + "grad_norm": 0.5342602729797363, + "learning_rate": 9.487903876779763e-06, + "loss": 0.1605, + "step": 1583 + }, + { + "epoch": 0.5132858068697342, + "grad_norm": 0.5090218782424927, + "learning_rate": 9.487132460049804e-06, + "loss": 0.1454, + "step": 1584 + }, + { + "epoch": 0.5136098509397278, + "grad_norm": 0.5238171815872192, + "learning_rate": 9.486360494138271e-06, + "loss": 0.1492, + "step": 1585 + }, + { + "epoch": 0.5139338950097213, + "grad_norm": 0.5661818981170654, + "learning_rate": 9.485587979139647e-06, + "loss": 0.1644, + "step": 1586 + }, + { + "epoch": 0.5142579390797148, + "grad_norm": 0.5241592526435852, + "learning_rate": 9.484814915148477e-06, + "loss": 0.1532, + "step": 1587 + }, + { + "epoch": 0.5145819831497084, + "grad_norm": 0.5554935336112976, + "learning_rate": 9.484041302259377e-06, + "loss": 0.1644, + "step": 1588 + }, + { + "epoch": 0.5149060272197019, + "grad_norm": 0.4919261038303375, + "learning_rate": 9.483267140567031e-06, + "loss": 0.1473, + "step": 1589 + }, + { + "epoch": 0.5152300712896954, + "grad_norm": 0.5082678198814392, + "learning_rate": 9.482492430166188e-06, + "loss": 0.1583, + "step": 1590 + }, + { + "epoch": 0.5155541153596889, + "grad_norm": 0.5806257724761963, + "learning_rate": 9.481717171151665e-06, + "loss": 0.1757, + "step": 1591 + }, + { + "epoch": 0.5158781594296824, + "grad_norm": 0.5450208187103271, + "learning_rate": 9.480941363618347e-06, + "loss": 0.1635, + "step": 1592 + }, + { + "epoch": 0.5162022034996759, + "grad_norm": 0.5514866709709167, + "learning_rate": 9.480165007661186e-06, + "loss": 0.1678, + "step": 1593 + }, + { + "epoch": 0.5165262475696695, + "grad_norm": 0.49130842089653015, + "learning_rate": 9.479388103375199e-06, + "loss": 0.1282, + "step": 1594 + }, + { + "epoch": 0.516850291639663, + "grad_norm": 0.5273051857948303, + "learning_rate": 9.478610650855472e-06, + "loss": 0.1648, + "step": 1595 + }, + { + "epoch": 0.5171743357096565, + "grad_norm": 0.4903629422187805, + "learning_rate": 9.477832650197158e-06, + "loss": 0.1459, + "step": 1596 + }, + { + "epoch": 0.5174983797796501, + "grad_norm": 0.56658536195755, + "learning_rate": 9.477054101495476e-06, + "loss": 0.1604, + "step": 1597 + }, + { + "epoch": 0.5178224238496435, + "grad_norm": 0.5279373526573181, + "learning_rate": 9.476275004845712e-06, + "loss": 0.1595, + "step": 1598 + }, + { + "epoch": 0.518146467919637, + "grad_norm": 0.5263311266899109, + "learning_rate": 9.475495360343222e-06, + "loss": 0.1467, + "step": 1599 + }, + { + "epoch": 0.5184705119896306, + "grad_norm": 0.5892171263694763, + "learning_rate": 9.474715168083426e-06, + "loss": 0.171, + "step": 1600 + }, + { + "epoch": 0.5187945560596241, + "grad_norm": 0.5727109313011169, + "learning_rate": 9.473934428161813e-06, + "loss": 0.177, + "step": 1601 + }, + { + "epoch": 0.5191186001296176, + "grad_norm": 0.5030838251113892, + "learning_rate": 9.473153140673937e-06, + "loss": 0.1477, + "step": 1602 + }, + { + "epoch": 0.5194426441996112, + "grad_norm": 0.540828287601471, + "learning_rate": 9.472371305715417e-06, + "loss": 0.1764, + "step": 1603 + }, + { + "epoch": 0.5197666882696047, + "grad_norm": 0.5530744791030884, + "learning_rate": 9.471588923381949e-06, + "loss": 0.1512, + "step": 1604 + }, + { + "epoch": 0.5200907323395982, + "grad_norm": 0.5362316370010376, + "learning_rate": 9.470805993769284e-06, + "loss": 0.1593, + "step": 1605 + }, + { + "epoch": 0.5204147764095917, + "grad_norm": 0.5342429280281067, + "learning_rate": 9.470022516973245e-06, + "loss": 0.1606, + "step": 1606 + }, + { + "epoch": 0.5207388204795852, + "grad_norm": 0.49285590648651123, + "learning_rate": 9.469238493089723e-06, + "loss": 0.1602, + "step": 1607 + }, + { + "epoch": 0.5210628645495787, + "grad_norm": 0.5392868518829346, + "learning_rate": 9.468453922214676e-06, + "loss": 0.1704, + "step": 1608 + }, + { + "epoch": 0.5213869086195723, + "grad_norm": 0.5489588975906372, + "learning_rate": 9.467668804444127e-06, + "loss": 0.1605, + "step": 1609 + }, + { + "epoch": 0.5217109526895658, + "grad_norm": 0.5366129875183105, + "learning_rate": 9.466883139874165e-06, + "loss": 0.1594, + "step": 1610 + }, + { + "epoch": 0.5220349967595593, + "grad_norm": 0.5764620304107666, + "learning_rate": 9.466096928600953e-06, + "loss": 0.1809, + "step": 1611 + }, + { + "epoch": 0.5223590408295529, + "grad_norm": 0.4988090693950653, + "learning_rate": 9.46531017072071e-06, + "loss": 0.1449, + "step": 1612 + }, + { + "epoch": 0.5226830848995463, + "grad_norm": 0.5809919834136963, + "learning_rate": 9.464522866329729e-06, + "loss": 0.1798, + "step": 1613 + }, + { + "epoch": 0.5230071289695398, + "grad_norm": 0.5060127973556519, + "learning_rate": 9.463735015524369e-06, + "loss": 0.1525, + "step": 1614 + }, + { + "epoch": 0.5233311730395334, + "grad_norm": 0.5147068500518799, + "learning_rate": 9.462946618401057e-06, + "loss": 0.1512, + "step": 1615 + }, + { + "epoch": 0.5236552171095269, + "grad_norm": 0.5227774381637573, + "learning_rate": 9.462157675056282e-06, + "loss": 0.1486, + "step": 1616 + }, + { + "epoch": 0.5239792611795204, + "grad_norm": 0.575864851474762, + "learning_rate": 9.461368185586604e-06, + "loss": 0.1647, + "step": 1617 + }, + { + "epoch": 0.524303305249514, + "grad_norm": 0.5135976672172546, + "learning_rate": 9.460578150088652e-06, + "loss": 0.1527, + "step": 1618 + }, + { + "epoch": 0.5246273493195075, + "grad_norm": 0.5435746908187866, + "learning_rate": 9.459787568659115e-06, + "loss": 0.1675, + "step": 1619 + }, + { + "epoch": 0.5249513933895009, + "grad_norm": 0.511205792427063, + "learning_rate": 9.458996441394753e-06, + "loss": 0.1493, + "step": 1620 + }, + { + "epoch": 0.5252754374594945, + "grad_norm": 0.5185942053794861, + "learning_rate": 9.458204768392394e-06, + "loss": 0.1618, + "step": 1621 + }, + { + "epoch": 0.525599481529488, + "grad_norm": 0.5057900547981262, + "learning_rate": 9.45741254974893e-06, + "loss": 0.1502, + "step": 1622 + }, + { + "epoch": 0.5259235255994815, + "grad_norm": 0.5402381420135498, + "learning_rate": 9.45661978556132e-06, + "loss": 0.1623, + "step": 1623 + }, + { + "epoch": 0.5262475696694751, + "grad_norm": 0.5072924494743347, + "learning_rate": 9.455826475926593e-06, + "loss": 0.1693, + "step": 1624 + }, + { + "epoch": 0.5265716137394686, + "grad_norm": 0.5598753690719604, + "learning_rate": 9.45503262094184e-06, + "loss": 0.1722, + "step": 1625 + }, + { + "epoch": 0.5268956578094621, + "grad_norm": 0.5614151954650879, + "learning_rate": 9.454238220704223e-06, + "loss": 0.1871, + "step": 1626 + }, + { + "epoch": 0.5272197018794557, + "grad_norm": 0.5165886282920837, + "learning_rate": 9.453443275310967e-06, + "loss": 0.1554, + "step": 1627 + }, + { + "epoch": 0.5275437459494491, + "grad_norm": 0.5177538394927979, + "learning_rate": 9.452647784859367e-06, + "loss": 0.1633, + "step": 1628 + }, + { + "epoch": 0.5278677900194426, + "grad_norm": 0.518203616142273, + "learning_rate": 9.451851749446786e-06, + "loss": 0.1581, + "step": 1629 + }, + { + "epoch": 0.5281918340894362, + "grad_norm": 0.5033408999443054, + "learning_rate": 9.451055169170644e-06, + "loss": 0.1522, + "step": 1630 + }, + { + "epoch": 0.5285158781594297, + "grad_norm": 0.5274224281311035, + "learning_rate": 9.450258044128441e-06, + "loss": 0.1575, + "step": 1631 + }, + { + "epoch": 0.5288399222294232, + "grad_norm": 0.4896572530269623, + "learning_rate": 9.449460374417737e-06, + "loss": 0.1375, + "step": 1632 + }, + { + "epoch": 0.5291639662994168, + "grad_norm": 0.5398424863815308, + "learning_rate": 9.448662160136154e-06, + "loss": 0.1622, + "step": 1633 + }, + { + "epoch": 0.5294880103694103, + "grad_norm": 0.5725162625312805, + "learning_rate": 9.44786340138139e-06, + "loss": 0.1748, + "step": 1634 + }, + { + "epoch": 0.5298120544394037, + "grad_norm": 0.5098135471343994, + "learning_rate": 9.447064098251205e-06, + "loss": 0.1544, + "step": 1635 + }, + { + "epoch": 0.5301360985093972, + "grad_norm": 0.5335975289344788, + "learning_rate": 9.446264250843425e-06, + "loss": 0.1484, + "step": 1636 + }, + { + "epoch": 0.5304601425793908, + "grad_norm": 0.5592305660247803, + "learning_rate": 9.445463859255943e-06, + "loss": 0.1675, + "step": 1637 + }, + { + "epoch": 0.5307841866493843, + "grad_norm": 0.5780206918716431, + "learning_rate": 9.444662923586722e-06, + "loss": 0.1668, + "step": 1638 + }, + { + "epoch": 0.5311082307193778, + "grad_norm": 0.5241395831108093, + "learning_rate": 9.443861443933786e-06, + "loss": 0.1527, + "step": 1639 + }, + { + "epoch": 0.5314322747893714, + "grad_norm": 0.5185185670852661, + "learning_rate": 9.443059420395229e-06, + "loss": 0.1421, + "step": 1640 + }, + { + "epoch": 0.5317563188593649, + "grad_norm": 0.5295687913894653, + "learning_rate": 9.44225685306921e-06, + "loss": 0.1593, + "step": 1641 + }, + { + "epoch": 0.5320803629293583, + "grad_norm": 0.5327252745628357, + "learning_rate": 9.441453742053956e-06, + "loss": 0.1614, + "step": 1642 + }, + { + "epoch": 0.5324044069993519, + "grad_norm": 0.5810564756393433, + "learning_rate": 9.440650087447762e-06, + "loss": 0.171, + "step": 1643 + }, + { + "epoch": 0.5327284510693454, + "grad_norm": 0.5510779619216919, + "learning_rate": 9.439845889348987e-06, + "loss": 0.1645, + "step": 1644 + }, + { + "epoch": 0.5330524951393389, + "grad_norm": 0.5120063424110413, + "learning_rate": 9.439041147856056e-06, + "loss": 0.152, + "step": 1645 + }, + { + "epoch": 0.5333765392093325, + "grad_norm": 0.577538251876831, + "learning_rate": 9.43823586306746e-06, + "loss": 0.1759, + "step": 1646 + }, + { + "epoch": 0.533700583279326, + "grad_norm": 0.535686731338501, + "learning_rate": 9.437430035081761e-06, + "loss": 0.1642, + "step": 1647 + }, + { + "epoch": 0.5340246273493195, + "grad_norm": 0.5769588947296143, + "learning_rate": 9.436623663997584e-06, + "loss": 0.1751, + "step": 1648 + }, + { + "epoch": 0.5343486714193131, + "grad_norm": 0.5599579215049744, + "learning_rate": 9.43581674991362e-06, + "loss": 0.1736, + "step": 1649 + }, + { + "epoch": 0.5346727154893065, + "grad_norm": 0.514129102230072, + "learning_rate": 9.435009292928628e-06, + "loss": 0.1574, + "step": 1650 + }, + { + "epoch": 0.5349967595593, + "grad_norm": 0.52392578125, + "learning_rate": 9.434201293141431e-06, + "loss": 0.1584, + "step": 1651 + }, + { + "epoch": 0.5353208036292936, + "grad_norm": 0.511712372303009, + "learning_rate": 9.433392750650923e-06, + "loss": 0.1502, + "step": 1652 + }, + { + "epoch": 0.5356448476992871, + "grad_norm": 0.5033168196678162, + "learning_rate": 9.432583665556062e-06, + "loss": 0.14, + "step": 1653 + }, + { + "epoch": 0.5359688917692806, + "grad_norm": 0.5233294367790222, + "learning_rate": 9.43177403795587e-06, + "loss": 0.1607, + "step": 1654 + }, + { + "epoch": 0.5362929358392742, + "grad_norm": 0.48273828625679016, + "learning_rate": 9.430963867949439e-06, + "loss": 0.1498, + "step": 1655 + }, + { + "epoch": 0.5366169799092677, + "grad_norm": 0.5318136811256409, + "learning_rate": 9.430153155635926e-06, + "loss": 0.1538, + "step": 1656 + }, + { + "epoch": 0.5369410239792611, + "grad_norm": 0.5394309759140015, + "learning_rate": 9.429341901114553e-06, + "loss": 0.1619, + "step": 1657 + }, + { + "epoch": 0.5372650680492547, + "grad_norm": 0.5881305932998657, + "learning_rate": 9.428530104484612e-06, + "loss": 0.1886, + "step": 1658 + }, + { + "epoch": 0.5375891121192482, + "grad_norm": 0.5513939261436462, + "learning_rate": 9.427717765845457e-06, + "loss": 0.1633, + "step": 1659 + }, + { + "epoch": 0.5379131561892417, + "grad_norm": 0.5329276919364929, + "learning_rate": 9.42690488529651e-06, + "loss": 0.1579, + "step": 1660 + }, + { + "epoch": 0.5382372002592353, + "grad_norm": 0.5297683477401733, + "learning_rate": 9.426091462937263e-06, + "loss": 0.1556, + "step": 1661 + }, + { + "epoch": 0.5385612443292288, + "grad_norm": 0.5444457530975342, + "learning_rate": 9.425277498867267e-06, + "loss": 0.1704, + "step": 1662 + }, + { + "epoch": 0.5388852883992223, + "grad_norm": 0.5043376088142395, + "learning_rate": 9.424462993186145e-06, + "loss": 0.1413, + "step": 1663 + }, + { + "epoch": 0.5392093324692158, + "grad_norm": 0.5424678921699524, + "learning_rate": 9.423647945993586e-06, + "loss": 0.1598, + "step": 1664 + }, + { + "epoch": 0.5395333765392093, + "grad_norm": 0.5627645254135132, + "learning_rate": 9.422832357389341e-06, + "loss": 0.183, + "step": 1665 + }, + { + "epoch": 0.5398574206092028, + "grad_norm": 0.5190801620483398, + "learning_rate": 9.422016227473233e-06, + "loss": 0.1809, + "step": 1666 + }, + { + "epoch": 0.5401814646791964, + "grad_norm": 0.5414610505104065, + "learning_rate": 9.42119955634515e-06, + "loss": 0.157, + "step": 1667 + }, + { + "epoch": 0.5405055087491899, + "grad_norm": 0.5154577493667603, + "learning_rate": 9.420382344105037e-06, + "loss": 0.1565, + "step": 1668 + }, + { + "epoch": 0.5408295528191834, + "grad_norm": 0.5236411690711975, + "learning_rate": 9.41956459085292e-06, + "loss": 0.1492, + "step": 1669 + }, + { + "epoch": 0.541153596889177, + "grad_norm": 0.5545842051506042, + "learning_rate": 9.418746296688881e-06, + "loss": 0.1688, + "step": 1670 + }, + { + "epoch": 0.5414776409591704, + "grad_norm": 0.5425675511360168, + "learning_rate": 9.417927461713073e-06, + "loss": 0.1587, + "step": 1671 + }, + { + "epoch": 0.5418016850291639, + "grad_norm": 0.5389112830162048, + "learning_rate": 9.417108086025713e-06, + "loss": 0.1582, + "step": 1672 + }, + { + "epoch": 0.5421257290991575, + "grad_norm": 0.5404740571975708, + "learning_rate": 9.416288169727082e-06, + "loss": 0.1605, + "step": 1673 + }, + { + "epoch": 0.542449773169151, + "grad_norm": 0.544076144695282, + "learning_rate": 9.415467712917535e-06, + "loss": 0.1533, + "step": 1674 + }, + { + "epoch": 0.5427738172391445, + "grad_norm": 0.5358872413635254, + "learning_rate": 9.414646715697482e-06, + "loss": 0.1814, + "step": 1675 + }, + { + "epoch": 0.5430978613091381, + "grad_norm": 0.5488872528076172, + "learning_rate": 9.413825178167408e-06, + "loss": 0.1644, + "step": 1676 + }, + { + "epoch": 0.5434219053791316, + "grad_norm": 0.5466941595077515, + "learning_rate": 9.413003100427864e-06, + "loss": 0.1736, + "step": 1677 + }, + { + "epoch": 0.5437459494491251, + "grad_norm": 0.531701385974884, + "learning_rate": 9.41218048257946e-06, + "loss": 0.1549, + "step": 1678 + }, + { + "epoch": 0.5440699935191186, + "grad_norm": 0.561321496963501, + "learning_rate": 9.411357324722879e-06, + "loss": 0.1757, + "step": 1679 + }, + { + "epoch": 0.5443940375891121, + "grad_norm": 0.5582754015922546, + "learning_rate": 9.410533626958867e-06, + "loss": 0.1611, + "step": 1680 + }, + { + "epoch": 0.5447180816591056, + "grad_norm": 0.5068044066429138, + "learning_rate": 9.409709389388234e-06, + "loss": 0.1458, + "step": 1681 + }, + { + "epoch": 0.5450421257290992, + "grad_norm": 0.5205656290054321, + "learning_rate": 9.408884612111865e-06, + "loss": 0.1569, + "step": 1682 + }, + { + "epoch": 0.5453661697990927, + "grad_norm": 0.5100316405296326, + "learning_rate": 9.408059295230696e-06, + "loss": 0.1525, + "step": 1683 + }, + { + "epoch": 0.5456902138690862, + "grad_norm": 0.5226548910140991, + "learning_rate": 9.407233438845746e-06, + "loss": 0.1601, + "step": 1684 + }, + { + "epoch": 0.5460142579390798, + "grad_norm": 0.5404941439628601, + "learning_rate": 9.406407043058087e-06, + "loss": 0.168, + "step": 1685 + }, + { + "epoch": 0.5463383020090732, + "grad_norm": 0.547059953212738, + "learning_rate": 9.405580107968864e-06, + "loss": 0.157, + "step": 1686 + }, + { + "epoch": 0.5466623460790667, + "grad_norm": 0.5400663614273071, + "learning_rate": 9.404752633679284e-06, + "loss": 0.152, + "step": 1687 + }, + { + "epoch": 0.5469863901490603, + "grad_norm": 0.5538695454597473, + "learning_rate": 9.403924620290624e-06, + "loss": 0.1645, + "step": 1688 + }, + { + "epoch": 0.5473104342190538, + "grad_norm": 0.5280716419219971, + "learning_rate": 9.403096067904223e-06, + "loss": 0.1593, + "step": 1689 + }, + { + "epoch": 0.5476344782890473, + "grad_norm": 0.5219088792800903, + "learning_rate": 9.402266976621489e-06, + "loss": 0.1514, + "step": 1690 + }, + { + "epoch": 0.5479585223590409, + "grad_norm": 0.5405663251876831, + "learning_rate": 9.401437346543893e-06, + "loss": 0.1651, + "step": 1691 + }, + { + "epoch": 0.5482825664290344, + "grad_norm": 0.48504170775413513, + "learning_rate": 9.400607177772978e-06, + "loss": 0.1427, + "step": 1692 + }, + { + "epoch": 0.5486066104990278, + "grad_norm": 0.5304465293884277, + "learning_rate": 9.399776470410344e-06, + "loss": 0.1542, + "step": 1693 + }, + { + "epoch": 0.5489306545690213, + "grad_norm": 0.551505446434021, + "learning_rate": 9.398945224557662e-06, + "loss": 0.1607, + "step": 1694 + }, + { + "epoch": 0.5492546986390149, + "grad_norm": 0.5069938898086548, + "learning_rate": 9.398113440316672e-06, + "loss": 0.1725, + "step": 1695 + }, + { + "epoch": 0.5495787427090084, + "grad_norm": 0.5524683594703674, + "learning_rate": 9.397281117789173e-06, + "loss": 0.1627, + "step": 1696 + }, + { + "epoch": 0.549902786779002, + "grad_norm": 0.5232834815979004, + "learning_rate": 9.396448257077034e-06, + "loss": 0.1612, + "step": 1697 + }, + { + "epoch": 0.5502268308489955, + "grad_norm": 0.5372471213340759, + "learning_rate": 9.395614858282187e-06, + "loss": 0.159, + "step": 1698 + }, + { + "epoch": 0.550550874918989, + "grad_norm": 0.544275164604187, + "learning_rate": 9.394780921506636e-06, + "loss": 0.1579, + "step": 1699 + }, + { + "epoch": 0.5508749189889826, + "grad_norm": 0.5532092452049255, + "learning_rate": 9.393946446852447e-06, + "loss": 0.1756, + "step": 1700 + }, + { + "epoch": 0.551198963058976, + "grad_norm": 0.4750637710094452, + "learning_rate": 9.393111434421747e-06, + "loss": 0.1335, + "step": 1701 + }, + { + "epoch": 0.5515230071289695, + "grad_norm": 0.517975389957428, + "learning_rate": 9.392275884316737e-06, + "loss": 0.1468, + "step": 1702 + }, + { + "epoch": 0.551847051198963, + "grad_norm": 0.5643892288208008, + "learning_rate": 9.391439796639679e-06, + "loss": 0.1596, + "step": 1703 + }, + { + "epoch": 0.5521710952689566, + "grad_norm": 0.5320152640342712, + "learning_rate": 9.390603171492902e-06, + "loss": 0.152, + "step": 1704 + }, + { + "epoch": 0.5524951393389501, + "grad_norm": 0.5514765381813049, + "learning_rate": 9.389766008978803e-06, + "loss": 0.167, + "step": 1705 + }, + { + "epoch": 0.5528191834089436, + "grad_norm": 0.5518473982810974, + "learning_rate": 9.388928309199839e-06, + "loss": 0.167, + "step": 1706 + }, + { + "epoch": 0.5531432274789372, + "grad_norm": 0.5376887321472168, + "learning_rate": 9.388090072258538e-06, + "loss": 0.1582, + "step": 1707 + }, + { + "epoch": 0.5534672715489306, + "grad_norm": 0.5329182147979736, + "learning_rate": 9.387251298257492e-06, + "loss": 0.1641, + "step": 1708 + }, + { + "epoch": 0.5537913156189241, + "grad_norm": 0.5687870383262634, + "learning_rate": 9.38641198729936e-06, + "loss": 0.1604, + "step": 1709 + }, + { + "epoch": 0.5541153596889177, + "grad_norm": 0.5174402594566345, + "learning_rate": 9.385572139486864e-06, + "loss": 0.1685, + "step": 1710 + }, + { + "epoch": 0.5544394037589112, + "grad_norm": 0.5301437973976135, + "learning_rate": 9.384731754922793e-06, + "loss": 0.1549, + "step": 1711 + }, + { + "epoch": 0.5547634478289047, + "grad_norm": 0.5403845310211182, + "learning_rate": 9.383890833710004e-06, + "loss": 0.1713, + "step": 1712 + }, + { + "epoch": 0.5550874918988983, + "grad_norm": 0.5424365401268005, + "learning_rate": 9.383049375951417e-06, + "loss": 0.1661, + "step": 1713 + }, + { + "epoch": 0.5554115359688918, + "grad_norm": 0.5190292596817017, + "learning_rate": 9.382207381750015e-06, + "loss": 0.159, + "step": 1714 + }, + { + "epoch": 0.5557355800388852, + "grad_norm": 0.5062054991722107, + "learning_rate": 9.381364851208855e-06, + "loss": 0.1513, + "step": 1715 + }, + { + "epoch": 0.5560596241088788, + "grad_norm": 0.535383403301239, + "learning_rate": 9.38052178443105e-06, + "loss": 0.1645, + "step": 1716 + }, + { + "epoch": 0.5563836681788723, + "grad_norm": 0.534705638885498, + "learning_rate": 9.379678181519787e-06, + "loss": 0.1628, + "step": 1717 + }, + { + "epoch": 0.5567077122488658, + "grad_norm": 0.524269700050354, + "learning_rate": 9.378834042578314e-06, + "loss": 0.165, + "step": 1718 + }, + { + "epoch": 0.5570317563188594, + "grad_norm": 0.5700576901435852, + "learning_rate": 9.37798936770994e-06, + "loss": 0.1719, + "step": 1719 + }, + { + "epoch": 0.5573558003888529, + "grad_norm": 0.5169806480407715, + "learning_rate": 9.377144157018054e-06, + "loss": 0.1573, + "step": 1720 + }, + { + "epoch": 0.5576798444588464, + "grad_norm": 0.5423029661178589, + "learning_rate": 9.376298410606096e-06, + "loss": 0.1578, + "step": 1721 + }, + { + "epoch": 0.55800388852884, + "grad_norm": 0.5279255509376526, + "learning_rate": 9.375452128577578e-06, + "loss": 0.1539, + "step": 1722 + }, + { + "epoch": 0.5583279325988334, + "grad_norm": 0.5555424094200134, + "learning_rate": 9.374605311036077e-06, + "loss": 0.1632, + "step": 1723 + }, + { + "epoch": 0.5586519766688269, + "grad_norm": 0.4893963634967804, + "learning_rate": 9.373757958085237e-06, + "loss": 0.1394, + "step": 1724 + }, + { + "epoch": 0.5589760207388205, + "grad_norm": 0.5070914626121521, + "learning_rate": 9.372910069828763e-06, + "loss": 0.1571, + "step": 1725 + }, + { + "epoch": 0.559300064808814, + "grad_norm": 0.5325096249580383, + "learning_rate": 9.37206164637043e-06, + "loss": 0.1566, + "step": 1726 + }, + { + "epoch": 0.5596241088788075, + "grad_norm": 0.5089950561523438, + "learning_rate": 9.371212687814076e-06, + "loss": 0.1474, + "step": 1727 + }, + { + "epoch": 0.5599481529488011, + "grad_norm": 0.5487457513809204, + "learning_rate": 9.370363194263604e-06, + "loss": 0.1818, + "step": 1728 + }, + { + "epoch": 0.5602721970187946, + "grad_norm": 0.5325713753700256, + "learning_rate": 9.369513165822987e-06, + "loss": 0.1654, + "step": 1729 + }, + { + "epoch": 0.560596241088788, + "grad_norm": 0.5347417593002319, + "learning_rate": 9.368662602596259e-06, + "loss": 0.1665, + "step": 1730 + }, + { + "epoch": 0.5609202851587816, + "grad_norm": 0.5145535469055176, + "learning_rate": 9.367811504687521e-06, + "loss": 0.159, + "step": 1731 + }, + { + "epoch": 0.5612443292287751, + "grad_norm": 0.5164404511451721, + "learning_rate": 9.366959872200935e-06, + "loss": 0.1641, + "step": 1732 + }, + { + "epoch": 0.5615683732987686, + "grad_norm": 0.49573078751564026, + "learning_rate": 9.36610770524074e-06, + "loss": 0.1671, + "step": 1733 + }, + { + "epoch": 0.5618924173687622, + "grad_norm": 0.5579914450645447, + "learning_rate": 9.365255003911227e-06, + "loss": 0.1592, + "step": 1734 + }, + { + "epoch": 0.5622164614387557, + "grad_norm": 0.5098734498023987, + "learning_rate": 9.364401768316762e-06, + "loss": 0.1494, + "step": 1735 + }, + { + "epoch": 0.5625405055087492, + "grad_norm": 0.5225284099578857, + "learning_rate": 9.363547998561771e-06, + "loss": 0.1491, + "step": 1736 + }, + { + "epoch": 0.5628645495787427, + "grad_norm": 0.5344732999801636, + "learning_rate": 9.362693694750747e-06, + "loss": 0.1707, + "step": 1737 + }, + { + "epoch": 0.5631885936487362, + "grad_norm": 0.5236510038375854, + "learning_rate": 9.361838856988247e-06, + "loss": 0.1617, + "step": 1738 + }, + { + "epoch": 0.5635126377187297, + "grad_norm": 0.5190684199333191, + "learning_rate": 9.360983485378899e-06, + "loss": 0.1692, + "step": 1739 + }, + { + "epoch": 0.5638366817887233, + "grad_norm": 0.5158076286315918, + "learning_rate": 9.360127580027389e-06, + "loss": 0.1555, + "step": 1740 + }, + { + "epoch": 0.5641607258587168, + "grad_norm": 0.4986492395401001, + "learning_rate": 9.359271141038473e-06, + "loss": 0.14, + "step": 1741 + }, + { + "epoch": 0.5644847699287103, + "grad_norm": 0.6440570950508118, + "learning_rate": 9.358414168516971e-06, + "loss": 0.1608, + "step": 1742 + }, + { + "epoch": 0.5648088139987039, + "grad_norm": 0.5538783073425293, + "learning_rate": 9.357556662567767e-06, + "loss": 0.1755, + "step": 1743 + }, + { + "epoch": 0.5651328580686974, + "grad_norm": 0.5434886813163757, + "learning_rate": 9.35669862329581e-06, + "loss": 0.1706, + "step": 1744 + }, + { + "epoch": 0.5654569021386908, + "grad_norm": 0.5213068723678589, + "learning_rate": 9.35584005080612e-06, + "loss": 0.1529, + "step": 1745 + }, + { + "epoch": 0.5657809462086844, + "grad_norm": 0.5256164073944092, + "learning_rate": 9.354980945203776e-06, + "loss": 0.1594, + "step": 1746 + }, + { + "epoch": 0.5661049902786779, + "grad_norm": 0.5316346287727356, + "learning_rate": 9.354121306593922e-06, + "loss": 0.1512, + "step": 1747 + }, + { + "epoch": 0.5664290343486714, + "grad_norm": 0.5344158411026001, + "learning_rate": 9.353261135081773e-06, + "loss": 0.1616, + "step": 1748 + }, + { + "epoch": 0.566753078418665, + "grad_norm": 0.49767589569091797, + "learning_rate": 9.3524004307726e-06, + "loss": 0.1422, + "step": 1749 + }, + { + "epoch": 0.5670771224886585, + "grad_norm": 0.4954655170440674, + "learning_rate": 9.351539193771753e-06, + "loss": 0.1523, + "step": 1750 + }, + { + "epoch": 0.567401166558652, + "grad_norm": 0.5749108195304871, + "learning_rate": 9.350677424184632e-06, + "loss": 0.1658, + "step": 1751 + }, + { + "epoch": 0.5677252106286454, + "grad_norm": 0.5306864976882935, + "learning_rate": 9.349815122116715e-06, + "loss": 0.166, + "step": 1752 + }, + { + "epoch": 0.568049254698639, + "grad_norm": 0.4998403787612915, + "learning_rate": 9.348952287673536e-06, + "loss": 0.1451, + "step": 1753 + }, + { + "epoch": 0.5683732987686325, + "grad_norm": 0.5465645790100098, + "learning_rate": 9.348088920960695e-06, + "loss": 0.156, + "step": 1754 + }, + { + "epoch": 0.568697342838626, + "grad_norm": 0.550026535987854, + "learning_rate": 9.347225022083866e-06, + "loss": 0.1664, + "step": 1755 + }, + { + "epoch": 0.5690213869086196, + "grad_norm": 0.5608965754508972, + "learning_rate": 9.346360591148778e-06, + "loss": 0.1697, + "step": 1756 + }, + { + "epoch": 0.5693454309786131, + "grad_norm": 0.5068145990371704, + "learning_rate": 9.34549562826123e-06, + "loss": 0.1487, + "step": 1757 + }, + { + "epoch": 0.5696694750486067, + "grad_norm": 0.5745015144348145, + "learning_rate": 9.344630133527084e-06, + "loss": 0.1769, + "step": 1758 + }, + { + "epoch": 0.5699935191186001, + "grad_norm": 0.521942138671875, + "learning_rate": 9.34376410705227e-06, + "loss": 0.1518, + "step": 1759 + }, + { + "epoch": 0.5703175631885936, + "grad_norm": 0.49625447392463684, + "learning_rate": 9.342897548942778e-06, + "loss": 0.15, + "step": 1760 + }, + { + "epoch": 0.5706416072585871, + "grad_norm": 0.5072271227836609, + "learning_rate": 9.34203045930467e-06, + "loss": 0.1461, + "step": 1761 + }, + { + "epoch": 0.5709656513285807, + "grad_norm": 0.5093204379081726, + "learning_rate": 9.341162838244068e-06, + "loss": 0.1686, + "step": 1762 + }, + { + "epoch": 0.5712896953985742, + "grad_norm": 0.5371447205543518, + "learning_rate": 9.34029468586716e-06, + "loss": 0.1585, + "step": 1763 + }, + { + "epoch": 0.5716137394685677, + "grad_norm": 0.5677884817123413, + "learning_rate": 9.3394260022802e-06, + "loss": 0.167, + "step": 1764 + }, + { + "epoch": 0.5719377835385613, + "grad_norm": 0.4986721873283386, + "learning_rate": 9.338556787589505e-06, + "loss": 0.1506, + "step": 1765 + }, + { + "epoch": 0.5722618276085548, + "grad_norm": 0.5552408695220947, + "learning_rate": 9.337687041901461e-06, + "loss": 0.1626, + "step": 1766 + }, + { + "epoch": 0.5725858716785482, + "grad_norm": 0.5374127626419067, + "learning_rate": 9.336816765322514e-06, + "loss": 0.157, + "step": 1767 + }, + { + "epoch": 0.5729099157485418, + "grad_norm": 0.5247153043746948, + "learning_rate": 9.33594595795918e-06, + "loss": 0.149, + "step": 1768 + }, + { + "epoch": 0.5732339598185353, + "grad_norm": 0.5264690518379211, + "learning_rate": 9.335074619918036e-06, + "loss": 0.1611, + "step": 1769 + }, + { + "epoch": 0.5735580038885288, + "grad_norm": 0.5231035947799683, + "learning_rate": 9.334202751305724e-06, + "loss": 0.1603, + "step": 1770 + }, + { + "epoch": 0.5738820479585224, + "grad_norm": 0.5216938257217407, + "learning_rate": 9.333330352228954e-06, + "loss": 0.1588, + "step": 1771 + }, + { + "epoch": 0.5742060920285159, + "grad_norm": 0.5302786231040955, + "learning_rate": 9.332457422794498e-06, + "loss": 0.1582, + "step": 1772 + }, + { + "epoch": 0.5745301360985094, + "grad_norm": 0.521327018737793, + "learning_rate": 9.331583963109196e-06, + "loss": 0.1636, + "step": 1773 + }, + { + "epoch": 0.5748541801685029, + "grad_norm": 0.5170415639877319, + "learning_rate": 9.33070997327995e-06, + "loss": 0.1558, + "step": 1774 + }, + { + "epoch": 0.5751782242384964, + "grad_norm": 0.5299942493438721, + "learning_rate": 9.329835453413729e-06, + "loss": 0.1612, + "step": 1775 + }, + { + "epoch": 0.5755022683084899, + "grad_norm": 0.4877796173095703, + "learning_rate": 9.328960403617561e-06, + "loss": 0.1353, + "step": 1776 + }, + { + "epoch": 0.5758263123784835, + "grad_norm": 0.5184668302536011, + "learning_rate": 9.328084823998551e-06, + "loss": 0.1578, + "step": 1777 + }, + { + "epoch": 0.576150356448477, + "grad_norm": 0.5334933400154114, + "learning_rate": 9.327208714663856e-06, + "loss": 0.1631, + "step": 1778 + }, + { + "epoch": 0.5764744005184705, + "grad_norm": 0.5227888822555542, + "learning_rate": 9.326332075720705e-06, + "loss": 0.1626, + "step": 1779 + }, + { + "epoch": 0.5767984445884641, + "grad_norm": 0.533147394657135, + "learning_rate": 9.32545490727639e-06, + "loss": 0.1652, + "step": 1780 + }, + { + "epoch": 0.5771224886584575, + "grad_norm": 0.49876242876052856, + "learning_rate": 9.324577209438269e-06, + "loss": 0.1443, + "step": 1781 + }, + { + "epoch": 0.577446532728451, + "grad_norm": 0.5057281255722046, + "learning_rate": 9.32369898231376e-06, + "loss": 0.1436, + "step": 1782 + }, + { + "epoch": 0.5777705767984446, + "grad_norm": 0.5506377816200256, + "learning_rate": 9.322820226010354e-06, + "loss": 0.1626, + "step": 1783 + }, + { + "epoch": 0.5780946208684381, + "grad_norm": 0.5420976877212524, + "learning_rate": 9.3219409406356e-06, + "loss": 0.1562, + "step": 1784 + }, + { + "epoch": 0.5784186649384316, + "grad_norm": 0.5418969988822937, + "learning_rate": 9.321061126297115e-06, + "loss": 0.1616, + "step": 1785 + }, + { + "epoch": 0.5787427090084252, + "grad_norm": 0.5089748501777649, + "learning_rate": 9.32018078310258e-06, + "loss": 0.1483, + "step": 1786 + }, + { + "epoch": 0.5790667530784187, + "grad_norm": 0.5087575316429138, + "learning_rate": 9.319299911159738e-06, + "loss": 0.1396, + "step": 1787 + }, + { + "epoch": 0.5793907971484121, + "grad_norm": 0.537186324596405, + "learning_rate": 9.318418510576402e-06, + "loss": 0.1569, + "step": 1788 + }, + { + "epoch": 0.5797148412184057, + "grad_norm": 0.5755051970481873, + "learning_rate": 9.317536581460444e-06, + "loss": 0.1531, + "step": 1789 + }, + { + "epoch": 0.5800388852883992, + "grad_norm": 0.5934356451034546, + "learning_rate": 9.316654123919808e-06, + "loss": 0.1786, + "step": 1790 + }, + { + "epoch": 0.5803629293583927, + "grad_norm": 0.5637712478637695, + "learning_rate": 9.315771138062495e-06, + "loss": 0.1699, + "step": 1791 + }, + { + "epoch": 0.5806869734283863, + "grad_norm": 0.5092173218727112, + "learning_rate": 9.314887623996574e-06, + "loss": 0.1481, + "step": 1792 + }, + { + "epoch": 0.5810110174983798, + "grad_norm": 0.5008086562156677, + "learning_rate": 9.31400358183018e-06, + "loss": 0.1502, + "step": 1793 + }, + { + "epoch": 0.5813350615683733, + "grad_norm": 0.5082639455795288, + "learning_rate": 9.31311901167151e-06, + "loss": 0.1388, + "step": 1794 + }, + { + "epoch": 0.5816591056383669, + "grad_norm": 0.5039761662483215, + "learning_rate": 9.312233913628828e-06, + "loss": 0.153, + "step": 1795 + }, + { + "epoch": 0.5819831497083603, + "grad_norm": 0.4801679849624634, + "learning_rate": 9.311348287810459e-06, + "loss": 0.149, + "step": 1796 + }, + { + "epoch": 0.5823071937783538, + "grad_norm": 0.5526052117347717, + "learning_rate": 9.310462134324797e-06, + "loss": 0.1686, + "step": 1797 + }, + { + "epoch": 0.5826312378483474, + "grad_norm": 0.4989832937717438, + "learning_rate": 9.3095754532803e-06, + "loss": 0.1435, + "step": 1798 + }, + { + "epoch": 0.5829552819183409, + "grad_norm": 0.44254204630851746, + "learning_rate": 9.308688244785485e-06, + "loss": 0.1336, + "step": 1799 + }, + { + "epoch": 0.5832793259883344, + "grad_norm": 0.5272495746612549, + "learning_rate": 9.307800508948941e-06, + "loss": 0.1493, + "step": 1800 + }, + { + "epoch": 0.583603370058328, + "grad_norm": 0.4973255693912506, + "learning_rate": 9.306912245879318e-06, + "loss": 0.1436, + "step": 1801 + }, + { + "epoch": 0.5839274141283215, + "grad_norm": 0.5235045552253723, + "learning_rate": 9.30602345568533e-06, + "loss": 0.1529, + "step": 1802 + }, + { + "epoch": 0.5842514581983149, + "grad_norm": 0.48296064138412476, + "learning_rate": 9.305134138475755e-06, + "loss": 0.1378, + "step": 1803 + }, + { + "epoch": 0.5845755022683085, + "grad_norm": 0.5345264077186584, + "learning_rate": 9.304244294359442e-06, + "loss": 0.1631, + "step": 1804 + }, + { + "epoch": 0.584899546338302, + "grad_norm": 0.5436072945594788, + "learning_rate": 9.303353923445293e-06, + "loss": 0.1476, + "step": 1805 + }, + { + "epoch": 0.5852235904082955, + "grad_norm": 0.5196661353111267, + "learning_rate": 9.302463025842284e-06, + "loss": 0.1645, + "step": 1806 + }, + { + "epoch": 0.5855476344782891, + "grad_norm": 0.541592001914978, + "learning_rate": 9.301571601659452e-06, + "loss": 0.159, + "step": 1807 + }, + { + "epoch": 0.5858716785482826, + "grad_norm": 0.5426565408706665, + "learning_rate": 9.300679651005898e-06, + "loss": 0.1677, + "step": 1808 + }, + { + "epoch": 0.5861957226182761, + "grad_norm": 0.5050917267799377, + "learning_rate": 9.299787173990789e-06, + "loss": 0.1487, + "step": 1809 + }, + { + "epoch": 0.5865197666882696, + "grad_norm": 0.48838865756988525, + "learning_rate": 9.298894170723353e-06, + "loss": 0.1498, + "step": 1810 + }, + { + "epoch": 0.5868438107582631, + "grad_norm": 0.5278536081314087, + "learning_rate": 9.29800064131289e-06, + "loss": 0.1651, + "step": 1811 + }, + { + "epoch": 0.5871678548282566, + "grad_norm": 0.5049905180931091, + "learning_rate": 9.297106585868753e-06, + "loss": 0.1485, + "step": 1812 + }, + { + "epoch": 0.5874918988982502, + "grad_norm": 0.5237796902656555, + "learning_rate": 9.296212004500373e-06, + "loss": 0.1678, + "step": 1813 + }, + { + "epoch": 0.5878159429682437, + "grad_norm": 0.5402454137802124, + "learning_rate": 9.295316897317232e-06, + "loss": 0.1638, + "step": 1814 + }, + { + "epoch": 0.5881399870382372, + "grad_norm": 0.5265448689460754, + "learning_rate": 9.294421264428886e-06, + "loss": 0.1581, + "step": 1815 + }, + { + "epoch": 0.5884640311082308, + "grad_norm": 0.4858977794647217, + "learning_rate": 9.29352510594495e-06, + "loss": 0.1521, + "step": 1816 + }, + { + "epoch": 0.5887880751782243, + "grad_norm": 0.5320647358894348, + "learning_rate": 9.292628421975104e-06, + "loss": 0.1517, + "step": 1817 + }, + { + "epoch": 0.5891121192482177, + "grad_norm": 0.5612199902534485, + "learning_rate": 9.291731212629096e-06, + "loss": 0.1662, + "step": 1818 + }, + { + "epoch": 0.5894361633182112, + "grad_norm": 0.557144820690155, + "learning_rate": 9.290833478016735e-06, + "loss": 0.1622, + "step": 1819 + }, + { + "epoch": 0.5897602073882048, + "grad_norm": 0.5538755059242249, + "learning_rate": 9.289935218247895e-06, + "loss": 0.1526, + "step": 1820 + }, + { + "epoch": 0.5900842514581983, + "grad_norm": 0.5562155246734619, + "learning_rate": 9.289036433432513e-06, + "loss": 0.1676, + "step": 1821 + }, + { + "epoch": 0.5904082955281919, + "grad_norm": 0.5501015782356262, + "learning_rate": 9.288137123680595e-06, + "loss": 0.1635, + "step": 1822 + }, + { + "epoch": 0.5907323395981854, + "grad_norm": 0.5297316312789917, + "learning_rate": 9.287237289102202e-06, + "loss": 0.1649, + "step": 1823 + }, + { + "epoch": 0.5910563836681789, + "grad_norm": 0.4975319504737854, + "learning_rate": 9.286336929807471e-06, + "loss": 0.1551, + "step": 1824 + }, + { + "epoch": 0.5913804277381723, + "grad_norm": 0.5168135166168213, + "learning_rate": 9.285436045906593e-06, + "loss": 0.1526, + "step": 1825 + }, + { + "epoch": 0.5917044718081659, + "grad_norm": 0.5180168747901917, + "learning_rate": 9.28453463750983e-06, + "loss": 0.1674, + "step": 1826 + }, + { + "epoch": 0.5920285158781594, + "grad_norm": 0.5400831699371338, + "learning_rate": 9.283632704727507e-06, + "loss": 0.162, + "step": 1827 + }, + { + "epoch": 0.592352559948153, + "grad_norm": 0.5392692685127258, + "learning_rate": 9.282730247670008e-06, + "loss": 0.1572, + "step": 1828 + }, + { + "epoch": 0.5926766040181465, + "grad_norm": 0.5672776103019714, + "learning_rate": 9.281827266447787e-06, + "loss": 0.1712, + "step": 1829 + }, + { + "epoch": 0.59300064808814, + "grad_norm": 0.522743821144104, + "learning_rate": 9.28092376117136e-06, + "loss": 0.1572, + "step": 1830 + }, + { + "epoch": 0.5933246921581335, + "grad_norm": 0.5244477987289429, + "learning_rate": 9.280019731951305e-06, + "loss": 0.1672, + "step": 1831 + }, + { + "epoch": 0.593648736228127, + "grad_norm": 0.505828320980072, + "learning_rate": 9.27911517889827e-06, + "loss": 0.1542, + "step": 1832 + }, + { + "epoch": 0.5939727802981205, + "grad_norm": 0.4986834228038788, + "learning_rate": 9.278210102122962e-06, + "loss": 0.1493, + "step": 1833 + }, + { + "epoch": 0.594296824368114, + "grad_norm": 0.5014796257019043, + "learning_rate": 9.277304501736156e-06, + "loss": 0.1544, + "step": 1834 + }, + { + "epoch": 0.5946208684381076, + "grad_norm": 0.5235145092010498, + "learning_rate": 9.276398377848683e-06, + "loss": 0.1587, + "step": 1835 + }, + { + "epoch": 0.5949449125081011, + "grad_norm": 0.5145488977432251, + "learning_rate": 9.27549173057145e-06, + "loss": 0.1591, + "step": 1836 + }, + { + "epoch": 0.5952689565780946, + "grad_norm": 0.5658987760543823, + "learning_rate": 9.274584560015419e-06, + "loss": 0.1698, + "step": 1837 + }, + { + "epoch": 0.5955930006480882, + "grad_norm": 0.5141145586967468, + "learning_rate": 9.273676866291617e-06, + "loss": 0.1467, + "step": 1838 + }, + { + "epoch": 0.5959170447180817, + "grad_norm": 0.5158601403236389, + "learning_rate": 9.27276864951114e-06, + "loss": 0.1618, + "step": 1839 + }, + { + "epoch": 0.5962410887880751, + "grad_norm": 0.5314555764198303, + "learning_rate": 9.271859909785144e-06, + "loss": 0.1599, + "step": 1840 + }, + { + "epoch": 0.5965651328580687, + "grad_norm": 0.4693741202354431, + "learning_rate": 9.270950647224851e-06, + "loss": 0.1418, + "step": 1841 + }, + { + "epoch": 0.5968891769280622, + "grad_norm": 0.5171167850494385, + "learning_rate": 9.270040861941542e-06, + "loss": 0.152, + "step": 1842 + }, + { + "epoch": 0.5972132209980557, + "grad_norm": 0.5105248093605042, + "learning_rate": 9.269130554046571e-06, + "loss": 0.1562, + "step": 1843 + }, + { + "epoch": 0.5975372650680493, + "grad_norm": 0.5515754222869873, + "learning_rate": 9.268219723651349e-06, + "loss": 0.1575, + "step": 1844 + }, + { + "epoch": 0.5978613091380428, + "grad_norm": 0.47018948197364807, + "learning_rate": 9.267308370867352e-06, + "loss": 0.1364, + "step": 1845 + }, + { + "epoch": 0.5981853532080363, + "grad_norm": 0.5275624394416809, + "learning_rate": 9.26639649580612e-06, + "loss": 0.1698, + "step": 1846 + }, + { + "epoch": 0.5985093972780298, + "grad_norm": 0.5570939183235168, + "learning_rate": 9.265484098579259e-06, + "loss": 0.1586, + "step": 1847 + }, + { + "epoch": 0.5988334413480233, + "grad_norm": 0.5090274810791016, + "learning_rate": 9.264571179298438e-06, + "loss": 0.1477, + "step": 1848 + }, + { + "epoch": 0.5991574854180168, + "grad_norm": 0.5754071474075317, + "learning_rate": 9.263657738075387e-06, + "loss": 0.1657, + "step": 1849 + }, + { + "epoch": 0.5994815294880104, + "grad_norm": 0.5546247363090515, + "learning_rate": 9.262743775021907e-06, + "loss": 0.1523, + "step": 1850 + }, + { + "epoch": 0.5998055735580039, + "grad_norm": 0.519078254699707, + "learning_rate": 9.261829290249855e-06, + "loss": 0.1658, + "step": 1851 + }, + { + "epoch": 0.6001296176279974, + "grad_norm": 0.5111896991729736, + "learning_rate": 9.260914283871154e-06, + "loss": 0.1628, + "step": 1852 + }, + { + "epoch": 0.600453661697991, + "grad_norm": 0.5172817707061768, + "learning_rate": 9.259998755997796e-06, + "loss": 0.1494, + "step": 1853 + }, + { + "epoch": 0.6007777057679844, + "grad_norm": 0.5435042977333069, + "learning_rate": 9.259082706741828e-06, + "loss": 0.1839, + "step": 1854 + }, + { + "epoch": 0.6011017498379779, + "grad_norm": 0.5118913054466248, + "learning_rate": 9.258166136215369e-06, + "loss": 0.1489, + "step": 1855 + }, + { + "epoch": 0.6014257939079715, + "grad_norm": 0.5078453421592712, + "learning_rate": 9.257249044530596e-06, + "loss": 0.1555, + "step": 1856 + }, + { + "epoch": 0.601749837977965, + "grad_norm": 0.49562156200408936, + "learning_rate": 9.256331431799754e-06, + "loss": 0.1619, + "step": 1857 + }, + { + "epoch": 0.6020738820479585, + "grad_norm": 0.5592288970947266, + "learning_rate": 9.25541329813515e-06, + "loss": 0.1618, + "step": 1858 + }, + { + "epoch": 0.6023979261179521, + "grad_norm": 0.5321934819221497, + "learning_rate": 9.254494643649152e-06, + "loss": 0.1641, + "step": 1859 + }, + { + "epoch": 0.6027219701879456, + "grad_norm": 0.4995349645614624, + "learning_rate": 9.2535754684542e-06, + "loss": 0.1504, + "step": 1860 + }, + { + "epoch": 0.6030460142579391, + "grad_norm": 0.5189761519432068, + "learning_rate": 9.252655772662784e-06, + "loss": 0.1414, + "step": 1861 + }, + { + "epoch": 0.6033700583279326, + "grad_norm": 0.5353453755378723, + "learning_rate": 9.251735556387473e-06, + "loss": 0.1696, + "step": 1862 + }, + { + "epoch": 0.6036941023979261, + "grad_norm": 0.49074551463127136, + "learning_rate": 9.250814819740888e-06, + "loss": 0.1524, + "step": 1863 + }, + { + "epoch": 0.6040181464679196, + "grad_norm": 0.46947991847991943, + "learning_rate": 9.249893562835723e-06, + "loss": 0.1413, + "step": 1864 + }, + { + "epoch": 0.6043421905379132, + "grad_norm": 0.49254897236824036, + "learning_rate": 9.248971785784726e-06, + "loss": 0.1457, + "step": 1865 + }, + { + "epoch": 0.6046662346079067, + "grad_norm": 0.5511367917060852, + "learning_rate": 9.248049488700717e-06, + "loss": 0.1761, + "step": 1866 + }, + { + "epoch": 0.6049902786779002, + "grad_norm": 0.5265299677848816, + "learning_rate": 9.247126671696573e-06, + "loss": 0.1541, + "step": 1867 + }, + { + "epoch": 0.6053143227478938, + "grad_norm": 0.5218799114227295, + "learning_rate": 9.24620333488524e-06, + "loss": 0.162, + "step": 1868 + }, + { + "epoch": 0.6056383668178872, + "grad_norm": 0.4817226827144623, + "learning_rate": 9.245279478379726e-06, + "loss": 0.1429, + "step": 1869 + }, + { + "epoch": 0.6059624108878807, + "grad_norm": 0.5035088062286377, + "learning_rate": 9.2443551022931e-06, + "loss": 0.1436, + "step": 1870 + }, + { + "epoch": 0.6062864549578743, + "grad_norm": 0.5474444627761841, + "learning_rate": 9.2434302067385e-06, + "loss": 0.1656, + "step": 1871 + }, + { + "epoch": 0.6066104990278678, + "grad_norm": 0.5090144872665405, + "learning_rate": 9.242504791829123e-06, + "loss": 0.1614, + "step": 1872 + }, + { + "epoch": 0.6069345430978613, + "grad_norm": 0.6202980279922485, + "learning_rate": 9.241578857678228e-06, + "loss": 0.1633, + "step": 1873 + }, + { + "epoch": 0.6072585871678549, + "grad_norm": 0.480751097202301, + "learning_rate": 9.240652404399145e-06, + "loss": 0.1439, + "step": 1874 + }, + { + "epoch": 0.6075826312378484, + "grad_norm": 0.5152072310447693, + "learning_rate": 9.239725432105258e-06, + "loss": 0.1493, + "step": 1875 + }, + { + "epoch": 0.6079066753078418, + "grad_norm": 0.5336874723434448, + "learning_rate": 9.238797940910021e-06, + "loss": 0.1649, + "step": 1876 + }, + { + "epoch": 0.6082307193778353, + "grad_norm": 0.5547704696655273, + "learning_rate": 9.237869930926953e-06, + "loss": 0.1666, + "step": 1877 + }, + { + "epoch": 0.6085547634478289, + "grad_norm": 0.5038400292396545, + "learning_rate": 9.23694140226963e-06, + "loss": 0.1526, + "step": 1878 + }, + { + "epoch": 0.6088788075178224, + "grad_norm": 0.5584294199943542, + "learning_rate": 9.236012355051697e-06, + "loss": 0.1708, + "step": 1879 + }, + { + "epoch": 0.609202851587816, + "grad_norm": 0.4911188781261444, + "learning_rate": 9.23508278938686e-06, + "loss": 0.1412, + "step": 1880 + }, + { + "epoch": 0.6095268956578095, + "grad_norm": 0.5308417081832886, + "learning_rate": 9.234152705388885e-06, + "loss": 0.1554, + "step": 1881 + }, + { + "epoch": 0.609850939727803, + "grad_norm": 0.5310256481170654, + "learning_rate": 9.233222103171612e-06, + "loss": 0.1553, + "step": 1882 + }, + { + "epoch": 0.6101749837977966, + "grad_norm": 0.5036491751670837, + "learning_rate": 9.232290982848933e-06, + "loss": 0.1382, + "step": 1883 + }, + { + "epoch": 0.61049902786779, + "grad_norm": 0.4999232888221741, + "learning_rate": 9.23135934453481e-06, + "loss": 0.1558, + "step": 1884 + }, + { + "epoch": 0.6108230719377835, + "grad_norm": 0.4958760440349579, + "learning_rate": 9.230427188343266e-06, + "loss": 0.1469, + "step": 1885 + }, + { + "epoch": 0.611147116007777, + "grad_norm": 0.5068649053573608, + "learning_rate": 9.229494514388388e-06, + "loss": 0.1408, + "step": 1886 + }, + { + "epoch": 0.6114711600777706, + "grad_norm": 0.5678321719169617, + "learning_rate": 9.228561322784326e-06, + "loss": 0.172, + "step": 1887 + }, + { + "epoch": 0.6117952041477641, + "grad_norm": 0.527591347694397, + "learning_rate": 9.227627613645294e-06, + "loss": 0.1642, + "step": 1888 + }, + { + "epoch": 0.6121192482177576, + "grad_norm": 0.4956148862838745, + "learning_rate": 9.226693387085568e-06, + "loss": 0.1551, + "step": 1889 + }, + { + "epoch": 0.6124432922877512, + "grad_norm": 0.4847855269908905, + "learning_rate": 9.225758643219489e-06, + "loss": 0.1374, + "step": 1890 + }, + { + "epoch": 0.6127673363577446, + "grad_norm": 0.505673348903656, + "learning_rate": 9.22482338216146e-06, + "loss": 0.1498, + "step": 1891 + }, + { + "epoch": 0.6130913804277381, + "grad_norm": 0.49389687180519104, + "learning_rate": 9.22388760402595e-06, + "loss": 0.1405, + "step": 1892 + }, + { + "epoch": 0.6134154244977317, + "grad_norm": 0.48019102215766907, + "learning_rate": 9.222951308927485e-06, + "loss": 0.1413, + "step": 1893 + }, + { + "epoch": 0.6137394685677252, + "grad_norm": 0.5360737442970276, + "learning_rate": 9.222014496980665e-06, + "loss": 0.1479, + "step": 1894 + }, + { + "epoch": 0.6140635126377187, + "grad_norm": 0.5291153192520142, + "learning_rate": 9.221077168300142e-06, + "loss": 0.1649, + "step": 1895 + }, + { + "epoch": 0.6143875567077123, + "grad_norm": 0.5272452235221863, + "learning_rate": 9.220139323000634e-06, + "loss": 0.156, + "step": 1896 + }, + { + "epoch": 0.6147116007777058, + "grad_norm": 0.5477816462516785, + "learning_rate": 9.219200961196929e-06, + "loss": 0.166, + "step": 1897 + }, + { + "epoch": 0.6150356448476992, + "grad_norm": 0.522372841835022, + "learning_rate": 9.218262083003871e-06, + "loss": 0.1643, + "step": 1898 + }, + { + "epoch": 0.6153596889176928, + "grad_norm": 0.5748099088668823, + "learning_rate": 9.21732268853637e-06, + "loss": 0.1773, + "step": 1899 + }, + { + "epoch": 0.6156837329876863, + "grad_norm": 0.4743006229400635, + "learning_rate": 9.216382777909398e-06, + "loss": 0.1358, + "step": 1900 + }, + { + "epoch": 0.6160077770576798, + "grad_norm": 0.507614254951477, + "learning_rate": 9.215442351237993e-06, + "loss": 0.149, + "step": 1901 + }, + { + "epoch": 0.6163318211276734, + "grad_norm": 0.5415900945663452, + "learning_rate": 9.214501408637253e-06, + "loss": 0.1543, + "step": 1902 + }, + { + "epoch": 0.6166558651976669, + "grad_norm": 0.5349360108375549, + "learning_rate": 9.21355995022234e-06, + "loss": 0.1611, + "step": 1903 + }, + { + "epoch": 0.6169799092676604, + "grad_norm": 0.5085405111312866, + "learning_rate": 9.212617976108478e-06, + "loss": 0.153, + "step": 1904 + }, + { + "epoch": 0.6173039533376539, + "grad_norm": 0.537108302116394, + "learning_rate": 9.211675486410959e-06, + "loss": 0.1559, + "step": 1905 + }, + { + "epoch": 0.6176279974076474, + "grad_norm": 0.503128170967102, + "learning_rate": 9.21073248124513e-06, + "loss": 0.1531, + "step": 1906 + }, + { + "epoch": 0.6179520414776409, + "grad_norm": 0.5284308791160583, + "learning_rate": 9.20978896072641e-06, + "loss": 0.1648, + "step": 1907 + }, + { + "epoch": 0.6182760855476345, + "grad_norm": 0.5065122246742249, + "learning_rate": 9.208844924970276e-06, + "loss": 0.1551, + "step": 1908 + }, + { + "epoch": 0.618600129617628, + "grad_norm": 0.5227933526039124, + "learning_rate": 9.207900374092268e-06, + "loss": 0.1589, + "step": 1909 + }, + { + "epoch": 0.6189241736876215, + "grad_norm": 0.5541921257972717, + "learning_rate": 9.206955308207988e-06, + "loss": 0.1717, + "step": 1910 + }, + { + "epoch": 0.6192482177576151, + "grad_norm": 0.5353711843490601, + "learning_rate": 9.206009727433106e-06, + "loss": 0.1744, + "step": 1911 + }, + { + "epoch": 0.6195722618276086, + "grad_norm": 0.4900938868522644, + "learning_rate": 9.205063631883351e-06, + "loss": 0.1454, + "step": 1912 + }, + { + "epoch": 0.619896305897602, + "grad_norm": 0.5369064211845398, + "learning_rate": 9.204117021674515e-06, + "loss": 0.1573, + "step": 1913 + }, + { + "epoch": 0.6202203499675956, + "grad_norm": 0.5292285680770874, + "learning_rate": 9.203169896922453e-06, + "loss": 0.1614, + "step": 1914 + }, + { + "epoch": 0.6205443940375891, + "grad_norm": 0.5318422317504883, + "learning_rate": 9.202222257743088e-06, + "loss": 0.165, + "step": 1915 + }, + { + "epoch": 0.6208684381075826, + "grad_norm": 0.5160491466522217, + "learning_rate": 9.201274104252398e-06, + "loss": 0.1462, + "step": 1916 + }, + { + "epoch": 0.6211924821775762, + "grad_norm": 0.5053558349609375, + "learning_rate": 9.20032543656643e-06, + "loss": 0.1485, + "step": 1917 + }, + { + "epoch": 0.6215165262475697, + "grad_norm": 0.5057937502861023, + "learning_rate": 9.19937625480129e-06, + "loss": 0.1559, + "step": 1918 + }, + { + "epoch": 0.6218405703175632, + "grad_norm": 0.5348033905029297, + "learning_rate": 9.19842655907315e-06, + "loss": 0.1707, + "step": 1919 + }, + { + "epoch": 0.6221646143875567, + "grad_norm": 0.5293624997138977, + "learning_rate": 9.197476349498243e-06, + "loss": 0.1629, + "step": 1920 + }, + { + "epoch": 0.6224886584575502, + "grad_norm": 0.5092240571975708, + "learning_rate": 9.196525626192865e-06, + "loss": 0.1699, + "step": 1921 + }, + { + "epoch": 0.6228127025275437, + "grad_norm": 0.5243478417396545, + "learning_rate": 9.195574389273375e-06, + "loss": 0.1565, + "step": 1922 + }, + { + "epoch": 0.6231367465975373, + "grad_norm": 0.5151318907737732, + "learning_rate": 9.194622638856198e-06, + "loss": 0.1464, + "step": 1923 + }, + { + "epoch": 0.6234607906675308, + "grad_norm": 0.5310802459716797, + "learning_rate": 9.193670375057816e-06, + "loss": 0.1573, + "step": 1924 + }, + { + "epoch": 0.6237848347375243, + "grad_norm": 0.5187256336212158, + "learning_rate": 9.19271759799478e-06, + "loss": 0.1602, + "step": 1925 + }, + { + "epoch": 0.6241088788075179, + "grad_norm": 0.48503974080085754, + "learning_rate": 9.191764307783698e-06, + "loss": 0.1418, + "step": 1926 + }, + { + "epoch": 0.6244329228775113, + "grad_norm": 0.48676154017448425, + "learning_rate": 9.190810504541244e-06, + "loss": 0.1403, + "step": 1927 + }, + { + "epoch": 0.6247569669475048, + "grad_norm": 0.529976487159729, + "learning_rate": 9.189856188384152e-06, + "loss": 0.1638, + "step": 1928 + }, + { + "epoch": 0.6250810110174984, + "grad_norm": 0.5011406540870667, + "learning_rate": 9.188901359429226e-06, + "loss": 0.1522, + "step": 1929 + }, + { + "epoch": 0.6254050550874919, + "grad_norm": 0.4782002866268158, + "learning_rate": 9.187946017793324e-06, + "loss": 0.1445, + "step": 1930 + }, + { + "epoch": 0.6257290991574854, + "grad_norm": 0.48546773195266724, + "learning_rate": 9.186990163593371e-06, + "loss": 0.1413, + "step": 1931 + }, + { + "epoch": 0.626053143227479, + "grad_norm": 0.5287179946899414, + "learning_rate": 9.18603379694636e-06, + "loss": 0.1532, + "step": 1932 + }, + { + "epoch": 0.6263771872974725, + "grad_norm": 0.5079996585845947, + "learning_rate": 9.185076917969331e-06, + "loss": 0.1398, + "step": 1933 + }, + { + "epoch": 0.626701231367466, + "grad_norm": 0.5039437413215637, + "learning_rate": 9.184119526779403e-06, + "loss": 0.157, + "step": 1934 + }, + { + "epoch": 0.6270252754374595, + "grad_norm": 0.5134934782981873, + "learning_rate": 9.183161623493753e-06, + "loss": 0.1481, + "step": 1935 + }, + { + "epoch": 0.627349319507453, + "grad_norm": 0.5130413770675659, + "learning_rate": 9.182203208229614e-06, + "loss": 0.1609, + "step": 1936 + }, + { + "epoch": 0.6276733635774465, + "grad_norm": 0.5762905478477478, + "learning_rate": 9.181244281104289e-06, + "loss": 0.1708, + "step": 1937 + }, + { + "epoch": 0.62799740764744, + "grad_norm": 0.5197169780731201, + "learning_rate": 9.180284842235143e-06, + "loss": 0.1631, + "step": 1938 + }, + { + "epoch": 0.6283214517174336, + "grad_norm": 0.5109285116195679, + "learning_rate": 9.1793248917396e-06, + "loss": 0.1493, + "step": 1939 + }, + { + "epoch": 0.6286454957874271, + "grad_norm": 0.5167864561080933, + "learning_rate": 9.178364429735149e-06, + "loss": 0.1513, + "step": 1940 + }, + { + "epoch": 0.6289695398574207, + "grad_norm": 0.5290215611457825, + "learning_rate": 9.177403456339342e-06, + "loss": 0.1447, + "step": 1941 + }, + { + "epoch": 0.6292935839274141, + "grad_norm": 0.5334181785583496, + "learning_rate": 9.176441971669791e-06, + "loss": 0.1597, + "step": 1942 + }, + { + "epoch": 0.6296176279974076, + "grad_norm": 0.5249089002609253, + "learning_rate": 9.175479975844175e-06, + "loss": 0.1572, + "step": 1943 + }, + { + "epoch": 0.6299416720674011, + "grad_norm": 0.5331795811653137, + "learning_rate": 9.17451746898023e-06, + "loss": 0.1602, + "step": 1944 + }, + { + "epoch": 0.6302657161373947, + "grad_norm": 0.4850247800350189, + "learning_rate": 9.173554451195763e-06, + "loss": 0.1398, + "step": 1945 + }, + { + "epoch": 0.6305897602073882, + "grad_norm": 0.5020791888237, + "learning_rate": 9.17259092260863e-06, + "loss": 0.1494, + "step": 1946 + }, + { + "epoch": 0.6309138042773818, + "grad_norm": 0.5400851964950562, + "learning_rate": 9.171626883336766e-06, + "loss": 0.17, + "step": 1947 + }, + { + "epoch": 0.6312378483473753, + "grad_norm": 0.4870317280292511, + "learning_rate": 9.170662333498153e-06, + "loss": 0.147, + "step": 1948 + }, + { + "epoch": 0.6315618924173687, + "grad_norm": 0.5228290557861328, + "learning_rate": 9.169697273210846e-06, + "loss": 0.1528, + "step": 1949 + }, + { + "epoch": 0.6318859364873622, + "grad_norm": 0.48440635204315186, + "learning_rate": 9.16873170259296e-06, + "loss": 0.1442, + "step": 1950 + }, + { + "epoch": 0.6322099805573558, + "grad_norm": 0.533446729183197, + "learning_rate": 9.167765621762668e-06, + "loss": 0.1524, + "step": 1951 + }, + { + "epoch": 0.6325340246273493, + "grad_norm": 0.5196256041526794, + "learning_rate": 9.166799030838212e-06, + "loss": 0.1628, + "step": 1952 + }, + { + "epoch": 0.6328580686973428, + "grad_norm": 0.5105645060539246, + "learning_rate": 9.165831929937892e-06, + "loss": 0.1429, + "step": 1953 + }, + { + "epoch": 0.6331821127673364, + "grad_norm": 0.5194631218910217, + "learning_rate": 9.164864319180074e-06, + "loss": 0.1642, + "step": 1954 + }, + { + "epoch": 0.6335061568373299, + "grad_norm": 0.5209739804267883, + "learning_rate": 9.16389619868318e-06, + "loss": 0.1742, + "step": 1955 + }, + { + "epoch": 0.6338302009073234, + "grad_norm": 0.5078026056289673, + "learning_rate": 9.162927568565701e-06, + "loss": 0.1484, + "step": 1956 + }, + { + "epoch": 0.6341542449773169, + "grad_norm": 0.4925689399242401, + "learning_rate": 9.16195842894619e-06, + "loss": 0.1509, + "step": 1957 + }, + { + "epoch": 0.6344782890473104, + "grad_norm": 0.5168012380599976, + "learning_rate": 9.160988779943257e-06, + "loss": 0.1572, + "step": 1958 + }, + { + "epoch": 0.6348023331173039, + "grad_norm": 0.5228518843650818, + "learning_rate": 9.160018621675577e-06, + "loss": 0.1566, + "step": 1959 + }, + { + "epoch": 0.6351263771872975, + "grad_norm": 0.5400591492652893, + "learning_rate": 9.159047954261892e-06, + "loss": 0.1583, + "step": 1960 + }, + { + "epoch": 0.635450421257291, + "grad_norm": 0.4899173974990845, + "learning_rate": 9.158076777820998e-06, + "loss": 0.1445, + "step": 1961 + }, + { + "epoch": 0.6357744653272845, + "grad_norm": 0.5116453170776367, + "learning_rate": 9.157105092471764e-06, + "loss": 0.1631, + "step": 1962 + }, + { + "epoch": 0.6360985093972781, + "grad_norm": 0.5398380756378174, + "learning_rate": 9.156132898333108e-06, + "loss": 0.1683, + "step": 1963 + }, + { + "epoch": 0.6364225534672715, + "grad_norm": 0.49203190207481384, + "learning_rate": 9.15516019552402e-06, + "loss": 0.1467, + "step": 1964 + }, + { + "epoch": 0.636746597537265, + "grad_norm": 0.5170449018478394, + "learning_rate": 9.154186984163547e-06, + "loss": 0.1537, + "step": 1965 + }, + { + "epoch": 0.6370706416072586, + "grad_norm": 0.4930606484413147, + "learning_rate": 9.153213264370805e-06, + "loss": 0.1439, + "step": 1966 + }, + { + "epoch": 0.6373946856772521, + "grad_norm": 0.48055657744407654, + "learning_rate": 9.152239036264965e-06, + "loss": 0.1383, + "step": 1967 + }, + { + "epoch": 0.6377187297472456, + "grad_norm": 0.49675148725509644, + "learning_rate": 9.151264299965263e-06, + "loss": 0.1519, + "step": 1968 + }, + { + "epoch": 0.6380427738172392, + "grad_norm": 0.4874381422996521, + "learning_rate": 9.150289055591e-06, + "loss": 0.1409, + "step": 1969 + }, + { + "epoch": 0.6383668178872327, + "grad_norm": 0.4918610155582428, + "learning_rate": 9.149313303261534e-06, + "loss": 0.1473, + "step": 1970 + }, + { + "epoch": 0.6386908619572261, + "grad_norm": 0.526047945022583, + "learning_rate": 9.148337043096287e-06, + "loss": 0.1563, + "step": 1971 + }, + { + "epoch": 0.6390149060272197, + "grad_norm": 0.5674269199371338, + "learning_rate": 9.147360275214746e-06, + "loss": 0.1631, + "step": 1972 + }, + { + "epoch": 0.6393389500972132, + "grad_norm": 0.500179648399353, + "learning_rate": 9.146382999736455e-06, + "loss": 0.1521, + "step": 1973 + }, + { + "epoch": 0.6396629941672067, + "grad_norm": 0.5363490581512451, + "learning_rate": 9.145405216781026e-06, + "loss": 0.1432, + "step": 1974 + }, + { + "epoch": 0.6399870382372003, + "grad_norm": 0.5230416059494019, + "learning_rate": 9.14442692646813e-06, + "loss": 0.1531, + "step": 1975 + }, + { + "epoch": 0.6403110823071938, + "grad_norm": 0.5501431226730347, + "learning_rate": 9.143448128917499e-06, + "loss": 0.1595, + "step": 1976 + }, + { + "epoch": 0.6406351263771873, + "grad_norm": 0.5650454163551331, + "learning_rate": 9.142468824248928e-06, + "loss": 0.1617, + "step": 1977 + }, + { + "epoch": 0.6409591704471809, + "grad_norm": 0.5224339962005615, + "learning_rate": 9.141489012582277e-06, + "loss": 0.1635, + "step": 1978 + }, + { + "epoch": 0.6412832145171743, + "grad_norm": 0.5190390348434448, + "learning_rate": 9.140508694037462e-06, + "loss": 0.1548, + "step": 1979 + }, + { + "epoch": 0.6416072585871678, + "grad_norm": 0.5318208336830139, + "learning_rate": 9.139527868734465e-06, + "loss": 0.17, + "step": 1980 + }, + { + "epoch": 0.6419313026571614, + "grad_norm": 0.5096891522407532, + "learning_rate": 9.138546536793334e-06, + "loss": 0.1619, + "step": 1981 + }, + { + "epoch": 0.6422553467271549, + "grad_norm": 0.45812585949897766, + "learning_rate": 9.137564698334167e-06, + "loss": 0.128, + "step": 1982 + }, + { + "epoch": 0.6425793907971484, + "grad_norm": 0.5666882991790771, + "learning_rate": 9.13658235347714e-06, + "loss": 0.1581, + "step": 1983 + }, + { + "epoch": 0.642903434867142, + "grad_norm": 0.5142824053764343, + "learning_rate": 9.135599502342474e-06, + "loss": 0.1486, + "step": 1984 + }, + { + "epoch": 0.6432274789371355, + "grad_norm": 0.48653656244277954, + "learning_rate": 9.134616145050466e-06, + "loss": 0.1339, + "step": 1985 + }, + { + "epoch": 0.6435515230071289, + "grad_norm": 0.5284735560417175, + "learning_rate": 9.13363228172147e-06, + "loss": 0.1585, + "step": 1986 + }, + { + "epoch": 0.6438755670771225, + "grad_norm": 0.503960132598877, + "learning_rate": 9.132647912475897e-06, + "loss": 0.1369, + "step": 1987 + }, + { + "epoch": 0.644199611147116, + "grad_norm": 0.5423457622528076, + "learning_rate": 9.131663037434228e-06, + "loss": 0.1658, + "step": 1988 + }, + { + "epoch": 0.6445236552171095, + "grad_norm": 0.5273451805114746, + "learning_rate": 9.130677656717e-06, + "loss": 0.1451, + "step": 1989 + }, + { + "epoch": 0.6448476992871031, + "grad_norm": 0.5250473022460938, + "learning_rate": 9.129691770444815e-06, + "loss": 0.1539, + "step": 1990 + }, + { + "epoch": 0.6451717433570966, + "grad_norm": 0.5005070567131042, + "learning_rate": 9.128705378738336e-06, + "loss": 0.1438, + "step": 1991 + }, + { + "epoch": 0.6454957874270901, + "grad_norm": 0.49627885222435, + "learning_rate": 9.127718481718288e-06, + "loss": 0.1517, + "step": 1992 + }, + { + "epoch": 0.6458198314970836, + "grad_norm": 0.5641838908195496, + "learning_rate": 9.126731079505457e-06, + "loss": 0.1742, + "step": 1993 + }, + { + "epoch": 0.6461438755670771, + "grad_norm": 0.5262545347213745, + "learning_rate": 9.125743172220691e-06, + "loss": 0.1595, + "step": 1994 + }, + { + "epoch": 0.6464679196370706, + "grad_norm": 0.49203866720199585, + "learning_rate": 9.124754759984901e-06, + "loss": 0.1422, + "step": 1995 + }, + { + "epoch": 0.6467919637070642, + "grad_norm": 0.5152744650840759, + "learning_rate": 9.12376584291906e-06, + "loss": 0.1515, + "step": 1996 + }, + { + "epoch": 0.6471160077770577, + "grad_norm": 0.5341880917549133, + "learning_rate": 9.122776421144201e-06, + "loss": 0.1588, + "step": 1997 + }, + { + "epoch": 0.6474400518470512, + "grad_norm": 0.5317200422286987, + "learning_rate": 9.12178649478142e-06, + "loss": 0.1654, + "step": 1998 + }, + { + "epoch": 0.6477640959170448, + "grad_norm": 0.4992202818393707, + "learning_rate": 9.120796063951873e-06, + "loss": 0.1632, + "step": 1999 + }, + { + "epoch": 0.6480881399870383, + "grad_norm": 0.5180873274803162, + "learning_rate": 9.11980512877678e-06, + "loss": 0.1541, + "step": 2000 + }, + { + "epoch": 0.6484121840570317, + "grad_norm": 0.4791898727416992, + "learning_rate": 9.118813689377422e-06, + "loss": 0.1371, + "step": 2001 + }, + { + "epoch": 0.6487362281270252, + "grad_norm": 0.531326413154602, + "learning_rate": 9.117821745875143e-06, + "loss": 0.1539, + "step": 2002 + }, + { + "epoch": 0.6490602721970188, + "grad_norm": 0.6741142868995667, + "learning_rate": 9.116829298391345e-06, + "loss": 0.1579, + "step": 2003 + }, + { + "epoch": 0.6493843162670123, + "grad_norm": 0.48655468225479126, + "learning_rate": 9.115836347047495e-06, + "loss": 0.1452, + "step": 2004 + }, + { + "epoch": 0.6497083603370059, + "grad_norm": 0.5300689339637756, + "learning_rate": 9.11484289196512e-06, + "loss": 0.1534, + "step": 2005 + }, + { + "epoch": 0.6500324044069994, + "grad_norm": 0.5319254994392395, + "learning_rate": 9.113848933265811e-06, + "loss": 0.1508, + "step": 2006 + }, + { + "epoch": 0.6503564484769929, + "grad_norm": 0.5492068529129028, + "learning_rate": 9.112854471071217e-06, + "loss": 0.1482, + "step": 2007 + }, + { + "epoch": 0.6506804925469863, + "grad_norm": 0.5193006992340088, + "learning_rate": 9.111859505503052e-06, + "loss": 0.1501, + "step": 2008 + }, + { + "epoch": 0.6510045366169799, + "grad_norm": 0.4945833683013916, + "learning_rate": 9.110864036683087e-06, + "loss": 0.1423, + "step": 2009 + }, + { + "epoch": 0.6513285806869734, + "grad_norm": 0.48565852642059326, + "learning_rate": 9.109868064733163e-06, + "loss": 0.143, + "step": 2010 + }, + { + "epoch": 0.651652624756967, + "grad_norm": 0.5576429963111877, + "learning_rate": 9.108871589775173e-06, + "loss": 0.1578, + "step": 2011 + }, + { + "epoch": 0.6519766688269605, + "grad_norm": 0.5786350965499878, + "learning_rate": 9.107874611931077e-06, + "loss": 0.1648, + "step": 2012 + }, + { + "epoch": 0.652300712896954, + "grad_norm": 0.5137717723846436, + "learning_rate": 9.106877131322897e-06, + "loss": 0.1389, + "step": 2013 + }, + { + "epoch": 0.6526247569669476, + "grad_norm": 0.5338220596313477, + "learning_rate": 9.105879148072712e-06, + "loss": 0.1633, + "step": 2014 + }, + { + "epoch": 0.652948801036941, + "grad_norm": 0.5126753449440002, + "learning_rate": 9.104880662302668e-06, + "loss": 0.1604, + "step": 2015 + }, + { + "epoch": 0.6532728451069345, + "grad_norm": 0.5010611414909363, + "learning_rate": 9.103881674134972e-06, + "loss": 0.1557, + "step": 2016 + }, + { + "epoch": 0.653596889176928, + "grad_norm": 0.5101701021194458, + "learning_rate": 9.102882183691884e-06, + "loss": 0.1614, + "step": 2017 + }, + { + "epoch": 0.6539209332469216, + "grad_norm": 0.5002159476280212, + "learning_rate": 9.101882191095738e-06, + "loss": 0.1496, + "step": 2018 + }, + { + "epoch": 0.6542449773169151, + "grad_norm": 0.5012000203132629, + "learning_rate": 9.10088169646892e-06, + "loss": 0.1597, + "step": 2019 + }, + { + "epoch": 0.6545690213869086, + "grad_norm": 0.48333480954170227, + "learning_rate": 9.099880699933883e-06, + "loss": 0.1474, + "step": 2020 + }, + { + "epoch": 0.6548930654569022, + "grad_norm": 0.5311923027038574, + "learning_rate": 9.098879201613136e-06, + "loss": 0.1596, + "step": 2021 + }, + { + "epoch": 0.6552171095268956, + "grad_norm": 0.4770635962486267, + "learning_rate": 9.097877201629258e-06, + "loss": 0.1391, + "step": 2022 + }, + { + "epoch": 0.6555411535968891, + "grad_norm": 0.5176326036453247, + "learning_rate": 9.096874700104879e-06, + "loss": 0.158, + "step": 2023 + }, + { + "epoch": 0.6558651976668827, + "grad_norm": 0.5453210473060608, + "learning_rate": 9.095871697162698e-06, + "loss": 0.1613, + "step": 2024 + }, + { + "epoch": 0.6561892417368762, + "grad_norm": 0.5248459577560425, + "learning_rate": 9.094868192925473e-06, + "loss": 0.152, + "step": 2025 + }, + { + "epoch": 0.6565132858068697, + "grad_norm": 0.5020378232002258, + "learning_rate": 9.093864187516021e-06, + "loss": 0.1593, + "step": 2026 + }, + { + "epoch": 0.6568373298768633, + "grad_norm": 0.4973413646221161, + "learning_rate": 9.092859681057224e-06, + "loss": 0.1481, + "step": 2027 + }, + { + "epoch": 0.6571613739468568, + "grad_norm": 0.5216763615608215, + "learning_rate": 9.091854673672026e-06, + "loss": 0.1453, + "step": 2028 + }, + { + "epoch": 0.6574854180168503, + "grad_norm": 0.5470162630081177, + "learning_rate": 9.090849165483428e-06, + "loss": 0.1698, + "step": 2029 + }, + { + "epoch": 0.6578094620868438, + "grad_norm": 0.5171158909797668, + "learning_rate": 9.089843156614493e-06, + "loss": 0.1399, + "step": 2030 + }, + { + "epoch": 0.6581335061568373, + "grad_norm": 0.5428946018218994, + "learning_rate": 9.08883664718835e-06, + "loss": 0.1724, + "step": 2031 + }, + { + "epoch": 0.6584575502268308, + "grad_norm": 0.5003471374511719, + "learning_rate": 9.087829637328183e-06, + "loss": 0.1521, + "step": 2032 + }, + { + "epoch": 0.6587815942968244, + "grad_norm": 0.5318998098373413, + "learning_rate": 9.086822127157243e-06, + "loss": 0.1627, + "step": 2033 + }, + { + "epoch": 0.6591056383668179, + "grad_norm": 0.4693848490715027, + "learning_rate": 9.085814116798837e-06, + "loss": 0.1434, + "step": 2034 + }, + { + "epoch": 0.6594296824368114, + "grad_norm": 0.4785168170928955, + "learning_rate": 9.084805606376337e-06, + "loss": 0.1407, + "step": 2035 + }, + { + "epoch": 0.659753726506805, + "grad_norm": 0.49794092774391174, + "learning_rate": 9.083796596013175e-06, + "loss": 0.1648, + "step": 2036 + }, + { + "epoch": 0.6600777705767984, + "grad_norm": 0.5339946746826172, + "learning_rate": 9.082787085832845e-06, + "loss": 0.1556, + "step": 2037 + }, + { + "epoch": 0.6604018146467919, + "grad_norm": 0.5245910286903381, + "learning_rate": 9.081777075958898e-06, + "loss": 0.1632, + "step": 2038 + }, + { + "epoch": 0.6607258587167855, + "grad_norm": 0.4721158742904663, + "learning_rate": 9.080766566514954e-06, + "loss": 0.1502, + "step": 2039 + }, + { + "epoch": 0.661049902786779, + "grad_norm": 0.56708163022995, + "learning_rate": 9.079755557624684e-06, + "loss": 0.1633, + "step": 2040 + }, + { + "epoch": 0.6613739468567725, + "grad_norm": 0.4877159297466278, + "learning_rate": 9.078744049411832e-06, + "loss": 0.1349, + "step": 2041 + }, + { + "epoch": 0.6616979909267661, + "grad_norm": 0.5330770015716553, + "learning_rate": 9.077732042000192e-06, + "loss": 0.1535, + "step": 2042 + }, + { + "epoch": 0.6620220349967596, + "grad_norm": 0.5046741962432861, + "learning_rate": 9.076719535513626e-06, + "loss": 0.1567, + "step": 2043 + }, + { + "epoch": 0.662346079066753, + "grad_norm": 0.5472025275230408, + "learning_rate": 9.075706530076054e-06, + "loss": 0.1667, + "step": 2044 + }, + { + "epoch": 0.6626701231367466, + "grad_norm": 0.515630841255188, + "learning_rate": 9.074693025811458e-06, + "loss": 0.1623, + "step": 2045 + }, + { + "epoch": 0.6629941672067401, + "grad_norm": 0.521040678024292, + "learning_rate": 9.073679022843882e-06, + "loss": 0.1632, + "step": 2046 + }, + { + "epoch": 0.6633182112767336, + "grad_norm": 0.4941205382347107, + "learning_rate": 9.072664521297432e-06, + "loss": 0.142, + "step": 2047 + }, + { + "epoch": 0.6636422553467272, + "grad_norm": 0.5200309753417969, + "learning_rate": 9.07164952129627e-06, + "loss": 0.1538, + "step": 2048 + }, + { + "epoch": 0.6639662994167207, + "grad_norm": 0.5273143649101257, + "learning_rate": 9.070634022964622e-06, + "loss": 0.1555, + "step": 2049 + }, + { + "epoch": 0.6642903434867142, + "grad_norm": 0.5260372757911682, + "learning_rate": 9.069618026426779e-06, + "loss": 0.1637, + "step": 2050 + }, + { + "epoch": 0.6646143875567078, + "grad_norm": 0.4782349169254303, + "learning_rate": 9.068601531807084e-06, + "loss": 0.1451, + "step": 2051 + }, + { + "epoch": 0.6649384316267012, + "grad_norm": 0.47235560417175293, + "learning_rate": 9.067584539229948e-06, + "loss": 0.1401, + "step": 2052 + }, + { + "epoch": 0.6652624756966947, + "grad_norm": 0.5347378253936768, + "learning_rate": 9.066567048819844e-06, + "loss": 0.1723, + "step": 2053 + }, + { + "epoch": 0.6655865197666883, + "grad_norm": 0.49886566400527954, + "learning_rate": 9.0655490607013e-06, + "loss": 0.1545, + "step": 2054 + }, + { + "epoch": 0.6659105638366818, + "grad_norm": 0.5145599842071533, + "learning_rate": 9.064530574998907e-06, + "loss": 0.1571, + "step": 2055 + }, + { + "epoch": 0.6662346079066753, + "grad_norm": 0.4755716621875763, + "learning_rate": 9.063511591837322e-06, + "loss": 0.1421, + "step": 2056 + }, + { + "epoch": 0.6665586519766689, + "grad_norm": 0.535316526889801, + "learning_rate": 9.062492111341254e-06, + "loss": 0.1547, + "step": 2057 + }, + { + "epoch": 0.6668826960466624, + "grad_norm": 0.5307443141937256, + "learning_rate": 9.06147213363548e-06, + "loss": 0.1716, + "step": 2058 + }, + { + "epoch": 0.6672067401166558, + "grad_norm": 0.48614341020584106, + "learning_rate": 9.060451658844835e-06, + "loss": 0.1395, + "step": 2059 + }, + { + "epoch": 0.6675307841866494, + "grad_norm": 0.4861319363117218, + "learning_rate": 9.059430687094215e-06, + "loss": 0.1462, + "step": 2060 + }, + { + "epoch": 0.6678548282566429, + "grad_norm": 0.5452821850776672, + "learning_rate": 9.058409218508577e-06, + "loss": 0.1834, + "step": 2061 + }, + { + "epoch": 0.6681788723266364, + "grad_norm": 0.48399338126182556, + "learning_rate": 9.05738725321294e-06, + "loss": 0.1464, + "step": 2062 + }, + { + "epoch": 0.66850291639663, + "grad_norm": 0.49779587984085083, + "learning_rate": 9.056364791332381e-06, + "loss": 0.1512, + "step": 2063 + }, + { + "epoch": 0.6688269604666235, + "grad_norm": 0.5269302725791931, + "learning_rate": 9.055341832992041e-06, + "loss": 0.1555, + "step": 2064 + }, + { + "epoch": 0.669151004536617, + "grad_norm": 0.5232394933700562, + "learning_rate": 9.05431837831712e-06, + "loss": 0.1593, + "step": 2065 + }, + { + "epoch": 0.6694750486066104, + "grad_norm": 0.5113489031791687, + "learning_rate": 9.053294427432877e-06, + "loss": 0.154, + "step": 2066 + }, + { + "epoch": 0.669799092676604, + "grad_norm": 0.48380905389785767, + "learning_rate": 9.052269980464634e-06, + "loss": 0.1402, + "step": 2067 + }, + { + "epoch": 0.6701231367465975, + "grad_norm": 0.4939453899860382, + "learning_rate": 9.051245037537777e-06, + "loss": 0.1592, + "step": 2068 + }, + { + "epoch": 0.670447180816591, + "grad_norm": 0.5096998810768127, + "learning_rate": 9.050219598777745e-06, + "loss": 0.1562, + "step": 2069 + }, + { + "epoch": 0.6707712248865846, + "grad_norm": 0.5372338891029358, + "learning_rate": 9.049193664310043e-06, + "loss": 0.1541, + "step": 2070 + }, + { + "epoch": 0.6710952689565781, + "grad_norm": 0.4848030209541321, + "learning_rate": 9.048167234260235e-06, + "loss": 0.1562, + "step": 2071 + }, + { + "epoch": 0.6714193130265717, + "grad_norm": 0.4721747040748596, + "learning_rate": 9.04714030875395e-06, + "loss": 0.1437, + "step": 2072 + }, + { + "epoch": 0.6717433570965652, + "grad_norm": 0.49510085582733154, + "learning_rate": 9.046112887916867e-06, + "loss": 0.1535, + "step": 2073 + }, + { + "epoch": 0.6720674011665586, + "grad_norm": 0.468925803899765, + "learning_rate": 9.045084971874738e-06, + "loss": 0.1395, + "step": 2074 + }, + { + "epoch": 0.6723914452365521, + "grad_norm": 0.48380059003829956, + "learning_rate": 9.044056560753367e-06, + "loss": 0.1463, + "step": 2075 + }, + { + "epoch": 0.6727154893065457, + "grad_norm": 0.5094043612480164, + "learning_rate": 9.043027654678623e-06, + "loss": 0.1554, + "step": 2076 + }, + { + "epoch": 0.6730395333765392, + "grad_norm": 0.4993102252483368, + "learning_rate": 9.041998253776433e-06, + "loss": 0.1587, + "step": 2077 + }, + { + "epoch": 0.6733635774465327, + "grad_norm": 0.5299800634384155, + "learning_rate": 9.040968358172787e-06, + "loss": 0.1671, + "step": 2078 + }, + { + "epoch": 0.6736876215165263, + "grad_norm": 0.5652223825454712, + "learning_rate": 9.039937967993734e-06, + "loss": 0.1554, + "step": 2079 + }, + { + "epoch": 0.6740116655865198, + "grad_norm": 0.5279286503791809, + "learning_rate": 9.038907083365382e-06, + "loss": 0.1616, + "step": 2080 + }, + { + "epoch": 0.6743357096565132, + "grad_norm": 0.5427567362785339, + "learning_rate": 9.037875704413904e-06, + "loss": 0.1617, + "step": 2081 + }, + { + "epoch": 0.6746597537265068, + "grad_norm": 0.5197331309318542, + "learning_rate": 9.036843831265528e-06, + "loss": 0.157, + "step": 2082 + }, + { + "epoch": 0.6749837977965003, + "grad_norm": 0.49779677391052246, + "learning_rate": 9.035811464046547e-06, + "loss": 0.1458, + "step": 2083 + }, + { + "epoch": 0.6753078418664938, + "grad_norm": 0.49823495745658875, + "learning_rate": 9.034778602883313e-06, + "loss": 0.1273, + "step": 2084 + }, + { + "epoch": 0.6756318859364874, + "grad_norm": 0.5144118070602417, + "learning_rate": 9.03374524790224e-06, + "loss": 0.1529, + "step": 2085 + }, + { + "epoch": 0.6759559300064809, + "grad_norm": 0.4925682544708252, + "learning_rate": 9.032711399229794e-06, + "loss": 0.1433, + "step": 2086 + }, + { + "epoch": 0.6762799740764744, + "grad_norm": 0.485436350107193, + "learning_rate": 9.031677056992514e-06, + "loss": 0.1453, + "step": 2087 + }, + { + "epoch": 0.6766040181464679, + "grad_norm": 0.5172211527824402, + "learning_rate": 9.030642221316993e-06, + "loss": 0.1654, + "step": 2088 + }, + { + "epoch": 0.6769280622164614, + "grad_norm": 0.5097691416740417, + "learning_rate": 9.029606892329883e-06, + "loss": 0.1562, + "step": 2089 + }, + { + "epoch": 0.6772521062864549, + "grad_norm": 0.49525973200798035, + "learning_rate": 9.028571070157899e-06, + "loss": 0.1505, + "step": 2090 + }, + { + "epoch": 0.6775761503564485, + "grad_norm": 0.5409114360809326, + "learning_rate": 9.027534754927815e-06, + "loss": 0.1643, + "step": 2091 + }, + { + "epoch": 0.677900194426442, + "grad_norm": 0.4941219389438629, + "learning_rate": 9.026497946766468e-06, + "loss": 0.1453, + "step": 2092 + }, + { + "epoch": 0.6782242384964355, + "grad_norm": 0.4984894394874573, + "learning_rate": 9.02546064580075e-06, + "loss": 0.1491, + "step": 2093 + }, + { + "epoch": 0.6785482825664291, + "grad_norm": 0.5567848086357117, + "learning_rate": 9.02442285215762e-06, + "loss": 0.1796, + "step": 2094 + }, + { + "epoch": 0.6788723266364226, + "grad_norm": 0.5158399939537048, + "learning_rate": 9.023384565964093e-06, + "loss": 0.1624, + "step": 2095 + }, + { + "epoch": 0.679196370706416, + "grad_norm": 0.4754045009613037, + "learning_rate": 9.022345787347241e-06, + "loss": 0.1411, + "step": 2096 + }, + { + "epoch": 0.6795204147764096, + "grad_norm": 0.4885212182998657, + "learning_rate": 9.021306516434207e-06, + "loss": 0.1442, + "step": 2097 + }, + { + "epoch": 0.6798444588464031, + "grad_norm": 0.5083634257316589, + "learning_rate": 9.020266753352185e-06, + "loss": 0.1576, + "step": 2098 + }, + { + "epoch": 0.6801685029163966, + "grad_norm": 0.5063057541847229, + "learning_rate": 9.01922649822843e-06, + "loss": 0.1545, + "step": 2099 + }, + { + "epoch": 0.6804925469863902, + "grad_norm": 0.541229784488678, + "learning_rate": 9.018185751190261e-06, + "loss": 0.1634, + "step": 2100 + }, + { + "epoch": 0.6808165910563837, + "grad_norm": 0.49940717220306396, + "learning_rate": 9.017144512365055e-06, + "loss": 0.153, + "step": 2101 + }, + { + "epoch": 0.6811406351263772, + "grad_norm": 0.4865690767765045, + "learning_rate": 9.01610278188025e-06, + "loss": 0.148, + "step": 2102 + }, + { + "epoch": 0.6814646791963707, + "grad_norm": 0.49564990401268005, + "learning_rate": 9.015060559863345e-06, + "loss": 0.1392, + "step": 2103 + }, + { + "epoch": 0.6817887232663642, + "grad_norm": 0.49053755402565, + "learning_rate": 9.014017846441893e-06, + "loss": 0.1494, + "step": 2104 + }, + { + "epoch": 0.6821127673363577, + "grad_norm": 0.5497640371322632, + "learning_rate": 9.012974641743517e-06, + "loss": 0.1617, + "step": 2105 + }, + { + "epoch": 0.6824368114063513, + "grad_norm": 0.5298976898193359, + "learning_rate": 9.011930945895895e-06, + "loss": 0.1529, + "step": 2106 + }, + { + "epoch": 0.6827608554763448, + "grad_norm": 0.4656003713607788, + "learning_rate": 9.010886759026762e-06, + "loss": 0.1398, + "step": 2107 + }, + { + "epoch": 0.6830848995463383, + "grad_norm": 0.5337942838668823, + "learning_rate": 9.009842081263917e-06, + "loss": 0.1531, + "step": 2108 + }, + { + "epoch": 0.6834089436163319, + "grad_norm": 0.5253130793571472, + "learning_rate": 9.008796912735221e-06, + "loss": 0.1582, + "step": 2109 + }, + { + "epoch": 0.6837329876863253, + "grad_norm": 0.49264976382255554, + "learning_rate": 9.00775125356859e-06, + "loss": 0.1501, + "step": 2110 + }, + { + "epoch": 0.6840570317563188, + "grad_norm": 0.49911728501319885, + "learning_rate": 9.006705103892006e-06, + "loss": 0.162, + "step": 2111 + }, + { + "epoch": 0.6843810758263124, + "grad_norm": 0.4753684103488922, + "learning_rate": 9.005658463833503e-06, + "loss": 0.1368, + "step": 2112 + }, + { + "epoch": 0.6847051198963059, + "grad_norm": 0.5304966568946838, + "learning_rate": 9.004611333521183e-06, + "loss": 0.1676, + "step": 2113 + }, + { + "epoch": 0.6850291639662994, + "grad_norm": 0.5524003505706787, + "learning_rate": 9.003563713083203e-06, + "loss": 0.1587, + "step": 2114 + }, + { + "epoch": 0.685353208036293, + "grad_norm": 0.46181124448776245, + "learning_rate": 9.00251560264778e-06, + "loss": 0.139, + "step": 2115 + }, + { + "epoch": 0.6856772521062865, + "grad_norm": 0.5070650577545166, + "learning_rate": 9.001467002343198e-06, + "loss": 0.1448, + "step": 2116 + }, + { + "epoch": 0.68600129617628, + "grad_norm": 0.5460571050643921, + "learning_rate": 9.00041791229779e-06, + "loss": 0.1827, + "step": 2117 + }, + { + "epoch": 0.6863253402462735, + "grad_norm": 0.523775041103363, + "learning_rate": 8.999368332639957e-06, + "loss": 0.1715, + "step": 2118 + }, + { + "epoch": 0.686649384316267, + "grad_norm": 0.48703733086586, + "learning_rate": 8.998318263498158e-06, + "loss": 0.1492, + "step": 2119 + }, + { + "epoch": 0.6869734283862605, + "grad_norm": 0.48289304971694946, + "learning_rate": 8.99726770500091e-06, + "loss": 0.145, + "step": 2120 + }, + { + "epoch": 0.687297472456254, + "grad_norm": 0.5123062133789062, + "learning_rate": 8.99621665727679e-06, + "loss": 0.1537, + "step": 2121 + }, + { + "epoch": 0.6876215165262476, + "grad_norm": 0.5621348023414612, + "learning_rate": 8.995165120454437e-06, + "loss": 0.1464, + "step": 2122 + }, + { + "epoch": 0.6879455605962411, + "grad_norm": 0.5511627793312073, + "learning_rate": 8.994113094662552e-06, + "loss": 0.1579, + "step": 2123 + }, + { + "epoch": 0.6882696046662347, + "grad_norm": 0.5453408360481262, + "learning_rate": 8.99306058002989e-06, + "loss": 0.1646, + "step": 2124 + }, + { + "epoch": 0.6885936487362281, + "grad_norm": 0.5363773703575134, + "learning_rate": 8.992007576685266e-06, + "loss": 0.1623, + "step": 2125 + }, + { + "epoch": 0.6889176928062216, + "grad_norm": 0.5111467242240906, + "learning_rate": 8.990954084757562e-06, + "loss": 0.1526, + "step": 2126 + }, + { + "epoch": 0.6892417368762151, + "grad_norm": 0.5060132145881653, + "learning_rate": 8.989900104375715e-06, + "loss": 0.1478, + "step": 2127 + }, + { + "epoch": 0.6895657809462087, + "grad_norm": 0.49885737895965576, + "learning_rate": 8.988845635668719e-06, + "loss": 0.1405, + "step": 2128 + }, + { + "epoch": 0.6898898250162022, + "grad_norm": 0.5323303937911987, + "learning_rate": 8.98779067876563e-06, + "loss": 0.1532, + "step": 2129 + }, + { + "epoch": 0.6902138690861958, + "grad_norm": 0.4989731013774872, + "learning_rate": 8.98673523379557e-06, + "loss": 0.1426, + "step": 2130 + }, + { + "epoch": 0.6905379131561893, + "grad_norm": 0.480033814907074, + "learning_rate": 8.985679300887711e-06, + "loss": 0.1428, + "step": 2131 + }, + { + "epoch": 0.6908619572261827, + "grad_norm": 0.5180311799049377, + "learning_rate": 8.984622880171289e-06, + "loss": 0.1536, + "step": 2132 + }, + { + "epoch": 0.6911860012961762, + "grad_norm": 0.51394122838974, + "learning_rate": 8.983565971775604e-06, + "loss": 0.1537, + "step": 2133 + }, + { + "epoch": 0.6915100453661698, + "grad_norm": 0.5116256475448608, + "learning_rate": 8.982508575830005e-06, + "loss": 0.1586, + "step": 2134 + }, + { + "epoch": 0.6918340894361633, + "grad_norm": 0.49658048152923584, + "learning_rate": 8.981450692463909e-06, + "loss": 0.1406, + "step": 2135 + }, + { + "epoch": 0.6921581335061568, + "grad_norm": 0.5321874618530273, + "learning_rate": 8.980392321806793e-06, + "loss": 0.1686, + "step": 2136 + }, + { + "epoch": 0.6924821775761504, + "grad_norm": 0.5163205862045288, + "learning_rate": 8.97933346398819e-06, + "loss": 0.1601, + "step": 2137 + }, + { + "epoch": 0.6928062216461439, + "grad_norm": 0.47858354449272156, + "learning_rate": 8.978274119137694e-06, + "loss": 0.149, + "step": 2138 + }, + { + "epoch": 0.6931302657161373, + "grad_norm": 0.4877544045448303, + "learning_rate": 8.97721428738496e-06, + "loss": 0.1445, + "step": 2139 + }, + { + "epoch": 0.6934543097861309, + "grad_norm": 0.5431718826293945, + "learning_rate": 8.976153968859697e-06, + "loss": 0.1671, + "step": 2140 + }, + { + "epoch": 0.6937783538561244, + "grad_norm": 0.48591312766075134, + "learning_rate": 8.975093163691681e-06, + "loss": 0.1495, + "step": 2141 + }, + { + "epoch": 0.6941023979261179, + "grad_norm": 0.5136705040931702, + "learning_rate": 8.974031872010745e-06, + "loss": 0.1417, + "step": 2142 + }, + { + "epoch": 0.6944264419961115, + "grad_norm": 0.547904908657074, + "learning_rate": 8.972970093946777e-06, + "loss": 0.1652, + "step": 2143 + }, + { + "epoch": 0.694750486066105, + "grad_norm": 0.48400211334228516, + "learning_rate": 8.971907829629734e-06, + "loss": 0.1349, + "step": 2144 + }, + { + "epoch": 0.6950745301360985, + "grad_norm": 0.4963843524456024, + "learning_rate": 8.970845079189622e-06, + "loss": 0.1412, + "step": 2145 + }, + { + "epoch": 0.6953985742060921, + "grad_norm": 0.48876792192459106, + "learning_rate": 8.969781842756513e-06, + "loss": 0.1505, + "step": 2146 + }, + { + "epoch": 0.6957226182760855, + "grad_norm": 0.5260959267616272, + "learning_rate": 8.968718120460538e-06, + "loss": 0.1563, + "step": 2147 + }, + { + "epoch": 0.696046662346079, + "grad_norm": 0.48220258951187134, + "learning_rate": 8.967653912431884e-06, + "loss": 0.1319, + "step": 2148 + }, + { + "epoch": 0.6963707064160726, + "grad_norm": 0.4762630760669708, + "learning_rate": 8.9665892188008e-06, + "loss": 0.1476, + "step": 2149 + }, + { + "epoch": 0.6966947504860661, + "grad_norm": 0.48830246925354004, + "learning_rate": 8.965524039697598e-06, + "loss": 0.1458, + "step": 2150 + }, + { + "epoch": 0.6970187945560596, + "grad_norm": 0.5447869896888733, + "learning_rate": 8.96445837525264e-06, + "loss": 0.1609, + "step": 2151 + }, + { + "epoch": 0.6973428386260532, + "grad_norm": 0.4892789423465729, + "learning_rate": 8.963392225596357e-06, + "loss": 0.1536, + "step": 2152 + }, + { + "epoch": 0.6976668826960467, + "grad_norm": 0.502695620059967, + "learning_rate": 8.962325590859236e-06, + "loss": 0.1545, + "step": 2153 + }, + { + "epoch": 0.6979909267660401, + "grad_norm": 0.4724867641925812, + "learning_rate": 8.961258471171818e-06, + "loss": 0.1451, + "step": 2154 + }, + { + "epoch": 0.6983149708360337, + "grad_norm": 0.4843321740627289, + "learning_rate": 8.960190866664713e-06, + "loss": 0.1484, + "step": 2155 + }, + { + "epoch": 0.6986390149060272, + "grad_norm": 0.4993053674697876, + "learning_rate": 8.959122777468583e-06, + "loss": 0.1536, + "step": 2156 + }, + { + "epoch": 0.6989630589760207, + "grad_norm": 0.509663462638855, + "learning_rate": 8.958054203714152e-06, + "loss": 0.1496, + "step": 2157 + }, + { + "epoch": 0.6992871030460143, + "grad_norm": 0.475522518157959, + "learning_rate": 8.956985145532205e-06, + "loss": 0.1418, + "step": 2158 + }, + { + "epoch": 0.6996111471160078, + "grad_norm": 0.523635983467102, + "learning_rate": 8.95591560305358e-06, + "loss": 0.1629, + "step": 2159 + }, + { + "epoch": 0.6999351911860013, + "grad_norm": 0.5369279980659485, + "learning_rate": 8.954845576409184e-06, + "loss": 0.1583, + "step": 2160 + }, + { + "epoch": 0.7002592352559948, + "grad_norm": 0.5319356918334961, + "learning_rate": 8.953775065729972e-06, + "loss": 0.1585, + "step": 2161 + }, + { + "epoch": 0.7005832793259883, + "grad_norm": 0.4694063067436218, + "learning_rate": 8.952704071146972e-06, + "loss": 0.1277, + "step": 2162 + }, + { + "epoch": 0.7009073233959818, + "grad_norm": 0.5389366745948792, + "learning_rate": 8.951632592791255e-06, + "loss": 0.1594, + "step": 2163 + }, + { + "epoch": 0.7012313674659754, + "grad_norm": 0.47011804580688477, + "learning_rate": 8.950560630793965e-06, + "loss": 0.1384, + "step": 2164 + }, + { + "epoch": 0.7015554115359689, + "grad_norm": 0.5274106860160828, + "learning_rate": 8.949488185286297e-06, + "loss": 0.1647, + "step": 2165 + }, + { + "epoch": 0.7018794556059624, + "grad_norm": 0.5034454464912415, + "learning_rate": 8.948415256399512e-06, + "loss": 0.154, + "step": 2166 + }, + { + "epoch": 0.702203499675956, + "grad_norm": 0.47073930501937866, + "learning_rate": 8.94734184426492e-06, + "loss": 0.1442, + "step": 2167 + }, + { + "epoch": 0.7025275437459495, + "grad_norm": 0.4921237826347351, + "learning_rate": 8.946267949013902e-06, + "loss": 0.1436, + "step": 2168 + }, + { + "epoch": 0.7028515878159429, + "grad_norm": 0.5035290122032166, + "learning_rate": 8.945193570777888e-06, + "loss": 0.1452, + "step": 2169 + }, + { + "epoch": 0.7031756318859365, + "grad_norm": 0.5418218374252319, + "learning_rate": 8.944118709688375e-06, + "loss": 0.1778, + "step": 2170 + }, + { + "epoch": 0.70349967595593, + "grad_norm": 0.5310963988304138, + "learning_rate": 8.943043365876913e-06, + "loss": 0.1561, + "step": 2171 + }, + { + "epoch": 0.7038237200259235, + "grad_norm": 0.49203693866729736, + "learning_rate": 8.941967539475115e-06, + "loss": 0.1657, + "step": 2172 + }, + { + "epoch": 0.7041477640959171, + "grad_norm": 0.5172531008720398, + "learning_rate": 8.94089123061465e-06, + "loss": 0.1677, + "step": 2173 + }, + { + "epoch": 0.7044718081659106, + "grad_norm": 0.5177010893821716, + "learning_rate": 8.939814439427251e-06, + "loss": 0.1648, + "step": 2174 + }, + { + "epoch": 0.7047958522359041, + "grad_norm": 0.526005744934082, + "learning_rate": 8.938737166044705e-06, + "loss": 0.1709, + "step": 2175 + }, + { + "epoch": 0.7051198963058976, + "grad_norm": 0.5326618552207947, + "learning_rate": 8.937659410598857e-06, + "loss": 0.1698, + "step": 2176 + }, + { + "epoch": 0.7054439403758911, + "grad_norm": 0.5253227353096008, + "learning_rate": 8.936581173221619e-06, + "loss": 0.1632, + "step": 2177 + }, + { + "epoch": 0.7057679844458846, + "grad_norm": 0.5025566816329956, + "learning_rate": 8.935502454044955e-06, + "loss": 0.1635, + "step": 2178 + }, + { + "epoch": 0.7060920285158782, + "grad_norm": 0.5111781358718872, + "learning_rate": 8.934423253200887e-06, + "loss": 0.1437, + "step": 2179 + }, + { + "epoch": 0.7064160725858717, + "grad_norm": 0.49436041712760925, + "learning_rate": 8.933343570821504e-06, + "loss": 0.1654, + "step": 2180 + }, + { + "epoch": 0.7067401166558652, + "grad_norm": 0.49264398217201233, + "learning_rate": 8.932263407038943e-06, + "loss": 0.1532, + "step": 2181 + }, + { + "epoch": 0.7070641607258588, + "grad_norm": 0.500124454498291, + "learning_rate": 8.931182761985409e-06, + "loss": 0.1501, + "step": 2182 + }, + { + "epoch": 0.7073882047958522, + "grad_norm": 0.5058132410049438, + "learning_rate": 8.930101635793163e-06, + "loss": 0.1554, + "step": 2183 + }, + { + "epoch": 0.7077122488658457, + "grad_norm": 0.5359315276145935, + "learning_rate": 8.929020028594521e-06, + "loss": 0.155, + "step": 2184 + }, + { + "epoch": 0.7080362929358393, + "grad_norm": 0.5466138124465942, + "learning_rate": 8.927937940521865e-06, + "loss": 0.1464, + "step": 2185 + }, + { + "epoch": 0.7083603370058328, + "grad_norm": 0.49145621061325073, + "learning_rate": 8.92685537170763e-06, + "loss": 0.147, + "step": 2186 + }, + { + "epoch": 0.7086843810758263, + "grad_norm": 0.5450732111930847, + "learning_rate": 8.925772322284314e-06, + "loss": 0.1548, + "step": 2187 + }, + { + "epoch": 0.7090084251458199, + "grad_norm": 0.5080904364585876, + "learning_rate": 8.924688792384467e-06, + "loss": 0.1596, + "step": 2188 + }, + { + "epoch": 0.7093324692158134, + "grad_norm": 0.5188769102096558, + "learning_rate": 8.923604782140708e-06, + "loss": 0.1578, + "step": 2189 + }, + { + "epoch": 0.7096565132858069, + "grad_norm": 0.5725817680358887, + "learning_rate": 8.922520291685705e-06, + "loss": 0.1571, + "step": 2190 + }, + { + "epoch": 0.7099805573558003, + "grad_norm": 0.5092363953590393, + "learning_rate": 8.921435321152194e-06, + "loss": 0.148, + "step": 2191 + }, + { + "epoch": 0.7103046014257939, + "grad_norm": 0.5569043755531311, + "learning_rate": 8.92034987067296e-06, + "loss": 0.1619, + "step": 2192 + }, + { + "epoch": 0.7106286454957874, + "grad_norm": 0.512306809425354, + "learning_rate": 8.919263940380855e-06, + "loss": 0.1513, + "step": 2193 + }, + { + "epoch": 0.710952689565781, + "grad_norm": 0.5137192010879517, + "learning_rate": 8.918177530408785e-06, + "loss": 0.1606, + "step": 2194 + }, + { + "epoch": 0.7112767336357745, + "grad_norm": 0.4800979793071747, + "learning_rate": 8.917090640889715e-06, + "loss": 0.1474, + "step": 2195 + }, + { + "epoch": 0.711600777705768, + "grad_norm": 0.5407817959785461, + "learning_rate": 8.91600327195667e-06, + "loss": 0.1655, + "step": 2196 + }, + { + "epoch": 0.7119248217757616, + "grad_norm": 0.49153977632522583, + "learning_rate": 8.914915423742737e-06, + "loss": 0.149, + "step": 2197 + }, + { + "epoch": 0.712248865845755, + "grad_norm": 0.5214946269989014, + "learning_rate": 8.913827096381055e-06, + "loss": 0.1601, + "step": 2198 + }, + { + "epoch": 0.7125729099157485, + "grad_norm": 0.4982912242412567, + "learning_rate": 8.912738290004824e-06, + "loss": 0.1451, + "step": 2199 + }, + { + "epoch": 0.712896953985742, + "grad_norm": 0.49095019698143005, + "learning_rate": 8.911649004747307e-06, + "loss": 0.143, + "step": 2200 + }, + { + "epoch": 0.7132209980557356, + "grad_norm": 0.553778350353241, + "learning_rate": 8.910559240741816e-06, + "loss": 0.142, + "step": 2201 + }, + { + "epoch": 0.7135450421257291, + "grad_norm": 0.4920446276664734, + "learning_rate": 8.909468998121733e-06, + "loss": 0.152, + "step": 2202 + }, + { + "epoch": 0.7138690861957226, + "grad_norm": 0.4924328029155731, + "learning_rate": 8.908378277020491e-06, + "loss": 0.1551, + "step": 2203 + }, + { + "epoch": 0.7141931302657162, + "grad_norm": 0.5034648180007935, + "learning_rate": 8.907287077571585e-06, + "loss": 0.1514, + "step": 2204 + }, + { + "epoch": 0.7145171743357096, + "grad_norm": 0.5196908116340637, + "learning_rate": 8.906195399908563e-06, + "loss": 0.1599, + "step": 2205 + }, + { + "epoch": 0.7148412184057031, + "grad_norm": 0.5454964637756348, + "learning_rate": 8.905103244165044e-06, + "loss": 0.1681, + "step": 2206 + }, + { + "epoch": 0.7151652624756967, + "grad_norm": 0.5191643238067627, + "learning_rate": 8.904010610474687e-06, + "loss": 0.1444, + "step": 2207 + }, + { + "epoch": 0.7154893065456902, + "grad_norm": 0.5437571406364441, + "learning_rate": 8.902917498971228e-06, + "loss": 0.1583, + "step": 2208 + }, + { + "epoch": 0.7158133506156837, + "grad_norm": 0.5087147951126099, + "learning_rate": 8.901823909788449e-06, + "loss": 0.1481, + "step": 2209 + }, + { + "epoch": 0.7161373946856773, + "grad_norm": 0.528440535068512, + "learning_rate": 8.900729843060199e-06, + "loss": 0.1535, + "step": 2210 + }, + { + "epoch": 0.7164614387556708, + "grad_norm": 0.48496097326278687, + "learning_rate": 8.899635298920374e-06, + "loss": 0.1526, + "step": 2211 + }, + { + "epoch": 0.7167854828256643, + "grad_norm": 0.4830402731895447, + "learning_rate": 8.898540277502943e-06, + "loss": 0.1442, + "step": 2212 + }, + { + "epoch": 0.7171095268956578, + "grad_norm": 0.507747232913971, + "learning_rate": 8.897444778941921e-06, + "loss": 0.1426, + "step": 2213 + }, + { + "epoch": 0.7174335709656513, + "grad_norm": 0.4960160255432129, + "learning_rate": 8.896348803371388e-06, + "loss": 0.1377, + "step": 2214 + }, + { + "epoch": 0.7177576150356448, + "grad_norm": 0.5086054801940918, + "learning_rate": 8.895252350925482e-06, + "loss": 0.1478, + "step": 2215 + }, + { + "epoch": 0.7180816591056384, + "grad_norm": 0.5340855121612549, + "learning_rate": 8.894155421738398e-06, + "loss": 0.1491, + "step": 2216 + }, + { + "epoch": 0.7184057031756319, + "grad_norm": 0.5328924059867859, + "learning_rate": 8.893058015944387e-06, + "loss": 0.1587, + "step": 2217 + }, + { + "epoch": 0.7187297472456254, + "grad_norm": 0.5267184972763062, + "learning_rate": 8.891960133677763e-06, + "loss": 0.1555, + "step": 2218 + }, + { + "epoch": 0.719053791315619, + "grad_norm": 0.48961344361305237, + "learning_rate": 8.890861775072897e-06, + "loss": 0.1493, + "step": 2219 + }, + { + "epoch": 0.7193778353856124, + "grad_norm": 0.5472536683082581, + "learning_rate": 8.889762940264216e-06, + "loss": 0.1722, + "step": 2220 + }, + { + "epoch": 0.7197018794556059, + "grad_norm": 0.49720636010169983, + "learning_rate": 8.888663629386206e-06, + "loss": 0.1447, + "step": 2221 + }, + { + "epoch": 0.7200259235255995, + "grad_norm": 0.5057947039604187, + "learning_rate": 8.887563842573412e-06, + "loss": 0.1484, + "step": 2222 + }, + { + "epoch": 0.720349967595593, + "grad_norm": 0.4652838110923767, + "learning_rate": 8.886463579960441e-06, + "loss": 0.1433, + "step": 2223 + }, + { + "epoch": 0.7206740116655865, + "grad_norm": 0.5056071281433105, + "learning_rate": 8.885362841681948e-06, + "loss": 0.1527, + "step": 2224 + }, + { + "epoch": 0.7209980557355801, + "grad_norm": 0.4978449046611786, + "learning_rate": 8.88426162787266e-06, + "loss": 0.1537, + "step": 2225 + }, + { + "epoch": 0.7213220998055736, + "grad_norm": 0.46882349252700806, + "learning_rate": 8.88315993866735e-06, + "loss": 0.1304, + "step": 2226 + }, + { + "epoch": 0.721646143875567, + "grad_norm": 0.48129644989967346, + "learning_rate": 8.882057774200855e-06, + "loss": 0.1487, + "step": 2227 + }, + { + "epoch": 0.7219701879455606, + "grad_norm": 0.5160327553749084, + "learning_rate": 8.880955134608069e-06, + "loss": 0.1689, + "step": 2228 + }, + { + "epoch": 0.7222942320155541, + "grad_norm": 0.5019292831420898, + "learning_rate": 8.879852020023945e-06, + "loss": 0.155, + "step": 2229 + }, + { + "epoch": 0.7226182760855476, + "grad_norm": 0.5016565322875977, + "learning_rate": 8.878748430583496e-06, + "loss": 0.155, + "step": 2230 + }, + { + "epoch": 0.7229423201555412, + "grad_norm": 0.48340004682540894, + "learning_rate": 8.877644366421787e-06, + "loss": 0.1504, + "step": 2231 + }, + { + "epoch": 0.7232663642255347, + "grad_norm": 0.44818857312202454, + "learning_rate": 8.876539827673944e-06, + "loss": 0.1338, + "step": 2232 + }, + { + "epoch": 0.7235904082955282, + "grad_norm": 0.48734670877456665, + "learning_rate": 8.875434814475157e-06, + "loss": 0.1517, + "step": 2233 + }, + { + "epoch": 0.7239144523655218, + "grad_norm": 0.4977285861968994, + "learning_rate": 8.874329326960664e-06, + "loss": 0.1512, + "step": 2234 + }, + { + "epoch": 0.7242384964355152, + "grad_norm": 0.5024043321609497, + "learning_rate": 8.873223365265768e-06, + "loss": 0.1488, + "step": 2235 + }, + { + "epoch": 0.7245625405055087, + "grad_norm": 0.4885674715042114, + "learning_rate": 8.87211692952583e-06, + "loss": 0.1509, + "step": 2236 + }, + { + "epoch": 0.7248865845755023, + "grad_norm": 0.49454137682914734, + "learning_rate": 8.871010019876263e-06, + "loss": 0.1527, + "step": 2237 + }, + { + "epoch": 0.7252106286454958, + "grad_norm": 0.5014051198959351, + "learning_rate": 8.869902636452544e-06, + "loss": 0.1506, + "step": 2238 + }, + { + "epoch": 0.7255346727154893, + "grad_norm": 0.4825291037559509, + "learning_rate": 8.868794779390205e-06, + "loss": 0.1364, + "step": 2239 + }, + { + "epoch": 0.7258587167854829, + "grad_norm": 0.5182274580001831, + "learning_rate": 8.867686448824839e-06, + "loss": 0.1591, + "step": 2240 + }, + { + "epoch": 0.7261827608554764, + "grad_norm": 0.4794568121433258, + "learning_rate": 8.866577644892093e-06, + "loss": 0.1401, + "step": 2241 + }, + { + "epoch": 0.7265068049254698, + "grad_norm": 0.49534085392951965, + "learning_rate": 8.865468367727674e-06, + "loss": 0.1454, + "step": 2242 + }, + { + "epoch": 0.7268308489954634, + "grad_norm": 0.5006011128425598, + "learning_rate": 8.864358617467348e-06, + "loss": 0.1501, + "step": 2243 + }, + { + "epoch": 0.7271548930654569, + "grad_norm": 0.48030370473861694, + "learning_rate": 8.86324839424694e-06, + "loss": 0.1474, + "step": 2244 + }, + { + "epoch": 0.7274789371354504, + "grad_norm": 0.49514061212539673, + "learning_rate": 8.862137698202324e-06, + "loss": 0.1553, + "step": 2245 + }, + { + "epoch": 0.727802981205444, + "grad_norm": 0.48254290223121643, + "learning_rate": 8.861026529469443e-06, + "loss": 0.1482, + "step": 2246 + }, + { + "epoch": 0.7281270252754375, + "grad_norm": 0.5079731941223145, + "learning_rate": 8.859914888184293e-06, + "loss": 0.1585, + "step": 2247 + }, + { + "epoch": 0.728451069345431, + "grad_norm": 0.5570839047431946, + "learning_rate": 8.858802774482928e-06, + "loss": 0.1881, + "step": 2248 + }, + { + "epoch": 0.7287751134154244, + "grad_norm": 0.5489197969436646, + "learning_rate": 8.857690188501457e-06, + "loss": 0.1667, + "step": 2249 + }, + { + "epoch": 0.729099157485418, + "grad_norm": 0.5182020664215088, + "learning_rate": 8.856577130376056e-06, + "loss": 0.1596, + "step": 2250 + }, + { + "epoch": 0.7294232015554115, + "grad_norm": 0.47524747252464294, + "learning_rate": 8.855463600242946e-06, + "loss": 0.1479, + "step": 2251 + }, + { + "epoch": 0.729747245625405, + "grad_norm": 0.4884457588195801, + "learning_rate": 8.854349598238417e-06, + "loss": 0.1458, + "step": 2252 + }, + { + "epoch": 0.7300712896953986, + "grad_norm": 0.5070611834526062, + "learning_rate": 8.85323512449881e-06, + "loss": 0.1521, + "step": 2253 + }, + { + "epoch": 0.7303953337653921, + "grad_norm": 0.4800879955291748, + "learning_rate": 8.852120179160524e-06, + "loss": 0.1525, + "step": 2254 + }, + { + "epoch": 0.7307193778353857, + "grad_norm": 0.5337240695953369, + "learning_rate": 8.85100476236002e-06, + "loss": 0.1596, + "step": 2255 + }, + { + "epoch": 0.7310434219053791, + "grad_norm": 0.4707546532154083, + "learning_rate": 8.849888874233815e-06, + "loss": 0.1422, + "step": 2256 + }, + { + "epoch": 0.7313674659753726, + "grad_norm": 0.5401502847671509, + "learning_rate": 8.848772514918482e-06, + "loss": 0.1561, + "step": 2257 + }, + { + "epoch": 0.7316915100453661, + "grad_norm": 0.5288102626800537, + "learning_rate": 8.84765568455065e-06, + "loss": 0.1634, + "step": 2258 + }, + { + "epoch": 0.7320155541153597, + "grad_norm": 0.5149606466293335, + "learning_rate": 8.846538383267011e-06, + "loss": 0.1564, + "step": 2259 + }, + { + "epoch": 0.7323395981853532, + "grad_norm": 0.4856564402580261, + "learning_rate": 8.845420611204312e-06, + "loss": 0.134, + "step": 2260 + }, + { + "epoch": 0.7326636422553467, + "grad_norm": 0.5287962555885315, + "learning_rate": 8.844302368499358e-06, + "loss": 0.1557, + "step": 2261 + }, + { + "epoch": 0.7329876863253403, + "grad_norm": 0.5078102350234985, + "learning_rate": 8.843183655289007e-06, + "loss": 0.1584, + "step": 2262 + }, + { + "epoch": 0.7333117303953338, + "grad_norm": 0.4822060465812683, + "learning_rate": 8.842064471710183e-06, + "loss": 0.1495, + "step": 2263 + }, + { + "epoch": 0.7336357744653272, + "grad_norm": 0.5514498949050903, + "learning_rate": 8.840944817899861e-06, + "loss": 0.1614, + "step": 2264 + }, + { + "epoch": 0.7339598185353208, + "grad_norm": 0.4666975140571594, + "learning_rate": 8.839824693995078e-06, + "loss": 0.1352, + "step": 2265 + }, + { + "epoch": 0.7342838626053143, + "grad_norm": 0.44550567865371704, + "learning_rate": 8.838704100132925e-06, + "loss": 0.1288, + "step": 2266 + }, + { + "epoch": 0.7346079066753078, + "grad_norm": 0.5307706594467163, + "learning_rate": 8.83758303645055e-06, + "loss": 0.1632, + "step": 2267 + }, + { + "epoch": 0.7349319507453014, + "grad_norm": 0.4947357177734375, + "learning_rate": 8.836461503085162e-06, + "loss": 0.1442, + "step": 2268 + }, + { + "epoch": 0.7352559948152949, + "grad_norm": 0.5394967198371887, + "learning_rate": 8.835339500174028e-06, + "loss": 0.1594, + "step": 2269 + }, + { + "epoch": 0.7355800388852884, + "grad_norm": 0.49313005805015564, + "learning_rate": 8.834217027854466e-06, + "loss": 0.141, + "step": 2270 + }, + { + "epoch": 0.7359040829552819, + "grad_norm": 0.5317650437355042, + "learning_rate": 8.833094086263859e-06, + "loss": 0.1328, + "step": 2271 + }, + { + "epoch": 0.7362281270252754, + "grad_norm": 0.46917349100112915, + "learning_rate": 8.83197067553964e-06, + "loss": 0.1365, + "step": 2272 + }, + { + "epoch": 0.7365521710952689, + "grad_norm": 0.5269829034805298, + "learning_rate": 8.83084679581931e-06, + "loss": 0.1634, + "step": 2273 + }, + { + "epoch": 0.7368762151652625, + "grad_norm": 0.4746880531311035, + "learning_rate": 8.829722447240418e-06, + "loss": 0.1504, + "step": 2274 + }, + { + "epoch": 0.737200259235256, + "grad_norm": 0.4481014013290405, + "learning_rate": 8.828597629940572e-06, + "loss": 0.1288, + "step": 2275 + }, + { + "epoch": 0.7375243033052495, + "grad_norm": 0.47481676936149597, + "learning_rate": 8.82747234405744e-06, + "loss": 0.1519, + "step": 2276 + }, + { + "epoch": 0.7378483473752431, + "grad_norm": 0.4851224422454834, + "learning_rate": 8.826346589728746e-06, + "loss": 0.145, + "step": 2277 + }, + { + "epoch": 0.7381723914452365, + "grad_norm": 0.5078859925270081, + "learning_rate": 8.82522036709227e-06, + "loss": 0.1546, + "step": 2278 + }, + { + "epoch": 0.73849643551523, + "grad_norm": 0.48216864466667175, + "learning_rate": 8.824093676285854e-06, + "loss": 0.1415, + "step": 2279 + }, + { + "epoch": 0.7388204795852236, + "grad_norm": 0.47693872451782227, + "learning_rate": 8.82296651744739e-06, + "loss": 0.1472, + "step": 2280 + }, + { + "epoch": 0.7391445236552171, + "grad_norm": 0.5153552889823914, + "learning_rate": 8.821838890714836e-06, + "loss": 0.1522, + "step": 2281 + }, + { + "epoch": 0.7394685677252106, + "grad_norm": 0.5131588578224182, + "learning_rate": 8.820710796226197e-06, + "loss": 0.1529, + "step": 2282 + }, + { + "epoch": 0.7397926117952042, + "grad_norm": 0.5055592656135559, + "learning_rate": 8.819582234119546e-06, + "loss": 0.1429, + "step": 2283 + }, + { + "epoch": 0.7401166558651977, + "grad_norm": 0.5131552815437317, + "learning_rate": 8.818453204533005e-06, + "loss": 0.145, + "step": 2284 + }, + { + "epoch": 0.7404406999351912, + "grad_norm": 0.5160530209541321, + "learning_rate": 8.817323707604759e-06, + "loss": 0.1559, + "step": 2285 + }, + { + "epoch": 0.7407647440051847, + "grad_norm": 0.4976802170276642, + "learning_rate": 8.816193743473044e-06, + "loss": 0.1552, + "step": 2286 + }, + { + "epoch": 0.7410887880751782, + "grad_norm": 0.5457192063331604, + "learning_rate": 8.815063312276159e-06, + "loss": 0.1698, + "step": 2287 + }, + { + "epoch": 0.7414128321451717, + "grad_norm": 0.5098389983177185, + "learning_rate": 8.813932414152458e-06, + "loss": 0.1533, + "step": 2288 + }, + { + "epoch": 0.7417368762151653, + "grad_norm": 0.4771139621734619, + "learning_rate": 8.812801049240349e-06, + "loss": 0.1444, + "step": 2289 + }, + { + "epoch": 0.7420609202851588, + "grad_norm": 0.4948260486125946, + "learning_rate": 8.811669217678303e-06, + "loss": 0.1397, + "step": 2290 + }, + { + "epoch": 0.7423849643551523, + "grad_norm": 0.4971177577972412, + "learning_rate": 8.810536919604846e-06, + "loss": 0.1507, + "step": 2291 + }, + { + "epoch": 0.7427090084251459, + "grad_norm": 0.5016118884086609, + "learning_rate": 8.809404155158558e-06, + "loss": 0.1555, + "step": 2292 + }, + { + "epoch": 0.7430330524951393, + "grad_norm": 0.5108845233917236, + "learning_rate": 8.808270924478079e-06, + "loss": 0.1719, + "step": 2293 + }, + { + "epoch": 0.7433570965651328, + "grad_norm": 0.46601563692092896, + "learning_rate": 8.807137227702108e-06, + "loss": 0.1384, + "step": 2294 + }, + { + "epoch": 0.7436811406351264, + "grad_norm": 0.5223451256752014, + "learning_rate": 8.806003064969394e-06, + "loss": 0.1611, + "step": 2295 + }, + { + "epoch": 0.7440051847051199, + "grad_norm": 0.5406107306480408, + "learning_rate": 8.804868436418749e-06, + "loss": 0.1557, + "step": 2296 + }, + { + "epoch": 0.7443292287751134, + "grad_norm": 0.49755993485450745, + "learning_rate": 8.803733342189044e-06, + "loss": 0.1603, + "step": 2297 + }, + { + "epoch": 0.744653272845107, + "grad_norm": 0.474341481924057, + "learning_rate": 8.8025977824192e-06, + "loss": 0.1454, + "step": 2298 + }, + { + "epoch": 0.7449773169151005, + "grad_norm": 0.47189074754714966, + "learning_rate": 8.8014617572482e-06, + "loss": 0.1355, + "step": 2299 + }, + { + "epoch": 0.7453013609850939, + "grad_norm": 0.5056459307670593, + "learning_rate": 8.80032526681508e-06, + "loss": 0.1431, + "step": 2300 + }, + { + "epoch": 0.7456254050550875, + "grad_norm": 0.5132352113723755, + "learning_rate": 8.799188311258939e-06, + "loss": 0.138, + "step": 2301 + }, + { + "epoch": 0.745949449125081, + "grad_norm": 0.5066553950309753, + "learning_rate": 8.798050890718927e-06, + "loss": 0.1431, + "step": 2302 + }, + { + "epoch": 0.7462734931950745, + "grad_norm": 0.5290509462356567, + "learning_rate": 8.796913005334254e-06, + "loss": 0.1438, + "step": 2303 + }, + { + "epoch": 0.7465975372650681, + "grad_norm": 0.512313187122345, + "learning_rate": 8.795774655244187e-06, + "loss": 0.1566, + "step": 2304 + }, + { + "epoch": 0.7469215813350616, + "grad_norm": 0.5207310914993286, + "learning_rate": 8.794635840588046e-06, + "loss": 0.1627, + "step": 2305 + }, + { + "epoch": 0.7472456254050551, + "grad_norm": 0.5040437579154968, + "learning_rate": 8.793496561505216e-06, + "loss": 0.1516, + "step": 2306 + }, + { + "epoch": 0.7475696694750487, + "grad_norm": 0.5168375372886658, + "learning_rate": 8.792356818135128e-06, + "loss": 0.1576, + "step": 2307 + }, + { + "epoch": 0.7478937135450421, + "grad_norm": 0.5095380544662476, + "learning_rate": 8.791216610617278e-06, + "loss": 0.1626, + "step": 2308 + }, + { + "epoch": 0.7482177576150356, + "grad_norm": 0.5392544865608215, + "learning_rate": 8.790075939091218e-06, + "loss": 0.1668, + "step": 2309 + }, + { + "epoch": 0.7485418016850292, + "grad_norm": 0.5280436873435974, + "learning_rate": 8.788934803696554e-06, + "loss": 0.1626, + "step": 2310 + }, + { + "epoch": 0.7488658457550227, + "grad_norm": 0.4907281696796417, + "learning_rate": 8.78779320457295e-06, + "loss": 0.1477, + "step": 2311 + }, + { + "epoch": 0.7491898898250162, + "grad_norm": 0.4908786714076996, + "learning_rate": 8.786651141860127e-06, + "loss": 0.1539, + "step": 2312 + }, + { + "epoch": 0.7495139338950098, + "grad_norm": 0.528580904006958, + "learning_rate": 8.785508615697859e-06, + "loss": 0.1633, + "step": 2313 + }, + { + "epoch": 0.7498379779650033, + "grad_norm": 0.5029335618019104, + "learning_rate": 8.784365626225986e-06, + "loss": 0.1623, + "step": 2314 + }, + { + "epoch": 0.7501620220349967, + "grad_norm": 0.48301079869270325, + "learning_rate": 8.783222173584396e-06, + "loss": 0.1505, + "step": 2315 + }, + { + "epoch": 0.7504860661049902, + "grad_norm": 0.5065808892250061, + "learning_rate": 8.782078257913033e-06, + "loss": 0.1486, + "step": 2316 + }, + { + "epoch": 0.7508101101749838, + "grad_norm": 0.5093285441398621, + "learning_rate": 8.780933879351907e-06, + "loss": 0.1541, + "step": 2317 + }, + { + "epoch": 0.7511341542449773, + "grad_norm": 0.5059759616851807, + "learning_rate": 8.779789038041078e-06, + "loss": 0.1527, + "step": 2318 + }, + { + "epoch": 0.7514581983149708, + "grad_norm": 0.44241490960121155, + "learning_rate": 8.77864373412066e-06, + "loss": 0.1367, + "step": 2319 + }, + { + "epoch": 0.7517822423849644, + "grad_norm": 0.5083364844322205, + "learning_rate": 8.777497967730828e-06, + "loss": 0.1475, + "step": 2320 + }, + { + "epoch": 0.7521062864549579, + "grad_norm": 0.5217186212539673, + "learning_rate": 8.776351739011815e-06, + "loss": 0.1617, + "step": 2321 + }, + { + "epoch": 0.7524303305249513, + "grad_norm": 0.5198972821235657, + "learning_rate": 8.775205048103908e-06, + "loss": 0.1489, + "step": 2322 + }, + { + "epoch": 0.7527543745949449, + "grad_norm": 0.5171259641647339, + "learning_rate": 8.774057895147448e-06, + "loss": 0.1397, + "step": 2323 + }, + { + "epoch": 0.7530784186649384, + "grad_norm": 0.4689252972602844, + "learning_rate": 8.772910280282839e-06, + "loss": 0.128, + "step": 2324 + }, + { + "epoch": 0.7534024627349319, + "grad_norm": 0.5266039371490479, + "learning_rate": 8.771762203650536e-06, + "loss": 0.1685, + "step": 2325 + }, + { + "epoch": 0.7537265068049255, + "grad_norm": 0.5220149755477905, + "learning_rate": 8.770613665391053e-06, + "loss": 0.1588, + "step": 2326 + }, + { + "epoch": 0.754050550874919, + "grad_norm": 0.48700717091560364, + "learning_rate": 8.769464665644958e-06, + "loss": 0.1444, + "step": 2327 + }, + { + "epoch": 0.7543745949449125, + "grad_norm": 0.48634734749794006, + "learning_rate": 8.76831520455288e-06, + "loss": 0.1448, + "step": 2328 + }, + { + "epoch": 0.7546986390149061, + "grad_norm": 0.4782525599002838, + "learning_rate": 8.7671652822555e-06, + "loss": 0.1524, + "step": 2329 + }, + { + "epoch": 0.7550226830848995, + "grad_norm": 0.48710310459136963, + "learning_rate": 8.766014898893563e-06, + "loss": 0.1464, + "step": 2330 + }, + { + "epoch": 0.755346727154893, + "grad_norm": 0.5369205474853516, + "learning_rate": 8.764864054607856e-06, + "loss": 0.1651, + "step": 2331 + }, + { + "epoch": 0.7556707712248866, + "grad_norm": 0.4831956624984741, + "learning_rate": 8.763712749539235e-06, + "loss": 0.1392, + "step": 2332 + }, + { + "epoch": 0.7559948152948801, + "grad_norm": 0.4816092550754547, + "learning_rate": 8.762560983828611e-06, + "loss": 0.1573, + "step": 2333 + }, + { + "epoch": 0.7563188593648736, + "grad_norm": 0.513157308101654, + "learning_rate": 8.761408757616947e-06, + "loss": 0.1445, + "step": 2334 + }, + { + "epoch": 0.7566429034348672, + "grad_norm": 0.5071210265159607, + "learning_rate": 8.760256071045264e-06, + "loss": 0.1665, + "step": 2335 + }, + { + "epoch": 0.7569669475048607, + "grad_norm": 0.5065087080001831, + "learning_rate": 8.759102924254638e-06, + "loss": 0.1595, + "step": 2336 + }, + { + "epoch": 0.7572909915748541, + "grad_norm": 0.5367597937583923, + "learning_rate": 8.757949317386207e-06, + "loss": 0.1817, + "step": 2337 + }, + { + "epoch": 0.7576150356448477, + "grad_norm": 0.5146887302398682, + "learning_rate": 8.75679525058116e-06, + "loss": 0.1625, + "step": 2338 + }, + { + "epoch": 0.7579390797148412, + "grad_norm": 0.4877246916294098, + "learning_rate": 8.755640723980743e-06, + "loss": 0.1344, + "step": 2339 + }, + { + "epoch": 0.7582631237848347, + "grad_norm": 0.5225220322608948, + "learning_rate": 8.754485737726257e-06, + "loss": 0.1629, + "step": 2340 + }, + { + "epoch": 0.7585871678548283, + "grad_norm": 0.49157649278640747, + "learning_rate": 8.753330291959064e-06, + "loss": 0.1424, + "step": 2341 + }, + { + "epoch": 0.7589112119248218, + "grad_norm": 0.48859721422195435, + "learning_rate": 8.752174386820578e-06, + "loss": 0.1458, + "step": 2342 + }, + { + "epoch": 0.7592352559948153, + "grad_norm": 0.5182003974914551, + "learning_rate": 8.75101802245227e-06, + "loss": 0.1606, + "step": 2343 + }, + { + "epoch": 0.7595593000648088, + "grad_norm": 0.5214172601699829, + "learning_rate": 8.74986119899567e-06, + "loss": 0.1393, + "step": 2344 + }, + { + "epoch": 0.7598833441348023, + "grad_norm": 0.5092251896858215, + "learning_rate": 8.74870391659236e-06, + "loss": 0.1473, + "step": 2345 + }, + { + "epoch": 0.7602073882047958, + "grad_norm": 0.5081760287284851, + "learning_rate": 8.747546175383984e-06, + "loss": 0.1439, + "step": 2346 + }, + { + "epoch": 0.7605314322747894, + "grad_norm": 0.5062338709831238, + "learning_rate": 8.746387975512232e-06, + "loss": 0.1515, + "step": 2347 + }, + { + "epoch": 0.7608554763447829, + "grad_norm": 0.47075754404067993, + "learning_rate": 8.745229317118859e-06, + "loss": 0.1474, + "step": 2348 + }, + { + "epoch": 0.7611795204147764, + "grad_norm": 0.4888347387313843, + "learning_rate": 8.744070200345675e-06, + "loss": 0.1551, + "step": 2349 + }, + { + "epoch": 0.76150356448477, + "grad_norm": 0.50382399559021, + "learning_rate": 8.742910625334545e-06, + "loss": 0.1563, + "step": 2350 + }, + { + "epoch": 0.7618276085547635, + "grad_norm": 0.5092306137084961, + "learning_rate": 8.741750592227388e-06, + "loss": 0.1601, + "step": 2351 + }, + { + "epoch": 0.7621516526247569, + "grad_norm": 0.5268076062202454, + "learning_rate": 8.740590101166181e-06, + "loss": 0.1753, + "step": 2352 + }, + { + "epoch": 0.7624756966947505, + "grad_norm": 0.4805704355239868, + "learning_rate": 8.739429152292957e-06, + "loss": 0.1489, + "step": 2353 + }, + { + "epoch": 0.762799740764744, + "grad_norm": 0.5004841089248657, + "learning_rate": 8.738267745749806e-06, + "loss": 0.1554, + "step": 2354 + }, + { + "epoch": 0.7631237848347375, + "grad_norm": 0.5143179297447205, + "learning_rate": 8.737105881678872e-06, + "loss": 0.1598, + "step": 2355 + }, + { + "epoch": 0.7634478289047311, + "grad_norm": 0.49346303939819336, + "learning_rate": 8.735943560222358e-06, + "loss": 0.1676, + "step": 2356 + }, + { + "epoch": 0.7637718729747246, + "grad_norm": 0.4914379119873047, + "learning_rate": 8.734780781522516e-06, + "loss": 0.1452, + "step": 2357 + }, + { + "epoch": 0.7640959170447181, + "grad_norm": 0.46416085958480835, + "learning_rate": 8.733617545721663e-06, + "loss": 0.1283, + "step": 2358 + }, + { + "epoch": 0.7644199611147116, + "grad_norm": 0.487813264131546, + "learning_rate": 8.732453852962166e-06, + "loss": 0.144, + "step": 2359 + }, + { + "epoch": 0.7647440051847051, + "grad_norm": 0.5519083142280579, + "learning_rate": 8.731289703386451e-06, + "loss": 0.1632, + "step": 2360 + }, + { + "epoch": 0.7650680492546986, + "grad_norm": 0.5048906207084656, + "learning_rate": 8.730125097137e-06, + "loss": 0.1572, + "step": 2361 + }, + { + "epoch": 0.7653920933246922, + "grad_norm": 0.4748183488845825, + "learning_rate": 8.728960034356344e-06, + "loss": 0.1479, + "step": 2362 + }, + { + "epoch": 0.7657161373946857, + "grad_norm": 0.4857644736766815, + "learning_rate": 8.72779451518708e-06, + "loss": 0.1481, + "step": 2363 + }, + { + "epoch": 0.7660401814646792, + "grad_norm": 0.5448926687240601, + "learning_rate": 8.726628539771856e-06, + "loss": 0.1704, + "step": 2364 + }, + { + "epoch": 0.7663642255346728, + "grad_norm": 0.4890585243701935, + "learning_rate": 8.725462108253375e-06, + "loss": 0.1543, + "step": 2365 + }, + { + "epoch": 0.7666882696046662, + "grad_norm": 0.5503400564193726, + "learning_rate": 8.724295220774396e-06, + "loss": 0.1727, + "step": 2366 + }, + { + "epoch": 0.7670123136746597, + "grad_norm": 0.4946596026420593, + "learning_rate": 8.723127877477737e-06, + "loss": 0.1538, + "step": 2367 + }, + { + "epoch": 0.7673363577446533, + "grad_norm": 0.49211350083351135, + "learning_rate": 8.721960078506269e-06, + "loss": 0.1505, + "step": 2368 + }, + { + "epoch": 0.7676604018146468, + "grad_norm": 0.5130065083503723, + "learning_rate": 8.72079182400292e-06, + "loss": 0.1631, + "step": 2369 + }, + { + "epoch": 0.7679844458846403, + "grad_norm": 0.5098603367805481, + "learning_rate": 8.71962311411067e-06, + "loss": 0.168, + "step": 2370 + }, + { + "epoch": 0.7683084899546339, + "grad_norm": 0.4712441861629486, + "learning_rate": 8.718453948972559e-06, + "loss": 0.1397, + "step": 2371 + }, + { + "epoch": 0.7686325340246274, + "grad_norm": 0.4978718161582947, + "learning_rate": 8.717284328731681e-06, + "loss": 0.1603, + "step": 2372 + }, + { + "epoch": 0.7689565780946209, + "grad_norm": 0.48081910610198975, + "learning_rate": 8.716114253531189e-06, + "loss": 0.1427, + "step": 2373 + }, + { + "epoch": 0.7692806221646143, + "grad_norm": 0.4605656564235687, + "learning_rate": 8.714943723514288e-06, + "loss": 0.1457, + "step": 2374 + }, + { + "epoch": 0.7696046662346079, + "grad_norm": 0.5563753843307495, + "learning_rate": 8.713772738824237e-06, + "loss": 0.1564, + "step": 2375 + }, + { + "epoch": 0.7699287103046014, + "grad_norm": 0.48092120885849, + "learning_rate": 8.712601299604355e-06, + "loss": 0.1508, + "step": 2376 + }, + { + "epoch": 0.770252754374595, + "grad_norm": 0.4575970768928528, + "learning_rate": 8.711429405998017e-06, + "loss": 0.1459, + "step": 2377 + }, + { + "epoch": 0.7705767984445885, + "grad_norm": 0.48775961995124817, + "learning_rate": 8.710257058148647e-06, + "loss": 0.1477, + "step": 2378 + }, + { + "epoch": 0.770900842514582, + "grad_norm": 0.4882473051548004, + "learning_rate": 8.709084256199732e-06, + "loss": 0.1406, + "step": 2379 + }, + { + "epoch": 0.7712248865845756, + "grad_norm": 0.5059452056884766, + "learning_rate": 8.70791100029481e-06, + "loss": 0.1517, + "step": 2380 + }, + { + "epoch": 0.771548930654569, + "grad_norm": 0.512593686580658, + "learning_rate": 8.706737290577475e-06, + "loss": 0.159, + "step": 2381 + }, + { + "epoch": 0.7718729747245625, + "grad_norm": 0.4948345124721527, + "learning_rate": 8.705563127191383e-06, + "loss": 0.1495, + "step": 2382 + }, + { + "epoch": 0.772197018794556, + "grad_norm": 0.45760348439216614, + "learning_rate": 8.704388510280237e-06, + "loss": 0.1522, + "step": 2383 + }, + { + "epoch": 0.7725210628645496, + "grad_norm": 0.5127160549163818, + "learning_rate": 8.703213439987797e-06, + "loss": 0.1614, + "step": 2384 + }, + { + "epoch": 0.7728451069345431, + "grad_norm": 0.46613267064094543, + "learning_rate": 8.702037916457882e-06, + "loss": 0.1437, + "step": 2385 + }, + { + "epoch": 0.7731691510045366, + "grad_norm": 0.4914171099662781, + "learning_rate": 8.700861939834365e-06, + "loss": 0.1671, + "step": 2386 + }, + { + "epoch": 0.7734931950745302, + "grad_norm": 0.508822500705719, + "learning_rate": 8.699685510261173e-06, + "loss": 0.1543, + "step": 2387 + }, + { + "epoch": 0.7738172391445236, + "grad_norm": 0.5024411678314209, + "learning_rate": 8.698508627882291e-06, + "loss": 0.1643, + "step": 2388 + }, + { + "epoch": 0.7741412832145171, + "grad_norm": 0.5050293803215027, + "learning_rate": 8.697331292841757e-06, + "loss": 0.1676, + "step": 2389 + }, + { + "epoch": 0.7744653272845107, + "grad_norm": 0.48795467615127563, + "learning_rate": 8.696153505283667e-06, + "loss": 0.1637, + "step": 2390 + }, + { + "epoch": 0.7747893713545042, + "grad_norm": 0.4281572699546814, + "learning_rate": 8.694975265352168e-06, + "loss": 0.1267, + "step": 2391 + }, + { + "epoch": 0.7751134154244977, + "grad_norm": 0.5082755088806152, + "learning_rate": 8.693796573191467e-06, + "loss": 0.1405, + "step": 2392 + }, + { + "epoch": 0.7754374594944913, + "grad_norm": 0.5042880177497864, + "learning_rate": 8.692617428945823e-06, + "loss": 0.1524, + "step": 2393 + }, + { + "epoch": 0.7757615035644848, + "grad_norm": 0.4925474524497986, + "learning_rate": 8.691437832759555e-06, + "loss": 0.1426, + "step": 2394 + }, + { + "epoch": 0.7760855476344782, + "grad_norm": 0.4777103066444397, + "learning_rate": 8.69025778477703e-06, + "loss": 0.1367, + "step": 2395 + }, + { + "epoch": 0.7764095917044718, + "grad_norm": 0.4859715402126312, + "learning_rate": 8.689077285142678e-06, + "loss": 0.1537, + "step": 2396 + }, + { + "epoch": 0.7767336357744653, + "grad_norm": 0.4879585802555084, + "learning_rate": 8.687896334000979e-06, + "loss": 0.1453, + "step": 2397 + }, + { + "epoch": 0.7770576798444588, + "grad_norm": 0.4773515462875366, + "learning_rate": 8.68671493149647e-06, + "loss": 0.1377, + "step": 2398 + }, + { + "epoch": 0.7773817239144524, + "grad_norm": 0.48462027311325073, + "learning_rate": 8.685533077773744e-06, + "loss": 0.1487, + "step": 2399 + }, + { + "epoch": 0.7777057679844459, + "grad_norm": 0.45635169744491577, + "learning_rate": 8.684350772977447e-06, + "loss": 0.1329, + "step": 2400 + }, + { + "epoch": 0.7780298120544394, + "grad_norm": 0.530298113822937, + "learning_rate": 8.683168017252287e-06, + "loss": 0.1706, + "step": 2401 + }, + { + "epoch": 0.778353856124433, + "grad_norm": 0.5109763741493225, + "learning_rate": 8.681984810743012e-06, + "loss": 0.1663, + "step": 2402 + }, + { + "epoch": 0.7786779001944264, + "grad_norm": 0.478502482175827, + "learning_rate": 8.680801153594442e-06, + "loss": 0.1358, + "step": 2403 + }, + { + "epoch": 0.7790019442644199, + "grad_norm": 0.4598463475704193, + "learning_rate": 8.679617045951445e-06, + "loss": 0.1316, + "step": 2404 + }, + { + "epoch": 0.7793259883344135, + "grad_norm": 0.47924965620040894, + "learning_rate": 8.678432487958943e-06, + "loss": 0.1466, + "step": 2405 + }, + { + "epoch": 0.779650032404407, + "grad_norm": 0.49181798100471497, + "learning_rate": 8.677247479761915e-06, + "loss": 0.1423, + "step": 2406 + }, + { + "epoch": 0.7799740764744005, + "grad_norm": 0.47612518072128296, + "learning_rate": 8.676062021505392e-06, + "loss": 0.1378, + "step": 2407 + }, + { + "epoch": 0.7802981205443941, + "grad_norm": 0.5300808548927307, + "learning_rate": 8.674876113334465e-06, + "loss": 0.1639, + "step": 2408 + }, + { + "epoch": 0.7806221646143876, + "grad_norm": 0.5046321153640747, + "learning_rate": 8.673689755394278e-06, + "loss": 0.15, + "step": 2409 + }, + { + "epoch": 0.780946208684381, + "grad_norm": 0.47892558574676514, + "learning_rate": 8.67250294783003e-06, + "loss": 0.1452, + "step": 2410 + }, + { + "epoch": 0.7812702527543746, + "grad_norm": 0.5028601288795471, + "learning_rate": 8.671315690786972e-06, + "loss": 0.1539, + "step": 2411 + }, + { + "epoch": 0.7815942968243681, + "grad_norm": 0.5324373841285706, + "learning_rate": 8.670127984410415e-06, + "loss": 0.1704, + "step": 2412 + }, + { + "epoch": 0.7819183408943616, + "grad_norm": 0.48744648694992065, + "learning_rate": 8.668939828845721e-06, + "loss": 0.1544, + "step": 2413 + }, + { + "epoch": 0.7822423849643552, + "grad_norm": 0.46258261799812317, + "learning_rate": 8.667751224238311e-06, + "loss": 0.1341, + "step": 2414 + }, + { + "epoch": 0.7825664290343487, + "grad_norm": 0.4862961769104004, + "learning_rate": 8.666562170733658e-06, + "loss": 0.1476, + "step": 2415 + }, + { + "epoch": 0.7828904731043422, + "grad_norm": 0.5449991226196289, + "learning_rate": 8.665372668477293e-06, + "loss": 0.1374, + "step": 2416 + }, + { + "epoch": 0.7832145171743357, + "grad_norm": 0.47787198424339294, + "learning_rate": 8.664182717614793e-06, + "loss": 0.1438, + "step": 2417 + }, + { + "epoch": 0.7835385612443292, + "grad_norm": 0.5111783742904663, + "learning_rate": 8.662992318291803e-06, + "loss": 0.1655, + "step": 2418 + }, + { + "epoch": 0.7838626053143227, + "grad_norm": 0.45256495475769043, + "learning_rate": 8.661801470654011e-06, + "loss": 0.1309, + "step": 2419 + }, + { + "epoch": 0.7841866493843163, + "grad_norm": 0.49635931849479675, + "learning_rate": 8.66061017484717e-06, + "loss": 0.1412, + "step": 2420 + }, + { + "epoch": 0.7845106934543098, + "grad_norm": 0.48685595393180847, + "learning_rate": 8.659418431017082e-06, + "loss": 0.1531, + "step": 2421 + }, + { + "epoch": 0.7848347375243033, + "grad_norm": 0.5329604148864746, + "learning_rate": 8.658226239309602e-06, + "loss": 0.157, + "step": 2422 + }, + { + "epoch": 0.7851587815942969, + "grad_norm": 0.5168407559394836, + "learning_rate": 8.657033599870646e-06, + "loss": 0.1489, + "step": 2423 + }, + { + "epoch": 0.7854828256642904, + "grad_norm": 0.49895137548446655, + "learning_rate": 8.655840512846178e-06, + "loss": 0.1547, + "step": 2424 + }, + { + "epoch": 0.7858068697342838, + "grad_norm": 0.47160446643829346, + "learning_rate": 8.654646978382227e-06, + "loss": 0.1534, + "step": 2425 + }, + { + "epoch": 0.7861309138042774, + "grad_norm": 0.5098379254341125, + "learning_rate": 8.653452996624861e-06, + "loss": 0.157, + "step": 2426 + }, + { + "epoch": 0.7864549578742709, + "grad_norm": 0.5164358615875244, + "learning_rate": 8.652258567720218e-06, + "loss": 0.1516, + "step": 2427 + }, + { + "epoch": 0.7867790019442644, + "grad_norm": 0.49435731768608093, + "learning_rate": 8.651063691814483e-06, + "loss": 0.1551, + "step": 2428 + }, + { + "epoch": 0.787103046014258, + "grad_norm": 0.4760430157184601, + "learning_rate": 8.649868369053897e-06, + "loss": 0.1366, + "step": 2429 + }, + { + "epoch": 0.7874270900842515, + "grad_norm": 0.5499979853630066, + "learning_rate": 8.648672599584756e-06, + "loss": 0.1683, + "step": 2430 + }, + { + "epoch": 0.787751134154245, + "grad_norm": 0.47031792998313904, + "learning_rate": 8.647476383553411e-06, + "loss": 0.146, + "step": 2431 + }, + { + "epoch": 0.7880751782242384, + "grad_norm": 0.4686078727245331, + "learning_rate": 8.646279721106266e-06, + "loss": 0.1391, + "step": 2432 + }, + { + "epoch": 0.788399222294232, + "grad_norm": 0.47792428731918335, + "learning_rate": 8.645082612389783e-06, + "loss": 0.1481, + "step": 2433 + }, + { + "epoch": 0.7887232663642255, + "grad_norm": 0.4733062982559204, + "learning_rate": 8.643885057550476e-06, + "loss": 0.1337, + "step": 2434 + }, + { + "epoch": 0.789047310434219, + "grad_norm": 0.4806113541126251, + "learning_rate": 8.642687056734911e-06, + "loss": 0.1434, + "step": 2435 + }, + { + "epoch": 0.7893713545042126, + "grad_norm": 0.5003640651702881, + "learning_rate": 8.641488610089716e-06, + "loss": 0.1398, + "step": 2436 + }, + { + "epoch": 0.7896953985742061, + "grad_norm": 0.4911183714866638, + "learning_rate": 8.640289717761568e-06, + "loss": 0.1487, + "step": 2437 + }, + { + "epoch": 0.7900194426441997, + "grad_norm": 0.4869093596935272, + "learning_rate": 8.639090379897198e-06, + "loss": 0.1455, + "step": 2438 + }, + { + "epoch": 0.7903434867141931, + "grad_norm": 0.44922661781311035, + "learning_rate": 8.637890596643396e-06, + "loss": 0.1356, + "step": 2439 + }, + { + "epoch": 0.7906675307841866, + "grad_norm": 0.45907965302467346, + "learning_rate": 8.636690368147e-06, + "loss": 0.1273, + "step": 2440 + }, + { + "epoch": 0.7909915748541801, + "grad_norm": 0.5199213624000549, + "learning_rate": 8.635489694554913e-06, + "loss": 0.1572, + "step": 2441 + }, + { + "epoch": 0.7913156189241737, + "grad_norm": 0.5016922354698181, + "learning_rate": 8.634288576014078e-06, + "loss": 0.1571, + "step": 2442 + }, + { + "epoch": 0.7916396629941672, + "grad_norm": 0.4670245349407196, + "learning_rate": 8.633087012671504e-06, + "loss": 0.1384, + "step": 2443 + }, + { + "epoch": 0.7919637070641607, + "grad_norm": 0.5087417960166931, + "learning_rate": 8.631885004674251e-06, + "loss": 0.1413, + "step": 2444 + }, + { + "epoch": 0.7922877511341543, + "grad_norm": 0.48342347145080566, + "learning_rate": 8.630682552169434e-06, + "loss": 0.1443, + "step": 2445 + }, + { + "epoch": 0.7926117952041478, + "grad_norm": 0.49706462025642395, + "learning_rate": 8.629479655304221e-06, + "loss": 0.1558, + "step": 2446 + }, + { + "epoch": 0.7929358392741412, + "grad_norm": 0.47584474086761475, + "learning_rate": 8.628276314225833e-06, + "loss": 0.1468, + "step": 2447 + }, + { + "epoch": 0.7932598833441348, + "grad_norm": 0.44449400901794434, + "learning_rate": 8.627072529081549e-06, + "loss": 0.1298, + "step": 2448 + }, + { + "epoch": 0.7935839274141283, + "grad_norm": 0.49863201379776, + "learning_rate": 8.625868300018701e-06, + "loss": 0.1679, + "step": 2449 + }, + { + "epoch": 0.7939079714841218, + "grad_norm": 0.5134031772613525, + "learning_rate": 8.624663627184671e-06, + "loss": 0.1618, + "step": 2450 + }, + { + "epoch": 0.7942320155541154, + "grad_norm": 0.466678649187088, + "learning_rate": 8.623458510726906e-06, + "loss": 0.1267, + "step": 2451 + }, + { + "epoch": 0.7945560596241089, + "grad_norm": 0.48051100969314575, + "learning_rate": 8.622252950792895e-06, + "loss": 0.1432, + "step": 2452 + }, + { + "epoch": 0.7948801036941024, + "grad_norm": 0.4591250717639923, + "learning_rate": 8.62104694753019e-06, + "loss": 0.1441, + "step": 2453 + }, + { + "epoch": 0.7952041477640959, + "grad_norm": 0.49696993827819824, + "learning_rate": 8.619840501086392e-06, + "loss": 0.1395, + "step": 2454 + }, + { + "epoch": 0.7955281918340894, + "grad_norm": 0.4945259690284729, + "learning_rate": 8.61863361160916e-06, + "loss": 0.1527, + "step": 2455 + }, + { + "epoch": 0.7958522359040829, + "grad_norm": 0.47516417503356934, + "learning_rate": 8.617426279246206e-06, + "loss": 0.1455, + "step": 2456 + }, + { + "epoch": 0.7961762799740765, + "grad_norm": 0.4969664216041565, + "learning_rate": 8.616218504145294e-06, + "loss": 0.1564, + "step": 2457 + }, + { + "epoch": 0.79650032404407, + "grad_norm": 0.49645090103149414, + "learning_rate": 8.615010286454244e-06, + "loss": 0.1535, + "step": 2458 + }, + { + "epoch": 0.7968243681140635, + "grad_norm": 0.5396689176559448, + "learning_rate": 8.613801626320932e-06, + "loss": 0.1615, + "step": 2459 + }, + { + "epoch": 0.7971484121840571, + "grad_norm": 0.4874482750892639, + "learning_rate": 8.612592523893286e-06, + "loss": 0.1466, + "step": 2460 + }, + { + "epoch": 0.7974724562540505, + "grad_norm": 0.5054929256439209, + "learning_rate": 8.611382979319286e-06, + "loss": 0.1479, + "step": 2461 + }, + { + "epoch": 0.797796500324044, + "grad_norm": 0.4731915295124054, + "learning_rate": 8.610172992746971e-06, + "loss": 0.1472, + "step": 2462 + }, + { + "epoch": 0.7981205443940376, + "grad_norm": 0.48185446858406067, + "learning_rate": 8.60896256432443e-06, + "loss": 0.1538, + "step": 2463 + }, + { + "epoch": 0.7984445884640311, + "grad_norm": 0.5094521045684814, + "learning_rate": 8.607751694199811e-06, + "loss": 0.1525, + "step": 2464 + }, + { + "epoch": 0.7987686325340246, + "grad_norm": 0.5127012133598328, + "learning_rate": 8.606540382521308e-06, + "loss": 0.1722, + "step": 2465 + }, + { + "epoch": 0.7990926766040182, + "grad_norm": 0.46042630076408386, + "learning_rate": 8.605328629437177e-06, + "loss": 0.1313, + "step": 2466 + }, + { + "epoch": 0.7994167206740117, + "grad_norm": 0.46402016282081604, + "learning_rate": 8.604116435095724e-06, + "loss": 0.1339, + "step": 2467 + }, + { + "epoch": 0.7997407647440052, + "grad_norm": 0.5011064410209656, + "learning_rate": 8.60290379964531e-06, + "loss": 0.1581, + "step": 2468 + }, + { + "epoch": 0.8000648088139987, + "grad_norm": 0.4951988458633423, + "learning_rate": 8.601690723234349e-06, + "loss": 0.1554, + "step": 2469 + }, + { + "epoch": 0.8003888528839922, + "grad_norm": 0.5081047415733337, + "learning_rate": 8.600477206011312e-06, + "loss": 0.1531, + "step": 2470 + }, + { + "epoch": 0.8007128969539857, + "grad_norm": 0.511314332485199, + "learning_rate": 8.599263248124718e-06, + "loss": 0.1565, + "step": 2471 + }, + { + "epoch": 0.8010369410239793, + "grad_norm": 0.4906594753265381, + "learning_rate": 8.598048849723149e-06, + "loss": 0.1559, + "step": 2472 + }, + { + "epoch": 0.8013609850939728, + "grad_norm": 0.4825765788555145, + "learning_rate": 8.596834010955231e-06, + "loss": 0.1519, + "step": 2473 + }, + { + "epoch": 0.8016850291639663, + "grad_norm": 0.482764333486557, + "learning_rate": 8.595618731969651e-06, + "loss": 0.1403, + "step": 2474 + }, + { + "epoch": 0.8020090732339599, + "grad_norm": 0.46174606680870056, + "learning_rate": 8.594403012915145e-06, + "loss": 0.1434, + "step": 2475 + }, + { + "epoch": 0.8023331173039533, + "grad_norm": 0.5105165839195251, + "learning_rate": 8.593186853940507e-06, + "loss": 0.1528, + "step": 2476 + }, + { + "epoch": 0.8026571613739468, + "grad_norm": 0.5206249356269836, + "learning_rate": 8.591970255194582e-06, + "loss": 0.1598, + "step": 2477 + }, + { + "epoch": 0.8029812054439404, + "grad_norm": 0.5320706963539124, + "learning_rate": 8.590753216826273e-06, + "loss": 0.1538, + "step": 2478 + }, + { + "epoch": 0.8033052495139339, + "grad_norm": 0.5003678202629089, + "learning_rate": 8.58953573898453e-06, + "loss": 0.1615, + "step": 2479 + }, + { + "epoch": 0.8036292935839274, + "grad_norm": 0.4871372580528259, + "learning_rate": 8.588317821818362e-06, + "loss": 0.151, + "step": 2480 + }, + { + "epoch": 0.803953337653921, + "grad_norm": 0.45962315797805786, + "learning_rate": 8.58709946547683e-06, + "loss": 0.1371, + "step": 2481 + }, + { + "epoch": 0.8042773817239145, + "grad_norm": 0.5067052841186523, + "learning_rate": 8.585880670109051e-06, + "loss": 0.1608, + "step": 2482 + }, + { + "epoch": 0.8046014257939079, + "grad_norm": 0.47325399518013, + "learning_rate": 8.58466143586419e-06, + "loss": 0.1437, + "step": 2483 + }, + { + "epoch": 0.8049254698639015, + "grad_norm": 0.47463107109069824, + "learning_rate": 8.583441762891473e-06, + "loss": 0.1458, + "step": 2484 + }, + { + "epoch": 0.805249513933895, + "grad_norm": 0.47829777002334595, + "learning_rate": 8.582221651340174e-06, + "loss": 0.1486, + "step": 2485 + }, + { + "epoch": 0.8055735580038885, + "grad_norm": 0.4528464078903198, + "learning_rate": 8.581001101359622e-06, + "loss": 0.1298, + "step": 2486 + }, + { + "epoch": 0.8058976020738821, + "grad_norm": 0.4922964572906494, + "learning_rate": 8.579780113099206e-06, + "loss": 0.1548, + "step": 2487 + }, + { + "epoch": 0.8062216461438756, + "grad_norm": 0.49854376912117004, + "learning_rate": 8.578558686708356e-06, + "loss": 0.1498, + "step": 2488 + }, + { + "epoch": 0.8065456902138691, + "grad_norm": 0.45197510719299316, + "learning_rate": 8.577336822336567e-06, + "loss": 0.1354, + "step": 2489 + }, + { + "epoch": 0.8068697342838627, + "grad_norm": 0.5470073223114014, + "learning_rate": 8.57611452013338e-06, + "loss": 0.1703, + "step": 2490 + }, + { + "epoch": 0.8071937783538561, + "grad_norm": 0.5207754373550415, + "learning_rate": 8.574891780248396e-06, + "loss": 0.1513, + "step": 2491 + }, + { + "epoch": 0.8075178224238496, + "grad_norm": 0.4938046336174011, + "learning_rate": 8.573668602831268e-06, + "loss": 0.1487, + "step": 2492 + }, + { + "epoch": 0.8078418664938432, + "grad_norm": 0.4892713725566864, + "learning_rate": 8.572444988031696e-06, + "loss": 0.1436, + "step": 2493 + }, + { + "epoch": 0.8081659105638367, + "grad_norm": 0.48755109310150146, + "learning_rate": 8.571220935999443e-06, + "loss": 0.1416, + "step": 2494 + }, + { + "epoch": 0.8084899546338302, + "grad_norm": 0.47816547751426697, + "learning_rate": 8.56999644688432e-06, + "loss": 0.137, + "step": 2495 + }, + { + "epoch": 0.8088139987038238, + "grad_norm": 0.5130065083503723, + "learning_rate": 8.568771520836191e-06, + "loss": 0.1522, + "step": 2496 + }, + { + "epoch": 0.8091380427738173, + "grad_norm": 0.47157740592956543, + "learning_rate": 8.567546158004977e-06, + "loss": 0.1484, + "step": 2497 + }, + { + "epoch": 0.8094620868438107, + "grad_norm": 0.5034700632095337, + "learning_rate": 8.56632035854065e-06, + "loss": 0.1512, + "step": 2498 + }, + { + "epoch": 0.8097861309138042, + "grad_norm": 0.45632052421569824, + "learning_rate": 8.565094122593236e-06, + "loss": 0.1399, + "step": 2499 + }, + { + "epoch": 0.8101101749837978, + "grad_norm": 0.4864128828048706, + "learning_rate": 8.563867450312812e-06, + "loss": 0.1457, + "step": 2500 + }, + { + "epoch": 0.8104342190537913, + "grad_norm": 0.4833630919456482, + "learning_rate": 8.562640341849515e-06, + "loss": 0.142, + "step": 2501 + }, + { + "epoch": 0.8107582631237849, + "grad_norm": 0.44969481229782104, + "learning_rate": 8.561412797353528e-06, + "loss": 0.134, + "step": 2502 + }, + { + "epoch": 0.8110823071937784, + "grad_norm": 0.47103044390678406, + "learning_rate": 8.560184816975093e-06, + "loss": 0.1431, + "step": 2503 + }, + { + "epoch": 0.8114063512637719, + "grad_norm": 0.4690519869327545, + "learning_rate": 8.5589564008645e-06, + "loss": 0.146, + "step": 2504 + }, + { + "epoch": 0.8117303953337653, + "grad_norm": 0.48151180148124695, + "learning_rate": 8.557727549172099e-06, + "loss": 0.1426, + "step": 2505 + }, + { + "epoch": 0.8120544394037589, + "grad_norm": 0.5109602808952332, + "learning_rate": 8.556498262048285e-06, + "loss": 0.1595, + "step": 2506 + }, + { + "epoch": 0.8123784834737524, + "grad_norm": 0.476717472076416, + "learning_rate": 8.555268539643515e-06, + "loss": 0.1431, + "step": 2507 + }, + { + "epoch": 0.812702527543746, + "grad_norm": 0.4775809049606323, + "learning_rate": 8.554038382108293e-06, + "loss": 0.1425, + "step": 2508 + }, + { + "epoch": 0.8130265716137395, + "grad_norm": 0.49816781282424927, + "learning_rate": 8.552807789593178e-06, + "loss": 0.1528, + "step": 2509 + }, + { + "epoch": 0.813350615683733, + "grad_norm": 0.49087977409362793, + "learning_rate": 8.551576762248785e-06, + "loss": 0.1467, + "step": 2510 + }, + { + "epoch": 0.8136746597537265, + "grad_norm": 0.526469886302948, + "learning_rate": 8.550345300225778e-06, + "loss": 0.1555, + "step": 2511 + }, + { + "epoch": 0.81399870382372, + "grad_norm": 0.49106329679489136, + "learning_rate": 8.549113403674876e-06, + "loss": 0.1419, + "step": 2512 + }, + { + "epoch": 0.8143227478937135, + "grad_norm": 0.47168755531311035, + "learning_rate": 8.547881072746852e-06, + "loss": 0.1381, + "step": 2513 + }, + { + "epoch": 0.814646791963707, + "grad_norm": 0.5075554847717285, + "learning_rate": 8.546648307592529e-06, + "loss": 0.1595, + "step": 2514 + }, + { + "epoch": 0.8149708360337006, + "grad_norm": 0.49591541290283203, + "learning_rate": 8.545415108362789e-06, + "loss": 0.1503, + "step": 2515 + }, + { + "epoch": 0.8152948801036941, + "grad_norm": 0.4703139364719391, + "learning_rate": 8.54418147520856e-06, + "loss": 0.1522, + "step": 2516 + }, + { + "epoch": 0.8156189241736876, + "grad_norm": 0.46373069286346436, + "learning_rate": 8.54294740828083e-06, + "loss": 0.1448, + "step": 2517 + }, + { + "epoch": 0.8159429682436812, + "grad_norm": 0.4968792200088501, + "learning_rate": 8.541712907730636e-06, + "loss": 0.1493, + "step": 2518 + }, + { + "epoch": 0.8162670123136747, + "grad_norm": 0.4752669036388397, + "learning_rate": 8.540477973709068e-06, + "loss": 0.1528, + "step": 2519 + }, + { + "epoch": 0.8165910563836681, + "grad_norm": 0.4536360204219818, + "learning_rate": 8.539242606367271e-06, + "loss": 0.1337, + "step": 2520 + }, + { + "epoch": 0.8169151004536617, + "grad_norm": 0.5199907422065735, + "learning_rate": 8.538006805856443e-06, + "loss": 0.1601, + "step": 2521 + }, + { + "epoch": 0.8172391445236552, + "grad_norm": 0.4877196252346039, + "learning_rate": 8.53677057232783e-06, + "loss": 0.1507, + "step": 2522 + }, + { + "epoch": 0.8175631885936487, + "grad_norm": 0.5085099935531616, + "learning_rate": 8.535533905932739e-06, + "loss": 0.1638, + "step": 2523 + }, + { + "epoch": 0.8178872326636423, + "grad_norm": 0.48997917771339417, + "learning_rate": 8.534296806822523e-06, + "loss": 0.1459, + "step": 2524 + }, + { + "epoch": 0.8182112767336358, + "grad_norm": 0.441158264875412, + "learning_rate": 8.533059275148594e-06, + "loss": 0.1327, + "step": 2525 + }, + { + "epoch": 0.8185353208036293, + "grad_norm": 0.47758910059928894, + "learning_rate": 8.531821311062412e-06, + "loss": 0.1415, + "step": 2526 + }, + { + "epoch": 0.8188593648736228, + "grad_norm": 0.45573076605796814, + "learning_rate": 8.530582914715493e-06, + "loss": 0.1363, + "step": 2527 + }, + { + "epoch": 0.8191834089436163, + "grad_norm": 0.46968233585357666, + "learning_rate": 8.529344086259401e-06, + "loss": 0.1375, + "step": 2528 + }, + { + "epoch": 0.8195074530136098, + "grad_norm": 0.47364190220832825, + "learning_rate": 8.528104825845763e-06, + "loss": 0.1396, + "step": 2529 + }, + { + "epoch": 0.8198314970836034, + "grad_norm": 0.47457167506217957, + "learning_rate": 8.526865133626246e-06, + "loss": 0.1453, + "step": 2530 + }, + { + "epoch": 0.8201555411535969, + "grad_norm": 0.49752020835876465, + "learning_rate": 8.525625009752582e-06, + "loss": 0.1552, + "step": 2531 + }, + { + "epoch": 0.8204795852235904, + "grad_norm": 0.4616011381149292, + "learning_rate": 8.524384454376546e-06, + "loss": 0.1317, + "step": 2532 + }, + { + "epoch": 0.820803629293584, + "grad_norm": 0.5041443705558777, + "learning_rate": 8.523143467649972e-06, + "loss": 0.1534, + "step": 2533 + }, + { + "epoch": 0.8211276733635774, + "grad_norm": 0.4988428056240082, + "learning_rate": 8.521902049724743e-06, + "loss": 0.1517, + "step": 2534 + }, + { + "epoch": 0.8214517174335709, + "grad_norm": 0.48993542790412903, + "learning_rate": 8.520660200752799e-06, + "loss": 0.1418, + "step": 2535 + }, + { + "epoch": 0.8217757615035645, + "grad_norm": 0.458396852016449, + "learning_rate": 8.51941792088613e-06, + "loss": 0.1432, + "step": 2536 + }, + { + "epoch": 0.822099805573558, + "grad_norm": 0.5019996166229248, + "learning_rate": 8.518175210276775e-06, + "loss": 0.1491, + "step": 2537 + }, + { + "epoch": 0.8224238496435515, + "grad_norm": 0.4727717936038971, + "learning_rate": 8.516932069076835e-06, + "loss": 0.149, + "step": 2538 + }, + { + "epoch": 0.8227478937135451, + "grad_norm": 0.4895130395889282, + "learning_rate": 8.515688497438458e-06, + "loss": 0.1424, + "step": 2539 + }, + { + "epoch": 0.8230719377835386, + "grad_norm": 0.5439048409461975, + "learning_rate": 8.51444449551384e-06, + "loss": 0.1718, + "step": 2540 + }, + { + "epoch": 0.8233959818535321, + "grad_norm": 0.464637815952301, + "learning_rate": 8.51320006345524e-06, + "loss": 0.1413, + "step": 2541 + }, + { + "epoch": 0.8237200259235256, + "grad_norm": 0.5023958086967468, + "learning_rate": 8.511955201414963e-06, + "loss": 0.1552, + "step": 2542 + }, + { + "epoch": 0.8240440699935191, + "grad_norm": 0.477631539106369, + "learning_rate": 8.510709909545367e-06, + "loss": 0.1531, + "step": 2543 + }, + { + "epoch": 0.8243681140635126, + "grad_norm": 0.4549383521080017, + "learning_rate": 8.509464187998863e-06, + "loss": 0.1422, + "step": 2544 + }, + { + "epoch": 0.8246921581335062, + "grad_norm": 0.48795077204704285, + "learning_rate": 8.50821803692792e-06, + "loss": 0.1502, + "step": 2545 + }, + { + "epoch": 0.8250162022034997, + "grad_norm": 0.473164439201355, + "learning_rate": 8.50697145648505e-06, + "loss": 0.1362, + "step": 2546 + }, + { + "epoch": 0.8253402462734932, + "grad_norm": 0.5239213109016418, + "learning_rate": 8.505724446822824e-06, + "loss": 0.1453, + "step": 2547 + }, + { + "epoch": 0.8256642903434868, + "grad_norm": 0.4741290509700775, + "learning_rate": 8.504477008093862e-06, + "loss": 0.1379, + "step": 2548 + }, + { + "epoch": 0.8259883344134802, + "grad_norm": 0.4502844214439392, + "learning_rate": 8.503229140450842e-06, + "loss": 0.1208, + "step": 2549 + }, + { + "epoch": 0.8263123784834737, + "grad_norm": 0.5354313254356384, + "learning_rate": 8.501980844046486e-06, + "loss": 0.1652, + "step": 2550 + }, + { + "epoch": 0.8266364225534673, + "grad_norm": 0.49819859862327576, + "learning_rate": 8.500732119033581e-06, + "loss": 0.1446, + "step": 2551 + }, + { + "epoch": 0.8269604666234608, + "grad_norm": 0.5132746696472168, + "learning_rate": 8.499482965564952e-06, + "loss": 0.1471, + "step": 2552 + }, + { + "epoch": 0.8272845106934543, + "grad_norm": 0.47220292687416077, + "learning_rate": 8.498233383793486e-06, + "loss": 0.1546, + "step": 2553 + }, + { + "epoch": 0.8276085547634479, + "grad_norm": 0.5188787579536438, + "learning_rate": 8.496983373872119e-06, + "loss": 0.1474, + "step": 2554 + }, + { + "epoch": 0.8279325988334414, + "grad_norm": 0.5114861726760864, + "learning_rate": 8.495732935953839e-06, + "loss": 0.1532, + "step": 2555 + }, + { + "epoch": 0.8282566429034348, + "grad_norm": 0.5248055458068848, + "learning_rate": 8.494482070191691e-06, + "loss": 0.1514, + "step": 2556 + }, + { + "epoch": 0.8285806869734283, + "grad_norm": 0.4819242060184479, + "learning_rate": 8.493230776738768e-06, + "loss": 0.1359, + "step": 2557 + }, + { + "epoch": 0.8289047310434219, + "grad_norm": 0.5227571129798889, + "learning_rate": 8.491979055748214e-06, + "loss": 0.1592, + "step": 2558 + }, + { + "epoch": 0.8292287751134154, + "grad_norm": 0.48012658953666687, + "learning_rate": 8.490726907373227e-06, + "loss": 0.1416, + "step": 2559 + }, + { + "epoch": 0.829552819183409, + "grad_norm": 0.4852195680141449, + "learning_rate": 8.48947433176706e-06, + "loss": 0.1503, + "step": 2560 + }, + { + "epoch": 0.8298768632534025, + "grad_norm": 0.5364710688591003, + "learning_rate": 8.488221329083017e-06, + "loss": 0.1668, + "step": 2561 + }, + { + "epoch": 0.830200907323396, + "grad_norm": 0.5360770225524902, + "learning_rate": 8.48696789947445e-06, + "loss": 0.1487, + "step": 2562 + }, + { + "epoch": 0.8305249513933896, + "grad_norm": 0.5258188843727112, + "learning_rate": 8.48571404309477e-06, + "loss": 0.1671, + "step": 2563 + }, + { + "epoch": 0.830848995463383, + "grad_norm": 0.46062448620796204, + "learning_rate": 8.484459760097435e-06, + "loss": 0.1457, + "step": 2564 + }, + { + "epoch": 0.8311730395333765, + "grad_norm": 0.5202407240867615, + "learning_rate": 8.483205050635957e-06, + "loss": 0.153, + "step": 2565 + }, + { + "epoch": 0.83149708360337, + "grad_norm": 0.5415773987770081, + "learning_rate": 8.481949914863901e-06, + "loss": 0.1712, + "step": 2566 + }, + { + "epoch": 0.8318211276733636, + "grad_norm": 0.4666374623775482, + "learning_rate": 8.480694352934884e-06, + "loss": 0.1401, + "step": 2567 + }, + { + "epoch": 0.8321451717433571, + "grad_norm": 0.4689330756664276, + "learning_rate": 8.479438365002573e-06, + "loss": 0.1363, + "step": 2568 + }, + { + "epoch": 0.8324692158133506, + "grad_norm": 0.4993961751461029, + "learning_rate": 8.478181951220693e-06, + "loss": 0.1549, + "step": 2569 + }, + { + "epoch": 0.8327932598833442, + "grad_norm": 0.47131237387657166, + "learning_rate": 8.476925111743009e-06, + "loss": 0.1469, + "step": 2570 + }, + { + "epoch": 0.8331173039533376, + "grad_norm": 0.4705948531627655, + "learning_rate": 8.475667846723352e-06, + "loss": 0.1448, + "step": 2571 + }, + { + "epoch": 0.8334413480233311, + "grad_norm": 0.5221664309501648, + "learning_rate": 8.474410156315597e-06, + "loss": 0.1547, + "step": 2572 + }, + { + "epoch": 0.8337653920933247, + "grad_norm": 0.496259868144989, + "learning_rate": 8.473152040673676e-06, + "loss": 0.1557, + "step": 2573 + }, + { + "epoch": 0.8340894361633182, + "grad_norm": 0.47865161299705505, + "learning_rate": 8.471893499951567e-06, + "loss": 0.1474, + "step": 2574 + }, + { + "epoch": 0.8344134802333117, + "grad_norm": 0.4825884997844696, + "learning_rate": 8.470634534303304e-06, + "loss": 0.1412, + "step": 2575 + }, + { + "epoch": 0.8347375243033053, + "grad_norm": 0.48204711079597473, + "learning_rate": 8.469375143882972e-06, + "loss": 0.165, + "step": 2576 + }, + { + "epoch": 0.8350615683732988, + "grad_norm": 0.4458855986595154, + "learning_rate": 8.468115328844708e-06, + "loss": 0.1344, + "step": 2577 + }, + { + "epoch": 0.8353856124432922, + "grad_norm": 0.47697317600250244, + "learning_rate": 8.466855089342703e-06, + "loss": 0.1482, + "step": 2578 + }, + { + "epoch": 0.8357096565132858, + "grad_norm": 0.48309290409088135, + "learning_rate": 8.465594425531197e-06, + "loss": 0.1495, + "step": 2579 + }, + { + "epoch": 0.8360337005832793, + "grad_norm": 0.47534915804862976, + "learning_rate": 8.464333337564481e-06, + "loss": 0.1536, + "step": 2580 + }, + { + "epoch": 0.8363577446532728, + "grad_norm": 0.475146621465683, + "learning_rate": 8.463071825596904e-06, + "loss": 0.1545, + "step": 2581 + }, + { + "epoch": 0.8366817887232664, + "grad_norm": 0.48443689942359924, + "learning_rate": 8.46180988978286e-06, + "loss": 0.1518, + "step": 2582 + }, + { + "epoch": 0.8370058327932599, + "grad_norm": 0.47789594531059265, + "learning_rate": 8.460547530276798e-06, + "loss": 0.1354, + "step": 2583 + }, + { + "epoch": 0.8373298768632534, + "grad_norm": 0.4564815163612366, + "learning_rate": 8.459284747233218e-06, + "loss": 0.1359, + "step": 2584 + }, + { + "epoch": 0.837653920933247, + "grad_norm": 0.5119007229804993, + "learning_rate": 8.458021540806674e-06, + "loss": 0.157, + "step": 2585 + }, + { + "epoch": 0.8379779650032404, + "grad_norm": 0.47056737542152405, + "learning_rate": 8.45675791115177e-06, + "loss": 0.1317, + "step": 2586 + }, + { + "epoch": 0.8383020090732339, + "grad_norm": 0.453413724899292, + "learning_rate": 8.455493858423163e-06, + "loss": 0.1251, + "step": 2587 + }, + { + "epoch": 0.8386260531432275, + "grad_norm": 0.49031537771224976, + "learning_rate": 8.454229382775558e-06, + "loss": 0.1562, + "step": 2588 + }, + { + "epoch": 0.838950097213221, + "grad_norm": 0.4968595504760742, + "learning_rate": 8.452964484363717e-06, + "loss": 0.14, + "step": 2589 + }, + { + "epoch": 0.8392741412832145, + "grad_norm": 0.5026286840438843, + "learning_rate": 8.45169916334245e-06, + "loss": 0.1506, + "step": 2590 + }, + { + "epoch": 0.8395981853532081, + "grad_norm": 0.4869048297405243, + "learning_rate": 8.450433419866619e-06, + "loss": 0.1392, + "step": 2591 + }, + { + "epoch": 0.8399222294232016, + "grad_norm": 0.4574863314628601, + "learning_rate": 8.449167254091141e-06, + "loss": 0.1305, + "step": 2592 + }, + { + "epoch": 0.840246273493195, + "grad_norm": 0.5489205121994019, + "learning_rate": 8.447900666170983e-06, + "loss": 0.159, + "step": 2593 + }, + { + "epoch": 0.8405703175631886, + "grad_norm": 0.4784124791622162, + "learning_rate": 8.446633656261161e-06, + "loss": 0.1539, + "step": 2594 + }, + { + "epoch": 0.8408943616331821, + "grad_norm": 0.46957382559776306, + "learning_rate": 8.445366224516744e-06, + "loss": 0.1352, + "step": 2595 + }, + { + "epoch": 0.8412184057031756, + "grad_norm": 0.49930262565612793, + "learning_rate": 8.444098371092856e-06, + "loss": 0.1453, + "step": 2596 + }, + { + "epoch": 0.8415424497731692, + "grad_norm": 0.4878937602043152, + "learning_rate": 8.44283009614467e-06, + "loss": 0.1386, + "step": 2597 + }, + { + "epoch": 0.8418664938431627, + "grad_norm": 0.5614111423492432, + "learning_rate": 8.441561399827407e-06, + "loss": 0.1933, + "step": 2598 + }, + { + "epoch": 0.8421905379131562, + "grad_norm": 0.5082785487174988, + "learning_rate": 8.440292282296348e-06, + "loss": 0.1559, + "step": 2599 + }, + { + "epoch": 0.8425145819831497, + "grad_norm": 0.48195216059684753, + "learning_rate": 8.439022743706817e-06, + "loss": 0.1442, + "step": 2600 + }, + { + "epoch": 0.8428386260531432, + "grad_norm": 0.47267797589302063, + "learning_rate": 8.437752784214195e-06, + "loss": 0.1488, + "step": 2601 + }, + { + "epoch": 0.8431626701231367, + "grad_norm": 0.5153549909591675, + "learning_rate": 8.436482403973911e-06, + "loss": 0.1597, + "step": 2602 + }, + { + "epoch": 0.8434867141931303, + "grad_norm": 0.4659569561481476, + "learning_rate": 8.43521160314145e-06, + "loss": 0.1335, + "step": 2603 + }, + { + "epoch": 0.8438107582631238, + "grad_norm": 0.49978286027908325, + "learning_rate": 8.433940381872343e-06, + "loss": 0.1748, + "step": 2604 + }, + { + "epoch": 0.8441348023331173, + "grad_norm": 0.5085880756378174, + "learning_rate": 8.432668740322177e-06, + "loss": 0.1611, + "step": 2605 + }, + { + "epoch": 0.8444588464031109, + "grad_norm": 0.4894861578941345, + "learning_rate": 8.431396678646588e-06, + "loss": 0.1496, + "step": 2606 + }, + { + "epoch": 0.8447828904731044, + "grad_norm": 0.5207383036613464, + "learning_rate": 8.430124197001264e-06, + "loss": 0.1588, + "step": 2607 + }, + { + "epoch": 0.8451069345430978, + "grad_norm": 0.5010978579521179, + "learning_rate": 8.428851295541944e-06, + "loss": 0.158, + "step": 2608 + }, + { + "epoch": 0.8454309786130914, + "grad_norm": 0.4909263253211975, + "learning_rate": 8.427577974424421e-06, + "loss": 0.1502, + "step": 2609 + }, + { + "epoch": 0.8457550226830849, + "grad_norm": 0.5070400238037109, + "learning_rate": 8.426304233804534e-06, + "loss": 0.1753, + "step": 2610 + }, + { + "epoch": 0.8460790667530784, + "grad_norm": 0.4728187024593353, + "learning_rate": 8.425030073838178e-06, + "loss": 0.1451, + "step": 2611 + }, + { + "epoch": 0.846403110823072, + "grad_norm": 0.46031421422958374, + "learning_rate": 8.423755494681298e-06, + "loss": 0.139, + "step": 2612 + }, + { + "epoch": 0.8467271548930655, + "grad_norm": 0.43631961941719055, + "learning_rate": 8.42248049648989e-06, + "loss": 0.1379, + "step": 2613 + }, + { + "epoch": 0.847051198963059, + "grad_norm": 0.4628419876098633, + "learning_rate": 8.42120507942e-06, + "loss": 0.1473, + "step": 2614 + }, + { + "epoch": 0.8473752430330524, + "grad_norm": 0.4439941346645355, + "learning_rate": 8.419929243627731e-06, + "loss": 0.1214, + "step": 2615 + }, + { + "epoch": 0.847699287103046, + "grad_norm": 0.5476211905479431, + "learning_rate": 8.418652989269229e-06, + "loss": 0.1616, + "step": 2616 + }, + { + "epoch": 0.8480233311730395, + "grad_norm": 0.41573604941368103, + "learning_rate": 8.417376316500696e-06, + "loss": 0.1253, + "step": 2617 + }, + { + "epoch": 0.848347375243033, + "grad_norm": 0.44237151741981506, + "learning_rate": 8.416099225478383e-06, + "loss": 0.1312, + "step": 2618 + }, + { + "epoch": 0.8486714193130266, + "grad_norm": 0.4541921317577362, + "learning_rate": 8.414821716358596e-06, + "loss": 0.1444, + "step": 2619 + }, + { + "epoch": 0.8489954633830201, + "grad_norm": 0.5047772526741028, + "learning_rate": 8.413543789297692e-06, + "loss": 0.1541, + "step": 2620 + }, + { + "epoch": 0.8493195074530137, + "grad_norm": 0.478910893201828, + "learning_rate": 8.41226544445207e-06, + "loss": 0.1487, + "step": 2621 + }, + { + "epoch": 0.8496435515230071, + "grad_norm": 0.49247220158576965, + "learning_rate": 8.410986681978192e-06, + "loss": 0.1678, + "step": 2622 + }, + { + "epoch": 0.8499675955930006, + "grad_norm": 0.4732974171638489, + "learning_rate": 8.409707502032565e-06, + "loss": 0.1457, + "step": 2623 + }, + { + "epoch": 0.8502916396629941, + "grad_norm": 0.5027114152908325, + "learning_rate": 8.40842790477175e-06, + "loss": 0.1586, + "step": 2624 + }, + { + "epoch": 0.8506156837329877, + "grad_norm": 0.47344860434532166, + "learning_rate": 8.407147890352353e-06, + "loss": 0.137, + "step": 2625 + }, + { + "epoch": 0.8509397278029812, + "grad_norm": 0.495750367641449, + "learning_rate": 8.405867458931038e-06, + "loss": 0.1442, + "step": 2626 + }, + { + "epoch": 0.8512637718729748, + "grad_norm": 0.4720262289047241, + "learning_rate": 8.40458661066452e-06, + "loss": 0.1454, + "step": 2627 + }, + { + "epoch": 0.8515878159429683, + "grad_norm": 0.5401026606559753, + "learning_rate": 8.403305345709559e-06, + "loss": 0.1551, + "step": 2628 + }, + { + "epoch": 0.8519118600129617, + "grad_norm": 0.4968286454677582, + "learning_rate": 8.40202366422297e-06, + "loss": 0.151, + "step": 2629 + }, + { + "epoch": 0.8522359040829552, + "grad_norm": 0.45830318331718445, + "learning_rate": 8.400741566361617e-06, + "loss": 0.1399, + "step": 2630 + }, + { + "epoch": 0.8525599481529488, + "grad_norm": 0.4789321720600128, + "learning_rate": 8.399459052282418e-06, + "loss": 0.1383, + "step": 2631 + }, + { + "epoch": 0.8528839922229423, + "grad_norm": 0.5061787962913513, + "learning_rate": 8.398176122142344e-06, + "loss": 0.1469, + "step": 2632 + }, + { + "epoch": 0.8532080362929358, + "grad_norm": 0.510563850402832, + "learning_rate": 8.396892776098406e-06, + "loss": 0.1663, + "step": 2633 + }, + { + "epoch": 0.8535320803629294, + "grad_norm": 0.5131493210792542, + "learning_rate": 8.395609014307677e-06, + "loss": 0.1537, + "step": 2634 + }, + { + "epoch": 0.8538561244329229, + "grad_norm": 0.44384628534317017, + "learning_rate": 8.394324836927278e-06, + "loss": 0.1365, + "step": 2635 + }, + { + "epoch": 0.8541801685029164, + "grad_norm": 0.4685351252555847, + "learning_rate": 8.393040244114379e-06, + "loss": 0.1442, + "step": 2636 + }, + { + "epoch": 0.8545042125729099, + "grad_norm": 0.4517896771430969, + "learning_rate": 8.3917552360262e-06, + "loss": 0.1568, + "step": 2637 + }, + { + "epoch": 0.8548282566429034, + "grad_norm": 0.4689981937408447, + "learning_rate": 8.390469812820015e-06, + "loss": 0.1563, + "step": 2638 + }, + { + "epoch": 0.8551523007128969, + "grad_norm": 0.4492129385471344, + "learning_rate": 8.389183974653148e-06, + "loss": 0.1418, + "step": 2639 + }, + { + "epoch": 0.8554763447828905, + "grad_norm": 0.5006503462791443, + "learning_rate": 8.38789772168297e-06, + "loss": 0.1438, + "step": 2640 + }, + { + "epoch": 0.855800388852884, + "grad_norm": 0.49266812205314636, + "learning_rate": 8.386611054066911e-06, + "loss": 0.1435, + "step": 2641 + }, + { + "epoch": 0.8561244329228775, + "grad_norm": 0.4764671325683594, + "learning_rate": 8.385323971962442e-06, + "loss": 0.1535, + "step": 2642 + }, + { + "epoch": 0.8564484769928711, + "grad_norm": 0.5092926621437073, + "learning_rate": 8.384036475527093e-06, + "loss": 0.1529, + "step": 2643 + }, + { + "epoch": 0.8567725210628645, + "grad_norm": 0.4808288514614105, + "learning_rate": 8.38274856491844e-06, + "loss": 0.144, + "step": 2644 + }, + { + "epoch": 0.857096565132858, + "grad_norm": 0.5002308487892151, + "learning_rate": 8.381460240294108e-06, + "loss": 0.1483, + "step": 2645 + }, + { + "epoch": 0.8574206092028516, + "grad_norm": 0.4734160900115967, + "learning_rate": 8.380171501811778e-06, + "loss": 0.1523, + "step": 2646 + }, + { + "epoch": 0.8577446532728451, + "grad_norm": 0.4946827292442322, + "learning_rate": 8.378882349629178e-06, + "loss": 0.1433, + "step": 2647 + }, + { + "epoch": 0.8580686973428386, + "grad_norm": 0.46848565340042114, + "learning_rate": 8.377592783904092e-06, + "loss": 0.1416, + "step": 2648 + }, + { + "epoch": 0.8583927414128322, + "grad_norm": 0.4753412902355194, + "learning_rate": 8.376302804794343e-06, + "loss": 0.1421, + "step": 2649 + }, + { + "epoch": 0.8587167854828257, + "grad_norm": 0.47309020161628723, + "learning_rate": 8.375012412457818e-06, + "loss": 0.143, + "step": 2650 + }, + { + "epoch": 0.8590408295528191, + "grad_norm": 0.4673686623573303, + "learning_rate": 8.373721607052445e-06, + "loss": 0.1416, + "step": 2651 + }, + { + "epoch": 0.8593648736228127, + "grad_norm": 0.46272486448287964, + "learning_rate": 8.37243038873621e-06, + "loss": 0.1546, + "step": 2652 + }, + { + "epoch": 0.8596889176928062, + "grad_norm": 0.459953635931015, + "learning_rate": 8.37113875766714e-06, + "loss": 0.141, + "step": 2653 + }, + { + "epoch": 0.8600129617627997, + "grad_norm": 0.45387542247772217, + "learning_rate": 8.369846714003323e-06, + "loss": 0.1357, + "step": 2654 + }, + { + "epoch": 0.8603370058327933, + "grad_norm": 0.5121089816093445, + "learning_rate": 8.36855425790289e-06, + "loss": 0.1589, + "step": 2655 + }, + { + "epoch": 0.8606610499027868, + "grad_norm": 0.490045428276062, + "learning_rate": 8.367261389524027e-06, + "loss": 0.1486, + "step": 2656 + }, + { + "epoch": 0.8609850939727803, + "grad_norm": 0.4958958327770233, + "learning_rate": 8.365968109024967e-06, + "loss": 0.1553, + "step": 2657 + }, + { + "epoch": 0.8613091380427739, + "grad_norm": 0.5067430138587952, + "learning_rate": 8.364674416563995e-06, + "loss": 0.1628, + "step": 2658 + }, + { + "epoch": 0.8616331821127673, + "grad_norm": 0.4974973499774933, + "learning_rate": 8.363380312299447e-06, + "loss": 0.1578, + "step": 2659 + }, + { + "epoch": 0.8619572261827608, + "grad_norm": 0.45146653056144714, + "learning_rate": 8.362085796389711e-06, + "loss": 0.1415, + "step": 2660 + }, + { + "epoch": 0.8622812702527544, + "grad_norm": 0.46035945415496826, + "learning_rate": 8.360790868993219e-06, + "loss": 0.1465, + "step": 2661 + }, + { + "epoch": 0.8626053143227479, + "grad_norm": 0.49235203862190247, + "learning_rate": 8.35949553026846e-06, + "loss": 0.1471, + "step": 2662 + }, + { + "epoch": 0.8629293583927414, + "grad_norm": 0.5069705843925476, + "learning_rate": 8.35819978037397e-06, + "loss": 0.1527, + "step": 2663 + }, + { + "epoch": 0.863253402462735, + "grad_norm": 0.4901638329029083, + "learning_rate": 8.356903619468336e-06, + "loss": 0.1554, + "step": 2664 + }, + { + "epoch": 0.8635774465327285, + "grad_norm": 0.5096777677536011, + "learning_rate": 8.355607047710199e-06, + "loss": 0.1562, + "step": 2665 + }, + { + "epoch": 0.8639014906027219, + "grad_norm": 0.5072498917579651, + "learning_rate": 8.354310065258244e-06, + "loss": 0.1601, + "step": 2666 + }, + { + "epoch": 0.8642255346727155, + "grad_norm": 0.5249346494674683, + "learning_rate": 8.353012672271206e-06, + "loss": 0.1609, + "step": 2667 + }, + { + "epoch": 0.864549578742709, + "grad_norm": 0.4963613450527191, + "learning_rate": 8.351714868907878e-06, + "loss": 0.1544, + "step": 2668 + }, + { + "epoch": 0.8648736228127025, + "grad_norm": 0.5151003003120422, + "learning_rate": 8.350416655327098e-06, + "loss": 0.1494, + "step": 2669 + }, + { + "epoch": 0.8651976668826961, + "grad_norm": 0.4893086552619934, + "learning_rate": 8.349118031687755e-06, + "loss": 0.1612, + "step": 2670 + }, + { + "epoch": 0.8655217109526896, + "grad_norm": 0.49971023201942444, + "learning_rate": 8.347818998148784e-06, + "loss": 0.1493, + "step": 2671 + }, + { + "epoch": 0.8658457550226831, + "grad_norm": 0.462802529335022, + "learning_rate": 8.346519554869179e-06, + "loss": 0.1289, + "step": 2672 + }, + { + "epoch": 0.8661697990926766, + "grad_norm": 0.47770702838897705, + "learning_rate": 8.345219702007979e-06, + "loss": 0.141, + "step": 2673 + }, + { + "epoch": 0.8664938431626701, + "grad_norm": 0.5033993124961853, + "learning_rate": 8.343919439724268e-06, + "loss": 0.1561, + "step": 2674 + }, + { + "epoch": 0.8668178872326636, + "grad_norm": 0.4785201847553253, + "learning_rate": 8.342618768177192e-06, + "loss": 0.1423, + "step": 2675 + }, + { + "epoch": 0.8671419313026572, + "grad_norm": 0.4748586416244507, + "learning_rate": 8.341317687525936e-06, + "loss": 0.1555, + "step": 2676 + }, + { + "epoch": 0.8674659753726507, + "grad_norm": 0.5080292224884033, + "learning_rate": 8.340016197929741e-06, + "loss": 0.163, + "step": 2677 + }, + { + "epoch": 0.8677900194426442, + "grad_norm": 0.4762882888317108, + "learning_rate": 8.338714299547898e-06, + "loss": 0.1454, + "step": 2678 + }, + { + "epoch": 0.8681140635126378, + "grad_norm": 0.4820559322834015, + "learning_rate": 8.337411992539747e-06, + "loss": 0.1618, + "step": 2679 + }, + { + "epoch": 0.8684381075826313, + "grad_norm": 0.4682447612285614, + "learning_rate": 8.336109277064676e-06, + "loss": 0.1414, + "step": 2680 + }, + { + "epoch": 0.8687621516526247, + "grad_norm": 0.5108025074005127, + "learning_rate": 8.334806153282126e-06, + "loss": 0.1549, + "step": 2681 + }, + { + "epoch": 0.8690861957226182, + "grad_norm": 0.4689571261405945, + "learning_rate": 8.333502621351586e-06, + "loss": 0.1533, + "step": 2682 + }, + { + "epoch": 0.8694102397926118, + "grad_norm": 0.4340459108352661, + "learning_rate": 8.332198681432596e-06, + "loss": 0.1259, + "step": 2683 + }, + { + "epoch": 0.8697342838626053, + "grad_norm": 0.45238205790519714, + "learning_rate": 8.330894333684745e-06, + "loss": 0.1426, + "step": 2684 + }, + { + "epoch": 0.8700583279325989, + "grad_norm": 0.4423817992210388, + "learning_rate": 8.329589578267674e-06, + "loss": 0.1312, + "step": 2685 + }, + { + "epoch": 0.8703823720025924, + "grad_norm": 0.4896402060985565, + "learning_rate": 8.328284415341072e-06, + "loss": 0.1646, + "step": 2686 + }, + { + "epoch": 0.8707064160725859, + "grad_norm": 0.48064756393432617, + "learning_rate": 8.326978845064676e-06, + "loss": 0.134, + "step": 2687 + }, + { + "epoch": 0.8710304601425793, + "grad_norm": 0.49347373843193054, + "learning_rate": 8.325672867598278e-06, + "loss": 0.1565, + "step": 2688 + }, + { + "epoch": 0.8713545042125729, + "grad_norm": 0.46822673082351685, + "learning_rate": 8.324366483101716e-06, + "loss": 0.1442, + "step": 2689 + }, + { + "epoch": 0.8716785482825664, + "grad_norm": 0.47423186898231506, + "learning_rate": 8.323059691734879e-06, + "loss": 0.1334, + "step": 2690 + }, + { + "epoch": 0.87200259235256, + "grad_norm": 0.4813585579395294, + "learning_rate": 8.321752493657706e-06, + "loss": 0.1503, + "step": 2691 + }, + { + "epoch": 0.8723266364225535, + "grad_norm": 0.48055943846702576, + "learning_rate": 8.320444889030184e-06, + "loss": 0.1454, + "step": 2692 + }, + { + "epoch": 0.872650680492547, + "grad_norm": 0.45660504698753357, + "learning_rate": 8.319136878012352e-06, + "loss": 0.1449, + "step": 2693 + }, + { + "epoch": 0.8729747245625405, + "grad_norm": 0.48899611830711365, + "learning_rate": 8.317828460764297e-06, + "loss": 0.1518, + "step": 2694 + }, + { + "epoch": 0.873298768632534, + "grad_norm": 0.4620334804058075, + "learning_rate": 8.316519637446158e-06, + "loss": 0.1463, + "step": 2695 + }, + { + "epoch": 0.8736228127025275, + "grad_norm": 0.4999637305736542, + "learning_rate": 8.315210408218124e-06, + "loss": 0.1516, + "step": 2696 + }, + { + "epoch": 0.873946856772521, + "grad_norm": 0.4712785482406616, + "learning_rate": 8.313900773240428e-06, + "loss": 0.1384, + "step": 2697 + }, + { + "epoch": 0.8742709008425146, + "grad_norm": 0.4864901304244995, + "learning_rate": 8.312590732673359e-06, + "loss": 0.1557, + "step": 2698 + }, + { + "epoch": 0.8745949449125081, + "grad_norm": 0.46352046728134155, + "learning_rate": 8.31128028667725e-06, + "loss": 0.1377, + "step": 2699 + }, + { + "epoch": 0.8749189889825016, + "grad_norm": 0.49007776379585266, + "learning_rate": 8.309969435412493e-06, + "loss": 0.1488, + "step": 2700 + }, + { + "epoch": 0.8752430330524952, + "grad_norm": 0.4691650867462158, + "learning_rate": 8.30865817903952e-06, + "loss": 0.1453, + "step": 2701 + }, + { + "epoch": 0.8755670771224887, + "grad_norm": 0.48605218529701233, + "learning_rate": 8.307346517718813e-06, + "loss": 0.1492, + "step": 2702 + }, + { + "epoch": 0.8758911211924821, + "grad_norm": 0.4926930069923401, + "learning_rate": 8.306034451610913e-06, + "loss": 0.1441, + "step": 2703 + }, + { + "epoch": 0.8762151652624757, + "grad_norm": 0.4776822626590729, + "learning_rate": 8.3047219808764e-06, + "loss": 0.1337, + "step": 2704 + }, + { + "epoch": 0.8765392093324692, + "grad_norm": 0.44507667422294617, + "learning_rate": 8.303409105675909e-06, + "loss": 0.1378, + "step": 2705 + }, + { + "epoch": 0.8768632534024627, + "grad_norm": 0.49557462334632874, + "learning_rate": 8.302095826170122e-06, + "loss": 0.152, + "step": 2706 + }, + { + "epoch": 0.8771872974724563, + "grad_norm": 0.4982512295246124, + "learning_rate": 8.300782142519772e-06, + "loss": 0.1569, + "step": 2707 + }, + { + "epoch": 0.8775113415424498, + "grad_norm": 0.46075528860092163, + "learning_rate": 8.299468054885643e-06, + "loss": 0.1374, + "step": 2708 + }, + { + "epoch": 0.8778353856124433, + "grad_norm": 0.49135658144950867, + "learning_rate": 8.298153563428565e-06, + "loss": 0.1545, + "step": 2709 + }, + { + "epoch": 0.8781594296824368, + "grad_norm": 0.5004354119300842, + "learning_rate": 8.296838668309421e-06, + "loss": 0.1611, + "step": 2710 + }, + { + "epoch": 0.8784834737524303, + "grad_norm": 0.4572104215621948, + "learning_rate": 8.295523369689138e-06, + "loss": 0.1397, + "step": 2711 + }, + { + "epoch": 0.8788075178224238, + "grad_norm": 0.4650300443172455, + "learning_rate": 8.294207667728698e-06, + "loss": 0.1437, + "step": 2712 + }, + { + "epoch": 0.8791315618924174, + "grad_norm": 0.49239322543144226, + "learning_rate": 8.292891562589131e-06, + "loss": 0.1493, + "step": 2713 + }, + { + "epoch": 0.8794556059624109, + "grad_norm": 0.4775744378566742, + "learning_rate": 8.291575054431513e-06, + "loss": 0.1407, + "step": 2714 + }, + { + "epoch": 0.8797796500324044, + "grad_norm": 0.4666978120803833, + "learning_rate": 8.290258143416974e-06, + "loss": 0.1432, + "step": 2715 + }, + { + "epoch": 0.880103694102398, + "grad_norm": 0.5041922926902771, + "learning_rate": 8.28894082970669e-06, + "loss": 0.1723, + "step": 2716 + }, + { + "epoch": 0.8804277381723914, + "grad_norm": 0.5009000301361084, + "learning_rate": 8.287623113461887e-06, + "loss": 0.1583, + "step": 2717 + }, + { + "epoch": 0.8807517822423849, + "grad_norm": 0.529421865940094, + "learning_rate": 8.286304994843844e-06, + "loss": 0.1664, + "step": 2718 + }, + { + "epoch": 0.8810758263123785, + "grad_norm": 0.4445277452468872, + "learning_rate": 8.284986474013882e-06, + "loss": 0.1307, + "step": 2719 + }, + { + "epoch": 0.881399870382372, + "grad_norm": 0.46141302585601807, + "learning_rate": 8.283667551133376e-06, + "loss": 0.1472, + "step": 2720 + }, + { + "epoch": 0.8817239144523655, + "grad_norm": 0.5086813569068909, + "learning_rate": 8.282348226363753e-06, + "loss": 0.1632, + "step": 2721 + }, + { + "epoch": 0.8820479585223591, + "grad_norm": 0.4846382737159729, + "learning_rate": 8.28102849986648e-06, + "loss": 0.1388, + "step": 2722 + }, + { + "epoch": 0.8823720025923526, + "grad_norm": 0.5189880132675171, + "learning_rate": 8.279708371803081e-06, + "loss": 0.1465, + "step": 2723 + }, + { + "epoch": 0.8826960466623461, + "grad_norm": 0.4765397906303406, + "learning_rate": 8.27838784233513e-06, + "loss": 0.1382, + "step": 2724 + }, + { + "epoch": 0.8830200907323396, + "grad_norm": 0.5010700225830078, + "learning_rate": 8.277066911624242e-06, + "loss": 0.156, + "step": 2725 + }, + { + "epoch": 0.8833441348023331, + "grad_norm": 0.5363819599151611, + "learning_rate": 8.275745579832088e-06, + "loss": 0.1748, + "step": 2726 + }, + { + "epoch": 0.8836681788723266, + "grad_norm": 0.463154673576355, + "learning_rate": 8.27442384712039e-06, + "loss": 0.1403, + "step": 2727 + }, + { + "epoch": 0.8839922229423202, + "grad_norm": 0.4815209209918976, + "learning_rate": 8.27310171365091e-06, + "loss": 0.1477, + "step": 2728 + }, + { + "epoch": 0.8843162670123137, + "grad_norm": 0.4952019453048706, + "learning_rate": 8.271779179585466e-06, + "loss": 0.1578, + "step": 2729 + }, + { + "epoch": 0.8846403110823072, + "grad_norm": 0.5037125945091248, + "learning_rate": 8.270456245085923e-06, + "loss": 0.1622, + "step": 2730 + }, + { + "epoch": 0.8849643551523008, + "grad_norm": 0.49532702565193176, + "learning_rate": 8.269132910314197e-06, + "loss": 0.1533, + "step": 2731 + }, + { + "epoch": 0.8852883992222942, + "grad_norm": 0.4823807179927826, + "learning_rate": 8.267809175432252e-06, + "loss": 0.1529, + "step": 2732 + }, + { + "epoch": 0.8856124432922877, + "grad_norm": 0.5192368626594543, + "learning_rate": 8.266485040602098e-06, + "loss": 0.1459, + "step": 2733 + }, + { + "epoch": 0.8859364873622813, + "grad_norm": 0.4310244917869568, + "learning_rate": 8.265160505985796e-06, + "loss": 0.1196, + "step": 2734 + }, + { + "epoch": 0.8862605314322748, + "grad_norm": 0.49385082721710205, + "learning_rate": 8.263835571745457e-06, + "loss": 0.151, + "step": 2735 + }, + { + "epoch": 0.8865845755022683, + "grad_norm": 0.4536881446838379, + "learning_rate": 8.26251023804324e-06, + "loss": 0.1448, + "step": 2736 + }, + { + "epoch": 0.8869086195722619, + "grad_norm": 0.4777822494506836, + "learning_rate": 8.261184505041354e-06, + "loss": 0.1525, + "step": 2737 + }, + { + "epoch": 0.8872326636422554, + "grad_norm": 0.534416913986206, + "learning_rate": 8.259858372902056e-06, + "loss": 0.1623, + "step": 2738 + }, + { + "epoch": 0.8875567077122488, + "grad_norm": 0.4864097535610199, + "learning_rate": 8.258531841787652e-06, + "loss": 0.1537, + "step": 2739 + }, + { + "epoch": 0.8878807517822424, + "grad_norm": 0.45313143730163574, + "learning_rate": 8.257204911860494e-06, + "loss": 0.1394, + "step": 2740 + }, + { + "epoch": 0.8882047958522359, + "grad_norm": 0.47940826416015625, + "learning_rate": 8.255877583282987e-06, + "loss": 0.1537, + "step": 2741 + }, + { + "epoch": 0.8885288399222294, + "grad_norm": 0.48140037059783936, + "learning_rate": 8.254549856217584e-06, + "loss": 0.1397, + "step": 2742 + }, + { + "epoch": 0.888852883992223, + "grad_norm": 0.4759960472583771, + "learning_rate": 8.253221730826784e-06, + "loss": 0.1575, + "step": 2743 + }, + { + "epoch": 0.8891769280622165, + "grad_norm": 0.5218455195426941, + "learning_rate": 8.251893207273139e-06, + "loss": 0.1598, + "step": 2744 + }, + { + "epoch": 0.88950097213221, + "grad_norm": 0.514616847038269, + "learning_rate": 8.250564285719245e-06, + "loss": 0.1572, + "step": 2745 + }, + { + "epoch": 0.8898250162022034, + "grad_norm": 0.4813917577266693, + "learning_rate": 8.249234966327751e-06, + "loss": 0.1356, + "step": 2746 + }, + { + "epoch": 0.890149060272197, + "grad_norm": 0.4729185700416565, + "learning_rate": 8.247905249261352e-06, + "loss": 0.1442, + "step": 2747 + }, + { + "epoch": 0.8904731043421905, + "grad_norm": 0.49564072489738464, + "learning_rate": 8.246575134682792e-06, + "loss": 0.1403, + "step": 2748 + }, + { + "epoch": 0.890797148412184, + "grad_norm": 0.47402387857437134, + "learning_rate": 8.245244622754866e-06, + "loss": 0.143, + "step": 2749 + }, + { + "epoch": 0.8911211924821776, + "grad_norm": 0.46405506134033203, + "learning_rate": 8.243913713640415e-06, + "loss": 0.1364, + "step": 2750 + }, + { + "epoch": 0.8914452365521711, + "grad_norm": 0.5036703944206238, + "learning_rate": 8.242582407502327e-06, + "loss": 0.1498, + "step": 2751 + }, + { + "epoch": 0.8917692806221647, + "grad_norm": 0.46614500880241394, + "learning_rate": 8.241250704503545e-06, + "loss": 0.1373, + "step": 2752 + }, + { + "epoch": 0.8920933246921582, + "grad_norm": 0.4918436110019684, + "learning_rate": 8.239918604807054e-06, + "loss": 0.1482, + "step": 2753 + }, + { + "epoch": 0.8924173687621516, + "grad_norm": 0.47470030188560486, + "learning_rate": 8.23858610857589e-06, + "loss": 0.1351, + "step": 2754 + }, + { + "epoch": 0.8927414128321451, + "grad_norm": 0.4631500840187073, + "learning_rate": 8.237253215973138e-06, + "loss": 0.142, + "step": 2755 + }, + { + "epoch": 0.8930654569021387, + "grad_norm": 0.442050963640213, + "learning_rate": 8.235919927161931e-06, + "loss": 0.1281, + "step": 2756 + }, + { + "epoch": 0.8933895009721322, + "grad_norm": 0.46017080545425415, + "learning_rate": 8.234586242305451e-06, + "loss": 0.1421, + "step": 2757 + }, + { + "epoch": 0.8937135450421257, + "grad_norm": 0.5412111878395081, + "learning_rate": 8.233252161566928e-06, + "loss": 0.1597, + "step": 2758 + }, + { + "epoch": 0.8940375891121193, + "grad_norm": 0.5096152424812317, + "learning_rate": 8.231917685109643e-06, + "loss": 0.1626, + "step": 2759 + }, + { + "epoch": 0.8943616331821128, + "grad_norm": 0.5203890800476074, + "learning_rate": 8.23058281309692e-06, + "loss": 0.1666, + "step": 2760 + }, + { + "epoch": 0.8946856772521062, + "grad_norm": 0.4767521321773529, + "learning_rate": 8.229247545692134e-06, + "loss": 0.1453, + "step": 2761 + }, + { + "epoch": 0.8950097213220998, + "grad_norm": 0.4585646390914917, + "learning_rate": 8.22791188305871e-06, + "loss": 0.1268, + "step": 2762 + }, + { + "epoch": 0.8953337653920933, + "grad_norm": 0.4929313361644745, + "learning_rate": 8.226575825360122e-06, + "loss": 0.1527, + "step": 2763 + }, + { + "epoch": 0.8956578094620868, + "grad_norm": 0.5119484066963196, + "learning_rate": 8.225239372759888e-06, + "loss": 0.1568, + "step": 2764 + }, + { + "epoch": 0.8959818535320804, + "grad_norm": 0.471207857131958, + "learning_rate": 8.223902525421576e-06, + "loss": 0.1411, + "step": 2765 + }, + { + "epoch": 0.8963058976020739, + "grad_norm": 0.46978434920310974, + "learning_rate": 8.222565283508806e-06, + "loss": 0.155, + "step": 2766 + }, + { + "epoch": 0.8966299416720674, + "grad_norm": 0.42427974939346313, + "learning_rate": 8.221227647185241e-06, + "loss": 0.1266, + "step": 2767 + }, + { + "epoch": 0.8969539857420609, + "grad_norm": 0.504667341709137, + "learning_rate": 8.219889616614596e-06, + "loss": 0.1687, + "step": 2768 + }, + { + "epoch": 0.8972780298120544, + "grad_norm": 0.4763314723968506, + "learning_rate": 8.218551191960633e-06, + "loss": 0.1424, + "step": 2769 + }, + { + "epoch": 0.8976020738820479, + "grad_norm": 0.46471917629241943, + "learning_rate": 8.217212373387164e-06, + "loss": 0.1395, + "step": 2770 + }, + { + "epoch": 0.8979261179520415, + "grad_norm": 0.4854791760444641, + "learning_rate": 8.215873161058043e-06, + "loss": 0.1556, + "step": 2771 + }, + { + "epoch": 0.898250162022035, + "grad_norm": 0.500381588935852, + "learning_rate": 8.21453355513718e-06, + "loss": 0.1554, + "step": 2772 + }, + { + "epoch": 0.8985742060920285, + "grad_norm": 0.46978870034217834, + "learning_rate": 8.21319355578853e-06, + "loss": 0.1452, + "step": 2773 + }, + { + "epoch": 0.8988982501620221, + "grad_norm": 0.4473946690559387, + "learning_rate": 8.211853163176093e-06, + "loss": 0.1343, + "step": 2774 + }, + { + "epoch": 0.8992222942320156, + "grad_norm": 0.515302300453186, + "learning_rate": 8.210512377463924e-06, + "loss": 0.1588, + "step": 2775 + }, + { + "epoch": 0.899546338302009, + "grad_norm": 0.47159039974212646, + "learning_rate": 8.209171198816119e-06, + "loss": 0.1461, + "step": 2776 + }, + { + "epoch": 0.8998703823720026, + "grad_norm": 0.50645512342453, + "learning_rate": 8.207829627396827e-06, + "loss": 0.1616, + "step": 2777 + }, + { + "epoch": 0.9001944264419961, + "grad_norm": 0.49735721945762634, + "learning_rate": 8.206487663370242e-06, + "loss": 0.1536, + "step": 2778 + }, + { + "epoch": 0.9005184705119896, + "grad_norm": 0.5144430994987488, + "learning_rate": 8.205145306900608e-06, + "loss": 0.1683, + "step": 2779 + }, + { + "epoch": 0.9008425145819832, + "grad_norm": 0.5268824696540833, + "learning_rate": 8.203802558152216e-06, + "loss": 0.1611, + "step": 2780 + }, + { + "epoch": 0.9011665586519767, + "grad_norm": 0.4668341279029846, + "learning_rate": 8.202459417289409e-06, + "loss": 0.1575, + "step": 2781 + }, + { + "epoch": 0.9014906027219702, + "grad_norm": 0.4710846543312073, + "learning_rate": 8.201115884476568e-06, + "loss": 0.1329, + "step": 2782 + }, + { + "epoch": 0.9018146467919637, + "grad_norm": 0.45003706216812134, + "learning_rate": 8.199771959878135e-06, + "loss": 0.1412, + "step": 2783 + }, + { + "epoch": 0.9021386908619572, + "grad_norm": 0.41275516152381897, + "learning_rate": 8.19842764365859e-06, + "loss": 0.1176, + "step": 2784 + }, + { + "epoch": 0.9024627349319507, + "grad_norm": 0.4877631366252899, + "learning_rate": 8.197082935982463e-06, + "loss": 0.1438, + "step": 2785 + }, + { + "epoch": 0.9027867790019443, + "grad_norm": 0.5254728198051453, + "learning_rate": 8.195737837014336e-06, + "loss": 0.162, + "step": 2786 + }, + { + "epoch": 0.9031108230719378, + "grad_norm": 0.48036709427833557, + "learning_rate": 8.194392346918834e-06, + "loss": 0.1442, + "step": 2787 + }, + { + "epoch": 0.9034348671419313, + "grad_norm": 0.4583662450313568, + "learning_rate": 8.193046465860635e-06, + "loss": 0.1448, + "step": 2788 + }, + { + "epoch": 0.9037589112119249, + "grad_norm": 0.48229631781578064, + "learning_rate": 8.191700194004457e-06, + "loss": 0.151, + "step": 2789 + }, + { + "epoch": 0.9040829552819183, + "grad_norm": 0.4751323163509369, + "learning_rate": 8.190353531515074e-06, + "loss": 0.1509, + "step": 2790 + }, + { + "epoch": 0.9044069993519118, + "grad_norm": 0.4783444106578827, + "learning_rate": 8.189006478557303e-06, + "loss": 0.1534, + "step": 2791 + }, + { + "epoch": 0.9047310434219054, + "grad_norm": 0.476629376411438, + "learning_rate": 8.187659035296011e-06, + "loss": 0.1413, + "step": 2792 + }, + { + "epoch": 0.9050550874918989, + "grad_norm": 0.4860250651836395, + "learning_rate": 8.186311201896114e-06, + "loss": 0.1569, + "step": 2793 + }, + { + "epoch": 0.9053791315618924, + "grad_norm": 0.48255497217178345, + "learning_rate": 8.18496297852257e-06, + "loss": 0.1607, + "step": 2794 + }, + { + "epoch": 0.905703175631886, + "grad_norm": 0.43398430943489075, + "learning_rate": 8.183614365340393e-06, + "loss": 0.128, + "step": 2795 + }, + { + "epoch": 0.9060272197018795, + "grad_norm": 0.4728407561779022, + "learning_rate": 8.182265362514633e-06, + "loss": 0.1459, + "step": 2796 + }, + { + "epoch": 0.906351263771873, + "grad_norm": 0.4984278380870819, + "learning_rate": 8.180915970210404e-06, + "loss": 0.1526, + "step": 2797 + }, + { + "epoch": 0.9066753078418665, + "grad_norm": 0.44915592670440674, + "learning_rate": 8.17956618859285e-06, + "loss": 0.1503, + "step": 2798 + }, + { + "epoch": 0.90699935191186, + "grad_norm": 0.48544642329216003, + "learning_rate": 8.178216017827178e-06, + "loss": 0.1621, + "step": 2799 + }, + { + "epoch": 0.9073233959818535, + "grad_norm": 0.479897677898407, + "learning_rate": 8.176865458078632e-06, + "loss": 0.1427, + "step": 2800 + }, + { + "epoch": 0.907647440051847, + "grad_norm": 0.47575831413269043, + "learning_rate": 8.175514509512508e-06, + "loss": 0.1354, + "step": 2801 + }, + { + "epoch": 0.9079714841218406, + "grad_norm": 0.44075480103492737, + "learning_rate": 8.17416317229415e-06, + "loss": 0.1398, + "step": 2802 + }, + { + "epoch": 0.9082955281918341, + "grad_norm": 0.44483470916748047, + "learning_rate": 8.172811446588947e-06, + "loss": 0.1308, + "step": 2803 + }, + { + "epoch": 0.9086195722618277, + "grad_norm": 0.43445441126823425, + "learning_rate": 8.171459332562339e-06, + "loss": 0.1349, + "step": 2804 + }, + { + "epoch": 0.9089436163318211, + "grad_norm": 0.49809399247169495, + "learning_rate": 8.17010683037981e-06, + "loss": 0.1493, + "step": 2805 + }, + { + "epoch": 0.9092676604018146, + "grad_norm": 0.4239096939563751, + "learning_rate": 8.168753940206895e-06, + "loss": 0.1249, + "step": 2806 + }, + { + "epoch": 0.9095917044718081, + "grad_norm": 0.45722323656082153, + "learning_rate": 8.167400662209173e-06, + "loss": 0.1361, + "step": 2807 + }, + { + "epoch": 0.9099157485418017, + "grad_norm": 0.5097815990447998, + "learning_rate": 8.166046996552272e-06, + "loss": 0.1507, + "step": 2808 + }, + { + "epoch": 0.9102397926117952, + "grad_norm": 0.48827484250068665, + "learning_rate": 8.16469294340187e-06, + "loss": 0.1473, + "step": 2809 + }, + { + "epoch": 0.9105638366817888, + "grad_norm": 0.4702487587928772, + "learning_rate": 8.163338502923687e-06, + "loss": 0.1418, + "step": 2810 + }, + { + "epoch": 0.9108878807517823, + "grad_norm": 0.517118513584137, + "learning_rate": 8.161983675283496e-06, + "loss": 0.1607, + "step": 2811 + }, + { + "epoch": 0.9112119248217757, + "grad_norm": 0.4593200087547302, + "learning_rate": 8.160628460647113e-06, + "loss": 0.1414, + "step": 2812 + }, + { + "epoch": 0.9115359688917692, + "grad_norm": 0.49310043454170227, + "learning_rate": 8.159272859180403e-06, + "loss": 0.1355, + "step": 2813 + }, + { + "epoch": 0.9118600129617628, + "grad_norm": 0.5267544388771057, + "learning_rate": 8.15791687104928e-06, + "loss": 0.1552, + "step": 2814 + }, + { + "epoch": 0.9121840570317563, + "grad_norm": 0.46945372223854065, + "learning_rate": 8.156560496419701e-06, + "loss": 0.151, + "step": 2815 + }, + { + "epoch": 0.9125081011017498, + "grad_norm": 0.4934663772583008, + "learning_rate": 8.155203735457677e-06, + "loss": 0.1539, + "step": 2816 + }, + { + "epoch": 0.9128321451717434, + "grad_norm": 0.42091354727745056, + "learning_rate": 8.15384658832926e-06, + "loss": 0.1275, + "step": 2817 + }, + { + "epoch": 0.9131561892417369, + "grad_norm": 0.45683035254478455, + "learning_rate": 8.152489055200553e-06, + "loss": 0.141, + "step": 2818 + }, + { + "epoch": 0.9134802333117304, + "grad_norm": 0.45693978667259216, + "learning_rate": 8.151131136237705e-06, + "loss": 0.1457, + "step": 2819 + }, + { + "epoch": 0.9138042773817239, + "grad_norm": 0.4520793855190277, + "learning_rate": 8.149772831606908e-06, + "loss": 0.1531, + "step": 2820 + }, + { + "epoch": 0.9141283214517174, + "grad_norm": 0.47464117407798767, + "learning_rate": 8.14841414147441e-06, + "loss": 0.1459, + "step": 2821 + }, + { + "epoch": 0.9144523655217109, + "grad_norm": 0.48440176248550415, + "learning_rate": 8.1470550660065e-06, + "loss": 0.1525, + "step": 2822 + }, + { + "epoch": 0.9147764095917045, + "grad_norm": 0.4686981737613678, + "learning_rate": 8.145695605369516e-06, + "loss": 0.1466, + "step": 2823 + }, + { + "epoch": 0.915100453661698, + "grad_norm": 0.48142120242118835, + "learning_rate": 8.144335759729844e-06, + "loss": 0.1502, + "step": 2824 + }, + { + "epoch": 0.9154244977316915, + "grad_norm": 0.46992120146751404, + "learning_rate": 8.142975529253914e-06, + "loss": 0.1429, + "step": 2825 + }, + { + "epoch": 0.9157485418016851, + "grad_norm": 0.45641136169433594, + "learning_rate": 8.141614914108204e-06, + "loss": 0.1401, + "step": 2826 + }, + { + "epoch": 0.9160725858716785, + "grad_norm": 0.4673773944377899, + "learning_rate": 8.140253914459244e-06, + "loss": 0.1395, + "step": 2827 + }, + { + "epoch": 0.916396629941672, + "grad_norm": 0.4894830584526062, + "learning_rate": 8.138892530473601e-06, + "loss": 0.1651, + "step": 2828 + }, + { + "epoch": 0.9167206740116656, + "grad_norm": 0.49354568123817444, + "learning_rate": 8.137530762317902e-06, + "loss": 0.1439, + "step": 2829 + }, + { + "epoch": 0.9170447180816591, + "grad_norm": 0.4844863712787628, + "learning_rate": 8.136168610158812e-06, + "loss": 0.1432, + "step": 2830 + }, + { + "epoch": 0.9173687621516526, + "grad_norm": 0.5116393566131592, + "learning_rate": 8.134806074163044e-06, + "loss": 0.1636, + "step": 2831 + }, + { + "epoch": 0.9176928062216462, + "grad_norm": 0.4723188579082489, + "learning_rate": 8.13344315449736e-06, + "loss": 0.1446, + "step": 2832 + }, + { + "epoch": 0.9180168502916397, + "grad_norm": 0.4761399030685425, + "learning_rate": 8.132079851328565e-06, + "loss": 0.1335, + "step": 2833 + }, + { + "epoch": 0.9183408943616331, + "grad_norm": 0.4631648361682892, + "learning_rate": 8.13071616482352e-06, + "loss": 0.1506, + "step": 2834 + }, + { + "epoch": 0.9186649384316267, + "grad_norm": 0.47083237767219543, + "learning_rate": 8.129352095149123e-06, + "loss": 0.145, + "step": 2835 + }, + { + "epoch": 0.9189889825016202, + "grad_norm": 0.513043999671936, + "learning_rate": 8.127987642472324e-06, + "loss": 0.1487, + "step": 2836 + }, + { + "epoch": 0.9193130265716137, + "grad_norm": 0.4924792945384979, + "learning_rate": 8.126622806960121e-06, + "loss": 0.1544, + "step": 2837 + }, + { + "epoch": 0.9196370706416073, + "grad_norm": 0.48701411485671997, + "learning_rate": 8.125257588779553e-06, + "loss": 0.1559, + "step": 2838 + }, + { + "epoch": 0.9199611147116008, + "grad_norm": 0.4733465909957886, + "learning_rate": 8.12389198809771e-06, + "loss": 0.1388, + "step": 2839 + }, + { + "epoch": 0.9202851587815943, + "grad_norm": 0.45928820967674255, + "learning_rate": 8.12252600508173e-06, + "loss": 0.1444, + "step": 2840 + }, + { + "epoch": 0.9206092028515879, + "grad_norm": 0.4772232174873352, + "learning_rate": 8.121159639898796e-06, + "loss": 0.1521, + "step": 2841 + }, + { + "epoch": 0.9209332469215813, + "grad_norm": 0.4992552697658539, + "learning_rate": 8.119792892716136e-06, + "loss": 0.1538, + "step": 2842 + }, + { + "epoch": 0.9212572909915748, + "grad_norm": 0.45725077390670776, + "learning_rate": 8.11842576370103e-06, + "loss": 0.1342, + "step": 2843 + }, + { + "epoch": 0.9215813350615684, + "grad_norm": 0.4585983455181122, + "learning_rate": 8.117058253020797e-06, + "loss": 0.1424, + "step": 2844 + }, + { + "epoch": 0.9219053791315619, + "grad_norm": 0.5417724847793579, + "learning_rate": 8.11569036084281e-06, + "loss": 0.1503, + "step": 2845 + }, + { + "epoch": 0.9222294232015554, + "grad_norm": 0.4953386187553406, + "learning_rate": 8.114322087334485e-06, + "loss": 0.1365, + "step": 2846 + }, + { + "epoch": 0.922553467271549, + "grad_norm": 0.4717243015766144, + "learning_rate": 8.112953432663286e-06, + "loss": 0.1402, + "step": 2847 + }, + { + "epoch": 0.9228775113415425, + "grad_norm": 0.49322399497032166, + "learning_rate": 8.11158439699672e-06, + "loss": 0.1508, + "step": 2848 + }, + { + "epoch": 0.9232015554115359, + "grad_norm": 0.4861105978488922, + "learning_rate": 8.11021498050235e-06, + "loss": 0.154, + "step": 2849 + }, + { + "epoch": 0.9235255994815295, + "grad_norm": 0.44113120436668396, + "learning_rate": 8.108845183347773e-06, + "loss": 0.1416, + "step": 2850 + }, + { + "epoch": 0.923849643551523, + "grad_norm": 0.45397233963012695, + "learning_rate": 8.107475005700645e-06, + "loss": 0.137, + "step": 2851 + }, + { + "epoch": 0.9241736876215165, + "grad_norm": 0.4606253504753113, + "learning_rate": 8.106104447728656e-06, + "loss": 0.1401, + "step": 2852 + }, + { + "epoch": 0.9244977316915101, + "grad_norm": 0.5172616243362427, + "learning_rate": 8.104733509599552e-06, + "loss": 0.147, + "step": 2853 + }, + { + "epoch": 0.9248217757615036, + "grad_norm": 0.48049119114875793, + "learning_rate": 8.103362191481122e-06, + "loss": 0.1506, + "step": 2854 + }, + { + "epoch": 0.9251458198314971, + "grad_norm": 0.4916389286518097, + "learning_rate": 8.101990493541205e-06, + "loss": 0.1684, + "step": 2855 + }, + { + "epoch": 0.9254698639014906, + "grad_norm": 0.459881067276001, + "learning_rate": 8.10061841594768e-06, + "loss": 0.1599, + "step": 2856 + }, + { + "epoch": 0.9257939079714841, + "grad_norm": 0.5155965685844421, + "learning_rate": 8.099245958868478e-06, + "loss": 0.1479, + "step": 2857 + }, + { + "epoch": 0.9261179520414776, + "grad_norm": 0.47220301628112793, + "learning_rate": 8.097873122471571e-06, + "loss": 0.1543, + "step": 2858 + }, + { + "epoch": 0.9264419961114712, + "grad_norm": 0.47689226269721985, + "learning_rate": 8.096499906924987e-06, + "loss": 0.1375, + "step": 2859 + }, + { + "epoch": 0.9267660401814647, + "grad_norm": 0.5045191645622253, + "learning_rate": 8.095126312396789e-06, + "loss": 0.1537, + "step": 2860 + }, + { + "epoch": 0.9270900842514582, + "grad_norm": 0.43550044298171997, + "learning_rate": 8.093752339055094e-06, + "loss": 0.1357, + "step": 2861 + }, + { + "epoch": 0.9274141283214518, + "grad_norm": 0.4881744086742401, + "learning_rate": 8.09237798706806e-06, + "loss": 0.1501, + "step": 2862 + }, + { + "epoch": 0.9277381723914452, + "grad_norm": 0.48399439454078674, + "learning_rate": 8.0910032566039e-06, + "loss": 0.1465, + "step": 2863 + }, + { + "epoch": 0.9280622164614387, + "grad_norm": 0.4915633201599121, + "learning_rate": 8.089628147830864e-06, + "loss": 0.1465, + "step": 2864 + }, + { + "epoch": 0.9283862605314323, + "grad_norm": 0.5544672608375549, + "learning_rate": 8.088252660917253e-06, + "loss": 0.1691, + "step": 2865 + }, + { + "epoch": 0.9287103046014258, + "grad_norm": 0.4762614667415619, + "learning_rate": 8.086876796031411e-06, + "loss": 0.1371, + "step": 2866 + }, + { + "epoch": 0.9290343486714193, + "grad_norm": 0.4570804834365845, + "learning_rate": 8.085500553341734e-06, + "loss": 0.1379, + "step": 2867 + }, + { + "epoch": 0.9293583927414129, + "grad_norm": 0.4688291847705841, + "learning_rate": 8.08412393301666e-06, + "loss": 0.1434, + "step": 2868 + }, + { + "epoch": 0.9296824368114064, + "grad_norm": 0.4795081913471222, + "learning_rate": 8.082746935224673e-06, + "loss": 0.1268, + "step": 2869 + }, + { + "epoch": 0.9300064808813999, + "grad_norm": 0.4637512266635895, + "learning_rate": 8.081369560134303e-06, + "loss": 0.141, + "step": 2870 + }, + { + "epoch": 0.9303305249513933, + "grad_norm": 0.4551636576652527, + "learning_rate": 8.079991807914129e-06, + "loss": 0.1452, + "step": 2871 + }, + { + "epoch": 0.9306545690213869, + "grad_norm": 0.5067548155784607, + "learning_rate": 8.078613678732774e-06, + "loss": 0.1649, + "step": 2872 + }, + { + "epoch": 0.9309786130913804, + "grad_norm": 0.4417652189731598, + "learning_rate": 8.07723517275891e-06, + "loss": 0.1344, + "step": 2873 + }, + { + "epoch": 0.931302657161374, + "grad_norm": 0.4837871789932251, + "learning_rate": 8.075856290161251e-06, + "loss": 0.1475, + "step": 2874 + }, + { + "epoch": 0.9316267012313675, + "grad_norm": 0.4757400155067444, + "learning_rate": 8.074477031108556e-06, + "loss": 0.1395, + "step": 2875 + }, + { + "epoch": 0.931950745301361, + "grad_norm": 0.47112858295440674, + "learning_rate": 8.073097395769635e-06, + "loss": 0.1409, + "step": 2876 + }, + { + "epoch": 0.9322747893713546, + "grad_norm": 0.4562358856201172, + "learning_rate": 8.071717384313347e-06, + "loss": 0.1356, + "step": 2877 + }, + { + "epoch": 0.932598833441348, + "grad_norm": 0.45477864146232605, + "learning_rate": 8.070336996908585e-06, + "loss": 0.1267, + "step": 2878 + }, + { + "epoch": 0.9329228775113415, + "grad_norm": 0.47922149300575256, + "learning_rate": 8.068956233724298e-06, + "loss": 0.1351, + "step": 2879 + }, + { + "epoch": 0.933246921581335, + "grad_norm": 0.4852301776409149, + "learning_rate": 8.067575094929476e-06, + "loss": 0.1384, + "step": 2880 + }, + { + "epoch": 0.9335709656513286, + "grad_norm": 0.47646522521972656, + "learning_rate": 8.066193580693163e-06, + "loss": 0.1499, + "step": 2881 + }, + { + "epoch": 0.9338950097213221, + "grad_norm": 0.44006016850471497, + "learning_rate": 8.064811691184436e-06, + "loss": 0.126, + "step": 2882 + }, + { + "epoch": 0.9342190537913156, + "grad_norm": 0.4813414514064789, + "learning_rate": 8.063429426572427e-06, + "loss": 0.1448, + "step": 2883 + }, + { + "epoch": 0.9345430978613092, + "grad_norm": 0.46030181646347046, + "learning_rate": 8.062046787026314e-06, + "loss": 0.1282, + "step": 2884 + }, + { + "epoch": 0.9348671419313026, + "grad_norm": 0.4716332256793976, + "learning_rate": 8.060663772715318e-06, + "loss": 0.136, + "step": 2885 + }, + { + "epoch": 0.9351911860012961, + "grad_norm": 0.49652379751205444, + "learning_rate": 8.059280383808704e-06, + "loss": 0.1544, + "step": 2886 + }, + { + "epoch": 0.9355152300712897, + "grad_norm": 0.4331119656562805, + "learning_rate": 8.057896620475786e-06, + "loss": 0.1243, + "step": 2887 + }, + { + "epoch": 0.9358392741412832, + "grad_norm": 0.4984351098537445, + "learning_rate": 8.056512482885927e-06, + "loss": 0.1533, + "step": 2888 + }, + { + "epoch": 0.9361633182112767, + "grad_norm": 0.4716634452342987, + "learning_rate": 8.055127971208529e-06, + "loss": 0.1511, + "step": 2889 + }, + { + "epoch": 0.9364873622812703, + "grad_norm": 0.47842100262641907, + "learning_rate": 8.053743085613042e-06, + "loss": 0.1507, + "step": 2890 + }, + { + "epoch": 0.9368114063512638, + "grad_norm": 0.4523334801197052, + "learning_rate": 8.052357826268965e-06, + "loss": 0.143, + "step": 2891 + }, + { + "epoch": 0.9371354504212573, + "grad_norm": 0.5262511968612671, + "learning_rate": 8.05097219334584e-06, + "loss": 0.1683, + "step": 2892 + }, + { + "epoch": 0.9374594944912508, + "grad_norm": 0.458168625831604, + "learning_rate": 8.049586187013252e-06, + "loss": 0.133, + "step": 2893 + }, + { + "epoch": 0.9377835385612443, + "grad_norm": 0.4705049395561218, + "learning_rate": 8.048199807440838e-06, + "loss": 0.1464, + "step": 2894 + }, + { + "epoch": 0.9381075826312378, + "grad_norm": 0.4413962662220001, + "learning_rate": 8.046813054798274e-06, + "loss": 0.1377, + "step": 2895 + }, + { + "epoch": 0.9384316267012314, + "grad_norm": 0.461868554353714, + "learning_rate": 8.04542592925529e-06, + "loss": 0.1305, + "step": 2896 + }, + { + "epoch": 0.9387556707712249, + "grad_norm": 0.45427650213241577, + "learning_rate": 8.044038430981655e-06, + "loss": 0.1515, + "step": 2897 + }, + { + "epoch": 0.9390797148412184, + "grad_norm": 0.4581550061702728, + "learning_rate": 8.042650560147184e-06, + "loss": 0.1444, + "step": 2898 + }, + { + "epoch": 0.939403758911212, + "grad_norm": 0.4827260673046112, + "learning_rate": 8.041262316921741e-06, + "loss": 0.151, + "step": 2899 + }, + { + "epoch": 0.9397278029812054, + "grad_norm": 0.4749002158641815, + "learning_rate": 8.03987370147523e-06, + "loss": 0.1417, + "step": 2900 + }, + { + "epoch": 0.9400518470511989, + "grad_norm": 0.4695237874984741, + "learning_rate": 8.038484713977606e-06, + "loss": 0.146, + "step": 2901 + }, + { + "epoch": 0.9403758911211925, + "grad_norm": 0.48500707745552063, + "learning_rate": 8.037095354598869e-06, + "loss": 0.1451, + "step": 2902 + }, + { + "epoch": 0.940699935191186, + "grad_norm": 0.507603645324707, + "learning_rate": 8.03570562350906e-06, + "loss": 0.1548, + "step": 2903 + }, + { + "epoch": 0.9410239792611795, + "grad_norm": 0.5060255527496338, + "learning_rate": 8.034315520878272e-06, + "loss": 0.1522, + "step": 2904 + }, + { + "epoch": 0.9413480233311731, + "grad_norm": 0.45632651448249817, + "learning_rate": 8.03292504687664e-06, + "loss": 0.1385, + "step": 2905 + }, + { + "epoch": 0.9416720674011666, + "grad_norm": 0.4859046936035156, + "learning_rate": 8.031534201674342e-06, + "loss": 0.152, + "step": 2906 + }, + { + "epoch": 0.94199611147116, + "grad_norm": 0.4495350420475006, + "learning_rate": 8.030142985441605e-06, + "loss": 0.1347, + "step": 2907 + }, + { + "epoch": 0.9423201555411536, + "grad_norm": 0.4688016176223755, + "learning_rate": 8.028751398348702e-06, + "loss": 0.1465, + "step": 2908 + }, + { + "epoch": 0.9426441996111471, + "grad_norm": 0.4817104637622833, + "learning_rate": 8.027359440565946e-06, + "loss": 0.1512, + "step": 2909 + }, + { + "epoch": 0.9429682436811406, + "grad_norm": 0.46768316626548767, + "learning_rate": 8.025967112263704e-06, + "loss": 0.1559, + "step": 2910 + }, + { + "epoch": 0.9432922877511342, + "grad_norm": 0.45750781893730164, + "learning_rate": 8.02457441361238e-06, + "loss": 0.1401, + "step": 2911 + }, + { + "epoch": 0.9436163318211277, + "grad_norm": 0.5045716762542725, + "learning_rate": 8.023181344782426e-06, + "loss": 0.1468, + "step": 2912 + }, + { + "epoch": 0.9439403758911212, + "grad_norm": 0.4775066673755646, + "learning_rate": 8.021787905944346e-06, + "loss": 0.1443, + "step": 2913 + }, + { + "epoch": 0.9442644199611148, + "grad_norm": 0.48258528113365173, + "learning_rate": 8.020394097268677e-06, + "loss": 0.1496, + "step": 2914 + }, + { + "epoch": 0.9445884640311082, + "grad_norm": 0.4880499541759491, + "learning_rate": 8.01899991892601e-06, + "loss": 0.1441, + "step": 2915 + }, + { + "epoch": 0.9449125081011017, + "grad_norm": 0.45834702253341675, + "learning_rate": 8.01760537108698e-06, + "loss": 0.1348, + "step": 2916 + }, + { + "epoch": 0.9452365521710953, + "grad_norm": 0.465637743473053, + "learning_rate": 8.016210453922265e-06, + "loss": 0.1525, + "step": 2917 + }, + { + "epoch": 0.9455605962410888, + "grad_norm": 0.4635922908782959, + "learning_rate": 8.01481516760259e-06, + "loss": 0.1338, + "step": 2918 + }, + { + "epoch": 0.9458846403110823, + "grad_norm": 0.4392727315425873, + "learning_rate": 8.013419512298724e-06, + "loss": 0.1253, + "step": 2919 + }, + { + "epoch": 0.9462086843810759, + "grad_norm": 0.5187051892280579, + "learning_rate": 8.012023488181481e-06, + "loss": 0.1575, + "step": 2920 + }, + { + "epoch": 0.9465327284510694, + "grad_norm": 0.4780518114566803, + "learning_rate": 8.010627095421722e-06, + "loss": 0.1458, + "step": 2921 + }, + { + "epoch": 0.9468567725210628, + "grad_norm": 0.4945436120033264, + "learning_rate": 8.009230334190352e-06, + "loss": 0.1379, + "step": 2922 + }, + { + "epoch": 0.9471808165910564, + "grad_norm": 0.4752620756626129, + "learning_rate": 8.007833204658322e-06, + "loss": 0.1457, + "step": 2923 + }, + { + "epoch": 0.9475048606610499, + "grad_norm": 0.5349672436714172, + "learning_rate": 8.006435706996623e-06, + "loss": 0.1564, + "step": 2924 + }, + { + "epoch": 0.9478289047310434, + "grad_norm": 0.5222102999687195, + "learning_rate": 8.0050378413763e-06, + "loss": 0.1553, + "step": 2925 + }, + { + "epoch": 0.948152948801037, + "grad_norm": 0.47248953580856323, + "learning_rate": 8.003639607968436e-06, + "loss": 0.1483, + "step": 2926 + }, + { + "epoch": 0.9484769928710305, + "grad_norm": 0.46400532126426697, + "learning_rate": 8.00224100694416e-06, + "loss": 0.1316, + "step": 2927 + }, + { + "epoch": 0.948801036941024, + "grad_norm": 0.48341265320777893, + "learning_rate": 8.000842038474652e-06, + "loss": 0.1511, + "step": 2928 + }, + { + "epoch": 0.9491250810110174, + "grad_norm": 0.508072555065155, + "learning_rate": 7.999442702731127e-06, + "loss": 0.1748, + "step": 2929 + }, + { + "epoch": 0.949449125081011, + "grad_norm": 0.5086212158203125, + "learning_rate": 7.99804299988485e-06, + "loss": 0.15, + "step": 2930 + }, + { + "epoch": 0.9497731691510045, + "grad_norm": 0.4628218114376068, + "learning_rate": 7.996642930107136e-06, + "loss": 0.1561, + "step": 2931 + }, + { + "epoch": 0.950097213220998, + "grad_norm": 0.45513710379600525, + "learning_rate": 7.995242493569335e-06, + "loss": 0.1385, + "step": 2932 + }, + { + "epoch": 0.9504212572909916, + "grad_norm": 0.42527732253074646, + "learning_rate": 7.99384169044285e-06, + "loss": 0.1277, + "step": 2933 + }, + { + "epoch": 0.9507453013609851, + "grad_norm": 0.4785211682319641, + "learning_rate": 7.992440520899126e-06, + "loss": 0.1344, + "step": 2934 + }, + { + "epoch": 0.9510693454309787, + "grad_norm": 0.4923803210258484, + "learning_rate": 7.991038985109649e-06, + "loss": 0.1598, + "step": 2935 + }, + { + "epoch": 0.9513933895009722, + "grad_norm": 0.45439448952674866, + "learning_rate": 7.989637083245958e-06, + "loss": 0.1386, + "step": 2936 + }, + { + "epoch": 0.9517174335709656, + "grad_norm": 0.5161374807357788, + "learning_rate": 7.988234815479629e-06, + "loss": 0.169, + "step": 2937 + }, + { + "epoch": 0.9520414776409591, + "grad_norm": 0.4716404378414154, + "learning_rate": 7.986832181982286e-06, + "loss": 0.1378, + "step": 2938 + }, + { + "epoch": 0.9523655217109527, + "grad_norm": 0.4898372292518616, + "learning_rate": 7.985429182925599e-06, + "loss": 0.1373, + "step": 2939 + }, + { + "epoch": 0.9526895657809462, + "grad_norm": 0.49842342734336853, + "learning_rate": 7.984025818481283e-06, + "loss": 0.1486, + "step": 2940 + }, + { + "epoch": 0.9530136098509397, + "grad_norm": 0.44580867886543274, + "learning_rate": 7.982622088821092e-06, + "loss": 0.136, + "step": 2941 + }, + { + "epoch": 0.9533376539209333, + "grad_norm": 0.47177910804748535, + "learning_rate": 7.981217994116833e-06, + "loss": 0.1461, + "step": 2942 + }, + { + "epoch": 0.9536616979909268, + "grad_norm": 0.4503575563430786, + "learning_rate": 7.97981353454035e-06, + "loss": 0.1307, + "step": 2943 + }, + { + "epoch": 0.9539857420609202, + "grad_norm": 0.5025628209114075, + "learning_rate": 7.978408710263538e-06, + "loss": 0.1553, + "step": 2944 + }, + { + "epoch": 0.9543097861309138, + "grad_norm": 0.46699047088623047, + "learning_rate": 7.977003521458336e-06, + "loss": 0.1358, + "step": 2945 + }, + { + "epoch": 0.9546338302009073, + "grad_norm": 0.47436779737472534, + "learning_rate": 7.97559796829672e-06, + "loss": 0.1404, + "step": 2946 + }, + { + "epoch": 0.9549578742709008, + "grad_norm": 0.5132781267166138, + "learning_rate": 7.97419205095072e-06, + "loss": 0.1638, + "step": 2947 + }, + { + "epoch": 0.9552819183408944, + "grad_norm": 0.4697002172470093, + "learning_rate": 7.972785769592404e-06, + "loss": 0.1535, + "step": 2948 + }, + { + "epoch": 0.9556059624108879, + "grad_norm": 0.46338951587677, + "learning_rate": 7.971379124393887e-06, + "loss": 0.1456, + "step": 2949 + }, + { + "epoch": 0.9559300064808814, + "grad_norm": 0.45680999755859375, + "learning_rate": 7.969972115527334e-06, + "loss": 0.1406, + "step": 2950 + }, + { + "epoch": 0.9562540505508749, + "grad_norm": 0.4780776798725128, + "learning_rate": 7.968564743164944e-06, + "loss": 0.1475, + "step": 2951 + }, + { + "epoch": 0.9565780946208684, + "grad_norm": 0.4549115300178528, + "learning_rate": 7.967157007478967e-06, + "loss": 0.1313, + "step": 2952 + }, + { + "epoch": 0.9569021386908619, + "grad_norm": 0.49157506227493286, + "learning_rate": 7.965748908641698e-06, + "loss": 0.1502, + "step": 2953 + }, + { + "epoch": 0.9572261827608555, + "grad_norm": 0.46433138847351074, + "learning_rate": 7.96434044682547e-06, + "loss": 0.1488, + "step": 2954 + }, + { + "epoch": 0.957550226830849, + "grad_norm": 0.48217496275901794, + "learning_rate": 7.96293162220267e-06, + "loss": 0.1609, + "step": 2955 + }, + { + "epoch": 0.9578742709008425, + "grad_norm": 0.42298516631126404, + "learning_rate": 7.961522434945723e-06, + "loss": 0.1365, + "step": 2956 + }, + { + "epoch": 0.9581983149708361, + "grad_norm": 0.5255170464515686, + "learning_rate": 7.9601128852271e-06, + "loss": 0.1532, + "step": 2957 + }, + { + "epoch": 0.9585223590408296, + "grad_norm": 0.42979586124420166, + "learning_rate": 7.958702973219317e-06, + "loss": 0.1408, + "step": 2958 + }, + { + "epoch": 0.958846403110823, + "grad_norm": 0.4746703803539276, + "learning_rate": 7.957292699094932e-06, + "loss": 0.152, + "step": 2959 + }, + { + "epoch": 0.9591704471808166, + "grad_norm": 0.49901002645492554, + "learning_rate": 7.95588206302655e-06, + "loss": 0.1497, + "step": 2960 + }, + { + "epoch": 0.9594944912508101, + "grad_norm": 0.4546007215976715, + "learning_rate": 7.954471065186816e-06, + "loss": 0.1389, + "step": 2961 + }, + { + "epoch": 0.9598185353208036, + "grad_norm": 0.4283216595649719, + "learning_rate": 7.953059705748427e-06, + "loss": 0.1305, + "step": 2962 + }, + { + "epoch": 0.9601425793907972, + "grad_norm": 0.49616873264312744, + "learning_rate": 7.951647984884116e-06, + "loss": 0.1507, + "step": 2963 + }, + { + "epoch": 0.9604666234607907, + "grad_norm": 0.46941477060317993, + "learning_rate": 7.950235902766668e-06, + "loss": 0.1482, + "step": 2964 + }, + { + "epoch": 0.9607906675307842, + "grad_norm": 0.4654848277568817, + "learning_rate": 7.948823459568907e-06, + "loss": 0.1465, + "step": 2965 + }, + { + "epoch": 0.9611147116007777, + "grad_norm": 0.49566254019737244, + "learning_rate": 7.947410655463699e-06, + "loss": 0.1544, + "step": 2966 + }, + { + "epoch": 0.9614387556707712, + "grad_norm": 0.4398048222064972, + "learning_rate": 7.94599749062396e-06, + "loss": 0.1377, + "step": 2967 + }, + { + "epoch": 0.9617627997407647, + "grad_norm": 0.4460553824901581, + "learning_rate": 7.94458396522265e-06, + "loss": 0.144, + "step": 2968 + }, + { + "epoch": 0.9620868438107583, + "grad_norm": 0.4828304052352905, + "learning_rate": 7.943170079432764e-06, + "loss": 0.1702, + "step": 2969 + }, + { + "epoch": 0.9624108878807518, + "grad_norm": 0.4819565415382385, + "learning_rate": 7.941755833427356e-06, + "loss": 0.1525, + "step": 2970 + }, + { + "epoch": 0.9627349319507453, + "grad_norm": 0.4813225567340851, + "learning_rate": 7.940341227379513e-06, + "loss": 0.1503, + "step": 2971 + }, + { + "epoch": 0.9630589760207389, + "grad_norm": 0.46761325001716614, + "learning_rate": 7.938926261462366e-06, + "loss": 0.153, + "step": 2972 + }, + { + "epoch": 0.9633830200907323, + "grad_norm": 0.4724077880382538, + "learning_rate": 7.937510935849097e-06, + "loss": 0.1488, + "step": 2973 + }, + { + "epoch": 0.9637070641607258, + "grad_norm": 0.45716893672943115, + "learning_rate": 7.936095250712926e-06, + "loss": 0.1441, + "step": 2974 + }, + { + "epoch": 0.9640311082307194, + "grad_norm": 0.45643728971481323, + "learning_rate": 7.93467920622712e-06, + "loss": 0.1294, + "step": 2975 + }, + { + "epoch": 0.9643551523007129, + "grad_norm": 0.4829012155532837, + "learning_rate": 7.93326280256499e-06, + "loss": 0.152, + "step": 2976 + }, + { + "epoch": 0.9646791963707064, + "grad_norm": 0.4712313413619995, + "learning_rate": 7.931846039899888e-06, + "loss": 0.135, + "step": 2977 + }, + { + "epoch": 0.9650032404407, + "grad_norm": 0.4900357723236084, + "learning_rate": 7.930428918405213e-06, + "loss": 0.1479, + "step": 2978 + }, + { + "epoch": 0.9653272845106935, + "grad_norm": 0.48142194747924805, + "learning_rate": 7.92901143825441e-06, + "loss": 0.143, + "step": 2979 + }, + { + "epoch": 0.9656513285806869, + "grad_norm": 0.5146341919898987, + "learning_rate": 7.927593599620958e-06, + "loss": 0.1517, + "step": 2980 + }, + { + "epoch": 0.9659753726506805, + "grad_norm": 0.4810130000114441, + "learning_rate": 7.926175402678393e-06, + "loss": 0.1487, + "step": 2981 + }, + { + "epoch": 0.966299416720674, + "grad_norm": 0.48700380325317383, + "learning_rate": 7.924756847600285e-06, + "loss": 0.1543, + "step": 2982 + }, + { + "epoch": 0.9666234607906675, + "grad_norm": 0.44338053464889526, + "learning_rate": 7.923337934560255e-06, + "loss": 0.1311, + "step": 2983 + }, + { + "epoch": 0.9669475048606611, + "grad_norm": 0.46160584688186646, + "learning_rate": 7.92191866373196e-06, + "loss": 0.1355, + "step": 2984 + }, + { + "epoch": 0.9672715489306546, + "grad_norm": 0.4628986418247223, + "learning_rate": 7.920499035289106e-06, + "loss": 0.1343, + "step": 2985 + }, + { + "epoch": 0.9675955930006481, + "grad_norm": 0.5248939394950867, + "learning_rate": 7.919079049405444e-06, + "loss": 0.1508, + "step": 2986 + }, + { + "epoch": 0.9679196370706417, + "grad_norm": 0.4699530005455017, + "learning_rate": 7.917658706254766e-06, + "loss": 0.134, + "step": 2987 + }, + { + "epoch": 0.9682436811406351, + "grad_norm": 0.5100036859512329, + "learning_rate": 7.916238006010906e-06, + "loss": 0.1496, + "step": 2988 + }, + { + "epoch": 0.9685677252106286, + "grad_norm": 0.5195657014846802, + "learning_rate": 7.914816948847747e-06, + "loss": 0.1613, + "step": 2989 + }, + { + "epoch": 0.9688917692806222, + "grad_norm": 0.45567071437835693, + "learning_rate": 7.913395534939212e-06, + "loss": 0.135, + "step": 2990 + }, + { + "epoch": 0.9692158133506157, + "grad_norm": 0.49796679615974426, + "learning_rate": 7.911973764459264e-06, + "loss": 0.159, + "step": 2991 + }, + { + "epoch": 0.9695398574206092, + "grad_norm": 0.47392913699150085, + "learning_rate": 7.91055163758192e-06, + "loss": 0.146, + "step": 2992 + }, + { + "epoch": 0.9698639014906028, + "grad_norm": 0.5466551184654236, + "learning_rate": 7.90912915448123e-06, + "loss": 0.1613, + "step": 2993 + }, + { + "epoch": 0.9701879455605963, + "grad_norm": 0.4669461250305176, + "learning_rate": 7.907706315331293e-06, + "loss": 0.1338, + "step": 2994 + }, + { + "epoch": 0.9705119896305897, + "grad_norm": 0.479095458984375, + "learning_rate": 7.906283120306256e-06, + "loss": 0.149, + "step": 2995 + }, + { + "epoch": 0.9708360337005832, + "grad_norm": 0.5312058925628662, + "learning_rate": 7.904859569580296e-06, + "loss": 0.1571, + "step": 2996 + }, + { + "epoch": 0.9711600777705768, + "grad_norm": 0.5019996166229248, + "learning_rate": 7.90343566332765e-06, + "loss": 0.1458, + "step": 2997 + }, + { + "epoch": 0.9714841218405703, + "grad_norm": 0.4630244970321655, + "learning_rate": 7.902011401722582e-06, + "loss": 0.1517, + "step": 2998 + }, + { + "epoch": 0.9718081659105638, + "grad_norm": 0.45654380321502686, + "learning_rate": 7.900586784939415e-06, + "loss": 0.1366, + "step": 2999 + }, + { + "epoch": 0.9721322099805574, + "grad_norm": 0.5140264630317688, + "learning_rate": 7.899161813152504e-06, + "loss": 0.1395, + "step": 3000 + }, + { + "epoch": 0.9724562540505509, + "grad_norm": 0.46823394298553467, + "learning_rate": 7.897736486536254e-06, + "loss": 0.149, + "step": 3001 + }, + { + "epoch": 0.9727802981205443, + "grad_norm": 0.44872385263442993, + "learning_rate": 7.896310805265109e-06, + "loss": 0.1349, + "step": 3002 + }, + { + "epoch": 0.9731043421905379, + "grad_norm": 0.4944060742855072, + "learning_rate": 7.89488476951356e-06, + "loss": 0.1648, + "step": 3003 + }, + { + "epoch": 0.9734283862605314, + "grad_norm": 0.4401301145553589, + "learning_rate": 7.89345837945614e-06, + "loss": 0.13, + "step": 3004 + }, + { + "epoch": 0.9737524303305249, + "grad_norm": 0.47425198554992676, + "learning_rate": 7.892031635267427e-06, + "loss": 0.1369, + "step": 3005 + }, + { + "epoch": 0.9740764744005185, + "grad_norm": 0.42337566614151, + "learning_rate": 7.890604537122038e-06, + "loss": 0.1317, + "step": 3006 + }, + { + "epoch": 0.974400518470512, + "grad_norm": 0.4444950819015503, + "learning_rate": 7.889177085194638e-06, + "loss": 0.1334, + "step": 3007 + }, + { + "epoch": 0.9747245625405055, + "grad_norm": 0.4752909243106842, + "learning_rate": 7.887749279659928e-06, + "loss": 0.1539, + "step": 3008 + }, + { + "epoch": 0.9750486066104991, + "grad_norm": 0.4772418737411499, + "learning_rate": 7.886321120692664e-06, + "loss": 0.1417, + "step": 3009 + }, + { + "epoch": 0.9753726506804925, + "grad_norm": 0.4452485740184784, + "learning_rate": 7.884892608467638e-06, + "loss": 0.1346, + "step": 3010 + }, + { + "epoch": 0.975696694750486, + "grad_norm": 0.4744517505168915, + "learning_rate": 7.883463743159685e-06, + "loss": 0.1426, + "step": 3011 + }, + { + "epoch": 0.9760207388204796, + "grad_norm": 0.5261785387992859, + "learning_rate": 7.88203452494368e-06, + "loss": 0.1681, + "step": 3012 + }, + { + "epoch": 0.9763447828904731, + "grad_norm": 0.5469948649406433, + "learning_rate": 7.880604953994553e-06, + "loss": 0.1513, + "step": 3013 + }, + { + "epoch": 0.9766688269604666, + "grad_norm": 0.44291892647743225, + "learning_rate": 7.879175030487264e-06, + "loss": 0.1317, + "step": 3014 + }, + { + "epoch": 0.9769928710304602, + "grad_norm": 0.47622525691986084, + "learning_rate": 7.877744754596826e-06, + "loss": 0.1487, + "step": 3015 + }, + { + "epoch": 0.9773169151004537, + "grad_norm": 0.4686462879180908, + "learning_rate": 7.876314126498288e-06, + "loss": 0.1419, + "step": 3016 + }, + { + "epoch": 0.9776409591704471, + "grad_norm": 0.4816147983074188, + "learning_rate": 7.874883146366746e-06, + "loss": 0.1451, + "step": 3017 + }, + { + "epoch": 0.9779650032404407, + "grad_norm": 0.44923725724220276, + "learning_rate": 7.873451814377336e-06, + "loss": 0.1344, + "step": 3018 + }, + { + "epoch": 0.9782890473104342, + "grad_norm": 0.5001899003982544, + "learning_rate": 7.872020130705244e-06, + "loss": 0.157, + "step": 3019 + }, + { + "epoch": 0.9786130913804277, + "grad_norm": 0.45089191198349, + "learning_rate": 7.870588095525688e-06, + "loss": 0.1354, + "step": 3020 + }, + { + "epoch": 0.9789371354504213, + "grad_norm": 0.49082210659980774, + "learning_rate": 7.86915570901394e-06, + "loss": 0.1423, + "step": 3021 + }, + { + "epoch": 0.9792611795204148, + "grad_norm": 0.46292129158973694, + "learning_rate": 7.867722971345311e-06, + "loss": 0.1487, + "step": 3022 + }, + { + "epoch": 0.9795852235904083, + "grad_norm": 0.48895570635795593, + "learning_rate": 7.866289882695148e-06, + "loss": 0.1564, + "step": 3023 + }, + { + "epoch": 0.9799092676604018, + "grad_norm": 0.4756009578704834, + "learning_rate": 7.864856443238854e-06, + "loss": 0.1499, + "step": 3024 + }, + { + "epoch": 0.9802333117303953, + "grad_norm": 0.46246933937072754, + "learning_rate": 7.863422653151866e-06, + "loss": 0.1424, + "step": 3025 + }, + { + "epoch": 0.9805573558003888, + "grad_norm": 0.4539702832698822, + "learning_rate": 7.861988512609663e-06, + "loss": 0.1429, + "step": 3026 + }, + { + "epoch": 0.9808813998703824, + "grad_norm": 0.512012243270874, + "learning_rate": 7.860554021787774e-06, + "loss": 0.1603, + "step": 3027 + }, + { + "epoch": 0.9812054439403759, + "grad_norm": 0.47591957449913025, + "learning_rate": 7.859119180861762e-06, + "loss": 0.1554, + "step": 3028 + }, + { + "epoch": 0.9815294880103694, + "grad_norm": 0.5062115788459778, + "learning_rate": 7.857683990007245e-06, + "loss": 0.1507, + "step": 3029 + }, + { + "epoch": 0.981853532080363, + "grad_norm": 0.40852585434913635, + "learning_rate": 7.85624844939987e-06, + "loss": 0.1163, + "step": 3030 + }, + { + "epoch": 0.9821775761503565, + "grad_norm": 0.5025544762611389, + "learning_rate": 7.854812559215335e-06, + "loss": 0.1466, + "step": 3031 + }, + { + "epoch": 0.9825016202203499, + "grad_norm": 0.495726615190506, + "learning_rate": 7.85337631962938e-06, + "loss": 0.1519, + "step": 3032 + }, + { + "epoch": 0.9828256642903435, + "grad_norm": 0.5085958242416382, + "learning_rate": 7.851939730817786e-06, + "loss": 0.1554, + "step": 3033 + }, + { + "epoch": 0.983149708360337, + "grad_norm": 0.4771917462348938, + "learning_rate": 7.850502792956378e-06, + "loss": 0.1487, + "step": 3034 + }, + { + "epoch": 0.9834737524303305, + "grad_norm": 0.4856005311012268, + "learning_rate": 7.849065506221023e-06, + "loss": 0.1512, + "step": 3035 + }, + { + "epoch": 0.9837977965003241, + "grad_norm": 0.4857740104198456, + "learning_rate": 7.847627870787632e-06, + "loss": 0.1454, + "step": 3036 + }, + { + "epoch": 0.9841218405703176, + "grad_norm": 0.46327874064445496, + "learning_rate": 7.846189886832157e-06, + "loss": 0.1298, + "step": 3037 + }, + { + "epoch": 0.9844458846403111, + "grad_norm": 0.5154156684875488, + "learning_rate": 7.844751554530593e-06, + "loss": 0.1593, + "step": 3038 + }, + { + "epoch": 0.9847699287103046, + "grad_norm": 0.45782309770584106, + "learning_rate": 7.843312874058976e-06, + "loss": 0.1501, + "step": 3039 + }, + { + "epoch": 0.9850939727802981, + "grad_norm": 0.479270875453949, + "learning_rate": 7.841873845593389e-06, + "loss": 0.1385, + "step": 3040 + }, + { + "epoch": 0.9854180168502916, + "grad_norm": 0.4804162085056305, + "learning_rate": 7.840434469309956e-06, + "loss": 0.14, + "step": 3041 + }, + { + "epoch": 0.9857420609202852, + "grad_norm": 0.4845614731311798, + "learning_rate": 7.838994745384842e-06, + "loss": 0.1583, + "step": 3042 + }, + { + "epoch": 0.9860661049902787, + "grad_norm": 0.48938602209091187, + "learning_rate": 7.837554673994254e-06, + "loss": 0.1418, + "step": 3043 + }, + { + "epoch": 0.9863901490602722, + "grad_norm": 0.40282419323921204, + "learning_rate": 7.836114255314444e-06, + "loss": 0.1175, + "step": 3044 + }, + { + "epoch": 0.9867141931302658, + "grad_norm": 0.4799858033657074, + "learning_rate": 7.834673489521705e-06, + "loss": 0.1665, + "step": 3045 + }, + { + "epoch": 0.9870382372002592, + "grad_norm": 0.4240192770957947, + "learning_rate": 7.83323237679237e-06, + "loss": 0.1349, + "step": 3046 + }, + { + "epoch": 0.9873622812702527, + "grad_norm": 0.48984846472740173, + "learning_rate": 7.831790917302822e-06, + "loss": 0.1513, + "step": 3047 + }, + { + "epoch": 0.9876863253402463, + "grad_norm": 0.46244630217552185, + "learning_rate": 7.830349111229481e-06, + "loss": 0.1373, + "step": 3048 + }, + { + "epoch": 0.9880103694102398, + "grad_norm": 0.472954124212265, + "learning_rate": 7.828906958748806e-06, + "loss": 0.1471, + "step": 3049 + }, + { + "epoch": 0.9883344134802333, + "grad_norm": 0.45337915420532227, + "learning_rate": 7.827464460037308e-06, + "loss": 0.1455, + "step": 3050 + }, + { + "epoch": 0.9886584575502269, + "grad_norm": 0.44221627712249756, + "learning_rate": 7.82602161527153e-06, + "loss": 0.1502, + "step": 3051 + }, + { + "epoch": 0.9889825016202204, + "grad_norm": 0.4312097728252411, + "learning_rate": 7.824578424628065e-06, + "loss": 0.1238, + "step": 3052 + }, + { + "epoch": 0.9893065456902139, + "grad_norm": 0.460366815328598, + "learning_rate": 7.823134888283543e-06, + "loss": 0.141, + "step": 3053 + }, + { + "epoch": 0.9896305897602073, + "grad_norm": 0.5097335577011108, + "learning_rate": 7.821691006414644e-06, + "loss": 0.1538, + "step": 3054 + }, + { + "epoch": 0.9899546338302009, + "grad_norm": 0.4305676817893982, + "learning_rate": 7.820246779198079e-06, + "loss": 0.1333, + "step": 3055 + }, + { + "epoch": 0.9902786779001944, + "grad_norm": 0.4182222783565521, + "learning_rate": 7.818802206810613e-06, + "loss": 0.1268, + "step": 3056 + }, + { + "epoch": 0.990602721970188, + "grad_norm": 0.4591180682182312, + "learning_rate": 7.817357289429044e-06, + "loss": 0.1399, + "step": 3057 + }, + { + "epoch": 0.9909267660401815, + "grad_norm": 0.4615064263343811, + "learning_rate": 7.815912027230216e-06, + "loss": 0.1357, + "step": 3058 + }, + { + "epoch": 0.991250810110175, + "grad_norm": 0.4499772787094116, + "learning_rate": 7.814466420391017e-06, + "loss": 0.1393, + "step": 3059 + }, + { + "epoch": 0.9915748541801686, + "grad_norm": 0.4455864131450653, + "learning_rate": 7.813020469088372e-06, + "loss": 0.1386, + "step": 3060 + }, + { + "epoch": 0.991898898250162, + "grad_norm": 0.4960353672504425, + "learning_rate": 7.811574173499257e-06, + "loss": 0.1453, + "step": 3061 + }, + { + "epoch": 0.9922229423201555, + "grad_norm": 0.43458986282348633, + "learning_rate": 7.81012753380068e-06, + "loss": 0.1415, + "step": 3062 + }, + { + "epoch": 0.992546986390149, + "grad_norm": 0.4443108141422272, + "learning_rate": 7.808680550169696e-06, + "loss": 0.1265, + "step": 3063 + }, + { + "epoch": 0.9928710304601426, + "grad_norm": 0.47840794920921326, + "learning_rate": 7.807233222783403e-06, + "loss": 0.1466, + "step": 3064 + }, + { + "epoch": 0.9931950745301361, + "grad_norm": 0.4474855661392212, + "learning_rate": 7.80578555181894e-06, + "loss": 0.137, + "step": 3065 + }, + { + "epoch": 0.9935191186001296, + "grad_norm": 0.4584507346153259, + "learning_rate": 7.80433753745349e-06, + "loss": 0.1394, + "step": 3066 + }, + { + "epoch": 0.9938431626701232, + "grad_norm": 0.432372510433197, + "learning_rate": 7.802889179864271e-06, + "loss": 0.1229, + "step": 3067 + }, + { + "epoch": 0.9941672067401166, + "grad_norm": 0.4571312963962555, + "learning_rate": 7.80144047922855e-06, + "loss": 0.1348, + "step": 3068 + }, + { + "epoch": 0.9944912508101101, + "grad_norm": 0.4718877673149109, + "learning_rate": 7.799991435723637e-06, + "loss": 0.1406, + "step": 3069 + }, + { + "epoch": 0.9948152948801037, + "grad_norm": 0.4299832284450531, + "learning_rate": 7.798542049526875e-06, + "loss": 0.1359, + "step": 3070 + }, + { + "epoch": 0.9951393389500972, + "grad_norm": 0.4358609914779663, + "learning_rate": 7.79709232081566e-06, + "loss": 0.1338, + "step": 3071 + }, + { + "epoch": 0.9954633830200907, + "grad_norm": 0.43104878067970276, + "learning_rate": 7.795642249767423e-06, + "loss": 0.1211, + "step": 3072 + }, + { + "epoch": 0.9957874270900843, + "grad_norm": 0.4711206555366516, + "learning_rate": 7.794191836559637e-06, + "loss": 0.139, + "step": 3073 + }, + { + "epoch": 0.9961114711600778, + "grad_norm": 0.538809061050415, + "learning_rate": 7.792741081369822e-06, + "loss": 0.1673, + "step": 3074 + }, + { + "epoch": 0.9964355152300713, + "grad_norm": 0.4683513343334198, + "learning_rate": 7.791289984375534e-06, + "loss": 0.1347, + "step": 3075 + }, + { + "epoch": 0.9967595593000648, + "grad_norm": 0.47475746273994446, + "learning_rate": 7.789838545754373e-06, + "loss": 0.1573, + "step": 3076 + }, + { + "epoch": 0.9970836033700583, + "grad_norm": 0.45275938510894775, + "learning_rate": 7.788386765683982e-06, + "loss": 0.1387, + "step": 3077 + }, + { + "epoch": 0.9974076474400518, + "grad_norm": 0.4447527825832367, + "learning_rate": 7.786934644342044e-06, + "loss": 0.1334, + "step": 3078 + }, + { + "epoch": 0.9977316915100454, + "grad_norm": 0.43458884954452515, + "learning_rate": 7.785482181906286e-06, + "loss": 0.1371, + "step": 3079 + }, + { + "epoch": 0.9980557355800389, + "grad_norm": 0.4534459412097931, + "learning_rate": 7.784029378554475e-06, + "loss": 0.1389, + "step": 3080 + }, + { + "epoch": 0.9983797796500324, + "grad_norm": 0.4595879018306732, + "learning_rate": 7.782576234464419e-06, + "loss": 0.1341, + "step": 3081 + }, + { + "epoch": 0.998703823720026, + "grad_norm": 0.5015422701835632, + "learning_rate": 7.78112274981397e-06, + "loss": 0.1518, + "step": 3082 + }, + { + "epoch": 0.9990278677900194, + "grad_norm": 0.45415136218070984, + "learning_rate": 7.779668924781017e-06, + "loss": 0.1281, + "step": 3083 + }, + { + "epoch": 0.9993519118600129, + "grad_norm": 0.4373987913131714, + "learning_rate": 7.778214759543498e-06, + "loss": 0.1298, + "step": 3084 + }, + { + "epoch": 0.9996759559300065, + "grad_norm": 0.44714266061782837, + "learning_rate": 7.77676025427939e-06, + "loss": 0.1328, + "step": 3085 + }, + { + "epoch": 1.0, + "grad_norm": 0.5027826428413391, + "learning_rate": 7.775305409166707e-06, + "loss": 0.1434, + "step": 3086 + }, + { + "epoch": 1.0003240440699934, + "grad_norm": 0.44878828525543213, + "learning_rate": 7.773850224383509e-06, + "loss": 0.1197, + "step": 3087 + }, + { + "epoch": 1.000648088139987, + "grad_norm": 0.40076735615730286, + "learning_rate": 7.772394700107895e-06, + "loss": 0.1014, + "step": 3088 + }, + { + "epoch": 1.0009721322099805, + "grad_norm": 0.4214591979980469, + "learning_rate": 7.77093883651801e-06, + "loss": 0.1088, + "step": 3089 + }, + { + "epoch": 1.0012961762799741, + "grad_norm": 0.4249264895915985, + "learning_rate": 7.769482633792035e-06, + "loss": 0.1148, + "step": 3090 + }, + { + "epoch": 1.0016202203499676, + "grad_norm": 0.4293353259563446, + "learning_rate": 7.768026092108196e-06, + "loss": 0.1062, + "step": 3091 + }, + { + "epoch": 1.0019442644199612, + "grad_norm": 0.4549325704574585, + "learning_rate": 7.766569211644763e-06, + "loss": 0.1124, + "step": 3092 + }, + { + "epoch": 1.0022683084899546, + "grad_norm": 0.4095253050327301, + "learning_rate": 7.765111992580038e-06, + "loss": 0.1027, + "step": 3093 + }, + { + "epoch": 1.002592352559948, + "grad_norm": 0.47369253635406494, + "learning_rate": 7.763654435092374e-06, + "loss": 0.1194, + "step": 3094 + }, + { + "epoch": 1.0029163966299417, + "grad_norm": 0.45340731739997864, + "learning_rate": 7.762196539360161e-06, + "loss": 0.1014, + "step": 3095 + }, + { + "epoch": 1.0032404406999351, + "grad_norm": 0.5088295340538025, + "learning_rate": 7.760738305561832e-06, + "loss": 0.1169, + "step": 3096 + }, + { + "epoch": 1.0035644847699288, + "grad_norm": 0.4732248783111572, + "learning_rate": 7.759279733875862e-06, + "loss": 0.113, + "step": 3097 + }, + { + "epoch": 1.0038885288399222, + "grad_norm": 0.5000881552696228, + "learning_rate": 7.757820824480763e-06, + "loss": 0.111, + "step": 3098 + }, + { + "epoch": 1.0042125729099158, + "grad_norm": 0.4899469316005707, + "learning_rate": 7.756361577555093e-06, + "loss": 0.1219, + "step": 3099 + }, + { + "epoch": 1.0045366169799093, + "grad_norm": 0.4718400537967682, + "learning_rate": 7.75490199327745e-06, + "loss": 0.1184, + "step": 3100 + }, + { + "epoch": 1.0048606610499027, + "grad_norm": 0.46882617473602295, + "learning_rate": 7.753442071826472e-06, + "loss": 0.1115, + "step": 3101 + }, + { + "epoch": 1.0051847051198963, + "grad_norm": 0.4647389054298401, + "learning_rate": 7.75198181338084e-06, + "loss": 0.1054, + "step": 3102 + }, + { + "epoch": 1.0055087491898898, + "grad_norm": 0.4482908844947815, + "learning_rate": 7.750521218119275e-06, + "loss": 0.1128, + "step": 3103 + }, + { + "epoch": 1.0058327932598834, + "grad_norm": 0.43499132990837097, + "learning_rate": 7.74906028622054e-06, + "loss": 0.1029, + "step": 3104 + }, + { + "epoch": 1.0061568373298768, + "grad_norm": 0.47596755623817444, + "learning_rate": 7.74759901786344e-06, + "loss": 0.1197, + "step": 3105 + }, + { + "epoch": 1.0064808813998705, + "grad_norm": 0.46033400297164917, + "learning_rate": 7.746137413226817e-06, + "loss": 0.1119, + "step": 3106 + }, + { + "epoch": 1.0068049254698639, + "grad_norm": 0.48392659425735474, + "learning_rate": 7.744675472489561e-06, + "loss": 0.1264, + "step": 3107 + }, + { + "epoch": 1.0071289695398573, + "grad_norm": 0.45975932478904724, + "learning_rate": 7.743213195830597e-06, + "loss": 0.1033, + "step": 3108 + }, + { + "epoch": 1.007453013609851, + "grad_norm": 0.4883347153663635, + "learning_rate": 7.741750583428895e-06, + "loss": 0.1193, + "step": 3109 + }, + { + "epoch": 1.0077770576798444, + "grad_norm": 0.4730301797389984, + "learning_rate": 7.740287635463464e-06, + "loss": 0.1184, + "step": 3110 + }, + { + "epoch": 1.008101101749838, + "grad_norm": 0.45033764839172363, + "learning_rate": 7.738824352113353e-06, + "loss": 0.1151, + "step": 3111 + }, + { + "epoch": 1.0084251458198314, + "grad_norm": 0.44802072644233704, + "learning_rate": 7.737360733557656e-06, + "loss": 0.1129, + "step": 3112 + }, + { + "epoch": 1.008749189889825, + "grad_norm": 0.4515936076641083, + "learning_rate": 7.735896779975504e-06, + "loss": 0.113, + "step": 3113 + }, + { + "epoch": 1.0090732339598185, + "grad_norm": 0.47022631764411926, + "learning_rate": 7.734432491546073e-06, + "loss": 0.115, + "step": 3114 + }, + { + "epoch": 1.0093972780298122, + "grad_norm": 0.4840375781059265, + "learning_rate": 7.732967868448576e-06, + "loss": 0.1191, + "step": 3115 + }, + { + "epoch": 1.0097213220998056, + "grad_norm": 0.431972473859787, + "learning_rate": 7.731502910862268e-06, + "loss": 0.1107, + "step": 3116 + }, + { + "epoch": 1.010045366169799, + "grad_norm": 0.5001168251037598, + "learning_rate": 7.730037618966448e-06, + "loss": 0.1231, + "step": 3117 + }, + { + "epoch": 1.0103694102397927, + "grad_norm": 0.4558895528316498, + "learning_rate": 7.728571992940452e-06, + "loss": 0.1173, + "step": 3118 + }, + { + "epoch": 1.010693454309786, + "grad_norm": 0.4773532450199127, + "learning_rate": 7.727106032963658e-06, + "loss": 0.1236, + "step": 3119 + }, + { + "epoch": 1.0110174983797797, + "grad_norm": 0.441366583108902, + "learning_rate": 7.725639739215486e-06, + "loss": 0.1073, + "step": 3120 + }, + { + "epoch": 1.0113415424497731, + "grad_norm": 0.4424722194671631, + "learning_rate": 7.724173111875398e-06, + "loss": 0.1114, + "step": 3121 + }, + { + "epoch": 1.0116655865197668, + "grad_norm": 0.476109117269516, + "learning_rate": 7.722706151122892e-06, + "loss": 0.1226, + "step": 3122 + }, + { + "epoch": 1.0119896305897602, + "grad_norm": 0.47857046127319336, + "learning_rate": 7.721238857137512e-06, + "loss": 0.1213, + "step": 3123 + }, + { + "epoch": 1.0123136746597536, + "grad_norm": 0.4299698770046234, + "learning_rate": 7.719771230098839e-06, + "loss": 0.1099, + "step": 3124 + }, + { + "epoch": 1.0126377187297473, + "grad_norm": 0.44482967257499695, + "learning_rate": 7.718303270186495e-06, + "loss": 0.107, + "step": 3125 + }, + { + "epoch": 1.0129617627997407, + "grad_norm": 0.44184449315071106, + "learning_rate": 7.716834977580147e-06, + "loss": 0.1096, + "step": 3126 + }, + { + "epoch": 1.0132858068697344, + "grad_norm": 0.4429199695587158, + "learning_rate": 7.715366352459499e-06, + "loss": 0.1054, + "step": 3127 + }, + { + "epoch": 1.0136098509397278, + "grad_norm": 0.49366313219070435, + "learning_rate": 7.713897395004295e-06, + "loss": 0.1197, + "step": 3128 + }, + { + "epoch": 1.0139338950097214, + "grad_norm": 0.4341050386428833, + "learning_rate": 7.712428105394325e-06, + "loss": 0.1048, + "step": 3129 + }, + { + "epoch": 1.0142579390797148, + "grad_norm": 0.42968085408210754, + "learning_rate": 7.71095848380941e-06, + "loss": 0.105, + "step": 3130 + }, + { + "epoch": 1.0145819831497083, + "grad_norm": 0.45736268162727356, + "learning_rate": 7.709488530429423e-06, + "loss": 0.1075, + "step": 3131 + }, + { + "epoch": 1.014906027219702, + "grad_norm": 0.5073167085647583, + "learning_rate": 7.70801824543427e-06, + "loss": 0.1208, + "step": 3132 + }, + { + "epoch": 1.0152300712896953, + "grad_norm": 0.46384376287460327, + "learning_rate": 7.706547629003897e-06, + "loss": 0.1131, + "step": 3133 + }, + { + "epoch": 1.015554115359689, + "grad_norm": 0.4488130509853363, + "learning_rate": 7.705076681318298e-06, + "loss": 0.1097, + "step": 3134 + }, + { + "epoch": 1.0158781594296824, + "grad_norm": 0.4590933918952942, + "learning_rate": 7.7036054025575e-06, + "loss": 0.1117, + "step": 3135 + }, + { + "epoch": 1.016202203499676, + "grad_norm": 0.4480055570602417, + "learning_rate": 7.702133792901574e-06, + "loss": 0.1087, + "step": 3136 + }, + { + "epoch": 1.0165262475696695, + "grad_norm": 0.4935062527656555, + "learning_rate": 7.700661852530629e-06, + "loss": 0.1215, + "step": 3137 + }, + { + "epoch": 1.016850291639663, + "grad_norm": 0.4196431040763855, + "learning_rate": 7.699189581624818e-06, + "loss": 0.0967, + "step": 3138 + }, + { + "epoch": 1.0171743357096565, + "grad_norm": 0.4557700753211975, + "learning_rate": 7.697716980364334e-06, + "loss": 0.1156, + "step": 3139 + }, + { + "epoch": 1.01749837977965, + "grad_norm": 0.4312606751918793, + "learning_rate": 7.696244048929405e-06, + "loss": 0.102, + "step": 3140 + }, + { + "epoch": 1.0178224238496436, + "grad_norm": 0.4428192675113678, + "learning_rate": 7.694770787500308e-06, + "loss": 0.1021, + "step": 3141 + }, + { + "epoch": 1.018146467919637, + "grad_norm": 0.4538060426712036, + "learning_rate": 7.693297196257354e-06, + "loss": 0.1104, + "step": 3142 + }, + { + "epoch": 1.0184705119896307, + "grad_norm": 0.4797634184360504, + "learning_rate": 7.691823275380895e-06, + "loss": 0.1192, + "step": 3143 + }, + { + "epoch": 1.018794556059624, + "grad_norm": 0.4432651400566101, + "learning_rate": 7.690349025051327e-06, + "loss": 0.1107, + "step": 3144 + }, + { + "epoch": 1.0191186001296175, + "grad_norm": 0.46791303157806396, + "learning_rate": 7.688874445449083e-06, + "loss": 0.1118, + "step": 3145 + }, + { + "epoch": 1.0194426441996112, + "grad_norm": 0.5142552852630615, + "learning_rate": 7.687399536754636e-06, + "loss": 0.1157, + "step": 3146 + }, + { + "epoch": 1.0197666882696046, + "grad_norm": 0.4536757171154022, + "learning_rate": 7.685924299148504e-06, + "loss": 0.1085, + "step": 3147 + }, + { + "epoch": 1.0200907323395982, + "grad_norm": 0.5158340334892273, + "learning_rate": 7.684448732811239e-06, + "loss": 0.1316, + "step": 3148 + }, + { + "epoch": 1.0204147764095917, + "grad_norm": 0.49748632311820984, + "learning_rate": 7.682972837923434e-06, + "loss": 0.1184, + "step": 3149 + }, + { + "epoch": 1.0207388204795853, + "grad_norm": 0.47120100259780884, + "learning_rate": 7.68149661466573e-06, + "loss": 0.1251, + "step": 3150 + }, + { + "epoch": 1.0210628645495787, + "grad_norm": 0.4461970627307892, + "learning_rate": 7.680020063218796e-06, + "loss": 0.1063, + "step": 3151 + }, + { + "epoch": 1.0213869086195722, + "grad_norm": 0.5021184086799622, + "learning_rate": 7.678543183763351e-06, + "loss": 0.1307, + "step": 3152 + }, + { + "epoch": 1.0217109526895658, + "grad_norm": 0.4431654214859009, + "learning_rate": 7.677065976480153e-06, + "loss": 0.1067, + "step": 3153 + }, + { + "epoch": 1.0220349967595592, + "grad_norm": 0.41496190428733826, + "learning_rate": 7.675588441549995e-06, + "loss": 0.0973, + "step": 3154 + }, + { + "epoch": 1.0223590408295529, + "grad_norm": 0.4980688989162445, + "learning_rate": 7.674110579153713e-06, + "loss": 0.1201, + "step": 3155 + }, + { + "epoch": 1.0226830848995463, + "grad_norm": 0.47226595878601074, + "learning_rate": 7.672632389472186e-06, + "loss": 0.116, + "step": 3156 + }, + { + "epoch": 1.02300712896954, + "grad_norm": 0.4931360185146332, + "learning_rate": 7.671153872686324e-06, + "loss": 0.1166, + "step": 3157 + }, + { + "epoch": 1.0233311730395334, + "grad_norm": 0.5069724321365356, + "learning_rate": 7.669675028977089e-06, + "loss": 0.1181, + "step": 3158 + }, + { + "epoch": 1.023655217109527, + "grad_norm": 0.4590316712856293, + "learning_rate": 7.668195858525474e-06, + "loss": 0.1099, + "step": 3159 + }, + { + "epoch": 1.0239792611795204, + "grad_norm": 0.4730747938156128, + "learning_rate": 7.666716361512516e-06, + "loss": 0.1173, + "step": 3160 + }, + { + "epoch": 1.0243033052495139, + "grad_norm": 0.43002721667289734, + "learning_rate": 7.66523653811929e-06, + "loss": 0.0994, + "step": 3161 + }, + { + "epoch": 1.0246273493195075, + "grad_norm": 0.4950290322303772, + "learning_rate": 7.663756388526915e-06, + "loss": 0.1188, + "step": 3162 + }, + { + "epoch": 1.024951393389501, + "grad_norm": 0.4726638197898865, + "learning_rate": 7.662275912916543e-06, + "loss": 0.1109, + "step": 3163 + }, + { + "epoch": 1.0252754374594946, + "grad_norm": 0.47714924812316895, + "learning_rate": 7.660795111469374e-06, + "loss": 0.1183, + "step": 3164 + }, + { + "epoch": 1.025599481529488, + "grad_norm": 0.5277173519134521, + "learning_rate": 7.659313984366643e-06, + "loss": 0.1264, + "step": 3165 + }, + { + "epoch": 1.0259235255994816, + "grad_norm": 0.4854954779148102, + "learning_rate": 7.657832531789623e-06, + "loss": 0.1099, + "step": 3166 + }, + { + "epoch": 1.026247569669475, + "grad_norm": 0.48848310112953186, + "learning_rate": 7.65635075391963e-06, + "loss": 0.1253, + "step": 3167 + }, + { + "epoch": 1.0265716137394685, + "grad_norm": 0.4541592001914978, + "learning_rate": 7.654868650938023e-06, + "loss": 0.1078, + "step": 3168 + }, + { + "epoch": 1.0268956578094621, + "grad_norm": 0.42836257815361023, + "learning_rate": 7.653386223026191e-06, + "loss": 0.1, + "step": 3169 + }, + { + "epoch": 1.0272197018794555, + "grad_norm": 0.4688536524772644, + "learning_rate": 7.651903470365573e-06, + "loss": 0.1146, + "step": 3170 + }, + { + "epoch": 1.0275437459494492, + "grad_norm": 0.4861224293708801, + "learning_rate": 7.650420393137646e-06, + "loss": 0.1205, + "step": 3171 + }, + { + "epoch": 1.0278677900194426, + "grad_norm": 0.4465397596359253, + "learning_rate": 7.648936991523916e-06, + "loss": 0.1122, + "step": 3172 + }, + { + "epoch": 1.0281918340894363, + "grad_norm": 0.4481448531150818, + "learning_rate": 7.647453265705944e-06, + "loss": 0.1094, + "step": 3173 + }, + { + "epoch": 1.0285158781594297, + "grad_norm": 0.4905964434146881, + "learning_rate": 7.645969215865321e-06, + "loss": 0.1219, + "step": 3174 + }, + { + "epoch": 1.028839922229423, + "grad_norm": 0.471800833940506, + "learning_rate": 7.644484842183681e-06, + "loss": 0.1064, + "step": 3175 + }, + { + "epoch": 1.0291639662994168, + "grad_norm": 0.4817582368850708, + "learning_rate": 7.643000144842698e-06, + "loss": 0.1168, + "step": 3176 + }, + { + "epoch": 1.0294880103694102, + "grad_norm": 0.441641241312027, + "learning_rate": 7.641515124024084e-06, + "loss": 0.1139, + "step": 3177 + }, + { + "epoch": 1.0298120544394038, + "grad_norm": 0.4686790704727173, + "learning_rate": 7.640029779909588e-06, + "loss": 0.1131, + "step": 3178 + }, + { + "epoch": 1.0301360985093972, + "grad_norm": 0.4923335909843445, + "learning_rate": 7.638544112681008e-06, + "loss": 0.1245, + "step": 3179 + }, + { + "epoch": 1.030460142579391, + "grad_norm": 0.4751708507537842, + "learning_rate": 7.637058122520168e-06, + "loss": 0.1176, + "step": 3180 + }, + { + "epoch": 1.0307841866493843, + "grad_norm": 0.46796074509620667, + "learning_rate": 7.635571809608945e-06, + "loss": 0.107, + "step": 3181 + }, + { + "epoch": 1.0311082307193777, + "grad_norm": 0.4851223826408386, + "learning_rate": 7.634085174129246e-06, + "loss": 0.1177, + "step": 3182 + }, + { + "epoch": 1.0314322747893714, + "grad_norm": 0.44485923647880554, + "learning_rate": 7.63259821626302e-06, + "loss": 0.1077, + "step": 3183 + }, + { + "epoch": 1.0317563188593648, + "grad_norm": 0.4402599334716797, + "learning_rate": 7.631110936192262e-06, + "loss": 0.1142, + "step": 3184 + }, + { + "epoch": 1.0320803629293585, + "grad_norm": 0.45097285509109497, + "learning_rate": 7.629623334098994e-06, + "loss": 0.1179, + "step": 3185 + }, + { + "epoch": 1.0324044069993519, + "grad_norm": 0.4726148843765259, + "learning_rate": 7.628135410165286e-06, + "loss": 0.1109, + "step": 3186 + }, + { + "epoch": 1.0327284510693455, + "grad_norm": 0.49244487285614014, + "learning_rate": 7.626647164573247e-06, + "loss": 0.1147, + "step": 3187 + }, + { + "epoch": 1.033052495139339, + "grad_norm": 0.46729210019111633, + "learning_rate": 7.625158597505022e-06, + "loss": 0.1064, + "step": 3188 + }, + { + "epoch": 1.0333765392093324, + "grad_norm": 0.4835570156574249, + "learning_rate": 7.6236697091428e-06, + "loss": 0.1155, + "step": 3189 + }, + { + "epoch": 1.033700583279326, + "grad_norm": 0.4817955195903778, + "learning_rate": 7.622180499668805e-06, + "loss": 0.1146, + "step": 3190 + }, + { + "epoch": 1.0340246273493194, + "grad_norm": 0.49716344475746155, + "learning_rate": 7.620690969265299e-06, + "loss": 0.1068, + "step": 3191 + }, + { + "epoch": 1.034348671419313, + "grad_norm": 0.4979782700538635, + "learning_rate": 7.61920111811459e-06, + "loss": 0.1107, + "step": 3192 + }, + { + "epoch": 1.0346727154893065, + "grad_norm": 0.48396703600883484, + "learning_rate": 7.61771094639902e-06, + "loss": 0.1107, + "step": 3193 + }, + { + "epoch": 1.0349967595593002, + "grad_norm": 0.48865407705307007, + "learning_rate": 7.61622045430097e-06, + "loss": 0.1132, + "step": 3194 + }, + { + "epoch": 1.0353208036292936, + "grad_norm": 0.44049808382987976, + "learning_rate": 7.6147296420028645e-06, + "loss": 0.1023, + "step": 3195 + }, + { + "epoch": 1.035644847699287, + "grad_norm": 0.4887087047100067, + "learning_rate": 7.613238509687164e-06, + "loss": 0.1099, + "step": 3196 + }, + { + "epoch": 1.0359688917692806, + "grad_norm": 0.4959479868412018, + "learning_rate": 7.611747057536367e-06, + "loss": 0.1198, + "step": 3197 + }, + { + "epoch": 1.036292935839274, + "grad_norm": 0.466974675655365, + "learning_rate": 7.610255285733015e-06, + "loss": 0.1051, + "step": 3198 + }, + { + "epoch": 1.0366169799092677, + "grad_norm": 0.5238450765609741, + "learning_rate": 7.608763194459685e-06, + "loss": 0.1248, + "step": 3199 + }, + { + "epoch": 1.0369410239792611, + "grad_norm": 0.490261048078537, + "learning_rate": 7.607270783898995e-06, + "loss": 0.1198, + "step": 3200 + }, + { + "epoch": 1.0372650680492548, + "grad_norm": 0.49088743329048157, + "learning_rate": 7.6057780542336e-06, + "loss": 0.1127, + "step": 3201 + }, + { + "epoch": 1.0375891121192482, + "grad_norm": 0.45407554507255554, + "learning_rate": 7.604285005646199e-06, + "loss": 0.1057, + "step": 3202 + }, + { + "epoch": 1.0379131561892416, + "grad_norm": 0.4346422553062439, + "learning_rate": 7.602791638319522e-06, + "loss": 0.1012, + "step": 3203 + }, + { + "epoch": 1.0382372002592353, + "grad_norm": 0.4368624985218048, + "learning_rate": 7.601297952436349e-06, + "loss": 0.1046, + "step": 3204 + }, + { + "epoch": 1.0385612443292287, + "grad_norm": 0.4649512767791748, + "learning_rate": 7.5998039481794846e-06, + "loss": 0.1101, + "step": 3205 + }, + { + "epoch": 1.0388852883992223, + "grad_norm": 0.46254339814186096, + "learning_rate": 7.598309625731788e-06, + "loss": 0.1095, + "step": 3206 + }, + { + "epoch": 1.0392093324692158, + "grad_norm": 0.4886171817779541, + "learning_rate": 7.596814985276147e-06, + "loss": 0.1209, + "step": 3207 + }, + { + "epoch": 1.0395333765392094, + "grad_norm": 0.47876039147377014, + "learning_rate": 7.595320026995491e-06, + "loss": 0.1197, + "step": 3208 + }, + { + "epoch": 1.0398574206092028, + "grad_norm": 0.46234801411628723, + "learning_rate": 7.593824751072787e-06, + "loss": 0.1117, + "step": 3209 + }, + { + "epoch": 1.0401814646791965, + "grad_norm": 0.462101548910141, + "learning_rate": 7.592329157691045e-06, + "loss": 0.1108, + "step": 3210 + }, + { + "epoch": 1.04050550874919, + "grad_norm": 0.49281927943229675, + "learning_rate": 7.590833247033309e-06, + "loss": 0.1194, + "step": 3211 + }, + { + "epoch": 1.0408295528191833, + "grad_norm": 0.43509066104888916, + "learning_rate": 7.589337019282664e-06, + "loss": 0.1026, + "step": 3212 + }, + { + "epoch": 1.041153596889177, + "grad_norm": 0.505856454372406, + "learning_rate": 7.587840474622236e-06, + "loss": 0.1222, + "step": 3213 + }, + { + "epoch": 1.0414776409591704, + "grad_norm": 0.484377920627594, + "learning_rate": 7.586343613235186e-06, + "loss": 0.1131, + "step": 3214 + }, + { + "epoch": 1.041801685029164, + "grad_norm": 0.47142449021339417, + "learning_rate": 7.584846435304715e-06, + "loss": 0.114, + "step": 3215 + }, + { + "epoch": 1.0421257290991575, + "grad_norm": 0.45768871903419495, + "learning_rate": 7.5833489410140636e-06, + "loss": 0.1034, + "step": 3216 + }, + { + "epoch": 1.042449773169151, + "grad_norm": 0.4345049560070038, + "learning_rate": 7.58185113054651e-06, + "loss": 0.1042, + "step": 3217 + }, + { + "epoch": 1.0427738172391445, + "grad_norm": 0.4796913266181946, + "learning_rate": 7.580353004085372e-06, + "loss": 0.1162, + "step": 3218 + }, + { + "epoch": 1.043097861309138, + "grad_norm": 0.5288426876068115, + "learning_rate": 7.578854561814008e-06, + "loss": 0.1207, + "step": 3219 + }, + { + "epoch": 1.0434219053791316, + "grad_norm": 0.4750249981880188, + "learning_rate": 7.577355803915809e-06, + "loss": 0.1187, + "step": 3220 + }, + { + "epoch": 1.043745949449125, + "grad_norm": 0.4727710485458374, + "learning_rate": 7.575856730574212e-06, + "loss": 0.1155, + "step": 3221 + }, + { + "epoch": 1.0440699935191187, + "grad_norm": 0.515414834022522, + "learning_rate": 7.574357341972687e-06, + "loss": 0.1128, + "step": 3222 + }, + { + "epoch": 1.044394037589112, + "grad_norm": 0.51041179895401, + "learning_rate": 7.5728576382947436e-06, + "loss": 0.1237, + "step": 3223 + }, + { + "epoch": 1.0447180816591057, + "grad_norm": 0.4669690430164337, + "learning_rate": 7.571357619723933e-06, + "loss": 0.1153, + "step": 3224 + }, + { + "epoch": 1.0450421257290992, + "grad_norm": 0.47048744559288025, + "learning_rate": 7.569857286443843e-06, + "loss": 0.1114, + "step": 3225 + }, + { + "epoch": 1.0453661697990926, + "grad_norm": 0.4638916850090027, + "learning_rate": 7.5683566386381e-06, + "loss": 0.1111, + "step": 3226 + }, + { + "epoch": 1.0456902138690862, + "grad_norm": 0.47223541140556335, + "learning_rate": 7.566855676490368e-06, + "loss": 0.1118, + "step": 3227 + }, + { + "epoch": 1.0460142579390797, + "grad_norm": 0.45786023139953613, + "learning_rate": 7.5653544001843485e-06, + "loss": 0.1088, + "step": 3228 + }, + { + "epoch": 1.0463383020090733, + "grad_norm": 0.45595699548721313, + "learning_rate": 7.563852809903786e-06, + "loss": 0.1196, + "step": 3229 + }, + { + "epoch": 1.0466623460790667, + "grad_norm": 0.5083061456680298, + "learning_rate": 7.562350905832459e-06, + "loss": 0.1151, + "step": 3230 + }, + { + "epoch": 1.0469863901490604, + "grad_norm": 0.4339052438735962, + "learning_rate": 7.560848688154187e-06, + "loss": 0.1083, + "step": 3231 + }, + { + "epoch": 1.0473104342190538, + "grad_norm": 0.46849381923675537, + "learning_rate": 7.559346157052828e-06, + "loss": 0.1118, + "step": 3232 + }, + { + "epoch": 1.0476344782890472, + "grad_norm": 0.4972139596939087, + "learning_rate": 7.5578433127122745e-06, + "loss": 0.1143, + "step": 3233 + }, + { + "epoch": 1.0479585223590409, + "grad_norm": 0.47988584637641907, + "learning_rate": 7.556340155316462e-06, + "loss": 0.1296, + "step": 3234 + }, + { + "epoch": 1.0482825664290343, + "grad_norm": 0.48681557178497314, + "learning_rate": 7.55483668504936e-06, + "loss": 0.1082, + "step": 3235 + }, + { + "epoch": 1.048606610499028, + "grad_norm": 0.444217711687088, + "learning_rate": 7.553332902094981e-06, + "loss": 0.1092, + "step": 3236 + }, + { + "epoch": 1.0489306545690213, + "grad_norm": 0.46810221672058105, + "learning_rate": 7.551828806637374e-06, + "loss": 0.1163, + "step": 3237 + }, + { + "epoch": 1.049254698639015, + "grad_norm": 0.49482589960098267, + "learning_rate": 7.550324398860625e-06, + "loss": 0.1038, + "step": 3238 + }, + { + "epoch": 1.0495787427090084, + "grad_norm": 0.4945639371871948, + "learning_rate": 7.548819678948857e-06, + "loss": 0.1134, + "step": 3239 + }, + { + "epoch": 1.0499027867790018, + "grad_norm": 0.4766770303249359, + "learning_rate": 7.547314647086235e-06, + "loss": 0.1166, + "step": 3240 + }, + { + "epoch": 1.0502268308489955, + "grad_norm": 0.4779733717441559, + "learning_rate": 7.545809303456961e-06, + "loss": 0.1229, + "step": 3241 + }, + { + "epoch": 1.050550874918989, + "grad_norm": 0.48405709862709045, + "learning_rate": 7.544303648245275e-06, + "loss": 0.119, + "step": 3242 + }, + { + "epoch": 1.0508749189889826, + "grad_norm": 0.4582526683807373, + "learning_rate": 7.54279768163545e-06, + "loss": 0.1081, + "step": 3243 + }, + { + "epoch": 1.051198963058976, + "grad_norm": 0.4700034260749817, + "learning_rate": 7.541291403811808e-06, + "loss": 0.1013, + "step": 3244 + }, + { + "epoch": 1.0515230071289696, + "grad_norm": 0.48437923192977905, + "learning_rate": 7.539784814958697e-06, + "loss": 0.126, + "step": 3245 + }, + { + "epoch": 1.051847051198963, + "grad_norm": 0.4672539234161377, + "learning_rate": 7.538277915260513e-06, + "loss": 0.1057, + "step": 3246 + }, + { + "epoch": 1.0521710952689567, + "grad_norm": 0.4580419659614563, + "learning_rate": 7.536770704901684e-06, + "loss": 0.1097, + "step": 3247 + }, + { + "epoch": 1.0524951393389501, + "grad_norm": 0.4356781244277954, + "learning_rate": 7.535263184066679e-06, + "loss": 0.1052, + "step": 3248 + }, + { + "epoch": 1.0528191834089435, + "grad_norm": 0.4719600975513458, + "learning_rate": 7.5337553529400046e-06, + "loss": 0.1181, + "step": 3249 + }, + { + "epoch": 1.0531432274789372, + "grad_norm": 0.4823813736438751, + "learning_rate": 7.532247211706202e-06, + "loss": 0.1207, + "step": 3250 + }, + { + "epoch": 1.0534672715489306, + "grad_norm": 0.471195250749588, + "learning_rate": 7.530738760549856e-06, + "loss": 0.1168, + "step": 3251 + }, + { + "epoch": 1.0537913156189243, + "grad_norm": 0.4789368212223053, + "learning_rate": 7.529229999655585e-06, + "loss": 0.1173, + "step": 3252 + }, + { + "epoch": 1.0541153596889177, + "grad_norm": 0.5182376503944397, + "learning_rate": 7.5277209292080465e-06, + "loss": 0.1169, + "step": 3253 + }, + { + "epoch": 1.054439403758911, + "grad_norm": 0.5064989924430847, + "learning_rate": 7.526211549391936e-06, + "loss": 0.1243, + "step": 3254 + }, + { + "epoch": 1.0547634478289047, + "grad_norm": 0.49826836585998535, + "learning_rate": 7.524701860391987e-06, + "loss": 0.1096, + "step": 3255 + }, + { + "epoch": 1.0550874918988982, + "grad_norm": 0.4568706750869751, + "learning_rate": 7.523191862392973e-06, + "loss": 0.1086, + "step": 3256 + }, + { + "epoch": 1.0554115359688918, + "grad_norm": 0.44507694244384766, + "learning_rate": 7.521681555579702e-06, + "loss": 0.1078, + "step": 3257 + }, + { + "epoch": 1.0557355800388852, + "grad_norm": 0.4660944640636444, + "learning_rate": 7.520170940137019e-06, + "loss": 0.1096, + "step": 3258 + }, + { + "epoch": 1.0560596241088789, + "grad_norm": 0.4876995384693146, + "learning_rate": 7.51866001624981e-06, + "loss": 0.1154, + "step": 3259 + }, + { + "epoch": 1.0563836681788723, + "grad_norm": 0.45799127221107483, + "learning_rate": 7.517148784102997e-06, + "loss": 0.1141, + "step": 3260 + }, + { + "epoch": 1.056707712248866, + "grad_norm": 0.4525223970413208, + "learning_rate": 7.515637243881542e-06, + "loss": 0.1119, + "step": 3261 + }, + { + "epoch": 1.0570317563188594, + "grad_norm": 0.43269988894462585, + "learning_rate": 7.5141253957704405e-06, + "loss": 0.1106, + "step": 3262 + }, + { + "epoch": 1.0573558003888528, + "grad_norm": 0.5266064405441284, + "learning_rate": 7.512613239954729e-06, + "loss": 0.1319, + "step": 3263 + }, + { + "epoch": 1.0576798444588464, + "grad_norm": 0.4743003845214844, + "learning_rate": 7.511100776619483e-06, + "loss": 0.1108, + "step": 3264 + }, + { + "epoch": 1.0580038885288399, + "grad_norm": 0.48685547709465027, + "learning_rate": 7.509588005949811e-06, + "loss": 0.1174, + "step": 3265 + }, + { + "epoch": 1.0583279325988335, + "grad_norm": 0.4517221748828888, + "learning_rate": 7.50807492813086e-06, + "loss": 0.1123, + "step": 3266 + }, + { + "epoch": 1.058651976668827, + "grad_norm": 0.4700964391231537, + "learning_rate": 7.5065615433478165e-06, + "loss": 0.1185, + "step": 3267 + }, + { + "epoch": 1.0589760207388206, + "grad_norm": 0.4660481810569763, + "learning_rate": 7.505047851785908e-06, + "loss": 0.1174, + "step": 3268 + }, + { + "epoch": 1.059300064808814, + "grad_norm": 0.46513262391090393, + "learning_rate": 7.503533853630393e-06, + "loss": 0.1159, + "step": 3269 + }, + { + "epoch": 1.0596241088788074, + "grad_norm": 0.4085860848426819, + "learning_rate": 7.50201954906657e-06, + "loss": 0.0909, + "step": 3270 + }, + { + "epoch": 1.059948152948801, + "grad_norm": 0.48166197538375854, + "learning_rate": 7.500504938279775e-06, + "loss": 0.1246, + "step": 3271 + }, + { + "epoch": 1.0602721970187945, + "grad_norm": 0.4505249857902527, + "learning_rate": 7.498990021455383e-06, + "loss": 0.1074, + "step": 3272 + }, + { + "epoch": 1.0605962410887881, + "grad_norm": 0.459302693605423, + "learning_rate": 7.497474798778805e-06, + "loss": 0.1092, + "step": 3273 + }, + { + "epoch": 1.0609202851587816, + "grad_norm": 0.4699563980102539, + "learning_rate": 7.495959270435489e-06, + "loss": 0.1097, + "step": 3274 + }, + { + "epoch": 1.0612443292287752, + "grad_norm": 0.4632161259651184, + "learning_rate": 7.494443436610921e-06, + "loss": 0.1089, + "step": 3275 + }, + { + "epoch": 1.0615683732987686, + "grad_norm": 0.5196097493171692, + "learning_rate": 7.4929272974906245e-06, + "loss": 0.1289, + "step": 3276 + }, + { + "epoch": 1.061892417368762, + "grad_norm": 0.4618408977985382, + "learning_rate": 7.49141085326016e-06, + "loss": 0.1045, + "step": 3277 + }, + { + "epoch": 1.0622164614387557, + "grad_norm": 0.518011212348938, + "learning_rate": 7.489894104105124e-06, + "loss": 0.1209, + "step": 3278 + }, + { + "epoch": 1.0625405055087491, + "grad_norm": 0.4100458025932312, + "learning_rate": 7.488377050211155e-06, + "loss": 0.0966, + "step": 3279 + }, + { + "epoch": 1.0628645495787428, + "grad_norm": 0.4721738398075104, + "learning_rate": 7.4868596917639245e-06, + "loss": 0.1159, + "step": 3280 + }, + { + "epoch": 1.0631885936487362, + "grad_norm": 0.4702790081501007, + "learning_rate": 7.485342028949143e-06, + "loss": 0.1071, + "step": 3281 + }, + { + "epoch": 1.0635126377187298, + "grad_norm": 0.48280274868011475, + "learning_rate": 7.483824061952557e-06, + "loss": 0.1156, + "step": 3282 + }, + { + "epoch": 1.0638366817887233, + "grad_norm": 0.4804169237613678, + "learning_rate": 7.4823057909599504e-06, + "loss": 0.1135, + "step": 3283 + }, + { + "epoch": 1.0641607258587167, + "grad_norm": 0.5231265425682068, + "learning_rate": 7.480787216157146e-06, + "loss": 0.1166, + "step": 3284 + }, + { + "epoch": 1.0644847699287103, + "grad_norm": 0.4714162349700928, + "learning_rate": 7.479268337730002e-06, + "loss": 0.1128, + "step": 3285 + }, + { + "epoch": 1.0648088139987038, + "grad_norm": 0.518753707408905, + "learning_rate": 7.477749155864416e-06, + "loss": 0.1171, + "step": 3286 + }, + { + "epoch": 1.0651328580686974, + "grad_norm": 0.4315536320209503, + "learning_rate": 7.476229670746318e-06, + "loss": 0.1053, + "step": 3287 + }, + { + "epoch": 1.0654569021386908, + "grad_norm": 0.4754064679145813, + "learning_rate": 7.47470988256168e-06, + "loss": 0.1206, + "step": 3288 + }, + { + "epoch": 1.0657809462086845, + "grad_norm": 0.4845450818538666, + "learning_rate": 7.47318979149651e-06, + "loss": 0.1162, + "step": 3289 + }, + { + "epoch": 1.0661049902786779, + "grad_norm": 0.46454256772994995, + "learning_rate": 7.4716693977368516e-06, + "loss": 0.1094, + "step": 3290 + }, + { + "epoch": 1.0664290343486713, + "grad_norm": 0.519045352935791, + "learning_rate": 7.470148701468786e-06, + "loss": 0.1288, + "step": 3291 + }, + { + "epoch": 1.066753078418665, + "grad_norm": 0.47300589084625244, + "learning_rate": 7.468627702878434e-06, + "loss": 0.1163, + "step": 3292 + }, + { + "epoch": 1.0670771224886584, + "grad_norm": 0.4752744138240814, + "learning_rate": 7.4671064021519494e-06, + "loss": 0.117, + "step": 3293 + }, + { + "epoch": 1.067401166558652, + "grad_norm": 0.48252978920936584, + "learning_rate": 7.465584799475522e-06, + "loss": 0.1169, + "step": 3294 + }, + { + "epoch": 1.0677252106286454, + "grad_norm": 0.4439176917076111, + "learning_rate": 7.4640628950353865e-06, + "loss": 0.0988, + "step": 3295 + }, + { + "epoch": 1.068049254698639, + "grad_norm": 0.510744035243988, + "learning_rate": 7.462540689017806e-06, + "loss": 0.1058, + "step": 3296 + }, + { + "epoch": 1.0683732987686325, + "grad_norm": 0.4444325864315033, + "learning_rate": 7.4610181816090845e-06, + "loss": 0.1072, + "step": 3297 + }, + { + "epoch": 1.0686973428386262, + "grad_norm": 0.4627124071121216, + "learning_rate": 7.459495372995561e-06, + "loss": 0.1071, + "step": 3298 + }, + { + "epoch": 1.0690213869086196, + "grad_norm": 0.43973034620285034, + "learning_rate": 7.4579722633636154e-06, + "loss": 0.0997, + "step": 3299 + }, + { + "epoch": 1.069345430978613, + "grad_norm": 0.4386976659297943, + "learning_rate": 7.456448852899658e-06, + "loss": 0.1097, + "step": 3300 + }, + { + "epoch": 1.0696694750486067, + "grad_norm": 0.4828323721885681, + "learning_rate": 7.454925141790141e-06, + "loss": 0.115, + "step": 3301 + }, + { + "epoch": 1.0699935191186, + "grad_norm": 0.5100405216217041, + "learning_rate": 7.453401130221553e-06, + "loss": 0.1211, + "step": 3302 + }, + { + "epoch": 1.0703175631885937, + "grad_norm": 0.4685654938220978, + "learning_rate": 7.451876818380417e-06, + "loss": 0.1161, + "step": 3303 + }, + { + "epoch": 1.0706416072585871, + "grad_norm": 0.45673686265945435, + "learning_rate": 7.450352206453295e-06, + "loss": 0.1011, + "step": 3304 + }, + { + "epoch": 1.0709656513285806, + "grad_norm": 0.4703875184059143, + "learning_rate": 7.448827294626785e-06, + "loss": 0.1161, + "step": 3305 + }, + { + "epoch": 1.0712896953985742, + "grad_norm": 0.4594164192676544, + "learning_rate": 7.4473020830875195e-06, + "loss": 0.1131, + "step": 3306 + }, + { + "epoch": 1.0716137394685676, + "grad_norm": 0.4367749094963074, + "learning_rate": 7.445776572022171e-06, + "loss": 0.1005, + "step": 3307 + }, + { + "epoch": 1.0719377835385613, + "grad_norm": 0.47489625215530396, + "learning_rate": 7.444250761617447e-06, + "loss": 0.1175, + "step": 3308 + }, + { + "epoch": 1.0722618276085547, + "grad_norm": 0.4518619477748871, + "learning_rate": 7.442724652060092e-06, + "loss": 0.1073, + "step": 3309 + }, + { + "epoch": 1.0725858716785484, + "grad_norm": 0.45332789421081543, + "learning_rate": 7.441198243536886e-06, + "loss": 0.1051, + "step": 3310 + }, + { + "epoch": 1.0729099157485418, + "grad_norm": 0.4794146418571472, + "learning_rate": 7.4396715362346495e-06, + "loss": 0.1184, + "step": 3311 + }, + { + "epoch": 1.0732339598185354, + "grad_norm": 0.4581579566001892, + "learning_rate": 7.438144530340233e-06, + "loss": 0.1097, + "step": 3312 + }, + { + "epoch": 1.0735580038885288, + "grad_norm": 0.46173423528671265, + "learning_rate": 7.43661722604053e-06, + "loss": 0.1197, + "step": 3313 + }, + { + "epoch": 1.0738820479585223, + "grad_norm": 0.437900185585022, + "learning_rate": 7.435089623522466e-06, + "loss": 0.1041, + "step": 3314 + }, + { + "epoch": 1.074206092028516, + "grad_norm": 0.43803054094314575, + "learning_rate": 7.433561722973006e-06, + "loss": 0.1063, + "step": 3315 + }, + { + "epoch": 1.0745301360985093, + "grad_norm": 0.49713608622550964, + "learning_rate": 7.432033524579152e-06, + "loss": 0.1148, + "step": 3316 + }, + { + "epoch": 1.074854180168503, + "grad_norm": 0.44355711340904236, + "learning_rate": 7.4305050285279364e-06, + "loss": 0.1078, + "step": 3317 + }, + { + "epoch": 1.0751782242384964, + "grad_norm": 0.528022825717926, + "learning_rate": 7.4289762350064356e-06, + "loss": 0.1287, + "step": 3318 + }, + { + "epoch": 1.07550226830849, + "grad_norm": 0.5100262761116028, + "learning_rate": 7.427447144201756e-06, + "loss": 0.1249, + "step": 3319 + }, + { + "epoch": 1.0758263123784835, + "grad_norm": 0.4281594157218933, + "learning_rate": 7.4259177563010465e-06, + "loss": 0.1026, + "step": 3320 + }, + { + "epoch": 1.076150356448477, + "grad_norm": 0.5049169659614563, + "learning_rate": 7.4243880714914875e-06, + "loss": 0.1189, + "step": 3321 + }, + { + "epoch": 1.0764744005184705, + "grad_norm": 0.4304545819759369, + "learning_rate": 7.422858089960299e-06, + "loss": 0.1029, + "step": 3322 + }, + { + "epoch": 1.076798444588464, + "grad_norm": 0.5015721917152405, + "learning_rate": 7.421327811894735e-06, + "loss": 0.1188, + "step": 3323 + }, + { + "epoch": 1.0771224886584576, + "grad_norm": 0.45259350538253784, + "learning_rate": 7.419797237482089e-06, + "loss": 0.1052, + "step": 3324 + }, + { + "epoch": 1.077446532728451, + "grad_norm": 0.5052790641784668, + "learning_rate": 7.418266366909685e-06, + "loss": 0.127, + "step": 3325 + }, + { + "epoch": 1.0777705767984447, + "grad_norm": 0.485783189535141, + "learning_rate": 7.4167352003648885e-06, + "loss": 0.1267, + "step": 3326 + }, + { + "epoch": 1.078094620868438, + "grad_norm": 0.47802627086639404, + "learning_rate": 7.4152037380351e-06, + "loss": 0.1117, + "step": 3327 + }, + { + "epoch": 1.0784186649384315, + "grad_norm": 0.47302913665771484, + "learning_rate": 7.413671980107754e-06, + "loss": 0.1192, + "step": 3328 + }, + { + "epoch": 1.0787427090084252, + "grad_norm": 0.48858019709587097, + "learning_rate": 7.412139926770323e-06, + "loss": 0.1117, + "step": 3329 + }, + { + "epoch": 1.0790667530784186, + "grad_norm": 0.44832563400268555, + "learning_rate": 7.410607578210319e-06, + "loss": 0.1048, + "step": 3330 + }, + { + "epoch": 1.0793907971484122, + "grad_norm": 0.4587792754173279, + "learning_rate": 7.409074934615282e-06, + "loss": 0.1148, + "step": 3331 + }, + { + "epoch": 1.0797148412184057, + "grad_norm": 0.4639880657196045, + "learning_rate": 7.407541996172795e-06, + "loss": 0.1077, + "step": 3332 + }, + { + "epoch": 1.0800388852883993, + "grad_norm": 0.4755636155605316, + "learning_rate": 7.406008763070474e-06, + "loss": 0.1246, + "step": 3333 + }, + { + "epoch": 1.0803629293583927, + "grad_norm": 0.48292648792266846, + "learning_rate": 7.404475235495973e-06, + "loss": 0.1127, + "step": 3334 + }, + { + "epoch": 1.0806869734283864, + "grad_norm": 0.46788838505744934, + "learning_rate": 7.402941413636979e-06, + "loss": 0.1207, + "step": 3335 + }, + { + "epoch": 1.0810110174983798, + "grad_norm": 0.425082802772522, + "learning_rate": 7.40140729768122e-06, + "loss": 0.0978, + "step": 3336 + }, + { + "epoch": 1.0813350615683732, + "grad_norm": 0.4816244840621948, + "learning_rate": 7.399872887816455e-06, + "loss": 0.1168, + "step": 3337 + }, + { + "epoch": 1.0816591056383669, + "grad_norm": 0.4528196454048157, + "learning_rate": 7.3983381842304815e-06, + "loss": 0.1148, + "step": 3338 + }, + { + "epoch": 1.0819831497083603, + "grad_norm": 0.45505785942077637, + "learning_rate": 7.39680318711113e-06, + "loss": 0.1067, + "step": 3339 + }, + { + "epoch": 1.082307193778354, + "grad_norm": 0.47752875089645386, + "learning_rate": 7.395267896646272e-06, + "loss": 0.1203, + "step": 3340 + }, + { + "epoch": 1.0826312378483474, + "grad_norm": 0.43200385570526123, + "learning_rate": 7.3937323130238125e-06, + "loss": 0.1079, + "step": 3341 + }, + { + "epoch": 1.0829552819183408, + "grad_norm": 0.4563518762588501, + "learning_rate": 7.3921964364316915e-06, + "loss": 0.1092, + "step": 3342 + }, + { + "epoch": 1.0832793259883344, + "grad_norm": 0.5140635967254639, + "learning_rate": 7.390660267057883e-06, + "loss": 0.127, + "step": 3343 + }, + { + "epoch": 1.0836033700583279, + "grad_norm": 0.48420506715774536, + "learning_rate": 7.389123805090401e-06, + "loss": 0.122, + "step": 3344 + }, + { + "epoch": 1.0839274141283215, + "grad_norm": 0.46503114700317383, + "learning_rate": 7.387587050717295e-06, + "loss": 0.1143, + "step": 3345 + }, + { + "epoch": 1.084251458198315, + "grad_norm": 0.444766640663147, + "learning_rate": 7.386050004126647e-06, + "loss": 0.1011, + "step": 3346 + }, + { + "epoch": 1.0845755022683086, + "grad_norm": 0.49079981446266174, + "learning_rate": 7.384512665506578e-06, + "loss": 0.1177, + "step": 3347 + }, + { + "epoch": 1.084899546338302, + "grad_norm": 0.4863133430480957, + "learning_rate": 7.382975035045242e-06, + "loss": 0.117, + "step": 3348 + }, + { + "epoch": 1.0852235904082956, + "grad_norm": 0.4846367835998535, + "learning_rate": 7.381437112930832e-06, + "loss": 0.1211, + "step": 3349 + }, + { + "epoch": 1.085547634478289, + "grad_norm": 0.49157649278640747, + "learning_rate": 7.379898899351572e-06, + "loss": 0.1145, + "step": 3350 + }, + { + "epoch": 1.0858716785482825, + "grad_norm": 0.4918596148490906, + "learning_rate": 7.378360394495726e-06, + "loss": 0.128, + "step": 3351 + }, + { + "epoch": 1.0861957226182761, + "grad_norm": 0.4473949372768402, + "learning_rate": 7.376821598551592e-06, + "loss": 0.1041, + "step": 3352 + }, + { + "epoch": 1.0865197666882696, + "grad_norm": 0.47902122139930725, + "learning_rate": 7.375282511707505e-06, + "loss": 0.1171, + "step": 3353 + }, + { + "epoch": 1.0868438107582632, + "grad_norm": 0.45830297470092773, + "learning_rate": 7.373743134151832e-06, + "loss": 0.1138, + "step": 3354 + }, + { + "epoch": 1.0871678548282566, + "grad_norm": 0.48079654574394226, + "learning_rate": 7.3722034660729795e-06, + "loss": 0.1181, + "step": 3355 + }, + { + "epoch": 1.0874918988982503, + "grad_norm": 0.4816071689128876, + "learning_rate": 7.370663507659386e-06, + "loss": 0.1164, + "step": 3356 + }, + { + "epoch": 1.0878159429682437, + "grad_norm": 0.4607110023498535, + "learning_rate": 7.36912325909953e-06, + "loss": 0.1108, + "step": 3357 + }, + { + "epoch": 1.088139987038237, + "grad_norm": 0.4666801393032074, + "learning_rate": 7.367582720581923e-06, + "loss": 0.1028, + "step": 3358 + }, + { + "epoch": 1.0884640311082308, + "grad_norm": 0.4456480145454407, + "learning_rate": 7.366041892295111e-06, + "loss": 0.1066, + "step": 3359 + }, + { + "epoch": 1.0887880751782242, + "grad_norm": 0.4845721423625946, + "learning_rate": 7.364500774427675e-06, + "loss": 0.116, + "step": 3360 + }, + { + "epoch": 1.0891121192482178, + "grad_norm": 0.4765332043170929, + "learning_rate": 7.3629593671682345e-06, + "loss": 0.118, + "step": 3361 + }, + { + "epoch": 1.0894361633182112, + "grad_norm": 0.4184534251689911, + "learning_rate": 7.361417670705443e-06, + "loss": 0.0986, + "step": 3362 + }, + { + "epoch": 1.089760207388205, + "grad_norm": 0.492906391620636, + "learning_rate": 7.3598756852279885e-06, + "loss": 0.1205, + "step": 3363 + }, + { + "epoch": 1.0900842514581983, + "grad_norm": 0.42441263794898987, + "learning_rate": 7.358333410924596e-06, + "loss": 0.0995, + "step": 3364 + }, + { + "epoch": 1.0904082955281917, + "grad_norm": 0.44085580110549927, + "learning_rate": 7.3567908479840235e-06, + "loss": 0.1068, + "step": 3365 + }, + { + "epoch": 1.0907323395981854, + "grad_norm": 0.4744189381599426, + "learning_rate": 7.355247996595068e-06, + "loss": 0.1091, + "step": 3366 + }, + { + "epoch": 1.0910563836681788, + "grad_norm": 0.4566498398780823, + "learning_rate": 7.353704856946559e-06, + "loss": 0.1082, + "step": 3367 + }, + { + "epoch": 1.0913804277381725, + "grad_norm": 0.4317505955696106, + "learning_rate": 7.352161429227359e-06, + "loss": 0.1037, + "step": 3368 + }, + { + "epoch": 1.0917044718081659, + "grad_norm": 0.47662752866744995, + "learning_rate": 7.350617713626372e-06, + "loss": 0.1223, + "step": 3369 + }, + { + "epoch": 1.0920285158781595, + "grad_norm": 0.49168187379837036, + "learning_rate": 7.349073710332533e-06, + "loss": 0.1173, + "step": 3370 + }, + { + "epoch": 1.092352559948153, + "grad_norm": 0.47964388132095337, + "learning_rate": 7.347529419534811e-06, + "loss": 0.1139, + "step": 3371 + }, + { + "epoch": 1.0926766040181464, + "grad_norm": 0.47219377756118774, + "learning_rate": 7.3459848414222154e-06, + "loss": 0.1184, + "step": 3372 + }, + { + "epoch": 1.09300064808814, + "grad_norm": 0.4608166217803955, + "learning_rate": 7.3444399761837855e-06, + "loss": 0.1107, + "step": 3373 + }, + { + "epoch": 1.0933246921581334, + "grad_norm": 0.49217841029167175, + "learning_rate": 7.3428948240085985e-06, + "loss": 0.1232, + "step": 3374 + }, + { + "epoch": 1.093648736228127, + "grad_norm": 0.47193947434425354, + "learning_rate": 7.3413493850857665e-06, + "loss": 0.1133, + "step": 3375 + }, + { + "epoch": 1.0939727802981205, + "grad_norm": 0.4649535119533539, + "learning_rate": 7.3398036596044345e-06, + "loss": 0.1183, + "step": 3376 + }, + { + "epoch": 1.0942968243681142, + "grad_norm": 0.45862525701522827, + "learning_rate": 7.3382576477537855e-06, + "loss": 0.108, + "step": 3377 + }, + { + "epoch": 1.0946208684381076, + "grad_norm": 0.4705563187599182, + "learning_rate": 7.336711349723039e-06, + "loss": 0.1168, + "step": 3378 + }, + { + "epoch": 1.094944912508101, + "grad_norm": 0.44781872630119324, + "learning_rate": 7.335164765701442e-06, + "loss": 0.1085, + "step": 3379 + }, + { + "epoch": 1.0952689565780946, + "grad_norm": 0.4419195353984833, + "learning_rate": 7.3336178958782865e-06, + "loss": 0.1008, + "step": 3380 + }, + { + "epoch": 1.095593000648088, + "grad_norm": 0.4678310453891754, + "learning_rate": 7.33207074044289e-06, + "loss": 0.1114, + "step": 3381 + }, + { + "epoch": 1.0959170447180817, + "grad_norm": 0.4491790235042572, + "learning_rate": 7.33052329958461e-06, + "loss": 0.1055, + "step": 3382 + }, + { + "epoch": 1.0962410887880751, + "grad_norm": 0.5071040987968445, + "learning_rate": 7.32897557349284e-06, + "loss": 0.1171, + "step": 3383 + }, + { + "epoch": 1.0965651328580688, + "grad_norm": 0.4346998333930969, + "learning_rate": 7.327427562357008e-06, + "loss": 0.0983, + "step": 3384 + }, + { + "epoch": 1.0968891769280622, + "grad_norm": 0.4447813630104065, + "learning_rate": 7.325879266366571e-06, + "loss": 0.1013, + "step": 3385 + }, + { + "epoch": 1.0972132209980558, + "grad_norm": 0.48892831802368164, + "learning_rate": 7.324330685711029e-06, + "loss": 0.1191, + "step": 3386 + }, + { + "epoch": 1.0975372650680493, + "grad_norm": 0.4909844696521759, + "learning_rate": 7.322781820579912e-06, + "loss": 0.1222, + "step": 3387 + }, + { + "epoch": 1.0978613091380427, + "grad_norm": 0.4649695158004761, + "learning_rate": 7.321232671162787e-06, + "loss": 0.1084, + "step": 3388 + }, + { + "epoch": 1.0981853532080363, + "grad_norm": 0.4527525007724762, + "learning_rate": 7.319683237649253e-06, + "loss": 0.1119, + "step": 3389 + }, + { + "epoch": 1.0985093972780298, + "grad_norm": 0.5357375144958496, + "learning_rate": 7.3181335202289495e-06, + "loss": 0.1129, + "step": 3390 + }, + { + "epoch": 1.0988334413480234, + "grad_norm": 0.5059093236923218, + "learning_rate": 7.3165835190915435e-06, + "loss": 0.1173, + "step": 3391 + }, + { + "epoch": 1.0991574854180168, + "grad_norm": 0.43152016401290894, + "learning_rate": 7.315033234426741e-06, + "loss": 0.11, + "step": 3392 + }, + { + "epoch": 1.0994815294880103, + "grad_norm": 0.49721404910087585, + "learning_rate": 7.3134826664242805e-06, + "loss": 0.1113, + "step": 3393 + }, + { + "epoch": 1.099805573558004, + "grad_norm": 0.4575223922729492, + "learning_rate": 7.311931815273938e-06, + "loss": 0.1135, + "step": 3394 + }, + { + "epoch": 1.1001296176279973, + "grad_norm": 0.49471038579940796, + "learning_rate": 7.310380681165523e-06, + "loss": 0.124, + "step": 3395 + }, + { + "epoch": 1.100453661697991, + "grad_norm": 0.4235450327396393, + "learning_rate": 7.308829264288879e-06, + "loss": 0.1042, + "step": 3396 + }, + { + "epoch": 1.1007777057679844, + "grad_norm": 0.4568019211292267, + "learning_rate": 7.307277564833886e-06, + "loss": 0.108, + "step": 3397 + }, + { + "epoch": 1.101101749837978, + "grad_norm": 0.4586898684501648, + "learning_rate": 7.305725582990453e-06, + "loss": 0.1065, + "step": 3398 + }, + { + "epoch": 1.1014257939079715, + "grad_norm": 0.48752400279045105, + "learning_rate": 7.30417331894853e-06, + "loss": 0.1146, + "step": 3399 + }, + { + "epoch": 1.101749837977965, + "grad_norm": 0.4247609078884125, + "learning_rate": 7.3026207728980994e-06, + "loss": 0.1105, + "step": 3400 + }, + { + "epoch": 1.1020738820479585, + "grad_norm": 0.4526832401752472, + "learning_rate": 7.301067945029178e-06, + "loss": 0.116, + "step": 3401 + }, + { + "epoch": 1.102397926117952, + "grad_norm": 0.480944961309433, + "learning_rate": 7.299514835531815e-06, + "loss": 0.1222, + "step": 3402 + }, + { + "epoch": 1.1027219701879456, + "grad_norm": 0.4303266406059265, + "learning_rate": 7.2979614445960975e-06, + "loss": 0.1036, + "step": 3403 + }, + { + "epoch": 1.103046014257939, + "grad_norm": 0.47771838307380676, + "learning_rate": 7.296407772412146e-06, + "loss": 0.1242, + "step": 3404 + }, + { + "epoch": 1.1033700583279327, + "grad_norm": 0.47071340680122375, + "learning_rate": 7.2948538191701136e-06, + "loss": 0.1113, + "step": 3405 + }, + { + "epoch": 1.103694102397926, + "grad_norm": 0.4870220720767975, + "learning_rate": 7.293299585060188e-06, + "loss": 0.1197, + "step": 3406 + }, + { + "epoch": 1.1040181464679197, + "grad_norm": 0.4674378037452698, + "learning_rate": 7.291745070272596e-06, + "loss": 0.1096, + "step": 3407 + }, + { + "epoch": 1.1043421905379132, + "grad_norm": 0.509381115436554, + "learning_rate": 7.2901902749975915e-06, + "loss": 0.1213, + "step": 3408 + }, + { + "epoch": 1.1046662346079066, + "grad_norm": 0.48851171135902405, + "learning_rate": 7.288635199425471e-06, + "loss": 0.1002, + "step": 3409 + }, + { + "epoch": 1.1049902786779002, + "grad_norm": 0.4779938757419586, + "learning_rate": 7.287079843746555e-06, + "loss": 0.1147, + "step": 3410 + }, + { + "epoch": 1.1053143227478937, + "grad_norm": 0.47918787598609924, + "learning_rate": 7.285524208151208e-06, + "loss": 0.12, + "step": 3411 + }, + { + "epoch": 1.1056383668178873, + "grad_norm": 0.49099570512771606, + "learning_rate": 7.283968292829824e-06, + "loss": 0.1121, + "step": 3412 + }, + { + "epoch": 1.1059624108878807, + "grad_norm": 0.4889676868915558, + "learning_rate": 7.282412097972831e-06, + "loss": 0.1152, + "step": 3413 + }, + { + "epoch": 1.1062864549578744, + "grad_norm": 0.459026038646698, + "learning_rate": 7.280855623770692e-06, + "loss": 0.1167, + "step": 3414 + }, + { + "epoch": 1.1066104990278678, + "grad_norm": 0.479397177696228, + "learning_rate": 7.279298870413906e-06, + "loss": 0.1096, + "step": 3415 + }, + { + "epoch": 1.1069345430978612, + "grad_norm": 0.503694474697113, + "learning_rate": 7.2777418380930035e-06, + "loss": 0.1159, + "step": 3416 + }, + { + "epoch": 1.1072585871678549, + "grad_norm": 0.5161811113357544, + "learning_rate": 7.276184526998548e-06, + "loss": 0.1187, + "step": 3417 + }, + { + "epoch": 1.1075826312378483, + "grad_norm": 0.46992769837379456, + "learning_rate": 7.2746269373211445e-06, + "loss": 0.1165, + "step": 3418 + }, + { + "epoch": 1.107906675307842, + "grad_norm": 0.4351085126399994, + "learning_rate": 7.2730690692514225e-06, + "loss": 0.1007, + "step": 3419 + }, + { + "epoch": 1.1082307193778353, + "grad_norm": 0.4792276918888092, + "learning_rate": 7.271510922980052e-06, + "loss": 0.1049, + "step": 3420 + }, + { + "epoch": 1.108554763447829, + "grad_norm": 0.5022870898246765, + "learning_rate": 7.269952498697734e-06, + "loss": 0.1251, + "step": 3421 + }, + { + "epoch": 1.1088788075178224, + "grad_norm": 0.48089098930358887, + "learning_rate": 7.2683937965952055e-06, + "loss": 0.1124, + "step": 3422 + }, + { + "epoch": 1.1092028515878158, + "grad_norm": 0.49522295594215393, + "learning_rate": 7.266834816863237e-06, + "loss": 0.1177, + "step": 3423 + }, + { + "epoch": 1.1095268956578095, + "grad_norm": 0.45647338032722473, + "learning_rate": 7.26527555969263e-06, + "loss": 0.105, + "step": 3424 + }, + { + "epoch": 1.109850939727803, + "grad_norm": 0.4819112718105316, + "learning_rate": 7.263716025274225e-06, + "loss": 0.1091, + "step": 3425 + }, + { + "epoch": 1.1101749837977966, + "grad_norm": 0.5295591354370117, + "learning_rate": 7.262156213798892e-06, + "loss": 0.1332, + "step": 3426 + }, + { + "epoch": 1.11049902786779, + "grad_norm": 0.438809871673584, + "learning_rate": 7.260596125457538e-06, + "loss": 0.1014, + "step": 3427 + }, + { + "epoch": 1.1108230719377836, + "grad_norm": 0.48159369826316833, + "learning_rate": 7.259035760441103e-06, + "loss": 0.1192, + "step": 3428 + }, + { + "epoch": 1.111147116007777, + "grad_norm": 0.47701340913772583, + "learning_rate": 7.25747511894056e-06, + "loss": 0.113, + "step": 3429 + }, + { + "epoch": 1.1114711600777705, + "grad_norm": 0.47457194328308105, + "learning_rate": 7.255914201146917e-06, + "loss": 0.1134, + "step": 3430 + }, + { + "epoch": 1.1117952041477641, + "grad_norm": 0.5145271420478821, + "learning_rate": 7.254353007251213e-06, + "loss": 0.123, + "step": 3431 + }, + { + "epoch": 1.1121192482177575, + "grad_norm": 0.41805732250213623, + "learning_rate": 7.252791537444527e-06, + "loss": 0.0951, + "step": 3432 + }, + { + "epoch": 1.1124432922877512, + "grad_norm": 0.48133549094200134, + "learning_rate": 7.251229791917964e-06, + "loss": 0.109, + "step": 3433 + }, + { + "epoch": 1.1127673363577446, + "grad_norm": 0.4440436065196991, + "learning_rate": 7.249667770862668e-06, + "loss": 0.109, + "step": 3434 + }, + { + "epoch": 1.1130913804277383, + "grad_norm": 0.4566704332828522, + "learning_rate": 7.248105474469816e-06, + "loss": 0.1154, + "step": 3435 + }, + { + "epoch": 1.1134154244977317, + "grad_norm": 0.4194703996181488, + "learning_rate": 7.2465429029306164e-06, + "loss": 0.1001, + "step": 3436 + }, + { + "epoch": 1.1137394685677253, + "grad_norm": 0.49655959010124207, + "learning_rate": 7.244980056436315e-06, + "loss": 0.1145, + "step": 3437 + }, + { + "epoch": 1.1140635126377187, + "grad_norm": 0.4804117977619171, + "learning_rate": 7.243416935178187e-06, + "loss": 0.1122, + "step": 3438 + }, + { + "epoch": 1.1143875567077122, + "grad_norm": 0.4633403420448303, + "learning_rate": 7.241853539347545e-06, + "loss": 0.1121, + "step": 3439 + }, + { + "epoch": 1.1147116007777058, + "grad_norm": 0.5017057061195374, + "learning_rate": 7.2402898691357315e-06, + "loss": 0.1262, + "step": 3440 + }, + { + "epoch": 1.1150356448476992, + "grad_norm": 0.4831858277320862, + "learning_rate": 7.238725924734125e-06, + "loss": 0.1127, + "step": 3441 + }, + { + "epoch": 1.1153596889176929, + "grad_norm": 0.47501975297927856, + "learning_rate": 7.237161706334139e-06, + "loss": 0.1098, + "step": 3442 + }, + { + "epoch": 1.1156837329876863, + "grad_norm": 0.4766427278518677, + "learning_rate": 7.235597214127218e-06, + "loss": 0.1097, + "step": 3443 + }, + { + "epoch": 1.1160077770576797, + "grad_norm": 0.44049620628356934, + "learning_rate": 7.23403244830484e-06, + "loss": 0.107, + "step": 3444 + }, + { + "epoch": 1.1163318211276734, + "grad_norm": 0.47011467814445496, + "learning_rate": 7.232467409058518e-06, + "loss": 0.1056, + "step": 3445 + }, + { + "epoch": 1.1166558651976668, + "grad_norm": 0.49452170729637146, + "learning_rate": 7.2309020965797945e-06, + "loss": 0.1176, + "step": 3446 + }, + { + "epoch": 1.1169799092676604, + "grad_norm": 0.42975515127182007, + "learning_rate": 7.229336511060253e-06, + "loss": 0.1035, + "step": 3447 + }, + { + "epoch": 1.1173039533376539, + "grad_norm": 0.4905330240726471, + "learning_rate": 7.227770652691504e-06, + "loss": 0.1291, + "step": 3448 + }, + { + "epoch": 1.1176279974076475, + "grad_norm": 0.4783221483230591, + "learning_rate": 7.226204521665195e-06, + "loss": 0.1185, + "step": 3449 + }, + { + "epoch": 1.117952041477641, + "grad_norm": 0.4664043188095093, + "learning_rate": 7.224638118173e-06, + "loss": 0.1117, + "step": 3450 + }, + { + "epoch": 1.1182760855476346, + "grad_norm": 0.472312867641449, + "learning_rate": 7.223071442406639e-06, + "loss": 0.1147, + "step": 3451 + }, + { + "epoch": 1.118600129617628, + "grad_norm": 0.46934542059898376, + "learning_rate": 7.221504494557854e-06, + "loss": 0.1078, + "step": 3452 + }, + { + "epoch": 1.1189241736876214, + "grad_norm": 0.4561583399772644, + "learning_rate": 7.219937274818424e-06, + "loss": 0.1108, + "step": 3453 + }, + { + "epoch": 1.119248217757615, + "grad_norm": 0.4605262875556946, + "learning_rate": 7.218369783380163e-06, + "loss": 0.1109, + "step": 3454 + }, + { + "epoch": 1.1195722618276085, + "grad_norm": 0.490578293800354, + "learning_rate": 7.216802020434915e-06, + "loss": 0.1095, + "step": 3455 + }, + { + "epoch": 1.1198963058976021, + "grad_norm": 0.5183651447296143, + "learning_rate": 7.215233986174561e-06, + "loss": 0.1224, + "step": 3456 + }, + { + "epoch": 1.1202203499675956, + "grad_norm": 0.49729302525520325, + "learning_rate": 7.213665680791012e-06, + "loss": 0.1186, + "step": 3457 + }, + { + "epoch": 1.1205443940375892, + "grad_norm": 0.460462749004364, + "learning_rate": 7.212097104476213e-06, + "loss": 0.1115, + "step": 3458 + }, + { + "epoch": 1.1208684381075826, + "grad_norm": 0.495995432138443, + "learning_rate": 7.210528257422144e-06, + "loss": 0.119, + "step": 3459 + }, + { + "epoch": 1.121192482177576, + "grad_norm": 0.4926115870475769, + "learning_rate": 7.208959139820815e-06, + "loss": 0.1193, + "step": 3460 + }, + { + "epoch": 1.1215165262475697, + "grad_norm": 0.4380375146865845, + "learning_rate": 7.207389751864271e-06, + "loss": 0.1044, + "step": 3461 + }, + { + "epoch": 1.1218405703175631, + "grad_norm": 0.45447611808776855, + "learning_rate": 7.205820093744591e-06, + "loss": 0.1045, + "step": 3462 + }, + { + "epoch": 1.1221646143875568, + "grad_norm": 0.4510899484157562, + "learning_rate": 7.204250165653888e-06, + "loss": 0.1016, + "step": 3463 + }, + { + "epoch": 1.1224886584575502, + "grad_norm": 0.509792149066925, + "learning_rate": 7.2026799677843e-06, + "loss": 0.1183, + "step": 3464 + }, + { + "epoch": 1.1228127025275438, + "grad_norm": 0.4836459755897522, + "learning_rate": 7.20110950032801e-06, + "loss": 0.1133, + "step": 3465 + }, + { + "epoch": 1.1231367465975373, + "grad_norm": 0.49302640557289124, + "learning_rate": 7.1995387634772255e-06, + "loss": 0.1183, + "step": 3466 + }, + { + "epoch": 1.1234607906675307, + "grad_norm": 0.47388386726379395, + "learning_rate": 7.197967757424188e-06, + "loss": 0.1119, + "step": 3467 + }, + { + "epoch": 1.1237848347375243, + "grad_norm": 0.49097520112991333, + "learning_rate": 7.196396482361176e-06, + "loss": 0.1159, + "step": 3468 + }, + { + "epoch": 1.1241088788075178, + "grad_norm": 0.4219222664833069, + "learning_rate": 7.194824938480496e-06, + "loss": 0.0995, + "step": 3469 + }, + { + "epoch": 1.1244329228775114, + "grad_norm": 0.47584277391433716, + "learning_rate": 7.193253125974493e-06, + "loss": 0.1108, + "step": 3470 + }, + { + "epoch": 1.1247569669475048, + "grad_norm": 0.4976772665977478, + "learning_rate": 7.191681045035538e-06, + "loss": 0.1159, + "step": 3471 + }, + { + "epoch": 1.1250810110174985, + "grad_norm": 0.4697946310043335, + "learning_rate": 7.190108695856041e-06, + "loss": 0.1168, + "step": 3472 + }, + { + "epoch": 1.125405055087492, + "grad_norm": 0.44162654876708984, + "learning_rate": 7.1885360786284405e-06, + "loss": 0.1047, + "step": 3473 + }, + { + "epoch": 1.1257290991574855, + "grad_norm": 0.4678237736225128, + "learning_rate": 7.186963193545212e-06, + "loss": 0.1118, + "step": 3474 + }, + { + "epoch": 1.126053143227479, + "grad_norm": 0.44769546389579773, + "learning_rate": 7.185390040798861e-06, + "loss": 0.1098, + "step": 3475 + }, + { + "epoch": 1.1263771872974724, + "grad_norm": 0.46062126755714417, + "learning_rate": 7.183816620581923e-06, + "loss": 0.1003, + "step": 3476 + }, + { + "epoch": 1.126701231367466, + "grad_norm": 0.4380654990673065, + "learning_rate": 7.182242933086974e-06, + "loss": 0.1113, + "step": 3477 + }, + { + "epoch": 1.1270252754374595, + "grad_norm": 0.4305112659931183, + "learning_rate": 7.180668978506613e-06, + "loss": 0.1088, + "step": 3478 + }, + { + "epoch": 1.127349319507453, + "grad_norm": 0.4602273106575012, + "learning_rate": 7.1790947570334815e-06, + "loss": 0.1047, + "step": 3479 + }, + { + "epoch": 1.1276733635774465, + "grad_norm": 0.4375208914279938, + "learning_rate": 7.1775202688602455e-06, + "loss": 0.1094, + "step": 3480 + }, + { + "epoch": 1.12799740764744, + "grad_norm": 0.48319289088249207, + "learning_rate": 7.17594551417961e-06, + "loss": 0.1177, + "step": 3481 + }, + { + "epoch": 1.1283214517174336, + "grad_norm": 0.49129560589790344, + "learning_rate": 7.174370493184308e-06, + "loss": 0.1195, + "step": 3482 + }, + { + "epoch": 1.128645495787427, + "grad_norm": 0.767615020275116, + "learning_rate": 7.172795206067107e-06, + "loss": 0.1144, + "step": 3483 + }, + { + "epoch": 1.1289695398574207, + "grad_norm": 0.4652465879917145, + "learning_rate": 7.171219653020807e-06, + "loss": 0.1144, + "step": 3484 + }, + { + "epoch": 1.129293583927414, + "grad_norm": 0.45197775959968567, + "learning_rate": 7.16964383423824e-06, + "loss": 0.1013, + "step": 3485 + }, + { + "epoch": 1.1296176279974077, + "grad_norm": 0.45165348052978516, + "learning_rate": 7.168067749912273e-06, + "loss": 0.1131, + "step": 3486 + }, + { + "epoch": 1.1299416720674011, + "grad_norm": 0.47169649600982666, + "learning_rate": 7.1664914002358e-06, + "loss": 0.1198, + "step": 3487 + }, + { + "epoch": 1.1302657161373948, + "grad_norm": 0.44688108563423157, + "learning_rate": 7.164914785401756e-06, + "loss": 0.107, + "step": 3488 + }, + { + "epoch": 1.1305897602073882, + "grad_norm": 0.4839881360530853, + "learning_rate": 7.163337905603097e-06, + "loss": 0.1247, + "step": 3489 + }, + { + "epoch": 1.1309138042773816, + "grad_norm": 0.48105567693710327, + "learning_rate": 7.161760761032822e-06, + "loss": 0.1136, + "step": 3490 + }, + { + "epoch": 1.1312378483473753, + "grad_norm": 0.47501346468925476, + "learning_rate": 7.160183351883957e-06, + "loss": 0.1111, + "step": 3491 + }, + { + "epoch": 1.1315618924173687, + "grad_norm": 0.513714075088501, + "learning_rate": 7.158605678349562e-06, + "loss": 0.1227, + "step": 3492 + }, + { + "epoch": 1.1318859364873624, + "grad_norm": 0.4716878831386566, + "learning_rate": 7.15702774062273e-06, + "loss": 0.1235, + "step": 3493 + }, + { + "epoch": 1.1322099805573558, + "grad_norm": 0.47698792815208435, + "learning_rate": 7.155449538896584e-06, + "loss": 0.1162, + "step": 3494 + }, + { + "epoch": 1.1325340246273492, + "grad_norm": 0.5115220546722412, + "learning_rate": 7.15387107336428e-06, + "loss": 0.1233, + "step": 3495 + }, + { + "epoch": 1.1328580686973428, + "grad_norm": 0.45470064878463745, + "learning_rate": 7.1522923442190074e-06, + "loss": 0.1107, + "step": 3496 + }, + { + "epoch": 1.1331821127673363, + "grad_norm": 0.4847749173641205, + "learning_rate": 7.15071335165399e-06, + "loss": 0.1266, + "step": 3497 + }, + { + "epoch": 1.13350615683733, + "grad_norm": 0.47126853466033936, + "learning_rate": 7.149134095862476e-06, + "loss": 0.1163, + "step": 3498 + }, + { + "epoch": 1.1338302009073233, + "grad_norm": 0.4674606919288635, + "learning_rate": 7.1475545770377555e-06, + "loss": 0.1203, + "step": 3499 + }, + { + "epoch": 1.134154244977317, + "grad_norm": 0.4372369349002838, + "learning_rate": 7.145974795373145e-06, + "loss": 0.1114, + "step": 3500 + }, + { + "epoch": 1.1344782890473104, + "grad_norm": 0.4782916009426117, + "learning_rate": 7.1443947510619925e-06, + "loss": 0.1193, + "step": 3501 + }, + { + "epoch": 1.134802333117304, + "grad_norm": 0.48257872462272644, + "learning_rate": 7.142814444297683e-06, + "loss": 0.1158, + "step": 3502 + }, + { + "epoch": 1.1351263771872975, + "grad_norm": 0.4267340302467346, + "learning_rate": 7.1412338752736286e-06, + "loss": 0.1069, + "step": 3503 + }, + { + "epoch": 1.135450421257291, + "grad_norm": 0.4908585548400879, + "learning_rate": 7.1396530441832775e-06, + "loss": 0.1215, + "step": 3504 + }, + { + "epoch": 1.1357744653272845, + "grad_norm": 0.48388153314590454, + "learning_rate": 7.1380719512201065e-06, + "loss": 0.1204, + "step": 3505 + }, + { + "epoch": 1.136098509397278, + "grad_norm": 0.4557008445262909, + "learning_rate": 7.136490596577629e-06, + "loss": 0.1185, + "step": 3506 + }, + { + "epoch": 1.1364225534672716, + "grad_norm": 0.4656080901622772, + "learning_rate": 7.134908980449383e-06, + "loss": 0.1215, + "step": 3507 + }, + { + "epoch": 1.136746597537265, + "grad_norm": 0.4793829619884491, + "learning_rate": 7.133327103028946e-06, + "loss": 0.1217, + "step": 3508 + }, + { + "epoch": 1.1370706416072587, + "grad_norm": 0.44613632559776306, + "learning_rate": 7.131744964509925e-06, + "loss": 0.1105, + "step": 3509 + }, + { + "epoch": 1.137394685677252, + "grad_norm": 0.43979090452194214, + "learning_rate": 7.130162565085955e-06, + "loss": 0.1045, + "step": 3510 + }, + { + "epoch": 1.1377187297472457, + "grad_norm": 0.4457186162471771, + "learning_rate": 7.1285799049507095e-06, + "loss": 0.106, + "step": 3511 + }, + { + "epoch": 1.1380427738172392, + "grad_norm": 0.45643284916877747, + "learning_rate": 7.126996984297891e-06, + "loss": 0.1118, + "step": 3512 + }, + { + "epoch": 1.1383668178872326, + "grad_norm": 0.4419260323047638, + "learning_rate": 7.125413803321232e-06, + "loss": 0.107, + "step": 3513 + }, + { + "epoch": 1.1386908619572262, + "grad_norm": 0.4509630799293518, + "learning_rate": 7.123830362214498e-06, + "loss": 0.1141, + "step": 3514 + }, + { + "epoch": 1.1390149060272197, + "grad_norm": 0.44466331601142883, + "learning_rate": 7.122246661171488e-06, + "loss": 0.0995, + "step": 3515 + }, + { + "epoch": 1.1393389500972133, + "grad_norm": 0.4958738088607788, + "learning_rate": 7.120662700386032e-06, + "loss": 0.1174, + "step": 3516 + }, + { + "epoch": 1.1396629941672067, + "grad_norm": 0.4959253966808319, + "learning_rate": 7.119078480051993e-06, + "loss": 0.1139, + "step": 3517 + }, + { + "epoch": 1.1399870382372002, + "grad_norm": 0.4844566583633423, + "learning_rate": 7.11749400036326e-06, + "loss": 0.1157, + "step": 3518 + }, + { + "epoch": 1.1403110823071938, + "grad_norm": 0.46060386300086975, + "learning_rate": 7.1159092615137614e-06, + "loss": 0.1074, + "step": 3519 + }, + { + "epoch": 1.1406351263771872, + "grad_norm": 0.49883654713630676, + "learning_rate": 7.114324263697452e-06, + "loss": 0.1204, + "step": 3520 + }, + { + "epoch": 1.1409591704471809, + "grad_norm": 0.47432583570480347, + "learning_rate": 7.112739007108321e-06, + "loss": 0.1183, + "step": 3521 + }, + { + "epoch": 1.1412832145171743, + "grad_norm": 0.4798056483268738, + "learning_rate": 7.111153491940389e-06, + "loss": 0.1055, + "step": 3522 + }, + { + "epoch": 1.141607258587168, + "grad_norm": 0.4634266793727875, + "learning_rate": 7.109567718387706e-06, + "loss": 0.111, + "step": 3523 + }, + { + "epoch": 1.1419313026571614, + "grad_norm": 0.4673268795013428, + "learning_rate": 7.1079816866443585e-06, + "loss": 0.113, + "step": 3524 + }, + { + "epoch": 1.142255346727155, + "grad_norm": 0.4723830223083496, + "learning_rate": 7.106395396904458e-06, + "loss": 0.1117, + "step": 3525 + }, + { + "epoch": 1.1425793907971484, + "grad_norm": 0.45182153582572937, + "learning_rate": 7.104808849362153e-06, + "loss": 0.11, + "step": 3526 + }, + { + "epoch": 1.1429034348671419, + "grad_norm": 0.5160151124000549, + "learning_rate": 7.103222044211619e-06, + "loss": 0.1198, + "step": 3527 + }, + { + "epoch": 1.1432274789371355, + "grad_norm": 0.4806891679763794, + "learning_rate": 7.10163498164707e-06, + "loss": 0.1147, + "step": 3528 + }, + { + "epoch": 1.143551523007129, + "grad_norm": 0.464277058839798, + "learning_rate": 7.1000476618627435e-06, + "loss": 0.1066, + "step": 3529 + }, + { + "epoch": 1.1438755670771226, + "grad_norm": 0.4657345116138458, + "learning_rate": 7.098460085052915e-06, + "loss": 0.1089, + "step": 3530 + }, + { + "epoch": 1.144199611147116, + "grad_norm": 0.4736658036708832, + "learning_rate": 7.096872251411885e-06, + "loss": 0.1138, + "step": 3531 + }, + { + "epoch": 1.1445236552171094, + "grad_norm": 0.5042957663536072, + "learning_rate": 7.0952841611339906e-06, + "loss": 0.1247, + "step": 3532 + }, + { + "epoch": 1.144847699287103, + "grad_norm": 0.47397053241729736, + "learning_rate": 7.093695814413599e-06, + "loss": 0.1165, + "step": 3533 + }, + { + "epoch": 1.1451717433570965, + "grad_norm": 0.4877525568008423, + "learning_rate": 7.0921072114451084e-06, + "loss": 0.1149, + "step": 3534 + }, + { + "epoch": 1.1454957874270901, + "grad_norm": 0.4706099331378937, + "learning_rate": 7.090518352422948e-06, + "loss": 0.109, + "step": 3535 + }, + { + "epoch": 1.1458198314970836, + "grad_norm": 0.4718784689903259, + "learning_rate": 7.088929237541579e-06, + "loss": 0.1044, + "step": 3536 + }, + { + "epoch": 1.1461438755670772, + "grad_norm": 0.46416759490966797, + "learning_rate": 7.087339866995495e-06, + "loss": 0.117, + "step": 3537 + }, + { + "epoch": 1.1464679196370706, + "grad_norm": 0.49240201711654663, + "learning_rate": 7.0857502409792166e-06, + "loss": 0.1254, + "step": 3538 + }, + { + "epoch": 1.1467919637070643, + "grad_norm": 0.45724916458129883, + "learning_rate": 7.084160359687302e-06, + "loss": 0.1103, + "step": 3539 + }, + { + "epoch": 1.1471160077770577, + "grad_norm": 0.4559352993965149, + "learning_rate": 7.082570223314335e-06, + "loss": 0.1116, + "step": 3540 + }, + { + "epoch": 1.1474400518470511, + "grad_norm": 0.4867151081562042, + "learning_rate": 7.080979832054933e-06, + "loss": 0.1095, + "step": 3541 + }, + { + "epoch": 1.1477640959170448, + "grad_norm": 0.4994567632675171, + "learning_rate": 7.0793891861037445e-06, + "loss": 0.1159, + "step": 3542 + }, + { + "epoch": 1.1480881399870382, + "grad_norm": 0.4985491931438446, + "learning_rate": 7.077798285655452e-06, + "loss": 0.125, + "step": 3543 + }, + { + "epoch": 1.1484121840570318, + "grad_norm": 0.46385741233825684, + "learning_rate": 7.076207130904762e-06, + "loss": 0.105, + "step": 3544 + }, + { + "epoch": 1.1487362281270252, + "grad_norm": 0.46539029479026794, + "learning_rate": 7.074615722046418e-06, + "loss": 0.1147, + "step": 3545 + }, + { + "epoch": 1.1490602721970187, + "grad_norm": 0.45837581157684326, + "learning_rate": 7.073024059275194e-06, + "loss": 0.1118, + "step": 3546 + }, + { + "epoch": 1.1493843162670123, + "grad_norm": 0.42826589941978455, + "learning_rate": 7.071432142785895e-06, + "loss": 0.1077, + "step": 3547 + }, + { + "epoch": 1.1497083603370057, + "grad_norm": 0.49138933420181274, + "learning_rate": 7.069839972773352e-06, + "loss": 0.1179, + "step": 3548 + }, + { + "epoch": 1.1500324044069994, + "grad_norm": 0.49313196539878845, + "learning_rate": 7.0682475494324365e-06, + "loss": 0.1172, + "step": 3549 + }, + { + "epoch": 1.1503564484769928, + "grad_norm": 0.45699644088745117, + "learning_rate": 7.066654872958042e-06, + "loss": 0.1172, + "step": 3550 + }, + { + "epoch": 1.1506804925469865, + "grad_norm": 0.46840643882751465, + "learning_rate": 7.0650619435451e-06, + "loss": 0.116, + "step": 3551 + }, + { + "epoch": 1.1510045366169799, + "grad_norm": 0.5004029870033264, + "learning_rate": 7.063468761388564e-06, + "loss": 0.1254, + "step": 3552 + }, + { + "epoch": 1.1513285806869735, + "grad_norm": 0.528785765171051, + "learning_rate": 7.061875326683429e-06, + "loss": 0.1204, + "step": 3553 + }, + { + "epoch": 1.151652624756967, + "grad_norm": 0.5159833431243896, + "learning_rate": 7.060281639624714e-06, + "loss": 0.1272, + "step": 3554 + }, + { + "epoch": 1.1519766688269604, + "grad_norm": 0.4606555998325348, + "learning_rate": 7.0586877004074725e-06, + "loss": 0.1124, + "step": 3555 + }, + { + "epoch": 1.152300712896954, + "grad_norm": 0.45181626081466675, + "learning_rate": 7.057093509226785e-06, + "loss": 0.1106, + "step": 3556 + }, + { + "epoch": 1.1526247569669474, + "grad_norm": 0.4718472957611084, + "learning_rate": 7.055499066277767e-06, + "loss": 0.1121, + "step": 3557 + }, + { + "epoch": 1.152948801036941, + "grad_norm": 0.45812535285949707, + "learning_rate": 7.053904371755562e-06, + "loss": 0.111, + "step": 3558 + }, + { + "epoch": 1.1532728451069345, + "grad_norm": 0.5077002048492432, + "learning_rate": 7.052309425855344e-06, + "loss": 0.1267, + "step": 3559 + }, + { + "epoch": 1.1535968891769282, + "grad_norm": 0.4858371317386627, + "learning_rate": 7.050714228772322e-06, + "loss": 0.1177, + "step": 3560 + }, + { + "epoch": 1.1539209332469216, + "grad_norm": 0.45799070596694946, + "learning_rate": 7.04911878070173e-06, + "loss": 0.1075, + "step": 3561 + }, + { + "epoch": 1.1542449773169152, + "grad_norm": 0.48219260573387146, + "learning_rate": 7.047523081838836e-06, + "loss": 0.1229, + "step": 3562 + }, + { + "epoch": 1.1545690213869086, + "grad_norm": 0.43742167949676514, + "learning_rate": 7.045927132378939e-06, + "loss": 0.1022, + "step": 3563 + }, + { + "epoch": 1.154893065456902, + "grad_norm": 0.4254918396472931, + "learning_rate": 7.044330932517367e-06, + "loss": 0.0991, + "step": 3564 + }, + { + "epoch": 1.1552171095268957, + "grad_norm": 0.45003291964530945, + "learning_rate": 7.042734482449478e-06, + "loss": 0.11, + "step": 3565 + }, + { + "epoch": 1.1555411535968891, + "grad_norm": 0.43879011273384094, + "learning_rate": 7.041137782370665e-06, + "loss": 0.1061, + "step": 3566 + }, + { + "epoch": 1.1558651976668828, + "grad_norm": 0.42425820231437683, + "learning_rate": 7.0395408324763485e-06, + "loss": 0.0993, + "step": 3567 + }, + { + "epoch": 1.1561892417368762, + "grad_norm": 0.4832261800765991, + "learning_rate": 7.037943632961977e-06, + "loss": 0.1188, + "step": 3568 + }, + { + "epoch": 1.1565132858068696, + "grad_norm": 0.5043490529060364, + "learning_rate": 7.036346184023033e-06, + "loss": 0.1264, + "step": 3569 + }, + { + "epoch": 1.1568373298768633, + "grad_norm": 0.4804173409938812, + "learning_rate": 7.034748485855028e-06, + "loss": 0.1139, + "step": 3570 + }, + { + "epoch": 1.1571613739468567, + "grad_norm": 0.4613834023475647, + "learning_rate": 7.03315053865351e-06, + "loss": 0.1094, + "step": 3571 + }, + { + "epoch": 1.1574854180168503, + "grad_norm": 0.46034568548202515, + "learning_rate": 7.031552342614046e-06, + "loss": 0.1118, + "step": 3572 + }, + { + "epoch": 1.1578094620868438, + "grad_norm": 0.41124221682548523, + "learning_rate": 7.029953897932243e-06, + "loss": 0.0983, + "step": 3573 + }, + { + "epoch": 1.1581335061568374, + "grad_norm": 0.43085286021232605, + "learning_rate": 7.028355204803735e-06, + "loss": 0.1025, + "step": 3574 + }, + { + "epoch": 1.1584575502268308, + "grad_norm": 0.4392501413822174, + "learning_rate": 7.026756263424184e-06, + "loss": 0.1021, + "step": 3575 + }, + { + "epoch": 1.1587815942968245, + "grad_norm": 0.4602506756782532, + "learning_rate": 7.0251570739892884e-06, + "loss": 0.1139, + "step": 3576 + }, + { + "epoch": 1.159105638366818, + "grad_norm": 0.5223574638366699, + "learning_rate": 7.023557636694771e-06, + "loss": 0.1237, + "step": 3577 + }, + { + "epoch": 1.1594296824368113, + "grad_norm": 0.42616182565689087, + "learning_rate": 7.021957951736389e-06, + "loss": 0.1033, + "step": 3578 + }, + { + "epoch": 1.159753726506805, + "grad_norm": 0.4668428599834442, + "learning_rate": 7.0203580193099285e-06, + "loss": 0.1081, + "step": 3579 + }, + { + "epoch": 1.1600777705767984, + "grad_norm": 0.4539085328578949, + "learning_rate": 7.018757839611204e-06, + "loss": 0.1078, + "step": 3580 + }, + { + "epoch": 1.160401814646792, + "grad_norm": 0.46340179443359375, + "learning_rate": 7.0171574128360635e-06, + "loss": 0.1133, + "step": 3581 + }, + { + "epoch": 1.1607258587167855, + "grad_norm": 0.4713670611381531, + "learning_rate": 7.015556739180383e-06, + "loss": 0.1156, + "step": 3582 + }, + { + "epoch": 1.1610499027867789, + "grad_norm": 0.4762658476829529, + "learning_rate": 7.01395581884007e-06, + "loss": 0.1081, + "step": 3583 + }, + { + "epoch": 1.1613739468567725, + "grad_norm": 0.4581030011177063, + "learning_rate": 7.012354652011062e-06, + "loss": 0.1085, + "step": 3584 + }, + { + "epoch": 1.161697990926766, + "grad_norm": 0.4777476489543915, + "learning_rate": 7.010753238889325e-06, + "loss": 0.1276, + "step": 3585 + }, + { + "epoch": 1.1620220349967596, + "grad_norm": 0.4817984700202942, + "learning_rate": 7.009151579670856e-06, + "loss": 0.1115, + "step": 3586 + }, + { + "epoch": 1.162346079066753, + "grad_norm": 0.5016462206840515, + "learning_rate": 7.007549674551686e-06, + "loss": 0.1153, + "step": 3587 + }, + { + "epoch": 1.1626701231367467, + "grad_norm": 0.462336927652359, + "learning_rate": 7.005947523727869e-06, + "loss": 0.111, + "step": 3588 + }, + { + "epoch": 1.16299416720674, + "grad_norm": 0.45126065611839294, + "learning_rate": 7.004345127395493e-06, + "loss": 0.098, + "step": 3589 + }, + { + "epoch": 1.1633182112767337, + "grad_norm": 0.4864955246448517, + "learning_rate": 7.0027424857506784e-06, + "loss": 0.1118, + "step": 3590 + }, + { + "epoch": 1.1636422553467272, + "grad_norm": 0.5310540795326233, + "learning_rate": 7.001139598989572e-06, + "loss": 0.124, + "step": 3591 + }, + { + "epoch": 1.1639662994167206, + "grad_norm": 0.4483162462711334, + "learning_rate": 6.999536467308351e-06, + "loss": 0.1049, + "step": 3592 + }, + { + "epoch": 1.1642903434867142, + "grad_norm": 0.4710056483745575, + "learning_rate": 6.997933090903224e-06, + "loss": 0.1131, + "step": 3593 + }, + { + "epoch": 1.1646143875567077, + "grad_norm": 0.4988642632961273, + "learning_rate": 6.996329469970427e-06, + "loss": 0.1224, + "step": 3594 + }, + { + "epoch": 1.1649384316267013, + "grad_norm": 0.4584542512893677, + "learning_rate": 6.994725604706229e-06, + "loss": 0.1102, + "step": 3595 + }, + { + "epoch": 1.1652624756966947, + "grad_norm": 0.4387478828430176, + "learning_rate": 6.993121495306928e-06, + "loss": 0.0976, + "step": 3596 + }, + { + "epoch": 1.1655865197666881, + "grad_norm": 0.4995095431804657, + "learning_rate": 6.991517141968851e-06, + "loss": 0.1174, + "step": 3597 + }, + { + "epoch": 1.1659105638366818, + "grad_norm": 0.45613548159599304, + "learning_rate": 6.989912544888354e-06, + "loss": 0.107, + "step": 3598 + }, + { + "epoch": 1.1662346079066752, + "grad_norm": 0.46213752031326294, + "learning_rate": 6.988307704261826e-06, + "loss": 0.11, + "step": 3599 + }, + { + "epoch": 1.1665586519766689, + "grad_norm": 0.4745365381240845, + "learning_rate": 6.986702620285683e-06, + "loss": 0.1132, + "step": 3600 + }, + { + "epoch": 1.1668826960466623, + "grad_norm": 0.42137643694877625, + "learning_rate": 6.985097293156373e-06, + "loss": 0.0969, + "step": 3601 + }, + { + "epoch": 1.167206740116656, + "grad_norm": 0.49307501316070557, + "learning_rate": 6.9834917230703705e-06, + "loss": 0.1141, + "step": 3602 + }, + { + "epoch": 1.1675307841866494, + "grad_norm": 0.4686894714832306, + "learning_rate": 6.981885910224184e-06, + "loss": 0.1186, + "step": 3603 + }, + { + "epoch": 1.167854828256643, + "grad_norm": 0.4602605104446411, + "learning_rate": 6.980279854814348e-06, + "loss": 0.1129, + "step": 3604 + }, + { + "epoch": 1.1681788723266364, + "grad_norm": 0.547022819519043, + "learning_rate": 6.978673557037427e-06, + "loss": 0.1353, + "step": 3605 + }, + { + "epoch": 1.1685029163966298, + "grad_norm": 0.4729814827442169, + "learning_rate": 6.977067017090019e-06, + "loss": 0.1147, + "step": 3606 + }, + { + "epoch": 1.1688269604666235, + "grad_norm": 0.48199722170829773, + "learning_rate": 6.975460235168747e-06, + "loss": 0.1119, + "step": 3607 + }, + { + "epoch": 1.169151004536617, + "grad_norm": 0.48165759444236755, + "learning_rate": 6.973853211470266e-06, + "loss": 0.111, + "step": 3608 + }, + { + "epoch": 1.1694750486066106, + "grad_norm": 0.5283550024032593, + "learning_rate": 6.972245946191262e-06, + "loss": 0.1238, + "step": 3609 + }, + { + "epoch": 1.169799092676604, + "grad_norm": 0.4986545145511627, + "learning_rate": 6.970638439528445e-06, + "loss": 0.1251, + "step": 3610 + }, + { + "epoch": 1.1701231367465976, + "grad_norm": 0.4904198944568634, + "learning_rate": 6.96903069167856e-06, + "loss": 0.1149, + "step": 3611 + }, + { + "epoch": 1.170447180816591, + "grad_norm": 0.49144813418388367, + "learning_rate": 6.967422702838381e-06, + "loss": 0.1258, + "step": 3612 + }, + { + "epoch": 1.1707712248865847, + "grad_norm": 0.5225583910942078, + "learning_rate": 6.965814473204708e-06, + "loss": 0.1329, + "step": 3613 + }, + { + "epoch": 1.1710952689565781, + "grad_norm": 0.4927222430706024, + "learning_rate": 6.964206002974377e-06, + "loss": 0.1173, + "step": 3614 + }, + { + "epoch": 1.1714193130265715, + "grad_norm": 0.48234739899635315, + "learning_rate": 6.962597292344244e-06, + "loss": 0.1142, + "step": 3615 + }, + { + "epoch": 1.1717433570965652, + "grad_norm": 0.48802363872528076, + "learning_rate": 6.960988341511204e-06, + "loss": 0.1123, + "step": 3616 + }, + { + "epoch": 1.1720674011665586, + "grad_norm": 0.45166656374931335, + "learning_rate": 6.959379150672172e-06, + "loss": 0.1048, + "step": 3617 + }, + { + "epoch": 1.1723914452365523, + "grad_norm": 0.4444221258163452, + "learning_rate": 6.9577697200241014e-06, + "loss": 0.1023, + "step": 3618 + }, + { + "epoch": 1.1727154893065457, + "grad_norm": 0.49823105335235596, + "learning_rate": 6.956160049763969e-06, + "loss": 0.1055, + "step": 3619 + }, + { + "epoch": 1.173039533376539, + "grad_norm": 0.4852776825428009, + "learning_rate": 6.9545501400887846e-06, + "loss": 0.115, + "step": 3620 + }, + { + "epoch": 1.1733635774465327, + "grad_norm": 0.48619014024734497, + "learning_rate": 6.952939991195584e-06, + "loss": 0.1169, + "step": 3621 + }, + { + "epoch": 1.1736876215165262, + "grad_norm": 0.4564978778362274, + "learning_rate": 6.951329603281435e-06, + "loss": 0.104, + "step": 3622 + }, + { + "epoch": 1.1740116655865198, + "grad_norm": 0.47127726674079895, + "learning_rate": 6.9497189765434326e-06, + "loss": 0.109, + "step": 3623 + }, + { + "epoch": 1.1743357096565132, + "grad_norm": 0.487866073846817, + "learning_rate": 6.948108111178702e-06, + "loss": 0.1156, + "step": 3624 + }, + { + "epoch": 1.1746597537265069, + "grad_norm": 0.4357970952987671, + "learning_rate": 6.946497007384398e-06, + "loss": 0.1012, + "step": 3625 + }, + { + "epoch": 1.1749837977965003, + "grad_norm": 0.5288655757904053, + "learning_rate": 6.944885665357704e-06, + "loss": 0.1231, + "step": 3626 + }, + { + "epoch": 1.175307841866494, + "grad_norm": 0.49479320645332336, + "learning_rate": 6.943274085295832e-06, + "loss": 0.118, + "step": 3627 + }, + { + "epoch": 1.1756318859364874, + "grad_norm": 0.52422696352005, + "learning_rate": 6.941662267396026e-06, + "loss": 0.126, + "step": 3628 + }, + { + "epoch": 1.1759559300064808, + "grad_norm": 0.47797513008117676, + "learning_rate": 6.940050211855554e-06, + "loss": 0.1113, + "step": 3629 + }, + { + "epoch": 1.1762799740764744, + "grad_norm": 0.4675103425979614, + "learning_rate": 6.9384379188717155e-06, + "loss": 0.1052, + "step": 3630 + }, + { + "epoch": 1.1766040181464679, + "grad_norm": 0.4948538839817047, + "learning_rate": 6.936825388641842e-06, + "loss": 0.1209, + "step": 3631 + }, + { + "epoch": 1.1769280622164615, + "grad_norm": 0.4761638045310974, + "learning_rate": 6.935212621363292e-06, + "loss": 0.1129, + "step": 3632 + }, + { + "epoch": 1.177252106286455, + "grad_norm": 0.4795760214328766, + "learning_rate": 6.9335996172334505e-06, + "loss": 0.1083, + "step": 3633 + }, + { + "epoch": 1.1775761503564484, + "grad_norm": 0.45007088780403137, + "learning_rate": 6.931986376449736e-06, + "loss": 0.1116, + "step": 3634 + }, + { + "epoch": 1.177900194426442, + "grad_norm": 0.44401949644088745, + "learning_rate": 6.9303728992095905e-06, + "loss": 0.1059, + "step": 3635 + }, + { + "epoch": 1.1782242384964354, + "grad_norm": 0.4538906216621399, + "learning_rate": 6.928759185710492e-06, + "loss": 0.1122, + "step": 3636 + }, + { + "epoch": 1.178548282566429, + "grad_norm": 0.4766751229763031, + "learning_rate": 6.9271452361499396e-06, + "loss": 0.1173, + "step": 3637 + }, + { + "epoch": 1.1788723266364225, + "grad_norm": 0.4567437469959259, + "learning_rate": 6.925531050725465e-06, + "loss": 0.114, + "step": 3638 + }, + { + "epoch": 1.1791963707064161, + "grad_norm": 0.48373913764953613, + "learning_rate": 6.923916629634632e-06, + "loss": 0.1203, + "step": 3639 + }, + { + "epoch": 1.1795204147764096, + "grad_norm": 0.4734843671321869, + "learning_rate": 6.9223019730750285e-06, + "loss": 0.1188, + "step": 3640 + }, + { + "epoch": 1.1798444588464032, + "grad_norm": 0.4800986647605896, + "learning_rate": 6.920687081244271e-06, + "loss": 0.1188, + "step": 3641 + }, + { + "epoch": 1.1801685029163966, + "grad_norm": 0.4790305197238922, + "learning_rate": 6.919071954340011e-06, + "loss": 0.1167, + "step": 3642 + }, + { + "epoch": 1.18049254698639, + "grad_norm": 0.48875489830970764, + "learning_rate": 6.9174565925599205e-06, + "loss": 0.1199, + "step": 3643 + }, + { + "epoch": 1.1808165910563837, + "grad_norm": 0.44894662499427795, + "learning_rate": 6.915840996101705e-06, + "loss": 0.1008, + "step": 3644 + }, + { + "epoch": 1.1811406351263771, + "grad_norm": 0.45716553926467896, + "learning_rate": 6.9142251651631e-06, + "loss": 0.1105, + "step": 3645 + }, + { + "epoch": 1.1814646791963708, + "grad_norm": 0.47267618775367737, + "learning_rate": 6.912609099941865e-06, + "loss": 0.1153, + "step": 3646 + }, + { + "epoch": 1.1817887232663642, + "grad_norm": 0.5003800988197327, + "learning_rate": 6.910992800635792e-06, + "loss": 0.1265, + "step": 3647 + }, + { + "epoch": 1.1821127673363578, + "grad_norm": 0.481633722782135, + "learning_rate": 6.9093762674427e-06, + "loss": 0.1159, + "step": 3648 + }, + { + "epoch": 1.1824368114063513, + "grad_norm": 0.49060332775115967, + "learning_rate": 6.907759500560436e-06, + "loss": 0.1116, + "step": 3649 + }, + { + "epoch": 1.1827608554763447, + "grad_norm": 0.4521649479866028, + "learning_rate": 6.906142500186879e-06, + "loss": 0.1038, + "step": 3650 + }, + { + "epoch": 1.1830848995463383, + "grad_norm": 0.44412970542907715, + "learning_rate": 6.904525266519931e-06, + "loss": 0.1127, + "step": 3651 + }, + { + "epoch": 1.1834089436163318, + "grad_norm": 0.48646020889282227, + "learning_rate": 6.90290779975753e-06, + "loss": 0.1235, + "step": 3652 + }, + { + "epoch": 1.1837329876863254, + "grad_norm": 0.4747726619243622, + "learning_rate": 6.901290100097634e-06, + "loss": 0.1188, + "step": 3653 + }, + { + "epoch": 1.1840570317563188, + "grad_norm": 0.45250165462493896, + "learning_rate": 6.899672167738236e-06, + "loss": 0.1147, + "step": 3654 + }, + { + "epoch": 1.1843810758263125, + "grad_norm": 0.4335748553276062, + "learning_rate": 6.898054002877356e-06, + "loss": 0.105, + "step": 3655 + }, + { + "epoch": 1.184705119896306, + "grad_norm": 0.47117671370506287, + "learning_rate": 6.89643560571304e-06, + "loss": 0.1113, + "step": 3656 + }, + { + "epoch": 1.1850291639662993, + "grad_norm": 0.5246734619140625, + "learning_rate": 6.894816976443365e-06, + "loss": 0.1219, + "step": 3657 + }, + { + "epoch": 1.185353208036293, + "grad_norm": 0.4772835075855255, + "learning_rate": 6.8931981152664354e-06, + "loss": 0.1297, + "step": 3658 + }, + { + "epoch": 1.1856772521062864, + "grad_norm": 0.4772994816303253, + "learning_rate": 6.891579022380384e-06, + "loss": 0.1206, + "step": 3659 + }, + { + "epoch": 1.18600129617628, + "grad_norm": 0.5017443299293518, + "learning_rate": 6.889959697983371e-06, + "loss": 0.1171, + "step": 3660 + }, + { + "epoch": 1.1863253402462735, + "grad_norm": 0.49202096462249756, + "learning_rate": 6.888340142273588e-06, + "loss": 0.1229, + "step": 3661 + }, + { + "epoch": 1.186649384316267, + "grad_norm": 0.5096995234489441, + "learning_rate": 6.886720355449253e-06, + "loss": 0.1266, + "step": 3662 + }, + { + "epoch": 1.1869734283862605, + "grad_norm": 0.45919767022132874, + "learning_rate": 6.88510033770861e-06, + "loss": 0.1153, + "step": 3663 + }, + { + "epoch": 1.1872974724562542, + "grad_norm": 0.45726320147514343, + "learning_rate": 6.883480089249937e-06, + "loss": 0.1128, + "step": 3664 + }, + { + "epoch": 1.1876215165262476, + "grad_norm": 0.4446745216846466, + "learning_rate": 6.881859610271532e-06, + "loss": 0.1037, + "step": 3665 + }, + { + "epoch": 1.187945560596241, + "grad_norm": 0.48562106490135193, + "learning_rate": 6.88023890097173e-06, + "loss": 0.1249, + "step": 3666 + }, + { + "epoch": 1.1882696046662347, + "grad_norm": 0.487153559923172, + "learning_rate": 6.878617961548888e-06, + "loss": 0.1117, + "step": 3667 + }, + { + "epoch": 1.188593648736228, + "grad_norm": 0.4806040823459625, + "learning_rate": 6.876996792201394e-06, + "loss": 0.1239, + "step": 3668 + }, + { + "epoch": 1.1889176928062217, + "grad_norm": 0.44205746054649353, + "learning_rate": 6.875375393127663e-06, + "loss": 0.0955, + "step": 3669 + }, + { + "epoch": 1.1892417368762151, + "grad_norm": 0.49507611989974976, + "learning_rate": 6.873753764526141e-06, + "loss": 0.1193, + "step": 3670 + }, + { + "epoch": 1.1895657809462086, + "grad_norm": 0.44034644961357117, + "learning_rate": 6.872131906595295e-06, + "loss": 0.104, + "step": 3671 + }, + { + "epoch": 1.1898898250162022, + "grad_norm": 0.5001266598701477, + "learning_rate": 6.870509819533628e-06, + "loss": 0.119, + "step": 3672 + }, + { + "epoch": 1.1902138690861956, + "grad_norm": 0.4320615530014038, + "learning_rate": 6.868887503539667e-06, + "loss": 0.1052, + "step": 3673 + }, + { + "epoch": 1.1905379131561893, + "grad_norm": 0.44837626814842224, + "learning_rate": 6.867264958811968e-06, + "loss": 0.1115, + "step": 3674 + }, + { + "epoch": 1.1908619572261827, + "grad_norm": 0.43587976694107056, + "learning_rate": 6.865642185549115e-06, + "loss": 0.1012, + "step": 3675 + }, + { + "epoch": 1.1911860012961764, + "grad_norm": 0.515214204788208, + "learning_rate": 6.864019183949719e-06, + "loss": 0.1261, + "step": 3676 + }, + { + "epoch": 1.1915100453661698, + "grad_norm": 0.4589107930660248, + "learning_rate": 6.86239595421242e-06, + "loss": 0.1051, + "step": 3677 + }, + { + "epoch": 1.1918340894361634, + "grad_norm": 0.503598690032959, + "learning_rate": 6.860772496535887e-06, + "loss": 0.1284, + "step": 3678 + }, + { + "epoch": 1.1921581335061568, + "grad_norm": 0.48150861263275146, + "learning_rate": 6.859148811118812e-06, + "loss": 0.1233, + "step": 3679 + }, + { + "epoch": 1.1924821775761503, + "grad_norm": 0.4901348054409027, + "learning_rate": 6.857524898159921e-06, + "loss": 0.12, + "step": 3680 + }, + { + "epoch": 1.192806221646144, + "grad_norm": 0.4648503065109253, + "learning_rate": 6.855900757857965e-06, + "loss": 0.108, + "step": 3681 + }, + { + "epoch": 1.1931302657161373, + "grad_norm": 0.440851628780365, + "learning_rate": 6.854276390411721e-06, + "loss": 0.1062, + "step": 3682 + }, + { + "epoch": 1.193454309786131, + "grad_norm": 0.49569249153137207, + "learning_rate": 6.85265179602e-06, + "loss": 0.11, + "step": 3683 + }, + { + "epoch": 1.1937783538561244, + "grad_norm": 0.47996777296066284, + "learning_rate": 6.851026974881634e-06, + "loss": 0.1153, + "step": 3684 + }, + { + "epoch": 1.1941023979261178, + "grad_norm": 0.5012381076812744, + "learning_rate": 6.849401927195485e-06, + "loss": 0.1184, + "step": 3685 + }, + { + "epoch": 1.1944264419961115, + "grad_norm": 0.5340484380722046, + "learning_rate": 6.847776653160443e-06, + "loss": 0.1163, + "step": 3686 + }, + { + "epoch": 1.194750486066105, + "grad_norm": 0.4254695475101471, + "learning_rate": 6.846151152975427e-06, + "loss": 0.1032, + "step": 3687 + }, + { + "epoch": 1.1950745301360985, + "grad_norm": 0.48035746812820435, + "learning_rate": 6.844525426839383e-06, + "loss": 0.1201, + "step": 3688 + }, + { + "epoch": 1.195398574206092, + "grad_norm": 0.502835214138031, + "learning_rate": 6.842899474951283e-06, + "loss": 0.1194, + "step": 3689 + }, + { + "epoch": 1.1957226182760856, + "grad_norm": 0.44748517870903015, + "learning_rate": 6.841273297510127e-06, + "loss": 0.1031, + "step": 3690 + }, + { + "epoch": 1.196046662346079, + "grad_norm": 0.43392542004585266, + "learning_rate": 6.839646894714944e-06, + "loss": 0.1025, + "step": 3691 + }, + { + "epoch": 1.1963707064160727, + "grad_norm": 0.5193026065826416, + "learning_rate": 6.838020266764791e-06, + "loss": 0.1255, + "step": 3692 + }, + { + "epoch": 1.196694750486066, + "grad_norm": 0.4965396523475647, + "learning_rate": 6.836393413858751e-06, + "loss": 0.1206, + "step": 3693 + }, + { + "epoch": 1.1970187945560595, + "grad_norm": 0.5123834013938904, + "learning_rate": 6.834766336195934e-06, + "loss": 0.1095, + "step": 3694 + }, + { + "epoch": 1.1973428386260532, + "grad_norm": 0.46777281165122986, + "learning_rate": 6.83313903397548e-06, + "loss": 0.1107, + "step": 3695 + }, + { + "epoch": 1.1976668826960466, + "grad_norm": 0.48866865038871765, + "learning_rate": 6.831511507396555e-06, + "loss": 0.1117, + "step": 3696 + }, + { + "epoch": 1.1979909267660402, + "grad_norm": 0.5388202667236328, + "learning_rate": 6.82988375665835e-06, + "loss": 0.1098, + "step": 3697 + }, + { + "epoch": 1.1983149708360337, + "grad_norm": 0.4483700096607208, + "learning_rate": 6.828255781960089e-06, + "loss": 0.1088, + "step": 3698 + }, + { + "epoch": 1.1986390149060273, + "grad_norm": 0.48592525720596313, + "learning_rate": 6.82662758350102e-06, + "loss": 0.1126, + "step": 3699 + }, + { + "epoch": 1.1989630589760207, + "grad_norm": 0.45029422640800476, + "learning_rate": 6.8249991614804165e-06, + "loss": 0.1062, + "step": 3700 + }, + { + "epoch": 1.1992871030460144, + "grad_norm": 0.4915621876716614, + "learning_rate": 6.823370516097585e-06, + "loss": 0.118, + "step": 3701 + }, + { + "epoch": 1.1996111471160078, + "grad_norm": 0.4654853045940399, + "learning_rate": 6.8217416475518515e-06, + "loss": 0.1078, + "step": 3702 + }, + { + "epoch": 1.1999351911860012, + "grad_norm": 0.4795008897781372, + "learning_rate": 6.820112556042577e-06, + "loss": 0.1096, + "step": 3703 + }, + { + "epoch": 1.2002592352559949, + "grad_norm": 0.4701564908027649, + "learning_rate": 6.8184832417691446e-06, + "loss": 0.1114, + "step": 3704 + }, + { + "epoch": 1.2005832793259883, + "grad_norm": 0.4656620919704437, + "learning_rate": 6.816853704930969e-06, + "loss": 0.1094, + "step": 3705 + }, + { + "epoch": 1.200907323395982, + "grad_norm": 0.4720616340637207, + "learning_rate": 6.815223945727488e-06, + "loss": 0.1163, + "step": 3706 + }, + { + "epoch": 1.2012313674659754, + "grad_norm": 0.48066261410713196, + "learning_rate": 6.81359396435817e-06, + "loss": 0.1177, + "step": 3707 + }, + { + "epoch": 1.2015554115359688, + "grad_norm": 0.5046426057815552, + "learning_rate": 6.811963761022507e-06, + "loss": 0.1255, + "step": 3708 + }, + { + "epoch": 1.2018794556059624, + "grad_norm": 0.4512089788913727, + "learning_rate": 6.810333335920021e-06, + "loss": 0.1056, + "step": 3709 + }, + { + "epoch": 1.2022034996759559, + "grad_norm": 0.41402915120124817, + "learning_rate": 6.80870268925026e-06, + "loss": 0.0983, + "step": 3710 + }, + { + "epoch": 1.2025275437459495, + "grad_norm": 0.48315009474754333, + "learning_rate": 6.807071821212798e-06, + "loss": 0.112, + "step": 3711 + }, + { + "epoch": 1.202851587815943, + "grad_norm": 0.4571656286716461, + "learning_rate": 6.8054407320072405e-06, + "loss": 0.1072, + "step": 3712 + }, + { + "epoch": 1.2031756318859366, + "grad_norm": 0.4467449486255646, + "learning_rate": 6.8038094218332155e-06, + "loss": 0.1075, + "step": 3713 + }, + { + "epoch": 1.20349967595593, + "grad_norm": 0.42994919419288635, + "learning_rate": 6.802177890890378e-06, + "loss": 0.1039, + "step": 3714 + }, + { + "epoch": 1.2038237200259236, + "grad_norm": 0.4829837679862976, + "learning_rate": 6.800546139378415e-06, + "loss": 0.1132, + "step": 3715 + }, + { + "epoch": 1.204147764095917, + "grad_norm": 0.4595179855823517, + "learning_rate": 6.798914167497033e-06, + "loss": 0.1108, + "step": 3716 + }, + { + "epoch": 1.2044718081659105, + "grad_norm": 0.5180400609970093, + "learning_rate": 6.797281975445973e-06, + "loss": 0.128, + "step": 3717 + }, + { + "epoch": 1.2047958522359041, + "grad_norm": 0.4271709620952606, + "learning_rate": 6.795649563424997e-06, + "loss": 0.1041, + "step": 3718 + }, + { + "epoch": 1.2051198963058976, + "grad_norm": 0.4888884127140045, + "learning_rate": 6.7940169316339e-06, + "loss": 0.1222, + "step": 3719 + }, + { + "epoch": 1.2054439403758912, + "grad_norm": 0.42299413681030273, + "learning_rate": 6.7923840802724975e-06, + "loss": 0.0963, + "step": 3720 + }, + { + "epoch": 1.2057679844458846, + "grad_norm": 0.47027137875556946, + "learning_rate": 6.790751009540635e-06, + "loss": 0.1104, + "step": 3721 + }, + { + "epoch": 1.206092028515878, + "grad_norm": 0.4526819884777069, + "learning_rate": 6.789117719638184e-06, + "loss": 0.1118, + "step": 3722 + }, + { + "epoch": 1.2064160725858717, + "grad_norm": 0.4731588065624237, + "learning_rate": 6.787484210765044e-06, + "loss": 0.125, + "step": 3723 + }, + { + "epoch": 1.2067401166558651, + "grad_norm": 0.5060051679611206, + "learning_rate": 6.7858504831211416e-06, + "loss": 0.1199, + "step": 3724 + }, + { + "epoch": 1.2070641607258588, + "grad_norm": 0.4772077798843384, + "learning_rate": 6.784216536906429e-06, + "loss": 0.1195, + "step": 3725 + }, + { + "epoch": 1.2073882047958522, + "grad_norm": 0.43619367480278015, + "learning_rate": 6.782582372320882e-06, + "loss": 0.1056, + "step": 3726 + }, + { + "epoch": 1.2077122488658458, + "grad_norm": 0.42430293560028076, + "learning_rate": 6.780947989564511e-06, + "loss": 0.0981, + "step": 3727 + }, + { + "epoch": 1.2080362929358393, + "grad_norm": 0.4979735314846039, + "learning_rate": 6.7793133888373475e-06, + "loss": 0.127, + "step": 3728 + }, + { + "epoch": 1.208360337005833, + "grad_norm": 0.4383695423603058, + "learning_rate": 6.77767857033945e-06, + "loss": 0.0991, + "step": 3729 + }, + { + "epoch": 1.2086843810758263, + "grad_norm": 0.49072328209877014, + "learning_rate": 6.7760435342709054e-06, + "loss": 0.1166, + "step": 3730 + }, + { + "epoch": 1.2090084251458197, + "grad_norm": 0.46386846899986267, + "learning_rate": 6.774408280831825e-06, + "loss": 0.1138, + "step": 3731 + }, + { + "epoch": 1.2093324692158134, + "grad_norm": 0.48256877064704895, + "learning_rate": 6.772772810222349e-06, + "loss": 0.119, + "step": 3732 + }, + { + "epoch": 1.2096565132858068, + "grad_norm": 0.5180603861808777, + "learning_rate": 6.771137122642642e-06, + "loss": 0.1215, + "step": 3733 + }, + { + "epoch": 1.2099805573558005, + "grad_norm": 0.45415544509887695, + "learning_rate": 6.769501218292897e-06, + "loss": 0.114, + "step": 3734 + }, + { + "epoch": 1.2103046014257939, + "grad_norm": 0.4832041263580322, + "learning_rate": 6.767865097373334e-06, + "loss": 0.1198, + "step": 3735 + }, + { + "epoch": 1.2106286454957873, + "grad_norm": 0.4709514379501343, + "learning_rate": 6.766228760084197e-06, + "loss": 0.1147, + "step": 3736 + }, + { + "epoch": 1.210952689565781, + "grad_norm": 0.46234336495399475, + "learning_rate": 6.764592206625759e-06, + "loss": 0.1135, + "step": 3737 + }, + { + "epoch": 1.2112767336357744, + "grad_norm": 0.48604175448417664, + "learning_rate": 6.762955437198315e-06, + "loss": 0.1181, + "step": 3738 + }, + { + "epoch": 1.211600777705768, + "grad_norm": 0.4512622058391571, + "learning_rate": 6.761318452002194e-06, + "loss": 0.1043, + "step": 3739 + }, + { + "epoch": 1.2119248217757614, + "grad_norm": 0.4739454388618469, + "learning_rate": 6.759681251237745e-06, + "loss": 0.1134, + "step": 3740 + }, + { + "epoch": 1.212248865845755, + "grad_norm": 0.473141610622406, + "learning_rate": 6.7580438351053466e-06, + "loss": 0.1151, + "step": 3741 + }, + { + "epoch": 1.2125729099157485, + "grad_norm": 0.4593276381492615, + "learning_rate": 6.756406203805401e-06, + "loss": 0.1097, + "step": 3742 + }, + { + "epoch": 1.2128969539857422, + "grad_norm": 0.44628623127937317, + "learning_rate": 6.7547683575383415e-06, + "loss": 0.1099, + "step": 3743 + }, + { + "epoch": 1.2132209980557356, + "grad_norm": 0.4702170789241791, + "learning_rate": 6.7531302965046194e-06, + "loss": 0.1142, + "step": 3744 + }, + { + "epoch": 1.213545042125729, + "grad_norm": 0.5326228737831116, + "learning_rate": 6.751492020904722e-06, + "loss": 0.1116, + "step": 3745 + }, + { + "epoch": 1.2138690861957226, + "grad_norm": 0.4977304935455322, + "learning_rate": 6.749853530939156e-06, + "loss": 0.1268, + "step": 3746 + }, + { + "epoch": 1.214193130265716, + "grad_norm": 0.46702513098716736, + "learning_rate": 6.748214826808459e-06, + "loss": 0.1094, + "step": 3747 + }, + { + "epoch": 1.2145171743357097, + "grad_norm": 0.5220220685005188, + "learning_rate": 6.746575908713191e-06, + "loss": 0.1327, + "step": 3748 + }, + { + "epoch": 1.2148412184057031, + "grad_norm": 0.4549591839313507, + "learning_rate": 6.7449367768539405e-06, + "loss": 0.1107, + "step": 3749 + }, + { + "epoch": 1.2151652624756968, + "grad_norm": 0.526520311832428, + "learning_rate": 6.74329743143132e-06, + "loss": 0.1404, + "step": 3750 + }, + { + "epoch": 1.2154893065456902, + "grad_norm": 0.465791791677475, + "learning_rate": 6.74165787264597e-06, + "loss": 0.1125, + "step": 3751 + }, + { + "epoch": 1.2158133506156839, + "grad_norm": 0.4611221253871918, + "learning_rate": 6.740018100698559e-06, + "loss": 0.1038, + "step": 3752 + }, + { + "epoch": 1.2161373946856773, + "grad_norm": 0.4689771234989166, + "learning_rate": 6.738378115789775e-06, + "loss": 0.1121, + "step": 3753 + }, + { + "epoch": 1.2164614387556707, + "grad_norm": 0.4977121949195862, + "learning_rate": 6.73673791812034e-06, + "loss": 0.1183, + "step": 3754 + }, + { + "epoch": 1.2167854828256643, + "grad_norm": 0.4518553912639618, + "learning_rate": 6.7350975078909944e-06, + "loss": 0.1076, + "step": 3755 + }, + { + "epoch": 1.2171095268956578, + "grad_norm": 0.4840216636657715, + "learning_rate": 6.733456885302513e-06, + "loss": 0.1234, + "step": 3756 + }, + { + "epoch": 1.2174335709656514, + "grad_norm": 0.4555285573005676, + "learning_rate": 6.731816050555689e-06, + "loss": 0.1101, + "step": 3757 + }, + { + "epoch": 1.2177576150356448, + "grad_norm": 0.434114009141922, + "learning_rate": 6.730175003851346e-06, + "loss": 0.1088, + "step": 3758 + }, + { + "epoch": 1.2180816591056383, + "grad_norm": 0.4861230254173279, + "learning_rate": 6.728533745390331e-06, + "loss": 0.1243, + "step": 3759 + }, + { + "epoch": 1.218405703175632, + "grad_norm": 0.4449920952320099, + "learning_rate": 6.72689227537352e-06, + "loss": 0.1172, + "step": 3760 + }, + { + "epoch": 1.2187297472456253, + "grad_norm": 0.4476756751537323, + "learning_rate": 6.725250594001812e-06, + "loss": 0.1135, + "step": 3761 + }, + { + "epoch": 1.219053791315619, + "grad_norm": 0.4657207131385803, + "learning_rate": 6.723608701476135e-06, + "loss": 0.1137, + "step": 3762 + }, + { + "epoch": 1.2193778353856124, + "grad_norm": 0.4610616862773895, + "learning_rate": 6.7219665979974355e-06, + "loss": 0.1124, + "step": 3763 + }, + { + "epoch": 1.219701879455606, + "grad_norm": 0.47316476702690125, + "learning_rate": 6.720324283766696e-06, + "loss": 0.1131, + "step": 3764 + }, + { + "epoch": 1.2200259235255995, + "grad_norm": 0.4340543746948242, + "learning_rate": 6.718681758984917e-06, + "loss": 0.1024, + "step": 3765 + }, + { + "epoch": 1.220349967595593, + "grad_norm": 0.45548132061958313, + "learning_rate": 6.71703902385313e-06, + "loss": 0.1142, + "step": 3766 + }, + { + "epoch": 1.2206740116655865, + "grad_norm": 0.4511728584766388, + "learning_rate": 6.715396078572388e-06, + "loss": 0.1074, + "step": 3767 + }, + { + "epoch": 1.22099805573558, + "grad_norm": 0.48144659399986267, + "learning_rate": 6.713752923343774e-06, + "loss": 0.1179, + "step": 3768 + }, + { + "epoch": 1.2213220998055736, + "grad_norm": 0.4838949739933014, + "learning_rate": 6.712109558368391e-06, + "loss": 0.1173, + "step": 3769 + }, + { + "epoch": 1.221646143875567, + "grad_norm": 0.4324999153614044, + "learning_rate": 6.710465983847373e-06, + "loss": 0.1041, + "step": 3770 + }, + { + "epoch": 1.2219701879455607, + "grad_norm": 0.47770074009895325, + "learning_rate": 6.708822199981877e-06, + "loss": 0.1125, + "step": 3771 + }, + { + "epoch": 1.222294232015554, + "grad_norm": 0.4648069441318512, + "learning_rate": 6.7071782069730865e-06, + "loss": 0.11, + "step": 3772 + }, + { + "epoch": 1.2226182760855475, + "grad_norm": 0.45485344529151917, + "learning_rate": 6.705534005022209e-06, + "loss": 0.1143, + "step": 3773 + }, + { + "epoch": 1.2229423201555412, + "grad_norm": 0.4402785003185272, + "learning_rate": 6.703889594330481e-06, + "loss": 0.0989, + "step": 3774 + }, + { + "epoch": 1.2232663642255346, + "grad_norm": 0.44086694717407227, + "learning_rate": 6.702244975099164e-06, + "loss": 0.1043, + "step": 3775 + }, + { + "epoch": 1.2235904082955282, + "grad_norm": 0.4607573449611664, + "learning_rate": 6.700600147529539e-06, + "loss": 0.1106, + "step": 3776 + }, + { + "epoch": 1.2239144523655217, + "grad_norm": 0.4859803020954132, + "learning_rate": 6.698955111822918e-06, + "loss": 0.12, + "step": 3777 + }, + { + "epoch": 1.2242384964355153, + "grad_norm": 0.44567012786865234, + "learning_rate": 6.697309868180639e-06, + "loss": 0.1043, + "step": 3778 + }, + { + "epoch": 1.2245625405055087, + "grad_norm": 0.4929608702659607, + "learning_rate": 6.6956644168040644e-06, + "loss": 0.1242, + "step": 3779 + }, + { + "epoch": 1.2248865845755024, + "grad_norm": 0.49041947722435, + "learning_rate": 6.69401875789458e-06, + "loss": 0.1122, + "step": 3780 + }, + { + "epoch": 1.2252106286454958, + "grad_norm": 0.4901500940322876, + "learning_rate": 6.692372891653599e-06, + "loss": 0.1153, + "step": 3781 + }, + { + "epoch": 1.2255346727154892, + "grad_norm": 0.4653424024581909, + "learning_rate": 6.690726818282559e-06, + "loss": 0.1054, + "step": 3782 + }, + { + "epoch": 1.2258587167854829, + "grad_norm": 0.4732823669910431, + "learning_rate": 6.689080537982924e-06, + "loss": 0.1204, + "step": 3783 + }, + { + "epoch": 1.2261827608554763, + "grad_norm": 0.5188104510307312, + "learning_rate": 6.687434050956184e-06, + "loss": 0.129, + "step": 3784 + }, + { + "epoch": 1.22650680492547, + "grad_norm": 0.46234825253486633, + "learning_rate": 6.68578735740385e-06, + "loss": 0.11, + "step": 3785 + }, + { + "epoch": 1.2268308489954634, + "grad_norm": 0.47152841091156006, + "learning_rate": 6.684140457527465e-06, + "loss": 0.1119, + "step": 3786 + }, + { + "epoch": 1.227154893065457, + "grad_norm": 0.5100438594818115, + "learning_rate": 6.68249335152859e-06, + "loss": 0.13, + "step": 3787 + }, + { + "epoch": 1.2274789371354504, + "grad_norm": 0.46005114912986755, + "learning_rate": 6.680846039608817e-06, + "loss": 0.1128, + "step": 3788 + }, + { + "epoch": 1.2278029812054438, + "grad_norm": 0.4707135856151581, + "learning_rate": 6.679198521969761e-06, + "loss": 0.1178, + "step": 3789 + }, + { + "epoch": 1.2281270252754375, + "grad_norm": 0.46408483386039734, + "learning_rate": 6.677550798813062e-06, + "loss": 0.1116, + "step": 3790 + }, + { + "epoch": 1.228451069345431, + "grad_norm": 0.4784737229347229, + "learning_rate": 6.675902870340385e-06, + "loss": 0.1125, + "step": 3791 + }, + { + "epoch": 1.2287751134154246, + "grad_norm": 0.4646395444869995, + "learning_rate": 6.6742547367534205e-06, + "loss": 0.1088, + "step": 3792 + }, + { + "epoch": 1.229099157485418, + "grad_norm": 0.44539207220077515, + "learning_rate": 6.6726063982538846e-06, + "loss": 0.1038, + "step": 3793 + }, + { + "epoch": 1.2294232015554116, + "grad_norm": 0.4744553565979004, + "learning_rate": 6.6709578550435174e-06, + "loss": 0.1141, + "step": 3794 + }, + { + "epoch": 1.229747245625405, + "grad_norm": 0.47019147872924805, + "learning_rate": 6.669309107324085e-06, + "loss": 0.1138, + "step": 3795 + }, + { + "epoch": 1.2300712896953985, + "grad_norm": 0.4724312126636505, + "learning_rate": 6.667660155297377e-06, + "loss": 0.1139, + "step": 3796 + }, + { + "epoch": 1.2303953337653921, + "grad_norm": 0.4590516984462738, + "learning_rate": 6.666010999165211e-06, + "loss": 0.1082, + "step": 3797 + }, + { + "epoch": 1.2307193778353855, + "grad_norm": 0.47386661171913147, + "learning_rate": 6.664361639129429e-06, + "loss": 0.1096, + "step": 3798 + }, + { + "epoch": 1.2310434219053792, + "grad_norm": 0.46582818031311035, + "learning_rate": 6.662712075391891e-06, + "loss": 0.1118, + "step": 3799 + }, + { + "epoch": 1.2313674659753726, + "grad_norm": 0.4459727704524994, + "learning_rate": 6.6610623081544934e-06, + "loss": 0.1031, + "step": 3800 + }, + { + "epoch": 1.2316915100453663, + "grad_norm": 0.48455187678337097, + "learning_rate": 6.659412337619149e-06, + "loss": 0.1101, + "step": 3801 + }, + { + "epoch": 1.2320155541153597, + "grad_norm": 0.4238426089286804, + "learning_rate": 6.657762163987799e-06, + "loss": 0.1047, + "step": 3802 + }, + { + "epoch": 1.2323395981853533, + "grad_norm": 0.4523986876010895, + "learning_rate": 6.656111787462407e-06, + "loss": 0.1106, + "step": 3803 + }, + { + "epoch": 1.2326636422553467, + "grad_norm": 0.45181649923324585, + "learning_rate": 6.654461208244968e-06, + "loss": 0.1015, + "step": 3804 + }, + { + "epoch": 1.2329876863253402, + "grad_norm": 0.459146648645401, + "learning_rate": 6.65281042653749e-06, + "loss": 0.1094, + "step": 3805 + }, + { + "epoch": 1.2333117303953338, + "grad_norm": 0.4525477886199951, + "learning_rate": 6.651159442542019e-06, + "loss": 0.1084, + "step": 3806 + }, + { + "epoch": 1.2336357744653272, + "grad_norm": 0.45423898100852966, + "learning_rate": 6.6495082564606164e-06, + "loss": 0.1111, + "step": 3807 + }, + { + "epoch": 1.2339598185353209, + "grad_norm": 0.48852837085723877, + "learning_rate": 6.6478568684953704e-06, + "loss": 0.1191, + "step": 3808 + }, + { + "epoch": 1.2342838626053143, + "grad_norm": 0.4974091649055481, + "learning_rate": 6.6462052788483965e-06, + "loss": 0.1265, + "step": 3809 + }, + { + "epoch": 1.2346079066753077, + "grad_norm": 0.5248463153839111, + "learning_rate": 6.644553487721833e-06, + "loss": 0.1283, + "step": 3810 + }, + { + "epoch": 1.2349319507453014, + "grad_norm": 0.4278448820114136, + "learning_rate": 6.642901495317844e-06, + "loss": 0.1005, + "step": 3811 + }, + { + "epoch": 1.2352559948152948, + "grad_norm": 0.4968985617160797, + "learning_rate": 6.641249301838615e-06, + "loss": 0.1252, + "step": 3812 + }, + { + "epoch": 1.2355800388852884, + "grad_norm": 0.47982725501060486, + "learning_rate": 6.639596907486359e-06, + "loss": 0.1171, + "step": 3813 + }, + { + "epoch": 1.2359040829552819, + "grad_norm": 0.4500291347503662, + "learning_rate": 6.637944312463317e-06, + "loss": 0.1055, + "step": 3814 + }, + { + "epoch": 1.2362281270252755, + "grad_norm": 0.4736287593841553, + "learning_rate": 6.636291516971747e-06, + "loss": 0.1188, + "step": 3815 + }, + { + "epoch": 1.236552171095269, + "grad_norm": 0.4399452209472656, + "learning_rate": 6.6346385212139345e-06, + "loss": 0.1083, + "step": 3816 + }, + { + "epoch": 1.2368762151652626, + "grad_norm": 0.4650920331478119, + "learning_rate": 6.632985325392194e-06, + "loss": 0.114, + "step": 3817 + }, + { + "epoch": 1.237200259235256, + "grad_norm": 0.500536322593689, + "learning_rate": 6.631331929708855e-06, + "loss": 0.1222, + "step": 3818 + }, + { + "epoch": 1.2375243033052494, + "grad_norm": 0.472375750541687, + "learning_rate": 6.629678334366282e-06, + "loss": 0.1167, + "step": 3819 + }, + { + "epoch": 1.237848347375243, + "grad_norm": 0.4487501084804535, + "learning_rate": 6.628024539566857e-06, + "loss": 0.1046, + "step": 3820 + }, + { + "epoch": 1.2381723914452365, + "grad_norm": 0.4511720538139343, + "learning_rate": 6.626370545512989e-06, + "loss": 0.1032, + "step": 3821 + }, + { + "epoch": 1.2384964355152301, + "grad_norm": 0.46332141757011414, + "learning_rate": 6.6247163524071115e-06, + "loss": 0.1132, + "step": 3822 + }, + { + "epoch": 1.2388204795852236, + "grad_norm": 0.4876859486103058, + "learning_rate": 6.62306196045168e-06, + "loss": 0.1216, + "step": 3823 + }, + { + "epoch": 1.239144523655217, + "grad_norm": 0.48191460967063904, + "learning_rate": 6.6214073698491766e-06, + "loss": 0.1128, + "step": 3824 + }, + { + "epoch": 1.2394685677252106, + "grad_norm": 0.46776261925697327, + "learning_rate": 6.619752580802108e-06, + "loss": 0.1081, + "step": 3825 + }, + { + "epoch": 1.239792611795204, + "grad_norm": 0.4756350517272949, + "learning_rate": 6.618097593513006e-06, + "loss": 0.1161, + "step": 3826 + }, + { + "epoch": 1.2401166558651977, + "grad_norm": 0.48851534724235535, + "learning_rate": 6.616442408184421e-06, + "loss": 0.1203, + "step": 3827 + }, + { + "epoch": 1.2404406999351911, + "grad_norm": 0.4545134902000427, + "learning_rate": 6.614787025018932e-06, + "loss": 0.1147, + "step": 3828 + }, + { + "epoch": 1.2407647440051848, + "grad_norm": 0.44553035497665405, + "learning_rate": 6.6131314442191465e-06, + "loss": 0.1094, + "step": 3829 + }, + { + "epoch": 1.2410887880751782, + "grad_norm": 0.402369886636734, + "learning_rate": 6.6114756659876864e-06, + "loss": 0.0925, + "step": 3830 + }, + { + "epoch": 1.2414128321451718, + "grad_norm": 0.4575660824775696, + "learning_rate": 6.609819690527206e-06, + "loss": 0.1123, + "step": 3831 + }, + { + "epoch": 1.2417368762151653, + "grad_norm": 0.46121862530708313, + "learning_rate": 6.6081635180403794e-06, + "loss": 0.1124, + "step": 3832 + }, + { + "epoch": 1.2420609202851587, + "grad_norm": 0.44581833481788635, + "learning_rate": 6.606507148729906e-06, + "loss": 0.1005, + "step": 3833 + }, + { + "epoch": 1.2423849643551523, + "grad_norm": 0.5141975283622742, + "learning_rate": 6.6048505827985096e-06, + "loss": 0.118, + "step": 3834 + }, + { + "epoch": 1.2427090084251458, + "grad_norm": 0.4516439437866211, + "learning_rate": 6.603193820448941e-06, + "loss": 0.1083, + "step": 3835 + }, + { + "epoch": 1.2430330524951394, + "grad_norm": 0.5207176804542542, + "learning_rate": 6.601536861883966e-06, + "loss": 0.1321, + "step": 3836 + }, + { + "epoch": 1.2433570965651328, + "grad_norm": 0.4359625279903412, + "learning_rate": 6.599879707306384e-06, + "loss": 0.1008, + "step": 3837 + }, + { + "epoch": 1.2436811406351265, + "grad_norm": 0.5066201090812683, + "learning_rate": 6.598222356919014e-06, + "loss": 0.1093, + "step": 3838 + }, + { + "epoch": 1.24400518470512, + "grad_norm": 0.49363771080970764, + "learning_rate": 6.5965648109247e-06, + "loss": 0.1225, + "step": 3839 + }, + { + "epoch": 1.2443292287751135, + "grad_norm": 0.4841170012950897, + "learning_rate": 6.594907069526308e-06, + "loss": 0.12, + "step": 3840 + }, + { + "epoch": 1.244653272845107, + "grad_norm": 0.45232200622558594, + "learning_rate": 6.593249132926731e-06, + "loss": 0.1099, + "step": 3841 + }, + { + "epoch": 1.2449773169151004, + "grad_norm": 0.5103926658630371, + "learning_rate": 6.591591001328884e-06, + "loss": 0.1165, + "step": 3842 + }, + { + "epoch": 1.245301360985094, + "grad_norm": 0.506633460521698, + "learning_rate": 6.589932674935706e-06, + "loss": 0.1205, + "step": 3843 + }, + { + "epoch": 1.2456254050550875, + "grad_norm": 0.4852462708950043, + "learning_rate": 6.588274153950161e-06, + "loss": 0.1134, + "step": 3844 + }, + { + "epoch": 1.245949449125081, + "grad_norm": 0.4266104996204376, + "learning_rate": 6.586615438575234e-06, + "loss": 0.108, + "step": 3845 + }, + { + "epoch": 1.2462734931950745, + "grad_norm": 0.4661915600299835, + "learning_rate": 6.584956529013937e-06, + "loss": 0.1102, + "step": 3846 + }, + { + "epoch": 1.246597537265068, + "grad_norm": 0.4633307456970215, + "learning_rate": 6.583297425469306e-06, + "loss": 0.1084, + "step": 3847 + }, + { + "epoch": 1.2469215813350616, + "grad_norm": 0.4619586765766144, + "learning_rate": 6.581638128144396e-06, + "loss": 0.1035, + "step": 3848 + }, + { + "epoch": 1.247245625405055, + "grad_norm": 0.49360668659210205, + "learning_rate": 6.579978637242292e-06, + "loss": 0.1222, + "step": 3849 + }, + { + "epoch": 1.2475696694750487, + "grad_norm": 0.4530634582042694, + "learning_rate": 6.578318952966098e-06, + "loss": 0.1112, + "step": 3850 + }, + { + "epoch": 1.247893713545042, + "grad_norm": 0.47121280431747437, + "learning_rate": 6.576659075518943e-06, + "loss": 0.1127, + "step": 3851 + }, + { + "epoch": 1.2482177576150357, + "grad_norm": 0.5079575777053833, + "learning_rate": 6.5749990051039795e-06, + "loss": 0.1196, + "step": 3852 + }, + { + "epoch": 1.2485418016850292, + "grad_norm": 0.46687185764312744, + "learning_rate": 6.573338741924387e-06, + "loss": 0.1112, + "step": 3853 + }, + { + "epoch": 1.2488658457550228, + "grad_norm": 0.4628097414970398, + "learning_rate": 6.571678286183362e-06, + "loss": 0.107, + "step": 3854 + }, + { + "epoch": 1.2491898898250162, + "grad_norm": 0.42948663234710693, + "learning_rate": 6.5700176380841294e-06, + "loss": 0.1013, + "step": 3855 + }, + { + "epoch": 1.2495139338950096, + "grad_norm": 0.4683387279510498, + "learning_rate": 6.568356797829938e-06, + "loss": 0.1136, + "step": 3856 + }, + { + "epoch": 1.2498379779650033, + "grad_norm": 0.48012471199035645, + "learning_rate": 6.566695765624056e-06, + "loss": 0.112, + "step": 3857 + }, + { + "epoch": 1.2501620220349967, + "grad_norm": 0.5040128827095032, + "learning_rate": 6.565034541669782e-06, + "loss": 0.1204, + "step": 3858 + }, + { + "epoch": 1.2504860661049904, + "grad_norm": 0.4954575300216675, + "learning_rate": 6.563373126170428e-06, + "loss": 0.1213, + "step": 3859 + }, + { + "epoch": 1.2508101101749838, + "grad_norm": 0.47964587807655334, + "learning_rate": 6.56171151932934e-06, + "loss": 0.1207, + "step": 3860 + }, + { + "epoch": 1.2511341542449772, + "grad_norm": 0.4623505473136902, + "learning_rate": 6.560049721349879e-06, + "loss": 0.1086, + "step": 3861 + }, + { + "epoch": 1.2514581983149708, + "grad_norm": 0.4356464743614197, + "learning_rate": 6.558387732435435e-06, + "loss": 0.1077, + "step": 3862 + }, + { + "epoch": 1.2517822423849643, + "grad_norm": 0.47153621912002563, + "learning_rate": 6.556725552789418e-06, + "loss": 0.11, + "step": 3863 + }, + { + "epoch": 1.252106286454958, + "grad_norm": 0.5097867250442505, + "learning_rate": 6.555063182615264e-06, + "loss": 0.1285, + "step": 3864 + }, + { + "epoch": 1.2524303305249513, + "grad_norm": 0.4765617847442627, + "learning_rate": 6.553400622116433e-06, + "loss": 0.1107, + "step": 3865 + }, + { + "epoch": 1.252754374594945, + "grad_norm": 0.4168030619621277, + "learning_rate": 6.551737871496402e-06, + "loss": 0.1001, + "step": 3866 + }, + { + "epoch": 1.2530784186649384, + "grad_norm": 0.47275760769844055, + "learning_rate": 6.5500749309586775e-06, + "loss": 0.1255, + "step": 3867 + }, + { + "epoch": 1.253402462734932, + "grad_norm": 0.4696555733680725, + "learning_rate": 6.548411800706787e-06, + "loss": 0.1105, + "step": 3868 + }, + { + "epoch": 1.2537265068049255, + "grad_norm": 0.6402665376663208, + "learning_rate": 6.5467484809442825e-06, + "loss": 0.119, + "step": 3869 + }, + { + "epoch": 1.254050550874919, + "grad_norm": 0.4428213834762573, + "learning_rate": 6.545084971874738e-06, + "loss": 0.101, + "step": 3870 + }, + { + "epoch": 1.2543745949449125, + "grad_norm": 0.4355408847332001, + "learning_rate": 6.543421273701751e-06, + "loss": 0.1008, + "step": 3871 + }, + { + "epoch": 1.254698639014906, + "grad_norm": 0.4726782441139221, + "learning_rate": 6.54175738662894e-06, + "loss": 0.1105, + "step": 3872 + }, + { + "epoch": 1.2550226830848996, + "grad_norm": 0.4533577859401703, + "learning_rate": 6.540093310859951e-06, + "loss": 0.1066, + "step": 3873 + }, + { + "epoch": 1.255346727154893, + "grad_norm": 0.47577226161956787, + "learning_rate": 6.538429046598449e-06, + "loss": 0.1092, + "step": 3874 + }, + { + "epoch": 1.2556707712248865, + "grad_norm": 0.48334789276123047, + "learning_rate": 6.536764594048126e-06, + "loss": 0.1174, + "step": 3875 + }, + { + "epoch": 1.25599481529488, + "grad_norm": 0.46139639616012573, + "learning_rate": 6.535099953412693e-06, + "loss": 0.1075, + "step": 3876 + }, + { + "epoch": 1.2563188593648738, + "grad_norm": 0.47558170557022095, + "learning_rate": 6.533435124895888e-06, + "loss": 0.1092, + "step": 3877 + }, + { + "epoch": 1.2566429034348672, + "grad_norm": 0.49053213000297546, + "learning_rate": 6.531770108701467e-06, + "loss": 0.1186, + "step": 3878 + }, + { + "epoch": 1.2569669475048606, + "grad_norm": 0.48654669523239136, + "learning_rate": 6.530104905033213e-06, + "loss": 0.1194, + "step": 3879 + }, + { + "epoch": 1.2572909915748542, + "grad_norm": 0.4468207359313965, + "learning_rate": 6.528439514094933e-06, + "loss": 0.1064, + "step": 3880 + }, + { + "epoch": 1.2576150356448477, + "grad_norm": 0.4944261610507965, + "learning_rate": 6.526773936090452e-06, + "loss": 0.1199, + "step": 3881 + }, + { + "epoch": 1.2579390797148413, + "grad_norm": 0.46873268485069275, + "learning_rate": 6.525108171223622e-06, + "loss": 0.1064, + "step": 3882 + }, + { + "epoch": 1.2582631237848347, + "grad_norm": 0.5125519633293152, + "learning_rate": 6.523442219698316e-06, + "loss": 0.1279, + "step": 3883 + }, + { + "epoch": 1.2585871678548282, + "grad_norm": 0.42340168356895447, + "learning_rate": 6.5217760817184295e-06, + "loss": 0.1018, + "step": 3884 + }, + { + "epoch": 1.2589112119248218, + "grad_norm": 0.45173561573028564, + "learning_rate": 6.520109757487883e-06, + "loss": 0.1014, + "step": 3885 + }, + { + "epoch": 1.2592352559948152, + "grad_norm": 0.45571208000183105, + "learning_rate": 6.518443247210617e-06, + "loss": 0.1119, + "step": 3886 + }, + { + "epoch": 1.2595593000648089, + "grad_norm": 0.4843176305294037, + "learning_rate": 6.516776551090599e-06, + "loss": 0.1142, + "step": 3887 + }, + { + "epoch": 1.2598833441348023, + "grad_norm": 0.4941485822200775, + "learning_rate": 6.515109669331814e-06, + "loss": 0.115, + "step": 3888 + }, + { + "epoch": 1.2602073882047957, + "grad_norm": 0.43944495916366577, + "learning_rate": 6.513442602138273e-06, + "loss": 0.1092, + "step": 3889 + }, + { + "epoch": 1.2605314322747894, + "grad_norm": 0.4794701635837555, + "learning_rate": 6.511775349714008e-06, + "loss": 0.116, + "step": 3890 + }, + { + "epoch": 1.260855476344783, + "grad_norm": 0.45469602942466736, + "learning_rate": 6.510107912263077e-06, + "loss": 0.113, + "step": 3891 + }, + { + "epoch": 1.2611795204147764, + "grad_norm": 0.42920494079589844, + "learning_rate": 6.508440289989555e-06, + "loss": 0.1055, + "step": 3892 + }, + { + "epoch": 1.2615035644847699, + "grad_norm": 0.47261765599250793, + "learning_rate": 6.506772483097545e-06, + "loss": 0.121, + "step": 3893 + }, + { + "epoch": 1.2618276085547635, + "grad_norm": 0.45626595616340637, + "learning_rate": 6.505104491791169e-06, + "loss": 0.1076, + "step": 3894 + }, + { + "epoch": 1.262151652624757, + "grad_norm": 0.49716857075691223, + "learning_rate": 6.503436316274574e-06, + "loss": 0.1264, + "step": 3895 + }, + { + "epoch": 1.2624756966947506, + "grad_norm": 0.4248673617839813, + "learning_rate": 6.5017679567519285e-06, + "loss": 0.1026, + "step": 3896 + }, + { + "epoch": 1.262799740764744, + "grad_norm": 0.4387178122997284, + "learning_rate": 6.500099413427423e-06, + "loss": 0.0995, + "step": 3897 + }, + { + "epoch": 1.2631237848347374, + "grad_norm": 0.4541378319263458, + "learning_rate": 6.498430686505271e-06, + "loss": 0.1055, + "step": 3898 + }, + { + "epoch": 1.263447828904731, + "grad_norm": 0.4681780934333801, + "learning_rate": 6.4967617761897104e-06, + "loss": 0.1127, + "step": 3899 + }, + { + "epoch": 1.2637718729747245, + "grad_norm": 0.490855872631073, + "learning_rate": 6.495092682684997e-06, + "loss": 0.1323, + "step": 3900 + }, + { + "epoch": 1.2640959170447181, + "grad_norm": 0.43128740787506104, + "learning_rate": 6.493423406195414e-06, + "loss": 0.1045, + "step": 3901 + }, + { + "epoch": 1.2644199611147116, + "grad_norm": 0.4609931707382202, + "learning_rate": 6.491753946925263e-06, + "loss": 0.1182, + "step": 3902 + }, + { + "epoch": 1.2647440051847052, + "grad_norm": 0.4781551957130432, + "learning_rate": 6.49008430507887e-06, + "loss": 0.1216, + "step": 3903 + }, + { + "epoch": 1.2650680492546986, + "grad_norm": 0.46152034401893616, + "learning_rate": 6.488414480860583e-06, + "loss": 0.1139, + "step": 3904 + }, + { + "epoch": 1.2653920933246923, + "grad_norm": 0.4464397132396698, + "learning_rate": 6.486744474474772e-06, + "loss": 0.0979, + "step": 3905 + }, + { + "epoch": 1.2657161373946857, + "grad_norm": 0.4776097536087036, + "learning_rate": 6.485074286125831e-06, + "loss": 0.111, + "step": 3906 + }, + { + "epoch": 1.2660401814646791, + "grad_norm": 0.44906583428382874, + "learning_rate": 6.483403916018174e-06, + "loss": 0.1113, + "step": 3907 + }, + { + "epoch": 1.2663642255346728, + "grad_norm": 0.4759385585784912, + "learning_rate": 6.481733364356237e-06, + "loss": 0.1141, + "step": 3908 + }, + { + "epoch": 1.2666882696046662, + "grad_norm": 0.4835518002510071, + "learning_rate": 6.480062631344483e-06, + "loss": 0.1186, + "step": 3909 + }, + { + "epoch": 1.2670123136746598, + "grad_norm": 0.48663994669914246, + "learning_rate": 6.478391717187389e-06, + "loss": 0.1115, + "step": 3910 + }, + { + "epoch": 1.2673363577446533, + "grad_norm": 0.44438567757606506, + "learning_rate": 6.476720622089461e-06, + "loss": 0.1009, + "step": 3911 + }, + { + "epoch": 1.2676604018146467, + "grad_norm": 0.48949071764945984, + "learning_rate": 6.4750493462552245e-06, + "loss": 0.1203, + "step": 3912 + }, + { + "epoch": 1.2679844458846403, + "grad_norm": 0.46416011452674866, + "learning_rate": 6.473377889889228e-06, + "loss": 0.1061, + "step": 3913 + }, + { + "epoch": 1.268308489954634, + "grad_norm": 0.46517306566238403, + "learning_rate": 6.4717062531960415e-06, + "loss": 0.1068, + "step": 3914 + }, + { + "epoch": 1.2686325340246274, + "grad_norm": 0.4801362454891205, + "learning_rate": 6.470034436380257e-06, + "loss": 0.1062, + "step": 3915 + }, + { + "epoch": 1.2689565780946208, + "grad_norm": 0.5002215504646301, + "learning_rate": 6.468362439646487e-06, + "loss": 0.1129, + "step": 3916 + }, + { + "epoch": 1.2692806221646145, + "grad_norm": 0.45998987555503845, + "learning_rate": 6.4666902631993714e-06, + "loss": 0.1165, + "step": 3917 + }, + { + "epoch": 1.2696046662346079, + "grad_norm": 0.5007704496383667, + "learning_rate": 6.465017907243564e-06, + "loss": 0.1168, + "step": 3918 + }, + { + "epoch": 1.2699287103046015, + "grad_norm": 0.47280606627464294, + "learning_rate": 6.463345371983748e-06, + "loss": 0.1143, + "step": 3919 + }, + { + "epoch": 1.270252754374595, + "grad_norm": 0.5030761957168579, + "learning_rate": 6.4616726576246255e-06, + "loss": 0.1172, + "step": 3920 + }, + { + "epoch": 1.2705767984445884, + "grad_norm": 0.4938787519931793, + "learning_rate": 6.459999764370919e-06, + "loss": 0.121, + "step": 3921 + }, + { + "epoch": 1.270900842514582, + "grad_norm": 0.42462942004203796, + "learning_rate": 6.458326692427376e-06, + "loss": 0.1052, + "step": 3922 + }, + { + "epoch": 1.2712248865845754, + "grad_norm": 0.4278605580329895, + "learning_rate": 6.456653441998764e-06, + "loss": 0.1029, + "step": 3923 + }, + { + "epoch": 1.271548930654569, + "grad_norm": 0.4416208565235138, + "learning_rate": 6.454980013289871e-06, + "loss": 0.1019, + "step": 3924 + }, + { + "epoch": 1.2718729747245625, + "grad_norm": 0.492037832736969, + "learning_rate": 6.4533064065055095e-06, + "loss": 0.1232, + "step": 3925 + }, + { + "epoch": 1.272197018794556, + "grad_norm": 0.46979275345802307, + "learning_rate": 6.451632621850514e-06, + "loss": 0.1085, + "step": 3926 + }, + { + "epoch": 1.2725210628645496, + "grad_norm": 0.4623831808567047, + "learning_rate": 6.449958659529738e-06, + "loss": 0.1064, + "step": 3927 + }, + { + "epoch": 1.2728451069345432, + "grad_norm": 0.44108131527900696, + "learning_rate": 6.448284519748058e-06, + "loss": 0.101, + "step": 3928 + }, + { + "epoch": 1.2731691510045366, + "grad_norm": 0.5097630620002747, + "learning_rate": 6.446610202710374e-06, + "loss": 0.1199, + "step": 3929 + }, + { + "epoch": 1.27349319507453, + "grad_norm": 0.5080080032348633, + "learning_rate": 6.444935708621605e-06, + "loss": 0.1262, + "step": 3930 + }, + { + "epoch": 1.2738172391445237, + "grad_norm": 0.45362842082977295, + "learning_rate": 6.443261037686694e-06, + "loss": 0.1085, + "step": 3931 + }, + { + "epoch": 1.2741412832145171, + "grad_norm": 0.4672083258628845, + "learning_rate": 6.441586190110603e-06, + "loss": 0.1137, + "step": 3932 + }, + { + "epoch": 1.2744653272845108, + "grad_norm": 0.45072510838508606, + "learning_rate": 6.439911166098319e-06, + "loss": 0.1058, + "step": 3933 + }, + { + "epoch": 1.2747893713545042, + "grad_norm": 0.5025404095649719, + "learning_rate": 6.438235965854849e-06, + "loss": 0.1159, + "step": 3934 + }, + { + "epoch": 1.2751134154244976, + "grad_norm": 0.491618812084198, + "learning_rate": 6.436560589585217e-06, + "loss": 0.1191, + "step": 3935 + }, + { + "epoch": 1.2754374594944913, + "grad_norm": 0.4758274555206299, + "learning_rate": 6.434885037494477e-06, + "loss": 0.1122, + "step": 3936 + }, + { + "epoch": 1.2757615035644847, + "grad_norm": 0.4895617663860321, + "learning_rate": 6.4332093097876994e-06, + "loss": 0.1211, + "step": 3937 + }, + { + "epoch": 1.2760855476344783, + "grad_norm": 0.45443156361579895, + "learning_rate": 6.431533406669976e-06, + "loss": 0.1105, + "step": 3938 + }, + { + "epoch": 1.2764095917044718, + "grad_norm": 0.47179800271987915, + "learning_rate": 6.429857328346422e-06, + "loss": 0.1098, + "step": 3939 + }, + { + "epoch": 1.2767336357744652, + "grad_norm": 0.4958186447620392, + "learning_rate": 6.428181075022175e-06, + "loss": 0.1209, + "step": 3940 + }, + { + "epoch": 1.2770576798444588, + "grad_norm": 0.485847145318985, + "learning_rate": 6.426504646902389e-06, + "loss": 0.1223, + "step": 3941 + }, + { + "epoch": 1.2773817239144525, + "grad_norm": 0.45189395546913147, + "learning_rate": 6.424828044192243e-06, + "loss": 0.1005, + "step": 3942 + }, + { + "epoch": 1.277705767984446, + "grad_norm": 0.48433998227119446, + "learning_rate": 6.423151267096939e-06, + "loss": 0.1216, + "step": 3943 + }, + { + "epoch": 1.2780298120544393, + "grad_norm": 0.4740990102291107, + "learning_rate": 6.421474315821696e-06, + "loss": 0.1217, + "step": 3944 + }, + { + "epoch": 1.278353856124433, + "grad_norm": 0.5032677054405212, + "learning_rate": 6.419797190571759e-06, + "loss": 0.1259, + "step": 3945 + }, + { + "epoch": 1.2786779001944264, + "grad_norm": 0.5017115473747253, + "learning_rate": 6.41811989155239e-06, + "loss": 0.1282, + "step": 3946 + }, + { + "epoch": 1.27900194426442, + "grad_norm": 0.49619826674461365, + "learning_rate": 6.416442418968875e-06, + "loss": 0.1242, + "step": 3947 + }, + { + "epoch": 1.2793259883344135, + "grad_norm": 0.4738031029701233, + "learning_rate": 6.41476477302652e-06, + "loss": 0.1141, + "step": 3948 + }, + { + "epoch": 1.279650032404407, + "grad_norm": 0.4560187757015228, + "learning_rate": 6.413086953930652e-06, + "loss": 0.1117, + "step": 3949 + }, + { + "epoch": 1.2799740764744005, + "grad_norm": 0.4704577326774597, + "learning_rate": 6.4114089618866215e-06, + "loss": 0.1204, + "step": 3950 + }, + { + "epoch": 1.280298120544394, + "grad_norm": 0.46816858649253845, + "learning_rate": 6.409730797099797e-06, + "loss": 0.1161, + "step": 3951 + }, + { + "epoch": 1.2806221646143876, + "grad_norm": 0.46960848569869995, + "learning_rate": 6.40805245977557e-06, + "loss": 0.1159, + "step": 3952 + }, + { + "epoch": 1.280946208684381, + "grad_norm": 0.44595637917518616, + "learning_rate": 6.406373950119354e-06, + "loss": 0.1072, + "step": 3953 + }, + { + "epoch": 1.2812702527543747, + "grad_norm": 0.471149742603302, + "learning_rate": 6.4046952683365805e-06, + "loss": 0.1164, + "step": 3954 + }, + { + "epoch": 1.281594296824368, + "grad_norm": 0.4557897448539734, + "learning_rate": 6.403016414632705e-06, + "loss": 0.1093, + "step": 3955 + }, + { + "epoch": 1.2819183408943617, + "grad_norm": 0.45571160316467285, + "learning_rate": 6.4013373892132024e-06, + "loss": 0.1186, + "step": 3956 + }, + { + "epoch": 1.2822423849643552, + "grad_norm": 0.45447635650634766, + "learning_rate": 6.399658192283569e-06, + "loss": 0.0979, + "step": 3957 + }, + { + "epoch": 1.2825664290343486, + "grad_norm": 0.4660548269748688, + "learning_rate": 6.3979788240493226e-06, + "loss": 0.109, + "step": 3958 + }, + { + "epoch": 1.2828904731043422, + "grad_norm": 0.45442885160446167, + "learning_rate": 6.3962992847160025e-06, + "loss": 0.1102, + "step": 3959 + }, + { + "epoch": 1.2832145171743357, + "grad_norm": 0.48392462730407715, + "learning_rate": 6.3946195744891656e-06, + "loss": 0.1213, + "step": 3960 + }, + { + "epoch": 1.2835385612443293, + "grad_norm": 0.4479372501373291, + "learning_rate": 6.392939693574395e-06, + "loss": 0.0962, + "step": 3961 + }, + { + "epoch": 1.2838626053143227, + "grad_norm": 0.5068491101264954, + "learning_rate": 6.391259642177291e-06, + "loss": 0.1287, + "step": 3962 + }, + { + "epoch": 1.2841866493843161, + "grad_norm": 0.43103277683258057, + "learning_rate": 6.389579420503475e-06, + "loss": 0.0999, + "step": 3963 + }, + { + "epoch": 1.2845106934543098, + "grad_norm": 0.47594237327575684, + "learning_rate": 6.387899028758589e-06, + "loss": 0.1131, + "step": 3964 + }, + { + "epoch": 1.2848347375243034, + "grad_norm": 0.4817759692668915, + "learning_rate": 6.3862184671482995e-06, + "loss": 0.1174, + "step": 3965 + }, + { + "epoch": 1.2851587815942969, + "grad_norm": 0.4870638847351074, + "learning_rate": 6.384537735878288e-06, + "loss": 0.1177, + "step": 3966 + }, + { + "epoch": 1.2854828256642903, + "grad_norm": 0.5051252245903015, + "learning_rate": 6.3828568351542605e-06, + "loss": 0.1216, + "step": 3967 + }, + { + "epoch": 1.285806869734284, + "grad_norm": 0.4776197075843811, + "learning_rate": 6.381175765181945e-06, + "loss": 0.1129, + "step": 3968 + }, + { + "epoch": 1.2861309138042774, + "grad_norm": 0.44561052322387695, + "learning_rate": 6.379494526167086e-06, + "loss": 0.1089, + "step": 3969 + }, + { + "epoch": 1.286454957874271, + "grad_norm": 0.5017491579055786, + "learning_rate": 6.37781311831545e-06, + "loss": 0.115, + "step": 3970 + }, + { + "epoch": 1.2867790019442644, + "grad_norm": 0.4586908221244812, + "learning_rate": 6.376131541832829e-06, + "loss": 0.1057, + "step": 3971 + }, + { + "epoch": 1.2871030460142578, + "grad_norm": 0.4588594138622284, + "learning_rate": 6.374449796925027e-06, + "loss": 0.1186, + "step": 3972 + }, + { + "epoch": 1.2874270900842515, + "grad_norm": 0.47810885310173035, + "learning_rate": 6.372767883797877e-06, + "loss": 0.1162, + "step": 3973 + }, + { + "epoch": 1.287751134154245, + "grad_norm": 0.44631102681159973, + "learning_rate": 6.371085802657226e-06, + "loss": 0.1074, + "step": 3974 + }, + { + "epoch": 1.2880751782242386, + "grad_norm": 0.5198760628700256, + "learning_rate": 6.369403553708948e-06, + "loss": 0.1318, + "step": 3975 + }, + { + "epoch": 1.288399222294232, + "grad_norm": 0.4657326638698578, + "learning_rate": 6.367721137158933e-06, + "loss": 0.1075, + "step": 3976 + }, + { + "epoch": 1.2887232663642254, + "grad_norm": 0.4796975255012512, + "learning_rate": 6.366038553213089e-06, + "loss": 0.1125, + "step": 3977 + }, + { + "epoch": 1.289047310434219, + "grad_norm": 0.4714679718017578, + "learning_rate": 6.364355802077351e-06, + "loss": 0.1147, + "step": 3978 + }, + { + "epoch": 1.2893713545042127, + "grad_norm": 0.41647621989250183, + "learning_rate": 6.36267288395767e-06, + "loss": 0.1075, + "step": 3979 + }, + { + "epoch": 1.2896953985742061, + "grad_norm": 0.4634070098400116, + "learning_rate": 6.36098979906002e-06, + "loss": 0.1095, + "step": 3980 + }, + { + "epoch": 1.2900194426441995, + "grad_norm": 0.41540664434432983, + "learning_rate": 6.359306547590395e-06, + "loss": 0.1038, + "step": 3981 + }, + { + "epoch": 1.2903434867141932, + "grad_norm": 0.4272691607475281, + "learning_rate": 6.357623129754807e-06, + "loss": 0.1059, + "step": 3982 + }, + { + "epoch": 1.2906675307841866, + "grad_norm": 0.49106115102767944, + "learning_rate": 6.35593954575929e-06, + "loss": 0.1188, + "step": 3983 + }, + { + "epoch": 1.2909915748541803, + "grad_norm": 0.463112473487854, + "learning_rate": 6.354255795809899e-06, + "loss": 0.1115, + "step": 3984 + }, + { + "epoch": 1.2913156189241737, + "grad_norm": 0.631113588809967, + "learning_rate": 6.35257188011271e-06, + "loss": 0.1172, + "step": 3985 + }, + { + "epoch": 1.291639662994167, + "grad_norm": 0.4728262722492218, + "learning_rate": 6.3508877988738174e-06, + "loss": 0.1176, + "step": 3986 + }, + { + "epoch": 1.2919637070641607, + "grad_norm": 0.4452451765537262, + "learning_rate": 6.349203552299336e-06, + "loss": 0.1103, + "step": 3987 + }, + { + "epoch": 1.2922877511341542, + "grad_norm": 0.4820338487625122, + "learning_rate": 6.347519140595399e-06, + "loss": 0.1155, + "step": 3988 + }, + { + "epoch": 1.2926117952041478, + "grad_norm": 0.46228158473968506, + "learning_rate": 6.345834563968165e-06, + "loss": 0.1117, + "step": 3989 + }, + { + "epoch": 1.2929358392741412, + "grad_norm": 0.4567924439907074, + "learning_rate": 6.344149822623809e-06, + "loss": 0.1139, + "step": 3990 + }, + { + "epoch": 1.2932598833441347, + "grad_norm": 0.47757378220558167, + "learning_rate": 6.3424649167685274e-06, + "loss": 0.1027, + "step": 3991 + }, + { + "epoch": 1.2935839274141283, + "grad_norm": 0.4581971764564514, + "learning_rate": 6.340779846608535e-06, + "loss": 0.1105, + "step": 3992 + }, + { + "epoch": 1.293907971484122, + "grad_norm": 0.4690464437007904, + "learning_rate": 6.339094612350071e-06, + "loss": 0.1111, + "step": 3993 + }, + { + "epoch": 1.2942320155541154, + "grad_norm": 0.4487064778804779, + "learning_rate": 6.3374092141993884e-06, + "loss": 0.1042, + "step": 3994 + }, + { + "epoch": 1.2945560596241088, + "grad_norm": 0.5062069296836853, + "learning_rate": 6.3357236523627656e-06, + "loss": 0.1211, + "step": 3995 + }, + { + "epoch": 1.2948801036941024, + "grad_norm": 0.46276944875717163, + "learning_rate": 6.334037927046498e-06, + "loss": 0.1071, + "step": 3996 + }, + { + "epoch": 1.2952041477640959, + "grad_norm": 0.5312107801437378, + "learning_rate": 6.3323520384569036e-06, + "loss": 0.124, + "step": 3997 + }, + { + "epoch": 1.2955281918340895, + "grad_norm": 0.4688504934310913, + "learning_rate": 6.330665986800318e-06, + "loss": 0.1158, + "step": 3998 + }, + { + "epoch": 1.295852235904083, + "grad_norm": 0.4981670379638672, + "learning_rate": 6.328979772283097e-06, + "loss": 0.125, + "step": 3999 + }, + { + "epoch": 1.2961762799740764, + "grad_norm": 0.48476776480674744, + "learning_rate": 6.327293395111618e-06, + "loss": 0.1245, + "step": 4000 + }, + { + "epoch": 1.29650032404407, + "grad_norm": 0.472931444644928, + "learning_rate": 6.325606855492275e-06, + "loss": 0.1167, + "step": 4001 + }, + { + "epoch": 1.2968243681140634, + "grad_norm": 0.4795880913734436, + "learning_rate": 6.323920153631486e-06, + "loss": 0.1083, + "step": 4002 + }, + { + "epoch": 1.297148412184057, + "grad_norm": 0.5015627145767212, + "learning_rate": 6.322233289735689e-06, + "loss": 0.1194, + "step": 4003 + }, + { + "epoch": 1.2974724562540505, + "grad_norm": 0.4274068772792816, + "learning_rate": 6.320546264011335e-06, + "loss": 0.1032, + "step": 4004 + }, + { + "epoch": 1.2977965003240441, + "grad_norm": 0.4991876184940338, + "learning_rate": 6.318859076664904e-06, + "loss": 0.1206, + "step": 4005 + }, + { + "epoch": 1.2981205443940376, + "grad_norm": 0.47470569610595703, + "learning_rate": 6.317171727902889e-06, + "loss": 0.1241, + "step": 4006 + }, + { + "epoch": 1.2984445884640312, + "grad_norm": 0.48465102910995483, + "learning_rate": 6.315484217931805e-06, + "loss": 0.1119, + "step": 4007 + }, + { + "epoch": 1.2987686325340246, + "grad_norm": 0.4615066945552826, + "learning_rate": 6.313796546958189e-06, + "loss": 0.1082, + "step": 4008 + }, + { + "epoch": 1.299092676604018, + "grad_norm": 0.45659682154655457, + "learning_rate": 6.3121087151885915e-06, + "loss": 0.1126, + "step": 4009 + }, + { + "epoch": 1.2994167206740117, + "grad_norm": 0.502086877822876, + "learning_rate": 6.310420722829591e-06, + "loss": 0.1197, + "step": 4010 + }, + { + "epoch": 1.2997407647440051, + "grad_norm": 0.46333470940589905, + "learning_rate": 6.308732570087781e-06, + "loss": 0.1091, + "step": 4011 + }, + { + "epoch": 1.3000648088139988, + "grad_norm": 0.4794006943702698, + "learning_rate": 6.307044257169773e-06, + "loss": 0.1234, + "step": 4012 + }, + { + "epoch": 1.3003888528839922, + "grad_norm": 0.4782438576221466, + "learning_rate": 6.305355784282201e-06, + "loss": 0.1106, + "step": 4013 + }, + { + "epoch": 1.3007128969539856, + "grad_norm": 0.4819827675819397, + "learning_rate": 6.303667151631718e-06, + "loss": 0.1154, + "step": 4014 + }, + { + "epoch": 1.3010369410239793, + "grad_norm": 0.44112661480903625, + "learning_rate": 6.301978359424995e-06, + "loss": 0.1057, + "step": 4015 + }, + { + "epoch": 1.301360985093973, + "grad_norm": 0.4780457019805908, + "learning_rate": 6.300289407868726e-06, + "loss": 0.1271, + "step": 4016 + }, + { + "epoch": 1.3016850291639663, + "grad_norm": 0.4852437376976013, + "learning_rate": 6.298600297169622e-06, + "loss": 0.1228, + "step": 4017 + }, + { + "epoch": 1.3020090732339598, + "grad_norm": 0.4688003361225128, + "learning_rate": 6.296911027534413e-06, + "loss": 0.1024, + "step": 4018 + }, + { + "epoch": 1.3023331173039534, + "grad_norm": 0.4358528256416321, + "learning_rate": 6.295221599169848e-06, + "loss": 0.1057, + "step": 4019 + }, + { + "epoch": 1.3026571613739468, + "grad_norm": 0.46564021706581116, + "learning_rate": 6.293532012282699e-06, + "loss": 0.1106, + "step": 4020 + }, + { + "epoch": 1.3029812054439405, + "grad_norm": 0.4571780562400818, + "learning_rate": 6.291842267079753e-06, + "loss": 0.1135, + "step": 4021 + }, + { + "epoch": 1.303305249513934, + "grad_norm": 0.4707525968551636, + "learning_rate": 6.29015236376782e-06, + "loss": 0.1149, + "step": 4022 + }, + { + "epoch": 1.3036292935839273, + "grad_norm": 0.45767292380332947, + "learning_rate": 6.288462302553728e-06, + "loss": 0.118, + "step": 4023 + }, + { + "epoch": 1.303953337653921, + "grad_norm": 0.46272486448287964, + "learning_rate": 6.286772083644324e-06, + "loss": 0.1077, + "step": 4024 + }, + { + "epoch": 1.3042773817239144, + "grad_norm": 0.4843446910381317, + "learning_rate": 6.285081707246472e-06, + "loss": 0.1152, + "step": 4025 + }, + { + "epoch": 1.304601425793908, + "grad_norm": 0.4920760989189148, + "learning_rate": 6.28339117356706e-06, + "loss": 0.1192, + "step": 4026 + }, + { + "epoch": 1.3049254698639015, + "grad_norm": 0.4871255159378052, + "learning_rate": 6.281700482812993e-06, + "loss": 0.111, + "step": 4027 + }, + { + "epoch": 1.3052495139338949, + "grad_norm": 0.4424149990081787, + "learning_rate": 6.280009635191194e-06, + "loss": 0.1014, + "step": 4028 + }, + { + "epoch": 1.3055735580038885, + "grad_norm": 0.4454768896102905, + "learning_rate": 6.2783186309086086e-06, + "loss": 0.1079, + "step": 4029 + }, + { + "epoch": 1.3058976020738822, + "grad_norm": 0.47427868843078613, + "learning_rate": 6.276627470172198e-06, + "loss": 0.1132, + "step": 4030 + }, + { + "epoch": 1.3062216461438756, + "grad_norm": 0.49968528747558594, + "learning_rate": 6.274936153188942e-06, + "loss": 0.1237, + "step": 4031 + }, + { + "epoch": 1.306545690213869, + "grad_norm": 0.4610420763492584, + "learning_rate": 6.273244680165843e-06, + "loss": 0.1139, + "step": 4032 + }, + { + "epoch": 1.3068697342838627, + "grad_norm": 0.4568065106868744, + "learning_rate": 6.271553051309922e-06, + "loss": 0.1063, + "step": 4033 + }, + { + "epoch": 1.307193778353856, + "grad_norm": 0.4630391299724579, + "learning_rate": 6.269861266828217e-06, + "loss": 0.1086, + "step": 4034 + }, + { + "epoch": 1.3075178224238497, + "grad_norm": 0.46840715408325195, + "learning_rate": 6.268169326927788e-06, + "loss": 0.1118, + "step": 4035 + }, + { + "epoch": 1.3078418664938432, + "grad_norm": 0.48442018032073975, + "learning_rate": 6.266477231815707e-06, + "loss": 0.1148, + "step": 4036 + }, + { + "epoch": 1.3081659105638366, + "grad_norm": 0.4597585201263428, + "learning_rate": 6.264784981699074e-06, + "loss": 0.1095, + "step": 4037 + }, + { + "epoch": 1.3084899546338302, + "grad_norm": 0.44043147563934326, + "learning_rate": 6.263092576785005e-06, + "loss": 0.1008, + "step": 4038 + }, + { + "epoch": 1.3088139987038236, + "grad_norm": 0.4840199649333954, + "learning_rate": 6.2614000172806324e-06, + "loss": 0.1236, + "step": 4039 + }, + { + "epoch": 1.3091380427738173, + "grad_norm": 0.4641066789627075, + "learning_rate": 6.2597073033931075e-06, + "loss": 0.1124, + "step": 4040 + }, + { + "epoch": 1.3094620868438107, + "grad_norm": 0.47489872574806213, + "learning_rate": 6.258014435329604e-06, + "loss": 0.1153, + "step": 4041 + }, + { + "epoch": 1.3097861309138044, + "grad_norm": 0.47044837474823, + "learning_rate": 6.256321413297313e-06, + "loss": 0.1152, + "step": 4042 + }, + { + "epoch": 1.3101101749837978, + "grad_norm": 0.4591846168041229, + "learning_rate": 6.254628237503442e-06, + "loss": 0.1081, + "step": 4043 + }, + { + "epoch": 1.3104342190537914, + "grad_norm": 0.4056306779384613, + "learning_rate": 6.25293490815522e-06, + "loss": 0.0999, + "step": 4044 + }, + { + "epoch": 1.3107582631237849, + "grad_norm": 0.49313053488731384, + "learning_rate": 6.251241425459895e-06, + "loss": 0.1214, + "step": 4045 + }, + { + "epoch": 1.3110823071937783, + "grad_norm": 0.46979349851608276, + "learning_rate": 6.249547789624734e-06, + "loss": 0.1125, + "step": 4046 + }, + { + "epoch": 1.311406351263772, + "grad_norm": 0.4617290496826172, + "learning_rate": 6.247854000857018e-06, + "loss": 0.1103, + "step": 4047 + }, + { + "epoch": 1.3117303953337653, + "grad_norm": 0.4825688600540161, + "learning_rate": 6.246160059364054e-06, + "loss": 0.122, + "step": 4048 + }, + { + "epoch": 1.312054439403759, + "grad_norm": 0.48396843671798706, + "learning_rate": 6.244465965353161e-06, + "loss": 0.1287, + "step": 4049 + }, + { + "epoch": 1.3123784834737524, + "grad_norm": 0.4837722182273865, + "learning_rate": 6.242771719031684e-06, + "loss": 0.1166, + "step": 4050 + }, + { + "epoch": 1.3127025275437458, + "grad_norm": 0.5088424682617188, + "learning_rate": 6.241077320606977e-06, + "loss": 0.1257, + "step": 4051 + }, + { + "epoch": 1.3130265716137395, + "grad_norm": 0.49504292011260986, + "learning_rate": 6.2393827702864215e-06, + "loss": 0.1137, + "step": 4052 + }, + { + "epoch": 1.3133506156837331, + "grad_norm": 0.43925389647483826, + "learning_rate": 6.2376880682774125e-06, + "loss": 0.1062, + "step": 4053 + }, + { + "epoch": 1.3136746597537265, + "grad_norm": 0.4258055090904236, + "learning_rate": 6.235993214787367e-06, + "loss": 0.0944, + "step": 4054 + }, + { + "epoch": 1.31399870382372, + "grad_norm": 0.4500040113925934, + "learning_rate": 6.234298210023716e-06, + "loss": 0.1132, + "step": 4055 + }, + { + "epoch": 1.3143227478937136, + "grad_norm": 0.44641897082328796, + "learning_rate": 6.2326030541939135e-06, + "loss": 0.1068, + "step": 4056 + }, + { + "epoch": 1.314646791963707, + "grad_norm": 0.512460470199585, + "learning_rate": 6.230907747505428e-06, + "loss": 0.1285, + "step": 4057 + }, + { + "epoch": 1.3149708360337007, + "grad_norm": 0.48335960507392883, + "learning_rate": 6.229212290165752e-06, + "loss": 0.1192, + "step": 4058 + }, + { + "epoch": 1.315294880103694, + "grad_norm": 0.4757685661315918, + "learning_rate": 6.227516682382391e-06, + "loss": 0.1143, + "step": 4059 + }, + { + "epoch": 1.3156189241736875, + "grad_norm": 0.48404660820961, + "learning_rate": 6.225820924362873e-06, + "loss": 0.1132, + "step": 4060 + }, + { + "epoch": 1.3159429682436812, + "grad_norm": 0.46162092685699463, + "learning_rate": 6.224125016314739e-06, + "loss": 0.112, + "step": 4061 + }, + { + "epoch": 1.3162670123136746, + "grad_norm": 0.439531534910202, + "learning_rate": 6.222428958445555e-06, + "loss": 0.1028, + "step": 4062 + }, + { + "epoch": 1.3165910563836682, + "grad_norm": 0.48746368288993835, + "learning_rate": 6.220732750962899e-06, + "loss": 0.1137, + "step": 4063 + }, + { + "epoch": 1.3169151004536617, + "grad_norm": 0.5479814410209656, + "learning_rate": 6.219036394074372e-06, + "loss": 0.1339, + "step": 4064 + }, + { + "epoch": 1.317239144523655, + "grad_norm": 0.44706690311431885, + "learning_rate": 6.217339887987591e-06, + "loss": 0.1028, + "step": 4065 + }, + { + "epoch": 1.3175631885936487, + "grad_norm": 0.448042094707489, + "learning_rate": 6.215643232910193e-06, + "loss": 0.107, + "step": 4066 + }, + { + "epoch": 1.3178872326636424, + "grad_norm": 0.5104048252105713, + "learning_rate": 6.213946429049833e-06, + "loss": 0.1276, + "step": 4067 + }, + { + "epoch": 1.3182112767336358, + "grad_norm": 0.43246206641197205, + "learning_rate": 6.212249476614181e-06, + "loss": 0.0954, + "step": 4068 + }, + { + "epoch": 1.3185353208036292, + "grad_norm": 0.4300832450389862, + "learning_rate": 6.210552375810927e-06, + "loss": 0.105, + "step": 4069 + }, + { + "epoch": 1.3188593648736229, + "grad_norm": 0.4607033431529999, + "learning_rate": 6.208855126847783e-06, + "loss": 0.1086, + "step": 4070 + }, + { + "epoch": 1.3191834089436163, + "grad_norm": 0.45668113231658936, + "learning_rate": 6.207157729932474e-06, + "loss": 0.1104, + "step": 4071 + }, + { + "epoch": 1.31950745301361, + "grad_norm": 0.4810584485530853, + "learning_rate": 6.205460185272745e-06, + "loss": 0.1146, + "step": 4072 + }, + { + "epoch": 1.3198314970836034, + "grad_norm": 0.5209870338439941, + "learning_rate": 6.203762493076359e-06, + "loss": 0.1243, + "step": 4073 + }, + { + "epoch": 1.3201555411535968, + "grad_norm": 0.4863770306110382, + "learning_rate": 6.202064653551097e-06, + "loss": 0.1171, + "step": 4074 + }, + { + "epoch": 1.3204795852235904, + "grad_norm": 0.45841556787490845, + "learning_rate": 6.200366666904758e-06, + "loss": 0.11, + "step": 4075 + }, + { + "epoch": 1.3208036292935839, + "grad_norm": 0.43122103810310364, + "learning_rate": 6.1986685333451606e-06, + "loss": 0.106, + "step": 4076 + }, + { + "epoch": 1.3211276733635775, + "grad_norm": 0.474511057138443, + "learning_rate": 6.196970253080137e-06, + "loss": 0.111, + "step": 4077 + }, + { + "epoch": 1.321451717433571, + "grad_norm": 0.47046247124671936, + "learning_rate": 6.195271826317544e-06, + "loss": 0.1099, + "step": 4078 + }, + { + "epoch": 1.3217757615035644, + "grad_norm": 0.46735498309135437, + "learning_rate": 6.193573253265248e-06, + "loss": 0.1095, + "step": 4079 + }, + { + "epoch": 1.322099805573558, + "grad_norm": 0.48384442925453186, + "learning_rate": 6.191874534131143e-06, + "loss": 0.121, + "step": 4080 + }, + { + "epoch": 1.3224238496435516, + "grad_norm": 0.4530140161514282, + "learning_rate": 6.190175669123131e-06, + "loss": 0.1108, + "step": 4081 + }, + { + "epoch": 1.322747893713545, + "grad_norm": 0.4611378312110901, + "learning_rate": 6.188476658449141e-06, + "loss": 0.1204, + "step": 4082 + }, + { + "epoch": 1.3230719377835385, + "grad_norm": 0.4774719774723053, + "learning_rate": 6.186777502317113e-06, + "loss": 0.1174, + "step": 4083 + }, + { + "epoch": 1.3233959818535321, + "grad_norm": 0.458342581987381, + "learning_rate": 6.1850782009350075e-06, + "loss": 0.1153, + "step": 4084 + }, + { + "epoch": 1.3237200259235256, + "grad_norm": 0.4817325472831726, + "learning_rate": 6.183378754510801e-06, + "loss": 0.1121, + "step": 4085 + }, + { + "epoch": 1.3240440699935192, + "grad_norm": 0.4727479815483093, + "learning_rate": 6.181679163252493e-06, + "loss": 0.11, + "step": 4086 + }, + { + "epoch": 1.3243681140635126, + "grad_norm": 0.5003477931022644, + "learning_rate": 6.1799794273680936e-06, + "loss": 0.1195, + "step": 4087 + }, + { + "epoch": 1.324692158133506, + "grad_norm": 0.4953130781650543, + "learning_rate": 6.178279547065635e-06, + "loss": 0.1163, + "step": 4088 + }, + { + "epoch": 1.3250162022034997, + "grad_norm": 0.4566429853439331, + "learning_rate": 6.176579522553168e-06, + "loss": 0.1094, + "step": 4089 + }, + { + "epoch": 1.3253402462734931, + "grad_norm": 0.4813137948513031, + "learning_rate": 6.174879354038757e-06, + "loss": 0.1124, + "step": 4090 + }, + { + "epoch": 1.3256642903434868, + "grad_norm": 0.4739673435688019, + "learning_rate": 6.173179041730487e-06, + "loss": 0.1277, + "step": 4091 + }, + { + "epoch": 1.3259883344134802, + "grad_norm": 0.5202913880348206, + "learning_rate": 6.171478585836459e-06, + "loss": 0.1322, + "step": 4092 + }, + { + "epoch": 1.3263123784834738, + "grad_norm": 0.45613497495651245, + "learning_rate": 6.169777986564794e-06, + "loss": 0.1105, + "step": 4093 + }, + { + "epoch": 1.3266364225534673, + "grad_norm": 0.4877902865409851, + "learning_rate": 6.168077244123627e-06, + "loss": 0.1218, + "step": 4094 + }, + { + "epoch": 1.326960466623461, + "grad_norm": 0.4779025912284851, + "learning_rate": 6.166376358721112e-06, + "loss": 0.1196, + "step": 4095 + }, + { + "epoch": 1.3272845106934543, + "grad_norm": 0.4462127387523651, + "learning_rate": 6.164675330565425e-06, + "loss": 0.1114, + "step": 4096 + }, + { + "epoch": 1.3276085547634477, + "grad_norm": 0.4394828677177429, + "learning_rate": 6.1629741598647496e-06, + "loss": 0.1106, + "step": 4097 + }, + { + "epoch": 1.3279325988334414, + "grad_norm": 0.4651632308959961, + "learning_rate": 6.161272846827298e-06, + "loss": 0.1071, + "step": 4098 + }, + { + "epoch": 1.3282566429034348, + "grad_norm": 0.4370875358581543, + "learning_rate": 6.15957139166129e-06, + "loss": 0.116, + "step": 4099 + }, + { + "epoch": 1.3285806869734285, + "grad_norm": 0.47040608525276184, + "learning_rate": 6.157869794574969e-06, + "loss": 0.1177, + "step": 4100 + }, + { + "epoch": 1.3289047310434219, + "grad_norm": 0.4459068179130554, + "learning_rate": 6.156168055776595e-06, + "loss": 0.107, + "step": 4101 + }, + { + "epoch": 1.3292287751134153, + "grad_norm": 0.42454037070274353, + "learning_rate": 6.154466175474444e-06, + "loss": 0.1014, + "step": 4102 + }, + { + "epoch": 1.329552819183409, + "grad_norm": 0.4620989263057709, + "learning_rate": 6.1527641538768075e-06, + "loss": 0.1132, + "step": 4103 + }, + { + "epoch": 1.3298768632534026, + "grad_norm": 0.4761102795600891, + "learning_rate": 6.151061991192001e-06, + "loss": 0.1108, + "step": 4104 + }, + { + "epoch": 1.330200907323396, + "grad_norm": 0.4630895256996155, + "learning_rate": 6.149359687628348e-06, + "loss": 0.1133, + "step": 4105 + }, + { + "epoch": 1.3305249513933894, + "grad_norm": 0.4673987627029419, + "learning_rate": 6.147657243394196e-06, + "loss": 0.1132, + "step": 4106 + }, + { + "epoch": 1.330848995463383, + "grad_norm": 0.4772607088088989, + "learning_rate": 6.145954658697908e-06, + "loss": 0.1165, + "step": 4107 + }, + { + "epoch": 1.3311730395333765, + "grad_norm": 0.4496089518070221, + "learning_rate": 6.144251933747864e-06, + "loss": 0.1164, + "step": 4108 + }, + { + "epoch": 1.3314970836033702, + "grad_norm": 0.5247877836227417, + "learning_rate": 6.142549068752459e-06, + "loss": 0.1283, + "step": 4109 + }, + { + "epoch": 1.3318211276733636, + "grad_norm": 0.47814130783081055, + "learning_rate": 6.1408460639201095e-06, + "loss": 0.1111, + "step": 4110 + }, + { + "epoch": 1.332145171743357, + "grad_norm": 0.4383772313594818, + "learning_rate": 6.139142919459246e-06, + "loss": 0.102, + "step": 4111 + }, + { + "epoch": 1.3324692158133506, + "grad_norm": 0.4361100196838379, + "learning_rate": 6.137439635578316e-06, + "loss": 0.1123, + "step": 4112 + }, + { + "epoch": 1.332793259883344, + "grad_norm": 0.4610941708087921, + "learning_rate": 6.135736212485788e-06, + "loss": 0.1088, + "step": 4113 + }, + { + "epoch": 1.3331173039533377, + "grad_norm": 0.48235994577407837, + "learning_rate": 6.1340326503901405e-06, + "loss": 0.1077, + "step": 4114 + }, + { + "epoch": 1.3334413480233311, + "grad_norm": 0.49216118454933167, + "learning_rate": 6.132328949499877e-06, + "loss": 0.1067, + "step": 4115 + }, + { + "epoch": 1.3337653920933246, + "grad_norm": 0.46302199363708496, + "learning_rate": 6.1306251100235094e-06, + "loss": 0.1104, + "step": 4116 + }, + { + "epoch": 1.3340894361633182, + "grad_norm": 0.4512770473957062, + "learning_rate": 6.128921132169575e-06, + "loss": 0.1066, + "step": 4117 + }, + { + "epoch": 1.3344134802333119, + "grad_norm": 0.47927504777908325, + "learning_rate": 6.1272170161466225e-06, + "loss": 0.1171, + "step": 4118 + }, + { + "epoch": 1.3347375243033053, + "grad_norm": 0.4398737847805023, + "learning_rate": 6.125512762163219e-06, + "loss": 0.1039, + "step": 4119 + }, + { + "epoch": 1.3350615683732987, + "grad_norm": 0.45653656125068665, + "learning_rate": 6.123808370427949e-06, + "loss": 0.1155, + "step": 4120 + }, + { + "epoch": 1.3353856124432923, + "grad_norm": 0.4693288207054138, + "learning_rate": 6.122103841149416e-06, + "loss": 0.115, + "step": 4121 + }, + { + "epoch": 1.3357096565132858, + "grad_norm": 0.43127191066741943, + "learning_rate": 6.120399174536233e-06, + "loss": 0.1096, + "step": 4122 + }, + { + "epoch": 1.3360337005832794, + "grad_norm": 0.47410160303115845, + "learning_rate": 6.1186943707970395e-06, + "loss": 0.1134, + "step": 4123 + }, + { + "epoch": 1.3363577446532728, + "grad_norm": 0.45060470700263977, + "learning_rate": 6.116989430140484e-06, + "loss": 0.1021, + "step": 4124 + }, + { + "epoch": 1.3366817887232663, + "grad_norm": 0.4330093562602997, + "learning_rate": 6.115284352775235e-06, + "loss": 0.1, + "step": 4125 + }, + { + "epoch": 1.33700583279326, + "grad_norm": 0.4936785399913788, + "learning_rate": 6.113579138909978e-06, + "loss": 0.1234, + "step": 4126 + }, + { + "epoch": 1.3373298768632533, + "grad_norm": 0.44440215826034546, + "learning_rate": 6.111873788753416e-06, + "loss": 0.1064, + "step": 4127 + }, + { + "epoch": 1.337653920933247, + "grad_norm": 0.48851221799850464, + "learning_rate": 6.110168302514266e-06, + "loss": 0.1146, + "step": 4128 + }, + { + "epoch": 1.3379779650032404, + "grad_norm": 0.468937486410141, + "learning_rate": 6.108462680401262e-06, + "loss": 0.1142, + "step": 4129 + }, + { + "epoch": 1.3383020090732338, + "grad_norm": 0.4295579493045807, + "learning_rate": 6.106756922623156e-06, + "loss": 0.0968, + "step": 4130 + }, + { + "epoch": 1.3386260531432275, + "grad_norm": 0.4338614344596863, + "learning_rate": 6.1050510293887165e-06, + "loss": 0.1022, + "step": 4131 + }, + { + "epoch": 1.3389500972132211, + "grad_norm": 0.43360260128974915, + "learning_rate": 6.1033450009067295e-06, + "loss": 0.1013, + "step": 4132 + }, + { + "epoch": 1.3392741412832145, + "grad_norm": 0.4844948649406433, + "learning_rate": 6.101638837385997e-06, + "loss": 0.1129, + "step": 4133 + }, + { + "epoch": 1.339598185353208, + "grad_norm": 0.4562806487083435, + "learning_rate": 6.099932539035335e-06, + "loss": 0.1071, + "step": 4134 + }, + { + "epoch": 1.3399222294232016, + "grad_norm": 0.44041526317596436, + "learning_rate": 6.098226106063577e-06, + "loss": 0.1031, + "step": 4135 + }, + { + "epoch": 1.340246273493195, + "grad_norm": 0.4702534079551697, + "learning_rate": 6.0965195386795774e-06, + "loss": 0.1141, + "step": 4136 + }, + { + "epoch": 1.3405703175631887, + "grad_norm": 0.4721149802207947, + "learning_rate": 6.0948128370922e-06, + "loss": 0.1113, + "step": 4137 + }, + { + "epoch": 1.340894361633182, + "grad_norm": 0.46534913778305054, + "learning_rate": 6.093106001510329e-06, + "loss": 0.1143, + "step": 4138 + }, + { + "epoch": 1.3412184057031755, + "grad_norm": 0.49070435762405396, + "learning_rate": 6.091399032142869e-06, + "loss": 0.1139, + "step": 4139 + }, + { + "epoch": 1.3415424497731692, + "grad_norm": 0.4650188684463501, + "learning_rate": 6.08969192919873e-06, + "loss": 0.1108, + "step": 4140 + }, + { + "epoch": 1.3418664938431626, + "grad_norm": 0.42567306756973267, + "learning_rate": 6.087984692886848e-06, + "loss": 0.1031, + "step": 4141 + }, + { + "epoch": 1.3421905379131562, + "grad_norm": 0.4322057068347931, + "learning_rate": 6.086277323416172e-06, + "loss": 0.101, + "step": 4142 + }, + { + "epoch": 1.3425145819831497, + "grad_norm": 0.4325510263442993, + "learning_rate": 6.084569820995668e-06, + "loss": 0.1026, + "step": 4143 + }, + { + "epoch": 1.3428386260531433, + "grad_norm": 0.46976298093795776, + "learning_rate": 6.0828621858343175e-06, + "loss": 0.1216, + "step": 4144 + }, + { + "epoch": 1.3431626701231367, + "grad_norm": 0.4701208472251892, + "learning_rate": 6.08115441814112e-06, + "loss": 0.1137, + "step": 4145 + }, + { + "epoch": 1.3434867141931304, + "grad_norm": 0.45568281412124634, + "learning_rate": 6.079446518125086e-06, + "loss": 0.1119, + "step": 4146 + }, + { + "epoch": 1.3438107582631238, + "grad_norm": 0.4867906868457794, + "learning_rate": 6.077738485995249e-06, + "loss": 0.1181, + "step": 4147 + }, + { + "epoch": 1.3441348023331172, + "grad_norm": 0.4914509654045105, + "learning_rate": 6.076030321960654e-06, + "loss": 0.1129, + "step": 4148 + }, + { + "epoch": 1.3444588464031109, + "grad_norm": 0.4761381149291992, + "learning_rate": 6.074322026230365e-06, + "loss": 0.1193, + "step": 4149 + }, + { + "epoch": 1.3447828904731043, + "grad_norm": 0.4466744363307953, + "learning_rate": 6.072613599013459e-06, + "loss": 0.1099, + "step": 4150 + }, + { + "epoch": 1.345106934543098, + "grad_norm": 0.4828190207481384, + "learning_rate": 6.070905040519034e-06, + "loss": 0.1176, + "step": 4151 + }, + { + "epoch": 1.3454309786130914, + "grad_norm": 0.4406268298625946, + "learning_rate": 6.069196350956198e-06, + "loss": 0.1056, + "step": 4152 + }, + { + "epoch": 1.3457550226830848, + "grad_norm": 0.4638471305370331, + "learning_rate": 6.06748753053408e-06, + "loss": 0.1106, + "step": 4153 + }, + { + "epoch": 1.3460790667530784, + "grad_norm": 0.4701295495033264, + "learning_rate": 6.065778579461821e-06, + "loss": 0.1175, + "step": 4154 + }, + { + "epoch": 1.346403110823072, + "grad_norm": 0.4829196035861969, + "learning_rate": 6.064069497948581e-06, + "loss": 0.1202, + "step": 4155 + }, + { + "epoch": 1.3467271548930655, + "grad_norm": 0.4684045910835266, + "learning_rate": 6.062360286203538e-06, + "loss": 0.1084, + "step": 4156 + }, + { + "epoch": 1.347051198963059, + "grad_norm": 0.5000907182693481, + "learning_rate": 6.06065094443588e-06, + "loss": 0.1151, + "step": 4157 + }, + { + "epoch": 1.3473752430330526, + "grad_norm": 0.4776161015033722, + "learning_rate": 6.058941472854813e-06, + "loss": 0.117, + "step": 4158 + }, + { + "epoch": 1.347699287103046, + "grad_norm": 0.44210824370384216, + "learning_rate": 6.057231871669562e-06, + "loss": 0.1053, + "step": 4159 + }, + { + "epoch": 1.3480233311730396, + "grad_norm": 0.47822654247283936, + "learning_rate": 6.055522141089364e-06, + "loss": 0.1114, + "step": 4160 + }, + { + "epoch": 1.348347375243033, + "grad_norm": 0.4372398853302002, + "learning_rate": 6.053812281323474e-06, + "loss": 0.1023, + "step": 4161 + }, + { + "epoch": 1.3486714193130265, + "grad_norm": 0.45535725355148315, + "learning_rate": 6.052102292581162e-06, + "loss": 0.1065, + "step": 4162 + }, + { + "epoch": 1.3489954633830201, + "grad_norm": 0.4853324592113495, + "learning_rate": 6.050392175071716e-06, + "loss": 0.1165, + "step": 4163 + }, + { + "epoch": 1.3493195074530135, + "grad_norm": 0.4817340672016144, + "learning_rate": 6.048681929004436e-06, + "loss": 0.1182, + "step": 4164 + }, + { + "epoch": 1.3496435515230072, + "grad_norm": 0.43401870131492615, + "learning_rate": 6.0469715545886394e-06, + "loss": 0.1052, + "step": 4165 + }, + { + "epoch": 1.3499675955930006, + "grad_norm": 0.43376854062080383, + "learning_rate": 6.0452610520336595e-06, + "loss": 0.0973, + "step": 4166 + }, + { + "epoch": 1.350291639662994, + "grad_norm": 0.4899052083492279, + "learning_rate": 6.043550421548847e-06, + "loss": 0.1259, + "step": 4167 + }, + { + "epoch": 1.3506156837329877, + "grad_norm": 0.4612118601799011, + "learning_rate": 6.041839663343565e-06, + "loss": 0.1069, + "step": 4168 + }, + { + "epoch": 1.3509397278029813, + "grad_norm": 0.4711647033691406, + "learning_rate": 6.0401287776271945e-06, + "loss": 0.116, + "step": 4169 + }, + { + "epoch": 1.3512637718729748, + "grad_norm": 0.5112835764884949, + "learning_rate": 6.03841776460913e-06, + "loss": 0.124, + "step": 4170 + }, + { + "epoch": 1.3515878159429682, + "grad_norm": 0.42946645617485046, + "learning_rate": 6.0367066244987834e-06, + "loss": 0.099, + "step": 4171 + }, + { + "epoch": 1.3519118600129618, + "grad_norm": 0.49457547068595886, + "learning_rate": 6.034995357505582e-06, + "loss": 0.1281, + "step": 4172 + }, + { + "epoch": 1.3522359040829552, + "grad_norm": 0.49168887734413147, + "learning_rate": 6.03328396383897e-06, + "loss": 0.1239, + "step": 4173 + }, + { + "epoch": 1.3525599481529489, + "grad_norm": 0.5071669816970825, + "learning_rate": 6.031572443708401e-06, + "loss": 0.1101, + "step": 4174 + }, + { + "epoch": 1.3528839922229423, + "grad_norm": 0.49065500497817993, + "learning_rate": 6.0298607973233545e-06, + "loss": 0.1242, + "step": 4175 + }, + { + "epoch": 1.3532080362929357, + "grad_norm": 0.5139132738113403, + "learning_rate": 6.028149024893314e-06, + "loss": 0.1134, + "step": 4176 + }, + { + "epoch": 1.3535320803629294, + "grad_norm": 0.4649846851825714, + "learning_rate": 6.026437126627787e-06, + "loss": 0.1193, + "step": 4177 + }, + { + "epoch": 1.3538561244329228, + "grad_norm": 0.46105092763900757, + "learning_rate": 6.024725102736293e-06, + "loss": 0.1113, + "step": 4178 + }, + { + "epoch": 1.3541801685029164, + "grad_norm": 0.4748861491680145, + "learning_rate": 6.023012953428365e-06, + "loss": 0.1117, + "step": 4179 + }, + { + "epoch": 1.3545042125729099, + "grad_norm": 0.4985915720462799, + "learning_rate": 6.021300678913555e-06, + "loss": 0.1227, + "step": 4180 + }, + { + "epoch": 1.3548282566429035, + "grad_norm": 0.463781476020813, + "learning_rate": 6.019588279401431e-06, + "loss": 0.1107, + "step": 4181 + }, + { + "epoch": 1.355152300712897, + "grad_norm": 0.43629616498947144, + "learning_rate": 6.01787575510157e-06, + "loss": 0.1096, + "step": 4182 + }, + { + "epoch": 1.3554763447828906, + "grad_norm": 0.5077922940254211, + "learning_rate": 6.016163106223572e-06, + "loss": 0.122, + "step": 4183 + }, + { + "epoch": 1.355800388852884, + "grad_norm": 0.4964093565940857, + "learning_rate": 6.0144503329770445e-06, + "loss": 0.1262, + "step": 4184 + }, + { + "epoch": 1.3561244329228774, + "grad_norm": 0.45850878953933716, + "learning_rate": 6.012737435571618e-06, + "loss": 0.111, + "step": 4185 + }, + { + "epoch": 1.356448476992871, + "grad_norm": 0.44368118047714233, + "learning_rate": 6.011024414216934e-06, + "loss": 0.1042, + "step": 4186 + }, + { + "epoch": 1.3567725210628645, + "grad_norm": 0.44517815113067627, + "learning_rate": 6.00931126912265e-06, + "loss": 0.1011, + "step": 4187 + }, + { + "epoch": 1.3570965651328581, + "grad_norm": 0.5106947422027588, + "learning_rate": 6.007598000498436e-06, + "loss": 0.1249, + "step": 4188 + }, + { + "epoch": 1.3574206092028516, + "grad_norm": 0.5051454305648804, + "learning_rate": 6.005884608553982e-06, + "loss": 0.1248, + "step": 4189 + }, + { + "epoch": 1.357744653272845, + "grad_norm": 0.4792226552963257, + "learning_rate": 6.00417109349899e-06, + "loss": 0.1113, + "step": 4190 + }, + { + "epoch": 1.3580686973428386, + "grad_norm": 0.4766109585762024, + "learning_rate": 6.002457455543176e-06, + "loss": 0.1121, + "step": 4191 + }, + { + "epoch": 1.3583927414128323, + "grad_norm": 0.4882993996143341, + "learning_rate": 6.000743694896274e-06, + "loss": 0.1245, + "step": 4192 + }, + { + "epoch": 1.3587167854828257, + "grad_norm": 0.44108298420906067, + "learning_rate": 5.999029811768031e-06, + "loss": 0.1008, + "step": 4193 + }, + { + "epoch": 1.3590408295528191, + "grad_norm": 0.4641225039958954, + "learning_rate": 5.997315806368214e-06, + "loss": 0.1079, + "step": 4194 + }, + { + "epoch": 1.3593648736228128, + "grad_norm": 0.47970643639564514, + "learning_rate": 5.995601678906593e-06, + "loss": 0.1135, + "step": 4195 + }, + { + "epoch": 1.3596889176928062, + "grad_norm": 0.48470956087112427, + "learning_rate": 5.993887429592966e-06, + "loss": 0.1158, + "step": 4196 + }, + { + "epoch": 1.3600129617627998, + "grad_norm": 0.4878312647342682, + "learning_rate": 5.992173058637139e-06, + "loss": 0.1164, + "step": 4197 + }, + { + "epoch": 1.3603370058327933, + "grad_norm": 0.4864386022090912, + "learning_rate": 5.990458566248936e-06, + "loss": 0.1118, + "step": 4198 + }, + { + "epoch": 1.3606610499027867, + "grad_norm": 0.4517388939857483, + "learning_rate": 5.988743952638192e-06, + "loss": 0.1105, + "step": 4199 + }, + { + "epoch": 1.3609850939727803, + "grad_norm": 0.46198770403862, + "learning_rate": 5.987029218014762e-06, + "loss": 0.1156, + "step": 4200 + }, + { + "epoch": 1.3613091380427738, + "grad_norm": 0.5112099051475525, + "learning_rate": 5.985314362588508e-06, + "loss": 0.1263, + "step": 4201 + }, + { + "epoch": 1.3616331821127674, + "grad_norm": 0.44200339913368225, + "learning_rate": 5.9835993865693144e-06, + "loss": 0.1051, + "step": 4202 + }, + { + "epoch": 1.3619572261827608, + "grad_norm": 0.45443880558013916, + "learning_rate": 5.98188429016708e-06, + "loss": 0.1037, + "step": 4203 + }, + { + "epoch": 1.3622812702527543, + "grad_norm": 0.4965524673461914, + "learning_rate": 5.980169073591712e-06, + "loss": 0.1255, + "step": 4204 + }, + { + "epoch": 1.362605314322748, + "grad_norm": 0.42498093843460083, + "learning_rate": 5.978453737053138e-06, + "loss": 0.095, + "step": 4205 + }, + { + "epoch": 1.3629293583927415, + "grad_norm": 0.4135143458843231, + "learning_rate": 5.9767382807613e-06, + "loss": 0.0998, + "step": 4206 + }, + { + "epoch": 1.363253402462735, + "grad_norm": 0.4564655125141144, + "learning_rate": 5.975022704926152e-06, + "loss": 0.1113, + "step": 4207 + }, + { + "epoch": 1.3635774465327284, + "grad_norm": 0.4875675439834595, + "learning_rate": 5.973307009757663e-06, + "loss": 0.1176, + "step": 4208 + }, + { + "epoch": 1.363901490602722, + "grad_norm": 0.43339845538139343, + "learning_rate": 5.971591195465819e-06, + "loss": 0.1048, + "step": 4209 + }, + { + "epoch": 1.3642255346727155, + "grad_norm": 0.4639114439487457, + "learning_rate": 5.969875262260619e-06, + "loss": 0.113, + "step": 4210 + }, + { + "epoch": 1.364549578742709, + "grad_norm": 0.4624967873096466, + "learning_rate": 5.968159210352076e-06, + "loss": 0.1068, + "step": 4211 + }, + { + "epoch": 1.3648736228127025, + "grad_norm": 0.4855850636959076, + "learning_rate": 5.966443039950217e-06, + "loss": 0.1197, + "step": 4212 + }, + { + "epoch": 1.365197666882696, + "grad_norm": 0.46072232723236084, + "learning_rate": 5.9647267512650866e-06, + "loss": 0.1115, + "step": 4213 + }, + { + "epoch": 1.3655217109526896, + "grad_norm": 0.4450376629829407, + "learning_rate": 5.9630103445067414e-06, + "loss": 0.1014, + "step": 4214 + }, + { + "epoch": 1.365845755022683, + "grad_norm": 0.476151704788208, + "learning_rate": 5.961293819885251e-06, + "loss": 0.1167, + "step": 4215 + }, + { + "epoch": 1.3661697990926767, + "grad_norm": 0.4346964955329895, + "learning_rate": 5.959577177610703e-06, + "loss": 0.0987, + "step": 4216 + }, + { + "epoch": 1.36649384316267, + "grad_norm": 0.4830533564090729, + "learning_rate": 5.9578604178932e-06, + "loss": 0.1138, + "step": 4217 + }, + { + "epoch": 1.3668178872326635, + "grad_norm": 0.4955235421657562, + "learning_rate": 5.956143540942854e-06, + "loss": 0.1223, + "step": 4218 + }, + { + "epoch": 1.3671419313026572, + "grad_norm": 0.5037327408790588, + "learning_rate": 5.954426546969795e-06, + "loss": 0.1212, + "step": 4219 + }, + { + "epoch": 1.3674659753726508, + "grad_norm": 0.4872628450393677, + "learning_rate": 5.952709436184165e-06, + "loss": 0.1212, + "step": 4220 + }, + { + "epoch": 1.3677900194426442, + "grad_norm": 0.485176146030426, + "learning_rate": 5.9509922087961245e-06, + "loss": 0.1156, + "step": 4221 + }, + { + "epoch": 1.3681140635126376, + "grad_norm": 0.4762813150882721, + "learning_rate": 5.949274865015843e-06, + "loss": 0.1181, + "step": 4222 + }, + { + "epoch": 1.3684381075826313, + "grad_norm": 0.4957357943058014, + "learning_rate": 5.947557405053508e-06, + "loss": 0.1157, + "step": 4223 + }, + { + "epoch": 1.3687621516526247, + "grad_norm": 0.4911295771598816, + "learning_rate": 5.94583982911932e-06, + "loss": 0.1254, + "step": 4224 + }, + { + "epoch": 1.3690861957226184, + "grad_norm": 0.4450444281101227, + "learning_rate": 5.9441221374234925e-06, + "loss": 0.1111, + "step": 4225 + }, + { + "epoch": 1.3694102397926118, + "grad_norm": 0.4533950090408325, + "learning_rate": 5.942404330176256e-06, + "loss": 0.1036, + "step": 4226 + }, + { + "epoch": 1.3697342838626052, + "grad_norm": 0.47625434398651123, + "learning_rate": 5.940686407587851e-06, + "loss": 0.1149, + "step": 4227 + }, + { + "epoch": 1.3700583279325989, + "grad_norm": 0.4599977433681488, + "learning_rate": 5.9389683698685376e-06, + "loss": 0.1104, + "step": 4228 + }, + { + "epoch": 1.3703823720025923, + "grad_norm": 0.4753097593784332, + "learning_rate": 5.9372502172285854e-06, + "loss": 0.1192, + "step": 4229 + }, + { + "epoch": 1.370706416072586, + "grad_norm": 0.48577627539634705, + "learning_rate": 5.935531949878281e-06, + "loss": 0.1172, + "step": 4230 + }, + { + "epoch": 1.3710304601425793, + "grad_norm": 0.44800129532814026, + "learning_rate": 5.933813568027921e-06, + "loss": 0.1073, + "step": 4231 + }, + { + "epoch": 1.371354504212573, + "grad_norm": 0.4549189805984497, + "learning_rate": 5.932095071887823e-06, + "loss": 0.1082, + "step": 4232 + }, + { + "epoch": 1.3716785482825664, + "grad_norm": 0.4787933826446533, + "learning_rate": 5.930376461668308e-06, + "loss": 0.1216, + "step": 4233 + }, + { + "epoch": 1.37200259235256, + "grad_norm": 0.4628862738609314, + "learning_rate": 5.928657737579723e-06, + "loss": 0.1084, + "step": 4234 + }, + { + "epoch": 1.3723266364225535, + "grad_norm": 0.47287628054618835, + "learning_rate": 5.92693889983242e-06, + "loss": 0.1191, + "step": 4235 + }, + { + "epoch": 1.372650680492547, + "grad_norm": 0.5101609230041504, + "learning_rate": 5.92521994863677e-06, + "loss": 0.1291, + "step": 4236 + }, + { + "epoch": 1.3729747245625405, + "grad_norm": 0.47439146041870117, + "learning_rate": 5.923500884203154e-06, + "loss": 0.1114, + "step": 4237 + }, + { + "epoch": 1.373298768632534, + "grad_norm": 0.4542114734649658, + "learning_rate": 5.9217817067419705e-06, + "loss": 0.1052, + "step": 4238 + }, + { + "epoch": 1.3736228127025276, + "grad_norm": 0.43908876180648804, + "learning_rate": 5.920062416463629e-06, + "loss": 0.1047, + "step": 4239 + }, + { + "epoch": 1.373946856772521, + "grad_norm": 0.4801899790763855, + "learning_rate": 5.9183430135785555e-06, + "loss": 0.115, + "step": 4240 + }, + { + "epoch": 1.3742709008425145, + "grad_norm": 0.4846722185611725, + "learning_rate": 5.916623498297188e-06, + "loss": 0.1144, + "step": 4241 + }, + { + "epoch": 1.374594944912508, + "grad_norm": 0.4870109260082245, + "learning_rate": 5.914903870829977e-06, + "loss": 0.1134, + "step": 4242 + }, + { + "epoch": 1.3749189889825018, + "grad_norm": 0.4847196936607361, + "learning_rate": 5.913184131387389e-06, + "loss": 0.1197, + "step": 4243 + }, + { + "epoch": 1.3752430330524952, + "grad_norm": 0.4836478531360626, + "learning_rate": 5.911464280179905e-06, + "loss": 0.1128, + "step": 4244 + }, + { + "epoch": 1.3755670771224886, + "grad_norm": 0.4619148075580597, + "learning_rate": 5.909744317418015e-06, + "loss": 0.1162, + "step": 4245 + }, + { + "epoch": 1.3758911211924822, + "grad_norm": 0.44892606139183044, + "learning_rate": 5.908024243312228e-06, + "loss": 0.1017, + "step": 4246 + }, + { + "epoch": 1.3762151652624757, + "grad_norm": 0.44396424293518066, + "learning_rate": 5.906304058073063e-06, + "loss": 0.0963, + "step": 4247 + }, + { + "epoch": 1.3765392093324693, + "grad_norm": 0.4212004244327545, + "learning_rate": 5.904583761911058e-06, + "loss": 0.1004, + "step": 4248 + }, + { + "epoch": 1.3768632534024627, + "grad_norm": 0.4835984706878662, + "learning_rate": 5.902863355036755e-06, + "loss": 0.1171, + "step": 4249 + }, + { + "epoch": 1.3771872974724562, + "grad_norm": 0.4739507734775543, + "learning_rate": 5.901142837660718e-06, + "loss": 0.1204, + "step": 4250 + }, + { + "epoch": 1.3775113415424498, + "grad_norm": 0.44449394941329956, + "learning_rate": 5.899422209993522e-06, + "loss": 0.1051, + "step": 4251 + }, + { + "epoch": 1.3778353856124432, + "grad_norm": 0.47911691665649414, + "learning_rate": 5.897701472245756e-06, + "loss": 0.1217, + "step": 4252 + }, + { + "epoch": 1.3781594296824369, + "grad_norm": 0.4616232216358185, + "learning_rate": 5.895980624628018e-06, + "loss": 0.1148, + "step": 4253 + }, + { + "epoch": 1.3784834737524303, + "grad_norm": 0.49407100677490234, + "learning_rate": 5.8942596673509266e-06, + "loss": 0.1327, + "step": 4254 + }, + { + "epoch": 1.3788075178224237, + "grad_norm": 0.4562760889530182, + "learning_rate": 5.892538600625109e-06, + "loss": 0.1092, + "step": 4255 + }, + { + "epoch": 1.3791315618924174, + "grad_norm": 0.4495685398578644, + "learning_rate": 5.890817424661206e-06, + "loss": 0.1072, + "step": 4256 + }, + { + "epoch": 1.379455605962411, + "grad_norm": 0.4521036446094513, + "learning_rate": 5.889096139669874e-06, + "loss": 0.1142, + "step": 4257 + }, + { + "epoch": 1.3797796500324044, + "grad_norm": 0.44457265734672546, + "learning_rate": 5.887374745861782e-06, + "loss": 0.1162, + "step": 4258 + }, + { + "epoch": 1.3801036941023979, + "grad_norm": 0.47161737084388733, + "learning_rate": 5.885653243447612e-06, + "loss": 0.1086, + "step": 4259 + }, + { + "epoch": 1.3804277381723915, + "grad_norm": 0.47300124168395996, + "learning_rate": 5.883931632638059e-06, + "loss": 0.1149, + "step": 4260 + }, + { + "epoch": 1.380751782242385, + "grad_norm": 0.45929595828056335, + "learning_rate": 5.882209913643831e-06, + "loss": 0.113, + "step": 4261 + }, + { + "epoch": 1.3810758263123786, + "grad_norm": 0.46925094723701477, + "learning_rate": 5.8804880866756494e-06, + "loss": 0.1085, + "step": 4262 + }, + { + "epoch": 1.381399870382372, + "grad_norm": 0.45652511715888977, + "learning_rate": 5.87876615194425e-06, + "loss": 0.112, + "step": 4263 + }, + { + "epoch": 1.3817239144523654, + "grad_norm": 0.44846397638320923, + "learning_rate": 5.877044109660381e-06, + "loss": 0.1075, + "step": 4264 + }, + { + "epoch": 1.382047958522359, + "grad_norm": 0.4708314538002014, + "learning_rate": 5.875321960034804e-06, + "loss": 0.1127, + "step": 4265 + }, + { + "epoch": 1.3823720025923525, + "grad_norm": 0.4565647840499878, + "learning_rate": 5.873599703278292e-06, + "loss": 0.1057, + "step": 4266 + }, + { + "epoch": 1.3826960466623461, + "grad_norm": 0.4954982101917267, + "learning_rate": 5.8718773396016345e-06, + "loss": 0.1198, + "step": 4267 + }, + { + "epoch": 1.3830200907323396, + "grad_norm": 0.47357866168022156, + "learning_rate": 5.87015486921563e-06, + "loss": 0.1092, + "step": 4268 + }, + { + "epoch": 1.383344134802333, + "grad_norm": 0.46112143993377686, + "learning_rate": 5.8684322923310936e-06, + "loss": 0.105, + "step": 4269 + }, + { + "epoch": 1.3836681788723266, + "grad_norm": 0.4896453320980072, + "learning_rate": 5.8667096091588506e-06, + "loss": 0.1144, + "step": 4270 + }, + { + "epoch": 1.3839922229423203, + "grad_norm": 0.4607629179954529, + "learning_rate": 5.8649868199097425e-06, + "loss": 0.1092, + "step": 4271 + }, + { + "epoch": 1.3843162670123137, + "grad_norm": 0.44088199734687805, + "learning_rate": 5.863263924794622e-06, + "loss": 0.1042, + "step": 4272 + }, + { + "epoch": 1.3846403110823071, + "grad_norm": 0.5327633619308472, + "learning_rate": 5.861540924024355e-06, + "loss": 0.1257, + "step": 4273 + }, + { + "epoch": 1.3849643551523008, + "grad_norm": 0.46473363041877747, + "learning_rate": 5.8598178178098185e-06, + "loss": 0.1123, + "step": 4274 + }, + { + "epoch": 1.3852883992222942, + "grad_norm": 0.4873182475566864, + "learning_rate": 5.858094606361904e-06, + "loss": 0.1163, + "step": 4275 + }, + { + "epoch": 1.3856124432922878, + "grad_norm": 0.4518771469593048, + "learning_rate": 5.856371289891517e-06, + "loss": 0.1086, + "step": 4276 + }, + { + "epoch": 1.3859364873622813, + "grad_norm": 0.47316327691078186, + "learning_rate": 5.854647868609574e-06, + "loss": 0.1123, + "step": 4277 + }, + { + "epoch": 1.3862605314322747, + "grad_norm": 0.5178737044334412, + "learning_rate": 5.852924342727006e-06, + "loss": 0.1369, + "step": 4278 + }, + { + "epoch": 1.3865845755022683, + "grad_norm": 0.43328502774238586, + "learning_rate": 5.851200712454757e-06, + "loss": 0.1043, + "step": 4279 + }, + { + "epoch": 1.3869086195722617, + "grad_norm": 0.46997931599617004, + "learning_rate": 5.849476978003778e-06, + "loss": 0.1149, + "step": 4280 + }, + { + "epoch": 1.3872326636422554, + "grad_norm": 0.45517292618751526, + "learning_rate": 5.847753139585042e-06, + "loss": 0.1089, + "step": 4281 + }, + { + "epoch": 1.3875567077122488, + "grad_norm": 0.49720102548599243, + "learning_rate": 5.846029197409528e-06, + "loss": 0.1292, + "step": 4282 + }, + { + "epoch": 1.3878807517822425, + "grad_norm": 0.46744126081466675, + "learning_rate": 5.844305151688231e-06, + "loss": 0.117, + "step": 4283 + }, + { + "epoch": 1.3882047958522359, + "grad_norm": 0.46075981855392456, + "learning_rate": 5.842581002632157e-06, + "loss": 0.117, + "step": 4284 + }, + { + "epoch": 1.3885288399222295, + "grad_norm": 0.43533214926719666, + "learning_rate": 5.840856750452326e-06, + "loss": 0.1102, + "step": 4285 + }, + { + "epoch": 1.388852883992223, + "grad_norm": 0.41412171721458435, + "learning_rate": 5.8391323953597675e-06, + "loss": 0.0969, + "step": 4286 + }, + { + "epoch": 1.3891769280622164, + "grad_norm": 0.43589508533477783, + "learning_rate": 5.837407937565528e-06, + "loss": 0.1016, + "step": 4287 + }, + { + "epoch": 1.38950097213221, + "grad_norm": 0.45945459604263306, + "learning_rate": 5.8356833772806636e-06, + "loss": 0.1125, + "step": 4288 + }, + { + "epoch": 1.3898250162022034, + "grad_norm": 0.4774591326713562, + "learning_rate": 5.833958714716242e-06, + "loss": 0.1183, + "step": 4289 + }, + { + "epoch": 1.390149060272197, + "grad_norm": 0.4431980550289154, + "learning_rate": 5.832233950083349e-06, + "loss": 0.1108, + "step": 4290 + }, + { + "epoch": 1.3904731043421905, + "grad_norm": 0.4475851058959961, + "learning_rate": 5.830509083593078e-06, + "loss": 0.1157, + "step": 4291 + }, + { + "epoch": 1.390797148412184, + "grad_norm": 0.4317694306373596, + "learning_rate": 5.828784115456534e-06, + "loss": 0.1031, + "step": 4292 + }, + { + "epoch": 1.3911211924821776, + "grad_norm": 0.4566093981266022, + "learning_rate": 5.827059045884836e-06, + "loss": 0.1175, + "step": 4293 + }, + { + "epoch": 1.3914452365521712, + "grad_norm": 0.45999404788017273, + "learning_rate": 5.825333875089119e-06, + "loss": 0.1115, + "step": 4294 + }, + { + "epoch": 1.3917692806221647, + "grad_norm": 0.4236088693141937, + "learning_rate": 5.823608603280526e-06, + "loss": 0.1107, + "step": 4295 + }, + { + "epoch": 1.392093324692158, + "grad_norm": 0.4458802342414856, + "learning_rate": 5.821883230670212e-06, + "loss": 0.1157, + "step": 4296 + }, + { + "epoch": 1.3924173687621517, + "grad_norm": 0.4616601765155792, + "learning_rate": 5.820157757469349e-06, + "loss": 0.1205, + "step": 4297 + }, + { + "epoch": 1.3927414128321451, + "grad_norm": 0.4825879633426666, + "learning_rate": 5.818432183889113e-06, + "loss": 0.1148, + "step": 4298 + }, + { + "epoch": 1.3930654569021388, + "grad_norm": 0.4698737859725952, + "learning_rate": 5.816706510140703e-06, + "loss": 0.1155, + "step": 4299 + }, + { + "epoch": 1.3933895009721322, + "grad_norm": 0.4210644066333771, + "learning_rate": 5.814980736435321e-06, + "loss": 0.1053, + "step": 4300 + }, + { + "epoch": 1.3937135450421256, + "grad_norm": 0.4978773891925812, + "learning_rate": 5.813254862984188e-06, + "loss": 0.1183, + "step": 4301 + }, + { + "epoch": 1.3940375891121193, + "grad_norm": 0.4545080363750458, + "learning_rate": 5.811528889998531e-06, + "loss": 0.1083, + "step": 4302 + }, + { + "epoch": 1.3943616331821127, + "grad_norm": 0.4733128249645233, + "learning_rate": 5.809802817689596e-06, + "loss": 0.1115, + "step": 4303 + }, + { + "epoch": 1.3946856772521063, + "grad_norm": 0.45566970109939575, + "learning_rate": 5.8080766462686345e-06, + "loss": 0.11, + "step": 4304 + }, + { + "epoch": 1.3950097213220998, + "grad_norm": 0.4669945538043976, + "learning_rate": 5.806350375946914e-06, + "loss": 0.1083, + "step": 4305 + }, + { + "epoch": 1.3953337653920932, + "grad_norm": 0.46048641204833984, + "learning_rate": 5.804624006935715e-06, + "loss": 0.1123, + "step": 4306 + }, + { + "epoch": 1.3956578094620868, + "grad_norm": 0.4888690412044525, + "learning_rate": 5.802897539446326e-06, + "loss": 0.1292, + "step": 4307 + }, + { + "epoch": 1.3959818535320805, + "grad_norm": 0.44061604142189026, + "learning_rate": 5.801170973690052e-06, + "loss": 0.102, + "step": 4308 + }, + { + "epoch": 1.396305897602074, + "grad_norm": 0.5046406388282776, + "learning_rate": 5.799444309878205e-06, + "loss": 0.1234, + "step": 4309 + }, + { + "epoch": 1.3966299416720673, + "grad_norm": 0.4761616885662079, + "learning_rate": 5.797717548222115e-06, + "loss": 0.1188, + "step": 4310 + }, + { + "epoch": 1.396953985742061, + "grad_norm": 0.4426950216293335, + "learning_rate": 5.795990688933117e-06, + "loss": 0.0986, + "step": 4311 + }, + { + "epoch": 1.3972780298120544, + "grad_norm": 0.42711836099624634, + "learning_rate": 5.794263732222567e-06, + "loss": 0.0988, + "step": 4312 + }, + { + "epoch": 1.397602073882048, + "grad_norm": 0.46538108587265015, + "learning_rate": 5.792536678301824e-06, + "loss": 0.1112, + "step": 4313 + }, + { + "epoch": 1.3979261179520415, + "grad_norm": 0.4521970748901367, + "learning_rate": 5.790809527382264e-06, + "loss": 0.1084, + "step": 4314 + }, + { + "epoch": 1.398250162022035, + "grad_norm": 0.48829540610313416, + "learning_rate": 5.789082279675276e-06, + "loss": 0.1184, + "step": 4315 + }, + { + "epoch": 1.3985742060920285, + "grad_norm": 0.4372498691082001, + "learning_rate": 5.787354935392253e-06, + "loss": 0.1032, + "step": 4316 + }, + { + "epoch": 1.398898250162022, + "grad_norm": 0.4990762174129486, + "learning_rate": 5.78562749474461e-06, + "loss": 0.118, + "step": 4317 + }, + { + "epoch": 1.3992222942320156, + "grad_norm": 0.4872603118419647, + "learning_rate": 5.783899957943766e-06, + "loss": 0.1246, + "step": 4318 + }, + { + "epoch": 1.399546338302009, + "grad_norm": 0.4665011465549469, + "learning_rate": 5.782172325201155e-06, + "loss": 0.1106, + "step": 4319 + }, + { + "epoch": 1.3998703823720027, + "grad_norm": 0.41253137588500977, + "learning_rate": 5.780444596728224e-06, + "loss": 0.0922, + "step": 4320 + }, + { + "epoch": 1.400194426441996, + "grad_norm": 0.491841197013855, + "learning_rate": 5.778716772736431e-06, + "loss": 0.1269, + "step": 4321 + }, + { + "epoch": 1.4005184705119897, + "grad_norm": 0.4381025433540344, + "learning_rate": 5.776988853437242e-06, + "loss": 0.1131, + "step": 4322 + }, + { + "epoch": 1.4008425145819832, + "grad_norm": 0.5174381732940674, + "learning_rate": 5.775260839042139e-06, + "loss": 0.1291, + "step": 4323 + }, + { + "epoch": 1.4011665586519766, + "grad_norm": 0.46297118067741394, + "learning_rate": 5.7735327297626154e-06, + "loss": 0.1072, + "step": 4324 + }, + { + "epoch": 1.4014906027219702, + "grad_norm": 0.4963762164115906, + "learning_rate": 5.771804525810174e-06, + "loss": 0.1232, + "step": 4325 + }, + { + "epoch": 1.4018146467919637, + "grad_norm": 0.49227485060691833, + "learning_rate": 5.77007622739633e-06, + "loss": 0.1058, + "step": 4326 + }, + { + "epoch": 1.4021386908619573, + "grad_norm": 0.4902411997318268, + "learning_rate": 5.7683478347326115e-06, + "loss": 0.1237, + "step": 4327 + }, + { + "epoch": 1.4024627349319507, + "grad_norm": 0.49123913049697876, + "learning_rate": 5.766619348030556e-06, + "loss": 0.1098, + "step": 4328 + }, + { + "epoch": 1.4027867790019442, + "grad_norm": 0.46614375710487366, + "learning_rate": 5.7648907675017126e-06, + "loss": 0.106, + "step": 4329 + }, + { + "epoch": 1.4031108230719378, + "grad_norm": 0.4563212990760803, + "learning_rate": 5.763162093357645e-06, + "loss": 0.1072, + "step": 4330 + }, + { + "epoch": 1.4034348671419314, + "grad_norm": 0.4761507511138916, + "learning_rate": 5.7614333258099245e-06, + "loss": 0.1175, + "step": 4331 + }, + { + "epoch": 1.4037589112119249, + "grad_norm": 0.5162764191627502, + "learning_rate": 5.7597044650701365e-06, + "loss": 0.1259, + "step": 4332 + }, + { + "epoch": 1.4040829552819183, + "grad_norm": 0.4567275643348694, + "learning_rate": 5.757975511349877e-06, + "loss": 0.1058, + "step": 4333 + }, + { + "epoch": 1.404406999351912, + "grad_norm": 0.4625505805015564, + "learning_rate": 5.7562464648607515e-06, + "loss": 0.1028, + "step": 4334 + }, + { + "epoch": 1.4047310434219054, + "grad_norm": 0.5034233331680298, + "learning_rate": 5.7545173258143804e-06, + "loss": 0.1161, + "step": 4335 + }, + { + "epoch": 1.405055087491899, + "grad_norm": 0.4404418468475342, + "learning_rate": 5.752788094422392e-06, + "loss": 0.1109, + "step": 4336 + }, + { + "epoch": 1.4053791315618924, + "grad_norm": 0.43947386741638184, + "learning_rate": 5.75105877089643e-06, + "loss": 0.1106, + "step": 4337 + }, + { + "epoch": 1.4057031756318858, + "grad_norm": 0.45304930210113525, + "learning_rate": 5.749329355448145e-06, + "loss": 0.1055, + "step": 4338 + }, + { + "epoch": 1.4060272197018795, + "grad_norm": 0.5092594027519226, + "learning_rate": 5.7475998482892e-06, + "loss": 0.1227, + "step": 4339 + }, + { + "epoch": 1.406351263771873, + "grad_norm": 0.4667177200317383, + "learning_rate": 5.745870249631273e-06, + "loss": 0.1194, + "step": 4340 + }, + { + "epoch": 1.4066753078418666, + "grad_norm": 0.42807644605636597, + "learning_rate": 5.744140559686046e-06, + "loss": 0.0996, + "step": 4341 + }, + { + "epoch": 1.40699935191186, + "grad_norm": 0.49830734729766846, + "learning_rate": 5.7424107786652175e-06, + "loss": 0.124, + "step": 4342 + }, + { + "epoch": 1.4073233959818534, + "grad_norm": 0.49827465415000916, + "learning_rate": 5.7406809067804984e-06, + "loss": 0.1226, + "step": 4343 + }, + { + "epoch": 1.407647440051847, + "grad_norm": 0.4859582185745239, + "learning_rate": 5.738950944243605e-06, + "loss": 0.1247, + "step": 4344 + }, + { + "epoch": 1.4079714841218407, + "grad_norm": 0.4473390281200409, + "learning_rate": 5.737220891266271e-06, + "loss": 0.1104, + "step": 4345 + }, + { + "epoch": 1.4082955281918341, + "grad_norm": 0.4510458707809448, + "learning_rate": 5.735490748060237e-06, + "loss": 0.1062, + "step": 4346 + }, + { + "epoch": 1.4086195722618275, + "grad_norm": 0.46265891194343567, + "learning_rate": 5.733760514837255e-06, + "loss": 0.1091, + "step": 4347 + }, + { + "epoch": 1.4089436163318212, + "grad_norm": 0.46770209074020386, + "learning_rate": 5.732030191809091e-06, + "loss": 0.1184, + "step": 4348 + }, + { + "epoch": 1.4092676604018146, + "grad_norm": 0.5292964577674866, + "learning_rate": 5.730299779187516e-06, + "loss": 0.1177, + "step": 4349 + }, + { + "epoch": 1.4095917044718083, + "grad_norm": 0.45356395840644836, + "learning_rate": 5.7285692771843185e-06, + "loss": 0.1078, + "step": 4350 + }, + { + "epoch": 1.4099157485418017, + "grad_norm": 0.48939910531044006, + "learning_rate": 5.726838686011294e-06, + "loss": 0.1131, + "step": 4351 + }, + { + "epoch": 1.410239792611795, + "grad_norm": 0.4604235589504242, + "learning_rate": 5.7251080058802525e-06, + "loss": 0.1143, + "step": 4352 + }, + { + "epoch": 1.4105638366817888, + "grad_norm": 0.48968636989593506, + "learning_rate": 5.723377237003009e-06, + "loss": 0.1204, + "step": 4353 + }, + { + "epoch": 1.4108878807517822, + "grad_norm": 0.5055221915245056, + "learning_rate": 5.721646379591394e-06, + "loss": 0.1252, + "step": 4354 + }, + { + "epoch": 1.4112119248217758, + "grad_norm": 0.47648561000823975, + "learning_rate": 5.71991543385725e-06, + "loss": 0.1163, + "step": 4355 + }, + { + "epoch": 1.4115359688917692, + "grad_norm": 0.4697307348251343, + "learning_rate": 5.718184400012425e-06, + "loss": 0.1159, + "step": 4356 + }, + { + "epoch": 1.4118600129617627, + "grad_norm": 0.4544907808303833, + "learning_rate": 5.716453278268782e-06, + "loss": 0.107, + "step": 4357 + }, + { + "epoch": 1.4121840570317563, + "grad_norm": 0.4428936839103699, + "learning_rate": 5.7147220688381955e-06, + "loss": 0.106, + "step": 4358 + }, + { + "epoch": 1.41250810110175, + "grad_norm": 0.4969765245914459, + "learning_rate": 5.712990771932545e-06, + "loss": 0.1279, + "step": 4359 + }, + { + "epoch": 1.4128321451717434, + "grad_norm": 0.44887062907218933, + "learning_rate": 5.7112593877637264e-06, + "loss": 0.1104, + "step": 4360 + }, + { + "epoch": 1.4131561892417368, + "grad_norm": 0.4952261447906494, + "learning_rate": 5.7095279165436446e-06, + "loss": 0.1241, + "step": 4361 + }, + { + "epoch": 1.4134802333117304, + "grad_norm": 0.42035895586013794, + "learning_rate": 5.707796358484214e-06, + "loss": 0.0978, + "step": 4362 + }, + { + "epoch": 1.4138042773817239, + "grad_norm": 0.4624359607696533, + "learning_rate": 5.706064713797361e-06, + "loss": 0.112, + "step": 4363 + }, + { + "epoch": 1.4141283214517175, + "grad_norm": 0.44356340169906616, + "learning_rate": 5.704332982695025e-06, + "loss": 0.1032, + "step": 4364 + }, + { + "epoch": 1.414452365521711, + "grad_norm": 0.4767822325229645, + "learning_rate": 5.7026011653891466e-06, + "loss": 0.1115, + "step": 4365 + }, + { + "epoch": 1.4147764095917044, + "grad_norm": 0.42960530519485474, + "learning_rate": 5.700869262091689e-06, + "loss": 0.1027, + "step": 4366 + }, + { + "epoch": 1.415100453661698, + "grad_norm": 0.4716300368309021, + "learning_rate": 5.699137273014619e-06, + "loss": 0.1124, + "step": 4367 + }, + { + "epoch": 1.4154244977316914, + "grad_norm": 0.4724923074245453, + "learning_rate": 5.697405198369914e-06, + "loss": 0.1139, + "step": 4368 + }, + { + "epoch": 1.415748541801685, + "grad_norm": 0.4250900149345398, + "learning_rate": 5.695673038369565e-06, + "loss": 0.1003, + "step": 4369 + }, + { + "epoch": 1.4160725858716785, + "grad_norm": 0.45592403411865234, + "learning_rate": 5.693940793225571e-06, + "loss": 0.1135, + "step": 4370 + }, + { + "epoch": 1.4163966299416721, + "grad_norm": 0.47380363941192627, + "learning_rate": 5.692208463149941e-06, + "loss": 0.1128, + "step": 4371 + }, + { + "epoch": 1.4167206740116656, + "grad_norm": 0.4276685416698456, + "learning_rate": 5.690476048354696e-06, + "loss": 0.1023, + "step": 4372 + }, + { + "epoch": 1.4170447180816592, + "grad_norm": 0.46587416529655457, + "learning_rate": 5.688743549051867e-06, + "loss": 0.1156, + "step": 4373 + }, + { + "epoch": 1.4173687621516526, + "grad_norm": 0.4692516326904297, + "learning_rate": 5.687010965453495e-06, + "loss": 0.1156, + "step": 4374 + }, + { + "epoch": 1.417692806221646, + "grad_norm": 0.48215970396995544, + "learning_rate": 5.6852782977716326e-06, + "loss": 0.121, + "step": 4375 + }, + { + "epoch": 1.4180168502916397, + "grad_norm": 0.46034133434295654, + "learning_rate": 5.683545546218341e-06, + "loss": 0.1179, + "step": 4376 + }, + { + "epoch": 1.4183408943616331, + "grad_norm": 0.5115047097206116, + "learning_rate": 5.68181271100569e-06, + "loss": 0.1221, + "step": 4377 + }, + { + "epoch": 1.4186649384316268, + "grad_norm": 0.49292752146720886, + "learning_rate": 5.680079792345764e-06, + "loss": 0.1227, + "step": 4378 + }, + { + "epoch": 1.4189889825016202, + "grad_norm": 0.4294084310531616, + "learning_rate": 5.6783467904506554e-06, + "loss": 0.1006, + "step": 4379 + }, + { + "epoch": 1.4193130265716136, + "grad_norm": 0.4855675995349884, + "learning_rate": 5.676613705532468e-06, + "loss": 0.1139, + "step": 4380 + }, + { + "epoch": 1.4196370706416073, + "grad_norm": 0.4840838313102722, + "learning_rate": 5.674880537803311e-06, + "loss": 0.1182, + "step": 4381 + }, + { + "epoch": 1.419961114711601, + "grad_norm": 0.442710816860199, + "learning_rate": 5.673147287475311e-06, + "loss": 0.1043, + "step": 4382 + }, + { + "epoch": 1.4202851587815943, + "grad_norm": 0.46457353234291077, + "learning_rate": 5.671413954760599e-06, + "loss": 0.1145, + "step": 4383 + }, + { + "epoch": 1.4206092028515878, + "grad_norm": 0.47664782404899597, + "learning_rate": 5.669680539871318e-06, + "loss": 0.119, + "step": 4384 + }, + { + "epoch": 1.4209332469215814, + "grad_norm": 0.4621475040912628, + "learning_rate": 5.667947043019624e-06, + "loss": 0.1155, + "step": 4385 + }, + { + "epoch": 1.4212572909915748, + "grad_norm": 0.48262637853622437, + "learning_rate": 5.666213464417676e-06, + "loss": 0.1173, + "step": 4386 + }, + { + "epoch": 1.4215813350615685, + "grad_norm": 0.46006882190704346, + "learning_rate": 5.6644798042776495e-06, + "loss": 0.1153, + "step": 4387 + }, + { + "epoch": 1.421905379131562, + "grad_norm": 0.4948621094226837, + "learning_rate": 5.662746062811729e-06, + "loss": 0.1153, + "step": 4388 + }, + { + "epoch": 1.4222294232015553, + "grad_norm": 0.5037163496017456, + "learning_rate": 5.661012240232106e-06, + "loss": 0.1208, + "step": 4389 + }, + { + "epoch": 1.422553467271549, + "grad_norm": 0.484428346157074, + "learning_rate": 5.659278336750983e-06, + "loss": 0.1188, + "step": 4390 + }, + { + "epoch": 1.4228775113415424, + "grad_norm": 0.46746310591697693, + "learning_rate": 5.6575443525805754e-06, + "loss": 0.1162, + "step": 4391 + }, + { + "epoch": 1.423201555411536, + "grad_norm": 0.44541990756988525, + "learning_rate": 5.655810287933103e-06, + "loss": 0.1101, + "step": 4392 + }, + { + "epoch": 1.4235255994815295, + "grad_norm": 0.4760904312133789, + "learning_rate": 5.6540761430208e-06, + "loss": 0.1185, + "step": 4393 + }, + { + "epoch": 1.4238496435515229, + "grad_norm": 0.4710126221179962, + "learning_rate": 5.652341918055912e-06, + "loss": 0.114, + "step": 4394 + }, + { + "epoch": 1.4241736876215165, + "grad_norm": 0.4768831133842468, + "learning_rate": 5.650607613250685e-06, + "loss": 0.11, + "step": 4395 + }, + { + "epoch": 1.4244977316915102, + "grad_norm": 0.5086276531219482, + "learning_rate": 5.648873228817385e-06, + "loss": 0.1232, + "step": 4396 + }, + { + "epoch": 1.4248217757615036, + "grad_norm": 0.4469287097454071, + "learning_rate": 5.647138764968284e-06, + "loss": 0.1076, + "step": 4397 + }, + { + "epoch": 1.425145819831497, + "grad_norm": 0.478959321975708, + "learning_rate": 5.645404221915662e-06, + "loss": 0.1221, + "step": 4398 + }, + { + "epoch": 1.4254698639014907, + "grad_norm": 0.4652095139026642, + "learning_rate": 5.643669599871813e-06, + "loss": 0.1128, + "step": 4399 + }, + { + "epoch": 1.425793907971484, + "grad_norm": 0.485418438911438, + "learning_rate": 5.641934899049035e-06, + "loss": 0.1172, + "step": 4400 + }, + { + "epoch": 1.4261179520414777, + "grad_norm": 0.4753665626049042, + "learning_rate": 5.64020011965964e-06, + "loss": 0.1138, + "step": 4401 + }, + { + "epoch": 1.4264419961114712, + "grad_norm": 0.46920743584632874, + "learning_rate": 5.63846526191595e-06, + "loss": 0.1159, + "step": 4402 + }, + { + "epoch": 1.4267660401814646, + "grad_norm": 0.5033137798309326, + "learning_rate": 5.63673032603029e-06, + "loss": 0.1249, + "step": 4403 + }, + { + "epoch": 1.4270900842514582, + "grad_norm": 0.4642874300479889, + "learning_rate": 5.6349953122150026e-06, + "loss": 0.1109, + "step": 4404 + }, + { + "epoch": 1.4274141283214516, + "grad_norm": 0.45898953080177307, + "learning_rate": 5.633260220682436e-06, + "loss": 0.1151, + "step": 4405 + }, + { + "epoch": 1.4277381723914453, + "grad_norm": 0.44028547406196594, + "learning_rate": 5.631525051644949e-06, + "loss": 0.1085, + "step": 4406 + }, + { + "epoch": 1.4280622164614387, + "grad_norm": 0.4695223271846771, + "learning_rate": 5.629789805314912e-06, + "loss": 0.114, + "step": 4407 + }, + { + "epoch": 1.4283862605314321, + "grad_norm": 0.4158531725406647, + "learning_rate": 5.628054481904696e-06, + "loss": 0.0991, + "step": 4408 + }, + { + "epoch": 1.4287103046014258, + "grad_norm": 0.5092782378196716, + "learning_rate": 5.626319081626693e-06, + "loss": 0.1201, + "step": 4409 + }, + { + "epoch": 1.4290343486714194, + "grad_norm": 0.47505107522010803, + "learning_rate": 5.624583604693297e-06, + "loss": 0.1108, + "step": 4410 + }, + { + "epoch": 1.4293583927414129, + "grad_norm": 0.45775267481803894, + "learning_rate": 5.622848051316916e-06, + "loss": 0.1118, + "step": 4411 + }, + { + "epoch": 1.4296824368114063, + "grad_norm": 0.4317185580730438, + "learning_rate": 5.621112421709961e-06, + "loss": 0.0998, + "step": 4412 + }, + { + "epoch": 1.4300064808814, + "grad_norm": 0.50062096118927, + "learning_rate": 5.61937671608486e-06, + "loss": 0.1268, + "step": 4413 + }, + { + "epoch": 1.4303305249513933, + "grad_norm": 0.46965810656547546, + "learning_rate": 5.617640934654044e-06, + "loss": 0.1085, + "step": 4414 + }, + { + "epoch": 1.430654569021387, + "grad_norm": 0.48559674620628357, + "learning_rate": 5.6159050776299574e-06, + "loss": 0.1201, + "step": 4415 + }, + { + "epoch": 1.4309786130913804, + "grad_norm": 0.4981156885623932, + "learning_rate": 5.614169145225051e-06, + "loss": 0.1211, + "step": 4416 + }, + { + "epoch": 1.4313026571613738, + "grad_norm": 0.47100457549095154, + "learning_rate": 5.612433137651787e-06, + "loss": 0.1025, + "step": 4417 + }, + { + "epoch": 1.4316267012313675, + "grad_norm": 0.5128635168075562, + "learning_rate": 5.6106970551226355e-06, + "loss": 0.1257, + "step": 4418 + }, + { + "epoch": 1.431950745301361, + "grad_norm": 0.4389077425003052, + "learning_rate": 5.608960897850078e-06, + "loss": 0.0965, + "step": 4419 + }, + { + "epoch": 1.4322747893713546, + "grad_norm": 0.4553956687450409, + "learning_rate": 5.6072246660466e-06, + "loss": 0.1088, + "step": 4420 + }, + { + "epoch": 1.432598833441348, + "grad_norm": 0.4483269155025482, + "learning_rate": 5.605488359924702e-06, + "loss": 0.105, + "step": 4421 + }, + { + "epoch": 1.4329228775113416, + "grad_norm": 0.4663432538509369, + "learning_rate": 5.603751979696892e-06, + "loss": 0.1081, + "step": 4422 + }, + { + "epoch": 1.433246921581335, + "grad_norm": 0.4651101529598236, + "learning_rate": 5.602015525575683e-06, + "loss": 0.1182, + "step": 4423 + }, + { + "epoch": 1.4335709656513287, + "grad_norm": 0.47833356261253357, + "learning_rate": 5.600278997773601e-06, + "loss": 0.1066, + "step": 4424 + }, + { + "epoch": 1.4338950097213221, + "grad_norm": 0.46498045325279236, + "learning_rate": 5.5985423965031815e-06, + "loss": 0.1114, + "step": 4425 + }, + { + "epoch": 1.4342190537913155, + "grad_norm": 0.4368334412574768, + "learning_rate": 5.596805721976966e-06, + "loss": 0.1048, + "step": 4426 + }, + { + "epoch": 1.4345430978613092, + "grad_norm": 0.46312999725341797, + "learning_rate": 5.59506897440751e-06, + "loss": 0.1022, + "step": 4427 + }, + { + "epoch": 1.4348671419313026, + "grad_norm": 0.459268718957901, + "learning_rate": 5.593332154007369e-06, + "loss": 0.1083, + "step": 4428 + }, + { + "epoch": 1.4351911860012962, + "grad_norm": 0.4764942228794098, + "learning_rate": 5.591595260989118e-06, + "loss": 0.1226, + "step": 4429 + }, + { + "epoch": 1.4355152300712897, + "grad_norm": 0.44204190373420715, + "learning_rate": 5.589858295565333e-06, + "loss": 0.1053, + "step": 4430 + }, + { + "epoch": 1.435839274141283, + "grad_norm": 0.5024438500404358, + "learning_rate": 5.588121257948605e-06, + "loss": 0.111, + "step": 4431 + }, + { + "epoch": 1.4361633182112767, + "grad_norm": 0.4540453553199768, + "learning_rate": 5.586384148351528e-06, + "loss": 0.1003, + "step": 4432 + }, + { + "epoch": 1.4364873622812704, + "grad_norm": 0.4889206290245056, + "learning_rate": 5.584646966986708e-06, + "loss": 0.1197, + "step": 4433 + }, + { + "epoch": 1.4368114063512638, + "grad_norm": 0.46186313033103943, + "learning_rate": 5.582909714066758e-06, + "loss": 0.1103, + "step": 4434 + }, + { + "epoch": 1.4371354504212572, + "grad_norm": 0.4445458948612213, + "learning_rate": 5.581172389804302e-06, + "loss": 0.1042, + "step": 4435 + }, + { + "epoch": 1.4374594944912509, + "grad_norm": 0.433281272649765, + "learning_rate": 5.579434994411972e-06, + "loss": 0.11, + "step": 4436 + }, + { + "epoch": 1.4377835385612443, + "grad_norm": 0.4515359699726105, + "learning_rate": 5.577697528102409e-06, + "loss": 0.1037, + "step": 4437 + }, + { + "epoch": 1.438107582631238, + "grad_norm": 0.48822012543678284, + "learning_rate": 5.57595999108826e-06, + "loss": 0.1224, + "step": 4438 + }, + { + "epoch": 1.4384316267012314, + "grad_norm": 0.442730575799942, + "learning_rate": 5.574222383582184e-06, + "loss": 0.1062, + "step": 4439 + }, + { + "epoch": 1.4387556707712248, + "grad_norm": 0.4672057330608368, + "learning_rate": 5.5724847057968466e-06, + "loss": 0.1131, + "step": 4440 + }, + { + "epoch": 1.4390797148412184, + "grad_norm": 0.46711841225624084, + "learning_rate": 5.5707469579449235e-06, + "loss": 0.1127, + "step": 4441 + }, + { + "epoch": 1.4394037589112119, + "grad_norm": 0.4616106152534485, + "learning_rate": 5.569009140239099e-06, + "loss": 0.1012, + "step": 4442 + }, + { + "epoch": 1.4397278029812055, + "grad_norm": 0.5364953279495239, + "learning_rate": 5.567271252892063e-06, + "loss": 0.1344, + "step": 4443 + }, + { + "epoch": 1.440051847051199, + "grad_norm": 0.44760236144065857, + "learning_rate": 5.565533296116519e-06, + "loss": 0.11, + "step": 4444 + }, + { + "epoch": 1.4403758911211924, + "grad_norm": 0.4851779043674469, + "learning_rate": 5.563795270125173e-06, + "loss": 0.1181, + "step": 4445 + }, + { + "epoch": 1.440699935191186, + "grad_norm": 0.4202938973903656, + "learning_rate": 5.562057175130744e-06, + "loss": 0.091, + "step": 4446 + }, + { + "epoch": 1.4410239792611796, + "grad_norm": 0.4636053442955017, + "learning_rate": 5.560319011345958e-06, + "loss": 0.1153, + "step": 4447 + }, + { + "epoch": 1.441348023331173, + "grad_norm": 0.49078604578971863, + "learning_rate": 5.558580778983549e-06, + "loss": 0.1194, + "step": 4448 + }, + { + "epoch": 1.4416720674011665, + "grad_norm": 0.466388463973999, + "learning_rate": 5.556842478256261e-06, + "loss": 0.1175, + "step": 4449 + }, + { + "epoch": 1.4419961114711601, + "grad_norm": 0.4817126393318176, + "learning_rate": 5.555104109376843e-06, + "loss": 0.1111, + "step": 4450 + }, + { + "epoch": 1.4423201555411536, + "grad_norm": 0.4500816762447357, + "learning_rate": 5.553365672558057e-06, + "loss": 0.1113, + "step": 4451 + }, + { + "epoch": 1.4426441996111472, + "grad_norm": 0.46708181500434875, + "learning_rate": 5.551627168012669e-06, + "loss": 0.1191, + "step": 4452 + }, + { + "epoch": 1.4429682436811406, + "grad_norm": 0.4912620782852173, + "learning_rate": 5.549888595953455e-06, + "loss": 0.1225, + "step": 4453 + }, + { + "epoch": 1.443292287751134, + "grad_norm": 0.46872448921203613, + "learning_rate": 5.548149956593203e-06, + "loss": 0.1189, + "step": 4454 + }, + { + "epoch": 1.4436163318211277, + "grad_norm": 0.4642205834388733, + "learning_rate": 5.546411250144701e-06, + "loss": 0.1085, + "step": 4455 + }, + { + "epoch": 1.4439403758911211, + "grad_norm": 0.45991188287734985, + "learning_rate": 5.544672476820751e-06, + "loss": 0.1147, + "step": 4456 + }, + { + "epoch": 1.4442644199611148, + "grad_norm": 0.4836006462574005, + "learning_rate": 5.542933636834164e-06, + "loss": 0.1153, + "step": 4457 + }, + { + "epoch": 1.4445884640311082, + "grad_norm": 0.5125424861907959, + "learning_rate": 5.541194730397755e-06, + "loss": 0.1242, + "step": 4458 + }, + { + "epoch": 1.4449125081011016, + "grad_norm": 0.4503360688686371, + "learning_rate": 5.53945575772435e-06, + "loss": 0.1066, + "step": 4459 + }, + { + "epoch": 1.4452365521710953, + "grad_norm": 0.4397493004798889, + "learning_rate": 5.537716719026784e-06, + "loss": 0.1121, + "step": 4460 + }, + { + "epoch": 1.445560596241089, + "grad_norm": 0.46807533502578735, + "learning_rate": 5.535977614517896e-06, + "loss": 0.1107, + "step": 4461 + }, + { + "epoch": 1.4458846403110823, + "grad_norm": 0.4748874306678772, + "learning_rate": 5.534238444410537e-06, + "loss": 0.1214, + "step": 4462 + }, + { + "epoch": 1.4462086843810757, + "grad_norm": 0.46195435523986816, + "learning_rate": 5.532499208917563e-06, + "loss": 0.113, + "step": 4463 + }, + { + "epoch": 1.4465327284510694, + "grad_norm": 0.43683934211730957, + "learning_rate": 5.530759908251842e-06, + "loss": 0.1029, + "step": 4464 + }, + { + "epoch": 1.4468567725210628, + "grad_norm": 0.4722435772418976, + "learning_rate": 5.529020542626246e-06, + "loss": 0.1173, + "step": 4465 + }, + { + "epoch": 1.4471808165910565, + "grad_norm": 0.4351162016391754, + "learning_rate": 5.527281112253657e-06, + "loss": 0.1048, + "step": 4466 + }, + { + "epoch": 1.4475048606610499, + "grad_norm": 0.45564013719558716, + "learning_rate": 5.525541617346965e-06, + "loss": 0.1147, + "step": 4467 + }, + { + "epoch": 1.4478289047310433, + "grad_norm": 0.48730671405792236, + "learning_rate": 5.523802058119067e-06, + "loss": 0.1217, + "step": 4468 + }, + { + "epoch": 1.448152948801037, + "grad_norm": 0.40554869174957275, + "learning_rate": 5.522062434782867e-06, + "loss": 0.0939, + "step": 4469 + }, + { + "epoch": 1.4484769928710304, + "grad_norm": 0.45881539583206177, + "learning_rate": 5.520322747551278e-06, + "loss": 0.1095, + "step": 4470 + }, + { + "epoch": 1.448801036941024, + "grad_norm": 0.4682929813861847, + "learning_rate": 5.518582996637223e-06, + "loss": 0.1145, + "step": 4471 + }, + { + "epoch": 1.4491250810110174, + "grad_norm": 0.45971962809562683, + "learning_rate": 5.516843182253628e-06, + "loss": 0.1088, + "step": 4472 + }, + { + "epoch": 1.449449125081011, + "grad_norm": 0.4667341113090515, + "learning_rate": 5.515103304613434e-06, + "loss": 0.1163, + "step": 4473 + }, + { + "epoch": 1.4497731691510045, + "grad_norm": 0.440285325050354, + "learning_rate": 5.5133633639295795e-06, + "loss": 0.103, + "step": 4474 + }, + { + "epoch": 1.4500972132209982, + "grad_norm": 0.4824848473072052, + "learning_rate": 5.511623360415019e-06, + "loss": 0.1233, + "step": 4475 + }, + { + "epoch": 1.4504212572909916, + "grad_norm": 0.42888009548187256, + "learning_rate": 5.509883294282714e-06, + "loss": 0.1049, + "step": 4476 + }, + { + "epoch": 1.450745301360985, + "grad_norm": 0.4593934714794159, + "learning_rate": 5.508143165745628e-06, + "loss": 0.1139, + "step": 4477 + }, + { + "epoch": 1.4510693454309787, + "grad_norm": 0.49577581882476807, + "learning_rate": 5.506402975016738e-06, + "loss": 0.1189, + "step": 4478 + }, + { + "epoch": 1.451393389500972, + "grad_norm": 0.49611929059028625, + "learning_rate": 5.5046627223090265e-06, + "loss": 0.1206, + "step": 4479 + }, + { + "epoch": 1.4517174335709657, + "grad_norm": 0.4758347272872925, + "learning_rate": 5.5029224078354844e-06, + "loss": 0.1104, + "step": 4480 + }, + { + "epoch": 1.4520414776409591, + "grad_norm": 0.48814454674720764, + "learning_rate": 5.501182031809107e-06, + "loss": 0.1205, + "step": 4481 + }, + { + "epoch": 1.4523655217109526, + "grad_norm": 0.47150522470474243, + "learning_rate": 5.4994415944429e-06, + "loss": 0.1059, + "step": 4482 + }, + { + "epoch": 1.4526895657809462, + "grad_norm": 0.47519728541374207, + "learning_rate": 5.497701095949879e-06, + "loss": 0.1186, + "step": 4483 + }, + { + "epoch": 1.4530136098509399, + "grad_norm": 0.4623326361179352, + "learning_rate": 5.49596053654306e-06, + "loss": 0.1091, + "step": 4484 + }, + { + "epoch": 1.4533376539209333, + "grad_norm": 0.4716527462005615, + "learning_rate": 5.494219916435474e-06, + "loss": 0.116, + "step": 4485 + }, + { + "epoch": 1.4536616979909267, + "grad_norm": 0.46047505736351013, + "learning_rate": 5.492479235840154e-06, + "loss": 0.1041, + "step": 4486 + }, + { + "epoch": 1.4539857420609203, + "grad_norm": 0.49836093187332153, + "learning_rate": 5.490738494970144e-06, + "loss": 0.1063, + "step": 4487 + }, + { + "epoch": 1.4543097861309138, + "grad_norm": 0.43920251727104187, + "learning_rate": 5.4889976940384915e-06, + "loss": 0.0945, + "step": 4488 + }, + { + "epoch": 1.4546338302009074, + "grad_norm": 0.43778035044670105, + "learning_rate": 5.487256833258256e-06, + "loss": 0.1046, + "step": 4489 + }, + { + "epoch": 1.4549578742709008, + "grad_norm": 0.469485342502594, + "learning_rate": 5.485515912842499e-06, + "loss": 0.1098, + "step": 4490 + }, + { + "epoch": 1.4552819183408943, + "grad_norm": 0.4534398317337036, + "learning_rate": 5.4837749330042965e-06, + "loss": 0.1093, + "step": 4491 + }, + { + "epoch": 1.455605962410888, + "grad_norm": 0.47168147563934326, + "learning_rate": 5.4820338939567265e-06, + "loss": 0.1121, + "step": 4492 + }, + { + "epoch": 1.4559300064808813, + "grad_norm": 0.47512897849082947, + "learning_rate": 5.480292795912873e-06, + "loss": 0.1136, + "step": 4493 + }, + { + "epoch": 1.456254050550875, + "grad_norm": 0.4722960293292999, + "learning_rate": 5.478551639085831e-06, + "loss": 0.1065, + "step": 4494 + }, + { + "epoch": 1.4565780946208684, + "grad_norm": 0.44116073846817017, + "learning_rate": 5.4768104236887e-06, + "loss": 0.1021, + "step": 4495 + }, + { + "epoch": 1.4569021386908618, + "grad_norm": 0.457285612821579, + "learning_rate": 5.47506914993459e-06, + "loss": 0.1123, + "step": 4496 + }, + { + "epoch": 1.4572261827608555, + "grad_norm": 0.47741320729255676, + "learning_rate": 5.473327818036615e-06, + "loss": 0.1175, + "step": 4497 + }, + { + "epoch": 1.4575502268308491, + "grad_norm": 0.5050625801086426, + "learning_rate": 5.471586428207899e-06, + "loss": 0.133, + "step": 4498 + }, + { + "epoch": 1.4578742709008425, + "grad_norm": 0.46975886821746826, + "learning_rate": 5.469844980661567e-06, + "loss": 0.1158, + "step": 4499 + }, + { + "epoch": 1.458198314970836, + "grad_norm": 0.46380752325057983, + "learning_rate": 5.468103475610758e-06, + "loss": 0.1148, + "step": 4500 + }, + { + "epoch": 1.4585223590408296, + "grad_norm": 0.4460957646369934, + "learning_rate": 5.466361913268616e-06, + "loss": 0.1074, + "step": 4501 + }, + { + "epoch": 1.458846403110823, + "grad_norm": 0.49102339148521423, + "learning_rate": 5.464620293848291e-06, + "loss": 0.1167, + "step": 4502 + }, + { + "epoch": 1.4591704471808167, + "grad_norm": 0.45689043402671814, + "learning_rate": 5.462878617562939e-06, + "loss": 0.1102, + "step": 4503 + }, + { + "epoch": 1.45949449125081, + "grad_norm": 0.44406747817993164, + "learning_rate": 5.461136884625727e-06, + "loss": 0.104, + "step": 4504 + }, + { + "epoch": 1.4598185353208035, + "grad_norm": 0.40898412466049194, + "learning_rate": 5.459395095249822e-06, + "loss": 0.0999, + "step": 4505 + }, + { + "epoch": 1.4601425793907972, + "grad_norm": 0.430336058139801, + "learning_rate": 5.457653249648405e-06, + "loss": 0.0998, + "step": 4506 + }, + { + "epoch": 1.4604666234607906, + "grad_norm": 0.5189803242683411, + "learning_rate": 5.455911348034661e-06, + "loss": 0.1157, + "step": 4507 + }, + { + "epoch": 1.4607906675307842, + "grad_norm": 0.4046836197376251, + "learning_rate": 5.454169390621783e-06, + "loss": 0.0932, + "step": 4508 + }, + { + "epoch": 1.4611147116007777, + "grad_norm": 0.45686987042427063, + "learning_rate": 5.452427377622967e-06, + "loss": 0.113, + "step": 4509 + }, + { + "epoch": 1.4614387556707713, + "grad_norm": 0.5095760822296143, + "learning_rate": 5.45068530925142e-06, + "loss": 0.1104, + "step": 4510 + }, + { + "epoch": 1.4617627997407647, + "grad_norm": 0.5702762007713318, + "learning_rate": 5.448943185720355e-06, + "loss": 0.102, + "step": 4511 + }, + { + "epoch": 1.4620868438107584, + "grad_norm": 0.45763012766838074, + "learning_rate": 5.447201007242988e-06, + "loss": 0.1131, + "step": 4512 + }, + { + "epoch": 1.4624108878807518, + "grad_norm": 0.4436397850513458, + "learning_rate": 5.4454587740325485e-06, + "loss": 0.1121, + "step": 4513 + }, + { + "epoch": 1.4627349319507452, + "grad_norm": 0.47292545437812805, + "learning_rate": 5.443716486302266e-06, + "loss": 0.1138, + "step": 4514 + }, + { + "epoch": 1.4630589760207389, + "grad_norm": 0.5149725675582886, + "learning_rate": 5.441974144265383e-06, + "loss": 0.1249, + "step": 4515 + }, + { + "epoch": 1.4633830200907323, + "grad_norm": 0.46080783009529114, + "learning_rate": 5.440231748135143e-06, + "loss": 0.1067, + "step": 4516 + }, + { + "epoch": 1.463707064160726, + "grad_norm": 0.48276081681251526, + "learning_rate": 5.438489298124798e-06, + "loss": 0.113, + "step": 4517 + }, + { + "epoch": 1.4640311082307194, + "grad_norm": 0.4716761112213135, + "learning_rate": 5.436746794447608e-06, + "loss": 0.1095, + "step": 4518 + }, + { + "epoch": 1.4643551523007128, + "grad_norm": 0.4616851806640625, + "learning_rate": 5.43500423731684e-06, + "loss": 0.1043, + "step": 4519 + }, + { + "epoch": 1.4646791963707064, + "grad_norm": 0.46129775047302246, + "learning_rate": 5.433261626945763e-06, + "loss": 0.1055, + "step": 4520 + }, + { + "epoch": 1.4650032404407, + "grad_norm": 0.4611496329307556, + "learning_rate": 5.431518963547656e-06, + "loss": 0.1093, + "step": 4521 + }, + { + "epoch": 1.4653272845106935, + "grad_norm": 0.45577365159988403, + "learning_rate": 5.429776247335807e-06, + "loss": 0.112, + "step": 4522 + }, + { + "epoch": 1.465651328580687, + "grad_norm": 0.4398552477359772, + "learning_rate": 5.428033478523505e-06, + "loss": 0.1071, + "step": 4523 + }, + { + "epoch": 1.4659753726506806, + "grad_norm": 0.4762032628059387, + "learning_rate": 5.426290657324051e-06, + "loss": 0.1153, + "step": 4524 + }, + { + "epoch": 1.466299416720674, + "grad_norm": 0.4628520607948303, + "learning_rate": 5.424547783950744e-06, + "loss": 0.113, + "step": 4525 + }, + { + "epoch": 1.4666234607906676, + "grad_norm": 0.47359782457351685, + "learning_rate": 5.4228048586169005e-06, + "loss": 0.121, + "step": 4526 + }, + { + "epoch": 1.466947504860661, + "grad_norm": 0.43229299783706665, + "learning_rate": 5.421061881535834e-06, + "loss": 0.0964, + "step": 4527 + }, + { + "epoch": 1.4672715489306545, + "grad_norm": 0.4732006788253784, + "learning_rate": 5.4193188529208715e-06, + "loss": 0.1118, + "step": 4528 + }, + { + "epoch": 1.4675955930006481, + "grad_norm": 0.4338208734989166, + "learning_rate": 5.417575772985339e-06, + "loss": 0.1034, + "step": 4529 + }, + { + "epoch": 1.4679196370706415, + "grad_norm": 0.45097798109054565, + "learning_rate": 5.415832641942576e-06, + "loss": 0.1042, + "step": 4530 + }, + { + "epoch": 1.4682436811406352, + "grad_norm": 0.42899981141090393, + "learning_rate": 5.4140894600059215e-06, + "loss": 0.1067, + "step": 4531 + }, + { + "epoch": 1.4685677252106286, + "grad_norm": 0.4907374978065491, + "learning_rate": 5.412346227388726e-06, + "loss": 0.1165, + "step": 4532 + }, + { + "epoch": 1.468891769280622, + "grad_norm": 0.43755388259887695, + "learning_rate": 5.410602944304344e-06, + "loss": 0.1038, + "step": 4533 + }, + { + "epoch": 1.4692158133506157, + "grad_norm": 0.49814197421073914, + "learning_rate": 5.4088596109661374e-06, + "loss": 0.1186, + "step": 4534 + }, + { + "epoch": 1.4695398574206093, + "grad_norm": 0.5008993148803711, + "learning_rate": 5.407116227587472e-06, + "loss": 0.1151, + "step": 4535 + }, + { + "epoch": 1.4698639014906028, + "grad_norm": 0.49627819657325745, + "learning_rate": 5.4053727943817215e-06, + "loss": 0.1146, + "step": 4536 + }, + { + "epoch": 1.4701879455605962, + "grad_norm": 0.45009931921958923, + "learning_rate": 5.403629311562265e-06, + "loss": 0.1108, + "step": 4537 + }, + { + "epoch": 1.4705119896305898, + "grad_norm": 0.4893830716609955, + "learning_rate": 5.4018857793424885e-06, + "loss": 0.1161, + "step": 4538 + }, + { + "epoch": 1.4708360337005832, + "grad_norm": 0.46369659900665283, + "learning_rate": 5.400142197935784e-06, + "loss": 0.1138, + "step": 4539 + }, + { + "epoch": 1.471160077770577, + "grad_norm": 0.4574586749076843, + "learning_rate": 5.398398567555546e-06, + "loss": 0.1101, + "step": 4540 + }, + { + "epoch": 1.4714841218405703, + "grad_norm": 0.42343491315841675, + "learning_rate": 5.396654888415183e-06, + "loss": 0.0933, + "step": 4541 + }, + { + "epoch": 1.4718081659105637, + "grad_norm": 0.46502062678337097, + "learning_rate": 5.3949111607281e-06, + "loss": 0.1127, + "step": 4542 + }, + { + "epoch": 1.4721322099805574, + "grad_norm": 0.47962379455566406, + "learning_rate": 5.3931673847077135e-06, + "loss": 0.1211, + "step": 4543 + }, + { + "epoch": 1.4724562540505508, + "grad_norm": 0.4389127790927887, + "learning_rate": 5.391423560567446e-06, + "loss": 0.1075, + "step": 4544 + }, + { + "epoch": 1.4727802981205445, + "grad_norm": 0.43341097235679626, + "learning_rate": 5.389679688520722e-06, + "loss": 0.1065, + "step": 4545 + }, + { + "epoch": 1.4731043421905379, + "grad_norm": 0.47744542360305786, + "learning_rate": 5.3879357687809795e-06, + "loss": 0.1156, + "step": 4546 + }, + { + "epoch": 1.4734283862605313, + "grad_norm": 0.5202251672744751, + "learning_rate": 5.3861918015616536e-06, + "loss": 0.1354, + "step": 4547 + }, + { + "epoch": 1.473752430330525, + "grad_norm": 0.46379417181015015, + "learning_rate": 5.384447787076189e-06, + "loss": 0.1186, + "step": 4548 + }, + { + "epoch": 1.4740764744005186, + "grad_norm": 0.44243863224983215, + "learning_rate": 5.382703725538036e-06, + "loss": 0.1085, + "step": 4549 + }, + { + "epoch": 1.474400518470512, + "grad_norm": 0.47953954339027405, + "learning_rate": 5.380959617160655e-06, + "loss": 0.1106, + "step": 4550 + }, + { + "epoch": 1.4747245625405054, + "grad_norm": 0.43394193053245544, + "learning_rate": 5.379215462157502e-06, + "loss": 0.1, + "step": 4551 + }, + { + "epoch": 1.475048606610499, + "grad_norm": 0.4366772174835205, + "learning_rate": 5.377471260742048e-06, + "loss": 0.1038, + "step": 4552 + }, + { + "epoch": 1.4753726506804925, + "grad_norm": 0.45473575592041016, + "learning_rate": 5.375727013127769e-06, + "loss": 0.1128, + "step": 4553 + }, + { + "epoch": 1.4756966947504861, + "grad_norm": 0.4511753022670746, + "learning_rate": 5.373982719528137e-06, + "loss": 0.11, + "step": 4554 + }, + { + "epoch": 1.4760207388204796, + "grad_norm": 0.4673917889595032, + "learning_rate": 5.3722383801566425e-06, + "loss": 0.111, + "step": 4555 + }, + { + "epoch": 1.476344782890473, + "grad_norm": 0.459878534078598, + "learning_rate": 5.370493995226772e-06, + "loss": 0.1158, + "step": 4556 + }, + { + "epoch": 1.4766688269604666, + "grad_norm": 0.45323413610458374, + "learning_rate": 5.368749564952025e-06, + "loss": 0.11, + "step": 4557 + }, + { + "epoch": 1.47699287103046, + "grad_norm": 0.4526347517967224, + "learning_rate": 5.367005089545899e-06, + "loss": 0.1029, + "step": 4558 + }, + { + "epoch": 1.4773169151004537, + "grad_norm": 0.44370052218437195, + "learning_rate": 5.365260569221906e-06, + "loss": 0.1088, + "step": 4559 + }, + { + "epoch": 1.4776409591704471, + "grad_norm": 0.4556959271430969, + "learning_rate": 5.363516004193553e-06, + "loss": 0.1042, + "step": 4560 + }, + { + "epoch": 1.4779650032404408, + "grad_norm": 0.44470837712287903, + "learning_rate": 5.361771394674362e-06, + "loss": 0.1057, + "step": 4561 + }, + { + "epoch": 1.4782890473104342, + "grad_norm": 0.40587565302848816, + "learning_rate": 5.360026740877853e-06, + "loss": 0.0937, + "step": 4562 + }, + { + "epoch": 1.4786130913804278, + "grad_norm": 0.45987752079963684, + "learning_rate": 5.358282043017557e-06, + "loss": 0.1148, + "step": 4563 + }, + { + "epoch": 1.4789371354504213, + "grad_norm": 0.44257667660713196, + "learning_rate": 5.356537301307006e-06, + "loss": 0.1055, + "step": 4564 + }, + { + "epoch": 1.4792611795204147, + "grad_norm": 0.4302980899810791, + "learning_rate": 5.3547925159597426e-06, + "loss": 0.1047, + "step": 4565 + }, + { + "epoch": 1.4795852235904083, + "grad_norm": 0.4797317087650299, + "learning_rate": 5.353047687189309e-06, + "loss": 0.1154, + "step": 4566 + }, + { + "epoch": 1.4799092676604018, + "grad_norm": 0.46265506744384766, + "learning_rate": 5.351302815209256e-06, + "loss": 0.116, + "step": 4567 + }, + { + "epoch": 1.4802333117303954, + "grad_norm": 0.4362986385822296, + "learning_rate": 5.34955790023314e-06, + "loss": 0.0969, + "step": 4568 + }, + { + "epoch": 1.4805573558003888, + "grad_norm": 0.43595537543296814, + "learning_rate": 5.347812942474519e-06, + "loss": 0.1098, + "step": 4569 + }, + { + "epoch": 1.4808813998703823, + "grad_norm": 0.49564412236213684, + "learning_rate": 5.346067942146963e-06, + "loss": 0.1273, + "step": 4570 + }, + { + "epoch": 1.481205443940376, + "grad_norm": 0.4789816737174988, + "learning_rate": 5.344322899464042e-06, + "loss": 0.114, + "step": 4571 + }, + { + "epoch": 1.4815294880103695, + "grad_norm": 0.4422919750213623, + "learning_rate": 5.342577814639332e-06, + "loss": 0.1061, + "step": 4572 + }, + { + "epoch": 1.481853532080363, + "grad_norm": 0.4468146562576294, + "learning_rate": 5.34083268788641e-06, + "loss": 0.1108, + "step": 4573 + }, + { + "epoch": 1.4821775761503564, + "grad_norm": 0.47361573576927185, + "learning_rate": 5.339087519418868e-06, + "loss": 0.1089, + "step": 4574 + }, + { + "epoch": 1.48250162022035, + "grad_norm": 0.4521580636501312, + "learning_rate": 5.337342309450298e-06, + "loss": 0.1075, + "step": 4575 + }, + { + "epoch": 1.4828256642903435, + "grad_norm": 0.44739869236946106, + "learning_rate": 5.335597058194293e-06, + "loss": 0.1047, + "step": 4576 + }, + { + "epoch": 1.483149708360337, + "grad_norm": 0.5015192627906799, + "learning_rate": 5.333851765864458e-06, + "loss": 0.1117, + "step": 4577 + }, + { + "epoch": 1.4834737524303305, + "grad_norm": 0.4476906657218933, + "learning_rate": 5.332106432674399e-06, + "loss": 0.1036, + "step": 4578 + }, + { + "epoch": 1.483797796500324, + "grad_norm": 0.42498645186424255, + "learning_rate": 5.330361058837726e-06, + "loss": 0.1037, + "step": 4579 + }, + { + "epoch": 1.4841218405703176, + "grad_norm": 0.42732563614845276, + "learning_rate": 5.328615644568059e-06, + "loss": 0.1, + "step": 4580 + }, + { + "epoch": 1.484445884640311, + "grad_norm": 0.5075024366378784, + "learning_rate": 5.326870190079019e-06, + "loss": 0.1221, + "step": 4581 + }, + { + "epoch": 1.4847699287103047, + "grad_norm": 0.45853862166404724, + "learning_rate": 5.325124695584232e-06, + "loss": 0.1117, + "step": 4582 + }, + { + "epoch": 1.485093972780298, + "grad_norm": 0.4977118968963623, + "learning_rate": 5.323379161297329e-06, + "loss": 0.1177, + "step": 4583 + }, + { + "epoch": 1.4854180168502915, + "grad_norm": 0.47412171959877014, + "learning_rate": 5.321633587431947e-06, + "loss": 0.1134, + "step": 4584 + }, + { + "epoch": 1.4857420609202852, + "grad_norm": 0.4670596718788147, + "learning_rate": 5.319887974201727e-06, + "loss": 0.1119, + "step": 4585 + }, + { + "epoch": 1.4860661049902788, + "grad_norm": 0.46831014752388, + "learning_rate": 5.318142321820316e-06, + "loss": 0.1042, + "step": 4586 + }, + { + "epoch": 1.4863901490602722, + "grad_norm": 0.42625847458839417, + "learning_rate": 5.316396630501365e-06, + "loss": 0.1005, + "step": 4587 + }, + { + "epoch": 1.4867141931302656, + "grad_norm": 0.46382203698158264, + "learning_rate": 5.314650900458529e-06, + "loss": 0.1105, + "step": 4588 + }, + { + "epoch": 1.4870382372002593, + "grad_norm": 0.44759857654571533, + "learning_rate": 5.31290513190547e-06, + "loss": 0.1043, + "step": 4589 + }, + { + "epoch": 1.4873622812702527, + "grad_norm": 0.44344985485076904, + "learning_rate": 5.3111593250558515e-06, + "loss": 0.1048, + "step": 4590 + }, + { + "epoch": 1.4876863253402464, + "grad_norm": 0.44920921325683594, + "learning_rate": 5.309413480123343e-06, + "loss": 0.1056, + "step": 4591 + }, + { + "epoch": 1.4880103694102398, + "grad_norm": 0.46111756563186646, + "learning_rate": 5.307667597321621e-06, + "loss": 0.1053, + "step": 4592 + }, + { + "epoch": 1.4883344134802332, + "grad_norm": 0.45845603942871094, + "learning_rate": 5.305921676864363e-06, + "loss": 0.1153, + "step": 4593 + }, + { + "epoch": 1.4886584575502269, + "grad_norm": 0.45684415102005005, + "learning_rate": 5.3041757189652535e-06, + "loss": 0.109, + "step": 4594 + }, + { + "epoch": 1.4889825016202203, + "grad_norm": 0.47713303565979004, + "learning_rate": 5.302429723837982e-06, + "loss": 0.1178, + "step": 4595 + }, + { + "epoch": 1.489306545690214, + "grad_norm": 0.598270833492279, + "learning_rate": 5.3006836916962375e-06, + "loss": 0.1202, + "step": 4596 + }, + { + "epoch": 1.4896305897602073, + "grad_norm": 0.4470941722393036, + "learning_rate": 5.298937622753722e-06, + "loss": 0.1056, + "step": 4597 + }, + { + "epoch": 1.4899546338302008, + "grad_norm": 0.45994114875793457, + "learning_rate": 5.297191517224133e-06, + "loss": 0.1155, + "step": 4598 + }, + { + "epoch": 1.4902786779001944, + "grad_norm": 0.45040905475616455, + "learning_rate": 5.295445375321181e-06, + "loss": 0.1097, + "step": 4599 + }, + { + "epoch": 1.490602721970188, + "grad_norm": 0.4630436897277832, + "learning_rate": 5.293699197258574e-06, + "loss": 0.1134, + "step": 4600 + }, + { + "epoch": 1.4909267660401815, + "grad_norm": 0.4739874005317688, + "learning_rate": 5.29195298325003e-06, + "loss": 0.1127, + "step": 4601 + }, + { + "epoch": 1.491250810110175, + "grad_norm": 0.45205971598625183, + "learning_rate": 5.290206733509266e-06, + "loss": 0.1093, + "step": 4602 + }, + { + "epoch": 1.4915748541801686, + "grad_norm": 0.4669693112373352, + "learning_rate": 5.288460448250009e-06, + "loss": 0.1118, + "step": 4603 + }, + { + "epoch": 1.491898898250162, + "grad_norm": 0.4435705542564392, + "learning_rate": 5.286714127685985e-06, + "loss": 0.1039, + "step": 4604 + }, + { + "epoch": 1.4922229423201556, + "grad_norm": 0.43490707874298096, + "learning_rate": 5.284967772030927e-06, + "loss": 0.1034, + "step": 4605 + }, + { + "epoch": 1.492546986390149, + "grad_norm": 0.4700656235218048, + "learning_rate": 5.283221381498572e-06, + "loss": 0.1149, + "step": 4606 + }, + { + "epoch": 1.4928710304601425, + "grad_norm": 0.4963572323322296, + "learning_rate": 5.281474956302662e-06, + "loss": 0.1316, + "step": 4607 + }, + { + "epoch": 1.4931950745301361, + "grad_norm": 0.4392409324645996, + "learning_rate": 5.279728496656943e-06, + "loss": 0.1002, + "step": 4608 + }, + { + "epoch": 1.4935191186001295, + "grad_norm": 0.48771601915359497, + "learning_rate": 5.277982002775163e-06, + "loss": 0.1186, + "step": 4609 + }, + { + "epoch": 1.4938431626701232, + "grad_norm": 0.43994951248168945, + "learning_rate": 5.276235474871076e-06, + "loss": 0.1029, + "step": 4610 + }, + { + "epoch": 1.4941672067401166, + "grad_norm": 0.43616870045661926, + "learning_rate": 5.274488913158442e-06, + "loss": 0.1043, + "step": 4611 + }, + { + "epoch": 1.4944912508101102, + "grad_norm": 0.46900299191474915, + "learning_rate": 5.272742317851023e-06, + "loss": 0.1123, + "step": 4612 + }, + { + "epoch": 1.4948152948801037, + "grad_norm": 0.4342197775840759, + "learning_rate": 5.2709956891625845e-06, + "loss": 0.1063, + "step": 4613 + }, + { + "epoch": 1.4951393389500973, + "grad_norm": 0.44443580508232117, + "learning_rate": 5.2692490273068965e-06, + "loss": 0.1068, + "step": 4614 + }, + { + "epoch": 1.4954633830200907, + "grad_norm": 0.4577859938144684, + "learning_rate": 5.2675023324977356e-06, + "loss": 0.1069, + "step": 4615 + }, + { + "epoch": 1.4957874270900842, + "grad_norm": 0.4948973059654236, + "learning_rate": 5.265755604948877e-06, + "loss": 0.1189, + "step": 4616 + }, + { + "epoch": 1.4961114711600778, + "grad_norm": 0.4664367735385895, + "learning_rate": 5.264008844874105e-06, + "loss": 0.1085, + "step": 4617 + }, + { + "epoch": 1.4964355152300712, + "grad_norm": 0.4518917202949524, + "learning_rate": 5.262262052487207e-06, + "loss": 0.1018, + "step": 4618 + }, + { + "epoch": 1.4967595593000649, + "grad_norm": 0.49077895283699036, + "learning_rate": 5.260515228001973e-06, + "loss": 0.123, + "step": 4619 + }, + { + "epoch": 1.4970836033700583, + "grad_norm": 0.49957966804504395, + "learning_rate": 5.258768371632197e-06, + "loss": 0.1155, + "step": 4620 + }, + { + "epoch": 1.4974076474400517, + "grad_norm": 0.44145315885543823, + "learning_rate": 5.257021483591677e-06, + "loss": 0.1046, + "step": 4621 + }, + { + "epoch": 1.4977316915100454, + "grad_norm": 0.42506128549575806, + "learning_rate": 5.255274564094215e-06, + "loss": 0.0977, + "step": 4622 + }, + { + "epoch": 1.498055735580039, + "grad_norm": 0.473082572221756, + "learning_rate": 5.2535276133536175e-06, + "loss": 0.108, + "step": 4623 + }, + { + "epoch": 1.4983797796500324, + "grad_norm": 0.4737880229949951, + "learning_rate": 5.251780631583696e-06, + "loss": 0.1133, + "step": 4624 + }, + { + "epoch": 1.4987038237200259, + "grad_norm": 0.4546349346637726, + "learning_rate": 5.250033618998262e-06, + "loss": 0.1072, + "step": 4625 + }, + { + "epoch": 1.4990278677900195, + "grad_norm": 0.4377857744693756, + "learning_rate": 5.2482865758111335e-06, + "loss": 0.1056, + "step": 4626 + }, + { + "epoch": 1.499351911860013, + "grad_norm": 0.43855395913124084, + "learning_rate": 5.246539502236131e-06, + "loss": 0.1028, + "step": 4627 + }, + { + "epoch": 1.4996759559300066, + "grad_norm": 0.4623560607433319, + "learning_rate": 5.244792398487081e-06, + "loss": 0.1086, + "step": 4628 + }, + { + "epoch": 1.5, + "grad_norm": 0.4703526198863983, + "learning_rate": 5.2430452647778095e-06, + "loss": 0.1064, + "step": 4629 + }, + { + "epoch": 1.5003240440699934, + "grad_norm": 0.45636919140815735, + "learning_rate": 5.241298101322152e-06, + "loss": 0.1064, + "step": 4630 + }, + { + "epoch": 1.500648088139987, + "grad_norm": 0.4634525179862976, + "learning_rate": 5.239550908333943e-06, + "loss": 0.1094, + "step": 4631 + }, + { + "epoch": 1.5009721322099807, + "grad_norm": 0.4673732817173004, + "learning_rate": 5.237803686027021e-06, + "loss": 0.1123, + "step": 4632 + }, + { + "epoch": 1.5012961762799741, + "grad_norm": 0.4772822856903076, + "learning_rate": 5.236056434615231e-06, + "loss": 0.1199, + "step": 4633 + }, + { + "epoch": 1.5016202203499676, + "grad_norm": 0.43391144275665283, + "learning_rate": 5.234309154312417e-06, + "loss": 0.1037, + "step": 4634 + }, + { + "epoch": 1.501944264419961, + "grad_norm": 0.47034046053886414, + "learning_rate": 5.232561845332433e-06, + "loss": 0.1058, + "step": 4635 + }, + { + "epoch": 1.5022683084899546, + "grad_norm": 0.45668259263038635, + "learning_rate": 5.230814507889129e-06, + "loss": 0.1049, + "step": 4636 + }, + { + "epoch": 1.5025923525599483, + "grad_norm": 0.4736466705799103, + "learning_rate": 5.2290671421963635e-06, + "loss": 0.1201, + "step": 4637 + }, + { + "epoch": 1.5029163966299417, + "grad_norm": 0.47415265440940857, + "learning_rate": 5.227319748467998e-06, + "loss": 0.1045, + "step": 4638 + }, + { + "epoch": 1.5032404406999351, + "grad_norm": 0.4864136576652527, + "learning_rate": 5.225572326917896e-06, + "loss": 0.1155, + "step": 4639 + }, + { + "epoch": 1.5035644847699285, + "grad_norm": 0.46752864122390747, + "learning_rate": 5.223824877759924e-06, + "loss": 0.1135, + "step": 4640 + }, + { + "epoch": 1.5038885288399222, + "grad_norm": 0.5391199588775635, + "learning_rate": 5.222077401207954e-06, + "loss": 0.1318, + "step": 4641 + }, + { + "epoch": 1.5042125729099158, + "grad_norm": 0.4489925503730774, + "learning_rate": 5.220329897475861e-06, + "loss": 0.1032, + "step": 4642 + }, + { + "epoch": 1.5045366169799093, + "grad_norm": 0.46553587913513184, + "learning_rate": 5.2185823667775204e-06, + "loss": 0.1149, + "step": 4643 + }, + { + "epoch": 1.5048606610499027, + "grad_norm": 0.5243228077888489, + "learning_rate": 5.216834809326815e-06, + "loss": 0.1236, + "step": 4644 + }, + { + "epoch": 1.5051847051198963, + "grad_norm": 0.4822893440723419, + "learning_rate": 5.215087225337628e-06, + "loss": 0.1119, + "step": 4645 + }, + { + "epoch": 1.50550874918989, + "grad_norm": 0.4582579433917999, + "learning_rate": 5.213339615023847e-06, + "loss": 0.1078, + "step": 4646 + }, + { + "epoch": 1.5058327932598834, + "grad_norm": 0.43378040194511414, + "learning_rate": 5.211591978599362e-06, + "loss": 0.1034, + "step": 4647 + }, + { + "epoch": 1.5061568373298768, + "grad_norm": 0.451219767332077, + "learning_rate": 5.209844316278066e-06, + "loss": 0.1084, + "step": 4648 + }, + { + "epoch": 1.5064808813998702, + "grad_norm": 0.4611717760562897, + "learning_rate": 5.208096628273859e-06, + "loss": 0.1011, + "step": 4649 + }, + { + "epoch": 1.5068049254698639, + "grad_norm": 0.4647367298603058, + "learning_rate": 5.206348914800638e-06, + "loss": 0.1104, + "step": 4650 + }, + { + "epoch": 1.5071289695398575, + "grad_norm": 0.4628082811832428, + "learning_rate": 5.204601176072308e-06, + "loss": 0.1116, + "step": 4651 + }, + { + "epoch": 1.507453013609851, + "grad_norm": 0.45257583260536194, + "learning_rate": 5.202853412302775e-06, + "loss": 0.1009, + "step": 4652 + }, + { + "epoch": 1.5077770576798444, + "grad_norm": 0.46580031514167786, + "learning_rate": 5.2011056237059476e-06, + "loss": 0.1084, + "step": 4653 + }, + { + "epoch": 1.508101101749838, + "grad_norm": 0.4912484288215637, + "learning_rate": 5.1993578104957385e-06, + "loss": 0.1147, + "step": 4654 + }, + { + "epoch": 1.5084251458198314, + "grad_norm": 0.4861374795436859, + "learning_rate": 5.197609972886063e-06, + "loss": 0.1194, + "step": 4655 + }, + { + "epoch": 1.508749189889825, + "grad_norm": 0.4274054765701294, + "learning_rate": 5.195862111090842e-06, + "loss": 0.0995, + "step": 4656 + }, + { + "epoch": 1.5090732339598185, + "grad_norm": 0.47026312351226807, + "learning_rate": 5.194114225323994e-06, + "loss": 0.1109, + "step": 4657 + }, + { + "epoch": 1.509397278029812, + "grad_norm": 0.46407759189605713, + "learning_rate": 5.192366315799443e-06, + "loss": 0.1135, + "step": 4658 + }, + { + "epoch": 1.5097213220998056, + "grad_norm": 0.45349907875061035, + "learning_rate": 5.190618382731117e-06, + "loss": 0.1095, + "step": 4659 + }, + { + "epoch": 1.5100453661697992, + "grad_norm": 0.43043872714042664, + "learning_rate": 5.188870426332946e-06, + "loss": 0.1075, + "step": 4660 + }, + { + "epoch": 1.5103694102397927, + "grad_norm": 0.47293248772621155, + "learning_rate": 5.187122446818864e-06, + "loss": 0.1172, + "step": 4661 + }, + { + "epoch": 1.510693454309786, + "grad_norm": 0.476509690284729, + "learning_rate": 5.185374444402806e-06, + "loss": 0.1167, + "step": 4662 + }, + { + "epoch": 1.5110174983797795, + "grad_norm": 0.44831207394599915, + "learning_rate": 5.1836264192987104e-06, + "loss": 0.1054, + "step": 4663 + }, + { + "epoch": 1.5113415424497731, + "grad_norm": 0.4845344126224518, + "learning_rate": 5.181878371720519e-06, + "loss": 0.1199, + "step": 4664 + }, + { + "epoch": 1.5116655865197668, + "grad_norm": 0.44630926847457886, + "learning_rate": 5.180130301882175e-06, + "loss": 0.1093, + "step": 4665 + }, + { + "epoch": 1.5119896305897602, + "grad_norm": 0.4507032036781311, + "learning_rate": 5.1783822099976265e-06, + "loss": 0.1091, + "step": 4666 + }, + { + "epoch": 1.5123136746597536, + "grad_norm": 0.4598129987716675, + "learning_rate": 5.1766340962808225e-06, + "loss": 0.1125, + "step": 4667 + }, + { + "epoch": 1.5126377187297473, + "grad_norm": 0.43906885385513306, + "learning_rate": 5.174885960945716e-06, + "loss": 0.1064, + "step": 4668 + }, + { + "epoch": 1.512961762799741, + "grad_norm": 0.4334750771522522, + "learning_rate": 5.1731378042062584e-06, + "loss": 0.1025, + "step": 4669 + }, + { + "epoch": 1.5132858068697344, + "grad_norm": 0.46838998794555664, + "learning_rate": 5.171389626276411e-06, + "loss": 0.1048, + "step": 4670 + }, + { + "epoch": 1.5136098509397278, + "grad_norm": 0.48773279786109924, + "learning_rate": 5.169641427370132e-06, + "loss": 0.1182, + "step": 4671 + }, + { + "epoch": 1.5139338950097212, + "grad_norm": 0.44825443625450134, + "learning_rate": 5.167893207701385e-06, + "loss": 0.1055, + "step": 4672 + }, + { + "epoch": 1.5142579390797148, + "grad_norm": 0.43601194024086, + "learning_rate": 5.166144967484135e-06, + "loss": 0.1058, + "step": 4673 + }, + { + "epoch": 1.5145819831497085, + "grad_norm": 0.49638938903808594, + "learning_rate": 5.16439670693235e-06, + "loss": 0.1201, + "step": 4674 + }, + { + "epoch": 1.514906027219702, + "grad_norm": 0.4662867784500122, + "learning_rate": 5.162648426259997e-06, + "loss": 0.1033, + "step": 4675 + }, + { + "epoch": 1.5152300712896953, + "grad_norm": 0.4531441926956177, + "learning_rate": 5.160900125681053e-06, + "loss": 0.1096, + "step": 4676 + }, + { + "epoch": 1.5155541153596888, + "grad_norm": 0.4930696487426758, + "learning_rate": 5.159151805409491e-06, + "loss": 0.1169, + "step": 4677 + }, + { + "epoch": 1.5158781594296824, + "grad_norm": 0.4691605865955353, + "learning_rate": 5.15740346565929e-06, + "loss": 0.1078, + "step": 4678 + }, + { + "epoch": 1.516202203499676, + "grad_norm": 0.492055207490921, + "learning_rate": 5.155655106644427e-06, + "loss": 0.116, + "step": 4679 + }, + { + "epoch": 1.5165262475696695, + "grad_norm": 0.4335920512676239, + "learning_rate": 5.153906728578887e-06, + "loss": 0.1053, + "step": 4680 + }, + { + "epoch": 1.516850291639663, + "grad_norm": 0.4806187152862549, + "learning_rate": 5.152158331676652e-06, + "loss": 0.1084, + "step": 4681 + }, + { + "epoch": 1.5171743357096565, + "grad_norm": 0.5146906971931458, + "learning_rate": 5.150409916151711e-06, + "loss": 0.1173, + "step": 4682 + }, + { + "epoch": 1.5174983797796502, + "grad_norm": 0.4676697552204132, + "learning_rate": 5.148661482218051e-06, + "loss": 0.1226, + "step": 4683 + }, + { + "epoch": 1.5178224238496436, + "grad_norm": 0.45594385266304016, + "learning_rate": 5.146913030089665e-06, + "loss": 0.1118, + "step": 4684 + }, + { + "epoch": 1.518146467919637, + "grad_norm": 0.423243910074234, + "learning_rate": 5.1451645599805475e-06, + "loss": 0.0956, + "step": 4685 + }, + { + "epoch": 1.5184705119896305, + "grad_norm": 0.46194830536842346, + "learning_rate": 5.143416072104693e-06, + "loss": 0.1094, + "step": 4686 + }, + { + "epoch": 1.518794556059624, + "grad_norm": 0.4480394124984741, + "learning_rate": 5.1416675666761e-06, + "loss": 0.1093, + "step": 4687 + }, + { + "epoch": 1.5191186001296177, + "grad_norm": 0.4406319856643677, + "learning_rate": 5.1399190439087675e-06, + "loss": 0.1108, + "step": 4688 + }, + { + "epoch": 1.5194426441996112, + "grad_norm": 0.4938640892505646, + "learning_rate": 5.138170504016699e-06, + "loss": 0.1198, + "step": 4689 + }, + { + "epoch": 1.5197666882696046, + "grad_norm": 0.4330286383628845, + "learning_rate": 5.1364219472138984e-06, + "loss": 0.1036, + "step": 4690 + }, + { + "epoch": 1.5200907323395982, + "grad_norm": 0.5021396279335022, + "learning_rate": 5.1346733737143715e-06, + "loss": 0.1202, + "step": 4691 + }, + { + "epoch": 1.5204147764095917, + "grad_norm": 0.43403375148773193, + "learning_rate": 5.132924783732128e-06, + "loss": 0.1043, + "step": 4692 + }, + { + "epoch": 1.5207388204795853, + "grad_norm": 0.4966108202934265, + "learning_rate": 5.131176177481179e-06, + "loss": 0.1176, + "step": 4693 + }, + { + "epoch": 1.5210628645495787, + "grad_norm": 0.468465119600296, + "learning_rate": 5.129427555175534e-06, + "loss": 0.1166, + "step": 4694 + }, + { + "epoch": 1.5213869086195722, + "grad_norm": 0.4209233820438385, + "learning_rate": 5.127678917029209e-06, + "loss": 0.0997, + "step": 4695 + }, + { + "epoch": 1.5217109526895658, + "grad_norm": 0.49298617243766785, + "learning_rate": 5.12593026325622e-06, + "loss": 0.1115, + "step": 4696 + }, + { + "epoch": 1.5220349967595594, + "grad_norm": 0.4441665709018707, + "learning_rate": 5.1241815940705874e-06, + "loss": 0.1085, + "step": 4697 + }, + { + "epoch": 1.5223590408295529, + "grad_norm": 0.5018707513809204, + "learning_rate": 5.12243290968633e-06, + "loss": 0.1186, + "step": 4698 + }, + { + "epoch": 1.5226830848995463, + "grad_norm": 0.4586740732192993, + "learning_rate": 5.120684210317469e-06, + "loss": 0.1071, + "step": 4699 + }, + { + "epoch": 1.5230071289695397, + "grad_norm": 0.4645669758319855, + "learning_rate": 5.1189354961780305e-06, + "loss": 0.1162, + "step": 4700 + }, + { + "epoch": 1.5233311730395334, + "grad_norm": 0.47067075967788696, + "learning_rate": 5.117186767482036e-06, + "loss": 0.1156, + "step": 4701 + }, + { + "epoch": 1.523655217109527, + "grad_norm": 0.4710082709789276, + "learning_rate": 5.115438024443517e-06, + "loss": 0.1132, + "step": 4702 + }, + { + "epoch": 1.5239792611795204, + "grad_norm": 0.453998327255249, + "learning_rate": 5.1136892672765e-06, + "loss": 0.1106, + "step": 4703 + }, + { + "epoch": 1.5243033052495139, + "grad_norm": 0.4350145161151886, + "learning_rate": 5.111940496195019e-06, + "loss": 0.1048, + "step": 4704 + }, + { + "epoch": 1.5246273493195075, + "grad_norm": 0.45774731040000916, + "learning_rate": 5.110191711413105e-06, + "loss": 0.1111, + "step": 4705 + }, + { + "epoch": 1.524951393389501, + "grad_norm": 0.5091844797134399, + "learning_rate": 5.108442913144792e-06, + "loss": 0.1282, + "step": 4706 + }, + { + "epoch": 1.5252754374594946, + "grad_norm": 0.462661474943161, + "learning_rate": 5.106694101604116e-06, + "loss": 0.1123, + "step": 4707 + }, + { + "epoch": 1.525599481529488, + "grad_norm": 0.45566895604133606, + "learning_rate": 5.104945277005114e-06, + "loss": 0.1112, + "step": 4708 + }, + { + "epoch": 1.5259235255994814, + "grad_norm": 0.47541677951812744, + "learning_rate": 5.1031964395618285e-06, + "loss": 0.1155, + "step": 4709 + }, + { + "epoch": 1.526247569669475, + "grad_norm": 0.4878292679786682, + "learning_rate": 5.1014475894882956e-06, + "loss": 0.1143, + "step": 4710 + }, + { + "epoch": 1.5265716137394687, + "grad_norm": 0.4552808701992035, + "learning_rate": 5.099698726998561e-06, + "loss": 0.1123, + "step": 4711 + }, + { + "epoch": 1.5268956578094621, + "grad_norm": 0.5104338526725769, + "learning_rate": 5.0979498523066665e-06, + "loss": 0.1199, + "step": 4712 + }, + { + "epoch": 1.5272197018794555, + "grad_norm": 0.4424220323562622, + "learning_rate": 5.096200965626658e-06, + "loss": 0.1057, + "step": 4713 + }, + { + "epoch": 1.527543745949449, + "grad_norm": 0.463344931602478, + "learning_rate": 5.094452067172583e-06, + "loss": 0.107, + "step": 4714 + }, + { + "epoch": 1.5278677900194426, + "grad_norm": 0.48906680941581726, + "learning_rate": 5.09270315715849e-06, + "loss": 0.1132, + "step": 4715 + }, + { + "epoch": 1.5281918340894363, + "grad_norm": 0.44374555349349976, + "learning_rate": 5.090954235798426e-06, + "loss": 0.1064, + "step": 4716 + }, + { + "epoch": 1.5285158781594297, + "grad_norm": 0.4655635356903076, + "learning_rate": 5.089205303306447e-06, + "loss": 0.1166, + "step": 4717 + }, + { + "epoch": 1.528839922229423, + "grad_norm": 0.5116678476333618, + "learning_rate": 5.087456359896601e-06, + "loss": 0.1198, + "step": 4718 + }, + { + "epoch": 1.5291639662994168, + "grad_norm": 0.46683815121650696, + "learning_rate": 5.085707405782942e-06, + "loss": 0.1147, + "step": 4719 + }, + { + "epoch": 1.5294880103694104, + "grad_norm": 0.4789724051952362, + "learning_rate": 5.08395844117953e-06, + "loss": 0.1187, + "step": 4720 + }, + { + "epoch": 1.5298120544394038, + "grad_norm": 0.4618801474571228, + "learning_rate": 5.082209466300414e-06, + "loss": 0.1106, + "step": 4721 + }, + { + "epoch": 1.5301360985093972, + "grad_norm": 0.43246573209762573, + "learning_rate": 5.080460481359656e-06, + "loss": 0.1056, + "step": 4722 + }, + { + "epoch": 1.5304601425793907, + "grad_norm": 0.452864408493042, + "learning_rate": 5.078711486571315e-06, + "loss": 0.1083, + "step": 4723 + }, + { + "epoch": 1.5307841866493843, + "grad_norm": 0.4218159317970276, + "learning_rate": 5.076962482149449e-06, + "loss": 0.103, + "step": 4724 + }, + { + "epoch": 1.531108230719378, + "grad_norm": 0.4770122766494751, + "learning_rate": 5.07521346830812e-06, + "loss": 0.1159, + "step": 4725 + }, + { + "epoch": 1.5314322747893714, + "grad_norm": 0.46023043990135193, + "learning_rate": 5.073464445261391e-06, + "loss": 0.1115, + "step": 4726 + }, + { + "epoch": 1.5317563188593648, + "grad_norm": 0.4554358422756195, + "learning_rate": 5.0717154132233245e-06, + "loss": 0.1088, + "step": 4727 + }, + { + "epoch": 1.5320803629293582, + "grad_norm": 0.45043107867240906, + "learning_rate": 5.069966372407986e-06, + "loss": 0.1079, + "step": 4728 + }, + { + "epoch": 1.5324044069993519, + "grad_norm": 0.46434617042541504, + "learning_rate": 5.0682173230294415e-06, + "loss": 0.111, + "step": 4729 + }, + { + "epoch": 1.5327284510693455, + "grad_norm": 0.4614694118499756, + "learning_rate": 5.066468265301757e-06, + "loss": 0.1109, + "step": 4730 + }, + { + "epoch": 1.533052495139339, + "grad_norm": 0.4331463575363159, + "learning_rate": 5.064719199439001e-06, + "loss": 0.0982, + "step": 4731 + }, + { + "epoch": 1.5333765392093324, + "grad_norm": 0.44229406118392944, + "learning_rate": 5.06297012565524e-06, + "loss": 0.1013, + "step": 4732 + }, + { + "epoch": 1.533700583279326, + "grad_norm": 0.4934309720993042, + "learning_rate": 5.061221044164546e-06, + "loss": 0.1173, + "step": 4733 + }, + { + "epoch": 1.5340246273493197, + "grad_norm": 0.49272412061691284, + "learning_rate": 5.059471955180988e-06, + "loss": 0.1126, + "step": 4734 + }, + { + "epoch": 1.534348671419313, + "grad_norm": 0.4679400324821472, + "learning_rate": 5.05772285891864e-06, + "loss": 0.1106, + "step": 4735 + }, + { + "epoch": 1.5346727154893065, + "grad_norm": 0.49218079447746277, + "learning_rate": 5.055973755591572e-06, + "loss": 0.1196, + "step": 4736 + }, + { + "epoch": 1.5349967595593, + "grad_norm": 0.45480284094810486, + "learning_rate": 5.054224645413858e-06, + "loss": 0.1019, + "step": 4737 + }, + { + "epoch": 1.5353208036292936, + "grad_norm": 0.48111364245414734, + "learning_rate": 5.052475528599572e-06, + "loss": 0.1137, + "step": 4738 + }, + { + "epoch": 1.5356448476992872, + "grad_norm": 0.4530400037765503, + "learning_rate": 5.050726405362789e-06, + "loss": 0.1006, + "step": 4739 + }, + { + "epoch": 1.5359688917692806, + "grad_norm": 0.4552764296531677, + "learning_rate": 5.048977275917586e-06, + "loss": 0.109, + "step": 4740 + }, + { + "epoch": 1.536292935839274, + "grad_norm": 0.5146282315254211, + "learning_rate": 5.047228140478039e-06, + "loss": 0.1191, + "step": 4741 + }, + { + "epoch": 1.5366169799092677, + "grad_norm": 0.436003178358078, + "learning_rate": 5.045478999258224e-06, + "loss": 0.0997, + "step": 4742 + }, + { + "epoch": 1.5369410239792611, + "grad_norm": 0.45526957511901855, + "learning_rate": 5.043729852472221e-06, + "loss": 0.1102, + "step": 4743 + }, + { + "epoch": 1.5372650680492548, + "grad_norm": 0.46070560812950134, + "learning_rate": 5.041980700334106e-06, + "loss": 0.1093, + "step": 4744 + }, + { + "epoch": 1.5375891121192482, + "grad_norm": 0.45329949259757996, + "learning_rate": 5.040231543057959e-06, + "loss": 0.1115, + "step": 4745 + }, + { + "epoch": 1.5379131561892416, + "grad_norm": 0.45783063769340515, + "learning_rate": 5.038482380857862e-06, + "loss": 0.1086, + "step": 4746 + }, + { + "epoch": 1.5382372002592353, + "grad_norm": 0.4675714373588562, + "learning_rate": 5.036733213947894e-06, + "loss": 0.1166, + "step": 4747 + }, + { + "epoch": 1.538561244329229, + "grad_norm": 0.5060395002365112, + "learning_rate": 5.034984042542136e-06, + "loss": 0.125, + "step": 4748 + }, + { + "epoch": 1.5388852883992223, + "grad_norm": 0.45864853262901306, + "learning_rate": 5.033234866854669e-06, + "loss": 0.1071, + "step": 4749 + }, + { + "epoch": 1.5392093324692158, + "grad_norm": 0.42704102396965027, + "learning_rate": 5.0314856870995775e-06, + "loss": 0.1058, + "step": 4750 + }, + { + "epoch": 1.5395333765392092, + "grad_norm": 0.424561470746994, + "learning_rate": 5.029736503490941e-06, + "loss": 0.1093, + "step": 4751 + }, + { + "epoch": 1.5398574206092028, + "grad_norm": 0.4763147234916687, + "learning_rate": 5.027987316242847e-06, + "loss": 0.117, + "step": 4752 + }, + { + "epoch": 1.5401814646791965, + "grad_norm": 0.41369420289993286, + "learning_rate": 5.026238125569375e-06, + "loss": 0.0943, + "step": 4753 + }, + { + "epoch": 1.54050550874919, + "grad_norm": 0.4703889489173889, + "learning_rate": 5.02448893168461e-06, + "loss": 0.1138, + "step": 4754 + }, + { + "epoch": 1.5408295528191833, + "grad_norm": 0.4704046845436096, + "learning_rate": 5.022739734802637e-06, + "loss": 0.1218, + "step": 4755 + }, + { + "epoch": 1.541153596889177, + "grad_norm": 0.4563504755496979, + "learning_rate": 5.020990535137541e-06, + "loss": 0.1103, + "step": 4756 + }, + { + "epoch": 1.5414776409591704, + "grad_norm": 0.47579801082611084, + "learning_rate": 5.019241332903406e-06, + "loss": 0.1245, + "step": 4757 + }, + { + "epoch": 1.541801685029164, + "grad_norm": 0.45955783128738403, + "learning_rate": 5.017492128314319e-06, + "loss": 0.1129, + "step": 4758 + }, + { + "epoch": 1.5421257290991575, + "grad_norm": 0.47311556339263916, + "learning_rate": 5.015742921584365e-06, + "loss": 0.1158, + "step": 4759 + }, + { + "epoch": 1.5424497731691509, + "grad_norm": 0.4540937840938568, + "learning_rate": 5.013993712927628e-06, + "loss": 0.1121, + "step": 4760 + }, + { + "epoch": 1.5427738172391445, + "grad_norm": 0.4233587682247162, + "learning_rate": 5.012244502558198e-06, + "loss": 0.108, + "step": 4761 + }, + { + "epoch": 1.5430978613091382, + "grad_norm": 0.5149884819984436, + "learning_rate": 5.0104952906901576e-06, + "loss": 0.1231, + "step": 4762 + }, + { + "epoch": 1.5434219053791316, + "grad_norm": 0.44611334800720215, + "learning_rate": 5.008746077537598e-06, + "loss": 0.1074, + "step": 4763 + }, + { + "epoch": 1.543745949449125, + "grad_norm": 0.5051946640014648, + "learning_rate": 5.0069968633146006e-06, + "loss": 0.1243, + "step": 4764 + }, + { + "epoch": 1.5440699935191184, + "grad_norm": 0.516586184501648, + "learning_rate": 5.005247648235257e-06, + "loss": 0.1251, + "step": 4765 + }, + { + "epoch": 1.544394037589112, + "grad_norm": 0.47310107946395874, + "learning_rate": 5.003498432513649e-06, + "loss": 0.1074, + "step": 4766 + }, + { + "epoch": 1.5447180816591057, + "grad_norm": 0.4739862084388733, + "learning_rate": 5.001749216363869e-06, + "loss": 0.1146, + "step": 4767 + }, + { + "epoch": 1.5450421257290992, + "grad_norm": 0.4782755374908447, + "learning_rate": 5e-06, + "loss": 0.1127, + "step": 4768 + }, + { + "epoch": 1.5453661697990926, + "grad_norm": 0.4727858304977417, + "learning_rate": 4.9982507836361335e-06, + "loss": 0.1141, + "step": 4769 + }, + { + "epoch": 1.5456902138690862, + "grad_norm": 0.4504522681236267, + "learning_rate": 4.996501567486352e-06, + "loss": 0.1149, + "step": 4770 + }, + { + "epoch": 1.5460142579390799, + "grad_norm": 0.5180319547653198, + "learning_rate": 4.994752351764747e-06, + "loss": 0.1215, + "step": 4771 + }, + { + "epoch": 1.5463383020090733, + "grad_norm": 0.46624067425727844, + "learning_rate": 4.993003136685401e-06, + "loss": 0.1155, + "step": 4772 + }, + { + "epoch": 1.5466623460790667, + "grad_norm": 0.47008898854255676, + "learning_rate": 4.991253922462405e-06, + "loss": 0.1049, + "step": 4773 + }, + { + "epoch": 1.5469863901490601, + "grad_norm": 0.4565297067165375, + "learning_rate": 4.989504709309842e-06, + "loss": 0.1124, + "step": 4774 + }, + { + "epoch": 1.5473104342190538, + "grad_norm": 0.4473397135734558, + "learning_rate": 4.9877554974418045e-06, + "loss": 0.1072, + "step": 4775 + }, + { + "epoch": 1.5476344782890474, + "grad_norm": 0.48428457975387573, + "learning_rate": 4.986006287072374e-06, + "loss": 0.1249, + "step": 4776 + }, + { + "epoch": 1.5479585223590409, + "grad_norm": 0.44468632340431213, + "learning_rate": 4.984257078415637e-06, + "loss": 0.1122, + "step": 4777 + }, + { + "epoch": 1.5482825664290343, + "grad_norm": 0.4435969591140747, + "learning_rate": 4.982507871685684e-06, + "loss": 0.1036, + "step": 4778 + }, + { + "epoch": 1.5486066104990277, + "grad_norm": 0.49106940627098083, + "learning_rate": 4.980758667096594e-06, + "loss": 0.1201, + "step": 4779 + }, + { + "epoch": 1.5489306545690213, + "grad_norm": 0.5072797536849976, + "learning_rate": 4.9790094648624605e-06, + "loss": 0.114, + "step": 4780 + }, + { + "epoch": 1.549254698639015, + "grad_norm": 0.44148191809654236, + "learning_rate": 4.977260265197365e-06, + "loss": 0.1056, + "step": 4781 + }, + { + "epoch": 1.5495787427090084, + "grad_norm": 0.43735694885253906, + "learning_rate": 4.975511068315391e-06, + "loss": 0.1013, + "step": 4782 + }, + { + "epoch": 1.5499027867790018, + "grad_norm": 0.43392884731292725, + "learning_rate": 4.9737618744306274e-06, + "loss": 0.0987, + "step": 4783 + }, + { + "epoch": 1.5502268308489955, + "grad_norm": 0.48035189509391785, + "learning_rate": 4.972012683757155e-06, + "loss": 0.1149, + "step": 4784 + }, + { + "epoch": 1.5505508749189891, + "grad_norm": 0.41511499881744385, + "learning_rate": 4.970263496509059e-06, + "loss": 0.0949, + "step": 4785 + }, + { + "epoch": 1.5508749189889826, + "grad_norm": 0.4699539542198181, + "learning_rate": 4.968514312900423e-06, + "loss": 0.1172, + "step": 4786 + }, + { + "epoch": 1.551198963058976, + "grad_norm": 0.460025817155838, + "learning_rate": 4.9667651331453315e-06, + "loss": 0.1053, + "step": 4787 + }, + { + "epoch": 1.5515230071289694, + "grad_norm": 0.4553222358226776, + "learning_rate": 4.965015957457866e-06, + "loss": 0.1137, + "step": 4788 + }, + { + "epoch": 1.551847051198963, + "grad_norm": 0.47193190455436707, + "learning_rate": 4.963266786052107e-06, + "loss": 0.1175, + "step": 4789 + }, + { + "epoch": 1.5521710952689567, + "grad_norm": 0.4407771825790405, + "learning_rate": 4.961517619142139e-06, + "loss": 0.1051, + "step": 4790 + }, + { + "epoch": 1.5524951393389501, + "grad_norm": 0.4660543203353882, + "learning_rate": 4.959768456942041e-06, + "loss": 0.1074, + "step": 4791 + }, + { + "epoch": 1.5528191834089435, + "grad_norm": 0.4992606043815613, + "learning_rate": 4.958019299665895e-06, + "loss": 0.1146, + "step": 4792 + }, + { + "epoch": 1.5531432274789372, + "grad_norm": 0.47650009393692017, + "learning_rate": 4.956270147527782e-06, + "loss": 0.1147, + "step": 4793 + }, + { + "epoch": 1.5534672715489306, + "grad_norm": 0.4672715961933136, + "learning_rate": 4.954521000741777e-06, + "loss": 0.1098, + "step": 4794 + }, + { + "epoch": 1.5537913156189243, + "grad_norm": 0.4578569829463959, + "learning_rate": 4.952771859521962e-06, + "loss": 0.1177, + "step": 4795 + }, + { + "epoch": 1.5541153596889177, + "grad_norm": 0.4631955325603485, + "learning_rate": 4.951022724082414e-06, + "loss": 0.1125, + "step": 4796 + }, + { + "epoch": 1.554439403758911, + "grad_norm": 0.41857489943504333, + "learning_rate": 4.949273594637213e-06, + "loss": 0.0922, + "step": 4797 + }, + { + "epoch": 1.5547634478289047, + "grad_norm": 0.43116259574890137, + "learning_rate": 4.947524471400428e-06, + "loss": 0.1115, + "step": 4798 + }, + { + "epoch": 1.5550874918988984, + "grad_norm": 0.4329448938369751, + "learning_rate": 4.945775354586144e-06, + "loss": 0.1026, + "step": 4799 + }, + { + "epoch": 1.5554115359688918, + "grad_norm": 0.4677078127861023, + "learning_rate": 4.944026244408431e-06, + "loss": 0.1103, + "step": 4800 + }, + { + "epoch": 1.5557355800388852, + "grad_norm": 0.4555562436580658, + "learning_rate": 4.942277141081361e-06, + "loss": 0.1044, + "step": 4801 + }, + { + "epoch": 1.5560596241088787, + "grad_norm": 0.4801924228668213, + "learning_rate": 4.940528044819013e-06, + "loss": 0.1152, + "step": 4802 + }, + { + "epoch": 1.5563836681788723, + "grad_norm": 0.4613364338874817, + "learning_rate": 4.938778955835454e-06, + "loss": 0.1128, + "step": 4803 + }, + { + "epoch": 1.556707712248866, + "grad_norm": 0.43879398703575134, + "learning_rate": 4.937029874344761e-06, + "loss": 0.1068, + "step": 4804 + }, + { + "epoch": 1.5570317563188594, + "grad_norm": 0.4469672739505768, + "learning_rate": 4.935280800561002e-06, + "loss": 0.1102, + "step": 4805 + }, + { + "epoch": 1.5573558003888528, + "grad_norm": 0.42352110147476196, + "learning_rate": 4.933531734698244e-06, + "loss": 0.0962, + "step": 4806 + }, + { + "epoch": 1.5576798444588464, + "grad_norm": 0.4701560437679291, + "learning_rate": 4.93178267697056e-06, + "loss": 0.1156, + "step": 4807 + }, + { + "epoch": 1.55800388852884, + "grad_norm": 0.4738878905773163, + "learning_rate": 4.930033627592014e-06, + "loss": 0.117, + "step": 4808 + }, + { + "epoch": 1.5583279325988335, + "grad_norm": 0.46682971715927124, + "learning_rate": 4.928284586776676e-06, + "loss": 0.1149, + "step": 4809 + }, + { + "epoch": 1.558651976668827, + "grad_norm": 0.4412629306316376, + "learning_rate": 4.9265355547386095e-06, + "loss": 0.1019, + "step": 4810 + }, + { + "epoch": 1.5589760207388204, + "grad_norm": 0.4332481026649475, + "learning_rate": 4.924786531691881e-06, + "loss": 0.0998, + "step": 4811 + }, + { + "epoch": 1.559300064808814, + "grad_norm": 0.46273818612098694, + "learning_rate": 4.923037517850554e-06, + "loss": 0.1141, + "step": 4812 + }, + { + "epoch": 1.5596241088788076, + "grad_norm": 0.4398345947265625, + "learning_rate": 4.921288513428687e-06, + "loss": 0.1053, + "step": 4813 + }, + { + "epoch": 1.559948152948801, + "grad_norm": 0.46956223249435425, + "learning_rate": 4.9195395186403455e-06, + "loss": 0.1148, + "step": 4814 + }, + { + "epoch": 1.5602721970187945, + "grad_norm": 0.4423372745513916, + "learning_rate": 4.917790533699587e-06, + "loss": 0.0994, + "step": 4815 + }, + { + "epoch": 1.560596241088788, + "grad_norm": 0.4394659996032715, + "learning_rate": 4.916041558820473e-06, + "loss": 0.1058, + "step": 4816 + }, + { + "epoch": 1.5609202851587816, + "grad_norm": 0.4328305721282959, + "learning_rate": 4.914292594217059e-06, + "loss": 0.1032, + "step": 4817 + }, + { + "epoch": 1.5612443292287752, + "grad_norm": 0.4785172939300537, + "learning_rate": 4.912543640103401e-06, + "loss": 0.1105, + "step": 4818 + }, + { + "epoch": 1.5615683732987686, + "grad_norm": 0.45546668767929077, + "learning_rate": 4.9107946966935555e-06, + "loss": 0.1097, + "step": 4819 + }, + { + "epoch": 1.561892417368762, + "grad_norm": 0.42712464928627014, + "learning_rate": 4.909045764201574e-06, + "loss": 0.1036, + "step": 4820 + }, + { + "epoch": 1.5622164614387557, + "grad_norm": 0.4540826380252838, + "learning_rate": 4.907296842841512e-06, + "loss": 0.1063, + "step": 4821 + }, + { + "epoch": 1.5625405055087493, + "grad_norm": 0.4345269799232483, + "learning_rate": 4.905547932827417e-06, + "loss": 0.1072, + "step": 4822 + }, + { + "epoch": 1.5628645495787428, + "grad_norm": 0.42495936155319214, + "learning_rate": 4.903799034373343e-06, + "loss": 0.0992, + "step": 4823 + }, + { + "epoch": 1.5631885936487362, + "grad_norm": 0.45917809009552, + "learning_rate": 4.902050147693336e-06, + "loss": 0.1129, + "step": 4824 + }, + { + "epoch": 1.5635126377187296, + "grad_norm": 0.4842449724674225, + "learning_rate": 4.90030127300144e-06, + "loss": 0.1156, + "step": 4825 + }, + { + "epoch": 1.5638366817887233, + "grad_norm": 0.44549980759620667, + "learning_rate": 4.898552410511706e-06, + "loss": 0.1087, + "step": 4826 + }, + { + "epoch": 1.564160725858717, + "grad_norm": 0.5054349899291992, + "learning_rate": 4.896803560438174e-06, + "loss": 0.1139, + "step": 4827 + }, + { + "epoch": 1.5644847699287103, + "grad_norm": 0.45134031772613525, + "learning_rate": 4.8950547229948874e-06, + "loss": 0.1026, + "step": 4828 + }, + { + "epoch": 1.5648088139987038, + "grad_norm": 0.46557843685150146, + "learning_rate": 4.893305898395887e-06, + "loss": 0.1127, + "step": 4829 + }, + { + "epoch": 1.5651328580686974, + "grad_norm": 0.44793131947517395, + "learning_rate": 4.89155708685521e-06, + "loss": 0.1105, + "step": 4830 + }, + { + "epoch": 1.5654569021386908, + "grad_norm": 0.4677371382713318, + "learning_rate": 4.889808288586897e-06, + "loss": 0.1005, + "step": 4831 + }, + { + "epoch": 1.5657809462086845, + "grad_norm": 0.4556576609611511, + "learning_rate": 4.888059503804981e-06, + "loss": 0.1122, + "step": 4832 + }, + { + "epoch": 1.5661049902786779, + "grad_norm": 0.4515218138694763, + "learning_rate": 4.8863107327235005e-06, + "loss": 0.1084, + "step": 4833 + }, + { + "epoch": 1.5664290343486713, + "grad_norm": 0.44593796133995056, + "learning_rate": 4.884561975556483e-06, + "loss": 0.1058, + "step": 4834 + }, + { + "epoch": 1.566753078418665, + "grad_norm": 0.4894297122955322, + "learning_rate": 4.882813232517965e-06, + "loss": 0.1187, + "step": 4835 + }, + { + "epoch": 1.5670771224886586, + "grad_norm": 0.441567599773407, + "learning_rate": 4.881064503821973e-06, + "loss": 0.1093, + "step": 4836 + }, + { + "epoch": 1.567401166558652, + "grad_norm": 0.4360569417476654, + "learning_rate": 4.879315789682533e-06, + "loss": 0.1079, + "step": 4837 + }, + { + "epoch": 1.5677252106286454, + "grad_norm": 0.4840516746044159, + "learning_rate": 4.877567090313671e-06, + "loss": 0.1228, + "step": 4838 + }, + { + "epoch": 1.5680492546986389, + "grad_norm": 0.4710215926170349, + "learning_rate": 4.875818405929413e-06, + "loss": 0.1118, + "step": 4839 + }, + { + "epoch": 1.5683732987686325, + "grad_norm": 0.43163785338401794, + "learning_rate": 4.874069736743781e-06, + "loss": 0.1013, + "step": 4840 + }, + { + "epoch": 1.5686973428386262, + "grad_norm": 0.43744251132011414, + "learning_rate": 4.872321082970792e-06, + "loss": 0.107, + "step": 4841 + }, + { + "epoch": 1.5690213869086196, + "grad_norm": 0.457549124956131, + "learning_rate": 4.870572444824469e-06, + "loss": 0.1083, + "step": 4842 + }, + { + "epoch": 1.569345430978613, + "grad_norm": 0.451315701007843, + "learning_rate": 4.868823822518825e-06, + "loss": 0.1123, + "step": 4843 + }, + { + "epoch": 1.5696694750486067, + "grad_norm": 0.3946089446544647, + "learning_rate": 4.867075216267873e-06, + "loss": 0.0902, + "step": 4844 + }, + { + "epoch": 1.5699935191186, + "grad_norm": 0.45311471819877625, + "learning_rate": 4.865326626285629e-06, + "loss": 0.1133, + "step": 4845 + }, + { + "epoch": 1.5703175631885937, + "grad_norm": 0.42419302463531494, + "learning_rate": 4.863578052786103e-06, + "loss": 0.0989, + "step": 4846 + }, + { + "epoch": 1.5706416072585871, + "grad_norm": 0.4588811993598938, + "learning_rate": 4.861829495983302e-06, + "loss": 0.1053, + "step": 4847 + }, + { + "epoch": 1.5709656513285806, + "grad_norm": 0.44163978099823, + "learning_rate": 4.860080956091234e-06, + "loss": 0.1155, + "step": 4848 + }, + { + "epoch": 1.5712896953985742, + "grad_norm": 0.44390761852264404, + "learning_rate": 4.858332433323902e-06, + "loss": 0.1041, + "step": 4849 + }, + { + "epoch": 1.5716137394685679, + "grad_norm": 0.47635310888290405, + "learning_rate": 4.856583927895309e-06, + "loss": 0.1112, + "step": 4850 + }, + { + "epoch": 1.5719377835385613, + "grad_norm": 0.45071783661842346, + "learning_rate": 4.854835440019453e-06, + "loss": 0.1029, + "step": 4851 + }, + { + "epoch": 1.5722618276085547, + "grad_norm": 0.47139331698417664, + "learning_rate": 4.853086969910336e-06, + "loss": 0.1149, + "step": 4852 + }, + { + "epoch": 1.5725858716785481, + "grad_norm": 0.5124130249023438, + "learning_rate": 4.85133851778195e-06, + "loss": 0.1167, + "step": 4853 + }, + { + "epoch": 1.5729099157485418, + "grad_norm": 0.45137298107147217, + "learning_rate": 4.8495900838482915e-06, + "loss": 0.116, + "step": 4854 + }, + { + "epoch": 1.5732339598185354, + "grad_norm": 0.45909252762794495, + "learning_rate": 4.847841668323351e-06, + "loss": 0.1119, + "step": 4855 + }, + { + "epoch": 1.5735580038885288, + "grad_norm": 0.4754926860332489, + "learning_rate": 4.846093271421115e-06, + "loss": 0.1099, + "step": 4856 + }, + { + "epoch": 1.5738820479585223, + "grad_norm": 0.4783075451850891, + "learning_rate": 4.844344893355575e-06, + "loss": 0.118, + "step": 4857 + }, + { + "epoch": 1.574206092028516, + "grad_norm": 0.45256945490837097, + "learning_rate": 4.842596534340712e-06, + "loss": 0.1053, + "step": 4858 + }, + { + "epoch": 1.5745301360985096, + "grad_norm": 0.4893447458744049, + "learning_rate": 4.84084819459051e-06, + "loss": 0.1132, + "step": 4859 + }, + { + "epoch": 1.574854180168503, + "grad_norm": 0.47987404465675354, + "learning_rate": 4.839099874318948e-06, + "loss": 0.1169, + "step": 4860 + }, + { + "epoch": 1.5751782242384964, + "grad_norm": 0.4620630145072937, + "learning_rate": 4.837351573740004e-06, + "loss": 0.1033, + "step": 4861 + }, + { + "epoch": 1.5755022683084898, + "grad_norm": 0.48919492959976196, + "learning_rate": 4.835603293067653e-06, + "loss": 0.1187, + "step": 4862 + }, + { + "epoch": 1.5758263123784835, + "grad_norm": 0.45602983236312866, + "learning_rate": 4.833855032515866e-06, + "loss": 0.1127, + "step": 4863 + }, + { + "epoch": 1.5761503564484771, + "grad_norm": 0.4560927152633667, + "learning_rate": 4.832106792298616e-06, + "loss": 0.1142, + "step": 4864 + }, + { + "epoch": 1.5764744005184705, + "grad_norm": 0.48238566517829895, + "learning_rate": 4.830358572629868e-06, + "loss": 0.1113, + "step": 4865 + }, + { + "epoch": 1.576798444588464, + "grad_norm": 0.46003594994544983, + "learning_rate": 4.82861037372359e-06, + "loss": 0.1095, + "step": 4866 + }, + { + "epoch": 1.5771224886584574, + "grad_norm": 0.46108075976371765, + "learning_rate": 4.826862195793743e-06, + "loss": 0.1192, + "step": 4867 + }, + { + "epoch": 1.577446532728451, + "grad_norm": 0.45616206526756287, + "learning_rate": 4.825114039054286e-06, + "loss": 0.1084, + "step": 4868 + }, + { + "epoch": 1.5777705767984447, + "grad_norm": 0.45565494894981384, + "learning_rate": 4.823365903719179e-06, + "loss": 0.1098, + "step": 4869 + }, + { + "epoch": 1.578094620868438, + "grad_norm": 0.42677366733551025, + "learning_rate": 4.821617790002374e-06, + "loss": 0.1021, + "step": 4870 + }, + { + "epoch": 1.5784186649384315, + "grad_norm": 0.4569999873638153, + "learning_rate": 4.819869698117826e-06, + "loss": 0.1168, + "step": 4871 + }, + { + "epoch": 1.5787427090084252, + "grad_norm": 0.4749670922756195, + "learning_rate": 4.818121628279483e-06, + "loss": 0.1223, + "step": 4872 + }, + { + "epoch": 1.5790667530784188, + "grad_norm": 0.46919116377830505, + "learning_rate": 4.81637358070129e-06, + "loss": 0.1075, + "step": 4873 + }, + { + "epoch": 1.5793907971484122, + "grad_norm": 0.4429967403411865, + "learning_rate": 4.8146255555971964e-06, + "loss": 0.0993, + "step": 4874 + }, + { + "epoch": 1.5797148412184057, + "grad_norm": 0.41248396039009094, + "learning_rate": 4.812877553181136e-06, + "loss": 0.0934, + "step": 4875 + }, + { + "epoch": 1.580038885288399, + "grad_norm": 0.43637144565582275, + "learning_rate": 4.8111295736670545e-06, + "loss": 0.1069, + "step": 4876 + }, + { + "epoch": 1.5803629293583927, + "grad_norm": 0.4072212278842926, + "learning_rate": 4.809381617268884e-06, + "loss": 0.101, + "step": 4877 + }, + { + "epoch": 1.5806869734283864, + "grad_norm": 0.48271292448043823, + "learning_rate": 4.807633684200559e-06, + "loss": 0.1169, + "step": 4878 + }, + { + "epoch": 1.5810110174983798, + "grad_norm": 0.4623858332633972, + "learning_rate": 4.805885774676009e-06, + "loss": 0.111, + "step": 4879 + }, + { + "epoch": 1.5813350615683732, + "grad_norm": 0.4412733316421509, + "learning_rate": 4.804137888909159e-06, + "loss": 0.1021, + "step": 4880 + }, + { + "epoch": 1.5816591056383669, + "grad_norm": 0.48238319158554077, + "learning_rate": 4.802390027113938e-06, + "loss": 0.1137, + "step": 4881 + }, + { + "epoch": 1.5819831497083603, + "grad_norm": 0.4745473265647888, + "learning_rate": 4.800642189504262e-06, + "loss": 0.1126, + "step": 4882 + }, + { + "epoch": 1.582307193778354, + "grad_norm": 0.46913549304008484, + "learning_rate": 4.798894376294054e-06, + "loss": 0.1094, + "step": 4883 + }, + { + "epoch": 1.5826312378483474, + "grad_norm": 0.4792521297931671, + "learning_rate": 4.7971465876972274e-06, + "loss": 0.1053, + "step": 4884 + }, + { + "epoch": 1.5829552819183408, + "grad_norm": 0.4615633189678192, + "learning_rate": 4.795398823927693e-06, + "loss": 0.1135, + "step": 4885 + }, + { + "epoch": 1.5832793259883344, + "grad_norm": 0.4534716010093689, + "learning_rate": 4.7936510851993635e-06, + "loss": 0.1097, + "step": 4886 + }, + { + "epoch": 1.583603370058328, + "grad_norm": 0.4541871249675751, + "learning_rate": 4.791903371726141e-06, + "loss": 0.1051, + "step": 4887 + }, + { + "epoch": 1.5839274141283215, + "grad_norm": 0.47825732827186584, + "learning_rate": 4.790155683721935e-06, + "loss": 0.1121, + "step": 4888 + }, + { + "epoch": 1.584251458198315, + "grad_norm": 0.4308681786060333, + "learning_rate": 4.78840802140064e-06, + "loss": 0.0917, + "step": 4889 + }, + { + "epoch": 1.5845755022683083, + "grad_norm": 0.4852618873119354, + "learning_rate": 4.7866603849761535e-06, + "loss": 0.1168, + "step": 4890 + }, + { + "epoch": 1.584899546338302, + "grad_norm": 0.41038641333580017, + "learning_rate": 4.7849127746623735e-06, + "loss": 0.1029, + "step": 4891 + }, + { + "epoch": 1.5852235904082956, + "grad_norm": 0.4608546495437622, + "learning_rate": 4.783165190673186e-06, + "loss": 0.1084, + "step": 4892 + }, + { + "epoch": 1.585547634478289, + "grad_norm": 0.513335645198822, + "learning_rate": 4.781417633222481e-06, + "loss": 0.1186, + "step": 4893 + }, + { + "epoch": 1.5858716785482825, + "grad_norm": 0.46274006366729736, + "learning_rate": 4.779670102524139e-06, + "loss": 0.1118, + "step": 4894 + }, + { + "epoch": 1.5861957226182761, + "grad_norm": 0.48566991090774536, + "learning_rate": 4.777922598792047e-06, + "loss": 0.1218, + "step": 4895 + }, + { + "epoch": 1.5865197666882696, + "grad_norm": 0.45120298862457275, + "learning_rate": 4.776175122240077e-06, + "loss": 0.1126, + "step": 4896 + }, + { + "epoch": 1.5868438107582632, + "grad_norm": 0.46900275349617004, + "learning_rate": 4.774427673082105e-06, + "loss": 0.1143, + "step": 4897 + }, + { + "epoch": 1.5871678548282566, + "grad_norm": 0.477458655834198, + "learning_rate": 4.772680251532003e-06, + "loss": 0.1144, + "step": 4898 + }, + { + "epoch": 1.58749189889825, + "grad_norm": 0.42421650886535645, + "learning_rate": 4.7709328578036365e-06, + "loss": 0.1071, + "step": 4899 + }, + { + "epoch": 1.5878159429682437, + "grad_norm": 0.4880410432815552, + "learning_rate": 4.769185492110873e-06, + "loss": 0.1218, + "step": 4900 + }, + { + "epoch": 1.5881399870382373, + "grad_norm": 0.4839230477809906, + "learning_rate": 4.767438154667568e-06, + "loss": 0.1096, + "step": 4901 + }, + { + "epoch": 1.5884640311082308, + "grad_norm": 0.45489901304244995, + "learning_rate": 4.765690845687584e-06, + "loss": 0.1192, + "step": 4902 + }, + { + "epoch": 1.5887880751782242, + "grad_norm": 0.4356914460659027, + "learning_rate": 4.763943565384772e-06, + "loss": 0.1093, + "step": 4903 + }, + { + "epoch": 1.5891121192482176, + "grad_norm": 0.4305530786514282, + "learning_rate": 4.7621963139729795e-06, + "loss": 0.1021, + "step": 4904 + }, + { + "epoch": 1.5894361633182112, + "grad_norm": 0.43889176845550537, + "learning_rate": 4.76044909166606e-06, + "loss": 0.1029, + "step": 4905 + }, + { + "epoch": 1.589760207388205, + "grad_norm": 0.4710550904273987, + "learning_rate": 4.758701898677848e-06, + "loss": 0.1161, + "step": 4906 + }, + { + "epoch": 1.5900842514581983, + "grad_norm": 0.44871771335601807, + "learning_rate": 4.756954735222192e-06, + "loss": 0.1169, + "step": 4907 + }, + { + "epoch": 1.5904082955281917, + "grad_norm": 0.4578435719013214, + "learning_rate": 4.755207601512922e-06, + "loss": 0.1081, + "step": 4908 + }, + { + "epoch": 1.5907323395981854, + "grad_norm": 0.4500477612018585, + "learning_rate": 4.75346049776387e-06, + "loss": 0.1108, + "step": 4909 + }, + { + "epoch": 1.591056383668179, + "grad_norm": 0.44578132033348083, + "learning_rate": 4.751713424188869e-06, + "loss": 0.1028, + "step": 4910 + }, + { + "epoch": 1.5913804277381725, + "grad_norm": 0.4344480037689209, + "learning_rate": 4.749966381001741e-06, + "loss": 0.1079, + "step": 4911 + }, + { + "epoch": 1.5917044718081659, + "grad_norm": 0.42421379685401917, + "learning_rate": 4.748219368416306e-06, + "loss": 0.1009, + "step": 4912 + }, + { + "epoch": 1.5920285158781593, + "grad_norm": 0.45676690340042114, + "learning_rate": 4.746472386646383e-06, + "loss": 0.1078, + "step": 4913 + }, + { + "epoch": 1.592352559948153, + "grad_norm": 0.4680733382701874, + "learning_rate": 4.744725435905787e-06, + "loss": 0.1063, + "step": 4914 + }, + { + "epoch": 1.5926766040181466, + "grad_norm": 0.46036866307258606, + "learning_rate": 4.742978516408326e-06, + "loss": 0.1012, + "step": 4915 + }, + { + "epoch": 1.59300064808814, + "grad_norm": 0.4334714412689209, + "learning_rate": 4.741231628367805e-06, + "loss": 0.1049, + "step": 4916 + }, + { + "epoch": 1.5933246921581334, + "grad_norm": 0.4619475305080414, + "learning_rate": 4.739484771998029e-06, + "loss": 0.1124, + "step": 4917 + }, + { + "epoch": 1.5936487362281269, + "grad_norm": 0.45115354657173157, + "learning_rate": 4.737737947512793e-06, + "loss": 0.1089, + "step": 4918 + }, + { + "epoch": 1.5939727802981205, + "grad_norm": 0.46323361992836, + "learning_rate": 4.735991155125896e-06, + "loss": 0.1125, + "step": 4919 + }, + { + "epoch": 1.5942968243681142, + "grad_norm": 0.4536522328853607, + "learning_rate": 4.734244395051123e-06, + "loss": 0.1078, + "step": 4920 + }, + { + "epoch": 1.5946208684381076, + "grad_norm": 0.4844619631767273, + "learning_rate": 4.732497667502266e-06, + "loss": 0.1117, + "step": 4921 + }, + { + "epoch": 1.594944912508101, + "grad_norm": 0.47589054703712463, + "learning_rate": 4.730750972693104e-06, + "loss": 0.1229, + "step": 4922 + }, + { + "epoch": 1.5952689565780946, + "grad_norm": 0.4797200858592987, + "learning_rate": 4.729004310837417e-06, + "loss": 0.1115, + "step": 4923 + }, + { + "epoch": 1.5955930006480883, + "grad_norm": 0.49576741456985474, + "learning_rate": 4.727257682148979e-06, + "loss": 0.1177, + "step": 4924 + }, + { + "epoch": 1.5959170447180817, + "grad_norm": 0.4299625754356384, + "learning_rate": 4.725511086841557e-06, + "loss": 0.1033, + "step": 4925 + }, + { + "epoch": 1.5962410887880751, + "grad_norm": 0.5060962438583374, + "learning_rate": 4.723764525128925e-06, + "loss": 0.127, + "step": 4926 + }, + { + "epoch": 1.5965651328580686, + "grad_norm": 0.46263906359672546, + "learning_rate": 4.72201799722484e-06, + "loss": 0.115, + "step": 4927 + }, + { + "epoch": 1.5968891769280622, + "grad_norm": 0.42641258239746094, + "learning_rate": 4.720271503343059e-06, + "loss": 0.1085, + "step": 4928 + }, + { + "epoch": 1.5972132209980558, + "grad_norm": 0.4287305772304535, + "learning_rate": 4.71852504369734e-06, + "loss": 0.1099, + "step": 4929 + }, + { + "epoch": 1.5975372650680493, + "grad_norm": 0.4682078957557678, + "learning_rate": 4.716778618501429e-06, + "loss": 0.1144, + "step": 4930 + }, + { + "epoch": 1.5978613091380427, + "grad_norm": 0.443132609128952, + "learning_rate": 4.715032227969075e-06, + "loss": 0.1045, + "step": 4931 + }, + { + "epoch": 1.5981853532080363, + "grad_norm": 0.3979075253009796, + "learning_rate": 4.713285872314016e-06, + "loss": 0.0959, + "step": 4932 + }, + { + "epoch": 1.5985093972780298, + "grad_norm": 0.43410465121269226, + "learning_rate": 4.711539551749993e-06, + "loss": 0.0988, + "step": 4933 + }, + { + "epoch": 1.5988334413480234, + "grad_norm": 0.4905455708503723, + "learning_rate": 4.709793266490735e-06, + "loss": 0.1202, + "step": 4934 + }, + { + "epoch": 1.5991574854180168, + "grad_norm": 0.45668309926986694, + "learning_rate": 4.7080470167499705e-06, + "loss": 0.11, + "step": 4935 + }, + { + "epoch": 1.5994815294880103, + "grad_norm": 0.45235005021095276, + "learning_rate": 4.706300802741427e-06, + "loss": 0.1109, + "step": 4936 + }, + { + "epoch": 1.599805573558004, + "grad_norm": 0.4434433877468109, + "learning_rate": 4.70455462467882e-06, + "loss": 0.1076, + "step": 4937 + }, + { + "epoch": 1.6001296176279975, + "grad_norm": 0.43349820375442505, + "learning_rate": 4.702808482775869e-06, + "loss": 0.1068, + "step": 4938 + }, + { + "epoch": 1.600453661697991, + "grad_norm": 0.45358890295028687, + "learning_rate": 4.701062377246282e-06, + "loss": 0.1063, + "step": 4939 + }, + { + "epoch": 1.6007777057679844, + "grad_norm": 0.4753774404525757, + "learning_rate": 4.699316308303764e-06, + "loss": 0.1064, + "step": 4940 + }, + { + "epoch": 1.6011017498379778, + "grad_norm": 0.45095357298851013, + "learning_rate": 4.697570276162021e-06, + "loss": 0.1153, + "step": 4941 + }, + { + "epoch": 1.6014257939079715, + "grad_norm": 0.47847557067871094, + "learning_rate": 4.695824281034747e-06, + "loss": 0.1183, + "step": 4942 + }, + { + "epoch": 1.601749837977965, + "grad_norm": 0.44354525208473206, + "learning_rate": 4.694078323135638e-06, + "loss": 0.1026, + "step": 4943 + }, + { + "epoch": 1.6020738820479585, + "grad_norm": 0.4864445924758911, + "learning_rate": 4.69233240267838e-06, + "loss": 0.1131, + "step": 4944 + }, + { + "epoch": 1.602397926117952, + "grad_norm": 0.4234994649887085, + "learning_rate": 4.690586519876658e-06, + "loss": 0.103, + "step": 4945 + }, + { + "epoch": 1.6027219701879456, + "grad_norm": 0.43492475152015686, + "learning_rate": 4.688840674944151e-06, + "loss": 0.1085, + "step": 4946 + }, + { + "epoch": 1.6030460142579392, + "grad_norm": 0.4122197926044464, + "learning_rate": 4.687094868094531e-06, + "loss": 0.1008, + "step": 4947 + }, + { + "epoch": 1.6033700583279327, + "grad_norm": 0.46981731057167053, + "learning_rate": 4.685349099541473e-06, + "loss": 0.1127, + "step": 4948 + }, + { + "epoch": 1.603694102397926, + "grad_norm": 0.4796302914619446, + "learning_rate": 4.683603369498636e-06, + "loss": 0.1127, + "step": 4949 + }, + { + "epoch": 1.6040181464679195, + "grad_norm": 0.44304171204566956, + "learning_rate": 4.681857678179685e-06, + "loss": 0.1103, + "step": 4950 + }, + { + "epoch": 1.6043421905379132, + "grad_norm": 0.4452519714832306, + "learning_rate": 4.680112025798275e-06, + "loss": 0.1099, + "step": 4951 + }, + { + "epoch": 1.6046662346079068, + "grad_norm": 0.464995801448822, + "learning_rate": 4.678366412568055e-06, + "loss": 0.1123, + "step": 4952 + }, + { + "epoch": 1.6049902786779002, + "grad_norm": 0.46111589670181274, + "learning_rate": 4.676620838702674e-06, + "loss": 0.1055, + "step": 4953 + }, + { + "epoch": 1.6053143227478937, + "grad_norm": 0.43135613203048706, + "learning_rate": 4.6748753044157705e-06, + "loss": 0.0967, + "step": 4954 + }, + { + "epoch": 1.605638366817887, + "grad_norm": 0.4677044153213501, + "learning_rate": 4.673129809920983e-06, + "loss": 0.1084, + "step": 4955 + }, + { + "epoch": 1.6059624108878807, + "grad_norm": 0.5228461623191833, + "learning_rate": 4.671384355431941e-06, + "loss": 0.131, + "step": 4956 + }, + { + "epoch": 1.6062864549578744, + "grad_norm": 0.44885966181755066, + "learning_rate": 4.669638941162274e-06, + "loss": 0.104, + "step": 4957 + }, + { + "epoch": 1.6066104990278678, + "grad_norm": 0.4723048210144043, + "learning_rate": 4.6678935673256036e-06, + "loss": 0.1108, + "step": 4958 + }, + { + "epoch": 1.6069345430978612, + "grad_norm": 0.5116240382194519, + "learning_rate": 4.666148234135543e-06, + "loss": 0.1297, + "step": 4959 + }, + { + "epoch": 1.6072585871678549, + "grad_norm": 0.4954541027545929, + "learning_rate": 4.664402941805709e-06, + "loss": 0.1221, + "step": 4960 + }, + { + "epoch": 1.6075826312378485, + "grad_norm": 0.45687317848205566, + "learning_rate": 4.662657690549703e-06, + "loss": 0.1065, + "step": 4961 + }, + { + "epoch": 1.607906675307842, + "grad_norm": 0.45222848653793335, + "learning_rate": 4.6609124805811325e-06, + "loss": 0.1117, + "step": 4962 + }, + { + "epoch": 1.6082307193778353, + "grad_norm": 0.48122653365135193, + "learning_rate": 4.659167312113592e-06, + "loss": 0.107, + "step": 4963 + }, + { + "epoch": 1.6085547634478288, + "grad_norm": 0.47544240951538086, + "learning_rate": 4.657422185360671e-06, + "loss": 0.1191, + "step": 4964 + }, + { + "epoch": 1.6088788075178224, + "grad_norm": 0.47073894739151, + "learning_rate": 4.65567710053596e-06, + "loss": 0.1115, + "step": 4965 + }, + { + "epoch": 1.609202851587816, + "grad_norm": 0.49683570861816406, + "learning_rate": 4.653932057853037e-06, + "loss": 0.1252, + "step": 4966 + }, + { + "epoch": 1.6095268956578095, + "grad_norm": 0.5067100524902344, + "learning_rate": 4.6521870575254815e-06, + "loss": 0.1291, + "step": 4967 + }, + { + "epoch": 1.609850939727803, + "grad_norm": 0.4912336766719818, + "learning_rate": 4.650442099766861e-06, + "loss": 0.119, + "step": 4968 + }, + { + "epoch": 1.6101749837977966, + "grad_norm": 0.4592455327510834, + "learning_rate": 4.648697184790745e-06, + "loss": 0.1073, + "step": 4969 + }, + { + "epoch": 1.61049902786779, + "grad_norm": 0.49731016159057617, + "learning_rate": 4.646952312810694e-06, + "loss": 0.1223, + "step": 4970 + }, + { + "epoch": 1.6108230719377836, + "grad_norm": 0.4806722402572632, + "learning_rate": 4.645207484040259e-06, + "loss": 0.1132, + "step": 4971 + }, + { + "epoch": 1.611147116007777, + "grad_norm": 0.474865198135376, + "learning_rate": 4.6434626986929954e-06, + "loss": 0.1147, + "step": 4972 + }, + { + "epoch": 1.6114711600777705, + "grad_norm": 0.4066011905670166, + "learning_rate": 4.641717956982444e-06, + "loss": 0.0925, + "step": 4973 + }, + { + "epoch": 1.6117952041477641, + "grad_norm": 0.4577323794364929, + "learning_rate": 4.639973259122148e-06, + "loss": 0.1128, + "step": 4974 + }, + { + "epoch": 1.6121192482177578, + "grad_norm": 0.47177231311798096, + "learning_rate": 4.638228605325641e-06, + "loss": 0.11, + "step": 4975 + }, + { + "epoch": 1.6124432922877512, + "grad_norm": 0.45513543486595154, + "learning_rate": 4.636483995806448e-06, + "loss": 0.1096, + "step": 4976 + }, + { + "epoch": 1.6127673363577446, + "grad_norm": 0.45789656043052673, + "learning_rate": 4.634739430778097e-06, + "loss": 0.1162, + "step": 4977 + }, + { + "epoch": 1.613091380427738, + "grad_norm": 0.4321471154689789, + "learning_rate": 4.6329949104541e-06, + "loss": 0.1006, + "step": 4978 + }, + { + "epoch": 1.6134154244977317, + "grad_norm": 0.45868197083473206, + "learning_rate": 4.631250435047977e-06, + "loss": 0.1118, + "step": 4979 + }, + { + "epoch": 1.6137394685677253, + "grad_norm": 0.4248989522457123, + "learning_rate": 4.629506004773227e-06, + "loss": 0.1028, + "step": 4980 + }, + { + "epoch": 1.6140635126377187, + "grad_norm": 0.5093784928321838, + "learning_rate": 4.627761619843359e-06, + "loss": 0.1277, + "step": 4981 + }, + { + "epoch": 1.6143875567077122, + "grad_norm": 0.43014755845069885, + "learning_rate": 4.626017280471865e-06, + "loss": 0.0985, + "step": 4982 + }, + { + "epoch": 1.6147116007777058, + "grad_norm": 0.47510039806365967, + "learning_rate": 4.624272986872234e-06, + "loss": 0.1129, + "step": 4983 + }, + { + "epoch": 1.6150356448476992, + "grad_norm": 0.4764617383480072, + "learning_rate": 4.622528739257952e-06, + "loss": 0.1209, + "step": 4984 + }, + { + "epoch": 1.6153596889176929, + "grad_norm": 0.4343688189983368, + "learning_rate": 4.620784537842499e-06, + "loss": 0.1084, + "step": 4985 + }, + { + "epoch": 1.6156837329876863, + "grad_norm": 0.4666691720485687, + "learning_rate": 4.6190403828393464e-06, + "loss": 0.1077, + "step": 4986 + }, + { + "epoch": 1.6160077770576797, + "grad_norm": 0.44340160489082336, + "learning_rate": 4.617296274461964e-06, + "loss": 0.0994, + "step": 4987 + }, + { + "epoch": 1.6163318211276734, + "grad_norm": 0.4468873143196106, + "learning_rate": 4.6155522129238124e-06, + "loss": 0.1084, + "step": 4988 + }, + { + "epoch": 1.616655865197667, + "grad_norm": 0.44237035512924194, + "learning_rate": 4.613808198438349e-06, + "loss": 0.1041, + "step": 4989 + }, + { + "epoch": 1.6169799092676604, + "grad_norm": 0.44018039107322693, + "learning_rate": 4.612064231219021e-06, + "loss": 0.101, + "step": 4990 + }, + { + "epoch": 1.6173039533376539, + "grad_norm": 0.41502729058265686, + "learning_rate": 4.610320311479279e-06, + "loss": 0.0933, + "step": 4991 + }, + { + "epoch": 1.6176279974076473, + "grad_norm": 0.4824730157852173, + "learning_rate": 4.608576439432555e-06, + "loss": 0.1183, + "step": 4992 + }, + { + "epoch": 1.617952041477641, + "grad_norm": 0.4967108368873596, + "learning_rate": 4.606832615292288e-06, + "loss": 0.1192, + "step": 4993 + }, + { + "epoch": 1.6182760855476346, + "grad_norm": 0.4692974090576172, + "learning_rate": 4.605088839271903e-06, + "loss": 0.1091, + "step": 4994 + }, + { + "epoch": 1.618600129617628, + "grad_norm": 0.43657752871513367, + "learning_rate": 4.603345111584819e-06, + "loss": 0.1037, + "step": 4995 + }, + { + "epoch": 1.6189241736876214, + "grad_norm": 0.48579537868499756, + "learning_rate": 4.6016014324444545e-06, + "loss": 0.1194, + "step": 4996 + }, + { + "epoch": 1.619248217757615, + "grad_norm": 0.44366809725761414, + "learning_rate": 4.5998578020642185e-06, + "loss": 0.1083, + "step": 4997 + }, + { + "epoch": 1.6195722618276087, + "grad_norm": 0.43966636061668396, + "learning_rate": 4.598114220657514e-06, + "loss": 0.101, + "step": 4998 + }, + { + "epoch": 1.6198963058976021, + "grad_norm": 0.4372827708721161, + "learning_rate": 4.596370688437736e-06, + "loss": 0.1036, + "step": 4999 + }, + { + "epoch": 1.6202203499675956, + "grad_norm": 0.47269198298454285, + "learning_rate": 4.59462720561828e-06, + "loss": 0.1055, + "step": 5000 + }, + { + "epoch": 1.620544394037589, + "grad_norm": 0.5441954135894775, + "learning_rate": 4.592883772412531e-06, + "loss": 0.1284, + "step": 5001 + }, + { + "epoch": 1.6208684381075826, + "grad_norm": 0.44606369733810425, + "learning_rate": 4.591140389033863e-06, + "loss": 0.1076, + "step": 5002 + }, + { + "epoch": 1.6211924821775763, + "grad_norm": 0.4738067388534546, + "learning_rate": 4.589397055695658e-06, + "loss": 0.1144, + "step": 5003 + }, + { + "epoch": 1.6215165262475697, + "grad_norm": 0.44347885251045227, + "learning_rate": 4.587653772611275e-06, + "loss": 0.1049, + "step": 5004 + }, + { + "epoch": 1.6218405703175631, + "grad_norm": 0.49658679962158203, + "learning_rate": 4.58591053999408e-06, + "loss": 0.1182, + "step": 5005 + }, + { + "epoch": 1.6221646143875565, + "grad_norm": 0.5013179183006287, + "learning_rate": 4.584167358057427e-06, + "loss": 0.1141, + "step": 5006 + }, + { + "epoch": 1.6224886584575502, + "grad_norm": 0.429353266954422, + "learning_rate": 4.582424227014662e-06, + "loss": 0.1069, + "step": 5007 + }, + { + "epoch": 1.6228127025275438, + "grad_norm": 0.4228212833404541, + "learning_rate": 4.58068114707913e-06, + "loss": 0.1019, + "step": 5008 + }, + { + "epoch": 1.6231367465975373, + "grad_norm": 0.49328097701072693, + "learning_rate": 4.5789381184641655e-06, + "loss": 0.1215, + "step": 5009 + }, + { + "epoch": 1.6234607906675307, + "grad_norm": 0.4080681800842285, + "learning_rate": 4.577195141383101e-06, + "loss": 0.0916, + "step": 5010 + }, + { + "epoch": 1.6237848347375243, + "grad_norm": 0.45990294218063354, + "learning_rate": 4.575452216049256e-06, + "loss": 0.1064, + "step": 5011 + }, + { + "epoch": 1.624108878807518, + "grad_norm": 0.4920955002307892, + "learning_rate": 4.573709342675951e-06, + "loss": 0.1205, + "step": 5012 + }, + { + "epoch": 1.6244329228775114, + "grad_norm": 0.4758051931858063, + "learning_rate": 4.571966521476496e-06, + "loss": 0.112, + "step": 5013 + }, + { + "epoch": 1.6247569669475048, + "grad_norm": 0.4831903278827667, + "learning_rate": 4.570223752664194e-06, + "loss": 0.1207, + "step": 5014 + }, + { + "epoch": 1.6250810110174982, + "grad_norm": 0.49597156047821045, + "learning_rate": 4.568481036452345e-06, + "loss": 0.1099, + "step": 5015 + }, + { + "epoch": 1.625405055087492, + "grad_norm": 0.4464362561702728, + "learning_rate": 4.566738373054238e-06, + "loss": 0.1136, + "step": 5016 + }, + { + "epoch": 1.6257290991574855, + "grad_norm": 0.47042131423950195, + "learning_rate": 4.564995762683162e-06, + "loss": 0.1163, + "step": 5017 + }, + { + "epoch": 1.626053143227479, + "grad_norm": 0.4361095726490021, + "learning_rate": 4.563253205552393e-06, + "loss": 0.1104, + "step": 5018 + }, + { + "epoch": 1.6263771872974724, + "grad_norm": 0.4437979757785797, + "learning_rate": 4.561510701875204e-06, + "loss": 0.109, + "step": 5019 + }, + { + "epoch": 1.626701231367466, + "grad_norm": 0.427032470703125, + "learning_rate": 4.55976825186486e-06, + "loss": 0.104, + "step": 5020 + }, + { + "epoch": 1.6270252754374595, + "grad_norm": 0.4684242904186249, + "learning_rate": 4.558025855734618e-06, + "loss": 0.113, + "step": 5021 + }, + { + "epoch": 1.627349319507453, + "grad_norm": 0.4585596024990082, + "learning_rate": 4.5562835136977355e-06, + "loss": 0.1179, + "step": 5022 + }, + { + "epoch": 1.6276733635774465, + "grad_norm": 0.4794493019580841, + "learning_rate": 4.554541225967452e-06, + "loss": 0.1186, + "step": 5023 + }, + { + "epoch": 1.62799740764744, + "grad_norm": 0.46028104424476624, + "learning_rate": 4.552798992757013e-06, + "loss": 0.1122, + "step": 5024 + }, + { + "epoch": 1.6283214517174336, + "grad_norm": 0.4441133439540863, + "learning_rate": 4.5510568142796485e-06, + "loss": 0.1011, + "step": 5025 + }, + { + "epoch": 1.6286454957874272, + "grad_norm": 0.41523277759552, + "learning_rate": 4.549314690748581e-06, + "loss": 0.0969, + "step": 5026 + }, + { + "epoch": 1.6289695398574207, + "grad_norm": 0.4720069169998169, + "learning_rate": 4.547572622377035e-06, + "loss": 0.1141, + "step": 5027 + }, + { + "epoch": 1.629293583927414, + "grad_norm": 0.45224857330322266, + "learning_rate": 4.545830609378219e-06, + "loss": 0.1108, + "step": 5028 + }, + { + "epoch": 1.6296176279974075, + "grad_norm": 0.4049452543258667, + "learning_rate": 4.5440886519653404e-06, + "loss": 0.0942, + "step": 5029 + }, + { + "epoch": 1.6299416720674011, + "grad_norm": 0.48412343859672546, + "learning_rate": 4.542346750351597e-06, + "loss": 0.1161, + "step": 5030 + }, + { + "epoch": 1.6302657161373948, + "grad_norm": 0.4523051381111145, + "learning_rate": 4.54060490475018e-06, + "loss": 0.1043, + "step": 5031 + }, + { + "epoch": 1.6305897602073882, + "grad_norm": 0.44593173265457153, + "learning_rate": 4.538863115374277e-06, + "loss": 0.1093, + "step": 5032 + }, + { + "epoch": 1.6309138042773816, + "grad_norm": 0.4349612295627594, + "learning_rate": 4.537121382437062e-06, + "loss": 0.0989, + "step": 5033 + }, + { + "epoch": 1.6312378483473753, + "grad_norm": 0.5067662596702576, + "learning_rate": 4.535379706151711e-06, + "loss": 0.121, + "step": 5034 + }, + { + "epoch": 1.6315618924173687, + "grad_norm": 0.46852073073387146, + "learning_rate": 4.533638086731384e-06, + "loss": 0.1083, + "step": 5035 + }, + { + "epoch": 1.6318859364873624, + "grad_norm": 0.4713311195373535, + "learning_rate": 4.531896524389242e-06, + "loss": 0.1061, + "step": 5036 + }, + { + "epoch": 1.6322099805573558, + "grad_norm": 0.48059168457984924, + "learning_rate": 4.530155019338435e-06, + "loss": 0.1173, + "step": 5037 + }, + { + "epoch": 1.6325340246273492, + "grad_norm": 0.4491584300994873, + "learning_rate": 4.528413571792103e-06, + "loss": 0.1095, + "step": 5038 + }, + { + "epoch": 1.6328580686973428, + "grad_norm": 0.47590669989585876, + "learning_rate": 4.526672181963386e-06, + "loss": 0.1081, + "step": 5039 + }, + { + "epoch": 1.6331821127673365, + "grad_norm": 0.5039756298065186, + "learning_rate": 4.524930850065411e-06, + "loss": 0.1202, + "step": 5040 + }, + { + "epoch": 1.63350615683733, + "grad_norm": 0.4588005542755127, + "learning_rate": 4.523189576311301e-06, + "loss": 0.1117, + "step": 5041 + }, + { + "epoch": 1.6338302009073233, + "grad_norm": 0.4382549524307251, + "learning_rate": 4.521448360914173e-06, + "loss": 0.1044, + "step": 5042 + }, + { + "epoch": 1.6341542449773168, + "grad_norm": 0.42880919575691223, + "learning_rate": 4.519707204087129e-06, + "loss": 0.1076, + "step": 5043 + }, + { + "epoch": 1.6344782890473104, + "grad_norm": 0.4497835636138916, + "learning_rate": 4.517966106043276e-06, + "loss": 0.1062, + "step": 5044 + }, + { + "epoch": 1.634802333117304, + "grad_norm": 0.6321564316749573, + "learning_rate": 4.5162250669957035e-06, + "loss": 0.1222, + "step": 5045 + }, + { + "epoch": 1.6351263771872975, + "grad_norm": 0.44169488549232483, + "learning_rate": 4.514484087157502e-06, + "loss": 0.1056, + "step": 5046 + }, + { + "epoch": 1.635450421257291, + "grad_norm": 0.4321480691432953, + "learning_rate": 4.512743166741745e-06, + "loss": 0.1039, + "step": 5047 + }, + { + "epoch": 1.6357744653272845, + "grad_norm": 0.4385624825954437, + "learning_rate": 4.51100230596151e-06, + "loss": 0.1078, + "step": 5048 + }, + { + "epoch": 1.6360985093972782, + "grad_norm": 0.5026413202285767, + "learning_rate": 4.5092615050298585e-06, + "loss": 0.1207, + "step": 5049 + }, + { + "epoch": 1.6364225534672716, + "grad_norm": 0.46124643087387085, + "learning_rate": 4.507520764159848e-06, + "loss": 0.1135, + "step": 5050 + }, + { + "epoch": 1.636746597537265, + "grad_norm": 0.4504324793815613, + "learning_rate": 4.505780083564527e-06, + "loss": 0.1038, + "step": 5051 + }, + { + "epoch": 1.6370706416072585, + "grad_norm": 0.4859355092048645, + "learning_rate": 4.5040394634569405e-06, + "loss": 0.1212, + "step": 5052 + }, + { + "epoch": 1.637394685677252, + "grad_norm": 0.4548870921134949, + "learning_rate": 4.502298904050123e-06, + "loss": 0.113, + "step": 5053 + }, + { + "epoch": 1.6377187297472457, + "grad_norm": 0.41375717520713806, + "learning_rate": 4.5005584055571016e-06, + "loss": 0.0959, + "step": 5054 + }, + { + "epoch": 1.6380427738172392, + "grad_norm": 0.5188780426979065, + "learning_rate": 4.498817968190894e-06, + "loss": 0.1205, + "step": 5055 + }, + { + "epoch": 1.6383668178872326, + "grad_norm": 0.4686286747455597, + "learning_rate": 4.497077592164518e-06, + "loss": 0.1135, + "step": 5056 + }, + { + "epoch": 1.638690861957226, + "grad_norm": 0.44579342007637024, + "learning_rate": 4.4953372776909735e-06, + "loss": 0.1066, + "step": 5057 + }, + { + "epoch": 1.6390149060272197, + "grad_norm": 0.47754591703414917, + "learning_rate": 4.493597024983263e-06, + "loss": 0.1068, + "step": 5058 + }, + { + "epoch": 1.6393389500972133, + "grad_norm": 0.4362635910511017, + "learning_rate": 4.4918568342543725e-06, + "loss": 0.1094, + "step": 5059 + }, + { + "epoch": 1.6396629941672067, + "grad_norm": 0.48197826743125916, + "learning_rate": 4.490116705717287e-06, + "loss": 0.12, + "step": 5060 + }, + { + "epoch": 1.6399870382372002, + "grad_norm": 0.46622657775878906, + "learning_rate": 4.488376639584982e-06, + "loss": 0.1114, + "step": 5061 + }, + { + "epoch": 1.6403110823071938, + "grad_norm": 0.45262354612350464, + "learning_rate": 4.486636636070422e-06, + "loss": 0.1059, + "step": 5062 + }, + { + "epoch": 1.6406351263771874, + "grad_norm": 0.47442132234573364, + "learning_rate": 4.484896695386569e-06, + "loss": 0.1075, + "step": 5063 + }, + { + "epoch": 1.6409591704471809, + "grad_norm": 0.43097731471061707, + "learning_rate": 4.483156817746372e-06, + "loss": 0.1012, + "step": 5064 + }, + { + "epoch": 1.6412832145171743, + "grad_norm": 0.43258407711982727, + "learning_rate": 4.481417003362779e-06, + "loss": 0.104, + "step": 5065 + }, + { + "epoch": 1.6416072585871677, + "grad_norm": 0.45446422696113586, + "learning_rate": 4.479677252448722e-06, + "loss": 0.1121, + "step": 5066 + }, + { + "epoch": 1.6419313026571614, + "grad_norm": 0.4537167549133301, + "learning_rate": 4.477937565217135e-06, + "loss": 0.1056, + "step": 5067 + }, + { + "epoch": 1.642255346727155, + "grad_norm": 0.46545058488845825, + "learning_rate": 4.476197941880936e-06, + "loss": 0.1112, + "step": 5068 + }, + { + "epoch": 1.6425793907971484, + "grad_norm": 0.42365697026252747, + "learning_rate": 4.474458382653035e-06, + "loss": 0.0989, + "step": 5069 + }, + { + "epoch": 1.6429034348671419, + "grad_norm": 0.4858678877353668, + "learning_rate": 4.472718887746344e-06, + "loss": 0.1191, + "step": 5070 + }, + { + "epoch": 1.6432274789371355, + "grad_norm": 0.45485833287239075, + "learning_rate": 4.4709794573737545e-06, + "loss": 0.1081, + "step": 5071 + }, + { + "epoch": 1.643551523007129, + "grad_norm": 0.43576475977897644, + "learning_rate": 4.46924009174816e-06, + "loss": 0.1043, + "step": 5072 + }, + { + "epoch": 1.6438755670771226, + "grad_norm": 0.4274187684059143, + "learning_rate": 4.467500791082438e-06, + "loss": 0.0974, + "step": 5073 + }, + { + "epoch": 1.644199611147116, + "grad_norm": 0.42950570583343506, + "learning_rate": 4.465761555589465e-06, + "loss": 0.1044, + "step": 5074 + }, + { + "epoch": 1.6445236552171094, + "grad_norm": 0.43484798073768616, + "learning_rate": 4.464022385482106e-06, + "loss": 0.0985, + "step": 5075 + }, + { + "epoch": 1.644847699287103, + "grad_norm": 0.5140442848205566, + "learning_rate": 4.462283280973217e-06, + "loss": 0.1241, + "step": 5076 + }, + { + "epoch": 1.6451717433570967, + "grad_norm": 0.46632710099220276, + "learning_rate": 4.460544242275651e-06, + "loss": 0.1084, + "step": 5077 + }, + { + "epoch": 1.6454957874270901, + "grad_norm": 0.4264677166938782, + "learning_rate": 4.458805269602245e-06, + "loss": 0.1006, + "step": 5078 + }, + { + "epoch": 1.6458198314970836, + "grad_norm": 0.47796615958213806, + "learning_rate": 4.457066363165837e-06, + "loss": 0.1131, + "step": 5079 + }, + { + "epoch": 1.646143875567077, + "grad_norm": 0.45501673221588135, + "learning_rate": 4.45532752317925e-06, + "loss": 0.1039, + "step": 5080 + }, + { + "epoch": 1.6464679196370706, + "grad_norm": 0.43680480122566223, + "learning_rate": 4.453588749855301e-06, + "loss": 0.1012, + "step": 5081 + }, + { + "epoch": 1.6467919637070643, + "grad_norm": 0.4651895761489868, + "learning_rate": 4.451850043406798e-06, + "loss": 0.112, + "step": 5082 + }, + { + "epoch": 1.6471160077770577, + "grad_norm": 0.37965553998947144, + "learning_rate": 4.450111404046545e-06, + "loss": 0.0892, + "step": 5083 + }, + { + "epoch": 1.6474400518470511, + "grad_norm": 0.48205476999282837, + "learning_rate": 4.448372831987333e-06, + "loss": 0.1164, + "step": 5084 + }, + { + "epoch": 1.6477640959170448, + "grad_norm": 0.4473625123500824, + "learning_rate": 4.446634327441946e-06, + "loss": 0.1093, + "step": 5085 + }, + { + "epoch": 1.6480881399870384, + "grad_norm": 0.4586276710033417, + "learning_rate": 4.444895890623158e-06, + "loss": 0.1135, + "step": 5086 + }, + { + "epoch": 1.6484121840570318, + "grad_norm": 0.5061674118041992, + "learning_rate": 4.443157521743741e-06, + "loss": 0.132, + "step": 5087 + }, + { + "epoch": 1.6487362281270252, + "grad_norm": 0.47010958194732666, + "learning_rate": 4.441419221016452e-06, + "loss": 0.1141, + "step": 5088 + }, + { + "epoch": 1.6490602721970187, + "grad_norm": 0.45167699456214905, + "learning_rate": 4.439680988654043e-06, + "loss": 0.107, + "step": 5089 + }, + { + "epoch": 1.6493843162670123, + "grad_norm": 0.44836100935935974, + "learning_rate": 4.437942824869256e-06, + "loss": 0.1039, + "step": 5090 + }, + { + "epoch": 1.649708360337006, + "grad_norm": 0.48193061351776123, + "learning_rate": 4.436204729874828e-06, + "loss": 0.1143, + "step": 5091 + }, + { + "epoch": 1.6500324044069994, + "grad_norm": 0.47236403822898865, + "learning_rate": 4.434466703883483e-06, + "loss": 0.1068, + "step": 5092 + }, + { + "epoch": 1.6503564484769928, + "grad_norm": 0.46453502774238586, + "learning_rate": 4.4327287471079375e-06, + "loss": 0.1144, + "step": 5093 + }, + { + "epoch": 1.6506804925469862, + "grad_norm": 0.44503045082092285, + "learning_rate": 4.430990859760903e-06, + "loss": 0.1104, + "step": 5094 + }, + { + "epoch": 1.6510045366169799, + "grad_norm": 0.4143137037754059, + "learning_rate": 4.429253042055076e-06, + "loss": 0.0875, + "step": 5095 + }, + { + "epoch": 1.6513285806869735, + "grad_norm": 0.47698596119880676, + "learning_rate": 4.427515294203154e-06, + "loss": 0.1157, + "step": 5096 + }, + { + "epoch": 1.651652624756967, + "grad_norm": 0.46756431460380554, + "learning_rate": 4.425777616417819e-06, + "loss": 0.109, + "step": 5097 + }, + { + "epoch": 1.6519766688269604, + "grad_norm": 0.44503191113471985, + "learning_rate": 4.424040008911741e-06, + "loss": 0.1044, + "step": 5098 + }, + { + "epoch": 1.652300712896954, + "grad_norm": 0.4633592367172241, + "learning_rate": 4.422302471897593e-06, + "loss": 0.1098, + "step": 5099 + }, + { + "epoch": 1.6526247569669477, + "grad_norm": 0.48274174332618713, + "learning_rate": 4.4205650055880286e-06, + "loss": 0.1058, + "step": 5100 + }, + { + "epoch": 1.652948801036941, + "grad_norm": 0.45976313948631287, + "learning_rate": 4.418827610195699e-06, + "loss": 0.1147, + "step": 5101 + }, + { + "epoch": 1.6532728451069345, + "grad_norm": 0.43249958753585815, + "learning_rate": 4.417090285933243e-06, + "loss": 0.0984, + "step": 5102 + }, + { + "epoch": 1.653596889176928, + "grad_norm": 0.48664548993110657, + "learning_rate": 4.415353033013294e-06, + "loss": 0.1146, + "step": 5103 + }, + { + "epoch": 1.6539209332469216, + "grad_norm": 0.45588254928588867, + "learning_rate": 4.413615851648474e-06, + "loss": 0.101, + "step": 5104 + }, + { + "epoch": 1.6542449773169152, + "grad_norm": 0.46738970279693604, + "learning_rate": 4.411878742051396e-06, + "loss": 0.1131, + "step": 5105 + }, + { + "epoch": 1.6545690213869086, + "grad_norm": 0.4817322790622711, + "learning_rate": 4.410141704434668e-06, + "loss": 0.1147, + "step": 5106 + }, + { + "epoch": 1.654893065456902, + "grad_norm": 0.46531572937965393, + "learning_rate": 4.408404739010882e-06, + "loss": 0.1076, + "step": 5107 + }, + { + "epoch": 1.6552171095268955, + "grad_norm": 0.48336824774742126, + "learning_rate": 4.406667845992632e-06, + "loss": 0.1162, + "step": 5108 + }, + { + "epoch": 1.6555411535968891, + "grad_norm": 0.40397506952285767, + "learning_rate": 4.404931025592494e-06, + "loss": 0.091, + "step": 5109 + }, + { + "epoch": 1.6558651976668828, + "grad_norm": 0.47702574729919434, + "learning_rate": 4.4031942780230345e-06, + "loss": 0.114, + "step": 5110 + }, + { + "epoch": 1.6561892417368762, + "grad_norm": 0.4436090290546417, + "learning_rate": 4.401457603496821e-06, + "loss": 0.1002, + "step": 5111 + }, + { + "epoch": 1.6565132858068696, + "grad_norm": 0.43865570425987244, + "learning_rate": 4.399721002226399e-06, + "loss": 0.1031, + "step": 5112 + }, + { + "epoch": 1.6568373298768633, + "grad_norm": 0.4540642201900482, + "learning_rate": 4.39798447442432e-06, + "loss": 0.1109, + "step": 5113 + }, + { + "epoch": 1.657161373946857, + "grad_norm": 0.4558948576450348, + "learning_rate": 4.3962480203031095e-06, + "loss": 0.1121, + "step": 5114 + }, + { + "epoch": 1.6574854180168503, + "grad_norm": 0.44171616435050964, + "learning_rate": 4.3945116400752994e-06, + "loss": 0.1074, + "step": 5115 + }, + { + "epoch": 1.6578094620868438, + "grad_norm": 0.4583776891231537, + "learning_rate": 4.3927753339534015e-06, + "loss": 0.1027, + "step": 5116 + }, + { + "epoch": 1.6581335061568372, + "grad_norm": 0.506412923336029, + "learning_rate": 4.391039102149923e-06, + "loss": 0.1148, + "step": 5117 + }, + { + "epoch": 1.6584575502268308, + "grad_norm": 0.48240935802459717, + "learning_rate": 4.389302944877365e-06, + "loss": 0.1188, + "step": 5118 + }, + { + "epoch": 1.6587815942968245, + "grad_norm": 0.4528309106826782, + "learning_rate": 4.387566862348213e-06, + "loss": 0.1084, + "step": 5119 + }, + { + "epoch": 1.659105638366818, + "grad_norm": 0.42765727639198303, + "learning_rate": 4.38583085477495e-06, + "loss": 0.1012, + "step": 5120 + }, + { + "epoch": 1.6594296824368113, + "grad_norm": 0.42442506551742554, + "learning_rate": 4.384094922370045e-06, + "loss": 0.0994, + "step": 5121 + }, + { + "epoch": 1.659753726506805, + "grad_norm": 0.4268704950809479, + "learning_rate": 4.382359065345957e-06, + "loss": 0.1012, + "step": 5122 + }, + { + "epoch": 1.6600777705767984, + "grad_norm": 0.4246152937412262, + "learning_rate": 4.380623283915142e-06, + "loss": 0.0955, + "step": 5123 + }, + { + "epoch": 1.660401814646792, + "grad_norm": 0.45979657769203186, + "learning_rate": 4.37888757829004e-06, + "loss": 0.1028, + "step": 5124 + }, + { + "epoch": 1.6607258587167855, + "grad_norm": 0.49330389499664307, + "learning_rate": 4.377151948683086e-06, + "loss": 0.1235, + "step": 5125 + }, + { + "epoch": 1.6610499027867789, + "grad_norm": 0.4123222231864929, + "learning_rate": 4.375416395306703e-06, + "loss": 0.0947, + "step": 5126 + }, + { + "epoch": 1.6613739468567725, + "grad_norm": 0.46925610303878784, + "learning_rate": 4.373680918373308e-06, + "loss": 0.1161, + "step": 5127 + }, + { + "epoch": 1.6616979909267662, + "grad_norm": 0.4626232385635376, + "learning_rate": 4.371945518095306e-06, + "loss": 0.1041, + "step": 5128 + }, + { + "epoch": 1.6620220349967596, + "grad_norm": 0.5107243061065674, + "learning_rate": 4.370210194685091e-06, + "loss": 0.1278, + "step": 5129 + }, + { + "epoch": 1.662346079066753, + "grad_norm": 0.4590868651866913, + "learning_rate": 4.3684749483550524e-06, + "loss": 0.1068, + "step": 5130 + }, + { + "epoch": 1.6626701231367464, + "grad_norm": 0.4151780605316162, + "learning_rate": 4.366739779317563e-06, + "loss": 0.098, + "step": 5131 + }, + { + "epoch": 1.66299416720674, + "grad_norm": 0.49163949489593506, + "learning_rate": 4.365004687784999e-06, + "loss": 0.1074, + "step": 5132 + }, + { + "epoch": 1.6633182112767337, + "grad_norm": 0.4786001741886139, + "learning_rate": 4.363269673969711e-06, + "loss": 0.1213, + "step": 5133 + }, + { + "epoch": 1.6636422553467272, + "grad_norm": 0.43519163131713867, + "learning_rate": 4.361534738084052e-06, + "loss": 0.1035, + "step": 5134 + }, + { + "epoch": 1.6639662994167206, + "grad_norm": 0.4144386947154999, + "learning_rate": 4.3597998803403604e-06, + "loss": 0.095, + "step": 5135 + }, + { + "epoch": 1.6642903434867142, + "grad_norm": 0.48504820466041565, + "learning_rate": 4.3580651009509654e-06, + "loss": 0.109, + "step": 5136 + }, + { + "epoch": 1.6646143875567079, + "grad_norm": 0.490996390581131, + "learning_rate": 4.356330400128189e-06, + "loss": 0.1197, + "step": 5137 + }, + { + "epoch": 1.6649384316267013, + "grad_norm": 0.41804537177085876, + "learning_rate": 4.354595778084338e-06, + "loss": 0.0998, + "step": 5138 + }, + { + "epoch": 1.6652624756966947, + "grad_norm": 0.4619435667991638, + "learning_rate": 4.3528612350317175e-06, + "loss": 0.1113, + "step": 5139 + }, + { + "epoch": 1.6655865197666881, + "grad_norm": 0.4360075294971466, + "learning_rate": 4.351126771182617e-06, + "loss": 0.1011, + "step": 5140 + }, + { + "epoch": 1.6659105638366818, + "grad_norm": 0.4541635811328888, + "learning_rate": 4.349392386749316e-06, + "loss": 0.1111, + "step": 5141 + }, + { + "epoch": 1.6662346079066754, + "grad_norm": 0.45814111828804016, + "learning_rate": 4.347658081944092e-06, + "loss": 0.1076, + "step": 5142 + }, + { + "epoch": 1.6665586519766689, + "grad_norm": 0.4763301610946655, + "learning_rate": 4.3459238569792e-06, + "loss": 0.1128, + "step": 5143 + }, + { + "epoch": 1.6668826960466623, + "grad_norm": 0.4556414783000946, + "learning_rate": 4.3441897120668985e-06, + "loss": 0.1139, + "step": 5144 + }, + { + "epoch": 1.6672067401166557, + "grad_norm": 0.40329378843307495, + "learning_rate": 4.342455647419426e-06, + "loss": 0.0921, + "step": 5145 + }, + { + "epoch": 1.6675307841866494, + "grad_norm": 0.5186251997947693, + "learning_rate": 4.3407216632490185e-06, + "loss": 0.1307, + "step": 5146 + }, + { + "epoch": 1.667854828256643, + "grad_norm": 0.47811368107795715, + "learning_rate": 4.338987759767896e-06, + "loss": 0.1164, + "step": 5147 + }, + { + "epoch": 1.6681788723266364, + "grad_norm": 0.4457603693008423, + "learning_rate": 4.337253937188272e-06, + "loss": 0.1048, + "step": 5148 + }, + { + "epoch": 1.6685029163966298, + "grad_norm": 0.46842989325523376, + "learning_rate": 4.335520195722352e-06, + "loss": 0.1043, + "step": 5149 + }, + { + "epoch": 1.6688269604666235, + "grad_norm": 0.4409325122833252, + "learning_rate": 4.333786535582325e-06, + "loss": 0.1064, + "step": 5150 + }, + { + "epoch": 1.6691510045366171, + "grad_norm": 0.4449619650840759, + "learning_rate": 4.332052956980378e-06, + "loss": 0.0991, + "step": 5151 + }, + { + "epoch": 1.6694750486066106, + "grad_norm": 0.447503924369812, + "learning_rate": 4.3303194601286835e-06, + "loss": 0.1009, + "step": 5152 + }, + { + "epoch": 1.669799092676604, + "grad_norm": 0.4481426775455475, + "learning_rate": 4.3285860452394025e-06, + "loss": 0.0965, + "step": 5153 + }, + { + "epoch": 1.6701231367465974, + "grad_norm": 0.4478634297847748, + "learning_rate": 4.326852712524691e-06, + "loss": 0.1114, + "step": 5154 + }, + { + "epoch": 1.670447180816591, + "grad_norm": 0.43121814727783203, + "learning_rate": 4.32511946219669e-06, + "loss": 0.0996, + "step": 5155 + }, + { + "epoch": 1.6707712248865847, + "grad_norm": 0.3942187428474426, + "learning_rate": 4.323386294467534e-06, + "loss": 0.0916, + "step": 5156 + }, + { + "epoch": 1.6710952689565781, + "grad_norm": 0.49547019600868225, + "learning_rate": 4.3216532095493445e-06, + "loss": 0.1248, + "step": 5157 + }, + { + "epoch": 1.6714193130265715, + "grad_norm": 0.48921433091163635, + "learning_rate": 4.319920207654237e-06, + "loss": 0.1204, + "step": 5158 + }, + { + "epoch": 1.6717433570965652, + "grad_norm": 0.4274780750274658, + "learning_rate": 4.3181872889943126e-06, + "loss": 0.0996, + "step": 5159 + }, + { + "epoch": 1.6720674011665586, + "grad_norm": 0.4929424822330475, + "learning_rate": 4.316454453781661e-06, + "loss": 0.1219, + "step": 5160 + }, + { + "epoch": 1.6723914452365523, + "grad_norm": 0.47485384345054626, + "learning_rate": 4.314721702228369e-06, + "loss": 0.1192, + "step": 5161 + }, + { + "epoch": 1.6727154893065457, + "grad_norm": 0.4370744228363037, + "learning_rate": 4.312989034546505e-06, + "loss": 0.1121, + "step": 5162 + }, + { + "epoch": 1.673039533376539, + "grad_norm": 0.4904261827468872, + "learning_rate": 4.311256450948134e-06, + "loss": 0.1207, + "step": 5163 + }, + { + "epoch": 1.6733635774465327, + "grad_norm": 0.4401930868625641, + "learning_rate": 4.309523951645306e-06, + "loss": 0.1111, + "step": 5164 + }, + { + "epoch": 1.6736876215165264, + "grad_norm": 0.44211092591285706, + "learning_rate": 4.3077915368500605e-06, + "loss": 0.1053, + "step": 5165 + }, + { + "epoch": 1.6740116655865198, + "grad_norm": 0.4240233898162842, + "learning_rate": 4.306059206774431e-06, + "loss": 0.1056, + "step": 5166 + }, + { + "epoch": 1.6743357096565132, + "grad_norm": 0.4609757661819458, + "learning_rate": 4.304326961630436e-06, + "loss": 0.112, + "step": 5167 + }, + { + "epoch": 1.6746597537265067, + "grad_norm": 0.46120592951774597, + "learning_rate": 4.302594801630088e-06, + "loss": 0.1118, + "step": 5168 + }, + { + "epoch": 1.6749837977965003, + "grad_norm": 0.45138785243034363, + "learning_rate": 4.300862726985382e-06, + "loss": 0.1101, + "step": 5169 + }, + { + "epoch": 1.675307841866494, + "grad_norm": 0.45417898893356323, + "learning_rate": 4.2991307379083125e-06, + "loss": 0.1132, + "step": 5170 + }, + { + "epoch": 1.6756318859364874, + "grad_norm": 0.4745250344276428, + "learning_rate": 4.297398834610855e-06, + "loss": 0.1173, + "step": 5171 + }, + { + "epoch": 1.6759559300064808, + "grad_norm": 0.4576803743839264, + "learning_rate": 4.295667017304977e-06, + "loss": 0.108, + "step": 5172 + }, + { + "epoch": 1.6762799740764744, + "grad_norm": 0.43262675404548645, + "learning_rate": 4.29393528620264e-06, + "loss": 0.1031, + "step": 5173 + }, + { + "epoch": 1.6766040181464679, + "grad_norm": 0.47831442952156067, + "learning_rate": 4.2922036415157865e-06, + "loss": 0.1143, + "step": 5174 + }, + { + "epoch": 1.6769280622164615, + "grad_norm": 0.4054790735244751, + "learning_rate": 4.290472083456357e-06, + "loss": 0.0972, + "step": 5175 + }, + { + "epoch": 1.677252106286455, + "grad_norm": 0.4552663266658783, + "learning_rate": 4.288740612236276e-06, + "loss": 0.1054, + "step": 5176 + }, + { + "epoch": 1.6775761503564484, + "grad_norm": 0.43283385038375854, + "learning_rate": 4.287009228067456e-06, + "loss": 0.101, + "step": 5177 + }, + { + "epoch": 1.677900194426442, + "grad_norm": 0.4280630350112915, + "learning_rate": 4.285277931161806e-06, + "loss": 0.0966, + "step": 5178 + }, + { + "epoch": 1.6782242384964356, + "grad_norm": 0.46848851442337036, + "learning_rate": 4.283546721731218e-06, + "loss": 0.1077, + "step": 5179 + }, + { + "epoch": 1.678548282566429, + "grad_norm": 0.4811386168003082, + "learning_rate": 4.281815599987577e-06, + "loss": 0.1122, + "step": 5180 + }, + { + "epoch": 1.6788723266364225, + "grad_norm": 0.4960719645023346, + "learning_rate": 4.2800845661427505e-06, + "loss": 0.1196, + "step": 5181 + }, + { + "epoch": 1.679196370706416, + "grad_norm": 0.48916101455688477, + "learning_rate": 4.2783536204086065e-06, + "loss": 0.1116, + "step": 5182 + }, + { + "epoch": 1.6795204147764096, + "grad_norm": 0.5164931416511536, + "learning_rate": 4.276622762996993e-06, + "loss": 0.1246, + "step": 5183 + }, + { + "epoch": 1.6798444588464032, + "grad_norm": 0.4530620276927948, + "learning_rate": 4.274891994119748e-06, + "loss": 0.1049, + "step": 5184 + }, + { + "epoch": 1.6801685029163966, + "grad_norm": 0.4720141589641571, + "learning_rate": 4.273161313988707e-06, + "loss": 0.1067, + "step": 5185 + }, + { + "epoch": 1.68049254698639, + "grad_norm": 0.4903331995010376, + "learning_rate": 4.2714307228156814e-06, + "loss": 0.1164, + "step": 5186 + }, + { + "epoch": 1.6808165910563837, + "grad_norm": 0.49225887656211853, + "learning_rate": 4.2697002208124845e-06, + "loss": 0.114, + "step": 5187 + }, + { + "epoch": 1.6811406351263773, + "grad_norm": 0.47676360607147217, + "learning_rate": 4.267969808190911e-06, + "loss": 0.1179, + "step": 5188 + }, + { + "epoch": 1.6814646791963708, + "grad_norm": 0.4683300852775574, + "learning_rate": 4.266239485162746e-06, + "loss": 0.1087, + "step": 5189 + }, + { + "epoch": 1.6817887232663642, + "grad_norm": 0.461770236492157, + "learning_rate": 4.264509251939765e-06, + "loss": 0.1103, + "step": 5190 + }, + { + "epoch": 1.6821127673363576, + "grad_norm": 0.47698649764060974, + "learning_rate": 4.262779108733729e-06, + "loss": 0.1087, + "step": 5191 + }, + { + "epoch": 1.6824368114063513, + "grad_norm": 0.4547366499900818, + "learning_rate": 4.2610490557563955e-06, + "loss": 0.1079, + "step": 5192 + }, + { + "epoch": 1.682760855476345, + "grad_norm": 0.4503345191478729, + "learning_rate": 4.259319093219502e-06, + "loss": 0.1066, + "step": 5193 + }, + { + "epoch": 1.6830848995463383, + "grad_norm": 0.487965852022171, + "learning_rate": 4.257589221334783e-06, + "loss": 0.1096, + "step": 5194 + }, + { + "epoch": 1.6834089436163318, + "grad_norm": 0.4674997627735138, + "learning_rate": 4.2558594403139574e-06, + "loss": 0.1077, + "step": 5195 + }, + { + "epoch": 1.6837329876863252, + "grad_norm": 0.48449698090553284, + "learning_rate": 4.25412975036873e-06, + "loss": 0.1161, + "step": 5196 + }, + { + "epoch": 1.6840570317563188, + "grad_norm": 0.43596965074539185, + "learning_rate": 4.252400151710802e-06, + "loss": 0.1014, + "step": 5197 + }, + { + "epoch": 1.6843810758263125, + "grad_norm": 0.4734188914299011, + "learning_rate": 4.250670644551856e-06, + "loss": 0.1156, + "step": 5198 + }, + { + "epoch": 1.684705119896306, + "grad_norm": 0.41340628266334534, + "learning_rate": 4.2489412291035706e-06, + "loss": 0.0975, + "step": 5199 + }, + { + "epoch": 1.6850291639662993, + "grad_norm": 0.43714478611946106, + "learning_rate": 4.247211905577609e-06, + "loss": 0.0997, + "step": 5200 + }, + { + "epoch": 1.685353208036293, + "grad_norm": 0.4615105986595154, + "learning_rate": 4.245482674185621e-06, + "loss": 0.1094, + "step": 5201 + }, + { + "epoch": 1.6856772521062866, + "grad_norm": 0.46344971656799316, + "learning_rate": 4.243753535139251e-06, + "loss": 0.1085, + "step": 5202 + }, + { + "epoch": 1.68600129617628, + "grad_norm": 0.4436193108558655, + "learning_rate": 4.242024488650125e-06, + "loss": 0.1017, + "step": 5203 + }, + { + "epoch": 1.6863253402462735, + "grad_norm": 0.4480687975883484, + "learning_rate": 4.240295534929865e-06, + "loss": 0.1088, + "step": 5204 + }, + { + "epoch": 1.6866493843162669, + "grad_norm": 0.46114522218704224, + "learning_rate": 4.238566674190076e-06, + "loss": 0.1077, + "step": 5205 + }, + { + "epoch": 1.6869734283862605, + "grad_norm": 0.44118571281433105, + "learning_rate": 4.236837906642357e-06, + "loss": 0.1053, + "step": 5206 + }, + { + "epoch": 1.6872974724562542, + "grad_norm": 0.4613795280456543, + "learning_rate": 4.23510923249829e-06, + "loss": 0.1129, + "step": 5207 + }, + { + "epoch": 1.6876215165262476, + "grad_norm": 0.47433406114578247, + "learning_rate": 4.2333806519694455e-06, + "loss": 0.1176, + "step": 5208 + }, + { + "epoch": 1.687945560596241, + "grad_norm": 0.4397332966327667, + "learning_rate": 4.23165216526739e-06, + "loss": 0.0993, + "step": 5209 + }, + { + "epoch": 1.6882696046662347, + "grad_norm": 0.4279300570487976, + "learning_rate": 4.22992377260367e-06, + "loss": 0.1059, + "step": 5210 + }, + { + "epoch": 1.688593648736228, + "grad_norm": 0.46599432826042175, + "learning_rate": 4.228195474189828e-06, + "loss": 0.1102, + "step": 5211 + }, + { + "epoch": 1.6889176928062217, + "grad_norm": 0.48240232467651367, + "learning_rate": 4.2264672702373845e-06, + "loss": 0.1187, + "step": 5212 + }, + { + "epoch": 1.6892417368762151, + "grad_norm": 0.49114811420440674, + "learning_rate": 4.2247391609578614e-06, + "loss": 0.1126, + "step": 5213 + }, + { + "epoch": 1.6895657809462086, + "grad_norm": 0.43978747725486755, + "learning_rate": 4.22301114656276e-06, + "loss": 0.1026, + "step": 5214 + }, + { + "epoch": 1.6898898250162022, + "grad_norm": 0.5041015148162842, + "learning_rate": 4.22128322726357e-06, + "loss": 0.1214, + "step": 5215 + }, + { + "epoch": 1.6902138690861959, + "grad_norm": 0.44519177079200745, + "learning_rate": 4.219555403271778e-06, + "loss": 0.1081, + "step": 5216 + }, + { + "epoch": 1.6905379131561893, + "grad_norm": 0.5372322797775269, + "learning_rate": 4.217827674798845e-06, + "loss": 0.1136, + "step": 5217 + }, + { + "epoch": 1.6908619572261827, + "grad_norm": 0.4537216126918793, + "learning_rate": 4.216100042056236e-06, + "loss": 0.1021, + "step": 5218 + }, + { + "epoch": 1.6911860012961761, + "grad_norm": 0.4697504937648773, + "learning_rate": 4.214372505255393e-06, + "loss": 0.1059, + "step": 5219 + }, + { + "epoch": 1.6915100453661698, + "grad_norm": 0.43239736557006836, + "learning_rate": 4.212645064607749e-06, + "loss": 0.095, + "step": 5220 + }, + { + "epoch": 1.6918340894361634, + "grad_norm": 0.4243602454662323, + "learning_rate": 4.210917720324727e-06, + "loss": 0.1018, + "step": 5221 + }, + { + "epoch": 1.6921581335061568, + "grad_norm": 0.4734940528869629, + "learning_rate": 4.209190472617736e-06, + "loss": 0.112, + "step": 5222 + }, + { + "epoch": 1.6924821775761503, + "grad_norm": 0.45138147473335266, + "learning_rate": 4.207463321698177e-06, + "loss": 0.1067, + "step": 5223 + }, + { + "epoch": 1.692806221646144, + "grad_norm": 0.4541546106338501, + "learning_rate": 4.205736267777433e-06, + "loss": 0.114, + "step": 5224 + }, + { + "epoch": 1.6931302657161373, + "grad_norm": 0.4667559564113617, + "learning_rate": 4.204009311066884e-06, + "loss": 0.1143, + "step": 5225 + }, + { + "epoch": 1.693454309786131, + "grad_norm": 0.41926097869873047, + "learning_rate": 4.202282451777888e-06, + "loss": 0.0961, + "step": 5226 + }, + { + "epoch": 1.6937783538561244, + "grad_norm": 0.4348689615726471, + "learning_rate": 4.200555690121796e-06, + "loss": 0.1045, + "step": 5227 + }, + { + "epoch": 1.6941023979261178, + "grad_norm": 0.4243723154067993, + "learning_rate": 4.198829026309951e-06, + "loss": 0.1014, + "step": 5228 + }, + { + "epoch": 1.6944264419961115, + "grad_norm": 0.4645223021507263, + "learning_rate": 4.197102460553673e-06, + "loss": 0.1095, + "step": 5229 + }, + { + "epoch": 1.6947504860661051, + "grad_norm": 0.4362965226173401, + "learning_rate": 4.195375993064286e-06, + "loss": 0.0962, + "step": 5230 + }, + { + "epoch": 1.6950745301360985, + "grad_norm": 0.43382829427719116, + "learning_rate": 4.1936496240530865e-06, + "loss": 0.0943, + "step": 5231 + }, + { + "epoch": 1.695398574206092, + "grad_norm": 0.47366800904273987, + "learning_rate": 4.191923353731366e-06, + "loss": 0.1124, + "step": 5232 + }, + { + "epoch": 1.6957226182760854, + "grad_norm": 0.4507708251476288, + "learning_rate": 4.190197182310406e-06, + "loss": 0.1107, + "step": 5233 + }, + { + "epoch": 1.696046662346079, + "grad_norm": 0.43914100527763367, + "learning_rate": 4.188471110001468e-06, + "loss": 0.1068, + "step": 5234 + }, + { + "epoch": 1.6963707064160727, + "grad_norm": 0.48613178730010986, + "learning_rate": 4.186745137015814e-06, + "loss": 0.1119, + "step": 5235 + }, + { + "epoch": 1.696694750486066, + "grad_norm": 0.44728830456733704, + "learning_rate": 4.185019263564679e-06, + "loss": 0.1021, + "step": 5236 + }, + { + "epoch": 1.6970187945560595, + "grad_norm": 0.4880698621273041, + "learning_rate": 4.183293489859298e-06, + "loss": 0.1093, + "step": 5237 + }, + { + "epoch": 1.6973428386260532, + "grad_norm": 0.4449664354324341, + "learning_rate": 4.1815678161108885e-06, + "loss": 0.0999, + "step": 5238 + }, + { + "epoch": 1.6976668826960468, + "grad_norm": 0.47024253010749817, + "learning_rate": 4.1798422425306525e-06, + "loss": 0.1105, + "step": 5239 + }, + { + "epoch": 1.6979909267660402, + "grad_norm": 0.4396377503871918, + "learning_rate": 4.178116769329789e-06, + "loss": 0.1078, + "step": 5240 + }, + { + "epoch": 1.6983149708360337, + "grad_norm": 0.43690598011016846, + "learning_rate": 4.176391396719475e-06, + "loss": 0.1035, + "step": 5241 + }, + { + "epoch": 1.698639014906027, + "grad_norm": 0.46832552552223206, + "learning_rate": 4.174666124910882e-06, + "loss": 0.108, + "step": 5242 + }, + { + "epoch": 1.6989630589760207, + "grad_norm": 0.4596247673034668, + "learning_rate": 4.172940954115165e-06, + "loss": 0.1153, + "step": 5243 + }, + { + "epoch": 1.6992871030460144, + "grad_norm": 0.4955204725265503, + "learning_rate": 4.171215884543468e-06, + "loss": 0.1173, + "step": 5244 + }, + { + "epoch": 1.6996111471160078, + "grad_norm": 0.47185131907463074, + "learning_rate": 4.169490916406925e-06, + "loss": 0.1095, + "step": 5245 + }, + { + "epoch": 1.6999351911860012, + "grad_norm": 0.47953593730926514, + "learning_rate": 4.167766049916651e-06, + "loss": 0.1138, + "step": 5246 + }, + { + "epoch": 1.7002592352559946, + "grad_norm": 0.44474267959594727, + "learning_rate": 4.166041285283759e-06, + "loss": 0.0999, + "step": 5247 + }, + { + "epoch": 1.7005832793259883, + "grad_norm": 0.46185240149497986, + "learning_rate": 4.164316622719337e-06, + "loss": 0.1108, + "step": 5248 + }, + { + "epoch": 1.700907323395982, + "grad_norm": 0.44396278262138367, + "learning_rate": 4.162592062434474e-06, + "loss": 0.1078, + "step": 5249 + }, + { + "epoch": 1.7012313674659754, + "grad_norm": 0.4664950370788574, + "learning_rate": 4.160867604640234e-06, + "loss": 0.1098, + "step": 5250 + }, + { + "epoch": 1.7015554115359688, + "grad_norm": 0.44771596789360046, + "learning_rate": 4.159143249547675e-06, + "loss": 0.1067, + "step": 5251 + }, + { + "epoch": 1.7018794556059624, + "grad_norm": 0.45071953535079956, + "learning_rate": 4.157418997367844e-06, + "loss": 0.1075, + "step": 5252 + }, + { + "epoch": 1.702203499675956, + "grad_norm": 0.44093945622444153, + "learning_rate": 4.155694848311769e-06, + "loss": 0.0984, + "step": 5253 + }, + { + "epoch": 1.7025275437459495, + "grad_norm": 0.4549207091331482, + "learning_rate": 4.1539708025904736e-06, + "loss": 0.1078, + "step": 5254 + }, + { + "epoch": 1.702851587815943, + "grad_norm": 0.4288365840911865, + "learning_rate": 4.1522468604149606e-06, + "loss": 0.1011, + "step": 5255 + }, + { + "epoch": 1.7031756318859363, + "grad_norm": 0.4736907184123993, + "learning_rate": 4.150523021996223e-06, + "loss": 0.1148, + "step": 5256 + }, + { + "epoch": 1.70349967595593, + "grad_norm": 0.4149734675884247, + "learning_rate": 4.148799287545247e-06, + "loss": 0.0958, + "step": 5257 + }, + { + "epoch": 1.7038237200259236, + "grad_norm": 0.4175003170967102, + "learning_rate": 4.147075657272994e-06, + "loss": 0.0931, + "step": 5258 + }, + { + "epoch": 1.704147764095917, + "grad_norm": 0.4388391077518463, + "learning_rate": 4.145352131390427e-06, + "loss": 0.094, + "step": 5259 + }, + { + "epoch": 1.7044718081659105, + "grad_norm": 0.4352574646472931, + "learning_rate": 4.1436287101084835e-06, + "loss": 0.0998, + "step": 5260 + }, + { + "epoch": 1.7047958522359041, + "grad_norm": 0.4395085275173187, + "learning_rate": 4.141905393638097e-06, + "loss": 0.1044, + "step": 5261 + }, + { + "epoch": 1.7051198963058976, + "grad_norm": 0.5015331506729126, + "learning_rate": 4.140182182190184e-06, + "loss": 0.1087, + "step": 5262 + }, + { + "epoch": 1.7054439403758912, + "grad_norm": 0.4683492183685303, + "learning_rate": 4.1384590759756474e-06, + "loss": 0.112, + "step": 5263 + }, + { + "epoch": 1.7057679844458846, + "grad_norm": 0.4542158842086792, + "learning_rate": 4.1367360752053795e-06, + "loss": 0.1148, + "step": 5264 + }, + { + "epoch": 1.706092028515878, + "grad_norm": 0.48041650652885437, + "learning_rate": 4.1350131800902575e-06, + "loss": 0.1222, + "step": 5265 + }, + { + "epoch": 1.7064160725858717, + "grad_norm": 0.4712051451206207, + "learning_rate": 4.13329039084115e-06, + "loss": 0.1076, + "step": 5266 + }, + { + "epoch": 1.7067401166558653, + "grad_norm": 0.4659982919692993, + "learning_rate": 4.131567707668909e-06, + "loss": 0.1101, + "step": 5267 + }, + { + "epoch": 1.7070641607258588, + "grad_norm": 0.48965689539909363, + "learning_rate": 4.129845130784371e-06, + "loss": 0.118, + "step": 5268 + }, + { + "epoch": 1.7073882047958522, + "grad_norm": 0.4107588827610016, + "learning_rate": 4.128122660398368e-06, + "loss": 0.1018, + "step": 5269 + }, + { + "epoch": 1.7077122488658456, + "grad_norm": 0.4468505382537842, + "learning_rate": 4.126400296721709e-06, + "loss": 0.1065, + "step": 5270 + }, + { + "epoch": 1.7080362929358393, + "grad_norm": 0.4403822124004364, + "learning_rate": 4.124678039965198e-06, + "loss": 0.1032, + "step": 5271 + }, + { + "epoch": 1.708360337005833, + "grad_norm": 0.46318909525871277, + "learning_rate": 4.12295589033962e-06, + "loss": 0.1145, + "step": 5272 + }, + { + "epoch": 1.7086843810758263, + "grad_norm": 0.4636130630970001, + "learning_rate": 4.1212338480557504e-06, + "loss": 0.1105, + "step": 5273 + }, + { + "epoch": 1.7090084251458197, + "grad_norm": 0.44539788365364075, + "learning_rate": 4.119511913324352e-06, + "loss": 0.1092, + "step": 5274 + }, + { + "epoch": 1.7093324692158134, + "grad_norm": 0.45345407724380493, + "learning_rate": 4.11779008635617e-06, + "loss": 0.1108, + "step": 5275 + }, + { + "epoch": 1.709656513285807, + "grad_norm": 0.43254977464675903, + "learning_rate": 4.1160683673619435e-06, + "loss": 0.1018, + "step": 5276 + }, + { + "epoch": 1.7099805573558005, + "grad_norm": 0.43789052963256836, + "learning_rate": 4.114346756552389e-06, + "loss": 0.1095, + "step": 5277 + }, + { + "epoch": 1.7103046014257939, + "grad_norm": 0.4524228572845459, + "learning_rate": 4.112625254138219e-06, + "loss": 0.1042, + "step": 5278 + }, + { + "epoch": 1.7106286454957873, + "grad_norm": 0.5019272565841675, + "learning_rate": 4.110903860330126e-06, + "loss": 0.1185, + "step": 5279 + }, + { + "epoch": 1.710952689565781, + "grad_norm": 0.4378442168235779, + "learning_rate": 4.109182575338796e-06, + "loss": 0.0988, + "step": 5280 + }, + { + "epoch": 1.7112767336357746, + "grad_norm": 0.49997493624687195, + "learning_rate": 4.107461399374894e-06, + "loss": 0.1151, + "step": 5281 + }, + { + "epoch": 1.711600777705768, + "grad_norm": 0.47978028655052185, + "learning_rate": 4.105740332649074e-06, + "loss": 0.1064, + "step": 5282 + }, + { + "epoch": 1.7119248217757614, + "grad_norm": 0.4691540598869324, + "learning_rate": 4.1040193753719835e-06, + "loss": 0.1062, + "step": 5283 + }, + { + "epoch": 1.7122488658457549, + "grad_norm": 0.46821358799934387, + "learning_rate": 4.102298527754246e-06, + "loss": 0.111, + "step": 5284 + }, + { + "epoch": 1.7125729099157485, + "grad_norm": 0.4568684697151184, + "learning_rate": 4.100577790006479e-06, + "loss": 0.1151, + "step": 5285 + }, + { + "epoch": 1.7128969539857422, + "grad_norm": 0.5008982419967651, + "learning_rate": 4.098857162339283e-06, + "loss": 0.1277, + "step": 5286 + }, + { + "epoch": 1.7132209980557356, + "grad_norm": 0.4188055694103241, + "learning_rate": 4.097136644963246e-06, + "loss": 0.1024, + "step": 5287 + }, + { + "epoch": 1.713545042125729, + "grad_norm": 0.46634042263031006, + "learning_rate": 4.095416238088945e-06, + "loss": 0.1025, + "step": 5288 + }, + { + "epoch": 1.7138690861957226, + "grad_norm": 0.42526668310165405, + "learning_rate": 4.093695941926936e-06, + "loss": 0.0974, + "step": 5289 + }, + { + "epoch": 1.7141931302657163, + "grad_norm": 0.4504096806049347, + "learning_rate": 4.0919757566877735e-06, + "loss": 0.1112, + "step": 5290 + }, + { + "epoch": 1.7145171743357097, + "grad_norm": 0.4655960202217102, + "learning_rate": 4.090255682581986e-06, + "loss": 0.1071, + "step": 5291 + }, + { + "epoch": 1.7148412184057031, + "grad_norm": 0.47816991806030273, + "learning_rate": 4.088535719820097e-06, + "loss": 0.1067, + "step": 5292 + }, + { + "epoch": 1.7151652624756966, + "grad_norm": 0.4879434108734131, + "learning_rate": 4.086815868612612e-06, + "loss": 0.1111, + "step": 5293 + }, + { + "epoch": 1.7154893065456902, + "grad_norm": 0.48248669505119324, + "learning_rate": 4.085096129170025e-06, + "loss": 0.1134, + "step": 5294 + }, + { + "epoch": 1.7158133506156839, + "grad_norm": 0.4520329236984253, + "learning_rate": 4.083376501702814e-06, + "loss": 0.1091, + "step": 5295 + }, + { + "epoch": 1.7161373946856773, + "grad_norm": 0.48300623893737793, + "learning_rate": 4.081656986421445e-06, + "loss": 0.1101, + "step": 5296 + }, + { + "epoch": 1.7164614387556707, + "grad_norm": 0.4790801405906677, + "learning_rate": 4.079937583536372e-06, + "loss": 0.1192, + "step": 5297 + }, + { + "epoch": 1.7167854828256643, + "grad_norm": 0.44851386547088623, + "learning_rate": 4.078218293258032e-06, + "loss": 0.1074, + "step": 5298 + }, + { + "epoch": 1.7171095268956578, + "grad_norm": 0.45854687690734863, + "learning_rate": 4.076499115796847e-06, + "loss": 0.1024, + "step": 5299 + }, + { + "epoch": 1.7174335709656514, + "grad_norm": 0.44694340229034424, + "learning_rate": 4.074780051363233e-06, + "loss": 0.1045, + "step": 5300 + }, + { + "epoch": 1.7177576150356448, + "grad_norm": 0.4249899685382843, + "learning_rate": 4.073061100167581e-06, + "loss": 0.103, + "step": 5301 + }, + { + "epoch": 1.7180816591056383, + "grad_norm": 0.4468053877353668, + "learning_rate": 4.071342262420279e-06, + "loss": 0.1078, + "step": 5302 + }, + { + "epoch": 1.718405703175632, + "grad_norm": 0.45494428277015686, + "learning_rate": 4.069623538331693e-06, + "loss": 0.1093, + "step": 5303 + }, + { + "epoch": 1.7187297472456255, + "grad_norm": 0.5028614401817322, + "learning_rate": 4.06790492811218e-06, + "loss": 0.1087, + "step": 5304 + }, + { + "epoch": 1.719053791315619, + "grad_norm": 0.42624178528785706, + "learning_rate": 4.06618643197208e-06, + "loss": 0.0988, + "step": 5305 + }, + { + "epoch": 1.7193778353856124, + "grad_norm": 0.4529658854007721, + "learning_rate": 4.06446805012172e-06, + "loss": 0.1068, + "step": 5306 + }, + { + "epoch": 1.7197018794556058, + "grad_norm": 0.42587921023368835, + "learning_rate": 4.062749782771416e-06, + "loss": 0.0994, + "step": 5307 + }, + { + "epoch": 1.7200259235255995, + "grad_norm": 0.45960429310798645, + "learning_rate": 4.061031630131463e-06, + "loss": 0.1078, + "step": 5308 + }, + { + "epoch": 1.720349967595593, + "grad_norm": 0.45776546001434326, + "learning_rate": 4.0593135924121506e-06, + "loss": 0.1062, + "step": 5309 + }, + { + "epoch": 1.7206740116655865, + "grad_norm": 0.4353899359703064, + "learning_rate": 4.057595669823747e-06, + "loss": 0.1047, + "step": 5310 + }, + { + "epoch": 1.72099805573558, + "grad_norm": 0.4940280616283417, + "learning_rate": 4.055877862576509e-06, + "loss": 0.1187, + "step": 5311 + }, + { + "epoch": 1.7213220998055736, + "grad_norm": 0.4680522680282593, + "learning_rate": 4.054160170880682e-06, + "loss": 0.1069, + "step": 5312 + }, + { + "epoch": 1.721646143875567, + "grad_norm": 0.4386557638645172, + "learning_rate": 4.052442594946493e-06, + "loss": 0.0983, + "step": 5313 + }, + { + "epoch": 1.7219701879455607, + "grad_norm": 0.4579738974571228, + "learning_rate": 4.050725134984159e-06, + "loss": 0.1005, + "step": 5314 + }, + { + "epoch": 1.722294232015554, + "grad_norm": 0.46672308444976807, + "learning_rate": 4.049007791203877e-06, + "loss": 0.1084, + "step": 5315 + }, + { + "epoch": 1.7226182760855475, + "grad_norm": 0.4470560550689697, + "learning_rate": 4.047290563815837e-06, + "loss": 0.1079, + "step": 5316 + }, + { + "epoch": 1.7229423201555412, + "grad_norm": 0.4670076072216034, + "learning_rate": 4.045573453030207e-06, + "loss": 0.105, + "step": 5317 + }, + { + "epoch": 1.7232663642255348, + "grad_norm": 0.4503187835216522, + "learning_rate": 4.043856459057147e-06, + "loss": 0.1136, + "step": 5318 + }, + { + "epoch": 1.7235904082955282, + "grad_norm": 0.4329877197742462, + "learning_rate": 4.042139582106802e-06, + "loss": 0.1009, + "step": 5319 + }, + { + "epoch": 1.7239144523655217, + "grad_norm": 0.43727365136146545, + "learning_rate": 4.040422822389297e-06, + "loss": 0.1031, + "step": 5320 + }, + { + "epoch": 1.724238496435515, + "grad_norm": 0.496261864900589, + "learning_rate": 4.03870618011475e-06, + "loss": 0.122, + "step": 5321 + }, + { + "epoch": 1.7245625405055087, + "grad_norm": 0.4283820688724518, + "learning_rate": 4.036989655493262e-06, + "loss": 0.0971, + "step": 5322 + }, + { + "epoch": 1.7248865845755024, + "grad_norm": 0.48006966710090637, + "learning_rate": 4.035273248734915e-06, + "loss": 0.1208, + "step": 5323 + }, + { + "epoch": 1.7252106286454958, + "grad_norm": 0.4690515100955963, + "learning_rate": 4.033556960049785e-06, + "loss": 0.1091, + "step": 5324 + }, + { + "epoch": 1.7255346727154892, + "grad_norm": 0.4697633981704712, + "learning_rate": 4.031840789647925e-06, + "loss": 0.1099, + "step": 5325 + }, + { + "epoch": 1.7258587167854829, + "grad_norm": 0.4803153872489929, + "learning_rate": 4.030124737739382e-06, + "loss": 0.1157, + "step": 5326 + }, + { + "epoch": 1.7261827608554765, + "grad_norm": 0.4614790380001068, + "learning_rate": 4.028408804534181e-06, + "loss": 0.1129, + "step": 5327 + }, + { + "epoch": 1.72650680492547, + "grad_norm": 0.45441630482673645, + "learning_rate": 4.026692990242338e-06, + "loss": 0.1099, + "step": 5328 + }, + { + "epoch": 1.7268308489954634, + "grad_norm": 0.4581306278705597, + "learning_rate": 4.02497729507385e-06, + "loss": 0.1103, + "step": 5329 + }, + { + "epoch": 1.7271548930654568, + "grad_norm": 0.4611577093601227, + "learning_rate": 4.0232617192387005e-06, + "loss": 0.1098, + "step": 5330 + }, + { + "epoch": 1.7274789371354504, + "grad_norm": 0.45392879843711853, + "learning_rate": 4.021546262946863e-06, + "loss": 0.1028, + "step": 5331 + }, + { + "epoch": 1.727802981205444, + "grad_norm": 0.4982890784740448, + "learning_rate": 4.019830926408289e-06, + "loss": 0.1239, + "step": 5332 + }, + { + "epoch": 1.7281270252754375, + "grad_norm": 0.4632214605808258, + "learning_rate": 4.018115709832923e-06, + "loss": 0.1077, + "step": 5333 + }, + { + "epoch": 1.728451069345431, + "grad_norm": 0.46273502707481384, + "learning_rate": 4.016400613430687e-06, + "loss": 0.11, + "step": 5334 + }, + { + "epoch": 1.7287751134154243, + "grad_norm": 0.4509614109992981, + "learning_rate": 4.014685637411494e-06, + "loss": 0.109, + "step": 5335 + }, + { + "epoch": 1.729099157485418, + "grad_norm": 0.4570104777812958, + "learning_rate": 4.012970781985242e-06, + "loss": 0.1065, + "step": 5336 + }, + { + "epoch": 1.7294232015554116, + "grad_norm": 0.4149811863899231, + "learning_rate": 4.011256047361809e-06, + "loss": 0.0989, + "step": 5337 + }, + { + "epoch": 1.729747245625405, + "grad_norm": 0.46206188201904297, + "learning_rate": 4.009541433751066e-06, + "loss": 0.1128, + "step": 5338 + }, + { + "epoch": 1.7300712896953985, + "grad_norm": 0.4685775935649872, + "learning_rate": 4.007826941362861e-06, + "loss": 0.1088, + "step": 5339 + }, + { + "epoch": 1.7303953337653921, + "grad_norm": 0.47735917568206787, + "learning_rate": 4.006112570407035e-06, + "loss": 0.1154, + "step": 5340 + }, + { + "epoch": 1.7307193778353858, + "grad_norm": 0.44871029257774353, + "learning_rate": 4.0043983210934086e-06, + "loss": 0.1049, + "step": 5341 + }, + { + "epoch": 1.7310434219053792, + "grad_norm": 0.4410346746444702, + "learning_rate": 4.002684193631789e-06, + "loss": 0.1031, + "step": 5342 + }, + { + "epoch": 1.7313674659753726, + "grad_norm": 0.47302764654159546, + "learning_rate": 4.0009701882319695e-06, + "loss": 0.1095, + "step": 5343 + }, + { + "epoch": 1.731691510045366, + "grad_norm": 0.4517134130001068, + "learning_rate": 3.999256305103726e-06, + "loss": 0.1041, + "step": 5344 + }, + { + "epoch": 1.7320155541153597, + "grad_norm": 0.43479084968566895, + "learning_rate": 3.9975425444568265e-06, + "loss": 0.1009, + "step": 5345 + }, + { + "epoch": 1.7323395981853533, + "grad_norm": 0.42933210730552673, + "learning_rate": 3.995828906501013e-06, + "loss": 0.0967, + "step": 5346 + }, + { + "epoch": 1.7326636422553467, + "grad_norm": 0.4475741982460022, + "learning_rate": 3.994115391446019e-06, + "loss": 0.1052, + "step": 5347 + }, + { + "epoch": 1.7329876863253402, + "grad_norm": 0.4910019338130951, + "learning_rate": 3.992401999501566e-06, + "loss": 0.1153, + "step": 5348 + }, + { + "epoch": 1.7333117303953338, + "grad_norm": 0.42681607604026794, + "learning_rate": 3.990688730877352e-06, + "loss": 0.1019, + "step": 5349 + }, + { + "epoch": 1.7336357744653272, + "grad_norm": 0.4786801040172577, + "learning_rate": 3.988975585783068e-06, + "loss": 0.1184, + "step": 5350 + }, + { + "epoch": 1.7339598185353209, + "grad_norm": 0.45410358905792236, + "learning_rate": 3.987262564428382e-06, + "loss": 0.1123, + "step": 5351 + }, + { + "epoch": 1.7342838626053143, + "grad_norm": 0.4435284733772278, + "learning_rate": 3.985549667022956e-06, + "loss": 0.1112, + "step": 5352 + }, + { + "epoch": 1.7346079066753077, + "grad_norm": 0.4696560502052307, + "learning_rate": 3.983836893776432e-06, + "loss": 0.1286, + "step": 5353 + }, + { + "epoch": 1.7349319507453014, + "grad_norm": 0.46720215678215027, + "learning_rate": 3.982124244898431e-06, + "loss": 0.1182, + "step": 5354 + }, + { + "epoch": 1.735255994815295, + "grad_norm": 0.46293580532073975, + "learning_rate": 3.980411720598571e-06, + "loss": 0.1093, + "step": 5355 + }, + { + "epoch": 1.7355800388852884, + "grad_norm": 0.44470348954200745, + "learning_rate": 3.978699321086444e-06, + "loss": 0.1007, + "step": 5356 + }, + { + "epoch": 1.7359040829552819, + "grad_norm": 0.4292317032814026, + "learning_rate": 3.976987046571636e-06, + "loss": 0.0904, + "step": 5357 + }, + { + "epoch": 1.7362281270252753, + "grad_norm": 0.45817989110946655, + "learning_rate": 3.975274897263709e-06, + "loss": 0.1143, + "step": 5358 + }, + { + "epoch": 1.736552171095269, + "grad_norm": 0.4869634807109833, + "learning_rate": 3.973562873372215e-06, + "loss": 0.1208, + "step": 5359 + }, + { + "epoch": 1.7368762151652626, + "grad_norm": 0.4258679747581482, + "learning_rate": 3.971850975106687e-06, + "loss": 0.0977, + "step": 5360 + }, + { + "epoch": 1.737200259235256, + "grad_norm": 0.45118412375450134, + "learning_rate": 3.970139202676647e-06, + "loss": 0.1053, + "step": 5361 + }, + { + "epoch": 1.7375243033052494, + "grad_norm": 0.46550124883651733, + "learning_rate": 3.9684275562916e-06, + "loss": 0.1155, + "step": 5362 + }, + { + "epoch": 1.737848347375243, + "grad_norm": 0.43546655774116516, + "learning_rate": 3.966716036161031e-06, + "loss": 0.1002, + "step": 5363 + }, + { + "epoch": 1.7381723914452365, + "grad_norm": 0.45683375000953674, + "learning_rate": 3.965004642494419e-06, + "loss": 0.1097, + "step": 5364 + }, + { + "epoch": 1.7384964355152301, + "grad_norm": 0.49274900555610657, + "learning_rate": 3.963293375501219e-06, + "loss": 0.1109, + "step": 5365 + }, + { + "epoch": 1.7388204795852236, + "grad_norm": 0.43467313051223755, + "learning_rate": 3.961582235390871e-06, + "loss": 0.1012, + "step": 5366 + }, + { + "epoch": 1.739144523655217, + "grad_norm": 0.471316933631897, + "learning_rate": 3.959871222372807e-06, + "loss": 0.1102, + "step": 5367 + }, + { + "epoch": 1.7394685677252106, + "grad_norm": 0.4421023428440094, + "learning_rate": 3.958160336656436e-06, + "loss": 0.1073, + "step": 5368 + }, + { + "epoch": 1.7397926117952043, + "grad_norm": 0.43588271737098694, + "learning_rate": 3.9564495784511535e-06, + "loss": 0.1093, + "step": 5369 + }, + { + "epoch": 1.7401166558651977, + "grad_norm": 0.4536246657371521, + "learning_rate": 3.9547389479663404e-06, + "loss": 0.1084, + "step": 5370 + }, + { + "epoch": 1.7404406999351911, + "grad_norm": 0.46806734800338745, + "learning_rate": 3.953028445411362e-06, + "loss": 0.1078, + "step": 5371 + }, + { + "epoch": 1.7407647440051845, + "grad_norm": 0.45266732573509216, + "learning_rate": 3.951318070995566e-06, + "loss": 0.1079, + "step": 5372 + }, + { + "epoch": 1.7410887880751782, + "grad_norm": 0.46891167759895325, + "learning_rate": 3.949607824928285e-06, + "loss": 0.1067, + "step": 5373 + }, + { + "epoch": 1.7414128321451718, + "grad_norm": 0.43091535568237305, + "learning_rate": 3.947897707418839e-06, + "loss": 0.1016, + "step": 5374 + }, + { + "epoch": 1.7417368762151653, + "grad_norm": 0.47220826148986816, + "learning_rate": 3.946187718676526e-06, + "loss": 0.1145, + "step": 5375 + }, + { + "epoch": 1.7420609202851587, + "grad_norm": 0.46615976095199585, + "learning_rate": 3.9444778589106375e-06, + "loss": 0.1033, + "step": 5376 + }, + { + "epoch": 1.7423849643551523, + "grad_norm": 0.4272662103176117, + "learning_rate": 3.942768128330441e-06, + "loss": 0.1008, + "step": 5377 + }, + { + "epoch": 1.742709008425146, + "grad_norm": 0.46686840057373047, + "learning_rate": 3.941058527145188e-06, + "loss": 0.1155, + "step": 5378 + }, + { + "epoch": 1.7430330524951394, + "grad_norm": 0.4431476891040802, + "learning_rate": 3.9393490555641226e-06, + "loss": 0.1115, + "step": 5379 + }, + { + "epoch": 1.7433570965651328, + "grad_norm": 0.48964086174964905, + "learning_rate": 3.937639713796463e-06, + "loss": 0.1101, + "step": 5380 + }, + { + "epoch": 1.7436811406351262, + "grad_norm": 0.4731682538986206, + "learning_rate": 3.93593050205142e-06, + "loss": 0.1205, + "step": 5381 + }, + { + "epoch": 1.74400518470512, + "grad_norm": 0.4271281063556671, + "learning_rate": 3.93422142053818e-06, + "loss": 0.0992, + "step": 5382 + }, + { + "epoch": 1.7443292287751135, + "grad_norm": 0.43892544507980347, + "learning_rate": 3.932512469465923e-06, + "loss": 0.102, + "step": 5383 + }, + { + "epoch": 1.744653272845107, + "grad_norm": 0.4575996696949005, + "learning_rate": 3.930803649043805e-06, + "loss": 0.1127, + "step": 5384 + }, + { + "epoch": 1.7449773169151004, + "grad_norm": 0.4452469050884247, + "learning_rate": 3.9290949594809676e-06, + "loss": 0.1051, + "step": 5385 + }, + { + "epoch": 1.7453013609850938, + "grad_norm": 0.44725051522254944, + "learning_rate": 3.927386400986542e-06, + "loss": 0.1077, + "step": 5386 + }, + { + "epoch": 1.7456254050550875, + "grad_norm": 0.4135163724422455, + "learning_rate": 3.925677973769636e-06, + "loss": 0.0981, + "step": 5387 + }, + { + "epoch": 1.745949449125081, + "grad_norm": 0.48988252878189087, + "learning_rate": 3.923969678039347e-06, + "loss": 0.1179, + "step": 5388 + }, + { + "epoch": 1.7462734931950745, + "grad_norm": 0.4563140571117401, + "learning_rate": 3.922261514004753e-06, + "loss": 0.1075, + "step": 5389 + }, + { + "epoch": 1.746597537265068, + "grad_norm": 0.43328210711479187, + "learning_rate": 3.920553481874916e-06, + "loss": 0.1017, + "step": 5390 + }, + { + "epoch": 1.7469215813350616, + "grad_norm": 0.481424480676651, + "learning_rate": 3.918845581858882e-06, + "loss": 0.116, + "step": 5391 + }, + { + "epoch": 1.7472456254050552, + "grad_norm": 0.4085712134838104, + "learning_rate": 3.9171378141656825e-06, + "loss": 0.0948, + "step": 5392 + }, + { + "epoch": 1.7475696694750487, + "grad_norm": 0.4701805114746094, + "learning_rate": 3.915430179004333e-06, + "loss": 0.1113, + "step": 5393 + }, + { + "epoch": 1.747893713545042, + "grad_norm": 0.5021188855171204, + "learning_rate": 3.9137226765838285e-06, + "loss": 0.1189, + "step": 5394 + }, + { + "epoch": 1.7482177576150355, + "grad_norm": 0.44054004549980164, + "learning_rate": 3.912015307113153e-06, + "loss": 0.1072, + "step": 5395 + }, + { + "epoch": 1.7485418016850292, + "grad_norm": 0.465827614068985, + "learning_rate": 3.910308070801273e-06, + "loss": 0.1104, + "step": 5396 + }, + { + "epoch": 1.7488658457550228, + "grad_norm": 0.46348854899406433, + "learning_rate": 3.908600967857134e-06, + "loss": 0.1047, + "step": 5397 + }, + { + "epoch": 1.7491898898250162, + "grad_norm": 0.4622736871242523, + "learning_rate": 3.906893998489672e-06, + "loss": 0.1125, + "step": 5398 + }, + { + "epoch": 1.7495139338950096, + "grad_norm": 0.44009384512901306, + "learning_rate": 3.9051871629078e-06, + "loss": 0.106, + "step": 5399 + }, + { + "epoch": 1.7498379779650033, + "grad_norm": 0.42945173382759094, + "learning_rate": 3.903480461320424e-06, + "loss": 0.0989, + "step": 5400 + }, + { + "epoch": 1.7501620220349967, + "grad_norm": 0.4769463837146759, + "learning_rate": 3.901773893936424e-06, + "loss": 0.117, + "step": 5401 + }, + { + "epoch": 1.7504860661049904, + "grad_norm": 0.4809808135032654, + "learning_rate": 3.900067460964667e-06, + "loss": 0.1139, + "step": 5402 + }, + { + "epoch": 1.7508101101749838, + "grad_norm": 0.40791723132133484, + "learning_rate": 3.898361162614005e-06, + "loss": 0.0923, + "step": 5403 + }, + { + "epoch": 1.7511341542449772, + "grad_norm": 0.4657929241657257, + "learning_rate": 3.89665499909327e-06, + "loss": 0.1111, + "step": 5404 + }, + { + "epoch": 1.7514581983149708, + "grad_norm": 0.483063280582428, + "learning_rate": 3.894948970611284e-06, + "loss": 0.1108, + "step": 5405 + }, + { + "epoch": 1.7517822423849645, + "grad_norm": 0.43636569380760193, + "learning_rate": 3.893243077376845e-06, + "loss": 0.1023, + "step": 5406 + }, + { + "epoch": 1.752106286454958, + "grad_norm": 0.49354323744773865, + "learning_rate": 3.891537319598741e-06, + "loss": 0.1215, + "step": 5407 + }, + { + "epoch": 1.7524303305249513, + "grad_norm": 0.44748640060424805, + "learning_rate": 3.8898316974857375e-06, + "loss": 0.1058, + "step": 5408 + }, + { + "epoch": 1.7527543745949448, + "grad_norm": 0.5163024663925171, + "learning_rate": 3.888126211246585e-06, + "loss": 0.1163, + "step": 5409 + }, + { + "epoch": 1.7530784186649384, + "grad_norm": 0.48390907049179077, + "learning_rate": 3.8864208610900234e-06, + "loss": 0.1088, + "step": 5410 + }, + { + "epoch": 1.753402462734932, + "grad_norm": 0.47634682059288025, + "learning_rate": 3.884715647224766e-06, + "loss": 0.1169, + "step": 5411 + }, + { + "epoch": 1.7537265068049255, + "grad_norm": 0.46805477142333984, + "learning_rate": 3.883010569859517e-06, + "loss": 0.1158, + "step": 5412 + }, + { + "epoch": 1.754050550874919, + "grad_norm": 0.4899493455886841, + "learning_rate": 3.881305629202963e-06, + "loss": 0.1252, + "step": 5413 + }, + { + "epoch": 1.7543745949449125, + "grad_norm": 0.470268189907074, + "learning_rate": 3.879600825463768e-06, + "loss": 0.1117, + "step": 5414 + }, + { + "epoch": 1.7546986390149062, + "grad_norm": 0.4910193681716919, + "learning_rate": 3.877896158850587e-06, + "loss": 0.1216, + "step": 5415 + }, + { + "epoch": 1.7550226830848996, + "grad_norm": 0.49009987711906433, + "learning_rate": 3.876191629572051e-06, + "loss": 0.1167, + "step": 5416 + }, + { + "epoch": 1.755346727154893, + "grad_norm": 0.44104844331741333, + "learning_rate": 3.874487237836782e-06, + "loss": 0.1074, + "step": 5417 + }, + { + "epoch": 1.7556707712248865, + "grad_norm": 0.48006612062454224, + "learning_rate": 3.872782983853378e-06, + "loss": 0.1114, + "step": 5418 + }, + { + "epoch": 1.75599481529488, + "grad_norm": 0.4347625970840454, + "learning_rate": 3.871078867830427e-06, + "loss": 0.0971, + "step": 5419 + }, + { + "epoch": 1.7563188593648738, + "grad_norm": 0.4484483301639557, + "learning_rate": 3.869374889976493e-06, + "loss": 0.1034, + "step": 5420 + }, + { + "epoch": 1.7566429034348672, + "grad_norm": 0.44795218110084534, + "learning_rate": 3.867671050500125e-06, + "loss": 0.1111, + "step": 5421 + }, + { + "epoch": 1.7569669475048606, + "grad_norm": 0.4586280882358551, + "learning_rate": 3.86596734960986e-06, + "loss": 0.1099, + "step": 5422 + }, + { + "epoch": 1.757290991574854, + "grad_norm": 0.458940327167511, + "learning_rate": 3.864263787514214e-06, + "loss": 0.0964, + "step": 5423 + }, + { + "epoch": 1.7576150356448477, + "grad_norm": 0.3986370265483856, + "learning_rate": 3.862560364421685e-06, + "loss": 0.0921, + "step": 5424 + }, + { + "epoch": 1.7579390797148413, + "grad_norm": 0.46346795558929443, + "learning_rate": 3.860857080540755e-06, + "loss": 0.1125, + "step": 5425 + }, + { + "epoch": 1.7582631237848347, + "grad_norm": 0.45013266801834106, + "learning_rate": 3.859153936079892e-06, + "loss": 0.1057, + "step": 5426 + }, + { + "epoch": 1.7585871678548282, + "grad_norm": 0.48645591735839844, + "learning_rate": 3.857450931247544e-06, + "loss": 0.1183, + "step": 5427 + }, + { + "epoch": 1.7589112119248218, + "grad_norm": 0.49759355187416077, + "learning_rate": 3.855748066252138e-06, + "loss": 0.1227, + "step": 5428 + }, + { + "epoch": 1.7592352559948155, + "grad_norm": 0.47369781136512756, + "learning_rate": 3.854045341302094e-06, + "loss": 0.1138, + "step": 5429 + }, + { + "epoch": 1.7595593000648089, + "grad_norm": 0.48496073484420776, + "learning_rate": 3.852342756605805e-06, + "loss": 0.1213, + "step": 5430 + }, + { + "epoch": 1.7598833441348023, + "grad_norm": 0.43131017684936523, + "learning_rate": 3.850640312371653e-06, + "loss": 0.1004, + "step": 5431 + }, + { + "epoch": 1.7602073882047957, + "grad_norm": 0.44668450951576233, + "learning_rate": 3.848938008808002e-06, + "loss": 0.1016, + "step": 5432 + }, + { + "epoch": 1.7605314322747894, + "grad_norm": 0.3990452289581299, + "learning_rate": 3.847235846123193e-06, + "loss": 0.0964, + "step": 5433 + }, + { + "epoch": 1.760855476344783, + "grad_norm": 0.41115233302116394, + "learning_rate": 3.845533824525558e-06, + "loss": 0.094, + "step": 5434 + }, + { + "epoch": 1.7611795204147764, + "grad_norm": 0.4076218903064728, + "learning_rate": 3.843831944223406e-06, + "loss": 0.0929, + "step": 5435 + }, + { + "epoch": 1.7615035644847699, + "grad_norm": 0.4631011486053467, + "learning_rate": 3.842130205425033e-06, + "loss": 0.1141, + "step": 5436 + }, + { + "epoch": 1.7618276085547635, + "grad_norm": 0.4253556728363037, + "learning_rate": 3.840428608338711e-06, + "loss": 0.1029, + "step": 5437 + }, + { + "epoch": 1.762151652624757, + "grad_norm": 0.45234552025794983, + "learning_rate": 3.838727153172704e-06, + "loss": 0.1103, + "step": 5438 + }, + { + "epoch": 1.7624756966947506, + "grad_norm": 0.4516621232032776, + "learning_rate": 3.837025840135252e-06, + "loss": 0.1077, + "step": 5439 + }, + { + "epoch": 1.762799740764744, + "grad_norm": 0.45661982893943787, + "learning_rate": 3.835324669434577e-06, + "loss": 0.1097, + "step": 5440 + }, + { + "epoch": 1.7631237848347374, + "grad_norm": 0.4486618638038635, + "learning_rate": 3.833623641278889e-06, + "loss": 0.1057, + "step": 5441 + }, + { + "epoch": 1.763447828904731, + "grad_norm": 0.4842875599861145, + "learning_rate": 3.831922755876374e-06, + "loss": 0.1144, + "step": 5442 + }, + { + "epoch": 1.7637718729747247, + "grad_norm": 0.4536530673503876, + "learning_rate": 3.8302220134352075e-06, + "loss": 0.1105, + "step": 5443 + }, + { + "epoch": 1.7640959170447181, + "grad_norm": 0.5151732563972473, + "learning_rate": 3.828521414163542e-06, + "loss": 0.1336, + "step": 5444 + }, + { + "epoch": 1.7644199611147116, + "grad_norm": 0.4194684624671936, + "learning_rate": 3.826820958269514e-06, + "loss": 0.0976, + "step": 5445 + }, + { + "epoch": 1.764744005184705, + "grad_norm": 0.4612726867198944, + "learning_rate": 3.825120645961245e-06, + "loss": 0.1127, + "step": 5446 + }, + { + "epoch": 1.7650680492546986, + "grad_norm": 0.47306978702545166, + "learning_rate": 3.823420477446833e-06, + "loss": 0.114, + "step": 5447 + }, + { + "epoch": 1.7653920933246923, + "grad_norm": 0.4526808261871338, + "learning_rate": 3.821720452934366e-06, + "loss": 0.1076, + "step": 5448 + }, + { + "epoch": 1.7657161373946857, + "grad_norm": 0.48292243480682373, + "learning_rate": 3.820020572631906e-06, + "loss": 0.1149, + "step": 5449 + }, + { + "epoch": 1.7660401814646791, + "grad_norm": 0.4273426830768585, + "learning_rate": 3.818320836747509e-06, + "loss": 0.0982, + "step": 5450 + }, + { + "epoch": 1.7663642255346728, + "grad_norm": 0.4303819537162781, + "learning_rate": 3.8166212454892e-06, + "loss": 0.1053, + "step": 5451 + }, + { + "epoch": 1.7666882696046662, + "grad_norm": 0.436567097902298, + "learning_rate": 3.814921799064994e-06, + "loss": 0.1023, + "step": 5452 + }, + { + "epoch": 1.7670123136746598, + "grad_norm": 0.4349686801433563, + "learning_rate": 3.813222497682889e-06, + "loss": 0.0964, + "step": 5453 + }, + { + "epoch": 1.7673363577446533, + "grad_norm": 0.4685845971107483, + "learning_rate": 3.81152334155086e-06, + "loss": 0.1088, + "step": 5454 + }, + { + "epoch": 1.7676604018146467, + "grad_norm": 0.45242762565612793, + "learning_rate": 3.80982433087687e-06, + "loss": 0.1006, + "step": 5455 + }, + { + "epoch": 1.7679844458846403, + "grad_norm": 0.4908858835697174, + "learning_rate": 3.8081254658688592e-06, + "loss": 0.1139, + "step": 5456 + }, + { + "epoch": 1.768308489954634, + "grad_norm": 0.46974053978919983, + "learning_rate": 3.8064267467347527e-06, + "loss": 0.1075, + "step": 5457 + }, + { + "epoch": 1.7686325340246274, + "grad_norm": 0.4594678282737732, + "learning_rate": 3.8047281736824593e-06, + "loss": 0.1128, + "step": 5458 + }, + { + "epoch": 1.7689565780946208, + "grad_norm": 0.4492245018482208, + "learning_rate": 3.8030297469198633e-06, + "loss": 0.1017, + "step": 5459 + }, + { + "epoch": 1.7692806221646142, + "grad_norm": 0.4470076262950897, + "learning_rate": 3.801331466654842e-06, + "loss": 0.1044, + "step": 5460 + }, + { + "epoch": 1.7696046662346079, + "grad_norm": 0.4670352637767792, + "learning_rate": 3.799633333095242e-06, + "loss": 0.1059, + "step": 5461 + }, + { + "epoch": 1.7699287103046015, + "grad_norm": 0.49116596579551697, + "learning_rate": 3.7979353464489044e-06, + "loss": 0.1194, + "step": 5462 + }, + { + "epoch": 1.770252754374595, + "grad_norm": 0.5399103164672852, + "learning_rate": 3.7962375069236433e-06, + "loss": 0.1262, + "step": 5463 + }, + { + "epoch": 1.7705767984445884, + "grad_norm": 0.5004605054855347, + "learning_rate": 3.7945398147272566e-06, + "loss": 0.1139, + "step": 5464 + }, + { + "epoch": 1.770900842514582, + "grad_norm": 0.4163171350955963, + "learning_rate": 3.7928422700675273e-06, + "loss": 0.0984, + "step": 5465 + }, + { + "epoch": 1.7712248865845757, + "grad_norm": 0.4578644037246704, + "learning_rate": 3.791144873152218e-06, + "loss": 0.107, + "step": 5466 + }, + { + "epoch": 1.771548930654569, + "grad_norm": 0.45029592514038086, + "learning_rate": 3.7894476241890743e-06, + "loss": 0.1016, + "step": 5467 + }, + { + "epoch": 1.7718729747245625, + "grad_norm": 0.483422189950943, + "learning_rate": 3.7877505233858224e-06, + "loss": 0.1181, + "step": 5468 + }, + { + "epoch": 1.772197018794556, + "grad_norm": 0.485103577375412, + "learning_rate": 3.786053570950169e-06, + "loss": 0.115, + "step": 5469 + }, + { + "epoch": 1.7725210628645496, + "grad_norm": 0.49042415618896484, + "learning_rate": 3.7843567670898085e-06, + "loss": 0.1233, + "step": 5470 + }, + { + "epoch": 1.7728451069345432, + "grad_norm": 0.42680788040161133, + "learning_rate": 3.7826601120124094e-06, + "loss": 0.1013, + "step": 5471 + }, + { + "epoch": 1.7731691510045366, + "grad_norm": 0.45376521348953247, + "learning_rate": 3.78096360592563e-06, + "loss": 0.1052, + "step": 5472 + }, + { + "epoch": 1.77349319507453, + "grad_norm": 0.42488327622413635, + "learning_rate": 3.779267249037102e-06, + "loss": 0.0959, + "step": 5473 + }, + { + "epoch": 1.7738172391445235, + "grad_norm": 0.46989819407463074, + "learning_rate": 3.777571041554447e-06, + "loss": 0.1114, + "step": 5474 + }, + { + "epoch": 1.7741412832145171, + "grad_norm": 0.47452855110168457, + "learning_rate": 3.7758749836852625e-06, + "loss": 0.1262, + "step": 5475 + }, + { + "epoch": 1.7744653272845108, + "grad_norm": 0.4259603023529053, + "learning_rate": 3.7741790756371287e-06, + "loss": 0.1007, + "step": 5476 + }, + { + "epoch": 1.7747893713545042, + "grad_norm": 0.4658740162849426, + "learning_rate": 3.77248331761761e-06, + "loss": 0.1079, + "step": 5477 + }, + { + "epoch": 1.7751134154244976, + "grad_norm": 0.39768102765083313, + "learning_rate": 3.7707877098342484e-06, + "loss": 0.0927, + "step": 5478 + }, + { + "epoch": 1.7754374594944913, + "grad_norm": 0.47288915514945984, + "learning_rate": 3.7690922524945727e-06, + "loss": 0.1129, + "step": 5479 + }, + { + "epoch": 1.775761503564485, + "grad_norm": 0.4316980838775635, + "learning_rate": 3.76739694580609e-06, + "loss": 0.1091, + "step": 5480 + }, + { + "epoch": 1.7760855476344783, + "grad_norm": 0.4133599102497101, + "learning_rate": 3.765701789976286e-06, + "loss": 0.1014, + "step": 5481 + }, + { + "epoch": 1.7764095917044718, + "grad_norm": 0.43327245116233826, + "learning_rate": 3.764006785212636e-06, + "loss": 0.1054, + "step": 5482 + }, + { + "epoch": 1.7767336357744652, + "grad_norm": 0.4340040683746338, + "learning_rate": 3.7623119317225883e-06, + "loss": 0.1028, + "step": 5483 + }, + { + "epoch": 1.7770576798444588, + "grad_norm": 0.4386005997657776, + "learning_rate": 3.76061722971358e-06, + "loss": 0.1046, + "step": 5484 + }, + { + "epoch": 1.7773817239144525, + "grad_norm": 0.4821830987930298, + "learning_rate": 3.758922679393024e-06, + "loss": 0.1166, + "step": 5485 + }, + { + "epoch": 1.777705767984446, + "grad_norm": 0.4418981671333313, + "learning_rate": 3.7572282809683174e-06, + "loss": 0.1039, + "step": 5486 + }, + { + "epoch": 1.7780298120544393, + "grad_norm": 0.44899412989616394, + "learning_rate": 3.7555340346468396e-06, + "loss": 0.1077, + "step": 5487 + }, + { + "epoch": 1.778353856124433, + "grad_norm": 0.47012320160865784, + "learning_rate": 3.7538399406359473e-06, + "loss": 0.1132, + "step": 5488 + }, + { + "epoch": 1.7786779001944264, + "grad_norm": 0.41314542293548584, + "learning_rate": 3.752145999142983e-06, + "loss": 0.0979, + "step": 5489 + }, + { + "epoch": 1.77900194426442, + "grad_norm": 0.4720423221588135, + "learning_rate": 3.7504522103752674e-06, + "loss": 0.1134, + "step": 5490 + }, + { + "epoch": 1.7793259883344135, + "grad_norm": 0.43733474612236023, + "learning_rate": 3.7487585745401058e-06, + "loss": 0.1095, + "step": 5491 + }, + { + "epoch": 1.779650032404407, + "grad_norm": 0.44804707169532776, + "learning_rate": 3.747065091844782e-06, + "loss": 0.1034, + "step": 5492 + }, + { + "epoch": 1.7799740764744005, + "grad_norm": 0.44590461254119873, + "learning_rate": 3.7453717624965595e-06, + "loss": 0.1028, + "step": 5493 + }, + { + "epoch": 1.7802981205443942, + "grad_norm": 0.44284960627555847, + "learning_rate": 3.74367858670269e-06, + "loss": 0.106, + "step": 5494 + }, + { + "epoch": 1.7806221646143876, + "grad_norm": 0.43234145641326904, + "learning_rate": 3.741985564670396e-06, + "loss": 0.1013, + "step": 5495 + }, + { + "epoch": 1.780946208684381, + "grad_norm": 0.46835172176361084, + "learning_rate": 3.7402926966068942e-06, + "loss": 0.1147, + "step": 5496 + }, + { + "epoch": 1.7812702527543745, + "grad_norm": 0.4646758735179901, + "learning_rate": 3.7385999827193692e-06, + "loss": 0.1094, + "step": 5497 + }, + { + "epoch": 1.781594296824368, + "grad_norm": 0.4845737814903259, + "learning_rate": 3.7369074232149965e-06, + "loss": 0.1239, + "step": 5498 + }, + { + "epoch": 1.7819183408943617, + "grad_norm": 0.46913474798202515, + "learning_rate": 3.7352150183009274e-06, + "loss": 0.1126, + "step": 5499 + }, + { + "epoch": 1.7822423849643552, + "grad_norm": 0.43723833560943604, + "learning_rate": 3.733522768184294e-06, + "loss": 0.0965, + "step": 5500 + }, + { + "epoch": 1.7825664290343486, + "grad_norm": 0.4722057282924652, + "learning_rate": 3.7318306730722153e-06, + "loss": 0.1103, + "step": 5501 + }, + { + "epoch": 1.7828904731043422, + "grad_norm": 0.455968976020813, + "learning_rate": 3.7301387331717832e-06, + "loss": 0.108, + "step": 5502 + }, + { + "epoch": 1.7832145171743357, + "grad_norm": 0.5014598369598389, + "learning_rate": 3.728446948690079e-06, + "loss": 0.1178, + "step": 5503 + }, + { + "epoch": 1.7835385612443293, + "grad_norm": 0.43590158224105835, + "learning_rate": 3.7267553198341566e-06, + "loss": 0.0998, + "step": 5504 + }, + { + "epoch": 1.7838626053143227, + "grad_norm": 0.4979320466518402, + "learning_rate": 3.72506384681106e-06, + "loss": 0.1187, + "step": 5505 + }, + { + "epoch": 1.7841866493843161, + "grad_norm": 0.4761984050273895, + "learning_rate": 3.723372529827805e-06, + "loss": 0.1162, + "step": 5506 + }, + { + "epoch": 1.7845106934543098, + "grad_norm": 0.4369703531265259, + "learning_rate": 3.7216813690913935e-06, + "loss": 0.0991, + "step": 5507 + }, + { + "epoch": 1.7848347375243034, + "grad_norm": 0.44826263189315796, + "learning_rate": 3.7199903648088065e-06, + "loss": 0.0997, + "step": 5508 + }, + { + "epoch": 1.7851587815942969, + "grad_norm": 0.43109968304634094, + "learning_rate": 3.7182995171870082e-06, + "loss": 0.1003, + "step": 5509 + }, + { + "epoch": 1.7854828256642903, + "grad_norm": 0.4809877276420593, + "learning_rate": 3.716608826432942e-06, + "loss": 0.1218, + "step": 5510 + }, + { + "epoch": 1.7858068697342837, + "grad_norm": 0.46688663959503174, + "learning_rate": 3.714918292753531e-06, + "loss": 0.1139, + "step": 5511 + }, + { + "epoch": 1.7861309138042774, + "grad_norm": 0.4388737082481384, + "learning_rate": 3.7132279163556784e-06, + "loss": 0.104, + "step": 5512 + }, + { + "epoch": 1.786454957874271, + "grad_norm": 0.4492657780647278, + "learning_rate": 3.711537697446274e-06, + "loss": 0.111, + "step": 5513 + }, + { + "epoch": 1.7867790019442644, + "grad_norm": 0.4479140639305115, + "learning_rate": 3.70984763623218e-06, + "loss": 0.1065, + "step": 5514 + }, + { + "epoch": 1.7871030460142578, + "grad_norm": 0.4573601186275482, + "learning_rate": 3.708157732920248e-06, + "loss": 0.1086, + "step": 5515 + }, + { + "epoch": 1.7874270900842515, + "grad_norm": 0.4481222331523895, + "learning_rate": 3.7064679877173027e-06, + "loss": 0.1009, + "step": 5516 + }, + { + "epoch": 1.7877511341542451, + "grad_norm": 0.4937301576137543, + "learning_rate": 3.704778400830153e-06, + "loss": 0.1161, + "step": 5517 + }, + { + "epoch": 1.7880751782242386, + "grad_norm": 0.4630590081214905, + "learning_rate": 3.7030889724655894e-06, + "loss": 0.1092, + "step": 5518 + }, + { + "epoch": 1.788399222294232, + "grad_norm": 0.453305184841156, + "learning_rate": 3.7013997028303792e-06, + "loss": 0.1042, + "step": 5519 + }, + { + "epoch": 1.7887232663642254, + "grad_norm": 0.4361039698123932, + "learning_rate": 3.6997105921312755e-06, + "loss": 0.1023, + "step": 5520 + }, + { + "epoch": 1.789047310434219, + "grad_norm": 0.44130071997642517, + "learning_rate": 3.6980216405750047e-06, + "loss": 0.1049, + "step": 5521 + }, + { + "epoch": 1.7893713545042127, + "grad_norm": 0.511313796043396, + "learning_rate": 3.696332848368284e-06, + "loss": 0.1254, + "step": 5522 + }, + { + "epoch": 1.7896953985742061, + "grad_norm": 0.4271393120288849, + "learning_rate": 3.6946442157178013e-06, + "loss": 0.1025, + "step": 5523 + }, + { + "epoch": 1.7900194426441995, + "grad_norm": 0.4805537164211273, + "learning_rate": 3.692955742830228e-06, + "loss": 0.1182, + "step": 5524 + }, + { + "epoch": 1.790343486714193, + "grad_norm": 0.44407039880752563, + "learning_rate": 3.691267429912221e-06, + "loss": 0.1049, + "step": 5525 + }, + { + "epoch": 1.7906675307841866, + "grad_norm": 0.44100818037986755, + "learning_rate": 3.6895792771704085e-06, + "loss": 0.1103, + "step": 5526 + }, + { + "epoch": 1.7909915748541803, + "grad_norm": 0.4450782835483551, + "learning_rate": 3.687891284811409e-06, + "loss": 0.1084, + "step": 5527 + }, + { + "epoch": 1.7913156189241737, + "grad_norm": 0.4650527834892273, + "learning_rate": 3.6862034530418135e-06, + "loss": 0.1056, + "step": 5528 + }, + { + "epoch": 1.791639662994167, + "grad_norm": 0.447388231754303, + "learning_rate": 3.684515782068197e-06, + "loss": 0.1096, + "step": 5529 + }, + { + "epoch": 1.7919637070641607, + "grad_norm": 0.44772782921791077, + "learning_rate": 3.682828272097113e-06, + "loss": 0.1051, + "step": 5530 + }, + { + "epoch": 1.7922877511341544, + "grad_norm": 0.4599211812019348, + "learning_rate": 3.681140923335098e-06, + "loss": 0.1106, + "step": 5531 + }, + { + "epoch": 1.7926117952041478, + "grad_norm": 0.4325055181980133, + "learning_rate": 3.6794537359886667e-06, + "loss": 0.0994, + "step": 5532 + }, + { + "epoch": 1.7929358392741412, + "grad_norm": 0.46257203817367554, + "learning_rate": 3.6777667102643123e-06, + "loss": 0.1054, + "step": 5533 + }, + { + "epoch": 1.7932598833441347, + "grad_norm": 0.4851507842540741, + "learning_rate": 3.676079846368514e-06, + "loss": 0.11, + "step": 5534 + }, + { + "epoch": 1.7935839274141283, + "grad_norm": 0.4444712698459625, + "learning_rate": 3.6743931445077273e-06, + "loss": 0.1056, + "step": 5535 + }, + { + "epoch": 1.793907971484122, + "grad_norm": 0.44603532552719116, + "learning_rate": 3.672706604888384e-06, + "loss": 0.1091, + "step": 5536 + }, + { + "epoch": 1.7942320155541154, + "grad_norm": 0.41911235451698303, + "learning_rate": 3.671020227716905e-06, + "loss": 0.0951, + "step": 5537 + }, + { + "epoch": 1.7945560596241088, + "grad_norm": 0.48308685421943665, + "learning_rate": 3.6693340131996823e-06, + "loss": 0.1204, + "step": 5538 + }, + { + "epoch": 1.7948801036941024, + "grad_norm": 0.43123897910118103, + "learning_rate": 3.6676479615430973e-06, + "loss": 0.0991, + "step": 5539 + }, + { + "epoch": 1.7952041477640959, + "grad_norm": 0.5153533816337585, + "learning_rate": 3.6659620729535022e-06, + "loss": 0.1234, + "step": 5540 + }, + { + "epoch": 1.7955281918340895, + "grad_norm": 0.4650425910949707, + "learning_rate": 3.6642763476372357e-06, + "loss": 0.1082, + "step": 5541 + }, + { + "epoch": 1.795852235904083, + "grad_norm": 0.4416308104991913, + "learning_rate": 3.6625907858006137e-06, + "loss": 0.1003, + "step": 5542 + }, + { + "epoch": 1.7961762799740764, + "grad_norm": 0.41461002826690674, + "learning_rate": 3.6609053876499306e-06, + "loss": 0.0952, + "step": 5543 + }, + { + "epoch": 1.79650032404407, + "grad_norm": 0.42781490087509155, + "learning_rate": 3.6592201533914662e-06, + "loss": 0.0933, + "step": 5544 + }, + { + "epoch": 1.7968243681140637, + "grad_norm": 0.498722106218338, + "learning_rate": 3.657535083231474e-06, + "loss": 0.1121, + "step": 5545 + }, + { + "epoch": 1.797148412184057, + "grad_norm": 0.44308096170425415, + "learning_rate": 3.6558501773761923e-06, + "loss": 0.1028, + "step": 5546 + }, + { + "epoch": 1.7974724562540505, + "grad_norm": 0.4877070486545563, + "learning_rate": 3.654165436031838e-06, + "loss": 0.1228, + "step": 5547 + }, + { + "epoch": 1.797796500324044, + "grad_norm": 0.46494802832603455, + "learning_rate": 3.6524808594046025e-06, + "loss": 0.1128, + "step": 5548 + }, + { + "epoch": 1.7981205443940376, + "grad_norm": 0.4507131576538086, + "learning_rate": 3.6507964477006675e-06, + "loss": 0.1131, + "step": 5549 + }, + { + "epoch": 1.7984445884640312, + "grad_norm": 0.41261738538742065, + "learning_rate": 3.6491122011261842e-06, + "loss": 0.0928, + "step": 5550 + }, + { + "epoch": 1.7987686325340246, + "grad_norm": 0.45822086930274963, + "learning_rate": 3.647428119887292e-06, + "loss": 0.1066, + "step": 5551 + }, + { + "epoch": 1.799092676604018, + "grad_norm": 0.4836665987968445, + "learning_rate": 3.645744204190101e-06, + "loss": 0.1135, + "step": 5552 + }, + { + "epoch": 1.7994167206740117, + "grad_norm": 0.43096283078193665, + "learning_rate": 3.6440604542407114e-06, + "loss": 0.0993, + "step": 5553 + }, + { + "epoch": 1.7997407647440054, + "grad_norm": 0.47783106565475464, + "learning_rate": 3.6423768702451955e-06, + "loss": 0.1168, + "step": 5554 + }, + { + "epoch": 1.8000648088139988, + "grad_norm": 0.4445684552192688, + "learning_rate": 3.6406934524096066e-06, + "loss": 0.0983, + "step": 5555 + }, + { + "epoch": 1.8003888528839922, + "grad_norm": 0.49606752395629883, + "learning_rate": 3.639010200939982e-06, + "loss": 0.1198, + "step": 5556 + }, + { + "epoch": 1.8007128969539856, + "grad_norm": 0.420172780752182, + "learning_rate": 3.637327116042331e-06, + "loss": 0.0952, + "step": 5557 + }, + { + "epoch": 1.8010369410239793, + "grad_norm": 0.4633721709251404, + "learning_rate": 3.635644197922651e-06, + "loss": 0.1064, + "step": 5558 + }, + { + "epoch": 1.801360985093973, + "grad_norm": 0.4439217150211334, + "learning_rate": 3.6339614467869135e-06, + "loss": 0.1008, + "step": 5559 + }, + { + "epoch": 1.8016850291639663, + "grad_norm": 0.46251246333122253, + "learning_rate": 3.6322788628410687e-06, + "loss": 0.1099, + "step": 5560 + }, + { + "epoch": 1.8020090732339598, + "grad_norm": 0.46781501173973083, + "learning_rate": 3.6305964462910524e-06, + "loss": 0.1144, + "step": 5561 + }, + { + "epoch": 1.8023331173039532, + "grad_norm": 0.4599486291408539, + "learning_rate": 3.6289141973427733e-06, + "loss": 0.1003, + "step": 5562 + }, + { + "epoch": 1.8026571613739468, + "grad_norm": 0.47066158056259155, + "learning_rate": 3.6272321162021247e-06, + "loss": 0.1154, + "step": 5563 + }, + { + "epoch": 1.8029812054439405, + "grad_norm": 0.48277753591537476, + "learning_rate": 3.625550203074973e-06, + "loss": 0.1131, + "step": 5564 + }, + { + "epoch": 1.803305249513934, + "grad_norm": 0.5130548477172852, + "learning_rate": 3.623868458167173e-06, + "loss": 0.1204, + "step": 5565 + }, + { + "epoch": 1.8036292935839273, + "grad_norm": 0.4539277255535126, + "learning_rate": 3.6221868816845517e-06, + "loss": 0.1028, + "step": 5566 + }, + { + "epoch": 1.803953337653921, + "grad_norm": 0.4764772355556488, + "learning_rate": 3.620505473832916e-06, + "loss": 0.1112, + "step": 5567 + }, + { + "epoch": 1.8042773817239146, + "grad_norm": 0.49271273612976074, + "learning_rate": 3.6188242348180577e-06, + "loss": 0.1194, + "step": 5568 + }, + { + "epoch": 1.804601425793908, + "grad_norm": 0.4314182698726654, + "learning_rate": 3.61714316484574e-06, + "loss": 0.1058, + "step": 5569 + }, + { + "epoch": 1.8049254698639015, + "grad_norm": 0.46068280935287476, + "learning_rate": 3.6154622641217143e-06, + "loss": 0.1167, + "step": 5570 + }, + { + "epoch": 1.8052495139338949, + "grad_norm": 0.47679510712623596, + "learning_rate": 3.613781532851702e-06, + "loss": 0.1109, + "step": 5571 + }, + { + "epoch": 1.8055735580038885, + "grad_norm": 0.47514277696609497, + "learning_rate": 3.6121009712414124e-06, + "loss": 0.1124, + "step": 5572 + }, + { + "epoch": 1.8058976020738822, + "grad_norm": 0.48157599568367004, + "learning_rate": 3.6104205794965286e-06, + "loss": 0.1187, + "step": 5573 + }, + { + "epoch": 1.8062216461438756, + "grad_norm": 0.4681515395641327, + "learning_rate": 3.6087403578227104e-06, + "loss": 0.108, + "step": 5574 + }, + { + "epoch": 1.806545690213869, + "grad_norm": 0.44759202003479004, + "learning_rate": 3.6070603064256065e-06, + "loss": 0.107, + "step": 5575 + }, + { + "epoch": 1.8068697342838627, + "grad_norm": 0.47079500555992126, + "learning_rate": 3.6053804255108344e-06, + "loss": 0.1064, + "step": 5576 + }, + { + "epoch": 1.807193778353856, + "grad_norm": 0.47174084186553955, + "learning_rate": 3.603700715283999e-06, + "loss": 0.1108, + "step": 5577 + }, + { + "epoch": 1.8075178224238497, + "grad_norm": 0.4588591456413269, + "learning_rate": 3.6020211759506795e-06, + "loss": 0.1108, + "step": 5578 + }, + { + "epoch": 1.8078418664938432, + "grad_norm": 0.45068198442459106, + "learning_rate": 3.600341807716432e-06, + "loss": 0.1127, + "step": 5579 + }, + { + "epoch": 1.8081659105638366, + "grad_norm": 0.42220938205718994, + "learning_rate": 3.5986626107867996e-06, + "loss": 0.0989, + "step": 5580 + }, + { + "epoch": 1.8084899546338302, + "grad_norm": 0.43438515067100525, + "learning_rate": 3.596983585367297e-06, + "loss": 0.0997, + "step": 5581 + }, + { + "epoch": 1.8088139987038239, + "grad_norm": 0.44364234805107117, + "learning_rate": 3.595304731663421e-06, + "loss": 0.1031, + "step": 5582 + }, + { + "epoch": 1.8091380427738173, + "grad_norm": 0.48173975944519043, + "learning_rate": 3.5936260498806476e-06, + "loss": 0.1232, + "step": 5583 + }, + { + "epoch": 1.8094620868438107, + "grad_norm": 0.47106197476387024, + "learning_rate": 3.5919475402244315e-06, + "loss": 0.1097, + "step": 5584 + }, + { + "epoch": 1.8097861309138041, + "grad_norm": 0.46400800347328186, + "learning_rate": 3.5902692029002055e-06, + "loss": 0.1185, + "step": 5585 + }, + { + "epoch": 1.8101101749837978, + "grad_norm": 0.4439902901649475, + "learning_rate": 3.5885910381133797e-06, + "loss": 0.1044, + "step": 5586 + }, + { + "epoch": 1.8104342190537914, + "grad_norm": 0.4391462802886963, + "learning_rate": 3.5869130460693504e-06, + "loss": 0.0995, + "step": 5587 + }, + { + "epoch": 1.8107582631237849, + "grad_norm": 0.46092846989631653, + "learning_rate": 3.5852352269734815e-06, + "loss": 0.1063, + "step": 5588 + }, + { + "epoch": 1.8110823071937783, + "grad_norm": 0.44837290048599243, + "learning_rate": 3.583557581031127e-06, + "loss": 0.1019, + "step": 5589 + }, + { + "epoch": 1.811406351263772, + "grad_norm": 0.48929664492607117, + "learning_rate": 3.581880108447612e-06, + "loss": 0.1208, + "step": 5590 + }, + { + "epoch": 1.8117303953337653, + "grad_norm": 0.4306938648223877, + "learning_rate": 3.5802028094282416e-06, + "loss": 0.1057, + "step": 5591 + }, + { + "epoch": 1.812054439403759, + "grad_norm": 0.4768986403942108, + "learning_rate": 3.5785256841783052e-06, + "loss": 0.1156, + "step": 5592 + }, + { + "epoch": 1.8123784834737524, + "grad_norm": 0.44823887944221497, + "learning_rate": 3.576848732903062e-06, + "loss": 0.1018, + "step": 5593 + }, + { + "epoch": 1.8127025275437458, + "grad_norm": 0.4879656136035919, + "learning_rate": 3.575171955807759e-06, + "loss": 0.1113, + "step": 5594 + }, + { + "epoch": 1.8130265716137395, + "grad_norm": 0.45889967679977417, + "learning_rate": 3.5734953530976122e-06, + "loss": 0.1064, + "step": 5595 + }, + { + "epoch": 1.8133506156837331, + "grad_norm": 0.4404192566871643, + "learning_rate": 3.571818924977827e-06, + "loss": 0.1063, + "step": 5596 + }, + { + "epoch": 1.8136746597537265, + "grad_norm": 0.4716794788837433, + "learning_rate": 3.5701426716535793e-06, + "loss": 0.1105, + "step": 5597 + }, + { + "epoch": 1.81399870382372, + "grad_norm": 0.4661045968532562, + "learning_rate": 3.5684665933300244e-06, + "loss": 0.1094, + "step": 5598 + }, + { + "epoch": 1.8143227478937134, + "grad_norm": 0.4793951213359833, + "learning_rate": 3.5667906902123027e-06, + "loss": 0.1089, + "step": 5599 + }, + { + "epoch": 1.814646791963707, + "grad_norm": 0.4394589960575104, + "learning_rate": 3.5651149625055235e-06, + "loss": 0.0979, + "step": 5600 + }, + { + "epoch": 1.8149708360337007, + "grad_norm": 0.44929248094558716, + "learning_rate": 3.563439410414784e-06, + "loss": 0.1023, + "step": 5601 + }, + { + "epoch": 1.815294880103694, + "grad_norm": 0.445188045501709, + "learning_rate": 3.5617640341451545e-06, + "loss": 0.1044, + "step": 5602 + }, + { + "epoch": 1.8156189241736875, + "grad_norm": 0.42353734374046326, + "learning_rate": 3.5600888339016827e-06, + "loss": 0.0996, + "step": 5603 + }, + { + "epoch": 1.8159429682436812, + "grad_norm": 0.4926955997943878, + "learning_rate": 3.5584138098893974e-06, + "loss": 0.1165, + "step": 5604 + }, + { + "epoch": 1.8162670123136748, + "grad_norm": 0.46474507451057434, + "learning_rate": 3.5567389623133068e-06, + "loss": 0.1109, + "step": 5605 + }, + { + "epoch": 1.8165910563836682, + "grad_norm": 0.43861955404281616, + "learning_rate": 3.555064291378396e-06, + "loss": 0.1047, + "step": 5606 + }, + { + "epoch": 1.8169151004536617, + "grad_norm": 0.4347488582134247, + "learning_rate": 3.5533897972896263e-06, + "loss": 0.1049, + "step": 5607 + }, + { + "epoch": 1.817239144523655, + "grad_norm": 0.47579696774482727, + "learning_rate": 3.5517154802519432e-06, + "loss": 0.1033, + "step": 5608 + }, + { + "epoch": 1.8175631885936487, + "grad_norm": 0.4459049105644226, + "learning_rate": 3.550041340470265e-06, + "loss": 0.1096, + "step": 5609 + }, + { + "epoch": 1.8178872326636424, + "grad_norm": 0.4174961447715759, + "learning_rate": 3.5483673781494876e-06, + "loss": 0.1017, + "step": 5610 + }, + { + "epoch": 1.8182112767336358, + "grad_norm": 0.47640395164489746, + "learning_rate": 3.5466935934944917e-06, + "loss": 0.1145, + "step": 5611 + }, + { + "epoch": 1.8185353208036292, + "grad_norm": 0.4541381895542145, + "learning_rate": 3.5450199867101298e-06, + "loss": 0.1034, + "step": 5612 + }, + { + "epoch": 1.8188593648736227, + "grad_norm": 0.4641683101654053, + "learning_rate": 3.5433465580012377e-06, + "loss": 0.1022, + "step": 5613 + }, + { + "epoch": 1.8191834089436163, + "grad_norm": 0.4748760163784027, + "learning_rate": 3.5416733075726258e-06, + "loss": 0.1105, + "step": 5614 + }, + { + "epoch": 1.81950745301361, + "grad_norm": 0.47564688324928284, + "learning_rate": 3.5400002356290817e-06, + "loss": 0.1144, + "step": 5615 + }, + { + "epoch": 1.8198314970836034, + "grad_norm": 0.44213637709617615, + "learning_rate": 3.5383273423753766e-06, + "loss": 0.1012, + "step": 5616 + }, + { + "epoch": 1.8201555411535968, + "grad_norm": 0.44395461678504944, + "learning_rate": 3.536654628016252e-06, + "loss": 0.1014, + "step": 5617 + }, + { + "epoch": 1.8204795852235904, + "grad_norm": 0.4228231906890869, + "learning_rate": 3.534982092756437e-06, + "loss": 0.0976, + "step": 5618 + }, + { + "epoch": 1.820803629293584, + "grad_norm": 0.4690253436565399, + "learning_rate": 3.53330973680063e-06, + "loss": 0.1181, + "step": 5619 + }, + { + "epoch": 1.8211276733635775, + "grad_norm": 0.42680132389068604, + "learning_rate": 3.5316375603535135e-06, + "loss": 0.1057, + "step": 5620 + }, + { + "epoch": 1.821451717433571, + "grad_norm": 0.42568618059158325, + "learning_rate": 3.5299655636197454e-06, + "loss": 0.0944, + "step": 5621 + }, + { + "epoch": 1.8217757615035644, + "grad_norm": 0.44182854890823364, + "learning_rate": 3.528293746803959e-06, + "loss": 0.1015, + "step": 5622 + }, + { + "epoch": 1.822099805573558, + "grad_norm": 0.4019891917705536, + "learning_rate": 3.5266221101107735e-06, + "loss": 0.0953, + "step": 5623 + }, + { + "epoch": 1.8224238496435516, + "grad_norm": 0.4538567364215851, + "learning_rate": 3.5249506537447763e-06, + "loss": 0.1021, + "step": 5624 + }, + { + "epoch": 1.822747893713545, + "grad_norm": 0.4837633967399597, + "learning_rate": 3.523279377910541e-06, + "loss": 0.1178, + "step": 5625 + }, + { + "epoch": 1.8230719377835385, + "grad_norm": 0.4749487340450287, + "learning_rate": 3.521608282812613e-06, + "loss": 0.1084, + "step": 5626 + }, + { + "epoch": 1.8233959818535321, + "grad_norm": 0.5325826406478882, + "learning_rate": 3.519937368655519e-06, + "loss": 0.1255, + "step": 5627 + }, + { + "epoch": 1.8237200259235256, + "grad_norm": 0.526648759841919, + "learning_rate": 3.5182666356437646e-06, + "loss": 0.1211, + "step": 5628 + }, + { + "epoch": 1.8240440699935192, + "grad_norm": 0.4531908929347992, + "learning_rate": 3.516596083981827e-06, + "loss": 0.0992, + "step": 5629 + }, + { + "epoch": 1.8243681140635126, + "grad_norm": 0.492156058549881, + "learning_rate": 3.514925713874171e-06, + "loss": 0.1204, + "step": 5630 + }, + { + "epoch": 1.824692158133506, + "grad_norm": 0.47805356979370117, + "learning_rate": 3.513255525525228e-06, + "loss": 0.1151, + "step": 5631 + }, + { + "epoch": 1.8250162022034997, + "grad_norm": 0.47491201758384705, + "learning_rate": 3.5115855191394187e-06, + "loss": 0.1097, + "step": 5632 + }, + { + "epoch": 1.8253402462734933, + "grad_norm": 0.47285890579223633, + "learning_rate": 3.5099156949211323e-06, + "loss": 0.1123, + "step": 5633 + }, + { + "epoch": 1.8256642903434868, + "grad_norm": 0.46936270594596863, + "learning_rate": 3.508246053074738e-06, + "loss": 0.104, + "step": 5634 + }, + { + "epoch": 1.8259883344134802, + "grad_norm": 0.43193650245666504, + "learning_rate": 3.5065765938045883e-06, + "loss": 0.0975, + "step": 5635 + }, + { + "epoch": 1.8263123784834736, + "grad_norm": 0.47375643253326416, + "learning_rate": 3.504907317315004e-06, + "loss": 0.1088, + "step": 5636 + }, + { + "epoch": 1.8266364225534673, + "grad_norm": 0.4534396827220917, + "learning_rate": 3.5032382238102912e-06, + "loss": 0.0999, + "step": 5637 + }, + { + "epoch": 1.826960466623461, + "grad_norm": 0.46225836873054504, + "learning_rate": 3.5015693134947287e-06, + "loss": 0.104, + "step": 5638 + }, + { + "epoch": 1.8272845106934543, + "grad_norm": 0.45612695813179016, + "learning_rate": 3.499900586572578e-06, + "loss": 0.1095, + "step": 5639 + }, + { + "epoch": 1.8276085547634477, + "grad_norm": 0.455872118473053, + "learning_rate": 3.4982320432480736e-06, + "loss": 0.1048, + "step": 5640 + }, + { + "epoch": 1.8279325988334414, + "grad_norm": 0.4655687212944031, + "learning_rate": 3.4965636837254267e-06, + "loss": 0.1144, + "step": 5641 + }, + { + "epoch": 1.8282566429034348, + "grad_norm": 0.4484519362449646, + "learning_rate": 3.494895508208833e-06, + "loss": 0.1062, + "step": 5642 + }, + { + "epoch": 1.8285806869734285, + "grad_norm": 0.4629919230937958, + "learning_rate": 3.493227516902456e-06, + "loss": 0.1033, + "step": 5643 + }, + { + "epoch": 1.8289047310434219, + "grad_norm": 0.4088801145553589, + "learning_rate": 3.4915597100104464e-06, + "loss": 0.0981, + "step": 5644 + }, + { + "epoch": 1.8292287751134153, + "grad_norm": 0.4668869376182556, + "learning_rate": 3.489892087736926e-06, + "loss": 0.1155, + "step": 5645 + }, + { + "epoch": 1.829552819183409, + "grad_norm": 0.47804510593414307, + "learning_rate": 3.4882246502859937e-06, + "loss": 0.1154, + "step": 5646 + }, + { + "epoch": 1.8298768632534026, + "grad_norm": 0.48258739709854126, + "learning_rate": 3.4865573978617295e-06, + "loss": 0.1073, + "step": 5647 + }, + { + "epoch": 1.830200907323396, + "grad_norm": 0.44823774695396423, + "learning_rate": 3.4848903306681868e-06, + "loss": 0.1016, + "step": 5648 + }, + { + "epoch": 1.8305249513933894, + "grad_norm": 0.4402213990688324, + "learning_rate": 3.483223448909403e-06, + "loss": 0.1098, + "step": 5649 + }, + { + "epoch": 1.8308489954633829, + "grad_norm": 0.4777219891548157, + "learning_rate": 3.4815567527893823e-06, + "loss": 0.1184, + "step": 5650 + }, + { + "epoch": 1.8311730395333765, + "grad_norm": 0.47285157442092896, + "learning_rate": 3.4798902425121185e-06, + "loss": 0.1162, + "step": 5651 + }, + { + "epoch": 1.8314970836033702, + "grad_norm": 0.45721814036369324, + "learning_rate": 3.4782239182815725e-06, + "loss": 0.1114, + "step": 5652 + }, + { + "epoch": 1.8318211276733636, + "grad_norm": 0.48380038142204285, + "learning_rate": 3.4765577803016852e-06, + "loss": 0.1142, + "step": 5653 + }, + { + "epoch": 1.832145171743357, + "grad_norm": 0.48291128873825073, + "learning_rate": 3.4748918287763798e-06, + "loss": 0.1152, + "step": 5654 + }, + { + "epoch": 1.8324692158133506, + "grad_norm": 0.43343526124954224, + "learning_rate": 3.4732260639095493e-06, + "loss": 0.097, + "step": 5655 + }, + { + "epoch": 1.8327932598833443, + "grad_norm": 0.4707466661930084, + "learning_rate": 3.471560485905068e-06, + "loss": 0.1117, + "step": 5656 + }, + { + "epoch": 1.8331173039533377, + "grad_norm": 0.4821437895298004, + "learning_rate": 3.4698950949667875e-06, + "loss": 0.1071, + "step": 5657 + }, + { + "epoch": 1.8334413480233311, + "grad_norm": 0.46275627613067627, + "learning_rate": 3.4682298912985344e-06, + "loss": 0.113, + "step": 5658 + }, + { + "epoch": 1.8337653920933246, + "grad_norm": 0.4579031467437744, + "learning_rate": 3.466564875104115e-06, + "loss": 0.1102, + "step": 5659 + }, + { + "epoch": 1.8340894361633182, + "grad_norm": 0.4628545045852661, + "learning_rate": 3.4649000465873073e-06, + "loss": 0.0974, + "step": 5660 + }, + { + "epoch": 1.8344134802333119, + "grad_norm": 0.4802587628364563, + "learning_rate": 3.463235405951876e-06, + "loss": 0.1185, + "step": 5661 + }, + { + "epoch": 1.8347375243033053, + "grad_norm": 0.4990658760070801, + "learning_rate": 3.4615709534015512e-06, + "loss": 0.1284, + "step": 5662 + }, + { + "epoch": 1.8350615683732987, + "grad_norm": 0.498654842376709, + "learning_rate": 3.4599066891400507e-06, + "loss": 0.111, + "step": 5663 + }, + { + "epoch": 1.8353856124432921, + "grad_norm": 0.4472542107105255, + "learning_rate": 3.4582426133710623e-06, + "loss": 0.1071, + "step": 5664 + }, + { + "epoch": 1.8357096565132858, + "grad_norm": 0.48664551973342896, + "learning_rate": 3.4565787262982507e-06, + "loss": 0.1136, + "step": 5665 + }, + { + "epoch": 1.8360337005832794, + "grad_norm": 0.4686160385608673, + "learning_rate": 3.4549150281252635e-06, + "loss": 0.1135, + "step": 5666 + }, + { + "epoch": 1.8363577446532728, + "grad_norm": 0.45993703603744507, + "learning_rate": 3.4532515190557183e-06, + "loss": 0.1071, + "step": 5667 + }, + { + "epoch": 1.8366817887232663, + "grad_norm": 0.475545197725296, + "learning_rate": 3.451588199293214e-06, + "loss": 0.108, + "step": 5668 + }, + { + "epoch": 1.83700583279326, + "grad_norm": 0.4416792392730713, + "learning_rate": 3.4499250690413246e-06, + "loss": 0.1078, + "step": 5669 + }, + { + "epoch": 1.8373298768632536, + "grad_norm": 0.45210230350494385, + "learning_rate": 3.4482621285035996e-06, + "loss": 0.1058, + "step": 5670 + }, + { + "epoch": 1.837653920933247, + "grad_norm": 0.4606115221977234, + "learning_rate": 3.4465993778835692e-06, + "loss": 0.1112, + "step": 5671 + }, + { + "epoch": 1.8379779650032404, + "grad_norm": 0.4382837414741516, + "learning_rate": 3.4449368173847354e-06, + "loss": 0.0989, + "step": 5672 + }, + { + "epoch": 1.8383020090732338, + "grad_norm": 0.42500272393226624, + "learning_rate": 3.443274447210583e-06, + "loss": 0.1037, + "step": 5673 + }, + { + "epoch": 1.8386260531432275, + "grad_norm": 0.45148107409477234, + "learning_rate": 3.4416122675645656e-06, + "loss": 0.1102, + "step": 5674 + }, + { + "epoch": 1.8389500972132211, + "grad_norm": 0.426503986120224, + "learning_rate": 3.4399502786501227e-06, + "loss": 0.1035, + "step": 5675 + }, + { + "epoch": 1.8392741412832145, + "grad_norm": 0.4459628760814667, + "learning_rate": 3.438288480670663e-06, + "loss": 0.1039, + "step": 5676 + }, + { + "epoch": 1.839598185353208, + "grad_norm": 0.4436597526073456, + "learning_rate": 3.4366268738295733e-06, + "loss": 0.1022, + "step": 5677 + }, + { + "epoch": 1.8399222294232016, + "grad_norm": 0.5080084204673767, + "learning_rate": 3.43496545833022e-06, + "loss": 0.1139, + "step": 5678 + }, + { + "epoch": 1.840246273493195, + "grad_norm": 0.44031599164009094, + "learning_rate": 3.433304234375944e-06, + "loss": 0.108, + "step": 5679 + }, + { + "epoch": 1.8405703175631887, + "grad_norm": 0.4491998553276062, + "learning_rate": 3.4316432021700636e-06, + "loss": 0.1006, + "step": 5680 + }, + { + "epoch": 1.840894361633182, + "grad_norm": 0.4462529122829437, + "learning_rate": 3.4299823619158722e-06, + "loss": 0.1104, + "step": 5681 + }, + { + "epoch": 1.8412184057031755, + "grad_norm": 0.4552091360092163, + "learning_rate": 3.428321713816639e-06, + "loss": 0.1062, + "step": 5682 + }, + { + "epoch": 1.8415424497731692, + "grad_norm": 0.4409257173538208, + "learning_rate": 3.4266612580756155e-06, + "loss": 0.1011, + "step": 5683 + }, + { + "epoch": 1.8418664938431628, + "grad_norm": 0.4279926121234894, + "learning_rate": 3.4250009948960205e-06, + "loss": 0.0936, + "step": 5684 + }, + { + "epoch": 1.8421905379131562, + "grad_norm": 0.4621392786502838, + "learning_rate": 3.423340924481059e-06, + "loss": 0.1024, + "step": 5685 + }, + { + "epoch": 1.8425145819831497, + "grad_norm": 0.46478646993637085, + "learning_rate": 3.4216810470339022e-06, + "loss": 0.1095, + "step": 5686 + }, + { + "epoch": 1.842838626053143, + "grad_norm": 0.40607044100761414, + "learning_rate": 3.4200213627577087e-06, + "loss": 0.0902, + "step": 5687 + }, + { + "epoch": 1.8431626701231367, + "grad_norm": 0.4548855423927307, + "learning_rate": 3.418361871855605e-06, + "loss": 0.1119, + "step": 5688 + }, + { + "epoch": 1.8434867141931304, + "grad_norm": 0.4773532748222351, + "learning_rate": 3.4167025745306954e-06, + "loss": 0.1181, + "step": 5689 + }, + { + "epoch": 1.8438107582631238, + "grad_norm": 0.4710024893283844, + "learning_rate": 3.4150434709860648e-06, + "loss": 0.1134, + "step": 5690 + }, + { + "epoch": 1.8441348023331172, + "grad_norm": 0.45383182168006897, + "learning_rate": 3.4133845614247667e-06, + "loss": 0.1053, + "step": 5691 + }, + { + "epoch": 1.8444588464031109, + "grad_norm": 0.4348435699939728, + "learning_rate": 3.4117258460498414e-06, + "loss": 0.1008, + "step": 5692 + }, + { + "epoch": 1.8447828904731045, + "grad_norm": 0.4294411242008209, + "learning_rate": 3.4100673250642967e-06, + "loss": 0.0993, + "step": 5693 + }, + { + "epoch": 1.845106934543098, + "grad_norm": 0.44455474615097046, + "learning_rate": 3.408408998671118e-06, + "loss": 0.1039, + "step": 5694 + }, + { + "epoch": 1.8454309786130914, + "grad_norm": 0.46732982993125916, + "learning_rate": 3.4067508670732712e-06, + "loss": 0.1103, + "step": 5695 + }, + { + "epoch": 1.8457550226830848, + "grad_norm": 0.47106361389160156, + "learning_rate": 3.405092930473693e-06, + "loss": 0.1151, + "step": 5696 + }, + { + "epoch": 1.8460790667530784, + "grad_norm": 0.45745667815208435, + "learning_rate": 3.403435189075302e-06, + "loss": 0.1103, + "step": 5697 + }, + { + "epoch": 1.846403110823072, + "grad_norm": 0.43495872616767883, + "learning_rate": 3.4017776430809866e-06, + "loss": 0.1061, + "step": 5698 + }, + { + "epoch": 1.8467271548930655, + "grad_norm": 0.4372204542160034, + "learning_rate": 3.4001202926936177e-06, + "loss": 0.1018, + "step": 5699 + }, + { + "epoch": 1.847051198963059, + "grad_norm": 0.4519484341144562, + "learning_rate": 3.3984631381160355e-06, + "loss": 0.1047, + "step": 5700 + }, + { + "epoch": 1.8473752430330523, + "grad_norm": 0.4064551889896393, + "learning_rate": 3.396806179551061e-06, + "loss": 0.0904, + "step": 5701 + }, + { + "epoch": 1.847699287103046, + "grad_norm": 0.45493823289871216, + "learning_rate": 3.395149417201491e-06, + "loss": 0.1089, + "step": 5702 + }, + { + "epoch": 1.8480233311730396, + "grad_norm": 0.45996221899986267, + "learning_rate": 3.3934928512700936e-06, + "loss": 0.1034, + "step": 5703 + }, + { + "epoch": 1.848347375243033, + "grad_norm": 0.46049776673316956, + "learning_rate": 3.3918364819596222e-06, + "loss": 0.1139, + "step": 5704 + }, + { + "epoch": 1.8486714193130265, + "grad_norm": 0.47365984320640564, + "learning_rate": 3.390180309472796e-06, + "loss": 0.1114, + "step": 5705 + }, + { + "epoch": 1.8489954633830201, + "grad_norm": 0.44574666023254395, + "learning_rate": 3.388524334012315e-06, + "loss": 0.1061, + "step": 5706 + }, + { + "epoch": 1.8493195074530138, + "grad_norm": 0.4225917160511017, + "learning_rate": 3.386868555780856e-06, + "loss": 0.1057, + "step": 5707 + }, + { + "epoch": 1.8496435515230072, + "grad_norm": 0.4410113990306854, + "learning_rate": 3.385212974981068e-06, + "loss": 0.108, + "step": 5708 + }, + { + "epoch": 1.8499675955930006, + "grad_norm": 0.4146822392940521, + "learning_rate": 3.3835575918155814e-06, + "loss": 0.1014, + "step": 5709 + }, + { + "epoch": 1.850291639662994, + "grad_norm": 0.4648820161819458, + "learning_rate": 3.3819024064869967e-06, + "loss": 0.1209, + "step": 5710 + }, + { + "epoch": 1.8506156837329877, + "grad_norm": 0.48059096932411194, + "learning_rate": 3.3802474191978927e-06, + "loss": 0.1197, + "step": 5711 + }, + { + "epoch": 1.8509397278029813, + "grad_norm": 0.42399024963378906, + "learning_rate": 3.3785926301508255e-06, + "loss": 0.0955, + "step": 5712 + }, + { + "epoch": 1.8512637718729748, + "grad_norm": 0.4546625316143036, + "learning_rate": 3.3769380395483215e-06, + "loss": 0.1038, + "step": 5713 + }, + { + "epoch": 1.8515878159429682, + "grad_norm": 0.5044828057289124, + "learning_rate": 3.3752836475928906e-06, + "loss": 0.1213, + "step": 5714 + }, + { + "epoch": 1.8519118600129616, + "grad_norm": 0.48981040716171265, + "learning_rate": 3.3736294544870114e-06, + "loss": 0.1122, + "step": 5715 + }, + { + "epoch": 1.8522359040829552, + "grad_norm": 0.4367123544216156, + "learning_rate": 3.3719754604331447e-06, + "loss": 0.102, + "step": 5716 + }, + { + "epoch": 1.8525599481529489, + "grad_norm": 0.45596742630004883, + "learning_rate": 3.3703216656337177e-06, + "loss": 0.1072, + "step": 5717 + }, + { + "epoch": 1.8528839922229423, + "grad_norm": 0.4703064262866974, + "learning_rate": 3.3686680702911456e-06, + "loss": 0.1086, + "step": 5718 + }, + { + "epoch": 1.8532080362929357, + "grad_norm": 0.45928290486335754, + "learning_rate": 3.367014674607809e-06, + "loss": 0.1013, + "step": 5719 + }, + { + "epoch": 1.8535320803629294, + "grad_norm": 0.4606897234916687, + "learning_rate": 3.3653614787860667e-06, + "loss": 0.1059, + "step": 5720 + }, + { + "epoch": 1.853856124432923, + "grad_norm": 0.4154520332813263, + "learning_rate": 3.3637084830282545e-06, + "loss": 0.0918, + "step": 5721 + }, + { + "epoch": 1.8541801685029164, + "grad_norm": 0.47586536407470703, + "learning_rate": 3.3620556875366837e-06, + "loss": 0.1091, + "step": 5722 + }, + { + "epoch": 1.8545042125729099, + "grad_norm": 0.47497645020484924, + "learning_rate": 3.360403092513641e-06, + "loss": 0.1187, + "step": 5723 + }, + { + "epoch": 1.8548282566429033, + "grad_norm": 0.4510015547275543, + "learning_rate": 3.3587506981613877e-06, + "loss": 0.1117, + "step": 5724 + }, + { + "epoch": 1.855152300712897, + "grad_norm": 0.4139556586742401, + "learning_rate": 3.357098504682158e-06, + "loss": 0.0932, + "step": 5725 + }, + { + "epoch": 1.8554763447828906, + "grad_norm": 0.43587085604667664, + "learning_rate": 3.355446512278169e-06, + "loss": 0.1044, + "step": 5726 + }, + { + "epoch": 1.855800388852884, + "grad_norm": 0.44422632455825806, + "learning_rate": 3.3537947211516043e-06, + "loss": 0.1004, + "step": 5727 + }, + { + "epoch": 1.8561244329228774, + "grad_norm": 0.43391063809394836, + "learning_rate": 3.3521431315046317e-06, + "loss": 0.0964, + "step": 5728 + }, + { + "epoch": 1.856448476992871, + "grad_norm": 0.42529645562171936, + "learning_rate": 3.3504917435393857e-06, + "loss": 0.1007, + "step": 5729 + }, + { + "epoch": 1.8567725210628645, + "grad_norm": 0.45272016525268555, + "learning_rate": 3.348840557457982e-06, + "loss": 0.1075, + "step": 5730 + }, + { + "epoch": 1.8570965651328581, + "grad_norm": 0.43974587321281433, + "learning_rate": 3.3471895734625106e-06, + "loss": 0.1019, + "step": 5731 + }, + { + "epoch": 1.8574206092028516, + "grad_norm": 0.48083487153053284, + "learning_rate": 3.3455387917550344e-06, + "loss": 0.1127, + "step": 5732 + }, + { + "epoch": 1.857744653272845, + "grad_norm": 0.47117093205451965, + "learning_rate": 3.343888212537594e-06, + "loss": 0.1122, + "step": 5733 + }, + { + "epoch": 1.8580686973428386, + "grad_norm": 0.5059354901313782, + "learning_rate": 3.342237836012202e-06, + "loss": 0.1219, + "step": 5734 + }, + { + "epoch": 1.8583927414128323, + "grad_norm": 0.4689786434173584, + "learning_rate": 3.3405876623808525e-06, + "loss": 0.1059, + "step": 5735 + }, + { + "epoch": 1.8587167854828257, + "grad_norm": 0.4725983440876007, + "learning_rate": 3.338937691845509e-06, + "loss": 0.1063, + "step": 5736 + }, + { + "epoch": 1.8590408295528191, + "grad_norm": 0.4368756413459778, + "learning_rate": 3.3372879246081096e-06, + "loss": 0.1061, + "step": 5737 + }, + { + "epoch": 1.8593648736228126, + "grad_norm": 0.42094147205352783, + "learning_rate": 3.3356383608705746e-06, + "loss": 0.0944, + "step": 5738 + }, + { + "epoch": 1.8596889176928062, + "grad_norm": 0.4665633738040924, + "learning_rate": 3.3339890008347888e-06, + "loss": 0.1064, + "step": 5739 + }, + { + "epoch": 1.8600129617627998, + "grad_norm": 0.457518070936203, + "learning_rate": 3.3323398447026235e-06, + "loss": 0.1102, + "step": 5740 + }, + { + "epoch": 1.8603370058327933, + "grad_norm": 0.4493125379085541, + "learning_rate": 3.3306908926759163e-06, + "loss": 0.11, + "step": 5741 + }, + { + "epoch": 1.8606610499027867, + "grad_norm": 0.4822162985801697, + "learning_rate": 3.3290421449564846e-06, + "loss": 0.1196, + "step": 5742 + }, + { + "epoch": 1.8609850939727803, + "grad_norm": 0.47358089685440063, + "learning_rate": 3.327393601746117e-06, + "loss": 0.115, + "step": 5743 + }, + { + "epoch": 1.861309138042774, + "grad_norm": 0.39975741505622864, + "learning_rate": 3.3257452632465804e-06, + "loss": 0.0873, + "step": 5744 + }, + { + "epoch": 1.8616331821127674, + "grad_norm": 0.43184319138526917, + "learning_rate": 3.324097129659617e-06, + "loss": 0.093, + "step": 5745 + }, + { + "epoch": 1.8619572261827608, + "grad_norm": 0.45764079689979553, + "learning_rate": 3.3224492011869387e-06, + "loss": 0.1067, + "step": 5746 + }, + { + "epoch": 1.8622812702527543, + "grad_norm": 0.41477134823799133, + "learning_rate": 3.32080147803024e-06, + "loss": 0.0948, + "step": 5747 + }, + { + "epoch": 1.862605314322748, + "grad_norm": 0.4603864848613739, + "learning_rate": 3.319153960391185e-06, + "loss": 0.1203, + "step": 5748 + }, + { + "epoch": 1.8629293583927415, + "grad_norm": 0.4607555568218231, + "learning_rate": 3.3175066484714107e-06, + "loss": 0.1018, + "step": 5749 + }, + { + "epoch": 1.863253402462735, + "grad_norm": 0.439416766166687, + "learning_rate": 3.315859542472537e-06, + "loss": 0.0984, + "step": 5750 + }, + { + "epoch": 1.8635774465327284, + "grad_norm": 0.5201691389083862, + "learning_rate": 3.3142126425961506e-06, + "loss": 0.1164, + "step": 5751 + }, + { + "epoch": 1.8639014906027218, + "grad_norm": 0.47248104214668274, + "learning_rate": 3.3125659490438177e-06, + "loss": 0.1102, + "step": 5752 + }, + { + "epoch": 1.8642255346727155, + "grad_norm": 0.535966157913208, + "learning_rate": 3.3109194620170766e-06, + "loss": 0.1067, + "step": 5753 + }, + { + "epoch": 1.864549578742709, + "grad_norm": 0.503506064414978, + "learning_rate": 3.3092731817174427e-06, + "loss": 0.0935, + "step": 5754 + }, + { + "epoch": 1.8648736228127025, + "grad_norm": 0.4545847773551941, + "learning_rate": 3.307627108346404e-06, + "loss": 0.1075, + "step": 5755 + }, + { + "epoch": 1.865197666882696, + "grad_norm": 0.486439049243927, + "learning_rate": 3.3059812421054214e-06, + "loss": 0.116, + "step": 5756 + }, + { + "epoch": 1.8655217109526896, + "grad_norm": 0.46297916769981384, + "learning_rate": 3.3043355831959376e-06, + "loss": 0.1089, + "step": 5757 + }, + { + "epoch": 1.8658457550226832, + "grad_norm": 0.46269655227661133, + "learning_rate": 3.302690131819361e-06, + "loss": 0.1092, + "step": 5758 + }, + { + "epoch": 1.8661697990926767, + "grad_norm": 0.4384731948375702, + "learning_rate": 3.301044888177083e-06, + "loss": 0.1047, + "step": 5759 + }, + { + "epoch": 1.86649384316267, + "grad_norm": 0.48540347814559937, + "learning_rate": 3.299399852470464e-06, + "loss": 0.1175, + "step": 5760 + }, + { + "epoch": 1.8668178872326635, + "grad_norm": 0.44267207384109497, + "learning_rate": 3.2977550249008377e-06, + "loss": 0.1014, + "step": 5761 + }, + { + "epoch": 1.8671419313026572, + "grad_norm": 0.4874940514564514, + "learning_rate": 3.2961104056695194e-06, + "loss": 0.124, + "step": 5762 + }, + { + "epoch": 1.8674659753726508, + "grad_norm": 0.4731050431728363, + "learning_rate": 3.294465994977791e-06, + "loss": 0.1135, + "step": 5763 + }, + { + "epoch": 1.8677900194426442, + "grad_norm": 0.42587292194366455, + "learning_rate": 3.2928217930269155e-06, + "loss": 0.1033, + "step": 5764 + }, + { + "epoch": 1.8681140635126376, + "grad_norm": 0.46378660202026367, + "learning_rate": 3.291177800018124e-06, + "loss": 0.1159, + "step": 5765 + }, + { + "epoch": 1.8684381075826313, + "grad_norm": 0.4911424219608307, + "learning_rate": 3.289534016152629e-06, + "loss": 0.1149, + "step": 5766 + }, + { + "epoch": 1.8687621516526247, + "grad_norm": 0.464822381734848, + "learning_rate": 3.2878904416316116e-06, + "loss": 0.1121, + "step": 5767 + }, + { + "epoch": 1.8690861957226184, + "grad_norm": 0.4543420672416687, + "learning_rate": 3.286247076656227e-06, + "loss": 0.1095, + "step": 5768 + }, + { + "epoch": 1.8694102397926118, + "grad_norm": 0.44206535816192627, + "learning_rate": 3.2846039214276127e-06, + "loss": 0.1103, + "step": 5769 + }, + { + "epoch": 1.8697342838626052, + "grad_norm": 0.434787392616272, + "learning_rate": 3.28296097614687e-06, + "loss": 0.1026, + "step": 5770 + }, + { + "epoch": 1.8700583279325989, + "grad_norm": 0.43137770891189575, + "learning_rate": 3.2813182410150834e-06, + "loss": 0.1095, + "step": 5771 + }, + { + "epoch": 1.8703823720025925, + "grad_norm": 0.4367537200450897, + "learning_rate": 3.279675716233306e-06, + "loss": 0.1023, + "step": 5772 + }, + { + "epoch": 1.870706416072586, + "grad_norm": 0.4092714190483093, + "learning_rate": 3.278033402002565e-06, + "loss": 0.0964, + "step": 5773 + }, + { + "epoch": 1.8710304601425793, + "grad_norm": 0.4678616523742676, + "learning_rate": 3.276391298523868e-06, + "loss": 0.1116, + "step": 5774 + }, + { + "epoch": 1.8713545042125728, + "grad_norm": 0.4475710093975067, + "learning_rate": 3.2747494059981887e-06, + "loss": 0.1023, + "step": 5775 + }, + { + "epoch": 1.8716785482825664, + "grad_norm": 0.44051387906074524, + "learning_rate": 3.273107724626481e-06, + "loss": 0.1134, + "step": 5776 + }, + { + "epoch": 1.87200259235256, + "grad_norm": 0.43110376596450806, + "learning_rate": 3.2714662546096686e-06, + "loss": 0.0995, + "step": 5777 + }, + { + "epoch": 1.8723266364225535, + "grad_norm": 0.4657581150531769, + "learning_rate": 3.2698249961486556e-06, + "loss": 0.1134, + "step": 5778 + }, + { + "epoch": 1.872650680492547, + "grad_norm": 0.4725774824619293, + "learning_rate": 3.2681839494443137e-06, + "loss": 0.1033, + "step": 5779 + }, + { + "epoch": 1.8729747245625405, + "grad_norm": 0.45880869030952454, + "learning_rate": 3.266543114697488e-06, + "loss": 0.1111, + "step": 5780 + }, + { + "epoch": 1.873298768632534, + "grad_norm": 0.4175031781196594, + "learning_rate": 3.264902492109007e-06, + "loss": 0.0936, + "step": 5781 + }, + { + "epoch": 1.8736228127025276, + "grad_norm": 0.42836299538612366, + "learning_rate": 3.2632620818796612e-06, + "loss": 0.1009, + "step": 5782 + }, + { + "epoch": 1.873946856772521, + "grad_norm": 0.47268831729888916, + "learning_rate": 3.2616218842102264e-06, + "loss": 0.1138, + "step": 5783 + }, + { + "epoch": 1.8742709008425145, + "grad_norm": 0.41969698667526245, + "learning_rate": 3.2599818993014427e-06, + "loss": 0.1052, + "step": 5784 + }, + { + "epoch": 1.874594944912508, + "grad_norm": 0.504946768283844, + "learning_rate": 3.2583421273540304e-06, + "loss": 0.1163, + "step": 5785 + }, + { + "epoch": 1.8749189889825018, + "grad_norm": 0.42377039790153503, + "learning_rate": 3.256702568568682e-06, + "loss": 0.1013, + "step": 5786 + }, + { + "epoch": 1.8752430330524952, + "grad_norm": 0.46529340744018555, + "learning_rate": 3.2550632231460603e-06, + "loss": 0.1076, + "step": 5787 + }, + { + "epoch": 1.8755670771224886, + "grad_norm": 0.46467939019203186, + "learning_rate": 3.25342409128681e-06, + "loss": 0.1119, + "step": 5788 + }, + { + "epoch": 1.875891121192482, + "grad_norm": 0.4557555615901947, + "learning_rate": 3.2517851731915407e-06, + "loss": 0.1064, + "step": 5789 + }, + { + "epoch": 1.8762151652624757, + "grad_norm": 0.4496322572231293, + "learning_rate": 3.250146469060844e-06, + "loss": 0.1057, + "step": 5790 + }, + { + "epoch": 1.8765392093324693, + "grad_norm": 0.45844563841819763, + "learning_rate": 3.24850797909528e-06, + "loss": 0.1047, + "step": 5791 + }, + { + "epoch": 1.8768632534024627, + "grad_norm": 0.42063984274864197, + "learning_rate": 3.246869703495381e-06, + "loss": 0.1013, + "step": 5792 + }, + { + "epoch": 1.8771872974724562, + "grad_norm": 0.4641461968421936, + "learning_rate": 3.2452316424616614e-06, + "loss": 0.1174, + "step": 5793 + }, + { + "epoch": 1.8775113415424498, + "grad_norm": 0.4542684853076935, + "learning_rate": 3.2435937961945996e-06, + "loss": 0.109, + "step": 5794 + }, + { + "epoch": 1.8778353856124435, + "grad_norm": 0.44021913409233093, + "learning_rate": 3.241956164894654e-06, + "loss": 0.0934, + "step": 5795 + }, + { + "epoch": 1.8781594296824369, + "grad_norm": 0.4576353430747986, + "learning_rate": 3.240318748762255e-06, + "loss": 0.108, + "step": 5796 + }, + { + "epoch": 1.8784834737524303, + "grad_norm": 0.4571791887283325, + "learning_rate": 3.2386815479978074e-06, + "loss": 0.1062, + "step": 5797 + }, + { + "epoch": 1.8788075178224237, + "grad_norm": 0.46839702129364014, + "learning_rate": 3.2370445628016868e-06, + "loss": 0.1057, + "step": 5798 + }, + { + "epoch": 1.8791315618924174, + "grad_norm": 0.44016075134277344, + "learning_rate": 3.2354077933742426e-06, + "loss": 0.1063, + "step": 5799 + }, + { + "epoch": 1.879455605962411, + "grad_norm": 0.4617091119289398, + "learning_rate": 3.233771239915805e-06, + "loss": 0.1066, + "step": 5800 + }, + { + "epoch": 1.8797796500324044, + "grad_norm": 0.4427884519100189, + "learning_rate": 3.2321349026266664e-06, + "loss": 0.0998, + "step": 5801 + }, + { + "epoch": 1.8801036941023979, + "grad_norm": 0.47309133410453796, + "learning_rate": 3.230498781707104e-06, + "loss": 0.1159, + "step": 5802 + }, + { + "epoch": 1.8804277381723913, + "grad_norm": 0.45895934104919434, + "learning_rate": 3.22886287735736e-06, + "loss": 0.1003, + "step": 5803 + }, + { + "epoch": 1.880751782242385, + "grad_norm": 0.45034587383270264, + "learning_rate": 3.227227189777652e-06, + "loss": 0.1086, + "step": 5804 + }, + { + "epoch": 1.8810758263123786, + "grad_norm": 0.4406733512878418, + "learning_rate": 3.225591719168176e-06, + "loss": 0.1075, + "step": 5805 + }, + { + "epoch": 1.881399870382372, + "grad_norm": 0.44266191124916077, + "learning_rate": 3.223956465729096e-06, + "loss": 0.1008, + "step": 5806 + }, + { + "epoch": 1.8817239144523654, + "grad_norm": 0.46553224325180054, + "learning_rate": 3.2223214296605516e-06, + "loss": 0.1112, + "step": 5807 + }, + { + "epoch": 1.882047958522359, + "grad_norm": 0.43898141384124756, + "learning_rate": 3.220686611162653e-06, + "loss": 0.1035, + "step": 5808 + }, + { + "epoch": 1.8823720025923527, + "grad_norm": 0.4390580952167511, + "learning_rate": 3.2190520104354893e-06, + "loss": 0.0977, + "step": 5809 + }, + { + "epoch": 1.8826960466623461, + "grad_norm": 0.4468221366405487, + "learning_rate": 3.2174176276791197e-06, + "loss": 0.0996, + "step": 5810 + }, + { + "epoch": 1.8830200907323396, + "grad_norm": 0.4968999922275543, + "learning_rate": 3.2157834630935735e-06, + "loss": 0.1196, + "step": 5811 + }, + { + "epoch": 1.883344134802333, + "grad_norm": 0.4182228446006775, + "learning_rate": 3.2141495168788605e-06, + "loss": 0.0969, + "step": 5812 + }, + { + "epoch": 1.8836681788723266, + "grad_norm": 0.4635598659515381, + "learning_rate": 3.212515789234957e-06, + "loss": 0.1138, + "step": 5813 + }, + { + "epoch": 1.8839922229423203, + "grad_norm": 0.5124261379241943, + "learning_rate": 3.210882280361818e-06, + "loss": 0.1231, + "step": 5814 + }, + { + "epoch": 1.8843162670123137, + "grad_norm": 0.4198930263519287, + "learning_rate": 3.2092489904593677e-06, + "loss": 0.1012, + "step": 5815 + }, + { + "epoch": 1.8846403110823071, + "grad_norm": 0.4284537136554718, + "learning_rate": 3.2076159197275046e-06, + "loss": 0.098, + "step": 5816 + }, + { + "epoch": 1.8849643551523008, + "grad_norm": 0.469592422246933, + "learning_rate": 3.2059830683661006e-06, + "loss": 0.1099, + "step": 5817 + }, + { + "epoch": 1.8852883992222942, + "grad_norm": 0.4751318693161011, + "learning_rate": 3.2043504365750024e-06, + "loss": 0.119, + "step": 5818 + }, + { + "epoch": 1.8856124432922878, + "grad_norm": 0.43352800607681274, + "learning_rate": 3.2027180245540286e-06, + "loss": 0.0981, + "step": 5819 + }, + { + "epoch": 1.8859364873622813, + "grad_norm": 0.47290316224098206, + "learning_rate": 3.201085832502967e-06, + "loss": 0.1019, + "step": 5820 + }, + { + "epoch": 1.8862605314322747, + "grad_norm": 0.4248226583003998, + "learning_rate": 3.1994538606215875e-06, + "loss": 0.0956, + "step": 5821 + }, + { + "epoch": 1.8865845755022683, + "grad_norm": 0.44013458490371704, + "learning_rate": 3.197822109109624e-06, + "loss": 0.1088, + "step": 5822 + }, + { + "epoch": 1.886908619572262, + "grad_norm": 0.44821250438690186, + "learning_rate": 3.1961905781667858e-06, + "loss": 0.1011, + "step": 5823 + }, + { + "epoch": 1.8872326636422554, + "grad_norm": 0.45587074756622314, + "learning_rate": 3.194559267992761e-06, + "loss": 0.0994, + "step": 5824 + }, + { + "epoch": 1.8875567077122488, + "grad_norm": 0.4564206302165985, + "learning_rate": 3.192928178787203e-06, + "loss": 0.1084, + "step": 5825 + }, + { + "epoch": 1.8878807517822422, + "grad_norm": 0.47661349177360535, + "learning_rate": 3.191297310749742e-06, + "loss": 0.1159, + "step": 5826 + }, + { + "epoch": 1.8882047958522359, + "grad_norm": 0.4797954261302948, + "learning_rate": 3.189666664079981e-06, + "loss": 0.116, + "step": 5827 + }, + { + "epoch": 1.8885288399222295, + "grad_norm": 0.4409724473953247, + "learning_rate": 3.1880362389774944e-06, + "loss": 0.1036, + "step": 5828 + }, + { + "epoch": 1.888852883992223, + "grad_norm": 0.3927188217639923, + "learning_rate": 3.1864060356418325e-06, + "loss": 0.0926, + "step": 5829 + }, + { + "epoch": 1.8891769280622164, + "grad_norm": 0.42764827609062195, + "learning_rate": 3.184776054272512e-06, + "loss": 0.1003, + "step": 5830 + }, + { + "epoch": 1.88950097213221, + "grad_norm": 0.4615615904331207, + "learning_rate": 3.183146295069032e-06, + "loss": 0.113, + "step": 5831 + }, + { + "epoch": 1.8898250162022034, + "grad_norm": 0.46609097719192505, + "learning_rate": 3.181516758230855e-06, + "loss": 0.1112, + "step": 5832 + }, + { + "epoch": 1.890149060272197, + "grad_norm": 0.4488867223262787, + "learning_rate": 3.1798874439574248e-06, + "loss": 0.1038, + "step": 5833 + }, + { + "epoch": 1.8904731043421905, + "grad_norm": 0.4500874876976013, + "learning_rate": 3.1782583524481514e-06, + "loss": 0.0997, + "step": 5834 + }, + { + "epoch": 1.890797148412184, + "grad_norm": 0.4450390934944153, + "learning_rate": 3.176629483902417e-06, + "loss": 0.1015, + "step": 5835 + }, + { + "epoch": 1.8911211924821776, + "grad_norm": 0.43396297097206116, + "learning_rate": 3.1750008385195852e-06, + "loss": 0.1025, + "step": 5836 + }, + { + "epoch": 1.8914452365521712, + "grad_norm": 0.46059921383857727, + "learning_rate": 3.1733724164989815e-06, + "loss": 0.1083, + "step": 5837 + }, + { + "epoch": 1.8917692806221647, + "grad_norm": 0.4415331482887268, + "learning_rate": 3.1717442180399128e-06, + "loss": 0.0962, + "step": 5838 + }, + { + "epoch": 1.892093324692158, + "grad_norm": 0.4565853178501129, + "learning_rate": 3.170116243341651e-06, + "loss": 0.1067, + "step": 5839 + }, + { + "epoch": 1.8924173687621515, + "grad_norm": 0.4561675488948822, + "learning_rate": 3.168488492603447e-06, + "loss": 0.111, + "step": 5840 + }, + { + "epoch": 1.8927414128321451, + "grad_norm": 0.47068914771080017, + "learning_rate": 3.166860966024522e-06, + "loss": 0.0999, + "step": 5841 + }, + { + "epoch": 1.8930654569021388, + "grad_norm": 0.45309674739837646, + "learning_rate": 3.1652336638040664e-06, + "loss": 0.1049, + "step": 5842 + }, + { + "epoch": 1.8933895009721322, + "grad_norm": 0.45928430557250977, + "learning_rate": 3.163606586141251e-06, + "loss": 0.1031, + "step": 5843 + }, + { + "epoch": 1.8937135450421256, + "grad_norm": 0.4930223822593689, + "learning_rate": 3.161979733235209e-06, + "loss": 0.1124, + "step": 5844 + }, + { + "epoch": 1.8940375891121193, + "grad_norm": 0.45104873180389404, + "learning_rate": 3.1603531052850565e-06, + "loss": 0.1077, + "step": 5845 + }, + { + "epoch": 1.894361633182113, + "grad_norm": 0.45227307081222534, + "learning_rate": 3.1587267024898747e-06, + "loss": 0.1077, + "step": 5846 + }, + { + "epoch": 1.8946856772521063, + "grad_norm": 0.4638083875179291, + "learning_rate": 3.157100525048718e-06, + "loss": 0.1122, + "step": 5847 + }, + { + "epoch": 1.8950097213220998, + "grad_norm": 0.45469921827316284, + "learning_rate": 3.1554745731606183e-06, + "loss": 0.0987, + "step": 5848 + }, + { + "epoch": 1.8953337653920932, + "grad_norm": 0.4775233268737793, + "learning_rate": 3.1538488470245733e-06, + "loss": 0.1042, + "step": 5849 + }, + { + "epoch": 1.8956578094620868, + "grad_norm": 0.4605369567871094, + "learning_rate": 3.152223346839558e-06, + "loss": 0.1094, + "step": 5850 + }, + { + "epoch": 1.8959818535320805, + "grad_norm": 0.4589992165565491, + "learning_rate": 3.1505980728045176e-06, + "loss": 0.1087, + "step": 5851 + }, + { + "epoch": 1.896305897602074, + "grad_norm": 0.42988982796669006, + "learning_rate": 3.1489730251183675e-06, + "loss": 0.0984, + "step": 5852 + }, + { + "epoch": 1.8966299416720673, + "grad_norm": 0.4849797189235687, + "learning_rate": 3.147348203980002e-06, + "loss": 0.1115, + "step": 5853 + }, + { + "epoch": 1.8969539857420608, + "grad_norm": 0.45637208223342896, + "learning_rate": 3.1457236095882786e-06, + "loss": 0.111, + "step": 5854 + }, + { + "epoch": 1.8972780298120544, + "grad_norm": 0.45666077733039856, + "learning_rate": 3.144099242142037e-06, + "loss": 0.0968, + "step": 5855 + }, + { + "epoch": 1.897602073882048, + "grad_norm": 0.46886733174324036, + "learning_rate": 3.1424751018400794e-06, + "loss": 0.1042, + "step": 5856 + }, + { + "epoch": 1.8979261179520415, + "grad_norm": 0.4961826205253601, + "learning_rate": 3.1408511888811894e-06, + "loss": 0.1097, + "step": 5857 + }, + { + "epoch": 1.898250162022035, + "grad_norm": 0.4678135812282562, + "learning_rate": 3.1392275034641163e-06, + "loss": 0.1074, + "step": 5858 + }, + { + "epoch": 1.8985742060920285, + "grad_norm": 0.46530935168266296, + "learning_rate": 3.137604045787581e-06, + "loss": 0.1111, + "step": 5859 + }, + { + "epoch": 1.8988982501620222, + "grad_norm": 0.4346042275428772, + "learning_rate": 3.135980816050283e-06, + "loss": 0.1053, + "step": 5860 + }, + { + "epoch": 1.8992222942320156, + "grad_norm": 0.5243309736251831, + "learning_rate": 3.134357814450886e-06, + "loss": 0.1208, + "step": 5861 + }, + { + "epoch": 1.899546338302009, + "grad_norm": 0.48006880283355713, + "learning_rate": 3.132735041188033e-06, + "loss": 0.1138, + "step": 5862 + }, + { + "epoch": 1.8998703823720025, + "grad_norm": 0.46991440653800964, + "learning_rate": 3.1311124964603327e-06, + "loss": 0.1027, + "step": 5863 + }, + { + "epoch": 1.900194426441996, + "grad_norm": 0.5111856460571289, + "learning_rate": 3.129490180466373e-06, + "loss": 0.114, + "step": 5864 + }, + { + "epoch": 1.9005184705119897, + "grad_norm": 0.4719022214412689, + "learning_rate": 3.1278680934047068e-06, + "loss": 0.1115, + "step": 5865 + }, + { + "epoch": 1.9008425145819832, + "grad_norm": 0.44093480706214905, + "learning_rate": 3.126246235473861e-06, + "loss": 0.0983, + "step": 5866 + }, + { + "epoch": 1.9011665586519766, + "grad_norm": 0.46212008595466614, + "learning_rate": 3.124624606872338e-06, + "loss": 0.108, + "step": 5867 + }, + { + "epoch": 1.9014906027219702, + "grad_norm": 0.46517205238342285, + "learning_rate": 3.123003207798607e-06, + "loss": 0.1114, + "step": 5868 + }, + { + "epoch": 1.9018146467919637, + "grad_norm": 0.4682121276855469, + "learning_rate": 3.121382038451113e-06, + "loss": 0.1076, + "step": 5869 + }, + { + "epoch": 1.9021386908619573, + "grad_norm": 0.47169503569602966, + "learning_rate": 3.1197610990282725e-06, + "loss": 0.1095, + "step": 5870 + }, + { + "epoch": 1.9024627349319507, + "grad_norm": 0.4562494456768036, + "learning_rate": 3.1181403897284696e-06, + "loss": 0.1077, + "step": 5871 + }, + { + "epoch": 1.9027867790019442, + "grad_norm": 0.45159628987312317, + "learning_rate": 3.1165199107500665e-06, + "loss": 0.1066, + "step": 5872 + }, + { + "epoch": 1.9031108230719378, + "grad_norm": 0.4589924216270447, + "learning_rate": 3.1148996622913906e-06, + "loss": 0.1151, + "step": 5873 + }, + { + "epoch": 1.9034348671419314, + "grad_norm": 0.44332441687583923, + "learning_rate": 3.113279644550749e-06, + "loss": 0.1004, + "step": 5874 + }, + { + "epoch": 1.9037589112119249, + "grad_norm": 0.44975969195365906, + "learning_rate": 3.1116598577264122e-06, + "loss": 0.1005, + "step": 5875 + }, + { + "epoch": 1.9040829552819183, + "grad_norm": 0.4652109444141388, + "learning_rate": 3.11004030201663e-06, + "loss": 0.0988, + "step": 5876 + }, + { + "epoch": 1.9044069993519117, + "grad_norm": 0.4715868830680847, + "learning_rate": 3.1084209776196185e-06, + "loss": 0.1137, + "step": 5877 + }, + { + "epoch": 1.9047310434219054, + "grad_norm": 0.4277364909648895, + "learning_rate": 3.106801884733566e-06, + "loss": 0.0979, + "step": 5878 + }, + { + "epoch": 1.905055087491899, + "grad_norm": 0.45353928208351135, + "learning_rate": 3.1051830235566365e-06, + "loss": 0.1082, + "step": 5879 + }, + { + "epoch": 1.9053791315618924, + "grad_norm": 0.492728054523468, + "learning_rate": 3.103564394286961e-06, + "loss": 0.1139, + "step": 5880 + }, + { + "epoch": 1.9057031756318858, + "grad_norm": 0.45384249091148376, + "learning_rate": 3.1019459971226463e-06, + "loss": 0.1027, + "step": 5881 + }, + { + "epoch": 1.9060272197018795, + "grad_norm": 0.46857234835624695, + "learning_rate": 3.1003278322617657e-06, + "loss": 0.1133, + "step": 5882 + }, + { + "epoch": 1.9063512637718731, + "grad_norm": 0.48681899905204773, + "learning_rate": 3.0987098999023667e-06, + "loss": 0.108, + "step": 5883 + }, + { + "epoch": 1.9066753078418666, + "grad_norm": 0.4567790627479553, + "learning_rate": 3.097092200242473e-06, + "loss": 0.1162, + "step": 5884 + }, + { + "epoch": 1.90699935191186, + "grad_norm": 0.46391937136650085, + "learning_rate": 3.0954747334800695e-06, + "loss": 0.1116, + "step": 5885 + }, + { + "epoch": 1.9073233959818534, + "grad_norm": 0.43967387080192566, + "learning_rate": 3.093857499813123e-06, + "loss": 0.1021, + "step": 5886 + }, + { + "epoch": 1.907647440051847, + "grad_norm": 0.4478691816329956, + "learning_rate": 3.0922404994395642e-06, + "loss": 0.102, + "step": 5887 + }, + { + "epoch": 1.9079714841218407, + "grad_norm": 0.4580485224723816, + "learning_rate": 3.0906237325573017e-06, + "loss": 0.1056, + "step": 5888 + }, + { + "epoch": 1.9082955281918341, + "grad_norm": 0.489286333322525, + "learning_rate": 3.08900719936421e-06, + "loss": 0.1196, + "step": 5889 + }, + { + "epoch": 1.9086195722618275, + "grad_norm": 0.45502734184265137, + "learning_rate": 3.087390900058137e-06, + "loss": 0.1039, + "step": 5890 + }, + { + "epoch": 1.908943616331821, + "grad_norm": 0.4502509534358978, + "learning_rate": 3.0857748348369017e-06, + "loss": 0.1044, + "step": 5891 + }, + { + "epoch": 1.9092676604018146, + "grad_norm": 0.47804975509643555, + "learning_rate": 3.084159003898295e-06, + "loss": 0.1149, + "step": 5892 + }, + { + "epoch": 1.9095917044718083, + "grad_norm": 0.44914719462394714, + "learning_rate": 3.082543407440081e-06, + "loss": 0.1027, + "step": 5893 + }, + { + "epoch": 1.9099157485418017, + "grad_norm": 0.42086276412010193, + "learning_rate": 3.080928045659992e-06, + "loss": 0.0942, + "step": 5894 + }, + { + "epoch": 1.910239792611795, + "grad_norm": 0.46406880021095276, + "learning_rate": 3.079312918755729e-06, + "loss": 0.104, + "step": 5895 + }, + { + "epoch": 1.9105638366817888, + "grad_norm": 0.4733922779560089, + "learning_rate": 3.077698026924974e-06, + "loss": 0.1115, + "step": 5896 + }, + { + "epoch": 1.9108878807517824, + "grad_norm": 0.4340214133262634, + "learning_rate": 3.076083370365369e-06, + "loss": 0.1028, + "step": 5897 + }, + { + "epoch": 1.9112119248217758, + "grad_norm": 0.4886152446269989, + "learning_rate": 3.074468949274536e-06, + "loss": 0.1166, + "step": 5898 + }, + { + "epoch": 1.9115359688917692, + "grad_norm": 0.4482693672180176, + "learning_rate": 3.0728547638500617e-06, + "loss": 0.1127, + "step": 5899 + }, + { + "epoch": 1.9118600129617627, + "grad_norm": 0.4709647595882416, + "learning_rate": 3.07124081428951e-06, + "loss": 0.112, + "step": 5900 + }, + { + "epoch": 1.9121840570317563, + "grad_norm": 0.4490049183368683, + "learning_rate": 3.06962710079041e-06, + "loss": 0.1075, + "step": 5901 + }, + { + "epoch": 1.91250810110175, + "grad_norm": 0.46207594871520996, + "learning_rate": 3.0680136235502657e-06, + "loss": 0.1114, + "step": 5902 + }, + { + "epoch": 1.9128321451717434, + "grad_norm": 0.42192792892456055, + "learning_rate": 3.0664003827665507e-06, + "loss": 0.0984, + "step": 5903 + }, + { + "epoch": 1.9131561892417368, + "grad_norm": 0.43207815289497375, + "learning_rate": 3.0647873786367083e-06, + "loss": 0.1044, + "step": 5904 + }, + { + "epoch": 1.9134802333117304, + "grad_norm": 0.45980560779571533, + "learning_rate": 3.0631746113581582e-06, + "loss": 0.1097, + "step": 5905 + }, + { + "epoch": 1.9138042773817239, + "grad_norm": 0.41908833384513855, + "learning_rate": 3.0615620811282866e-06, + "loss": 0.096, + "step": 5906 + }, + { + "epoch": 1.9141283214517175, + "grad_norm": 0.46487948298454285, + "learning_rate": 3.0599497881444482e-06, + "loss": 0.108, + "step": 5907 + }, + { + "epoch": 1.914452365521711, + "grad_norm": 0.44004693627357483, + "learning_rate": 3.058337732603977e-06, + "loss": 0.1016, + "step": 5908 + }, + { + "epoch": 1.9147764095917044, + "grad_norm": 0.45447227358818054, + "learning_rate": 3.0567259147041682e-06, + "loss": 0.1036, + "step": 5909 + }, + { + "epoch": 1.915100453661698, + "grad_norm": 0.45923227071762085, + "learning_rate": 3.0551143346422973e-06, + "loss": 0.1015, + "step": 5910 + }, + { + "epoch": 1.9154244977316917, + "grad_norm": 0.5280866026878357, + "learning_rate": 3.0535029926156027e-06, + "loss": 0.1208, + "step": 5911 + }, + { + "epoch": 1.915748541801685, + "grad_norm": 0.42134809494018555, + "learning_rate": 3.0518918888212994e-06, + "loss": 0.0936, + "step": 5912 + }, + { + "epoch": 1.9160725858716785, + "grad_norm": 0.4500383138656616, + "learning_rate": 3.0502810234565687e-06, + "loss": 0.1044, + "step": 5913 + }, + { + "epoch": 1.916396629941672, + "grad_norm": 0.49136853218078613, + "learning_rate": 3.048670396718566e-06, + "loss": 0.1126, + "step": 5914 + }, + { + "epoch": 1.9167206740116656, + "grad_norm": 0.478859543800354, + "learning_rate": 3.0470600088044177e-06, + "loss": 0.1122, + "step": 5915 + }, + { + "epoch": 1.9170447180816592, + "grad_norm": 0.4460492730140686, + "learning_rate": 3.045449859911216e-06, + "loss": 0.1054, + "step": 5916 + }, + { + "epoch": 1.9173687621516526, + "grad_norm": 0.44223952293395996, + "learning_rate": 3.0438399502360323e-06, + "loss": 0.1033, + "step": 5917 + }, + { + "epoch": 1.917692806221646, + "grad_norm": 0.4340362250804901, + "learning_rate": 3.042230279975901e-06, + "loss": 0.0968, + "step": 5918 + }, + { + "epoch": 1.9180168502916397, + "grad_norm": 0.44690605998039246, + "learning_rate": 3.0406208493278287e-06, + "loss": 0.1071, + "step": 5919 + }, + { + "epoch": 1.9183408943616331, + "grad_norm": 0.47108444571495056, + "learning_rate": 3.039011658488799e-06, + "loss": 0.1199, + "step": 5920 + }, + { + "epoch": 1.9186649384316268, + "grad_norm": 0.44304531812667847, + "learning_rate": 3.037402707655756e-06, + "loss": 0.1078, + "step": 5921 + }, + { + "epoch": 1.9189889825016202, + "grad_norm": 0.4661952257156372, + "learning_rate": 3.0357939970256244e-06, + "loss": 0.1055, + "step": 5922 + }, + { + "epoch": 1.9193130265716136, + "grad_norm": 0.5036419034004211, + "learning_rate": 3.0341855267952914e-06, + "loss": 0.1165, + "step": 5923 + }, + { + "epoch": 1.9196370706416073, + "grad_norm": 0.4615981876850128, + "learning_rate": 3.0325772971616203e-06, + "loss": 0.1088, + "step": 5924 + }, + { + "epoch": 1.919961114711601, + "grad_norm": 0.46507641673088074, + "learning_rate": 3.030969308321442e-06, + "loss": 0.1108, + "step": 5925 + }, + { + "epoch": 1.9202851587815943, + "grad_norm": 0.46919676661491394, + "learning_rate": 3.0293615604715564e-06, + "loss": 0.1177, + "step": 5926 + }, + { + "epoch": 1.9206092028515878, + "grad_norm": 0.4771016836166382, + "learning_rate": 3.027754053808741e-06, + "loss": 0.1141, + "step": 5927 + }, + { + "epoch": 1.9209332469215812, + "grad_norm": 0.48941096663475037, + "learning_rate": 3.026146788529734e-06, + "loss": 0.1125, + "step": 5928 + }, + { + "epoch": 1.9212572909915748, + "grad_norm": 0.44794967770576477, + "learning_rate": 3.0245397648312543e-06, + "loss": 0.1, + "step": 5929 + }, + { + "epoch": 1.9215813350615685, + "grad_norm": 0.47503042221069336, + "learning_rate": 3.0229329829099805e-06, + "loss": 0.1061, + "step": 5930 + }, + { + "epoch": 1.921905379131562, + "grad_norm": 0.4251140356063843, + "learning_rate": 3.021326442962573e-06, + "loss": 0.0978, + "step": 5931 + }, + { + "epoch": 1.9222294232015553, + "grad_norm": 0.4348359704017639, + "learning_rate": 3.0197201451856537e-06, + "loss": 0.1039, + "step": 5932 + }, + { + "epoch": 1.922553467271549, + "grad_norm": 0.4612886607646942, + "learning_rate": 3.0181140897758175e-06, + "loss": 0.1131, + "step": 5933 + }, + { + "epoch": 1.9228775113415426, + "grad_norm": 0.43259257078170776, + "learning_rate": 3.0165082769296307e-06, + "loss": 0.1057, + "step": 5934 + }, + { + "epoch": 1.923201555411536, + "grad_norm": 0.44745153188705444, + "learning_rate": 3.0149027068436275e-06, + "loss": 0.1044, + "step": 5935 + }, + { + "epoch": 1.9235255994815295, + "grad_norm": 0.43348148465156555, + "learning_rate": 3.0132973797143176e-06, + "loss": 0.1002, + "step": 5936 + }, + { + "epoch": 1.9238496435515229, + "grad_norm": 0.44396522641181946, + "learning_rate": 3.0116922957381757e-06, + "loss": 0.1069, + "step": 5937 + }, + { + "epoch": 1.9241736876215165, + "grad_norm": 0.41942650079727173, + "learning_rate": 3.0100874551116467e-06, + "loss": 0.0939, + "step": 5938 + }, + { + "epoch": 1.9244977316915102, + "grad_norm": 0.47876089811325073, + "learning_rate": 3.008482858031151e-06, + "loss": 0.1094, + "step": 5939 + }, + { + "epoch": 1.9248217757615036, + "grad_norm": 0.4715111553668976, + "learning_rate": 3.0068785046930728e-06, + "loss": 0.1057, + "step": 5940 + }, + { + "epoch": 1.925145819831497, + "grad_norm": 0.4314781427383423, + "learning_rate": 3.005274395293772e-06, + "loss": 0.091, + "step": 5941 + }, + { + "epoch": 1.9254698639014904, + "grad_norm": 0.46073848009109497, + "learning_rate": 3.0036705300295743e-06, + "loss": 0.1075, + "step": 5942 + }, + { + "epoch": 1.925793907971484, + "grad_norm": 0.43426087498664856, + "learning_rate": 3.002066909096777e-06, + "loss": 0.0993, + "step": 5943 + }, + { + "epoch": 1.9261179520414777, + "grad_norm": 0.48200058937072754, + "learning_rate": 3.00046353269165e-06, + "loss": 0.1192, + "step": 5944 + }, + { + "epoch": 1.9264419961114712, + "grad_norm": 0.45754122734069824, + "learning_rate": 2.9988604010104283e-06, + "loss": 0.1174, + "step": 5945 + }, + { + "epoch": 1.9267660401814646, + "grad_norm": 0.42181843519210815, + "learning_rate": 2.997257514249323e-06, + "loss": 0.098, + "step": 5946 + }, + { + "epoch": 1.9270900842514582, + "grad_norm": 0.4637039601802826, + "learning_rate": 2.9956548726045064e-06, + "loss": 0.1126, + "step": 5947 + }, + { + "epoch": 1.9274141283214519, + "grad_norm": 0.47404715418815613, + "learning_rate": 2.994052476272133e-06, + "loss": 0.1118, + "step": 5948 + }, + { + "epoch": 1.9277381723914453, + "grad_norm": 0.4384154677391052, + "learning_rate": 2.9924503254483166e-06, + "loss": 0.1003, + "step": 5949 + }, + { + "epoch": 1.9280622164614387, + "grad_norm": 0.44925567507743835, + "learning_rate": 2.9908484203291444e-06, + "loss": 0.1052, + "step": 5950 + }, + { + "epoch": 1.9283862605314321, + "grad_norm": 0.4423099458217621, + "learning_rate": 2.9892467611106774e-06, + "loss": 0.0996, + "step": 5951 + }, + { + "epoch": 1.9287103046014258, + "grad_norm": 0.45022711157798767, + "learning_rate": 2.9876453479889388e-06, + "loss": 0.1052, + "step": 5952 + }, + { + "epoch": 1.9290343486714194, + "grad_norm": 0.4542100131511688, + "learning_rate": 2.9860441811599304e-06, + "loss": 0.1076, + "step": 5953 + }, + { + "epoch": 1.9293583927414129, + "grad_norm": 0.44740357995033264, + "learning_rate": 2.984443260819617e-06, + "loss": 0.112, + "step": 5954 + }, + { + "epoch": 1.9296824368114063, + "grad_norm": 0.49884316325187683, + "learning_rate": 2.9828425871639378e-06, + "loss": 0.1171, + "step": 5955 + }, + { + "epoch": 1.9300064808814, + "grad_norm": 0.5064646005630493, + "learning_rate": 2.981242160388797e-06, + "loss": 0.1199, + "step": 5956 + }, + { + "epoch": 1.9303305249513933, + "grad_norm": 0.47748908400535583, + "learning_rate": 2.9796419806900723e-06, + "loss": 0.1107, + "step": 5957 + }, + { + "epoch": 1.930654569021387, + "grad_norm": 0.4773690104484558, + "learning_rate": 2.978042048263612e-06, + "loss": 0.1149, + "step": 5958 + }, + { + "epoch": 1.9309786130913804, + "grad_norm": 0.4412952959537506, + "learning_rate": 2.9764423633052288e-06, + "loss": 0.0995, + "step": 5959 + }, + { + "epoch": 1.9313026571613738, + "grad_norm": 0.44443589448928833, + "learning_rate": 2.9748429260107124e-06, + "loss": 0.1, + "step": 5960 + }, + { + "epoch": 1.9316267012313675, + "grad_norm": 0.47857704758644104, + "learning_rate": 2.9732437365758177e-06, + "loss": 0.1111, + "step": 5961 + }, + { + "epoch": 1.9319507453013611, + "grad_norm": 0.4563223123550415, + "learning_rate": 2.971644795196267e-06, + "loss": 0.1025, + "step": 5962 + }, + { + "epoch": 1.9322747893713546, + "grad_norm": 0.4517981708049774, + "learning_rate": 2.9700461020677585e-06, + "loss": 0.1061, + "step": 5963 + }, + { + "epoch": 1.932598833441348, + "grad_norm": 0.4651930034160614, + "learning_rate": 2.9684476573859554e-06, + "loss": 0.1075, + "step": 5964 + }, + { + "epoch": 1.9329228775113414, + "grad_norm": 0.4484946131706238, + "learning_rate": 2.9668494613464914e-06, + "loss": 0.107, + "step": 5965 + }, + { + "epoch": 1.933246921581335, + "grad_norm": 0.479286789894104, + "learning_rate": 2.9652515141449713e-06, + "loss": 0.1096, + "step": 5966 + }, + { + "epoch": 1.9335709656513287, + "grad_norm": 0.44580888748168945, + "learning_rate": 2.963653815976969e-06, + "loss": 0.0991, + "step": 5967 + }, + { + "epoch": 1.9338950097213221, + "grad_norm": 0.4828713834285736, + "learning_rate": 2.9620563670380265e-06, + "loss": 0.1135, + "step": 5968 + }, + { + "epoch": 1.9342190537913155, + "grad_norm": 0.48791760206222534, + "learning_rate": 2.9604591675236536e-06, + "loss": 0.1092, + "step": 5969 + }, + { + "epoch": 1.9345430978613092, + "grad_norm": 0.4554043412208557, + "learning_rate": 2.958862217629336e-06, + "loss": 0.1073, + "step": 5970 + }, + { + "epoch": 1.9348671419313026, + "grad_norm": 0.4587395489215851, + "learning_rate": 2.9572655175505217e-06, + "loss": 0.0978, + "step": 5971 + }, + { + "epoch": 1.9351911860012962, + "grad_norm": 0.45185279846191406, + "learning_rate": 2.9556690674826348e-06, + "loss": 0.103, + "step": 5972 + }, + { + "epoch": 1.9355152300712897, + "grad_norm": 0.4833644926548004, + "learning_rate": 2.954072867621063e-06, + "loss": 0.1161, + "step": 5973 + }, + { + "epoch": 1.935839274141283, + "grad_norm": 0.44658008217811584, + "learning_rate": 2.9524769181611646e-06, + "loss": 0.1062, + "step": 5974 + }, + { + "epoch": 1.9361633182112767, + "grad_norm": 0.4688814878463745, + "learning_rate": 2.950881219298272e-06, + "loss": 0.1084, + "step": 5975 + }, + { + "epoch": 1.9364873622812704, + "grad_norm": 0.47974029183387756, + "learning_rate": 2.949285771227679e-06, + "loss": 0.1162, + "step": 5976 + }, + { + "epoch": 1.9368114063512638, + "grad_norm": 0.42677879333496094, + "learning_rate": 2.947690574144657e-06, + "loss": 0.1038, + "step": 5977 + }, + { + "epoch": 1.9371354504212572, + "grad_norm": 0.4472271800041199, + "learning_rate": 2.9460956282444387e-06, + "loss": 0.1058, + "step": 5978 + }, + { + "epoch": 1.9374594944912507, + "grad_norm": 0.48130524158477783, + "learning_rate": 2.9445009337222343e-06, + "loss": 0.1174, + "step": 5979 + }, + { + "epoch": 1.9377835385612443, + "grad_norm": 0.4491761326789856, + "learning_rate": 2.942906490773217e-06, + "loss": 0.1086, + "step": 5980 + }, + { + "epoch": 1.938107582631238, + "grad_norm": 0.4414316415786743, + "learning_rate": 2.9413122995925287e-06, + "loss": 0.1031, + "step": 5981 + }, + { + "epoch": 1.9384316267012314, + "grad_norm": 0.46407943964004517, + "learning_rate": 2.939718360375287e-06, + "loss": 0.1066, + "step": 5982 + }, + { + "epoch": 1.9387556707712248, + "grad_norm": 0.47318655252456665, + "learning_rate": 2.9381246733165713e-06, + "loss": 0.117, + "step": 5983 + }, + { + "epoch": 1.9390797148412184, + "grad_norm": 0.4662487804889679, + "learning_rate": 2.9365312386114377e-06, + "loss": 0.109, + "step": 5984 + }, + { + "epoch": 1.939403758911212, + "grad_norm": 0.4455025792121887, + "learning_rate": 2.9349380564549033e-06, + "loss": 0.0995, + "step": 5985 + }, + { + "epoch": 1.9397278029812055, + "grad_norm": 0.49028605222702026, + "learning_rate": 2.933345127041959e-06, + "loss": 0.1138, + "step": 5986 + }, + { + "epoch": 1.940051847051199, + "grad_norm": 0.4964102506637573, + "learning_rate": 2.9317524505675643e-06, + "loss": 0.1068, + "step": 5987 + }, + { + "epoch": 1.9403758911211924, + "grad_norm": 0.49319037795066833, + "learning_rate": 2.9301600272266477e-06, + "loss": 0.1159, + "step": 5988 + }, + { + "epoch": 1.940699935191186, + "grad_norm": 0.4496191740036011, + "learning_rate": 2.9285678572141075e-06, + "loss": 0.1021, + "step": 5989 + }, + { + "epoch": 1.9410239792611796, + "grad_norm": 0.4779997766017914, + "learning_rate": 2.9269759407248053e-06, + "loss": 0.1137, + "step": 5990 + }, + { + "epoch": 1.941348023331173, + "grad_norm": 0.4509337246417999, + "learning_rate": 2.925384277953583e-06, + "loss": 0.1001, + "step": 5991 + }, + { + "epoch": 1.9416720674011665, + "grad_norm": 0.43564677238464355, + "learning_rate": 2.9237928690952405e-06, + "loss": 0.1031, + "step": 5992 + }, + { + "epoch": 1.94199611147116, + "grad_norm": 0.45528966188430786, + "learning_rate": 2.9222017143445495e-06, + "loss": 0.1063, + "step": 5993 + }, + { + "epoch": 1.9423201555411536, + "grad_norm": 0.4465389549732208, + "learning_rate": 2.9206108138962563e-06, + "loss": 0.1057, + "step": 5994 + }, + { + "epoch": 1.9426441996111472, + "grad_norm": 0.4120558798313141, + "learning_rate": 2.9190201679450676e-06, + "loss": 0.0918, + "step": 5995 + }, + { + "epoch": 1.9429682436811406, + "grad_norm": 0.44868025183677673, + "learning_rate": 2.9174297766856675e-06, + "loss": 0.1052, + "step": 5996 + }, + { + "epoch": 1.943292287751134, + "grad_norm": 0.41865020990371704, + "learning_rate": 2.9158396403127e-06, + "loss": 0.0996, + "step": 5997 + }, + { + "epoch": 1.9436163318211277, + "grad_norm": 0.43086937069892883, + "learning_rate": 2.9142497590207842e-06, + "loss": 0.1026, + "step": 5998 + }, + { + "epoch": 1.9439403758911213, + "grad_norm": 0.4507567286491394, + "learning_rate": 2.912660133004507e-06, + "loss": 0.0957, + "step": 5999 + }, + { + "epoch": 1.9442644199611148, + "grad_norm": 0.4489535093307495, + "learning_rate": 2.911070762458421e-06, + "loss": 0.1018, + "step": 6000 + }, + { + "epoch": 1.9445884640311082, + "grad_norm": 0.48330259323120117, + "learning_rate": 2.9094816475770525e-06, + "loss": 0.1149, + "step": 6001 + }, + { + "epoch": 1.9449125081011016, + "grad_norm": 0.39563846588134766, + "learning_rate": 2.9078927885548924e-06, + "loss": 0.0866, + "step": 6002 + }, + { + "epoch": 1.9452365521710953, + "grad_norm": 0.4709007740020752, + "learning_rate": 2.9063041855864033e-06, + "loss": 0.1099, + "step": 6003 + }, + { + "epoch": 1.945560596241089, + "grad_norm": 0.44115516543388367, + "learning_rate": 2.904715838866012e-06, + "loss": 0.1023, + "step": 6004 + }, + { + "epoch": 1.9458846403110823, + "grad_norm": 0.4623752534389496, + "learning_rate": 2.903127748588117e-06, + "loss": 0.1129, + "step": 6005 + }, + { + "epoch": 1.9462086843810757, + "grad_norm": 0.44601622223854065, + "learning_rate": 2.9015399149470873e-06, + "loss": 0.1016, + "step": 6006 + }, + { + "epoch": 1.9465327284510694, + "grad_norm": 0.452635794878006, + "learning_rate": 2.8999523381372573e-06, + "loss": 0.1, + "step": 6007 + }, + { + "epoch": 1.9468567725210628, + "grad_norm": 0.4444234073162079, + "learning_rate": 2.898365018352931e-06, + "loss": 0.1016, + "step": 6008 + }, + { + "epoch": 1.9471808165910565, + "grad_norm": 0.46949273347854614, + "learning_rate": 2.8967779557883807e-06, + "loss": 0.1122, + "step": 6009 + }, + { + "epoch": 1.9475048606610499, + "grad_norm": 0.42208442091941833, + "learning_rate": 2.895191150637848e-06, + "loss": 0.094, + "step": 6010 + }, + { + "epoch": 1.9478289047310433, + "grad_norm": 0.48604580760002136, + "learning_rate": 2.8936046030955445e-06, + "loss": 0.1214, + "step": 6011 + }, + { + "epoch": 1.948152948801037, + "grad_norm": 0.44420698285102844, + "learning_rate": 2.892018313355644e-06, + "loss": 0.1021, + "step": 6012 + }, + { + "epoch": 1.9484769928710306, + "grad_norm": 0.4467221796512604, + "learning_rate": 2.8904322816122955e-06, + "loss": 0.1022, + "step": 6013 + }, + { + "epoch": 1.948801036941024, + "grad_norm": 0.4772205948829651, + "learning_rate": 2.888846508059613e-06, + "loss": 0.1177, + "step": 6014 + }, + { + "epoch": 1.9491250810110174, + "grad_norm": 0.4955427646636963, + "learning_rate": 2.88726099289168e-06, + "loss": 0.1174, + "step": 6015 + }, + { + "epoch": 1.9494491250810109, + "grad_norm": 0.45310765504837036, + "learning_rate": 2.885675736302551e-06, + "loss": 0.1051, + "step": 6016 + }, + { + "epoch": 1.9497731691510045, + "grad_norm": 0.4388439357280731, + "learning_rate": 2.8840907384862394e-06, + "loss": 0.0971, + "step": 6017 + }, + { + "epoch": 1.9500972132209982, + "grad_norm": 0.42228397727012634, + "learning_rate": 2.882505999636742e-06, + "loss": 0.1025, + "step": 6018 + }, + { + "epoch": 1.9504212572909916, + "grad_norm": 0.4494069814682007, + "learning_rate": 2.880921519948008e-06, + "loss": 0.1012, + "step": 6019 + }, + { + "epoch": 1.950745301360985, + "grad_norm": 0.44469448924064636, + "learning_rate": 2.879337299613969e-06, + "loss": 0.0992, + "step": 6020 + }, + { + "epoch": 1.9510693454309787, + "grad_norm": 0.45649954676628113, + "learning_rate": 2.8777533388285106e-06, + "loss": 0.1121, + "step": 6021 + }, + { + "epoch": 1.9513933895009723, + "grad_norm": 0.4270290434360504, + "learning_rate": 2.876169637785503e-06, + "loss": 0.0975, + "step": 6022 + }, + { + "epoch": 1.9517174335709657, + "grad_norm": 0.4835495948791504, + "learning_rate": 2.8745861966787697e-06, + "loss": 0.1131, + "step": 6023 + }, + { + "epoch": 1.9520414776409591, + "grad_norm": 0.4646935760974884, + "learning_rate": 2.8730030157021106e-06, + "loss": 0.1078, + "step": 6024 + }, + { + "epoch": 1.9523655217109526, + "grad_norm": 0.4147730767726898, + "learning_rate": 2.8714200950492925e-06, + "loss": 0.0939, + "step": 6025 + }, + { + "epoch": 1.9526895657809462, + "grad_norm": 0.4704440236091614, + "learning_rate": 2.869837434914045e-06, + "loss": 0.1048, + "step": 6026 + }, + { + "epoch": 1.9530136098509399, + "grad_norm": 0.42922016978263855, + "learning_rate": 2.8682550354900778e-06, + "loss": 0.098, + "step": 6027 + }, + { + "epoch": 1.9533376539209333, + "grad_norm": 0.43618518114089966, + "learning_rate": 2.8666728969710555e-06, + "loss": 0.1032, + "step": 6028 + }, + { + "epoch": 1.9536616979909267, + "grad_norm": 0.4387398958206177, + "learning_rate": 2.865091019550618e-06, + "loss": 0.1027, + "step": 6029 + }, + { + "epoch": 1.9539857420609201, + "grad_norm": 0.4201517403125763, + "learning_rate": 2.863509403422373e-06, + "loss": 0.0977, + "step": 6030 + }, + { + "epoch": 1.9543097861309138, + "grad_norm": 0.47074347734451294, + "learning_rate": 2.8619280487798935e-06, + "loss": 0.1092, + "step": 6031 + }, + { + "epoch": 1.9546338302009074, + "grad_norm": 0.45103365182876587, + "learning_rate": 2.860346955816723e-06, + "loss": 0.1096, + "step": 6032 + }, + { + "epoch": 1.9549578742709008, + "grad_norm": 0.46009188890457153, + "learning_rate": 2.8587661247263714e-06, + "loss": 0.1035, + "step": 6033 + }, + { + "epoch": 1.9552819183408943, + "grad_norm": 0.47794514894485474, + "learning_rate": 2.8571855557023196e-06, + "loss": 0.1082, + "step": 6034 + }, + { + "epoch": 1.955605962410888, + "grad_norm": 0.4807623624801636, + "learning_rate": 2.855605248938009e-06, + "loss": 0.112, + "step": 6035 + }, + { + "epoch": 1.9559300064808816, + "grad_norm": 0.45439618825912476, + "learning_rate": 2.854025204626858e-06, + "loss": 0.1027, + "step": 6036 + }, + { + "epoch": 1.956254050550875, + "grad_norm": 0.47287946939468384, + "learning_rate": 2.8524454229622466e-06, + "loss": 0.1107, + "step": 6037 + }, + { + "epoch": 1.9565780946208684, + "grad_norm": 0.4567338526248932, + "learning_rate": 2.850865904137525e-06, + "loss": 0.1075, + "step": 6038 + }, + { + "epoch": 1.9569021386908618, + "grad_norm": 0.4594586193561554, + "learning_rate": 2.8492866483460124e-06, + "loss": 0.1077, + "step": 6039 + }, + { + "epoch": 1.9572261827608555, + "grad_norm": 0.4383203983306885, + "learning_rate": 2.8477076557809946e-06, + "loss": 0.1063, + "step": 6040 + }, + { + "epoch": 1.9575502268308491, + "grad_norm": 0.43546706438064575, + "learning_rate": 2.8461289266357205e-06, + "loss": 0.0968, + "step": 6041 + }, + { + "epoch": 1.9578742709008425, + "grad_norm": 0.4216805100440979, + "learning_rate": 2.8445504611034185e-06, + "loss": 0.102, + "step": 6042 + }, + { + "epoch": 1.958198314970836, + "grad_norm": 0.45357009768486023, + "learning_rate": 2.84297225937727e-06, + "loss": 0.1057, + "step": 6043 + }, + { + "epoch": 1.9585223590408296, + "grad_norm": 0.4711031913757324, + "learning_rate": 2.841394321650439e-06, + "loss": 0.1173, + "step": 6044 + }, + { + "epoch": 1.958846403110823, + "grad_norm": 0.4412153363227844, + "learning_rate": 2.8398166481160437e-06, + "loss": 0.1015, + "step": 6045 + }, + { + "epoch": 1.9591704471808167, + "grad_norm": 0.489576518535614, + "learning_rate": 2.83823923896718e-06, + "loss": 0.1137, + "step": 6046 + }, + { + "epoch": 1.95949449125081, + "grad_norm": 0.44831836223602295, + "learning_rate": 2.8366620943969063e-06, + "loss": 0.1029, + "step": 6047 + }, + { + "epoch": 1.9598185353208035, + "grad_norm": 0.4388909339904785, + "learning_rate": 2.8350852145982468e-06, + "loss": 0.1024, + "step": 6048 + }, + { + "epoch": 1.9601425793907972, + "grad_norm": 0.47623389959335327, + "learning_rate": 2.833508599764202e-06, + "loss": 0.1065, + "step": 6049 + }, + { + "epoch": 1.9604666234607908, + "grad_norm": 0.4384922385215759, + "learning_rate": 2.831932250087728e-06, + "loss": 0.1003, + "step": 6050 + }, + { + "epoch": 1.9607906675307842, + "grad_norm": 0.44177010655403137, + "learning_rate": 2.830356165761762e-06, + "loss": 0.1014, + "step": 6051 + }, + { + "epoch": 1.9611147116007777, + "grad_norm": 0.4402632415294647, + "learning_rate": 2.8287803469791946e-06, + "loss": 0.0958, + "step": 6052 + }, + { + "epoch": 1.961438755670771, + "grad_norm": 0.477048397064209, + "learning_rate": 2.8272047939328943e-06, + "loss": 0.1034, + "step": 6053 + }, + { + "epoch": 1.9617627997407647, + "grad_norm": 0.4629821181297302, + "learning_rate": 2.8256295068156938e-06, + "loss": 0.1077, + "step": 6054 + }, + { + "epoch": 1.9620868438107584, + "grad_norm": 0.43252432346343994, + "learning_rate": 2.824054485820391e-06, + "loss": 0.1023, + "step": 6055 + }, + { + "epoch": 1.9624108878807518, + "grad_norm": 0.45714935660362244, + "learning_rate": 2.8224797311397544e-06, + "loss": 0.1075, + "step": 6056 + }, + { + "epoch": 1.9627349319507452, + "grad_norm": 0.46026867628097534, + "learning_rate": 2.820905242966519e-06, + "loss": 0.1057, + "step": 6057 + }, + { + "epoch": 1.9630589760207389, + "grad_norm": 0.44352877140045166, + "learning_rate": 2.8193310214933887e-06, + "loss": 0.1031, + "step": 6058 + }, + { + "epoch": 1.9633830200907323, + "grad_norm": 0.4600834846496582, + "learning_rate": 2.817757066913029e-06, + "loss": 0.1107, + "step": 6059 + }, + { + "epoch": 1.963707064160726, + "grad_norm": 0.4361191391944885, + "learning_rate": 2.8161833794180783e-06, + "loss": 0.1051, + "step": 6060 + }, + { + "epoch": 1.9640311082307194, + "grad_norm": 0.44225600361824036, + "learning_rate": 2.814609959201141e-06, + "loss": 0.1049, + "step": 6061 + }, + { + "epoch": 1.9643551523007128, + "grad_norm": 0.4521048069000244, + "learning_rate": 2.8130368064547884e-06, + "loss": 0.1102, + "step": 6062 + }, + { + "epoch": 1.9646791963707064, + "grad_norm": 0.45793232321739197, + "learning_rate": 2.811463921371559e-06, + "loss": 0.1072, + "step": 6063 + }, + { + "epoch": 1.9650032404407, + "grad_norm": 0.46674272418022156, + "learning_rate": 2.809891304143961e-06, + "loss": 0.1106, + "step": 6064 + }, + { + "epoch": 1.9653272845106935, + "grad_norm": 0.43551090359687805, + "learning_rate": 2.808318954964462e-06, + "loss": 0.1027, + "step": 6065 + }, + { + "epoch": 1.965651328580687, + "grad_norm": 0.4202669858932495, + "learning_rate": 2.80674687402551e-06, + "loss": 0.0938, + "step": 6066 + }, + { + "epoch": 1.9659753726506803, + "grad_norm": 0.41163575649261475, + "learning_rate": 2.8051750615195055e-06, + "loss": 0.0948, + "step": 6067 + }, + { + "epoch": 1.966299416720674, + "grad_norm": 0.4615885019302368, + "learning_rate": 2.8036035176388264e-06, + "loss": 0.1025, + "step": 6068 + }, + { + "epoch": 1.9666234607906676, + "grad_norm": 0.42400792241096497, + "learning_rate": 2.802032242575814e-06, + "loss": 0.0936, + "step": 6069 + }, + { + "epoch": 1.966947504860661, + "grad_norm": 0.44435328245162964, + "learning_rate": 2.800461236522777e-06, + "loss": 0.1079, + "step": 6070 + }, + { + "epoch": 1.9672715489306545, + "grad_norm": 0.4586362838745117, + "learning_rate": 2.7988904996719927e-06, + "loss": 0.1123, + "step": 6071 + }, + { + "epoch": 1.9675955930006481, + "grad_norm": 0.4277156591415405, + "learning_rate": 2.7973200322157e-06, + "loss": 0.0987, + "step": 6072 + }, + { + "epoch": 1.9679196370706418, + "grad_norm": 0.46500012278556824, + "learning_rate": 2.7957498343461154e-06, + "loss": 0.1096, + "step": 6073 + }, + { + "epoch": 1.9682436811406352, + "grad_norm": 0.4366726279258728, + "learning_rate": 2.794179906255408e-06, + "loss": 0.1029, + "step": 6074 + }, + { + "epoch": 1.9685677252106286, + "grad_norm": 0.4122413992881775, + "learning_rate": 2.7926102481357303e-06, + "loss": 0.0965, + "step": 6075 + }, + { + "epoch": 1.968891769280622, + "grad_norm": 0.4758448898792267, + "learning_rate": 2.7910408601791873e-06, + "loss": 0.1059, + "step": 6076 + }, + { + "epoch": 1.9692158133506157, + "grad_norm": 0.4578683376312256, + "learning_rate": 2.7894717425778585e-06, + "loss": 0.1038, + "step": 6077 + }, + { + "epoch": 1.9695398574206093, + "grad_norm": 0.43178144097328186, + "learning_rate": 2.7879028955237887e-06, + "loss": 0.1034, + "step": 6078 + }, + { + "epoch": 1.9698639014906028, + "grad_norm": 0.4421790838241577, + "learning_rate": 2.7863343192089893e-06, + "loss": 0.1025, + "step": 6079 + }, + { + "epoch": 1.9701879455605962, + "grad_norm": 0.4443241059780121, + "learning_rate": 2.784766013825442e-06, + "loss": 0.101, + "step": 6080 + }, + { + "epoch": 1.9705119896305896, + "grad_norm": 0.4201869070529938, + "learning_rate": 2.7831979795650848e-06, + "loss": 0.0965, + "step": 6081 + }, + { + "epoch": 1.9708360337005832, + "grad_norm": 0.43988552689552307, + "learning_rate": 2.781630216619839e-06, + "loss": 0.1052, + "step": 6082 + }, + { + "epoch": 1.971160077770577, + "grad_norm": 0.4632330536842346, + "learning_rate": 2.7800627251815772e-06, + "loss": 0.1058, + "step": 6083 + }, + { + "epoch": 1.9714841218405703, + "grad_norm": 0.4753658175468445, + "learning_rate": 2.778495505442147e-06, + "loss": 0.111, + "step": 6084 + }, + { + "epoch": 1.9718081659105637, + "grad_norm": 0.45654088258743286, + "learning_rate": 2.776928557593361e-06, + "loss": 0.1087, + "step": 6085 + }, + { + "epoch": 1.9721322099805574, + "grad_norm": 0.47004789113998413, + "learning_rate": 2.7753618818269988e-06, + "loss": 0.1097, + "step": 6086 + }, + { + "epoch": 1.972456254050551, + "grad_norm": 0.4864579141139984, + "learning_rate": 2.7737954783348066e-06, + "loss": 0.1196, + "step": 6087 + }, + { + "epoch": 1.9727802981205445, + "grad_norm": 0.4297422766685486, + "learning_rate": 2.772229347308496e-06, + "loss": 0.1035, + "step": 6088 + }, + { + "epoch": 1.9731043421905379, + "grad_norm": 0.4059481918811798, + "learning_rate": 2.770663488939749e-06, + "loss": 0.0959, + "step": 6089 + }, + { + "epoch": 1.9734283862605313, + "grad_norm": 0.49687182903289795, + "learning_rate": 2.769097903420207e-06, + "loss": 0.1258, + "step": 6090 + }, + { + "epoch": 1.973752430330525, + "grad_norm": 0.4761006832122803, + "learning_rate": 2.767532590941485e-06, + "loss": 0.1155, + "step": 6091 + }, + { + "epoch": 1.9740764744005186, + "grad_norm": 0.46504855155944824, + "learning_rate": 2.7659675516951616e-06, + "loss": 0.1086, + "step": 6092 + }, + { + "epoch": 1.974400518470512, + "grad_norm": 0.451185017824173, + "learning_rate": 2.7644027858727827e-06, + "loss": 0.1068, + "step": 6093 + }, + { + "epoch": 1.9747245625405054, + "grad_norm": 0.4202289581298828, + "learning_rate": 2.7628382936658614e-06, + "loss": 0.0974, + "step": 6094 + }, + { + "epoch": 1.975048606610499, + "grad_norm": 0.49244967103004456, + "learning_rate": 2.7612740752658775e-06, + "loss": 0.1062, + "step": 6095 + }, + { + "epoch": 1.9753726506804925, + "grad_norm": 0.45967769622802734, + "learning_rate": 2.7597101308642694e-06, + "loss": 0.1043, + "step": 6096 + }, + { + "epoch": 1.9756966947504861, + "grad_norm": 0.4675883948802948, + "learning_rate": 2.758146460652458e-06, + "loss": 0.1126, + "step": 6097 + }, + { + "epoch": 1.9760207388204796, + "grad_norm": 0.4642001986503601, + "learning_rate": 2.756583064821815e-06, + "loss": 0.118, + "step": 6098 + }, + { + "epoch": 1.976344782890473, + "grad_norm": 0.4494239389896393, + "learning_rate": 2.7550199435636864e-06, + "loss": 0.1098, + "step": 6099 + }, + { + "epoch": 1.9766688269604666, + "grad_norm": 0.47027140855789185, + "learning_rate": 2.753457097069384e-06, + "loss": 0.1068, + "step": 6100 + }, + { + "epoch": 1.9769928710304603, + "grad_norm": 0.453249990940094, + "learning_rate": 2.7518945255301852e-06, + "loss": 0.1017, + "step": 6101 + }, + { + "epoch": 1.9773169151004537, + "grad_norm": 0.4894610643386841, + "learning_rate": 2.7503322291373346e-06, + "loss": 0.1118, + "step": 6102 + }, + { + "epoch": 1.9776409591704471, + "grad_norm": 0.4645687937736511, + "learning_rate": 2.7487702080820366e-06, + "loss": 0.1088, + "step": 6103 + }, + { + "epoch": 1.9779650032404406, + "grad_norm": 0.44122567772865295, + "learning_rate": 2.7472084625554763e-06, + "loss": 0.1032, + "step": 6104 + }, + { + "epoch": 1.9782890473104342, + "grad_norm": 0.47128430008888245, + "learning_rate": 2.7456469927487863e-06, + "loss": 0.1003, + "step": 6105 + }, + { + "epoch": 1.9786130913804278, + "grad_norm": 0.44808393716812134, + "learning_rate": 2.7440857988530855e-06, + "loss": 0.102, + "step": 6106 + }, + { + "epoch": 1.9789371354504213, + "grad_norm": 0.4369852542877197, + "learning_rate": 2.7425248810594417e-06, + "loss": 0.098, + "step": 6107 + }, + { + "epoch": 1.9792611795204147, + "grad_norm": 0.4913753569126129, + "learning_rate": 2.7409642395588983e-06, + "loss": 0.1188, + "step": 6108 + }, + { + "epoch": 1.9795852235904083, + "grad_norm": 0.41489270329475403, + "learning_rate": 2.739403874542462e-06, + "loss": 0.0952, + "step": 6109 + }, + { + "epoch": 1.9799092676604018, + "grad_norm": 0.4610608220100403, + "learning_rate": 2.7378437862011086e-06, + "loss": 0.1057, + "step": 6110 + }, + { + "epoch": 1.9802333117303954, + "grad_norm": 0.4758901000022888, + "learning_rate": 2.736283974725778e-06, + "loss": 0.1124, + "step": 6111 + }, + { + "epoch": 1.9805573558003888, + "grad_norm": 0.446199506521225, + "learning_rate": 2.7347244403073704e-06, + "loss": 0.1026, + "step": 6112 + }, + { + "epoch": 1.9808813998703823, + "grad_norm": 0.45175614953041077, + "learning_rate": 2.7331651831367657e-06, + "loss": 0.1018, + "step": 6113 + }, + { + "epoch": 1.981205443940376, + "grad_norm": 0.48578208684921265, + "learning_rate": 2.7316062034047953e-06, + "loss": 0.1174, + "step": 6114 + }, + { + "epoch": 1.9815294880103695, + "grad_norm": 0.4611910879611969, + "learning_rate": 2.7300475013022666e-06, + "loss": 0.1017, + "step": 6115 + }, + { + "epoch": 1.981853532080363, + "grad_norm": 0.467991441488266, + "learning_rate": 2.728489077019949e-06, + "loss": 0.104, + "step": 6116 + }, + { + "epoch": 1.9821775761503564, + "grad_norm": 0.458097368478775, + "learning_rate": 2.726930930748578e-06, + "loss": 0.0999, + "step": 6117 + }, + { + "epoch": 1.9825016202203498, + "grad_norm": 0.4219837784767151, + "learning_rate": 2.725373062678856e-06, + "loss": 0.0932, + "step": 6118 + }, + { + "epoch": 1.9828256642903435, + "grad_norm": 0.46422630548477173, + "learning_rate": 2.7238154730014533e-06, + "loss": 0.1109, + "step": 6119 + }, + { + "epoch": 1.983149708360337, + "grad_norm": 0.45828792452812195, + "learning_rate": 2.7222581619069994e-06, + "loss": 0.1028, + "step": 6120 + }, + { + "epoch": 1.9834737524303305, + "grad_norm": 0.45479437708854675, + "learning_rate": 2.7207011295860962e-06, + "loss": 0.105, + "step": 6121 + }, + { + "epoch": 1.983797796500324, + "grad_norm": 0.4284836947917938, + "learning_rate": 2.7191443762293096e-06, + "loss": 0.0948, + "step": 6122 + }, + { + "epoch": 1.9841218405703176, + "grad_norm": 0.4283240735530853, + "learning_rate": 2.717587902027171e-06, + "loss": 0.0998, + "step": 6123 + }, + { + "epoch": 1.9844458846403112, + "grad_norm": 0.44493427872657776, + "learning_rate": 2.716031707170177e-06, + "loss": 0.1057, + "step": 6124 + }, + { + "epoch": 1.9847699287103047, + "grad_norm": 0.4590212404727936, + "learning_rate": 2.714475791848792e-06, + "loss": 0.11, + "step": 6125 + }, + { + "epoch": 1.985093972780298, + "grad_norm": 0.45337870717048645, + "learning_rate": 2.712920156253447e-06, + "loss": 0.1053, + "step": 6126 + }, + { + "epoch": 1.9854180168502915, + "grad_norm": 0.4287721812725067, + "learning_rate": 2.7113648005745295e-06, + "loss": 0.0961, + "step": 6127 + }, + { + "epoch": 1.9857420609202852, + "grad_norm": 0.4195595383644104, + "learning_rate": 2.7098097250024093e-06, + "loss": 0.1022, + "step": 6128 + }, + { + "epoch": 1.9860661049902788, + "grad_norm": 0.5380899906158447, + "learning_rate": 2.708254929727406e-06, + "loss": 0.1222, + "step": 6129 + }, + { + "epoch": 1.9863901490602722, + "grad_norm": 0.42683035135269165, + "learning_rate": 2.706700414939813e-06, + "loss": 0.0998, + "step": 6130 + }, + { + "epoch": 1.9867141931302656, + "grad_norm": 0.43054234981536865, + "learning_rate": 2.7051461808298885e-06, + "loss": 0.0953, + "step": 6131 + }, + { + "epoch": 1.987038237200259, + "grad_norm": 0.4568624198436737, + "learning_rate": 2.703592227587856e-06, + "loss": 0.1085, + "step": 6132 + }, + { + "epoch": 1.9873622812702527, + "grad_norm": 0.4255097806453705, + "learning_rate": 2.7020385554039055e-06, + "loss": 0.0993, + "step": 6133 + }, + { + "epoch": 1.9876863253402464, + "grad_norm": 0.4464300274848938, + "learning_rate": 2.700485164468185e-06, + "loss": 0.096, + "step": 6134 + }, + { + "epoch": 1.9880103694102398, + "grad_norm": 0.4486854672431946, + "learning_rate": 2.6989320549708244e-06, + "loss": 0.1043, + "step": 6135 + }, + { + "epoch": 1.9883344134802332, + "grad_norm": 0.4583606421947479, + "learning_rate": 2.6973792271019005e-06, + "loss": 0.1089, + "step": 6136 + }, + { + "epoch": 1.9886584575502269, + "grad_norm": 0.45357078313827515, + "learning_rate": 2.695826681051471e-06, + "loss": 0.1059, + "step": 6137 + }, + { + "epoch": 1.9889825016202205, + "grad_norm": 0.41597336530685425, + "learning_rate": 2.6942744170095486e-06, + "loss": 0.0982, + "step": 6138 + }, + { + "epoch": 1.989306545690214, + "grad_norm": 0.42208510637283325, + "learning_rate": 2.6927224351661157e-06, + "loss": 0.1004, + "step": 6139 + }, + { + "epoch": 1.9896305897602073, + "grad_norm": 0.45657432079315186, + "learning_rate": 2.691170735711121e-06, + "loss": 0.1065, + "step": 6140 + }, + { + "epoch": 1.9899546338302008, + "grad_norm": 0.43733304738998413, + "learning_rate": 2.6896193188344766e-06, + "loss": 0.0963, + "step": 6141 + }, + { + "epoch": 1.9902786779001944, + "grad_norm": 0.41876494884490967, + "learning_rate": 2.688068184726064e-06, + "loss": 0.0967, + "step": 6142 + }, + { + "epoch": 1.990602721970188, + "grad_norm": 0.47125309705734253, + "learning_rate": 2.686517333575722e-06, + "loss": 0.1037, + "step": 6143 + }, + { + "epoch": 1.9909267660401815, + "grad_norm": 0.41714105010032654, + "learning_rate": 2.6849667655732623e-06, + "loss": 0.0979, + "step": 6144 + }, + { + "epoch": 1.991250810110175, + "grad_norm": 0.4522852599620819, + "learning_rate": 2.683416480908459e-06, + "loss": 0.1059, + "step": 6145 + }, + { + "epoch": 1.9915748541801686, + "grad_norm": 0.4258798658847809, + "learning_rate": 2.6818664797710526e-06, + "loss": 0.1001, + "step": 6146 + }, + { + "epoch": 1.991898898250162, + "grad_norm": 0.44886261224746704, + "learning_rate": 2.680316762350747e-06, + "loss": 0.1011, + "step": 6147 + }, + { + "epoch": 1.9922229423201556, + "grad_norm": 0.4444274604320526, + "learning_rate": 2.678767328837214e-06, + "loss": 0.1019, + "step": 6148 + }, + { + "epoch": 1.992546986390149, + "grad_norm": 0.45031675696372986, + "learning_rate": 2.6772181794200885e-06, + "loss": 0.094, + "step": 6149 + }, + { + "epoch": 1.9928710304601425, + "grad_norm": 0.4273860454559326, + "learning_rate": 2.6756693142889733e-06, + "loss": 0.0907, + "step": 6150 + }, + { + "epoch": 1.9931950745301361, + "grad_norm": 0.41163426637649536, + "learning_rate": 2.6741207336334312e-06, + "loss": 0.0935, + "step": 6151 + }, + { + "epoch": 1.9935191186001298, + "grad_norm": 0.4350992441177368, + "learning_rate": 2.6725724376429953e-06, + "loss": 0.0981, + "step": 6152 + }, + { + "epoch": 1.9938431626701232, + "grad_norm": 0.4312846064567566, + "learning_rate": 2.671024426507161e-06, + "loss": 0.1033, + "step": 6153 + }, + { + "epoch": 1.9941672067401166, + "grad_norm": 0.4249867796897888, + "learning_rate": 2.669476700415391e-06, + "loss": 0.0983, + "step": 6154 + }, + { + "epoch": 1.99449125081011, + "grad_norm": 0.4431942403316498, + "learning_rate": 2.667929259557112e-06, + "loss": 0.1071, + "step": 6155 + }, + { + "epoch": 1.9948152948801037, + "grad_norm": 0.47021448612213135, + "learning_rate": 2.666382104121715e-06, + "loss": 0.1103, + "step": 6156 + }, + { + "epoch": 1.9951393389500973, + "grad_norm": 0.4780392348766327, + "learning_rate": 2.6648352342985596e-06, + "loss": 0.112, + "step": 6157 + }, + { + "epoch": 1.9954633830200907, + "grad_norm": 0.4610086977481842, + "learning_rate": 2.6632886502769617e-06, + "loss": 0.1091, + "step": 6158 + }, + { + "epoch": 1.9957874270900842, + "grad_norm": 0.4752705991268158, + "learning_rate": 2.661742352246215e-06, + "loss": 0.1137, + "step": 6159 + }, + { + "epoch": 1.9961114711600778, + "grad_norm": 0.4885751008987427, + "learning_rate": 2.6601963403955667e-06, + "loss": 0.1137, + "step": 6160 + }, + { + "epoch": 1.9964355152300715, + "grad_norm": 0.4374214708805084, + "learning_rate": 2.6586506149142355e-06, + "loss": 0.1007, + "step": 6161 + }, + { + "epoch": 1.9967595593000649, + "grad_norm": 0.45261216163635254, + "learning_rate": 2.6571051759914023e-06, + "loss": 0.1001, + "step": 6162 + }, + { + "epoch": 1.9970836033700583, + "grad_norm": 0.4949720799922943, + "learning_rate": 2.6555600238162153e-06, + "loss": 0.1148, + "step": 6163 + }, + { + "epoch": 1.9974076474400517, + "grad_norm": 0.46937358379364014, + "learning_rate": 2.6540151585777875e-06, + "loss": 0.109, + "step": 6164 + }, + { + "epoch": 1.9977316915100454, + "grad_norm": 0.4581141471862793, + "learning_rate": 2.652470580465189e-06, + "loss": 0.1087, + "step": 6165 + }, + { + "epoch": 1.998055735580039, + "grad_norm": 0.48938125371932983, + "learning_rate": 2.65092628966747e-06, + "loss": 0.1101, + "step": 6166 + }, + { + "epoch": 1.9983797796500324, + "grad_norm": 0.4437830448150635, + "learning_rate": 2.649382286373628e-06, + "loss": 0.1016, + "step": 6167 + }, + { + "epoch": 1.9987038237200259, + "grad_norm": 0.45508936047554016, + "learning_rate": 2.647838570772642e-06, + "loss": 0.0992, + "step": 6168 + }, + { + "epoch": 1.9990278677900193, + "grad_norm": 0.46347787976264954, + "learning_rate": 2.6462951430534434e-06, + "loss": 0.1094, + "step": 6169 + }, + { + "epoch": 1.999351911860013, + "grad_norm": 0.43078288435935974, + "learning_rate": 2.6447520034049323e-06, + "loss": 0.1063, + "step": 6170 + }, + { + "epoch": 1.9996759559300066, + "grad_norm": 0.43163982033729553, + "learning_rate": 2.6432091520159764e-06, + "loss": 0.0985, + "step": 6171 + }, + { + "epoch": 2.0, + "grad_norm": 0.4776883125305176, + "learning_rate": 2.6416665890754044e-06, + "loss": 0.1241, + "step": 6172 + }, + { + "epoch": 2.0003240440699934, + "grad_norm": 0.39515194296836853, + "learning_rate": 2.6401243147720136e-06, + "loss": 0.0775, + "step": 6173 + }, + { + "epoch": 2.000648088139987, + "grad_norm": 0.3835315704345703, + "learning_rate": 2.6385823292945593e-06, + "loss": 0.0777, + "step": 6174 + }, + { + "epoch": 2.0009721322099807, + "grad_norm": 0.3732658922672272, + "learning_rate": 2.6370406328317676e-06, + "loss": 0.0741, + "step": 6175 + }, + { + "epoch": 2.001296176279974, + "grad_norm": 0.36997994780540466, + "learning_rate": 2.635499225572327e-06, + "loss": 0.0743, + "step": 6176 + }, + { + "epoch": 2.0016202203499676, + "grad_norm": 0.3874099552631378, + "learning_rate": 2.6339581077048914e-06, + "loss": 0.0782, + "step": 6177 + }, + { + "epoch": 2.001944264419961, + "grad_norm": 0.3987525999546051, + "learning_rate": 2.632417279418078e-06, + "loss": 0.0803, + "step": 6178 + }, + { + "epoch": 2.002268308489955, + "grad_norm": 0.36920928955078125, + "learning_rate": 2.6308767409004697e-06, + "loss": 0.0759, + "step": 6179 + }, + { + "epoch": 2.0025923525599483, + "grad_norm": 0.38145503401756287, + "learning_rate": 2.6293364923406138e-06, + "loss": 0.0764, + "step": 6180 + }, + { + "epoch": 2.0029163966299417, + "grad_norm": 0.37188780307769775, + "learning_rate": 2.6277965339270234e-06, + "loss": 0.0701, + "step": 6181 + }, + { + "epoch": 2.003240440699935, + "grad_norm": 0.3832031488418579, + "learning_rate": 2.626256865848168e-06, + "loss": 0.0705, + "step": 6182 + }, + { + "epoch": 2.0035644847699285, + "grad_norm": 0.3867064118385315, + "learning_rate": 2.6247174882924974e-06, + "loss": 0.0735, + "step": 6183 + }, + { + "epoch": 2.0038885288399224, + "grad_norm": 0.4124574065208435, + "learning_rate": 2.623178401448409e-06, + "loss": 0.0758, + "step": 6184 + }, + { + "epoch": 2.004212572909916, + "grad_norm": 0.41653987765312195, + "learning_rate": 2.6216396055042747e-06, + "loss": 0.0709, + "step": 6185 + }, + { + "epoch": 2.0045366169799093, + "grad_norm": 0.38831084966659546, + "learning_rate": 2.620101100648431e-06, + "loss": 0.0668, + "step": 6186 + }, + { + "epoch": 2.0048606610499027, + "grad_norm": 0.41165921092033386, + "learning_rate": 2.618562887069169e-06, + "loss": 0.0748, + "step": 6187 + }, + { + "epoch": 2.005184705119896, + "grad_norm": 0.3913106322288513, + "learning_rate": 2.6170249649547595e-06, + "loss": 0.0678, + "step": 6188 + }, + { + "epoch": 2.00550874918989, + "grad_norm": 0.4491140842437744, + "learning_rate": 2.615487334493422e-06, + "loss": 0.0778, + "step": 6189 + }, + { + "epoch": 2.0058327932598834, + "grad_norm": 0.4365970194339752, + "learning_rate": 2.613949995873354e-06, + "loss": 0.0789, + "step": 6190 + }, + { + "epoch": 2.006156837329877, + "grad_norm": 0.4461490511894226, + "learning_rate": 2.6124129492827045e-06, + "loss": 0.0764, + "step": 6191 + }, + { + "epoch": 2.0064808813998702, + "grad_norm": 0.4379573166370392, + "learning_rate": 2.6108761949095996e-06, + "loss": 0.0725, + "step": 6192 + }, + { + "epoch": 2.006804925469864, + "grad_norm": 0.46649301052093506, + "learning_rate": 2.609339732942119e-06, + "loss": 0.0774, + "step": 6193 + }, + { + "epoch": 2.0071289695398575, + "grad_norm": 0.42365261912345886, + "learning_rate": 2.6078035635683106e-06, + "loss": 0.068, + "step": 6194 + }, + { + "epoch": 2.007453013609851, + "grad_norm": 0.4476017355918884, + "learning_rate": 2.6062676869761905e-06, + "loss": 0.0751, + "step": 6195 + }, + { + "epoch": 2.0077770576798444, + "grad_norm": 0.48353251814842224, + "learning_rate": 2.6047321033537276e-06, + "loss": 0.0825, + "step": 6196 + }, + { + "epoch": 2.008101101749838, + "grad_norm": 0.46962183713912964, + "learning_rate": 2.603196812888872e-06, + "loss": 0.0828, + "step": 6197 + }, + { + "epoch": 2.0084251458198317, + "grad_norm": 0.46140119433403015, + "learning_rate": 2.601661815769521e-06, + "loss": 0.0777, + "step": 6198 + }, + { + "epoch": 2.008749189889825, + "grad_norm": 0.47431838512420654, + "learning_rate": 2.600127112183547e-06, + "loss": 0.0718, + "step": 6199 + }, + { + "epoch": 2.0090732339598185, + "grad_norm": 0.4523472487926483, + "learning_rate": 2.598592702318781e-06, + "loss": 0.0751, + "step": 6200 + }, + { + "epoch": 2.009397278029812, + "grad_norm": 0.4582405686378479, + "learning_rate": 2.597058586363021e-06, + "loss": 0.0745, + "step": 6201 + }, + { + "epoch": 2.0097213220998054, + "grad_norm": 0.4842430651187897, + "learning_rate": 2.5955247645040282e-06, + "loss": 0.0776, + "step": 6202 + }, + { + "epoch": 2.0100453661697992, + "grad_norm": 0.5084421634674072, + "learning_rate": 2.593991236929526e-06, + "loss": 0.0839, + "step": 6203 + }, + { + "epoch": 2.0103694102397927, + "grad_norm": 0.4512103199958801, + "learning_rate": 2.5924580038272056e-06, + "loss": 0.0704, + "step": 6204 + }, + { + "epoch": 2.010693454309786, + "grad_norm": 0.41768187284469604, + "learning_rate": 2.5909250653847205e-06, + "loss": 0.0707, + "step": 6205 + }, + { + "epoch": 2.0110174983797795, + "grad_norm": 0.43947720527648926, + "learning_rate": 2.589392421789684e-06, + "loss": 0.0699, + "step": 6206 + }, + { + "epoch": 2.0113415424497734, + "grad_norm": 0.453520804643631, + "learning_rate": 2.5878600732296778e-06, + "loss": 0.0733, + "step": 6207 + }, + { + "epoch": 2.011665586519767, + "grad_norm": 0.4499374032020569, + "learning_rate": 2.5863280198922474e-06, + "loss": 0.0722, + "step": 6208 + }, + { + "epoch": 2.01198963058976, + "grad_norm": 0.4869464039802551, + "learning_rate": 2.5847962619649015e-06, + "loss": 0.0813, + "step": 6209 + }, + { + "epoch": 2.0123136746597536, + "grad_norm": 0.4863876700401306, + "learning_rate": 2.583264799635114e-06, + "loss": 0.0797, + "step": 6210 + }, + { + "epoch": 2.012637718729747, + "grad_norm": 0.43555524945259094, + "learning_rate": 2.5817336330903154e-06, + "loss": 0.0701, + "step": 6211 + }, + { + "epoch": 2.012961762799741, + "grad_norm": 0.4633532762527466, + "learning_rate": 2.580202762517914e-06, + "loss": 0.0726, + "step": 6212 + }, + { + "epoch": 2.0132858068697344, + "grad_norm": 0.4807261526584625, + "learning_rate": 2.578672188105264e-06, + "loss": 0.0772, + "step": 6213 + }, + { + "epoch": 2.0136098509397278, + "grad_norm": 0.5036196112632751, + "learning_rate": 2.577141910039702e-06, + "loss": 0.0775, + "step": 6214 + }, + { + "epoch": 2.013933895009721, + "grad_norm": 0.47016188502311707, + "learning_rate": 2.5756119285085133e-06, + "loss": 0.0789, + "step": 6215 + }, + { + "epoch": 2.0142579390797146, + "grad_norm": 0.43279916048049927, + "learning_rate": 2.5740822436989556e-06, + "loss": 0.0735, + "step": 6216 + }, + { + "epoch": 2.0145819831497085, + "grad_norm": 0.41868194937705994, + "learning_rate": 2.572552855798247e-06, + "loss": 0.0695, + "step": 6217 + }, + { + "epoch": 2.014906027219702, + "grad_norm": 0.4771308898925781, + "learning_rate": 2.5710237649935665e-06, + "loss": 0.0761, + "step": 6218 + }, + { + "epoch": 2.0152300712896953, + "grad_norm": 0.4490995407104492, + "learning_rate": 2.5694949714720665e-06, + "loss": 0.0737, + "step": 6219 + }, + { + "epoch": 2.0155541153596888, + "grad_norm": 0.48034578561782837, + "learning_rate": 2.567966475420849e-06, + "loss": 0.0759, + "step": 6220 + }, + { + "epoch": 2.0158781594296826, + "grad_norm": 0.49473416805267334, + "learning_rate": 2.5664382770269945e-06, + "loss": 0.0853, + "step": 6221 + }, + { + "epoch": 2.016202203499676, + "grad_norm": 0.47211727499961853, + "learning_rate": 2.5649103764775328e-06, + "loss": 0.0663, + "step": 6222 + }, + { + "epoch": 2.0165262475696695, + "grad_norm": 0.4685417413711548, + "learning_rate": 2.5633827739594706e-06, + "loss": 0.0789, + "step": 6223 + }, + { + "epoch": 2.016850291639663, + "grad_norm": 0.44643479585647583, + "learning_rate": 2.5618554696597676e-06, + "loss": 0.0772, + "step": 6224 + }, + { + "epoch": 2.0171743357096563, + "grad_norm": 0.42165714502334595, + "learning_rate": 2.5603284637653517e-06, + "loss": 0.0696, + "step": 6225 + }, + { + "epoch": 2.01749837977965, + "grad_norm": 0.4879651665687561, + "learning_rate": 2.558801756463114e-06, + "loss": 0.0845, + "step": 6226 + }, + { + "epoch": 2.0178224238496436, + "grad_norm": 0.4335845112800598, + "learning_rate": 2.5572753479399094e-06, + "loss": 0.0725, + "step": 6227 + }, + { + "epoch": 2.018146467919637, + "grad_norm": 0.4804597795009613, + "learning_rate": 2.5557492383825557e-06, + "loss": 0.0852, + "step": 6228 + }, + { + "epoch": 2.0184705119896305, + "grad_norm": 0.41719064116477966, + "learning_rate": 2.554223427977831e-06, + "loss": 0.0672, + "step": 6229 + }, + { + "epoch": 2.0187945560596243, + "grad_norm": 0.4422290027141571, + "learning_rate": 2.552697916912482e-06, + "loss": 0.0738, + "step": 6230 + }, + { + "epoch": 2.0191186001296177, + "grad_norm": 0.4339218735694885, + "learning_rate": 2.5511727053732173e-06, + "loss": 0.0734, + "step": 6231 + }, + { + "epoch": 2.019442644199611, + "grad_norm": 0.4676489233970642, + "learning_rate": 2.5496477935467057e-06, + "loss": 0.0764, + "step": 6232 + }, + { + "epoch": 2.0197666882696046, + "grad_norm": 0.45664387941360474, + "learning_rate": 2.548123181619583e-06, + "loss": 0.0817, + "step": 6233 + }, + { + "epoch": 2.020090732339598, + "grad_norm": 0.47742414474487305, + "learning_rate": 2.5465988697784473e-06, + "loss": 0.0734, + "step": 6234 + }, + { + "epoch": 2.020414776409592, + "grad_norm": 0.464341938495636, + "learning_rate": 2.5450748582098592e-06, + "loss": 0.0734, + "step": 6235 + }, + { + "epoch": 2.0207388204795853, + "grad_norm": 0.4464026987552643, + "learning_rate": 2.543551147100345e-06, + "loss": 0.0749, + "step": 6236 + }, + { + "epoch": 2.0210628645495787, + "grad_norm": 0.4652882218360901, + "learning_rate": 2.5420277366363875e-06, + "loss": 0.0795, + "step": 6237 + }, + { + "epoch": 2.021386908619572, + "grad_norm": 0.4414404332637787, + "learning_rate": 2.540504627004441e-06, + "loss": 0.0731, + "step": 6238 + }, + { + "epoch": 2.0217109526895656, + "grad_norm": 0.41924309730529785, + "learning_rate": 2.5389818183909176e-06, + "loss": 0.0667, + "step": 6239 + }, + { + "epoch": 2.0220349967595594, + "grad_norm": 0.4610547125339508, + "learning_rate": 2.5374593109821955e-06, + "loss": 0.0796, + "step": 6240 + }, + { + "epoch": 2.022359040829553, + "grad_norm": 0.45951277017593384, + "learning_rate": 2.5359371049646164e-06, + "loss": 0.0722, + "step": 6241 + }, + { + "epoch": 2.0226830848995463, + "grad_norm": 0.46303337812423706, + "learning_rate": 2.534415200524477e-06, + "loss": 0.0734, + "step": 6242 + }, + { + "epoch": 2.0230071289695397, + "grad_norm": 0.4553036689758301, + "learning_rate": 2.532893597848053e-06, + "loss": 0.0741, + "step": 6243 + }, + { + "epoch": 2.0233311730395336, + "grad_norm": 0.4600156247615814, + "learning_rate": 2.5313722971215655e-06, + "loss": 0.0743, + "step": 6244 + }, + { + "epoch": 2.023655217109527, + "grad_norm": 0.4609796404838562, + "learning_rate": 2.529851298531214e-06, + "loss": 0.074, + "step": 6245 + }, + { + "epoch": 2.0239792611795204, + "grad_norm": 0.4632244408130646, + "learning_rate": 2.5283306022631493e-06, + "loss": 0.0763, + "step": 6246 + }, + { + "epoch": 2.024303305249514, + "grad_norm": 0.4586898684501648, + "learning_rate": 2.5268102085034906e-06, + "loss": 0.0738, + "step": 6247 + }, + { + "epoch": 2.0246273493195073, + "grad_norm": 0.4696769118309021, + "learning_rate": 2.5252901174383203e-06, + "loss": 0.0742, + "step": 6248 + }, + { + "epoch": 2.024951393389501, + "grad_norm": 0.7212674617767334, + "learning_rate": 2.523770329253683e-06, + "loss": 0.0723, + "step": 6249 + }, + { + "epoch": 2.0252754374594946, + "grad_norm": 0.4639633893966675, + "learning_rate": 2.5222508441355875e-06, + "loss": 0.0738, + "step": 6250 + }, + { + "epoch": 2.025599481529488, + "grad_norm": 0.41529393196105957, + "learning_rate": 2.520731662269998e-06, + "loss": 0.0631, + "step": 6251 + }, + { + "epoch": 2.0259235255994814, + "grad_norm": 0.44837164878845215, + "learning_rate": 2.519212783842856e-06, + "loss": 0.0691, + "step": 6252 + }, + { + "epoch": 2.026247569669475, + "grad_norm": 0.46720263361930847, + "learning_rate": 2.5176942090400512e-06, + "loss": 0.0754, + "step": 6253 + }, + { + "epoch": 2.0265716137394687, + "grad_norm": 0.48482298851013184, + "learning_rate": 2.5161759380474448e-06, + "loss": 0.0709, + "step": 6254 + }, + { + "epoch": 2.026895657809462, + "grad_norm": 0.46289435029029846, + "learning_rate": 2.514657971050858e-06, + "loss": 0.0767, + "step": 6255 + }, + { + "epoch": 2.0272197018794555, + "grad_norm": 0.4710228741168976, + "learning_rate": 2.513140308236076e-06, + "loss": 0.0734, + "step": 6256 + }, + { + "epoch": 2.027543745949449, + "grad_norm": 0.48888158798217773, + "learning_rate": 2.511622949788845e-06, + "loss": 0.077, + "step": 6257 + }, + { + "epoch": 2.027867790019443, + "grad_norm": 0.4653327763080597, + "learning_rate": 2.5101058958948766e-06, + "loss": 0.0776, + "step": 6258 + }, + { + "epoch": 2.0281918340894363, + "grad_norm": 0.4562969207763672, + "learning_rate": 2.5085891467398433e-06, + "loss": 0.0721, + "step": 6259 + }, + { + "epoch": 2.0285158781594297, + "grad_norm": 0.4664032459259033, + "learning_rate": 2.5070727025093785e-06, + "loss": 0.0683, + "step": 6260 + }, + { + "epoch": 2.028839922229423, + "grad_norm": 0.49336710572242737, + "learning_rate": 2.505556563389081e-06, + "loss": 0.0775, + "step": 6261 + }, + { + "epoch": 2.0291639662994165, + "grad_norm": 0.4181060791015625, + "learning_rate": 2.5040407295645126e-06, + "loss": 0.0662, + "step": 6262 + }, + { + "epoch": 2.0294880103694104, + "grad_norm": 0.4688979685306549, + "learning_rate": 2.5025252012211955e-06, + "loss": 0.0757, + "step": 6263 + }, + { + "epoch": 2.029812054439404, + "grad_norm": 0.5041515827178955, + "learning_rate": 2.5010099785446166e-06, + "loss": 0.0806, + "step": 6264 + }, + { + "epoch": 2.0301360985093972, + "grad_norm": 0.4415440559387207, + "learning_rate": 2.4994950617202268e-06, + "loss": 0.0715, + "step": 6265 + }, + { + "epoch": 2.0304601425793907, + "grad_norm": 0.4888792037963867, + "learning_rate": 2.49798045093343e-06, + "loss": 0.0786, + "step": 6266 + }, + { + "epoch": 2.0307841866493845, + "grad_norm": 0.43033257126808167, + "learning_rate": 2.4964661463696087e-06, + "loss": 0.0708, + "step": 6267 + }, + { + "epoch": 2.031108230719378, + "grad_norm": 0.4612560272216797, + "learning_rate": 2.494952148214093e-06, + "loss": 0.0763, + "step": 6268 + }, + { + "epoch": 2.0314322747893714, + "grad_norm": 0.4671456515789032, + "learning_rate": 2.493438456652184e-06, + "loss": 0.0794, + "step": 6269 + }, + { + "epoch": 2.031756318859365, + "grad_norm": 0.5146381855010986, + "learning_rate": 2.4919250718691424e-06, + "loss": 0.0784, + "step": 6270 + }, + { + "epoch": 2.0320803629293582, + "grad_norm": 0.44912979006767273, + "learning_rate": 2.4904119940501914e-06, + "loss": 0.0678, + "step": 6271 + }, + { + "epoch": 2.032404406999352, + "grad_norm": 0.506953775882721, + "learning_rate": 2.4888992233805205e-06, + "loss": 0.0795, + "step": 6272 + }, + { + "epoch": 2.0327284510693455, + "grad_norm": 0.5138766169548035, + "learning_rate": 2.4873867600452705e-06, + "loss": 0.0826, + "step": 6273 + }, + { + "epoch": 2.033052495139339, + "grad_norm": 0.4463346600532532, + "learning_rate": 2.4858746042295616e-06, + "loss": 0.0708, + "step": 6274 + }, + { + "epoch": 2.0333765392093324, + "grad_norm": 0.5110496878623962, + "learning_rate": 2.4843627561184587e-06, + "loss": 0.0803, + "step": 6275 + }, + { + "epoch": 2.033700583279326, + "grad_norm": 0.4695765972137451, + "learning_rate": 2.4828512158970042e-06, + "loss": 0.0727, + "step": 6276 + }, + { + "epoch": 2.0340246273493197, + "grad_norm": 0.47844061255455017, + "learning_rate": 2.481339983750192e-06, + "loss": 0.0775, + "step": 6277 + }, + { + "epoch": 2.034348671419313, + "grad_norm": 0.4744821786880493, + "learning_rate": 2.4798290598629826e-06, + "loss": 0.0765, + "step": 6278 + }, + { + "epoch": 2.0346727154893065, + "grad_norm": 0.45612332224845886, + "learning_rate": 2.4783184444202995e-06, + "loss": 0.0782, + "step": 6279 + }, + { + "epoch": 2.0349967595593, + "grad_norm": 0.4476688504219055, + "learning_rate": 2.476808137607027e-06, + "loss": 0.0703, + "step": 6280 + }, + { + "epoch": 2.035320803629294, + "grad_norm": 0.4483274817466736, + "learning_rate": 2.4752981396080138e-06, + "loss": 0.0729, + "step": 6281 + }, + { + "epoch": 2.035644847699287, + "grad_norm": 0.4268377423286438, + "learning_rate": 2.473788450608064e-06, + "loss": 0.0659, + "step": 6282 + }, + { + "epoch": 2.0359688917692806, + "grad_norm": 0.4591849446296692, + "learning_rate": 2.472279070791955e-06, + "loss": 0.0711, + "step": 6283 + }, + { + "epoch": 2.036292935839274, + "grad_norm": 0.4683074355125427, + "learning_rate": 2.470770000344417e-06, + "loss": 0.0763, + "step": 6284 + }, + { + "epoch": 2.0366169799092675, + "grad_norm": 0.46519917249679565, + "learning_rate": 2.4692612394501454e-06, + "loss": 0.0749, + "step": 6285 + }, + { + "epoch": 2.0369410239792614, + "grad_norm": 0.4236977994441986, + "learning_rate": 2.4677527882937986e-06, + "loss": 0.068, + "step": 6286 + }, + { + "epoch": 2.037265068049255, + "grad_norm": 0.45990490913391113, + "learning_rate": 2.4662446470599967e-06, + "loss": 0.0759, + "step": 6287 + }, + { + "epoch": 2.037589112119248, + "grad_norm": 0.4591306149959564, + "learning_rate": 2.4647368159333207e-06, + "loss": 0.0733, + "step": 6288 + }, + { + "epoch": 2.0379131561892416, + "grad_norm": 0.4575057625770569, + "learning_rate": 2.4632292950983156e-06, + "loss": 0.0678, + "step": 6289 + }, + { + "epoch": 2.038237200259235, + "grad_norm": 0.42534708976745605, + "learning_rate": 2.461722084739489e-06, + "loss": 0.07, + "step": 6290 + }, + { + "epoch": 2.038561244329229, + "grad_norm": 0.4384348690509796, + "learning_rate": 2.460215185041305e-06, + "loss": 0.0706, + "step": 6291 + }, + { + "epoch": 2.0388852883992223, + "grad_norm": 0.5067781805992126, + "learning_rate": 2.4587085961881947e-06, + "loss": 0.0728, + "step": 6292 + }, + { + "epoch": 2.0392093324692158, + "grad_norm": 0.44250717759132385, + "learning_rate": 2.4572023183645512e-06, + "loss": 0.0666, + "step": 6293 + }, + { + "epoch": 2.039533376539209, + "grad_norm": 0.4823816120624542, + "learning_rate": 2.455696351754727e-06, + "loss": 0.0797, + "step": 6294 + }, + { + "epoch": 2.039857420609203, + "grad_norm": 0.4811875820159912, + "learning_rate": 2.4541906965430395e-06, + "loss": 0.0763, + "step": 6295 + }, + { + "epoch": 2.0401814646791965, + "grad_norm": 0.475002646446228, + "learning_rate": 2.452685352913767e-06, + "loss": 0.0724, + "step": 6296 + }, + { + "epoch": 2.04050550874919, + "grad_norm": 0.5020356774330139, + "learning_rate": 2.451180321051143e-06, + "loss": 0.0807, + "step": 6297 + }, + { + "epoch": 2.0408295528191833, + "grad_norm": 0.4758034944534302, + "learning_rate": 2.449675601139378e-06, + "loss": 0.0749, + "step": 6298 + }, + { + "epoch": 2.0411535968891767, + "grad_norm": 0.45018815994262695, + "learning_rate": 2.448171193362628e-06, + "loss": 0.0684, + "step": 6299 + }, + { + "epoch": 2.0414776409591706, + "grad_norm": 0.46852201223373413, + "learning_rate": 2.44666709790502e-06, + "loss": 0.0732, + "step": 6300 + }, + { + "epoch": 2.041801685029164, + "grad_norm": 0.4619670510292053, + "learning_rate": 2.4451633149506416e-06, + "loss": 0.0728, + "step": 6301 + }, + { + "epoch": 2.0421257290991575, + "grad_norm": 0.46022775769233704, + "learning_rate": 2.4436598446835404e-06, + "loss": 0.0722, + "step": 6302 + }, + { + "epoch": 2.042449773169151, + "grad_norm": 0.45504963397979736, + "learning_rate": 2.442156687287729e-06, + "loss": 0.0704, + "step": 6303 + }, + { + "epoch": 2.0427738172391443, + "grad_norm": 0.5202746391296387, + "learning_rate": 2.4406538429471733e-06, + "loss": 0.0739, + "step": 6304 + }, + { + "epoch": 2.043097861309138, + "grad_norm": 0.4696272611618042, + "learning_rate": 2.4391513118458145e-06, + "loss": 0.072, + "step": 6305 + }, + { + "epoch": 2.0434219053791316, + "grad_norm": 0.46362751722335815, + "learning_rate": 2.4376490941675403e-06, + "loss": 0.0702, + "step": 6306 + }, + { + "epoch": 2.043745949449125, + "grad_norm": 0.5203388333320618, + "learning_rate": 2.436147190096216e-06, + "loss": 0.0815, + "step": 6307 + }, + { + "epoch": 2.0440699935191184, + "grad_norm": 0.4749559164047241, + "learning_rate": 2.434645599815653e-06, + "loss": 0.0705, + "step": 6308 + }, + { + "epoch": 2.0443940375891123, + "grad_norm": 0.4685365855693817, + "learning_rate": 2.4331443235096343e-06, + "loss": 0.0757, + "step": 6309 + }, + { + "epoch": 2.0447180816591057, + "grad_norm": 0.4908275902271271, + "learning_rate": 2.431643361361901e-06, + "loss": 0.0748, + "step": 6310 + }, + { + "epoch": 2.045042125729099, + "grad_norm": 0.48974791169166565, + "learning_rate": 2.4301427135561572e-06, + "loss": 0.0761, + "step": 6311 + }, + { + "epoch": 2.0453661697990926, + "grad_norm": 0.4484266936779022, + "learning_rate": 2.4286423802760683e-06, + "loss": 0.0713, + "step": 6312 + }, + { + "epoch": 2.045690213869086, + "grad_norm": 0.4263099431991577, + "learning_rate": 2.4271423617052564e-06, + "loss": 0.0691, + "step": 6313 + }, + { + "epoch": 2.04601425793908, + "grad_norm": 0.4288794696331024, + "learning_rate": 2.4256426580273156e-06, + "loss": 0.07, + "step": 6314 + }, + { + "epoch": 2.0463383020090733, + "grad_norm": 0.48074692487716675, + "learning_rate": 2.42414326942579e-06, + "loss": 0.075, + "step": 6315 + }, + { + "epoch": 2.0466623460790667, + "grad_norm": 0.46625542640686035, + "learning_rate": 2.422644196084192e-06, + "loss": 0.0682, + "step": 6316 + }, + { + "epoch": 2.04698639014906, + "grad_norm": 0.47445952892303467, + "learning_rate": 2.4211454381859935e-06, + "loss": 0.0764, + "step": 6317 + }, + { + "epoch": 2.047310434219054, + "grad_norm": 0.48692288994789124, + "learning_rate": 2.419646995914628e-06, + "loss": 0.0768, + "step": 6318 + }, + { + "epoch": 2.0476344782890474, + "grad_norm": 0.5131134986877441, + "learning_rate": 2.4181488694534903e-06, + "loss": 0.0823, + "step": 6319 + }, + { + "epoch": 2.047958522359041, + "grad_norm": 0.4429638087749481, + "learning_rate": 2.4166510589859394e-06, + "loss": 0.0712, + "step": 6320 + }, + { + "epoch": 2.0482825664290343, + "grad_norm": 0.49024632573127747, + "learning_rate": 2.4151535646952877e-06, + "loss": 0.0748, + "step": 6321 + }, + { + "epoch": 2.0486066104990277, + "grad_norm": 0.4927096664905548, + "learning_rate": 2.413656386764817e-06, + "loss": 0.0782, + "step": 6322 + }, + { + "epoch": 2.0489306545690216, + "grad_norm": 0.5299798250198364, + "learning_rate": 2.4121595253777657e-06, + "loss": 0.0866, + "step": 6323 + }, + { + "epoch": 2.049254698639015, + "grad_norm": 0.436034232378006, + "learning_rate": 2.410662980717337e-06, + "loss": 0.0638, + "step": 6324 + }, + { + "epoch": 2.0495787427090084, + "grad_norm": 0.461628794670105, + "learning_rate": 2.4091667529666923e-06, + "loss": 0.0738, + "step": 6325 + }, + { + "epoch": 2.049902786779002, + "grad_norm": 0.4576832354068756, + "learning_rate": 2.4076708423089563e-06, + "loss": 0.0756, + "step": 6326 + }, + { + "epoch": 2.0502268308489953, + "grad_norm": 0.4655950963497162, + "learning_rate": 2.4061752489272156e-06, + "loss": 0.0734, + "step": 6327 + }, + { + "epoch": 2.050550874918989, + "grad_norm": 0.446172833442688, + "learning_rate": 2.40467997300451e-06, + "loss": 0.0706, + "step": 6328 + }, + { + "epoch": 2.0508749189889826, + "grad_norm": 0.43468722701072693, + "learning_rate": 2.403185014723855e-06, + "loss": 0.068, + "step": 6329 + }, + { + "epoch": 2.051198963058976, + "grad_norm": 0.4308585524559021, + "learning_rate": 2.401690374268211e-06, + "loss": 0.068, + "step": 6330 + }, + { + "epoch": 2.0515230071289694, + "grad_norm": 0.42982977628707886, + "learning_rate": 2.400196051820516e-06, + "loss": 0.0718, + "step": 6331 + }, + { + "epoch": 2.0518470511989633, + "grad_norm": 0.47812700271606445, + "learning_rate": 2.3987020475636538e-06, + "loss": 0.0784, + "step": 6332 + }, + { + "epoch": 2.0521710952689567, + "grad_norm": 0.447527676820755, + "learning_rate": 2.3972083616804786e-06, + "loss": 0.0688, + "step": 6333 + }, + { + "epoch": 2.05249513933895, + "grad_norm": 0.5247117877006531, + "learning_rate": 2.395714994353805e-06, + "loss": 0.0834, + "step": 6334 + }, + { + "epoch": 2.0528191834089435, + "grad_norm": 0.46339911222457886, + "learning_rate": 2.3942219457664007e-06, + "loss": 0.0761, + "step": 6335 + }, + { + "epoch": 2.053143227478937, + "grad_norm": 0.41818615794181824, + "learning_rate": 2.392729216101008e-06, + "loss": 0.0674, + "step": 6336 + }, + { + "epoch": 2.053467271548931, + "grad_norm": 0.479206919670105, + "learning_rate": 2.391236805540315e-06, + "loss": 0.0777, + "step": 6337 + }, + { + "epoch": 2.0537913156189243, + "grad_norm": 0.4307020306587219, + "learning_rate": 2.3897447142669864e-06, + "loss": 0.0706, + "step": 6338 + }, + { + "epoch": 2.0541153596889177, + "grad_norm": 0.43551212549209595, + "learning_rate": 2.3882529424636335e-06, + "loss": 0.0713, + "step": 6339 + }, + { + "epoch": 2.054439403758911, + "grad_norm": 0.4901053011417389, + "learning_rate": 2.3867614903128372e-06, + "loss": 0.0739, + "step": 6340 + }, + { + "epoch": 2.0547634478289045, + "grad_norm": 0.5215500593185425, + "learning_rate": 2.385270357997136e-06, + "loss": 0.0743, + "step": 6341 + }, + { + "epoch": 2.0550874918988984, + "grad_norm": 0.4421338737010956, + "learning_rate": 2.38377954569903e-06, + "loss": 0.0676, + "step": 6342 + }, + { + "epoch": 2.055411535968892, + "grad_norm": 0.4533500373363495, + "learning_rate": 2.3822890536009835e-06, + "loss": 0.0712, + "step": 6343 + }, + { + "epoch": 2.0557355800388852, + "grad_norm": 0.4607000946998596, + "learning_rate": 2.3807988818854126e-06, + "loss": 0.072, + "step": 6344 + }, + { + "epoch": 2.0560596241088787, + "grad_norm": 0.4839719533920288, + "learning_rate": 2.3793090307347034e-06, + "loss": 0.0761, + "step": 6345 + }, + { + "epoch": 2.0563836681788725, + "grad_norm": 0.44080618023872375, + "learning_rate": 2.3778195003311983e-06, + "loss": 0.0706, + "step": 6346 + }, + { + "epoch": 2.056707712248866, + "grad_norm": 0.4685359001159668, + "learning_rate": 2.3763302908572016e-06, + "loss": 0.0746, + "step": 6347 + }, + { + "epoch": 2.0570317563188594, + "grad_norm": 0.45527467131614685, + "learning_rate": 2.3748414024949783e-06, + "loss": 0.0708, + "step": 6348 + }, + { + "epoch": 2.057355800388853, + "grad_norm": 0.46099653840065, + "learning_rate": 2.373352835426754e-06, + "loss": 0.0712, + "step": 6349 + }, + { + "epoch": 2.057679844458846, + "grad_norm": 0.45064616203308105, + "learning_rate": 2.3718645898347144e-06, + "loss": 0.0703, + "step": 6350 + }, + { + "epoch": 2.05800388852884, + "grad_norm": 0.47433632612228394, + "learning_rate": 2.3703766659010086e-06, + "loss": 0.073, + "step": 6351 + }, + { + "epoch": 2.0583279325988335, + "grad_norm": 0.5127187371253967, + "learning_rate": 2.368889063807739e-06, + "loss": 0.0785, + "step": 6352 + }, + { + "epoch": 2.058651976668827, + "grad_norm": 0.4757230281829834, + "learning_rate": 2.36740178373698e-06, + "loss": 0.0739, + "step": 6353 + }, + { + "epoch": 2.0589760207388204, + "grad_norm": 0.47789400815963745, + "learning_rate": 2.3659148258707553e-06, + "loss": 0.0795, + "step": 6354 + }, + { + "epoch": 2.059300064808814, + "grad_norm": 0.489640474319458, + "learning_rate": 2.364428190391056e-06, + "loss": 0.0692, + "step": 6355 + }, + { + "epoch": 2.0596241088788076, + "grad_norm": 0.4995042383670807, + "learning_rate": 2.362941877479834e-06, + "loss": 0.0708, + "step": 6356 + }, + { + "epoch": 2.059948152948801, + "grad_norm": 0.47687268257141113, + "learning_rate": 2.3614558873189934e-06, + "loss": 0.0795, + "step": 6357 + }, + { + "epoch": 2.0602721970187945, + "grad_norm": 0.5026583075523376, + "learning_rate": 2.3599702200904134e-06, + "loss": 0.0769, + "step": 6358 + }, + { + "epoch": 2.060596241088788, + "grad_norm": 0.5032001733779907, + "learning_rate": 2.3584848759759165e-06, + "loss": 0.0763, + "step": 6359 + }, + { + "epoch": 2.060920285158782, + "grad_norm": 0.48778533935546875, + "learning_rate": 2.356999855157303e-06, + "loss": 0.0764, + "step": 6360 + }, + { + "epoch": 2.061244329228775, + "grad_norm": 0.47545087337493896, + "learning_rate": 2.355515157816318e-06, + "loss": 0.0723, + "step": 6361 + }, + { + "epoch": 2.0615683732987686, + "grad_norm": 0.47968313097953796, + "learning_rate": 2.35403078413468e-06, + "loss": 0.0769, + "step": 6362 + }, + { + "epoch": 2.061892417368762, + "grad_norm": 0.5082609057426453, + "learning_rate": 2.352546734294057e-06, + "loss": 0.08, + "step": 6363 + }, + { + "epoch": 2.0622164614387555, + "grad_norm": 0.4632527232170105, + "learning_rate": 2.351063008476085e-06, + "loss": 0.0682, + "step": 6364 + }, + { + "epoch": 2.0625405055087493, + "grad_norm": 0.4676029086112976, + "learning_rate": 2.3495796068623565e-06, + "loss": 0.0748, + "step": 6365 + }, + { + "epoch": 2.0628645495787428, + "grad_norm": 0.49084705114364624, + "learning_rate": 2.3480965296344264e-06, + "loss": 0.0789, + "step": 6366 + }, + { + "epoch": 2.063188593648736, + "grad_norm": 0.47968021035194397, + "learning_rate": 2.3466137769738106e-06, + "loss": 0.0794, + "step": 6367 + }, + { + "epoch": 2.0635126377187296, + "grad_norm": 0.4509599804878235, + "learning_rate": 2.345131349061978e-06, + "loss": 0.064, + "step": 6368 + }, + { + "epoch": 2.0638366817887235, + "grad_norm": 0.4971236288547516, + "learning_rate": 2.343649246080371e-06, + "loss": 0.0829, + "step": 6369 + }, + { + "epoch": 2.064160725858717, + "grad_norm": 0.49975186586380005, + "learning_rate": 2.3421674682103784e-06, + "loss": 0.0799, + "step": 6370 + }, + { + "epoch": 2.0644847699287103, + "grad_norm": 0.4948911666870117, + "learning_rate": 2.3406860156333584e-06, + "loss": 0.0817, + "step": 6371 + }, + { + "epoch": 2.0648088139987038, + "grad_norm": 0.4723863899707794, + "learning_rate": 2.339204888530626e-06, + "loss": 0.0748, + "step": 6372 + }, + { + "epoch": 2.065132858068697, + "grad_norm": 0.4942839741706848, + "learning_rate": 2.337724087083456e-06, + "loss": 0.0806, + "step": 6373 + }, + { + "epoch": 2.065456902138691, + "grad_norm": 0.4573349952697754, + "learning_rate": 2.3362436114730858e-06, + "loss": 0.0724, + "step": 6374 + }, + { + "epoch": 2.0657809462086845, + "grad_norm": 0.49293631315231323, + "learning_rate": 2.334763461880712e-06, + "loss": 0.0791, + "step": 6375 + }, + { + "epoch": 2.066104990278678, + "grad_norm": 0.5131592750549316, + "learning_rate": 2.333283638487487e-06, + "loss": 0.0731, + "step": 6376 + }, + { + "epoch": 2.0664290343486713, + "grad_norm": 0.4619980752468109, + "learning_rate": 2.3318041414745286e-06, + "loss": 0.072, + "step": 6377 + }, + { + "epoch": 2.0667530784186647, + "grad_norm": 0.4428941309452057, + "learning_rate": 2.330324971022913e-06, + "loss": 0.0698, + "step": 6378 + }, + { + "epoch": 2.0670771224886586, + "grad_norm": 0.4406226873397827, + "learning_rate": 2.328846127313677e-06, + "loss": 0.0728, + "step": 6379 + }, + { + "epoch": 2.067401166558652, + "grad_norm": 0.44868943095207214, + "learning_rate": 2.3273676105278163e-06, + "loss": 0.071, + "step": 6380 + }, + { + "epoch": 2.0677252106286454, + "grad_norm": 0.4691688120365143, + "learning_rate": 2.325889420846287e-06, + "loss": 0.0757, + "step": 6381 + }, + { + "epoch": 2.068049254698639, + "grad_norm": 0.47268572449684143, + "learning_rate": 2.3244115584500065e-06, + "loss": 0.0759, + "step": 6382 + }, + { + "epoch": 2.0683732987686327, + "grad_norm": 0.4854334890842438, + "learning_rate": 2.3229340235198462e-06, + "loss": 0.0765, + "step": 6383 + }, + { + "epoch": 2.068697342838626, + "grad_norm": 0.44808229804039, + "learning_rate": 2.321456816236649e-06, + "loss": 0.0697, + "step": 6384 + }, + { + "epoch": 2.0690213869086196, + "grad_norm": 0.48902401328086853, + "learning_rate": 2.3199799367812052e-06, + "loss": 0.0746, + "step": 6385 + }, + { + "epoch": 2.069345430978613, + "grad_norm": 0.4742581844329834, + "learning_rate": 2.3185033853342733e-06, + "loss": 0.0726, + "step": 6386 + }, + { + "epoch": 2.0696694750486064, + "grad_norm": 0.46202272176742554, + "learning_rate": 2.317027162076567e-06, + "loss": 0.0732, + "step": 6387 + }, + { + "epoch": 2.0699935191186003, + "grad_norm": 0.49215349555015564, + "learning_rate": 2.3155512671887637e-06, + "loss": 0.0764, + "step": 6388 + }, + { + "epoch": 2.0703175631885937, + "grad_norm": 0.47210338711738586, + "learning_rate": 2.3140757008514997e-06, + "loss": 0.0663, + "step": 6389 + }, + { + "epoch": 2.070641607258587, + "grad_norm": 0.47308242321014404, + "learning_rate": 2.312600463245364e-06, + "loss": 0.071, + "step": 6390 + }, + { + "epoch": 2.0709656513285806, + "grad_norm": 0.47955775260925293, + "learning_rate": 2.311125554550919e-06, + "loss": 0.0782, + "step": 6391 + }, + { + "epoch": 2.071289695398574, + "grad_norm": 0.4697352945804596, + "learning_rate": 2.309650974948673e-06, + "loss": 0.0719, + "step": 6392 + }, + { + "epoch": 2.071613739468568, + "grad_norm": 0.5027076601982117, + "learning_rate": 2.308176724619106e-06, + "loss": 0.071, + "step": 6393 + }, + { + "epoch": 2.0719377835385613, + "grad_norm": 0.47702887654304504, + "learning_rate": 2.3067028037426476e-06, + "loss": 0.0752, + "step": 6394 + }, + { + "epoch": 2.0722618276085547, + "grad_norm": 0.4764552116394043, + "learning_rate": 2.3052292124996927e-06, + "loss": 0.0755, + "step": 6395 + }, + { + "epoch": 2.072585871678548, + "grad_norm": 0.47523608803749084, + "learning_rate": 2.3037559510705954e-06, + "loss": 0.0767, + "step": 6396 + }, + { + "epoch": 2.072909915748542, + "grad_norm": 0.4495546519756317, + "learning_rate": 2.302283019635667e-06, + "loss": 0.0715, + "step": 6397 + }, + { + "epoch": 2.0732339598185354, + "grad_norm": 0.4740990996360779, + "learning_rate": 2.3008104183751835e-06, + "loss": 0.0721, + "step": 6398 + }, + { + "epoch": 2.073558003888529, + "grad_norm": 0.4597020447254181, + "learning_rate": 2.299338147469373e-06, + "loss": 0.071, + "step": 6399 + }, + { + "epoch": 2.0738820479585223, + "grad_norm": 0.4866323471069336, + "learning_rate": 2.297866207098428e-06, + "loss": 0.0784, + "step": 6400 + }, + { + "epoch": 2.0742060920285157, + "grad_norm": 0.4788459837436676, + "learning_rate": 2.296394597442501e-06, + "loss": 0.0733, + "step": 6401 + }, + { + "epoch": 2.0745301360985096, + "grad_norm": 0.47980794310569763, + "learning_rate": 2.2949233186817026e-06, + "loss": 0.0755, + "step": 6402 + }, + { + "epoch": 2.074854180168503, + "grad_norm": 0.4675251245498657, + "learning_rate": 2.2934523709961027e-06, + "loss": 0.0759, + "step": 6403 + }, + { + "epoch": 2.0751782242384964, + "grad_norm": 0.47605273127555847, + "learning_rate": 2.2919817545657315e-06, + "loss": 0.0747, + "step": 6404 + }, + { + "epoch": 2.07550226830849, + "grad_norm": 0.49417728185653687, + "learning_rate": 2.290511469570577e-06, + "loss": 0.0737, + "step": 6405 + }, + { + "epoch": 2.0758263123784833, + "grad_norm": 0.5057334899902344, + "learning_rate": 2.289041516190591e-06, + "loss": 0.0759, + "step": 6406 + }, + { + "epoch": 2.076150356448477, + "grad_norm": 0.49573424458503723, + "learning_rate": 2.287571894605678e-06, + "loss": 0.0835, + "step": 6407 + }, + { + "epoch": 2.0764744005184705, + "grad_norm": 0.5077083706855774, + "learning_rate": 2.286102604995706e-06, + "loss": 0.0842, + "step": 6408 + }, + { + "epoch": 2.076798444588464, + "grad_norm": 0.47334715723991394, + "learning_rate": 2.284633647540503e-06, + "loss": 0.068, + "step": 6409 + }, + { + "epoch": 2.0771224886584574, + "grad_norm": 0.46106183528900146, + "learning_rate": 2.2831650224198547e-06, + "loss": 0.063, + "step": 6410 + }, + { + "epoch": 2.0774465327284513, + "grad_norm": 0.44112372398376465, + "learning_rate": 2.2816967298135082e-06, + "loss": 0.0688, + "step": 6411 + }, + { + "epoch": 2.0777705767984447, + "grad_norm": 0.43434029817581177, + "learning_rate": 2.2802287699011633e-06, + "loss": 0.0685, + "step": 6412 + }, + { + "epoch": 2.078094620868438, + "grad_norm": 0.49972984194755554, + "learning_rate": 2.2787611428624913e-06, + "loss": 0.0787, + "step": 6413 + }, + { + "epoch": 2.0784186649384315, + "grad_norm": 0.49321141839027405, + "learning_rate": 2.277293848877108e-06, + "loss": 0.0788, + "step": 6414 + }, + { + "epoch": 2.078742709008425, + "grad_norm": 0.48927661776542664, + "learning_rate": 2.2758268881246036e-06, + "loss": 0.0791, + "step": 6415 + }, + { + "epoch": 2.079066753078419, + "grad_norm": 0.4573841989040375, + "learning_rate": 2.274360260784514e-06, + "loss": 0.0677, + "step": 6416 + }, + { + "epoch": 2.0793907971484122, + "grad_norm": 0.5282468795776367, + "learning_rate": 2.2728939670363425e-06, + "loss": 0.0816, + "step": 6417 + }, + { + "epoch": 2.0797148412184057, + "grad_norm": 0.43157610297203064, + "learning_rate": 2.271428007059549e-06, + "loss": 0.0675, + "step": 6418 + }, + { + "epoch": 2.080038885288399, + "grad_norm": 0.48366230726242065, + "learning_rate": 2.2699623810335523e-06, + "loss": 0.0728, + "step": 6419 + }, + { + "epoch": 2.080362929358393, + "grad_norm": 0.46891725063323975, + "learning_rate": 2.268497089137734e-06, + "loss": 0.069, + "step": 6420 + }, + { + "epoch": 2.0806869734283864, + "grad_norm": 0.4607992470264435, + "learning_rate": 2.2670321315514242e-06, + "loss": 0.0746, + "step": 6421 + }, + { + "epoch": 2.08101101749838, + "grad_norm": 0.4852626621723175, + "learning_rate": 2.265567508453929e-06, + "loss": 0.077, + "step": 6422 + }, + { + "epoch": 2.0813350615683732, + "grad_norm": 0.4649800956249237, + "learning_rate": 2.2641032200244973e-06, + "loss": 0.0758, + "step": 6423 + }, + { + "epoch": 2.0816591056383666, + "grad_norm": 0.48343998193740845, + "learning_rate": 2.2626392664423457e-06, + "loss": 0.0768, + "step": 6424 + }, + { + "epoch": 2.0819831497083605, + "grad_norm": 0.5083569288253784, + "learning_rate": 2.261175647886648e-06, + "loss": 0.0757, + "step": 6425 + }, + { + "epoch": 2.082307193778354, + "grad_norm": 0.4693621098995209, + "learning_rate": 2.2597123645365375e-06, + "loss": 0.0749, + "step": 6426 + }, + { + "epoch": 2.0826312378483474, + "grad_norm": 0.5031575560569763, + "learning_rate": 2.2582494165711055e-06, + "loss": 0.0762, + "step": 6427 + }, + { + "epoch": 2.082955281918341, + "grad_norm": 0.49334973096847534, + "learning_rate": 2.256786804169403e-06, + "loss": 0.0696, + "step": 6428 + }, + { + "epoch": 2.083279325988334, + "grad_norm": 0.4514414370059967, + "learning_rate": 2.2553245275104406e-06, + "loss": 0.07, + "step": 6429 + }, + { + "epoch": 2.083603370058328, + "grad_norm": 0.4750472903251648, + "learning_rate": 2.253862586773184e-06, + "loss": 0.0748, + "step": 6430 + }, + { + "epoch": 2.0839274141283215, + "grad_norm": 0.4798753261566162, + "learning_rate": 2.2524009821365622e-06, + "loss": 0.0786, + "step": 6431 + }, + { + "epoch": 2.084251458198315, + "grad_norm": 0.4874429702758789, + "learning_rate": 2.2509397137794616e-06, + "loss": 0.0813, + "step": 6432 + }, + { + "epoch": 2.0845755022683083, + "grad_norm": 0.5213260054588318, + "learning_rate": 2.249478781880726e-06, + "loss": 0.0819, + "step": 6433 + }, + { + "epoch": 2.084899546338302, + "grad_norm": 0.45666658878326416, + "learning_rate": 2.248018186619161e-06, + "loss": 0.0707, + "step": 6434 + }, + { + "epoch": 2.0852235904082956, + "grad_norm": 0.4775700569152832, + "learning_rate": 2.2465579281735288e-06, + "loss": 0.0679, + "step": 6435 + }, + { + "epoch": 2.085547634478289, + "grad_norm": 0.4781835675239563, + "learning_rate": 2.245098006722551e-06, + "loss": 0.073, + "step": 6436 + }, + { + "epoch": 2.0858716785482825, + "grad_norm": 0.45587706565856934, + "learning_rate": 2.2436384224449094e-06, + "loss": 0.0739, + "step": 6437 + }, + { + "epoch": 2.086195722618276, + "grad_norm": 0.44620490074157715, + "learning_rate": 2.242179175519239e-06, + "loss": 0.0697, + "step": 6438 + }, + { + "epoch": 2.0865197666882698, + "grad_norm": 0.5067679286003113, + "learning_rate": 2.24072026612414e-06, + "loss": 0.0744, + "step": 6439 + }, + { + "epoch": 2.086843810758263, + "grad_norm": 0.47220391035079956, + "learning_rate": 2.239261694438169e-06, + "loss": 0.0733, + "step": 6440 + }, + { + "epoch": 2.0871678548282566, + "grad_norm": 0.48890239000320435, + "learning_rate": 2.2378034606398396e-06, + "loss": 0.0768, + "step": 6441 + }, + { + "epoch": 2.08749189889825, + "grad_norm": 0.48164820671081543, + "learning_rate": 2.2363455649076295e-06, + "loss": 0.0744, + "step": 6442 + }, + { + "epoch": 2.087815942968244, + "grad_norm": 0.5287774801254272, + "learning_rate": 2.234888007419963e-06, + "loss": 0.078, + "step": 6443 + }, + { + "epoch": 2.0881399870382373, + "grad_norm": 0.5205512642860413, + "learning_rate": 2.233430788355241e-06, + "loss": 0.0795, + "step": 6444 + }, + { + "epoch": 2.0884640311082308, + "grad_norm": 0.479859858751297, + "learning_rate": 2.2319739078918036e-06, + "loss": 0.0703, + "step": 6445 + }, + { + "epoch": 2.088788075178224, + "grad_norm": 0.5163217782974243, + "learning_rate": 2.230517366207967e-06, + "loss": 0.08, + "step": 6446 + }, + { + "epoch": 2.0891121192482176, + "grad_norm": 0.4633115530014038, + "learning_rate": 2.2290611634819925e-06, + "loss": 0.0738, + "step": 6447 + }, + { + "epoch": 2.0894361633182115, + "grad_norm": 0.4752098619937897, + "learning_rate": 2.2276052998921064e-06, + "loss": 0.0697, + "step": 6448 + }, + { + "epoch": 2.089760207388205, + "grad_norm": 0.4927937686443329, + "learning_rate": 2.2261497756164934e-06, + "loss": 0.0757, + "step": 6449 + }, + { + "epoch": 2.0900842514581983, + "grad_norm": 0.44520872831344604, + "learning_rate": 2.2246945908332946e-06, + "loss": 0.0657, + "step": 6450 + }, + { + "epoch": 2.0904082955281917, + "grad_norm": 0.4533376693725586, + "learning_rate": 2.2232397457206122e-06, + "loss": 0.071, + "step": 6451 + }, + { + "epoch": 2.090732339598185, + "grad_norm": 0.4914572238922119, + "learning_rate": 2.221785240456501e-06, + "loss": 0.0733, + "step": 6452 + }, + { + "epoch": 2.091056383668179, + "grad_norm": 0.46928560733795166, + "learning_rate": 2.220331075218984e-06, + "loss": 0.0747, + "step": 6453 + }, + { + "epoch": 2.0913804277381725, + "grad_norm": 0.46369004249572754, + "learning_rate": 2.218877250186033e-06, + "loss": 0.0691, + "step": 6454 + }, + { + "epoch": 2.091704471808166, + "grad_norm": 0.48942112922668457, + "learning_rate": 2.217423765535583e-06, + "loss": 0.0759, + "step": 6455 + }, + { + "epoch": 2.0920285158781593, + "grad_norm": 0.44329163432121277, + "learning_rate": 2.2159706214455267e-06, + "loss": 0.0666, + "step": 6456 + }, + { + "epoch": 2.0923525599481527, + "grad_norm": 0.47483310103416443, + "learning_rate": 2.2145178180937142e-06, + "loss": 0.0794, + "step": 6457 + }, + { + "epoch": 2.0926766040181466, + "grad_norm": 0.49389538168907166, + "learning_rate": 2.2130653556579564e-06, + "loss": 0.0774, + "step": 6458 + }, + { + "epoch": 2.09300064808814, + "grad_norm": 0.5197028517723083, + "learning_rate": 2.2116132343160183e-06, + "loss": 0.078, + "step": 6459 + }, + { + "epoch": 2.0933246921581334, + "grad_norm": 0.4676690399646759, + "learning_rate": 2.210161454245629e-06, + "loss": 0.0702, + "step": 6460 + }, + { + "epoch": 2.093648736228127, + "grad_norm": 0.464379221200943, + "learning_rate": 2.2087100156244684e-06, + "loss": 0.0703, + "step": 6461 + }, + { + "epoch": 2.0939727802981207, + "grad_norm": 0.46508681774139404, + "learning_rate": 2.2072589186301797e-06, + "loss": 0.0721, + "step": 6462 + }, + { + "epoch": 2.094296824368114, + "grad_norm": 0.46484997868537903, + "learning_rate": 2.2058081634403637e-06, + "loss": 0.0727, + "step": 6463 + }, + { + "epoch": 2.0946208684381076, + "grad_norm": 0.47296300530433655, + "learning_rate": 2.2043577502325786e-06, + "loss": 0.0705, + "step": 6464 + }, + { + "epoch": 2.094944912508101, + "grad_norm": 0.4883367121219635, + "learning_rate": 2.2029076791843408e-06, + "loss": 0.0761, + "step": 6465 + }, + { + "epoch": 2.0952689565780944, + "grad_norm": 0.47738826274871826, + "learning_rate": 2.201457950473127e-06, + "loss": 0.0717, + "step": 6466 + }, + { + "epoch": 2.0955930006480883, + "grad_norm": 0.44886377453804016, + "learning_rate": 2.2000085642763647e-06, + "loss": 0.0655, + "step": 6467 + }, + { + "epoch": 2.0959170447180817, + "grad_norm": 0.4837161898612976, + "learning_rate": 2.1985595207714515e-06, + "loss": 0.0685, + "step": 6468 + }, + { + "epoch": 2.096241088788075, + "grad_norm": 0.4646311104297638, + "learning_rate": 2.1971108201357294e-06, + "loss": 0.0728, + "step": 6469 + }, + { + "epoch": 2.0965651328580686, + "grad_norm": 0.49209609627723694, + "learning_rate": 2.1956624625465124e-06, + "loss": 0.0762, + "step": 6470 + }, + { + "epoch": 2.0968891769280624, + "grad_norm": 0.46310967206954956, + "learning_rate": 2.19421444818106e-06, + "loss": 0.0719, + "step": 6471 + }, + { + "epoch": 2.097213220998056, + "grad_norm": 0.494526743888855, + "learning_rate": 2.1927667772165974e-06, + "loss": 0.0779, + "step": 6472 + }, + { + "epoch": 2.0975372650680493, + "grad_norm": 0.4947710633277893, + "learning_rate": 2.1913194498303064e-06, + "loss": 0.0739, + "step": 6473 + }, + { + "epoch": 2.0978613091380427, + "grad_norm": 0.45872384309768677, + "learning_rate": 2.189872466199321e-06, + "loss": 0.0715, + "step": 6474 + }, + { + "epoch": 2.098185353208036, + "grad_norm": 0.45841169357299805, + "learning_rate": 2.188425826500746e-06, + "loss": 0.0682, + "step": 6475 + }, + { + "epoch": 2.09850939727803, + "grad_norm": 0.485802561044693, + "learning_rate": 2.186979530911627e-06, + "loss": 0.072, + "step": 6476 + }, + { + "epoch": 2.0988334413480234, + "grad_norm": 0.4475204348564148, + "learning_rate": 2.1855335796089854e-06, + "loss": 0.0691, + "step": 6477 + }, + { + "epoch": 2.099157485418017, + "grad_norm": 0.45817169547080994, + "learning_rate": 2.184087972769786e-06, + "loss": 0.0727, + "step": 6478 + }, + { + "epoch": 2.0994815294880103, + "grad_norm": 0.4263944923877716, + "learning_rate": 2.182642710570958e-06, + "loss": 0.0633, + "step": 6479 + }, + { + "epoch": 2.0998055735580037, + "grad_norm": 0.5023675560951233, + "learning_rate": 2.1811977931893884e-06, + "loss": 0.0774, + "step": 6480 + }, + { + "epoch": 2.1001296176279975, + "grad_norm": 0.45618337392807007, + "learning_rate": 2.1797532208019212e-06, + "loss": 0.0711, + "step": 6481 + }, + { + "epoch": 2.100453661697991, + "grad_norm": 0.5437365770339966, + "learning_rate": 2.178308993585359e-06, + "loss": 0.0856, + "step": 6482 + }, + { + "epoch": 2.1007777057679844, + "grad_norm": 0.4399570822715759, + "learning_rate": 2.1768651117164565e-06, + "loss": 0.0685, + "step": 6483 + }, + { + "epoch": 2.101101749837978, + "grad_norm": 0.46941646933555603, + "learning_rate": 2.1754215753719376e-06, + "loss": 0.0711, + "step": 6484 + }, + { + "epoch": 2.1014257939079717, + "grad_norm": 0.5223105549812317, + "learning_rate": 2.173978384728472e-06, + "loss": 0.0776, + "step": 6485 + }, + { + "epoch": 2.101749837977965, + "grad_norm": 0.5064131617546082, + "learning_rate": 2.1725355399626943e-06, + "loss": 0.0734, + "step": 6486 + }, + { + "epoch": 2.1020738820479585, + "grad_norm": 0.46504563093185425, + "learning_rate": 2.1710930412511943e-06, + "loss": 0.0689, + "step": 6487 + }, + { + "epoch": 2.102397926117952, + "grad_norm": 0.46737441420555115, + "learning_rate": 2.1696508887705203e-06, + "loss": 0.0722, + "step": 6488 + }, + { + "epoch": 2.1027219701879454, + "grad_norm": 0.4769153594970703, + "learning_rate": 2.168209082697178e-06, + "loss": 0.0759, + "step": 6489 + }, + { + "epoch": 2.1030460142579392, + "grad_norm": 0.4670775830745697, + "learning_rate": 2.166767623207631e-06, + "loss": 0.0686, + "step": 6490 + }, + { + "epoch": 2.1033700583279327, + "grad_norm": 0.49240654706954956, + "learning_rate": 2.1653265104782967e-06, + "loss": 0.0737, + "step": 6491 + }, + { + "epoch": 2.103694102397926, + "grad_norm": 0.4757388234138489, + "learning_rate": 2.163885744685558e-06, + "loss": 0.074, + "step": 6492 + }, + { + "epoch": 2.1040181464679195, + "grad_norm": 0.48718348145484924, + "learning_rate": 2.1624453260057477e-06, + "loss": 0.0753, + "step": 6493 + }, + { + "epoch": 2.1043421905379134, + "grad_norm": 0.4722293019294739, + "learning_rate": 2.1610052546151593e-06, + "loss": 0.0741, + "step": 6494 + }, + { + "epoch": 2.104666234607907, + "grad_norm": 0.5052745342254639, + "learning_rate": 2.1595655306900444e-06, + "loss": 0.0769, + "step": 6495 + }, + { + "epoch": 2.1049902786779002, + "grad_norm": 0.4688330590724945, + "learning_rate": 2.1581261544066113e-06, + "loss": 0.0775, + "step": 6496 + }, + { + "epoch": 2.1053143227478937, + "grad_norm": 0.49035385251045227, + "learning_rate": 2.1566871259410267e-06, + "loss": 0.08, + "step": 6497 + }, + { + "epoch": 2.105638366817887, + "grad_norm": 0.4556289315223694, + "learning_rate": 2.1552484454694087e-06, + "loss": 0.0667, + "step": 6498 + }, + { + "epoch": 2.105962410887881, + "grad_norm": 0.4465049207210541, + "learning_rate": 2.1538101131678457e-06, + "loss": 0.0718, + "step": 6499 + }, + { + "epoch": 2.1062864549578744, + "grad_norm": 0.48459509015083313, + "learning_rate": 2.1523721292123676e-06, + "loss": 0.0742, + "step": 6500 + }, + { + "epoch": 2.106610499027868, + "grad_norm": 0.45342642068862915, + "learning_rate": 2.1509344937789778e-06, + "loss": 0.0697, + "step": 6501 + }, + { + "epoch": 2.106934543097861, + "grad_norm": 0.4800603687763214, + "learning_rate": 2.149497207043623e-06, + "loss": 0.0757, + "step": 6502 + }, + { + "epoch": 2.1072585871678546, + "grad_norm": 0.4755440354347229, + "learning_rate": 2.148060269182215e-06, + "loss": 0.0745, + "step": 6503 + }, + { + "epoch": 2.1075826312378485, + "grad_norm": 0.4882161617279053, + "learning_rate": 2.1466236803706225e-06, + "loss": 0.0767, + "step": 6504 + }, + { + "epoch": 2.107906675307842, + "grad_norm": 0.508690357208252, + "learning_rate": 2.1451874407846655e-06, + "loss": 0.0794, + "step": 6505 + }, + { + "epoch": 2.1082307193778353, + "grad_norm": 0.4412241578102112, + "learning_rate": 2.143751550600133e-06, + "loss": 0.0666, + "step": 6506 + }, + { + "epoch": 2.1085547634478288, + "grad_norm": 0.5109474062919617, + "learning_rate": 2.142316009992756e-06, + "loss": 0.0719, + "step": 6507 + }, + { + "epoch": 2.108878807517822, + "grad_norm": 0.4810276925563812, + "learning_rate": 2.140880819138238e-06, + "loss": 0.0726, + "step": 6508 + }, + { + "epoch": 2.109202851587816, + "grad_norm": 0.4703182578086853, + "learning_rate": 2.1394459782122283e-06, + "loss": 0.0721, + "step": 6509 + }, + { + "epoch": 2.1095268956578095, + "grad_norm": 0.47323593497276306, + "learning_rate": 2.1380114873903377e-06, + "loss": 0.0746, + "step": 6510 + }, + { + "epoch": 2.109850939727803, + "grad_norm": 0.48979589343070984, + "learning_rate": 2.136577346848136e-06, + "loss": 0.0657, + "step": 6511 + }, + { + "epoch": 2.1101749837977963, + "grad_norm": 0.43888792395591736, + "learning_rate": 2.135143556761146e-06, + "loss": 0.0657, + "step": 6512 + }, + { + "epoch": 2.11049902786779, + "grad_norm": 0.47470882534980774, + "learning_rate": 2.133710117304851e-06, + "loss": 0.0721, + "step": 6513 + }, + { + "epoch": 2.1108230719377836, + "grad_norm": 0.5063864588737488, + "learning_rate": 2.13227702865469e-06, + "loss": 0.0658, + "step": 6514 + }, + { + "epoch": 2.111147116007777, + "grad_norm": 0.4682093858718872, + "learning_rate": 2.130844290986061e-06, + "loss": 0.0707, + "step": 6515 + }, + { + "epoch": 2.1114711600777705, + "grad_norm": 0.4421197474002838, + "learning_rate": 2.129411904474313e-06, + "loss": 0.0688, + "step": 6516 + }, + { + "epoch": 2.111795204147764, + "grad_norm": 0.5191793441772461, + "learning_rate": 2.1279798692947585e-06, + "loss": 0.0859, + "step": 6517 + }, + { + "epoch": 2.1121192482177578, + "grad_norm": 0.47814905643463135, + "learning_rate": 2.1265481856226646e-06, + "loss": 0.0669, + "step": 6518 + }, + { + "epoch": 2.112443292287751, + "grad_norm": 0.4759092926979065, + "learning_rate": 2.1251168536332556e-06, + "loss": 0.0739, + "step": 6519 + }, + { + "epoch": 2.1127673363577446, + "grad_norm": 0.4706220328807831, + "learning_rate": 2.123685873501713e-06, + "loss": 0.0708, + "step": 6520 + }, + { + "epoch": 2.113091380427738, + "grad_norm": 0.48442214727401733, + "learning_rate": 2.122255245403176e-06, + "loss": 0.073, + "step": 6521 + }, + { + "epoch": 2.113415424497732, + "grad_norm": 0.4349236488342285, + "learning_rate": 2.120824969512735e-06, + "loss": 0.0649, + "step": 6522 + }, + { + "epoch": 2.1137394685677253, + "grad_norm": 0.5289400815963745, + "learning_rate": 2.1193950460054486e-06, + "loss": 0.0795, + "step": 6523 + }, + { + "epoch": 2.1140635126377187, + "grad_norm": 0.45300814509391785, + "learning_rate": 2.11796547505632e-06, + "loss": 0.0729, + "step": 6524 + }, + { + "epoch": 2.114387556707712, + "grad_norm": 0.4735052287578583, + "learning_rate": 2.1165362568403175e-06, + "loss": 0.0719, + "step": 6525 + }, + { + "epoch": 2.1147116007777056, + "grad_norm": 0.4929650127887726, + "learning_rate": 2.115107391532363e-06, + "loss": 0.0702, + "step": 6526 + }, + { + "epoch": 2.1150356448476995, + "grad_norm": 0.44897279143333435, + "learning_rate": 2.113678879307336e-06, + "loss": 0.0648, + "step": 6527 + }, + { + "epoch": 2.115359688917693, + "grad_norm": 0.5100932121276855, + "learning_rate": 2.112250720340074e-06, + "loss": 0.0781, + "step": 6528 + }, + { + "epoch": 2.1156837329876863, + "grad_norm": 0.47918304800987244, + "learning_rate": 2.1108229148053645e-06, + "loss": 0.0726, + "step": 6529 + }, + { + "epoch": 2.1160077770576797, + "grad_norm": 0.5191142559051514, + "learning_rate": 2.1093954628779644e-06, + "loss": 0.0802, + "step": 6530 + }, + { + "epoch": 2.116331821127673, + "grad_norm": 0.4737345278263092, + "learning_rate": 2.1079683647325734e-06, + "loss": 0.0744, + "step": 6531 + }, + { + "epoch": 2.116655865197667, + "grad_norm": 0.48412197828292847, + "learning_rate": 2.1065416205438607e-06, + "loss": 0.0755, + "step": 6532 + }, + { + "epoch": 2.1169799092676604, + "grad_norm": 0.499695748090744, + "learning_rate": 2.105115230486441e-06, + "loss": 0.0771, + "step": 6533 + }, + { + "epoch": 2.117303953337654, + "grad_norm": 0.5066558718681335, + "learning_rate": 2.103689194734892e-06, + "loss": 0.0766, + "step": 6534 + }, + { + "epoch": 2.1176279974076473, + "grad_norm": 0.4975617527961731, + "learning_rate": 2.1022635134637474e-06, + "loss": 0.0736, + "step": 6535 + }, + { + "epoch": 2.117952041477641, + "grad_norm": 0.4812944233417511, + "learning_rate": 2.100838186847497e-06, + "loss": 0.0734, + "step": 6536 + }, + { + "epoch": 2.1182760855476346, + "grad_norm": 0.46450215578079224, + "learning_rate": 2.0994132150605878e-06, + "loss": 0.0708, + "step": 6537 + }, + { + "epoch": 2.118600129617628, + "grad_norm": 0.4873053729534149, + "learning_rate": 2.0979885982774177e-06, + "loss": 0.0753, + "step": 6538 + }, + { + "epoch": 2.1189241736876214, + "grad_norm": 0.46467098593711853, + "learning_rate": 2.0965643366723533e-06, + "loss": 0.071, + "step": 6539 + }, + { + "epoch": 2.119248217757615, + "grad_norm": 0.4787776470184326, + "learning_rate": 2.0951404304197044e-06, + "loss": 0.0726, + "step": 6540 + }, + { + "epoch": 2.1195722618276087, + "grad_norm": 0.46391019225120544, + "learning_rate": 2.0937168796937457e-06, + "loss": 0.0725, + "step": 6541 + }, + { + "epoch": 2.119896305897602, + "grad_norm": 0.4304513931274414, + "learning_rate": 2.0922936846687065e-06, + "loss": 0.0667, + "step": 6542 + }, + { + "epoch": 2.1202203499675956, + "grad_norm": 0.4738362729549408, + "learning_rate": 2.090870845518771e-06, + "loss": 0.0736, + "step": 6543 + }, + { + "epoch": 2.120544394037589, + "grad_norm": 0.42504286766052246, + "learning_rate": 2.0894483624180816e-06, + "loss": 0.0666, + "step": 6544 + }, + { + "epoch": 2.120868438107583, + "grad_norm": 0.4533616304397583, + "learning_rate": 2.0880262355407384e-06, + "loss": 0.0711, + "step": 6545 + }, + { + "epoch": 2.1211924821775763, + "grad_norm": 0.4556693732738495, + "learning_rate": 2.086604465060791e-06, + "loss": 0.0711, + "step": 6546 + }, + { + "epoch": 2.1215165262475697, + "grad_norm": 0.46734166145324707, + "learning_rate": 2.085183051152254e-06, + "loss": 0.0699, + "step": 6547 + }, + { + "epoch": 2.121840570317563, + "grad_norm": 0.4497268497943878, + "learning_rate": 2.083761993989094e-06, + "loss": 0.0722, + "step": 6548 + }, + { + "epoch": 2.1221646143875565, + "grad_norm": 0.4714108407497406, + "learning_rate": 2.0823412937452345e-06, + "loss": 0.0746, + "step": 6549 + }, + { + "epoch": 2.1224886584575504, + "grad_norm": 0.488873690366745, + "learning_rate": 2.080920950594556e-06, + "loss": 0.0767, + "step": 6550 + }, + { + "epoch": 2.122812702527544, + "grad_norm": 0.46497318148612976, + "learning_rate": 2.079500964710894e-06, + "loss": 0.0723, + "step": 6551 + }, + { + "epoch": 2.1231367465975373, + "grad_norm": 0.45600226521492004, + "learning_rate": 2.0780813362680424e-06, + "loss": 0.0738, + "step": 6552 + }, + { + "epoch": 2.1234607906675307, + "grad_norm": 0.4736258089542389, + "learning_rate": 2.0766620654397455e-06, + "loss": 0.0712, + "step": 6553 + }, + { + "epoch": 2.123784834737524, + "grad_norm": 0.4751882553100586, + "learning_rate": 2.0752431523997156e-06, + "loss": 0.0718, + "step": 6554 + }, + { + "epoch": 2.124108878807518, + "grad_norm": 0.49826884269714355, + "learning_rate": 2.073824597321609e-06, + "loss": 0.0714, + "step": 6555 + }, + { + "epoch": 2.1244329228775114, + "grad_norm": 0.47052130103111267, + "learning_rate": 2.0724064003790428e-06, + "loss": 0.073, + "step": 6556 + }, + { + "epoch": 2.124756966947505, + "grad_norm": 0.4417573809623718, + "learning_rate": 2.0709885617455923e-06, + "loss": 0.0693, + "step": 6557 + }, + { + "epoch": 2.1250810110174982, + "grad_norm": 0.48403653502464294, + "learning_rate": 2.0695710815947866e-06, + "loss": 0.0792, + "step": 6558 + }, + { + "epoch": 2.1254050550874917, + "grad_norm": 0.4700932502746582, + "learning_rate": 2.068153960100114e-06, + "loss": 0.0737, + "step": 6559 + }, + { + "epoch": 2.1257290991574855, + "grad_norm": 0.44414281845092773, + "learning_rate": 2.0667371974350105e-06, + "loss": 0.0691, + "step": 6560 + }, + { + "epoch": 2.126053143227479, + "grad_norm": 0.49838581681251526, + "learning_rate": 2.0653207937728814e-06, + "loss": 0.0757, + "step": 6561 + }, + { + "epoch": 2.1263771872974724, + "grad_norm": 0.46537691354751587, + "learning_rate": 2.0639047492870735e-06, + "loss": 0.0708, + "step": 6562 + }, + { + "epoch": 2.126701231367466, + "grad_norm": 0.49669939279556274, + "learning_rate": 2.0624890641509043e-06, + "loss": 0.0763, + "step": 6563 + }, + { + "epoch": 2.1270252754374597, + "grad_norm": 0.4871048331260681, + "learning_rate": 2.061073738537635e-06, + "loss": 0.076, + "step": 6564 + }, + { + "epoch": 2.127349319507453, + "grad_norm": 0.48748287558555603, + "learning_rate": 2.059658772620489e-06, + "loss": 0.0757, + "step": 6565 + }, + { + "epoch": 2.1276733635774465, + "grad_norm": 0.4756983518600464, + "learning_rate": 2.0582441665726438e-06, + "loss": 0.0722, + "step": 6566 + }, + { + "epoch": 2.12799740764744, + "grad_norm": 0.5200271606445312, + "learning_rate": 2.0568299205672347e-06, + "loss": 0.0762, + "step": 6567 + }, + { + "epoch": 2.1283214517174334, + "grad_norm": 0.44534698128700256, + "learning_rate": 2.0554160347773532e-06, + "loss": 0.0708, + "step": 6568 + }, + { + "epoch": 2.1286454957874272, + "grad_norm": 0.4336491525173187, + "learning_rate": 2.0540025093760414e-06, + "loss": 0.0685, + "step": 6569 + }, + { + "epoch": 2.1289695398574207, + "grad_norm": 0.4875744879245758, + "learning_rate": 2.0525893445363027e-06, + "loss": 0.0758, + "step": 6570 + }, + { + "epoch": 2.129293583927414, + "grad_norm": 0.48467862606048584, + "learning_rate": 2.0511765404310953e-06, + "loss": 0.0716, + "step": 6571 + }, + { + "epoch": 2.1296176279974075, + "grad_norm": 0.44703033566474915, + "learning_rate": 2.049764097233332e-06, + "loss": 0.0659, + "step": 6572 + }, + { + "epoch": 2.1299416720674014, + "grad_norm": 0.4997273087501526, + "learning_rate": 2.0483520151158837e-06, + "loss": 0.077, + "step": 6573 + }, + { + "epoch": 2.130265716137395, + "grad_norm": 0.4592891335487366, + "learning_rate": 2.0469402942515735e-06, + "loss": 0.071, + "step": 6574 + }, + { + "epoch": 2.130589760207388, + "grad_norm": 0.45686131715774536, + "learning_rate": 2.0455289348131845e-06, + "loss": 0.0689, + "step": 6575 + }, + { + "epoch": 2.1309138042773816, + "grad_norm": 0.4669840335845947, + "learning_rate": 2.0441179369734538e-06, + "loss": 0.0695, + "step": 6576 + }, + { + "epoch": 2.131237848347375, + "grad_norm": 0.4641000032424927, + "learning_rate": 2.04270730090507e-06, + "loss": 0.0698, + "step": 6577 + }, + { + "epoch": 2.131561892417369, + "grad_norm": 0.4740959107875824, + "learning_rate": 2.0412970267806847e-06, + "loss": 0.0711, + "step": 6578 + }, + { + "epoch": 2.1318859364873624, + "grad_norm": 0.45085427165031433, + "learning_rate": 2.0398871147729004e-06, + "loss": 0.0684, + "step": 6579 + }, + { + "epoch": 2.1322099805573558, + "grad_norm": 0.4833509922027588, + "learning_rate": 2.038477565054277e-06, + "loss": 0.0718, + "step": 6580 + }, + { + "epoch": 2.132534024627349, + "grad_norm": 0.5095663070678711, + "learning_rate": 2.03706837779733e-06, + "loss": 0.0796, + "step": 6581 + }, + { + "epoch": 2.1328580686973426, + "grad_norm": 0.4786771237850189, + "learning_rate": 2.0356595531745303e-06, + "loss": 0.0734, + "step": 6582 + }, + { + "epoch": 2.1331821127673365, + "grad_norm": 0.45221537351608276, + "learning_rate": 2.0342510913583062e-06, + "loss": 0.0714, + "step": 6583 + }, + { + "epoch": 2.13350615683733, + "grad_norm": 0.496338814496994, + "learning_rate": 2.032842992521034e-06, + "loss": 0.0784, + "step": 6584 + }, + { + "epoch": 2.1338302009073233, + "grad_norm": 0.49012452363967896, + "learning_rate": 2.0314352568350586e-06, + "loss": 0.077, + "step": 6585 + }, + { + "epoch": 2.1341542449773168, + "grad_norm": 0.4398423433303833, + "learning_rate": 2.0300278844726685e-06, + "loss": 0.0627, + "step": 6586 + }, + { + "epoch": 2.1344782890473106, + "grad_norm": 0.4631512463092804, + "learning_rate": 2.028620875606113e-06, + "loss": 0.0714, + "step": 6587 + }, + { + "epoch": 2.134802333117304, + "grad_norm": 0.5162291526794434, + "learning_rate": 2.027214230407598e-06, + "loss": 0.0788, + "step": 6588 + }, + { + "epoch": 2.1351263771872975, + "grad_norm": 0.46149012446403503, + "learning_rate": 2.025807949049282e-06, + "loss": 0.0742, + "step": 6589 + }, + { + "epoch": 2.135450421257291, + "grad_norm": 0.47114327549934387, + "learning_rate": 2.0244020317032825e-06, + "loss": 0.0714, + "step": 6590 + }, + { + "epoch": 2.1357744653272843, + "grad_norm": 0.5015454888343811, + "learning_rate": 2.022996478541665e-06, + "loss": 0.0831, + "step": 6591 + }, + { + "epoch": 2.136098509397278, + "grad_norm": 0.4543628990650177, + "learning_rate": 2.021591289736462e-06, + "loss": 0.0714, + "step": 6592 + }, + { + "epoch": 2.1364225534672716, + "grad_norm": 0.5102995038032532, + "learning_rate": 2.020186465459649e-06, + "loss": 0.0764, + "step": 6593 + }, + { + "epoch": 2.136746597537265, + "grad_norm": 0.47079968452453613, + "learning_rate": 2.0187820058831685e-06, + "loss": 0.0689, + "step": 6594 + }, + { + "epoch": 2.1370706416072585, + "grad_norm": 0.4647594392299652, + "learning_rate": 2.017377911178909e-06, + "loss": 0.0714, + "step": 6595 + }, + { + "epoch": 2.1373946856772523, + "grad_norm": 0.4710672199726105, + "learning_rate": 2.0159741815187184e-06, + "loss": 0.0731, + "step": 6596 + }, + { + "epoch": 2.1377187297472457, + "grad_norm": 0.46359890699386597, + "learning_rate": 2.014570817074401e-06, + "loss": 0.0722, + "step": 6597 + }, + { + "epoch": 2.138042773817239, + "grad_norm": 0.4689292311668396, + "learning_rate": 2.0131678180177144e-06, + "loss": 0.0697, + "step": 6598 + }, + { + "epoch": 2.1383668178872326, + "grad_norm": 0.48033982515335083, + "learning_rate": 2.0117651845203733e-06, + "loss": 0.0739, + "step": 6599 + }, + { + "epoch": 2.138690861957226, + "grad_norm": 0.4882848858833313, + "learning_rate": 2.010362916754044e-06, + "loss": 0.08, + "step": 6600 + }, + { + "epoch": 2.13901490602722, + "grad_norm": 0.4545619785785675, + "learning_rate": 2.0089610148903515e-06, + "loss": 0.0664, + "step": 6601 + }, + { + "epoch": 2.1393389500972133, + "grad_norm": 0.4811970591545105, + "learning_rate": 2.007559479100876e-06, + "loss": 0.0759, + "step": 6602 + }, + { + "epoch": 2.1396629941672067, + "grad_norm": 0.4705151915550232, + "learning_rate": 2.00615830955715e-06, + "loss": 0.0786, + "step": 6603 + }, + { + "epoch": 2.1399870382372, + "grad_norm": 0.47506824135780334, + "learning_rate": 2.004757506430665e-06, + "loss": 0.0738, + "step": 6604 + }, + { + "epoch": 2.1403110823071936, + "grad_norm": 0.47582632303237915, + "learning_rate": 2.0033570698928652e-06, + "loss": 0.0744, + "step": 6605 + }, + { + "epoch": 2.1406351263771874, + "grad_norm": 0.4712214171886444, + "learning_rate": 2.0019570001151494e-06, + "loss": 0.0687, + "step": 6606 + }, + { + "epoch": 2.140959170447181, + "grad_norm": 0.4881756603717804, + "learning_rate": 2.0005572972688757e-06, + "loss": 0.0762, + "step": 6607 + }, + { + "epoch": 2.1412832145171743, + "grad_norm": 0.4259354770183563, + "learning_rate": 1.9991579615253507e-06, + "loss": 0.0618, + "step": 6608 + }, + { + "epoch": 2.1416072585871677, + "grad_norm": 0.4770180881023407, + "learning_rate": 1.9977589930558406e-06, + "loss": 0.0758, + "step": 6609 + }, + { + "epoch": 2.141931302657161, + "grad_norm": 0.4740351140499115, + "learning_rate": 1.9963603920315655e-06, + "loss": 0.0765, + "step": 6610 + }, + { + "epoch": 2.142255346727155, + "grad_norm": 0.47260332107543945, + "learning_rate": 1.994962158623701e-06, + "loss": 0.0737, + "step": 6611 + }, + { + "epoch": 2.1425793907971484, + "grad_norm": 0.49404194951057434, + "learning_rate": 1.9935642930033786e-06, + "loss": 0.0716, + "step": 6612 + }, + { + "epoch": 2.142903434867142, + "grad_norm": 0.4638146162033081, + "learning_rate": 1.9921667953416796e-06, + "loss": 0.069, + "step": 6613 + }, + { + "epoch": 2.1432274789371353, + "grad_norm": 0.44174426794052124, + "learning_rate": 1.9907696658096496e-06, + "loss": 0.0665, + "step": 6614 + }, + { + "epoch": 2.143551523007129, + "grad_norm": 0.5062809586524963, + "learning_rate": 1.989372904578278e-06, + "loss": 0.0769, + "step": 6615 + }, + { + "epoch": 2.1438755670771226, + "grad_norm": 0.5006329417228699, + "learning_rate": 1.987976511818521e-06, + "loss": 0.0804, + "step": 6616 + }, + { + "epoch": 2.144199611147116, + "grad_norm": 0.4906056225299835, + "learning_rate": 1.986580487701276e-06, + "loss": 0.0757, + "step": 6617 + }, + { + "epoch": 2.1445236552171094, + "grad_norm": 0.4586811363697052, + "learning_rate": 1.9851848323974114e-06, + "loss": 0.0703, + "step": 6618 + }, + { + "epoch": 2.144847699287103, + "grad_norm": 0.48461246490478516, + "learning_rate": 1.9837895460777364e-06, + "loss": 0.0729, + "step": 6619 + }, + { + "epoch": 2.1451717433570967, + "grad_norm": 0.49435991048812866, + "learning_rate": 1.982394628913021e-06, + "loss": 0.0809, + "step": 6620 + }, + { + "epoch": 2.14549578742709, + "grad_norm": 0.4677877724170685, + "learning_rate": 1.981000081073992e-06, + "loss": 0.078, + "step": 6621 + }, + { + "epoch": 2.1458198314970836, + "grad_norm": 0.4829846918582916, + "learning_rate": 1.9796059027313237e-06, + "loss": 0.0666, + "step": 6622 + }, + { + "epoch": 2.146143875567077, + "grad_norm": 0.5388849973678589, + "learning_rate": 1.9782120940556573e-06, + "loss": 0.0784, + "step": 6623 + }, + { + "epoch": 2.146467919637071, + "grad_norm": 0.483267217874527, + "learning_rate": 1.9768186552175743e-06, + "loss": 0.0732, + "step": 6624 + }, + { + "epoch": 2.1467919637070643, + "grad_norm": 0.4950624108314514, + "learning_rate": 1.9754255863876222e-06, + "loss": 0.0756, + "step": 6625 + }, + { + "epoch": 2.1471160077770577, + "grad_norm": 0.4850957989692688, + "learning_rate": 1.974032887736298e-06, + "loss": 0.074, + "step": 6626 + }, + { + "epoch": 2.147440051847051, + "grad_norm": 0.4783896803855896, + "learning_rate": 1.9726405594340547e-06, + "loss": 0.0726, + "step": 6627 + }, + { + "epoch": 2.1477640959170445, + "grad_norm": 0.4619564712047577, + "learning_rate": 1.9712486016513e-06, + "loss": 0.071, + "step": 6628 + }, + { + "epoch": 2.1480881399870384, + "grad_norm": 0.49050572514533997, + "learning_rate": 1.9698570145583956e-06, + "loss": 0.0733, + "step": 6629 + }, + { + "epoch": 2.148412184057032, + "grad_norm": 0.4600142538547516, + "learning_rate": 1.96846579832566e-06, + "loss": 0.0691, + "step": 6630 + }, + { + "epoch": 2.1487362281270252, + "grad_norm": 0.5022426247596741, + "learning_rate": 1.9670749531233617e-06, + "loss": 0.0742, + "step": 6631 + }, + { + "epoch": 2.1490602721970187, + "grad_norm": 0.45566675066947937, + "learning_rate": 1.965684479121728e-06, + "loss": 0.07, + "step": 6632 + }, + { + "epoch": 2.149384316267012, + "grad_norm": 0.46745166182518005, + "learning_rate": 1.9642943764909406e-06, + "loss": 0.0692, + "step": 6633 + }, + { + "epoch": 2.149708360337006, + "grad_norm": 0.46203088760375977, + "learning_rate": 1.9629046454011325e-06, + "loss": 0.0724, + "step": 6634 + }, + { + "epoch": 2.1500324044069994, + "grad_norm": 0.46705809235572815, + "learning_rate": 1.961515286022395e-06, + "loss": 0.0753, + "step": 6635 + }, + { + "epoch": 2.150356448476993, + "grad_norm": 0.46384093165397644, + "learning_rate": 1.9601262985247733e-06, + "loss": 0.0639, + "step": 6636 + }, + { + "epoch": 2.1506804925469862, + "grad_norm": 0.4611685276031494, + "learning_rate": 1.9587376830782608e-06, + "loss": 0.0672, + "step": 6637 + }, + { + "epoch": 2.15100453661698, + "grad_norm": 0.4918757975101471, + "learning_rate": 1.9573494398528175e-06, + "loss": 0.0716, + "step": 6638 + }, + { + "epoch": 2.1513285806869735, + "grad_norm": 0.4503691792488098, + "learning_rate": 1.9559615690183444e-06, + "loss": 0.0698, + "step": 6639 + }, + { + "epoch": 2.151652624756967, + "grad_norm": 0.45643165707588196, + "learning_rate": 1.95457407074471e-06, + "loss": 0.0683, + "step": 6640 + }, + { + "epoch": 2.1519766688269604, + "grad_norm": 0.4970737099647522, + "learning_rate": 1.953186945201726e-06, + "loss": 0.0753, + "step": 6641 + }, + { + "epoch": 2.152300712896954, + "grad_norm": 0.48359444737434387, + "learning_rate": 1.951800192559164e-06, + "loss": 0.0747, + "step": 6642 + }, + { + "epoch": 2.1526247569669477, + "grad_norm": 0.49781110882759094, + "learning_rate": 1.9504138129867516e-06, + "loss": 0.0765, + "step": 6643 + }, + { + "epoch": 2.152948801036941, + "grad_norm": 0.4743006229400635, + "learning_rate": 1.9490278066541624e-06, + "loss": 0.0764, + "step": 6644 + }, + { + "epoch": 2.1532728451069345, + "grad_norm": 0.4961163401603699, + "learning_rate": 1.9476421737310375e-06, + "loss": 0.0782, + "step": 6645 + }, + { + "epoch": 2.153596889176928, + "grad_norm": 0.5054019093513489, + "learning_rate": 1.946256914386958e-06, + "loss": 0.0724, + "step": 6646 + }, + { + "epoch": 2.153920933246922, + "grad_norm": 0.5000643134117126, + "learning_rate": 1.9448720287914735e-06, + "loss": 0.0776, + "step": 6647 + }, + { + "epoch": 2.154244977316915, + "grad_norm": 0.47631070017814636, + "learning_rate": 1.943487517114075e-06, + "loss": 0.071, + "step": 6648 + }, + { + "epoch": 2.1545690213869086, + "grad_norm": 0.5166507959365845, + "learning_rate": 1.9421033795242144e-06, + "loss": 0.0804, + "step": 6649 + }, + { + "epoch": 2.154893065456902, + "grad_norm": 0.46033474802970886, + "learning_rate": 1.9407196161912976e-06, + "loss": 0.069, + "step": 6650 + }, + { + "epoch": 2.1552171095268955, + "grad_norm": 0.45443040132522583, + "learning_rate": 1.9393362272846844e-06, + "loss": 0.0714, + "step": 6651 + }, + { + "epoch": 2.1555411535968894, + "grad_norm": 0.45767298340797424, + "learning_rate": 1.937953212973687e-06, + "loss": 0.074, + "step": 6652 + }, + { + "epoch": 2.155865197666883, + "grad_norm": 0.5098868608474731, + "learning_rate": 1.936570573427573e-06, + "loss": 0.073, + "step": 6653 + }, + { + "epoch": 2.156189241736876, + "grad_norm": 0.48986905813217163, + "learning_rate": 1.9351883088155666e-06, + "loss": 0.0773, + "step": 6654 + }, + { + "epoch": 2.1565132858068696, + "grad_norm": 0.44977831840515137, + "learning_rate": 1.93380641930684e-06, + "loss": 0.0679, + "step": 6655 + }, + { + "epoch": 2.156837329876863, + "grad_norm": 0.45664796233177185, + "learning_rate": 1.932424905070524e-06, + "loss": 0.0713, + "step": 6656 + }, + { + "epoch": 2.157161373946857, + "grad_norm": 0.49763405323028564, + "learning_rate": 1.9310437662757037e-06, + "loss": 0.075, + "step": 6657 + }, + { + "epoch": 2.1574854180168503, + "grad_norm": 0.49584758281707764, + "learning_rate": 1.9296630030914165e-06, + "loss": 0.0805, + "step": 6658 + }, + { + "epoch": 2.1578094620868438, + "grad_norm": 0.4893285930156708, + "learning_rate": 1.928282615686655e-06, + "loss": 0.0717, + "step": 6659 + }, + { + "epoch": 2.158133506156837, + "grad_norm": 0.4704589545726776, + "learning_rate": 1.926902604230364e-06, + "loss": 0.0683, + "step": 6660 + }, + { + "epoch": 2.158457550226831, + "grad_norm": 0.45831263065338135, + "learning_rate": 1.9255229688914445e-06, + "loss": 0.0667, + "step": 6661 + }, + { + "epoch": 2.1587815942968245, + "grad_norm": 0.47325196862220764, + "learning_rate": 1.9241437098387528e-06, + "loss": 0.0739, + "step": 6662 + }, + { + "epoch": 2.159105638366818, + "grad_norm": 0.4823969900608063, + "learning_rate": 1.922764827241092e-06, + "loss": 0.0716, + "step": 6663 + }, + { + "epoch": 2.1594296824368113, + "grad_norm": 0.4870539605617523, + "learning_rate": 1.921386321267227e-06, + "loss": 0.0765, + "step": 6664 + }, + { + "epoch": 2.1597537265068047, + "grad_norm": 0.47531864047050476, + "learning_rate": 1.920008192085872e-06, + "loss": 0.0743, + "step": 6665 + }, + { + "epoch": 2.1600777705767986, + "grad_norm": 0.46033617854118347, + "learning_rate": 1.9186304398656987e-06, + "loss": 0.0643, + "step": 6666 + }, + { + "epoch": 2.160401814646792, + "grad_norm": 0.47263818979263306, + "learning_rate": 1.9172530647753306e-06, + "loss": 0.0752, + "step": 6667 + }, + { + "epoch": 2.1607258587167855, + "grad_norm": 0.5137720108032227, + "learning_rate": 1.915876066983341e-06, + "loss": 0.0811, + "step": 6668 + }, + { + "epoch": 2.161049902786779, + "grad_norm": 0.4592183232307434, + "learning_rate": 1.9144994466582674e-06, + "loss": 0.069, + "step": 6669 + }, + { + "epoch": 2.1613739468567728, + "grad_norm": 0.5136011838912964, + "learning_rate": 1.913123203968588e-06, + "loss": 0.0833, + "step": 6670 + }, + { + "epoch": 2.161697990926766, + "grad_norm": 0.4686625897884369, + "learning_rate": 1.911747339082749e-06, + "loss": 0.0744, + "step": 6671 + }, + { + "epoch": 2.1620220349967596, + "grad_norm": 0.4724651575088501, + "learning_rate": 1.910371852169137e-06, + "loss": 0.0761, + "step": 6672 + }, + { + "epoch": 2.162346079066753, + "grad_norm": 0.4784649610519409, + "learning_rate": 1.908996743396101e-06, + "loss": 0.0701, + "step": 6673 + }, + { + "epoch": 2.1626701231367464, + "grad_norm": 0.476528525352478, + "learning_rate": 1.90762201293194e-06, + "loss": 0.0733, + "step": 6674 + }, + { + "epoch": 2.1629941672067403, + "grad_norm": 0.4567173719406128, + "learning_rate": 1.9062476609449075e-06, + "loss": 0.07, + "step": 6675 + }, + { + "epoch": 2.1633182112767337, + "grad_norm": 0.466298371553421, + "learning_rate": 1.9048736876032142e-06, + "loss": 0.0698, + "step": 6676 + }, + { + "epoch": 2.163642255346727, + "grad_norm": 0.48197922110557556, + "learning_rate": 1.9035000930750142e-06, + "loss": 0.075, + "step": 6677 + }, + { + "epoch": 2.1639662994167206, + "grad_norm": 0.460759699344635, + "learning_rate": 1.9021268775284301e-06, + "loss": 0.0694, + "step": 6678 + }, + { + "epoch": 2.164290343486714, + "grad_norm": 0.488559752702713, + "learning_rate": 1.900754041131525e-06, + "loss": 0.0747, + "step": 6679 + }, + { + "epoch": 2.164614387556708, + "grad_norm": 0.48355886340141296, + "learning_rate": 1.8993815840523217e-06, + "loss": 0.0712, + "step": 6680 + }, + { + "epoch": 2.1649384316267013, + "grad_norm": 0.49874863028526306, + "learning_rate": 1.8980095064587967e-06, + "loss": 0.0736, + "step": 6681 + }, + { + "epoch": 2.1652624756966947, + "grad_norm": 0.4778255522251129, + "learning_rate": 1.896637808518878e-06, + "loss": 0.0744, + "step": 6682 + }, + { + "epoch": 2.165586519766688, + "grad_norm": 0.4567441940307617, + "learning_rate": 1.895266490400449e-06, + "loss": 0.0712, + "step": 6683 + }, + { + "epoch": 2.1659105638366816, + "grad_norm": 0.46725553274154663, + "learning_rate": 1.8938955522713455e-06, + "loss": 0.0741, + "step": 6684 + }, + { + "epoch": 2.1662346079066754, + "grad_norm": 0.48114606738090515, + "learning_rate": 1.8925249942993585e-06, + "loss": 0.0707, + "step": 6685 + }, + { + "epoch": 2.166558651976669, + "grad_norm": 0.5054229497909546, + "learning_rate": 1.8911548166522276e-06, + "loss": 0.0809, + "step": 6686 + }, + { + "epoch": 2.1668826960466623, + "grad_norm": 0.4752826690673828, + "learning_rate": 1.8897850194976514e-06, + "loss": 0.0684, + "step": 6687 + }, + { + "epoch": 2.1672067401166557, + "grad_norm": 0.4631727635860443, + "learning_rate": 1.8884156030032797e-06, + "loss": 0.0724, + "step": 6688 + }, + { + "epoch": 2.1675307841866496, + "grad_norm": 0.4974950850009918, + "learning_rate": 1.8870465673367154e-06, + "loss": 0.0802, + "step": 6689 + }, + { + "epoch": 2.167854828256643, + "grad_norm": 0.5214879512786865, + "learning_rate": 1.885677912665516e-06, + "loss": 0.0822, + "step": 6690 + }, + { + "epoch": 2.1681788723266364, + "grad_norm": 0.5338072180747986, + "learning_rate": 1.8843096391571924e-06, + "loss": 0.0792, + "step": 6691 + }, + { + "epoch": 2.16850291639663, + "grad_norm": 0.4919593036174774, + "learning_rate": 1.8829417469792038e-06, + "loss": 0.076, + "step": 6692 + }, + { + "epoch": 2.1688269604666233, + "grad_norm": 0.5080357789993286, + "learning_rate": 1.881574236298973e-06, + "loss": 0.0753, + "step": 6693 + }, + { + "epoch": 2.169151004536617, + "grad_norm": 0.492123007774353, + "learning_rate": 1.8802071072838652e-06, + "loss": 0.0801, + "step": 6694 + }, + { + "epoch": 2.1694750486066106, + "grad_norm": 0.4710664451122284, + "learning_rate": 1.8788403601012056e-06, + "loss": 0.0704, + "step": 6695 + }, + { + "epoch": 2.169799092676604, + "grad_norm": 0.4927854835987091, + "learning_rate": 1.8774739949182707e-06, + "loss": 0.0742, + "step": 6696 + }, + { + "epoch": 2.1701231367465974, + "grad_norm": 0.49946191906929016, + "learning_rate": 1.8761080119022907e-06, + "loss": 0.0736, + "step": 6697 + }, + { + "epoch": 2.1704471808165913, + "grad_norm": 0.48680731654167175, + "learning_rate": 1.8747424112204499e-06, + "loss": 0.0765, + "step": 6698 + }, + { + "epoch": 2.1707712248865847, + "grad_norm": 0.5381428599357605, + "learning_rate": 1.8733771930398797e-06, + "loss": 0.0859, + "step": 6699 + }, + { + "epoch": 2.171095268956578, + "grad_norm": 0.4549349248409271, + "learning_rate": 1.8720123575276766e-06, + "loss": 0.0707, + "step": 6700 + }, + { + "epoch": 2.1714193130265715, + "grad_norm": 0.45881056785583496, + "learning_rate": 1.8706479048508764e-06, + "loss": 0.0723, + "step": 6701 + }, + { + "epoch": 2.171743357096565, + "grad_norm": 0.4502636194229126, + "learning_rate": 1.8692838351764814e-06, + "loss": 0.0691, + "step": 6702 + }, + { + "epoch": 2.172067401166559, + "grad_norm": 0.47965043783187866, + "learning_rate": 1.8679201486714354e-06, + "loss": 0.0708, + "step": 6703 + }, + { + "epoch": 2.1723914452365523, + "grad_norm": 0.46118342876434326, + "learning_rate": 1.8665568455026424e-06, + "loss": 0.0713, + "step": 6704 + }, + { + "epoch": 2.1727154893065457, + "grad_norm": 0.4682093560695648, + "learning_rate": 1.8651939258369577e-06, + "loss": 0.071, + "step": 6705 + }, + { + "epoch": 2.173039533376539, + "grad_norm": 0.49599429965019226, + "learning_rate": 1.8638313898411885e-06, + "loss": 0.0754, + "step": 6706 + }, + { + "epoch": 2.1733635774465325, + "grad_norm": 0.4504205584526062, + "learning_rate": 1.8624692376820992e-06, + "loss": 0.0699, + "step": 6707 + }, + { + "epoch": 2.1736876215165264, + "grad_norm": 0.4703887701034546, + "learning_rate": 1.861107469526398e-06, + "loss": 0.0705, + "step": 6708 + }, + { + "epoch": 2.17401166558652, + "grad_norm": 0.48415201902389526, + "learning_rate": 1.859746085540759e-06, + "loss": 0.0743, + "step": 6709 + }, + { + "epoch": 2.1743357096565132, + "grad_norm": 0.4921909272670746, + "learning_rate": 1.8583850858917974e-06, + "loss": 0.0757, + "step": 6710 + }, + { + "epoch": 2.1746597537265067, + "grad_norm": 0.4651033282279968, + "learning_rate": 1.8570244707460878e-06, + "loss": 0.0697, + "step": 6711 + }, + { + "epoch": 2.1749837977965005, + "grad_norm": 0.48125144839286804, + "learning_rate": 1.8556642402701569e-06, + "loss": 0.0727, + "step": 6712 + }, + { + "epoch": 2.175307841866494, + "grad_norm": 0.4617149531841278, + "learning_rate": 1.8543043946304835e-06, + "loss": 0.0711, + "step": 6713 + }, + { + "epoch": 2.1756318859364874, + "grad_norm": 0.5271691083908081, + "learning_rate": 1.8529449339934997e-06, + "loss": 0.0783, + "step": 6714 + }, + { + "epoch": 2.175955930006481, + "grad_norm": 0.5199576616287231, + "learning_rate": 1.8515858585255913e-06, + "loss": 0.0839, + "step": 6715 + }, + { + "epoch": 2.176279974076474, + "grad_norm": 0.5006012916564941, + "learning_rate": 1.8502271683930933e-06, + "loss": 0.0768, + "step": 6716 + }, + { + "epoch": 2.176604018146468, + "grad_norm": 0.47408223152160645, + "learning_rate": 1.8488688637622981e-06, + "loss": 0.0722, + "step": 6717 + }, + { + "epoch": 2.1769280622164615, + "grad_norm": 0.44292151927948, + "learning_rate": 1.8475109447994483e-06, + "loss": 0.0697, + "step": 6718 + }, + { + "epoch": 2.177252106286455, + "grad_norm": 0.4794071316719055, + "learning_rate": 1.8461534116707403e-06, + "loss": 0.074, + "step": 6719 + }, + { + "epoch": 2.1775761503564484, + "grad_norm": 0.4785098433494568, + "learning_rate": 1.8447962645423233e-06, + "loss": 0.0749, + "step": 6720 + }, + { + "epoch": 2.1779001944264422, + "grad_norm": 0.49060362577438354, + "learning_rate": 1.8434395035802987e-06, + "loss": 0.0701, + "step": 6721 + }, + { + "epoch": 2.1782242384964356, + "grad_norm": 0.4688788056373596, + "learning_rate": 1.842083128950723e-06, + "loss": 0.0697, + "step": 6722 + }, + { + "epoch": 2.178548282566429, + "grad_norm": 0.45464015007019043, + "learning_rate": 1.8407271408195975e-06, + "loss": 0.0686, + "step": 6723 + }, + { + "epoch": 2.1788723266364225, + "grad_norm": 0.4882039725780487, + "learning_rate": 1.8393715393528893e-06, + "loss": 0.0739, + "step": 6724 + }, + { + "epoch": 2.179196370706416, + "grad_norm": 0.4844962954521179, + "learning_rate": 1.8380163247165062e-06, + "loss": 0.0765, + "step": 6725 + }, + { + "epoch": 2.17952041477641, + "grad_norm": 0.46952706575393677, + "learning_rate": 1.8366614970763142e-06, + "loss": 0.0717, + "step": 6726 + }, + { + "epoch": 2.179844458846403, + "grad_norm": 0.5064206123352051, + "learning_rate": 1.8353070565981313e-06, + "loss": 0.0794, + "step": 6727 + }, + { + "epoch": 2.1801685029163966, + "grad_norm": 0.5096865296363831, + "learning_rate": 1.8339530034477283e-06, + "loss": 0.0761, + "step": 6728 + }, + { + "epoch": 2.18049254698639, + "grad_norm": 0.46830105781555176, + "learning_rate": 1.8325993377908296e-06, + "loss": 0.0728, + "step": 6729 + }, + { + "epoch": 2.1808165910563835, + "grad_norm": 0.4919731318950653, + "learning_rate": 1.8312460597931058e-06, + "loss": 0.0742, + "step": 6730 + }, + { + "epoch": 2.1811406351263773, + "grad_norm": 0.4882954955101013, + "learning_rate": 1.8298931696201915e-06, + "loss": 0.0746, + "step": 6731 + }, + { + "epoch": 2.1814646791963708, + "grad_norm": 0.4879685640335083, + "learning_rate": 1.828540667437661e-06, + "loss": 0.0745, + "step": 6732 + }, + { + "epoch": 2.181788723266364, + "grad_norm": 0.457216739654541, + "learning_rate": 1.8271885534110544e-06, + "loss": 0.0714, + "step": 6733 + }, + { + "epoch": 2.1821127673363576, + "grad_norm": 0.46054744720458984, + "learning_rate": 1.8258368277058519e-06, + "loss": 0.0719, + "step": 6734 + }, + { + "epoch": 2.182436811406351, + "grad_norm": 0.49528566002845764, + "learning_rate": 1.824485490487493e-06, + "loss": 0.0709, + "step": 6735 + }, + { + "epoch": 2.182760855476345, + "grad_norm": 0.5130648612976074, + "learning_rate": 1.8231345419213692e-06, + "loss": 0.0779, + "step": 6736 + }, + { + "epoch": 2.1830848995463383, + "grad_norm": 0.5079078674316406, + "learning_rate": 1.8217839821728222e-06, + "loss": 0.0798, + "step": 6737 + }, + { + "epoch": 2.1834089436163318, + "grad_norm": 0.5093557834625244, + "learning_rate": 1.8204338114071506e-06, + "loss": 0.0769, + "step": 6738 + }, + { + "epoch": 2.183732987686325, + "grad_norm": 0.46685534715652466, + "learning_rate": 1.8190840297895968e-06, + "loss": 0.0693, + "step": 6739 + }, + { + "epoch": 2.184057031756319, + "grad_norm": 0.44228246808052063, + "learning_rate": 1.8177346374853672e-06, + "loss": 0.0691, + "step": 6740 + }, + { + "epoch": 2.1843810758263125, + "grad_norm": 0.4679490327835083, + "learning_rate": 1.8163856346596092e-06, + "loss": 0.0696, + "step": 6741 + }, + { + "epoch": 2.184705119896306, + "grad_norm": 0.5041935443878174, + "learning_rate": 1.8150370214774298e-06, + "loss": 0.0772, + "step": 6742 + }, + { + "epoch": 2.1850291639662993, + "grad_norm": 0.478503555059433, + "learning_rate": 1.8136887981038864e-06, + "loss": 0.0755, + "step": 6743 + }, + { + "epoch": 2.1853532080362927, + "grad_norm": 0.46148279309272766, + "learning_rate": 1.8123409647039885e-06, + "loss": 0.0753, + "step": 6744 + }, + { + "epoch": 2.1856772521062866, + "grad_norm": 0.5056387782096863, + "learning_rate": 1.8109935214426971e-06, + "loss": 0.0778, + "step": 6745 + }, + { + "epoch": 2.18600129617628, + "grad_norm": 0.44552767276763916, + "learning_rate": 1.8096464684849285e-06, + "loss": 0.0715, + "step": 6746 + }, + { + "epoch": 2.1863253402462735, + "grad_norm": 0.46720853447914124, + "learning_rate": 1.808299805995546e-06, + "loss": 0.0753, + "step": 6747 + }, + { + "epoch": 2.186649384316267, + "grad_norm": 0.46596813201904297, + "learning_rate": 1.8069535341393685e-06, + "loss": 0.0778, + "step": 6748 + }, + { + "epoch": 2.1869734283862607, + "grad_norm": 0.4803813695907593, + "learning_rate": 1.8056076530811672e-06, + "loss": 0.0788, + "step": 6749 + }, + { + "epoch": 2.187297472456254, + "grad_norm": 0.4746699333190918, + "learning_rate": 1.8042621629856656e-06, + "loss": 0.0732, + "step": 6750 + }, + { + "epoch": 2.1876215165262476, + "grad_norm": 0.4526144564151764, + "learning_rate": 1.802917064017538e-06, + "loss": 0.0706, + "step": 6751 + }, + { + "epoch": 2.187945560596241, + "grad_norm": 0.4581446051597595, + "learning_rate": 1.8015723563414112e-06, + "loss": 0.0718, + "step": 6752 + }, + { + "epoch": 2.1882696046662344, + "grad_norm": 0.4723081886768341, + "learning_rate": 1.8002280401218669e-06, + "loss": 0.0736, + "step": 6753 + }, + { + "epoch": 2.1885936487362283, + "grad_norm": 0.5154812932014465, + "learning_rate": 1.798884115523431e-06, + "loss": 0.0787, + "step": 6754 + }, + { + "epoch": 2.1889176928062217, + "grad_norm": 0.49709799885749817, + "learning_rate": 1.7975405827105929e-06, + "loss": 0.079, + "step": 6755 + }, + { + "epoch": 2.189241736876215, + "grad_norm": 0.46235930919647217, + "learning_rate": 1.7961974418477845e-06, + "loss": 0.0696, + "step": 6756 + }, + { + "epoch": 2.1895657809462086, + "grad_norm": 0.5224365592002869, + "learning_rate": 1.7948546930993932e-06, + "loss": 0.0736, + "step": 6757 + }, + { + "epoch": 2.189889825016202, + "grad_norm": 0.4718781113624573, + "learning_rate": 1.7935123366297596e-06, + "loss": 0.0694, + "step": 6758 + }, + { + "epoch": 2.190213869086196, + "grad_norm": 0.4754774570465088, + "learning_rate": 1.7921703726031748e-06, + "loss": 0.0672, + "step": 6759 + }, + { + "epoch": 2.1905379131561893, + "grad_norm": 0.47679826617240906, + "learning_rate": 1.7908288011838843e-06, + "loss": 0.0711, + "step": 6760 + }, + { + "epoch": 2.1908619572261827, + "grad_norm": 0.5035863518714905, + "learning_rate": 1.7894876225360774e-06, + "loss": 0.0759, + "step": 6761 + }, + { + "epoch": 2.191186001296176, + "grad_norm": 0.4582345485687256, + "learning_rate": 1.7881468368239085e-06, + "loss": 0.0689, + "step": 6762 + }, + { + "epoch": 2.19151004536617, + "grad_norm": 0.44637078046798706, + "learning_rate": 1.7868064442114707e-06, + "loss": 0.0673, + "step": 6763 + }, + { + "epoch": 2.1918340894361634, + "grad_norm": 0.47751113772392273, + "learning_rate": 1.7854664448628211e-06, + "loss": 0.0731, + "step": 6764 + }, + { + "epoch": 2.192158133506157, + "grad_norm": 0.4775833785533905, + "learning_rate": 1.784126838941958e-06, + "loss": 0.0722, + "step": 6765 + }, + { + "epoch": 2.1924821775761503, + "grad_norm": 0.49275335669517517, + "learning_rate": 1.782787626612838e-06, + "loss": 0.0722, + "step": 6766 + }, + { + "epoch": 2.1928062216461437, + "grad_norm": 0.44845327734947205, + "learning_rate": 1.7814488080393672e-06, + "loss": 0.0682, + "step": 6767 + }, + { + "epoch": 2.1931302657161376, + "grad_norm": 0.4543963372707367, + "learning_rate": 1.7801103833854044e-06, + "loss": 0.0692, + "step": 6768 + }, + { + "epoch": 2.193454309786131, + "grad_norm": 0.5207314491271973, + "learning_rate": 1.7787723528147615e-06, + "loss": 0.0782, + "step": 6769 + }, + { + "epoch": 2.1937783538561244, + "grad_norm": 0.44605255126953125, + "learning_rate": 1.777434716491197e-06, + "loss": 0.0648, + "step": 6770 + }, + { + "epoch": 2.194102397926118, + "grad_norm": 0.4934347867965698, + "learning_rate": 1.776097474578426e-06, + "loss": 0.0738, + "step": 6771 + }, + { + "epoch": 2.1944264419961117, + "grad_norm": 0.48282739520072937, + "learning_rate": 1.7747606272401147e-06, + "loss": 0.0747, + "step": 6772 + }, + { + "epoch": 2.194750486066105, + "grad_norm": 0.4789862632751465, + "learning_rate": 1.77342417463988e-06, + "loss": 0.0734, + "step": 6773 + }, + { + "epoch": 2.1950745301360985, + "grad_norm": 0.46827223896980286, + "learning_rate": 1.7720881169412902e-06, + "loss": 0.0698, + "step": 6774 + }, + { + "epoch": 2.195398574206092, + "grad_norm": 0.505933940410614, + "learning_rate": 1.7707524543078664e-06, + "loss": 0.0713, + "step": 6775 + }, + { + "epoch": 2.1957226182760854, + "grad_norm": 0.4550766944885254, + "learning_rate": 1.7694171869030807e-06, + "loss": 0.0654, + "step": 6776 + }, + { + "epoch": 2.1960466623460793, + "grad_norm": 0.48559170961380005, + "learning_rate": 1.7680823148903585e-06, + "loss": 0.0709, + "step": 6777 + }, + { + "epoch": 2.1963707064160727, + "grad_norm": 0.4962014853954315, + "learning_rate": 1.7667478384330704e-06, + "loss": 0.0708, + "step": 6778 + }, + { + "epoch": 2.196694750486066, + "grad_norm": 0.42602312564849854, + "learning_rate": 1.7654137576945502e-06, + "loss": 0.0626, + "step": 6779 + }, + { + "epoch": 2.1970187945560595, + "grad_norm": 0.47422635555267334, + "learning_rate": 1.7640800728380702e-06, + "loss": 0.0725, + "step": 6780 + }, + { + "epoch": 2.197342838626053, + "grad_norm": 0.4561498165130615, + "learning_rate": 1.7627467840268642e-06, + "loss": 0.0708, + "step": 6781 + }, + { + "epoch": 2.197666882696047, + "grad_norm": 0.495524138212204, + "learning_rate": 1.7614138914241141e-06, + "loss": 0.0757, + "step": 6782 + }, + { + "epoch": 2.1979909267660402, + "grad_norm": 0.5002186894416809, + "learning_rate": 1.760081395192948e-06, + "loss": 0.0776, + "step": 6783 + }, + { + "epoch": 2.1983149708360337, + "grad_norm": 0.48374322056770325, + "learning_rate": 1.758749295496458e-06, + "loss": 0.0756, + "step": 6784 + }, + { + "epoch": 2.198639014906027, + "grad_norm": 0.4546158015727997, + "learning_rate": 1.7574175924976733e-06, + "loss": 0.0692, + "step": 6785 + }, + { + "epoch": 2.1989630589760205, + "grad_norm": 0.544314444065094, + "learning_rate": 1.7560862863595873e-06, + "loss": 0.0828, + "step": 6786 + }, + { + "epoch": 2.1992871030460144, + "grad_norm": 0.4958459734916687, + "learning_rate": 1.7547553772451336e-06, + "loss": 0.0755, + "step": 6787 + }, + { + "epoch": 2.199611147116008, + "grad_norm": 0.4743732213973999, + "learning_rate": 1.7534248653172087e-06, + "loss": 0.0719, + "step": 6788 + }, + { + "epoch": 2.1999351911860012, + "grad_norm": 0.5188460350036621, + "learning_rate": 1.7520947507386487e-06, + "loss": 0.0794, + "step": 6789 + }, + { + "epoch": 2.2002592352559946, + "grad_norm": 0.47565385699272156, + "learning_rate": 1.7507650336722497e-06, + "loss": 0.0707, + "step": 6790 + }, + { + "epoch": 2.2005832793259885, + "grad_norm": 0.48570117354393005, + "learning_rate": 1.7494357142807572e-06, + "loss": 0.0739, + "step": 6791 + }, + { + "epoch": 2.200907323395982, + "grad_norm": 0.4817291796207428, + "learning_rate": 1.748106792726862e-06, + "loss": 0.072, + "step": 6792 + }, + { + "epoch": 2.2012313674659754, + "grad_norm": 0.4850255250930786, + "learning_rate": 1.7467782691732176e-06, + "loss": 0.0717, + "step": 6793 + }, + { + "epoch": 2.201555411535969, + "grad_norm": 0.4843173623085022, + "learning_rate": 1.7454501437824178e-06, + "loss": 0.0724, + "step": 6794 + }, + { + "epoch": 2.201879455605962, + "grad_norm": 0.5217100977897644, + "learning_rate": 1.744122416717014e-06, + "loss": 0.0758, + "step": 6795 + }, + { + "epoch": 2.202203499675956, + "grad_norm": 0.4569748044013977, + "learning_rate": 1.7427950881395072e-06, + "loss": 0.07, + "step": 6796 + }, + { + "epoch": 2.2025275437459495, + "grad_norm": 0.48870840668678284, + "learning_rate": 1.7414681582123493e-06, + "loss": 0.0742, + "step": 6797 + }, + { + "epoch": 2.202851587815943, + "grad_norm": 0.5258080363273621, + "learning_rate": 1.7401416270979443e-06, + "loss": 0.0809, + "step": 6798 + }, + { + "epoch": 2.2031756318859363, + "grad_norm": 0.471910685300827, + "learning_rate": 1.7388154949586455e-06, + "loss": 0.0717, + "step": 6799 + }, + { + "epoch": 2.20349967595593, + "grad_norm": 0.48028430342674255, + "learning_rate": 1.7374897619567598e-06, + "loss": 0.0779, + "step": 6800 + }, + { + "epoch": 2.2038237200259236, + "grad_norm": 0.4923180043697357, + "learning_rate": 1.7361644282545454e-06, + "loss": 0.0774, + "step": 6801 + }, + { + "epoch": 2.204147764095917, + "grad_norm": 0.46793532371520996, + "learning_rate": 1.7348394940142067e-06, + "loss": 0.0687, + "step": 6802 + }, + { + "epoch": 2.2044718081659105, + "grad_norm": 0.5037127733230591, + "learning_rate": 1.7335149593979051e-06, + "loss": 0.0789, + "step": 6803 + }, + { + "epoch": 2.204795852235904, + "grad_norm": 0.4806075990200043, + "learning_rate": 1.73219082456775e-06, + "loss": 0.0726, + "step": 6804 + }, + { + "epoch": 2.2051198963058978, + "grad_norm": 0.49096283316612244, + "learning_rate": 1.7308670896858032e-06, + "loss": 0.0717, + "step": 6805 + }, + { + "epoch": 2.205443940375891, + "grad_norm": 0.44644638895988464, + "learning_rate": 1.729543754914077e-06, + "loss": 0.0691, + "step": 6806 + }, + { + "epoch": 2.2057679844458846, + "grad_norm": 0.4400399923324585, + "learning_rate": 1.7282208204145351e-06, + "loss": 0.0671, + "step": 6807 + }, + { + "epoch": 2.206092028515878, + "grad_norm": 0.4830164313316345, + "learning_rate": 1.726898286349093e-06, + "loss": 0.073, + "step": 6808 + }, + { + "epoch": 2.2064160725858715, + "grad_norm": 0.4604962468147278, + "learning_rate": 1.725576152879611e-06, + "loss": 0.0694, + "step": 6809 + }, + { + "epoch": 2.2067401166558653, + "grad_norm": 0.4484296143054962, + "learning_rate": 1.7242544201679124e-06, + "loss": 0.0671, + "step": 6810 + }, + { + "epoch": 2.2070641607258588, + "grad_norm": 0.4855106472969055, + "learning_rate": 1.7229330883757595e-06, + "loss": 0.0673, + "step": 6811 + }, + { + "epoch": 2.207388204795852, + "grad_norm": 0.5392202138900757, + "learning_rate": 1.721612157664872e-06, + "loss": 0.0866, + "step": 6812 + }, + { + "epoch": 2.2077122488658456, + "grad_norm": 0.4744535982608795, + "learning_rate": 1.7202916281969212e-06, + "loss": 0.0723, + "step": 6813 + }, + { + "epoch": 2.2080362929358395, + "grad_norm": 0.4794631600379944, + "learning_rate": 1.7189715001335211e-06, + "loss": 0.0678, + "step": 6814 + }, + { + "epoch": 2.208360337005833, + "grad_norm": 0.4904131591320038, + "learning_rate": 1.7176517736362502e-06, + "loss": 0.0754, + "step": 6815 + }, + { + "epoch": 2.2086843810758263, + "grad_norm": 0.48358476161956787, + "learning_rate": 1.7163324488666233e-06, + "loss": 0.078, + "step": 6816 + }, + { + "epoch": 2.2090084251458197, + "grad_norm": 0.46504324674606323, + "learning_rate": 1.7150135259861201e-06, + "loss": 0.0696, + "step": 6817 + }, + { + "epoch": 2.209332469215813, + "grad_norm": 0.459061861038208, + "learning_rate": 1.7136950051561562e-06, + "loss": 0.0698, + "step": 6818 + }, + { + "epoch": 2.209656513285807, + "grad_norm": 0.492106169462204, + "learning_rate": 1.7123768865381136e-06, + "loss": 0.0801, + "step": 6819 + }, + { + "epoch": 2.2099805573558005, + "grad_norm": 0.487063467502594, + "learning_rate": 1.7110591702933111e-06, + "loss": 0.0756, + "step": 6820 + }, + { + "epoch": 2.210304601425794, + "grad_norm": 0.4428948163986206, + "learning_rate": 1.709741856583027e-06, + "loss": 0.0731, + "step": 6821 + }, + { + "epoch": 2.2106286454957873, + "grad_norm": 0.5058284401893616, + "learning_rate": 1.7084249455684876e-06, + "loss": 0.08, + "step": 6822 + }, + { + "epoch": 2.210952689565781, + "grad_norm": 0.4309074282646179, + "learning_rate": 1.7071084374108704e-06, + "loss": 0.0666, + "step": 6823 + }, + { + "epoch": 2.2112767336357746, + "grad_norm": 0.47959843277931213, + "learning_rate": 1.7057923322713038e-06, + "loss": 0.073, + "step": 6824 + }, + { + "epoch": 2.211600777705768, + "grad_norm": 0.46819719672203064, + "learning_rate": 1.704476630310864e-06, + "loss": 0.0698, + "step": 6825 + }, + { + "epoch": 2.2119248217757614, + "grad_norm": 0.4914425015449524, + "learning_rate": 1.7031613316905816e-06, + "loss": 0.0678, + "step": 6826 + }, + { + "epoch": 2.212248865845755, + "grad_norm": 0.5222875475883484, + "learning_rate": 1.701846436571436e-06, + "loss": 0.0738, + "step": 6827 + }, + { + "epoch": 2.2125729099157487, + "grad_norm": 0.47440800070762634, + "learning_rate": 1.7005319451143581e-06, + "loss": 0.0756, + "step": 6828 + }, + { + "epoch": 2.212896953985742, + "grad_norm": 0.46596425771713257, + "learning_rate": 1.6992178574802288e-06, + "loss": 0.0691, + "step": 6829 + }, + { + "epoch": 2.2132209980557356, + "grad_norm": 0.4916023910045624, + "learning_rate": 1.6979041738298796e-06, + "loss": 0.0729, + "step": 6830 + }, + { + "epoch": 2.213545042125729, + "grad_norm": 0.4842454791069031, + "learning_rate": 1.6965908943240928e-06, + "loss": 0.07, + "step": 6831 + }, + { + "epoch": 2.2138690861957224, + "grad_norm": 0.44420167803764343, + "learning_rate": 1.695278019123603e-06, + "loss": 0.0656, + "step": 6832 + }, + { + "epoch": 2.2141931302657163, + "grad_norm": 0.4698340892791748, + "learning_rate": 1.6939655483890894e-06, + "loss": 0.0741, + "step": 6833 + }, + { + "epoch": 2.2145171743357097, + "grad_norm": 0.4900783896446228, + "learning_rate": 1.692653482281188e-06, + "loss": 0.075, + "step": 6834 + }, + { + "epoch": 2.214841218405703, + "grad_norm": 0.4853631258010864, + "learning_rate": 1.6913418209604825e-06, + "loss": 0.0721, + "step": 6835 + }, + { + "epoch": 2.2151652624756966, + "grad_norm": 0.460990846157074, + "learning_rate": 1.6900305645875082e-06, + "loss": 0.0741, + "step": 6836 + }, + { + "epoch": 2.21548930654569, + "grad_norm": 0.44160157442092896, + "learning_rate": 1.6887197133227512e-06, + "loss": 0.0644, + "step": 6837 + }, + { + "epoch": 2.215813350615684, + "grad_norm": 0.48136088252067566, + "learning_rate": 1.6874092673266424e-06, + "loss": 0.0726, + "step": 6838 + }, + { + "epoch": 2.2161373946856773, + "grad_norm": 0.4865271747112274, + "learning_rate": 1.6860992267595745e-06, + "loss": 0.0779, + "step": 6839 + }, + { + "epoch": 2.2164614387556707, + "grad_norm": 0.45788124203681946, + "learning_rate": 1.6847895917818762e-06, + "loss": 0.071, + "step": 6840 + }, + { + "epoch": 2.216785482825664, + "grad_norm": 0.46521875262260437, + "learning_rate": 1.683480362553842e-06, + "loss": 0.0722, + "step": 6841 + }, + { + "epoch": 2.217109526895658, + "grad_norm": 0.5691971182823181, + "learning_rate": 1.6821715392357036e-06, + "loss": 0.0859, + "step": 6842 + }, + { + "epoch": 2.2174335709656514, + "grad_norm": 0.48135796189308167, + "learning_rate": 1.6808631219876491e-06, + "loss": 0.0742, + "step": 6843 + }, + { + "epoch": 2.217757615035645, + "grad_norm": 0.4701908528804779, + "learning_rate": 1.6795551109698171e-06, + "loss": 0.0726, + "step": 6844 + }, + { + "epoch": 2.2180816591056383, + "grad_norm": 0.5185319781303406, + "learning_rate": 1.6782475063422947e-06, + "loss": 0.0749, + "step": 6845 + }, + { + "epoch": 2.2184057031756317, + "grad_norm": 0.47821444272994995, + "learning_rate": 1.6769403082651225e-06, + "loss": 0.0768, + "step": 6846 + }, + { + "epoch": 2.2187297472456255, + "grad_norm": 0.5236374735832214, + "learning_rate": 1.6756335168982834e-06, + "loss": 0.0817, + "step": 6847 + }, + { + "epoch": 2.219053791315619, + "grad_norm": 0.4434678256511688, + "learning_rate": 1.674327132401723e-06, + "loss": 0.0685, + "step": 6848 + }, + { + "epoch": 2.2193778353856124, + "grad_norm": 0.4766329228878021, + "learning_rate": 1.673021154935325e-06, + "loss": 0.0682, + "step": 6849 + }, + { + "epoch": 2.219701879455606, + "grad_norm": 0.5388830900192261, + "learning_rate": 1.6717155846589294e-06, + "loss": 0.0737, + "step": 6850 + }, + { + "epoch": 2.2200259235255997, + "grad_norm": 0.5101863145828247, + "learning_rate": 1.6704104217323268e-06, + "loss": 0.0746, + "step": 6851 + }, + { + "epoch": 2.220349967595593, + "grad_norm": 0.4756203293800354, + "learning_rate": 1.669105666315255e-06, + "loss": 0.0702, + "step": 6852 + }, + { + "epoch": 2.2206740116655865, + "grad_norm": 0.48440685868263245, + "learning_rate": 1.6678013185674041e-06, + "loss": 0.0736, + "step": 6853 + }, + { + "epoch": 2.22099805573558, + "grad_norm": 0.4911997318267822, + "learning_rate": 1.666497378648414e-06, + "loss": 0.0811, + "step": 6854 + }, + { + "epoch": 2.2213220998055734, + "grad_norm": 0.45472726225852966, + "learning_rate": 1.6651938467178751e-06, + "loss": 0.0737, + "step": 6855 + }, + { + "epoch": 2.2216461438755672, + "grad_norm": 0.48038211464881897, + "learning_rate": 1.6638907229353252e-06, + "loss": 0.0729, + "step": 6856 + }, + { + "epoch": 2.2219701879455607, + "grad_norm": 0.44669100642204285, + "learning_rate": 1.662588007460254e-06, + "loss": 0.0682, + "step": 6857 + }, + { + "epoch": 2.222294232015554, + "grad_norm": 0.5000935196876526, + "learning_rate": 1.6612857004521022e-06, + "loss": 0.0756, + "step": 6858 + }, + { + "epoch": 2.2226182760855475, + "grad_norm": 0.4641658365726471, + "learning_rate": 1.6599838020702592e-06, + "loss": 0.0704, + "step": 6859 + }, + { + "epoch": 2.222942320155541, + "grad_norm": 0.47990691661834717, + "learning_rate": 1.6586823124740654e-06, + "loss": 0.0701, + "step": 6860 + }, + { + "epoch": 2.223266364225535, + "grad_norm": 0.49902471899986267, + "learning_rate": 1.6573812318228116e-06, + "loss": 0.074, + "step": 6861 + }, + { + "epoch": 2.2235904082955282, + "grad_norm": 0.5331015586853027, + "learning_rate": 1.6560805602757324e-06, + "loss": 0.0788, + "step": 6862 + }, + { + "epoch": 2.2239144523655217, + "grad_norm": 0.5091055035591125, + "learning_rate": 1.654780297992024e-06, + "loss": 0.0782, + "step": 6863 + }, + { + "epoch": 2.224238496435515, + "grad_norm": 0.513282835483551, + "learning_rate": 1.6534804451308224e-06, + "loss": 0.081, + "step": 6864 + }, + { + "epoch": 2.224562540505509, + "grad_norm": 0.4953143894672394, + "learning_rate": 1.6521810018512163e-06, + "loss": 0.0668, + "step": 6865 + }, + { + "epoch": 2.2248865845755024, + "grad_norm": 0.5035532116889954, + "learning_rate": 1.6508819683122468e-06, + "loss": 0.0786, + "step": 6866 + }, + { + "epoch": 2.225210628645496, + "grad_norm": 0.46315574645996094, + "learning_rate": 1.649583344672902e-06, + "loss": 0.0704, + "step": 6867 + }, + { + "epoch": 2.225534672715489, + "grad_norm": 0.5127713084220886, + "learning_rate": 1.6482851310921232e-06, + "loss": 0.0763, + "step": 6868 + }, + { + "epoch": 2.2258587167854826, + "grad_norm": 0.4451947808265686, + "learning_rate": 1.646987327728794e-06, + "loss": 0.0686, + "step": 6869 + }, + { + "epoch": 2.2261827608554765, + "grad_norm": 0.474811315536499, + "learning_rate": 1.6456899347417593e-06, + "loss": 0.0754, + "step": 6870 + }, + { + "epoch": 2.22650680492547, + "grad_norm": 0.496021568775177, + "learning_rate": 1.6443929522898017e-06, + "loss": 0.0759, + "step": 6871 + }, + { + "epoch": 2.2268308489954634, + "grad_norm": 0.4854046404361725, + "learning_rate": 1.6430963805316646e-06, + "loss": 0.0758, + "step": 6872 + }, + { + "epoch": 2.2271548930654568, + "grad_norm": 0.4593440890312195, + "learning_rate": 1.6418002196260314e-06, + "loss": 0.0686, + "step": 6873 + }, + { + "epoch": 2.2274789371354506, + "grad_norm": 0.45711126923561096, + "learning_rate": 1.640504469731542e-06, + "loss": 0.0706, + "step": 6874 + }, + { + "epoch": 2.227802981205444, + "grad_norm": 0.4680411219596863, + "learning_rate": 1.6392091310067825e-06, + "loss": 0.0739, + "step": 6875 + }, + { + "epoch": 2.2281270252754375, + "grad_norm": 0.4617251455783844, + "learning_rate": 1.6379142036102908e-06, + "loss": 0.0709, + "step": 6876 + }, + { + "epoch": 2.228451069345431, + "grad_norm": 0.44760486483573914, + "learning_rate": 1.6366196877005541e-06, + "loss": 0.072, + "step": 6877 + }, + { + "epoch": 2.2287751134154243, + "grad_norm": 0.5043668746948242, + "learning_rate": 1.635325583436005e-06, + "loss": 0.0763, + "step": 6878 + }, + { + "epoch": 2.229099157485418, + "grad_norm": 0.4933907091617584, + "learning_rate": 1.6340318909750347e-06, + "loss": 0.0772, + "step": 6879 + }, + { + "epoch": 2.2294232015554116, + "grad_norm": 0.4615463316440582, + "learning_rate": 1.6327386104759746e-06, + "loss": 0.0714, + "step": 6880 + }, + { + "epoch": 2.229747245625405, + "grad_norm": 0.47937801480293274, + "learning_rate": 1.6314457420971107e-06, + "loss": 0.0777, + "step": 6881 + }, + { + "epoch": 2.2300712896953985, + "grad_norm": 0.46577373147010803, + "learning_rate": 1.630153285996678e-06, + "loss": 0.0685, + "step": 6882 + }, + { + "epoch": 2.230395333765392, + "grad_norm": 0.4340088963508606, + "learning_rate": 1.6288612423328604e-06, + "loss": 0.0674, + "step": 6883 + }, + { + "epoch": 2.2307193778353858, + "grad_norm": 0.5063320994377136, + "learning_rate": 1.6275696112637918e-06, + "loss": 0.0752, + "step": 6884 + }, + { + "epoch": 2.231043421905379, + "grad_norm": 0.4855647683143616, + "learning_rate": 1.6262783929475545e-06, + "loss": 0.0775, + "step": 6885 + }, + { + "epoch": 2.2313674659753726, + "grad_norm": 0.5098389983177185, + "learning_rate": 1.624987587542184e-06, + "loss": 0.0773, + "step": 6886 + }, + { + "epoch": 2.231691510045366, + "grad_norm": 0.4832960069179535, + "learning_rate": 1.6236971952056584e-06, + "loss": 0.0722, + "step": 6887 + }, + { + "epoch": 2.2320155541153595, + "grad_norm": 0.5732384324073792, + "learning_rate": 1.6224072160959109e-06, + "loss": 0.0742, + "step": 6888 + }, + { + "epoch": 2.2323395981853533, + "grad_norm": 0.4994621276855469, + "learning_rate": 1.621117650370822e-06, + "loss": 0.0759, + "step": 6889 + }, + { + "epoch": 2.2326636422553467, + "grad_norm": 0.4736066162586212, + "learning_rate": 1.6198284981882234e-06, + "loss": 0.0738, + "step": 6890 + }, + { + "epoch": 2.23298768632534, + "grad_norm": 0.5226100087165833, + "learning_rate": 1.618539759705894e-06, + "loss": 0.0724, + "step": 6891 + }, + { + "epoch": 2.2333117303953336, + "grad_norm": 0.4682101905345917, + "learning_rate": 1.6172514350815638e-06, + "loss": 0.0718, + "step": 6892 + }, + { + "epoch": 2.2336357744653275, + "grad_norm": 0.46126919984817505, + "learning_rate": 1.6159635244729077e-06, + "loss": 0.0718, + "step": 6893 + }, + { + "epoch": 2.233959818535321, + "grad_norm": 0.5003137588500977, + "learning_rate": 1.614676028037559e-06, + "loss": 0.0753, + "step": 6894 + }, + { + "epoch": 2.2342838626053143, + "grad_norm": 0.48518186807632446, + "learning_rate": 1.613388945933091e-06, + "loss": 0.0754, + "step": 6895 + }, + { + "epoch": 2.2346079066753077, + "grad_norm": 0.4596126675605774, + "learning_rate": 1.6121022783170305e-06, + "loss": 0.0708, + "step": 6896 + }, + { + "epoch": 2.234931950745301, + "grad_norm": 0.5081773400306702, + "learning_rate": 1.6108160253468542e-06, + "loss": 0.0802, + "step": 6897 + }, + { + "epoch": 2.235255994815295, + "grad_norm": 0.47247710824012756, + "learning_rate": 1.6095301871799862e-06, + "loss": 0.0769, + "step": 6898 + }, + { + "epoch": 2.2355800388852884, + "grad_norm": 0.48782727122306824, + "learning_rate": 1.608244763973803e-06, + "loss": 0.0746, + "step": 6899 + }, + { + "epoch": 2.235904082955282, + "grad_norm": 0.49604710936546326, + "learning_rate": 1.6069597558856225e-06, + "loss": 0.0709, + "step": 6900 + }, + { + "epoch": 2.2362281270252753, + "grad_norm": 0.47329846024513245, + "learning_rate": 1.605675163072724e-06, + "loss": 0.0761, + "step": 6901 + }, + { + "epoch": 2.236552171095269, + "grad_norm": 0.4849427342414856, + "learning_rate": 1.6043909856923222e-06, + "loss": 0.0703, + "step": 6902 + }, + { + "epoch": 2.2368762151652626, + "grad_norm": 0.48779815435409546, + "learning_rate": 1.6031072239015954e-06, + "loss": 0.0764, + "step": 6903 + }, + { + "epoch": 2.237200259235256, + "grad_norm": 0.4910009503364563, + "learning_rate": 1.6018238778576583e-06, + "loss": 0.0695, + "step": 6904 + }, + { + "epoch": 2.2375243033052494, + "grad_norm": 0.47566890716552734, + "learning_rate": 1.6005409477175821e-06, + "loss": 0.0713, + "step": 6905 + }, + { + "epoch": 2.237848347375243, + "grad_norm": 0.4630560874938965, + "learning_rate": 1.5992584336383837e-06, + "loss": 0.0657, + "step": 6906 + }, + { + "epoch": 2.2381723914452367, + "grad_norm": 0.47630998492240906, + "learning_rate": 1.5979763357770316e-06, + "loss": 0.0724, + "step": 6907 + }, + { + "epoch": 2.23849643551523, + "grad_norm": 0.4527610242366791, + "learning_rate": 1.5966946542904438e-06, + "loss": 0.0653, + "step": 6908 + }, + { + "epoch": 2.2388204795852236, + "grad_norm": 0.46687713265419006, + "learning_rate": 1.5954133893354807e-06, + "loss": 0.0714, + "step": 6909 + }, + { + "epoch": 2.239144523655217, + "grad_norm": 0.489058256149292, + "learning_rate": 1.5941325410689624e-06, + "loss": 0.073, + "step": 6910 + }, + { + "epoch": 2.2394685677252104, + "grad_norm": 0.4567910432815552, + "learning_rate": 1.5928521096476484e-06, + "loss": 0.0671, + "step": 6911 + }, + { + "epoch": 2.2397926117952043, + "grad_norm": 0.4794004559516907, + "learning_rate": 1.5915720952282521e-06, + "loss": 0.07, + "step": 6912 + }, + { + "epoch": 2.2401166558651977, + "grad_norm": 0.4764542579650879, + "learning_rate": 1.5902924979674355e-06, + "loss": 0.0714, + "step": 6913 + }, + { + "epoch": 2.240440699935191, + "grad_norm": 0.5001002550125122, + "learning_rate": 1.5890133180218087e-06, + "loss": 0.079, + "step": 6914 + }, + { + "epoch": 2.2407647440051845, + "grad_norm": 0.49677136540412903, + "learning_rate": 1.5877345555479307e-06, + "loss": 0.0747, + "step": 6915 + }, + { + "epoch": 2.2410887880751784, + "grad_norm": 0.4633571207523346, + "learning_rate": 1.5864562107023118e-06, + "loss": 0.0719, + "step": 6916 + }, + { + "epoch": 2.241412832145172, + "grad_norm": 0.5432472825050354, + "learning_rate": 1.5851782836414049e-06, + "loss": 0.0738, + "step": 6917 + }, + { + "epoch": 2.2417368762151653, + "grad_norm": 0.5069789886474609, + "learning_rate": 1.5839007745216184e-06, + "loss": 0.0736, + "step": 6918 + }, + { + "epoch": 2.2420609202851587, + "grad_norm": 0.5018969178199768, + "learning_rate": 1.5826236834993064e-06, + "loss": 0.0779, + "step": 6919 + }, + { + "epoch": 2.242384964355152, + "grad_norm": 0.4906817078590393, + "learning_rate": 1.5813470107307733e-06, + "loss": 0.0754, + "step": 6920 + }, + { + "epoch": 2.242709008425146, + "grad_norm": 0.5077282786369324, + "learning_rate": 1.5800707563722707e-06, + "loss": 0.0711, + "step": 6921 + }, + { + "epoch": 2.2430330524951394, + "grad_norm": 0.5080711245536804, + "learning_rate": 1.5787949205799997e-06, + "loss": 0.0775, + "step": 6922 + }, + { + "epoch": 2.243357096565133, + "grad_norm": 0.42835354804992676, + "learning_rate": 1.5775195035101127e-06, + "loss": 0.0659, + "step": 6923 + }, + { + "epoch": 2.2436811406351262, + "grad_norm": 0.5123438835144043, + "learning_rate": 1.5762445053187025e-06, + "loss": 0.0799, + "step": 6924 + }, + { + "epoch": 2.24400518470512, + "grad_norm": 0.49820834398269653, + "learning_rate": 1.574969926161824e-06, + "loss": 0.0696, + "step": 6925 + }, + { + "epoch": 2.2443292287751135, + "grad_norm": 0.46529993414878845, + "learning_rate": 1.5736957661954662e-06, + "loss": 0.0683, + "step": 6926 + }, + { + "epoch": 2.244653272845107, + "grad_norm": 0.4767801761627197, + "learning_rate": 1.5724220255755806e-06, + "loss": 0.0782, + "step": 6927 + }, + { + "epoch": 2.2449773169151004, + "grad_norm": 0.46832630038261414, + "learning_rate": 1.5711487044580565e-06, + "loss": 0.0723, + "step": 6928 + }, + { + "epoch": 2.245301360985094, + "grad_norm": 0.48059654235839844, + "learning_rate": 1.5698758029987366e-06, + "loss": 0.0713, + "step": 6929 + }, + { + "epoch": 2.2456254050550877, + "grad_norm": 0.4985971748828888, + "learning_rate": 1.568603321353414e-06, + "loss": 0.0757, + "step": 6930 + }, + { + "epoch": 2.245949449125081, + "grad_norm": 0.43783020973205566, + "learning_rate": 1.5673312596778229e-06, + "loss": 0.0665, + "step": 6931 + }, + { + "epoch": 2.2462734931950745, + "grad_norm": 0.49100205302238464, + "learning_rate": 1.5660596181276582e-06, + "loss": 0.0718, + "step": 6932 + }, + { + "epoch": 2.246597537265068, + "grad_norm": 0.4694617986679077, + "learning_rate": 1.5647883968585503e-06, + "loss": 0.0706, + "step": 6933 + }, + { + "epoch": 2.2469215813350614, + "grad_norm": 0.50400310754776, + "learning_rate": 1.5635175960260901e-06, + "loss": 0.0805, + "step": 6934 + }, + { + "epoch": 2.2472456254050552, + "grad_norm": 0.48022282123565674, + "learning_rate": 1.5622472157858066e-06, + "loss": 0.074, + "step": 6935 + }, + { + "epoch": 2.2475696694750487, + "grad_norm": 0.5288910269737244, + "learning_rate": 1.560977256293184e-06, + "loss": 0.0817, + "step": 6936 + }, + { + "epoch": 2.247893713545042, + "grad_norm": 0.47093531489372253, + "learning_rate": 1.559707717703653e-06, + "loss": 0.0751, + "step": 6937 + }, + { + "epoch": 2.2482177576150355, + "grad_norm": 0.5040370225906372, + "learning_rate": 1.5584386001725927e-06, + "loss": 0.0778, + "step": 6938 + }, + { + "epoch": 2.248541801685029, + "grad_norm": 0.4735819697380066, + "learning_rate": 1.5571699038553323e-06, + "loss": 0.0695, + "step": 6939 + }, + { + "epoch": 2.248865845755023, + "grad_norm": 0.4850353002548218, + "learning_rate": 1.555901628907145e-06, + "loss": 0.0734, + "step": 6940 + }, + { + "epoch": 2.249189889825016, + "grad_norm": 0.4938466250896454, + "learning_rate": 1.554633775483257e-06, + "loss": 0.0764, + "step": 6941 + }, + { + "epoch": 2.2495139338950096, + "grad_norm": 0.45608532428741455, + "learning_rate": 1.5533663437388408e-06, + "loss": 0.0698, + "step": 6942 + }, + { + "epoch": 2.249837977965003, + "grad_norm": 0.46821609139442444, + "learning_rate": 1.5520993338290186e-06, + "loss": 0.0772, + "step": 6943 + }, + { + "epoch": 2.250162022034997, + "grad_norm": 0.43394994735717773, + "learning_rate": 1.5508327459088595e-06, + "loss": 0.0713, + "step": 6944 + }, + { + "epoch": 2.2504860661049904, + "grad_norm": 0.45471128821372986, + "learning_rate": 1.5495665801333815e-06, + "loss": 0.0752, + "step": 6945 + }, + { + "epoch": 2.250810110174984, + "grad_norm": 0.4942833185195923, + "learning_rate": 1.5483008366575514e-06, + "loss": 0.0767, + "step": 6946 + }, + { + "epoch": 2.251134154244977, + "grad_norm": 0.44427838921546936, + "learning_rate": 1.547035515636286e-06, + "loss": 0.0656, + "step": 6947 + }, + { + "epoch": 2.251458198314971, + "grad_norm": 0.5050087571144104, + "learning_rate": 1.5457706172244425e-06, + "loss": 0.0785, + "step": 6948 + }, + { + "epoch": 2.2517822423849645, + "grad_norm": 0.5311718583106995, + "learning_rate": 1.5445061415768391e-06, + "loss": 0.0842, + "step": 6949 + }, + { + "epoch": 2.252106286454958, + "grad_norm": 0.48573848605155945, + "learning_rate": 1.5432420888482308e-06, + "loss": 0.0702, + "step": 6950 + }, + { + "epoch": 2.2524303305249513, + "grad_norm": 0.49349531531333923, + "learning_rate": 1.5419784591933267e-06, + "loss": 0.0734, + "step": 6951 + }, + { + "epoch": 2.2527543745949448, + "grad_norm": 0.4787566661834717, + "learning_rate": 1.540715252766783e-06, + "loss": 0.0756, + "step": 6952 + }, + { + "epoch": 2.2530784186649386, + "grad_norm": 0.47929108142852783, + "learning_rate": 1.5394524697232038e-06, + "loss": 0.0718, + "step": 6953 + }, + { + "epoch": 2.253402462734932, + "grad_norm": 0.4798576831817627, + "learning_rate": 1.538190110217143e-06, + "loss": 0.0767, + "step": 6954 + }, + { + "epoch": 2.2537265068049255, + "grad_norm": 0.45218637585639954, + "learning_rate": 1.5369281744030968e-06, + "loss": 0.0666, + "step": 6955 + }, + { + "epoch": 2.254050550874919, + "grad_norm": 0.4727388620376587, + "learning_rate": 1.5356666624355204e-06, + "loss": 0.0693, + "step": 6956 + }, + { + "epoch": 2.2543745949449123, + "grad_norm": 0.5332309603691101, + "learning_rate": 1.5344055744688035e-06, + "loss": 0.0779, + "step": 6957 + }, + { + "epoch": 2.254698639014906, + "grad_norm": 0.4802578091621399, + "learning_rate": 1.5331449106572983e-06, + "loss": 0.0704, + "step": 6958 + }, + { + "epoch": 2.2550226830848996, + "grad_norm": 0.46075379848480225, + "learning_rate": 1.5318846711552926e-06, + "loss": 0.0709, + "step": 6959 + }, + { + "epoch": 2.255346727154893, + "grad_norm": 0.47085317969322205, + "learning_rate": 1.530624856117029e-06, + "loss": 0.0713, + "step": 6960 + }, + { + "epoch": 2.2556707712248865, + "grad_norm": 0.5056251883506775, + "learning_rate": 1.5293654656966972e-06, + "loss": 0.0762, + "step": 6961 + }, + { + "epoch": 2.25599481529488, + "grad_norm": 0.495922714471817, + "learning_rate": 1.528106500048434e-06, + "loss": 0.0773, + "step": 6962 + }, + { + "epoch": 2.2563188593648738, + "grad_norm": 0.5067967176437378, + "learning_rate": 1.5268479593263257e-06, + "loss": 0.075, + "step": 6963 + }, + { + "epoch": 2.256642903434867, + "grad_norm": 0.5422359704971313, + "learning_rate": 1.525589843684402e-06, + "loss": 0.0848, + "step": 6964 + }, + { + "epoch": 2.2569669475048606, + "grad_norm": 0.4754337668418884, + "learning_rate": 1.5243321532766492e-06, + "loss": 0.0714, + "step": 6965 + }, + { + "epoch": 2.257290991574854, + "grad_norm": 0.44811588525772095, + "learning_rate": 1.5230748882569924e-06, + "loss": 0.061, + "step": 6966 + }, + { + "epoch": 2.257615035644848, + "grad_norm": 0.47737687826156616, + "learning_rate": 1.52181804877931e-06, + "loss": 0.0709, + "step": 6967 + }, + { + "epoch": 2.2579390797148413, + "grad_norm": 0.4759818911552429, + "learning_rate": 1.5205616349974273e-06, + "loss": 0.0748, + "step": 6968 + }, + { + "epoch": 2.2582631237848347, + "grad_norm": 0.46108049154281616, + "learning_rate": 1.5193056470651163e-06, + "loss": 0.0699, + "step": 6969 + }, + { + "epoch": 2.258587167854828, + "grad_norm": 0.4533197581768036, + "learning_rate": 1.5180500851360991e-06, + "loss": 0.0693, + "step": 6970 + }, + { + "epoch": 2.2589112119248216, + "grad_norm": 0.522936999797821, + "learning_rate": 1.5167949493640444e-06, + "loss": 0.081, + "step": 6971 + }, + { + "epoch": 2.2592352559948155, + "grad_norm": 0.4949372410774231, + "learning_rate": 1.515540239902567e-06, + "loss": 0.0765, + "step": 6972 + }, + { + "epoch": 2.259559300064809, + "grad_norm": 0.5081701278686523, + "learning_rate": 1.5142859569052315e-06, + "loss": 0.0759, + "step": 6973 + }, + { + "epoch": 2.2598833441348023, + "grad_norm": 0.46086448431015015, + "learning_rate": 1.5130321005255504e-06, + "loss": 0.0681, + "step": 6974 + }, + { + "epoch": 2.2602073882047957, + "grad_norm": 0.5110464692115784, + "learning_rate": 1.5117786709169845e-06, + "loss": 0.0762, + "step": 6975 + }, + { + "epoch": 2.2605314322747896, + "grad_norm": 0.49480530619621277, + "learning_rate": 1.51052566823294e-06, + "loss": 0.0765, + "step": 6976 + }, + { + "epoch": 2.260855476344783, + "grad_norm": 0.5045551657676697, + "learning_rate": 1.5092730926267734e-06, + "loss": 0.076, + "step": 6977 + }, + { + "epoch": 2.2611795204147764, + "grad_norm": 0.4924558401107788, + "learning_rate": 1.508020944251789e-06, + "loss": 0.0733, + "step": 6978 + }, + { + "epoch": 2.26150356448477, + "grad_norm": 0.497134804725647, + "learning_rate": 1.5067692232612323e-06, + "loss": 0.0751, + "step": 6979 + }, + { + "epoch": 2.2618276085547633, + "grad_norm": 0.4783954918384552, + "learning_rate": 1.5055179298083095e-06, + "loss": 0.0724, + "step": 6980 + }, + { + "epoch": 2.262151652624757, + "grad_norm": 0.47952574491500854, + "learning_rate": 1.5042670640461609e-06, + "loss": 0.0724, + "step": 6981 + }, + { + "epoch": 2.2624756966947506, + "grad_norm": 0.4710170328617096, + "learning_rate": 1.5030166261278823e-06, + "loss": 0.0697, + "step": 6982 + }, + { + "epoch": 2.262799740764744, + "grad_norm": 0.49460992217063904, + "learning_rate": 1.5017666162065153e-06, + "loss": 0.0725, + "step": 6983 + }, + { + "epoch": 2.2631237848347374, + "grad_norm": 0.5072700381278992, + "learning_rate": 1.5005170344350489e-06, + "loss": 0.0779, + "step": 6984 + }, + { + "epoch": 2.263447828904731, + "grad_norm": 0.47068458795547485, + "learning_rate": 1.4992678809664218e-06, + "loss": 0.0701, + "step": 6985 + }, + { + "epoch": 2.2637718729747247, + "grad_norm": 0.4834175407886505, + "learning_rate": 1.4980191559535128e-06, + "loss": 0.0768, + "step": 6986 + }, + { + "epoch": 2.264095917044718, + "grad_norm": 0.4545857012271881, + "learning_rate": 1.496770859549161e-06, + "loss": 0.0683, + "step": 6987 + }, + { + "epoch": 2.2644199611147116, + "grad_norm": 0.5007740259170532, + "learning_rate": 1.495522991906138e-06, + "loss": 0.0731, + "step": 6988 + }, + { + "epoch": 2.264744005184705, + "grad_norm": 0.4794744849205017, + "learning_rate": 1.4942755531771785e-06, + "loss": 0.074, + "step": 6989 + }, + { + "epoch": 2.2650680492546984, + "grad_norm": 0.5080423355102539, + "learning_rate": 1.4930285435149522e-06, + "loss": 0.0763, + "step": 6990 + }, + { + "epoch": 2.2653920933246923, + "grad_norm": 0.5505486130714417, + "learning_rate": 1.4917819630720814e-06, + "loss": 0.0827, + "step": 6991 + }, + { + "epoch": 2.2657161373946857, + "grad_norm": 0.45858538150787354, + "learning_rate": 1.490535812001136e-06, + "loss": 0.0663, + "step": 6992 + }, + { + "epoch": 2.266040181464679, + "grad_norm": 0.4944336414337158, + "learning_rate": 1.4892900904546336e-06, + "loss": 0.0753, + "step": 6993 + }, + { + "epoch": 2.2663642255346725, + "grad_norm": 0.4706510603427887, + "learning_rate": 1.4880447985850387e-06, + "loss": 0.0721, + "step": 6994 + }, + { + "epoch": 2.2666882696046664, + "grad_norm": 0.5018250346183777, + "learning_rate": 1.486799936544761e-06, + "loss": 0.078, + "step": 6995 + }, + { + "epoch": 2.26701231367466, + "grad_norm": 0.4667245149612427, + "learning_rate": 1.4855555044861609e-06, + "loss": 0.0672, + "step": 6996 + }, + { + "epoch": 2.2673363577446533, + "grad_norm": 0.4803808331489563, + "learning_rate": 1.484311502561544e-06, + "loss": 0.073, + "step": 6997 + }, + { + "epoch": 2.2676604018146467, + "grad_norm": 0.47602206468582153, + "learning_rate": 1.4830679309231649e-06, + "loss": 0.0717, + "step": 6998 + }, + { + "epoch": 2.2679844458846405, + "grad_norm": 0.4727866053581238, + "learning_rate": 1.4818247897232247e-06, + "loss": 0.0678, + "step": 6999 + }, + { + "epoch": 2.268308489954634, + "grad_norm": 0.4802073836326599, + "learning_rate": 1.4805820791138715e-06, + "loss": 0.0734, + "step": 7000 + }, + { + "epoch": 2.2686325340246274, + "grad_norm": 0.4852180480957031, + "learning_rate": 1.4793397992472009e-06, + "loss": 0.0742, + "step": 7001 + }, + { + "epoch": 2.268956578094621, + "grad_norm": 0.4746887683868408, + "learning_rate": 1.478097950275258e-06, + "loss": 0.0739, + "step": 7002 + }, + { + "epoch": 2.2692806221646142, + "grad_norm": 0.47529059648513794, + "learning_rate": 1.47685653235003e-06, + "loss": 0.0709, + "step": 7003 + }, + { + "epoch": 2.269604666234608, + "grad_norm": 0.47900229692459106, + "learning_rate": 1.4756155456234555e-06, + "loss": 0.0763, + "step": 7004 + }, + { + "epoch": 2.2699287103046015, + "grad_norm": 0.44263532757759094, + "learning_rate": 1.4743749902474197e-06, + "loss": 0.0652, + "step": 7005 + }, + { + "epoch": 2.270252754374595, + "grad_norm": 0.48573142290115356, + "learning_rate": 1.4731348663737543e-06, + "loss": 0.0741, + "step": 7006 + }, + { + "epoch": 2.2705767984445884, + "grad_norm": 0.45377403497695923, + "learning_rate": 1.4718951741542404e-06, + "loss": 0.0711, + "step": 7007 + }, + { + "epoch": 2.270900842514582, + "grad_norm": 0.4826267659664154, + "learning_rate": 1.470655913740599e-06, + "loss": 0.0716, + "step": 7008 + }, + { + "epoch": 2.2712248865845757, + "grad_norm": 0.49614590406417847, + "learning_rate": 1.4694170852845102e-06, + "loss": 0.0728, + "step": 7009 + }, + { + "epoch": 2.271548930654569, + "grad_norm": 0.49249184131622314, + "learning_rate": 1.4681786889375888e-06, + "loss": 0.0718, + "step": 7010 + }, + { + "epoch": 2.2718729747245625, + "grad_norm": 0.5121108889579773, + "learning_rate": 1.4669407248514079e-06, + "loss": 0.0763, + "step": 7011 + }, + { + "epoch": 2.272197018794556, + "grad_norm": 0.4713032841682434, + "learning_rate": 1.4657031931774778e-06, + "loss": 0.0698, + "step": 7012 + }, + { + "epoch": 2.2725210628645494, + "grad_norm": 0.5088580846786499, + "learning_rate": 1.4644660940672628e-06, + "loss": 0.0692, + "step": 7013 + }, + { + "epoch": 2.2728451069345432, + "grad_norm": 0.4591797888278961, + "learning_rate": 1.463229427672171e-06, + "loss": 0.0668, + "step": 7014 + }, + { + "epoch": 2.2731691510045366, + "grad_norm": 0.4763369560241699, + "learning_rate": 1.461993194143559e-06, + "loss": 0.0744, + "step": 7015 + }, + { + "epoch": 2.27349319507453, + "grad_norm": 0.504949688911438, + "learning_rate": 1.4607573936327302e-06, + "loss": 0.0775, + "step": 7016 + }, + { + "epoch": 2.2738172391445235, + "grad_norm": 0.45064422488212585, + "learning_rate": 1.4595220262909315e-06, + "loss": 0.0708, + "step": 7017 + }, + { + "epoch": 2.2741412832145174, + "grad_norm": 0.5188299417495728, + "learning_rate": 1.4582870922693654e-06, + "loss": 0.0749, + "step": 7018 + }, + { + "epoch": 2.274465327284511, + "grad_norm": 0.4455835819244385, + "learning_rate": 1.4570525917191692e-06, + "loss": 0.0702, + "step": 7019 + }, + { + "epoch": 2.274789371354504, + "grad_norm": 0.46820423007011414, + "learning_rate": 1.4558185247914409e-06, + "loss": 0.0727, + "step": 7020 + }, + { + "epoch": 2.2751134154244976, + "grad_norm": 0.4896591305732727, + "learning_rate": 1.4545848916372129e-06, + "loss": 0.0773, + "step": 7021 + }, + { + "epoch": 2.2754374594944915, + "grad_norm": 0.5099110007286072, + "learning_rate": 1.453351692407472e-06, + "loss": 0.0783, + "step": 7022 + }, + { + "epoch": 2.275761503564485, + "grad_norm": 0.4538210332393646, + "learning_rate": 1.45211892725315e-06, + "loss": 0.0709, + "step": 7023 + }, + { + "epoch": 2.2760855476344783, + "grad_norm": 0.4581274390220642, + "learning_rate": 1.4508865963251252e-06, + "loss": 0.0694, + "step": 7024 + }, + { + "epoch": 2.2764095917044718, + "grad_norm": 0.456988662481308, + "learning_rate": 1.4496546997742239e-06, + "loss": 0.067, + "step": 7025 + }, + { + "epoch": 2.276733635774465, + "grad_norm": 0.4534381926059723, + "learning_rate": 1.4484232377512165e-06, + "loss": 0.0685, + "step": 7026 + }, + { + "epoch": 2.277057679844459, + "grad_norm": 0.4984075725078583, + "learning_rate": 1.4471922104068225e-06, + "loss": 0.0792, + "step": 7027 + }, + { + "epoch": 2.2773817239144525, + "grad_norm": 0.4937223792076111, + "learning_rate": 1.445961617891708e-06, + "loss": 0.0744, + "step": 7028 + }, + { + "epoch": 2.277705767984446, + "grad_norm": 0.49529770016670227, + "learning_rate": 1.444731460356486e-06, + "loss": 0.0767, + "step": 7029 + }, + { + "epoch": 2.2780298120544393, + "grad_norm": 0.49039947986602783, + "learning_rate": 1.4435017379517153e-06, + "loss": 0.0763, + "step": 7030 + }, + { + "epoch": 2.2783538561244328, + "grad_norm": 0.4506426453590393, + "learning_rate": 1.4422724508279024e-06, + "loss": 0.0704, + "step": 7031 + }, + { + "epoch": 2.2786779001944266, + "grad_norm": 0.4578206241130829, + "learning_rate": 1.4410435991355004e-06, + "loss": 0.0729, + "step": 7032 + }, + { + "epoch": 2.27900194426442, + "grad_norm": 0.44411706924438477, + "learning_rate": 1.4398151830249096e-06, + "loss": 0.0682, + "step": 7033 + }, + { + "epoch": 2.2793259883344135, + "grad_norm": 0.5245050191879272, + "learning_rate": 1.4385872026464736e-06, + "loss": 0.0771, + "step": 7034 + }, + { + "epoch": 2.279650032404407, + "grad_norm": 0.4982036054134369, + "learning_rate": 1.4373596581504872e-06, + "loss": 0.0767, + "step": 7035 + }, + { + "epoch": 2.2799740764744003, + "grad_norm": 0.4747893512248993, + "learning_rate": 1.4361325496871893e-06, + "loss": 0.0714, + "step": 7036 + }, + { + "epoch": 2.280298120544394, + "grad_norm": 0.440021276473999, + "learning_rate": 1.4349058774067665e-06, + "loss": 0.0689, + "step": 7037 + }, + { + "epoch": 2.2806221646143876, + "grad_norm": 0.46596381068229675, + "learning_rate": 1.4336796414593528e-06, + "loss": 0.0705, + "step": 7038 + }, + { + "epoch": 2.280946208684381, + "grad_norm": 0.4599742293357849, + "learning_rate": 1.4324538419950234e-06, + "loss": 0.0705, + "step": 7039 + }, + { + "epoch": 2.2812702527543745, + "grad_norm": 0.4699169397354126, + "learning_rate": 1.431228479163811e-06, + "loss": 0.0718, + "step": 7040 + }, + { + "epoch": 2.281594296824368, + "grad_norm": 0.4909172058105469, + "learning_rate": 1.4300035531156803e-06, + "loss": 0.0754, + "step": 7041 + }, + { + "epoch": 2.2819183408943617, + "grad_norm": 0.5037856698036194, + "learning_rate": 1.4287790640005578e-06, + "loss": 0.0795, + "step": 7042 + }, + { + "epoch": 2.282242384964355, + "grad_norm": 0.4997573494911194, + "learning_rate": 1.4275550119683046e-06, + "loss": 0.0785, + "step": 7043 + }, + { + "epoch": 2.2825664290343486, + "grad_norm": 0.500052273273468, + "learning_rate": 1.4263313971687337e-06, + "loss": 0.0798, + "step": 7044 + }, + { + "epoch": 2.282890473104342, + "grad_norm": 0.5084229707717896, + "learning_rate": 1.4251082197516043e-06, + "loss": 0.0779, + "step": 7045 + }, + { + "epoch": 2.283214517174336, + "grad_norm": 0.5037830471992493, + "learning_rate": 1.4238854798666208e-06, + "loss": 0.0773, + "step": 7046 + }, + { + "epoch": 2.2835385612443293, + "grad_norm": 0.46606168150901794, + "learning_rate": 1.4226631776634363e-06, + "loss": 0.0687, + "step": 7047 + }, + { + "epoch": 2.2838626053143227, + "grad_norm": 0.5235673189163208, + "learning_rate": 1.421441313291645e-06, + "loss": 0.0829, + "step": 7048 + }, + { + "epoch": 2.284186649384316, + "grad_norm": 0.4666191041469574, + "learning_rate": 1.4202198869007972e-06, + "loss": 0.0712, + "step": 7049 + }, + { + "epoch": 2.28451069345431, + "grad_norm": 0.4553104341030121, + "learning_rate": 1.418998898640378e-06, + "loss": 0.071, + "step": 7050 + }, + { + "epoch": 2.2848347375243034, + "grad_norm": 0.5150062441825867, + "learning_rate": 1.4177783486598273e-06, + "loss": 0.0802, + "step": 7051 + }, + { + "epoch": 2.285158781594297, + "grad_norm": 0.44989052414894104, + "learning_rate": 1.416558237108528e-06, + "loss": 0.0725, + "step": 7052 + }, + { + "epoch": 2.2854828256642903, + "grad_norm": 0.5269073247909546, + "learning_rate": 1.4153385641358102e-06, + "loss": 0.0756, + "step": 7053 + }, + { + "epoch": 2.2858068697342837, + "grad_norm": 0.4914553463459015, + "learning_rate": 1.4141193298909496e-06, + "loss": 0.077, + "step": 7054 + }, + { + "epoch": 2.2861309138042776, + "grad_norm": 0.524666428565979, + "learning_rate": 1.4129005345231694e-06, + "loss": 0.0785, + "step": 7055 + }, + { + "epoch": 2.286454957874271, + "grad_norm": 0.5038748979568481, + "learning_rate": 1.4116821781816391e-06, + "loss": 0.0784, + "step": 7056 + }, + { + "epoch": 2.2867790019442644, + "grad_norm": 0.48034003376960754, + "learning_rate": 1.4104642610154712e-06, + "loss": 0.0704, + "step": 7057 + }, + { + "epoch": 2.287103046014258, + "grad_norm": 0.5103918313980103, + "learning_rate": 1.4092467831737283e-06, + "loss": 0.0765, + "step": 7058 + }, + { + "epoch": 2.2874270900842513, + "grad_norm": 0.4362955689430237, + "learning_rate": 1.408029744805418e-06, + "loss": 0.0638, + "step": 7059 + }, + { + "epoch": 2.287751134154245, + "grad_norm": 0.4751020669937134, + "learning_rate": 1.4068131460594942e-06, + "loss": 0.0724, + "step": 7060 + }, + { + "epoch": 2.2880751782242386, + "grad_norm": 0.47384995222091675, + "learning_rate": 1.4055969870848567e-06, + "loss": 0.0718, + "step": 7061 + }, + { + "epoch": 2.288399222294232, + "grad_norm": 0.596555233001709, + "learning_rate": 1.4043812680303527e-06, + "loss": 0.0824, + "step": 7062 + }, + { + "epoch": 2.2887232663642254, + "grad_norm": 0.5346999764442444, + "learning_rate": 1.4031659890447703e-06, + "loss": 0.0851, + "step": 7063 + }, + { + "epoch": 2.289047310434219, + "grad_norm": 0.505793571472168, + "learning_rate": 1.4019511502768535e-06, + "loss": 0.0778, + "step": 7064 + }, + { + "epoch": 2.2893713545042127, + "grad_norm": 0.48712509870529175, + "learning_rate": 1.400736751875283e-06, + "loss": 0.0742, + "step": 7065 + }, + { + "epoch": 2.289695398574206, + "grad_norm": 0.4694879651069641, + "learning_rate": 1.3995227939886902e-06, + "loss": 0.069, + "step": 7066 + }, + { + "epoch": 2.2900194426441995, + "grad_norm": 0.4688408374786377, + "learning_rate": 1.398309276765652e-06, + "loss": 0.0704, + "step": 7067 + }, + { + "epoch": 2.290343486714193, + "grad_norm": 0.5062603950500488, + "learning_rate": 1.3970962003546911e-06, + "loss": 0.0803, + "step": 7068 + }, + { + "epoch": 2.290667530784187, + "grad_norm": 0.47821182012557983, + "learning_rate": 1.3958835649042785e-06, + "loss": 0.0698, + "step": 7069 + }, + { + "epoch": 2.2909915748541803, + "grad_norm": 0.46454837918281555, + "learning_rate": 1.394671370562824e-06, + "loss": 0.0696, + "step": 7070 + }, + { + "epoch": 2.2913156189241737, + "grad_norm": 0.4921838045120239, + "learning_rate": 1.3934596174786941e-06, + "loss": 0.0788, + "step": 7071 + }, + { + "epoch": 2.291639662994167, + "grad_norm": 0.4985063970088959, + "learning_rate": 1.39224830580019e-06, + "loss": 0.0693, + "step": 7072 + }, + { + "epoch": 2.291963707064161, + "grad_norm": 0.5150036215782166, + "learning_rate": 1.3910374356755707e-06, + "loss": 0.0769, + "step": 7073 + }, + { + "epoch": 2.2922877511341544, + "grad_norm": 0.45300233364105225, + "learning_rate": 1.3898270072530306e-06, + "loss": 0.0705, + "step": 7074 + }, + { + "epoch": 2.292611795204148, + "grad_norm": 0.4846388101577759, + "learning_rate": 1.3886170206807153e-06, + "loss": 0.0718, + "step": 7075 + }, + { + "epoch": 2.2929358392741412, + "grad_norm": 0.47709107398986816, + "learning_rate": 1.3874074761067158e-06, + "loss": 0.0701, + "step": 7076 + }, + { + "epoch": 2.2932598833441347, + "grad_norm": 0.4540219008922577, + "learning_rate": 1.3861983736790685e-06, + "loss": 0.0676, + "step": 7077 + }, + { + "epoch": 2.2935839274141285, + "grad_norm": 0.46152257919311523, + "learning_rate": 1.3849897135457574e-06, + "loss": 0.0693, + "step": 7078 + }, + { + "epoch": 2.293907971484122, + "grad_norm": 0.5037170648574829, + "learning_rate": 1.383781495854707e-06, + "loss": 0.0777, + "step": 7079 + }, + { + "epoch": 2.2942320155541154, + "grad_norm": 0.47302427887916565, + "learning_rate": 1.3825737207537959e-06, + "loss": 0.0689, + "step": 7080 + }, + { + "epoch": 2.294556059624109, + "grad_norm": 0.5259142518043518, + "learning_rate": 1.3813663883908412e-06, + "loss": 0.0786, + "step": 7081 + }, + { + "epoch": 2.2948801036941022, + "grad_norm": 0.46605339646339417, + "learning_rate": 1.380159498913609e-06, + "loss": 0.0716, + "step": 7082 + }, + { + "epoch": 2.295204147764096, + "grad_norm": 0.48375439643859863, + "learning_rate": 1.3789530524698113e-06, + "loss": 0.0727, + "step": 7083 + }, + { + "epoch": 2.2955281918340895, + "grad_norm": 0.4879249930381775, + "learning_rate": 1.377747049207106e-06, + "loss": 0.0751, + "step": 7084 + }, + { + "epoch": 2.295852235904083, + "grad_norm": 0.5046743154525757, + "learning_rate": 1.3765414892730954e-06, + "loss": 0.072, + "step": 7085 + }, + { + "epoch": 2.2961762799740764, + "grad_norm": 0.5172231793403625, + "learning_rate": 1.3753363728153291e-06, + "loss": 0.0809, + "step": 7086 + }, + { + "epoch": 2.29650032404407, + "grad_norm": 0.4662172198295593, + "learning_rate": 1.374131699981301e-06, + "loss": 0.0684, + "step": 7087 + }, + { + "epoch": 2.2968243681140637, + "grad_norm": 0.5307612419128418, + "learning_rate": 1.3729274709184532e-06, + "loss": 0.0796, + "step": 7088 + }, + { + "epoch": 2.297148412184057, + "grad_norm": 0.48822152614593506, + "learning_rate": 1.3717236857741684e-06, + "loss": 0.0748, + "step": 7089 + }, + { + "epoch": 2.2974724562540505, + "grad_norm": 0.4695141017436981, + "learning_rate": 1.3705203446957803e-06, + "loss": 0.0668, + "step": 7090 + }, + { + "epoch": 2.297796500324044, + "grad_norm": 0.47505518794059753, + "learning_rate": 1.369317447830566e-06, + "loss": 0.0733, + "step": 7091 + }, + { + "epoch": 2.2981205443940373, + "grad_norm": 0.45042139291763306, + "learning_rate": 1.3681149953257483e-06, + "loss": 0.0694, + "step": 7092 + }, + { + "epoch": 2.298444588464031, + "grad_norm": 0.487425833940506, + "learning_rate": 1.3669129873284976e-06, + "loss": 0.0764, + "step": 7093 + }, + { + "epoch": 2.2987686325340246, + "grad_norm": 0.47913771867752075, + "learning_rate": 1.3657114239859226e-06, + "loss": 0.0734, + "step": 7094 + }, + { + "epoch": 2.299092676604018, + "grad_norm": 0.4954153597354889, + "learning_rate": 1.3645103054450904e-06, + "loss": 0.0763, + "step": 7095 + }, + { + "epoch": 2.2994167206740115, + "grad_norm": 0.449323445558548, + "learning_rate": 1.3633096318529986e-06, + "loss": 0.0671, + "step": 7096 + }, + { + "epoch": 2.2997407647440054, + "grad_norm": 0.46951982378959656, + "learning_rate": 1.3621094033566057e-06, + "loss": 0.0679, + "step": 7097 + }, + { + "epoch": 2.3000648088139988, + "grad_norm": 0.5145975947380066, + "learning_rate": 1.3609096201028026e-06, + "loss": 0.074, + "step": 7098 + }, + { + "epoch": 2.300388852883992, + "grad_norm": 0.47816240787506104, + "learning_rate": 1.359710282238433e-06, + "loss": 0.073, + "step": 7099 + }, + { + "epoch": 2.3007128969539856, + "grad_norm": 0.5030921697616577, + "learning_rate": 1.3585113899102853e-06, + "loss": 0.0776, + "step": 7100 + }, + { + "epoch": 2.3010369410239795, + "grad_norm": 0.4865356385707855, + "learning_rate": 1.3573129432650882e-06, + "loss": 0.0767, + "step": 7101 + }, + { + "epoch": 2.301360985093973, + "grad_norm": 0.4202616214752197, + "learning_rate": 1.3561149424495263e-06, + "loss": 0.0659, + "step": 7102 + }, + { + "epoch": 2.3016850291639663, + "grad_norm": 0.48426011204719543, + "learning_rate": 1.3549173876102167e-06, + "loss": 0.0696, + "step": 7103 + }, + { + "epoch": 2.3020090732339598, + "grad_norm": 0.46074673533439636, + "learning_rate": 1.3537202788937349e-06, + "loss": 0.0712, + "step": 7104 + }, + { + "epoch": 2.302333117303953, + "grad_norm": 0.4761511981487274, + "learning_rate": 1.3525236164465904e-06, + "loss": 0.0737, + "step": 7105 + }, + { + "epoch": 2.302657161373947, + "grad_norm": 0.5033185482025146, + "learning_rate": 1.351327400415245e-06, + "loss": 0.0787, + "step": 7106 + }, + { + "epoch": 2.3029812054439405, + "grad_norm": 0.456020325422287, + "learning_rate": 1.3501316309461044e-06, + "loss": 0.0723, + "step": 7107 + }, + { + "epoch": 2.303305249513934, + "grad_norm": 0.4962615966796875, + "learning_rate": 1.3489363081855177e-06, + "loss": 0.0737, + "step": 7108 + }, + { + "epoch": 2.3036292935839273, + "grad_norm": 0.4618525207042694, + "learning_rate": 1.3477414322797828e-06, + "loss": 0.0738, + "step": 7109 + }, + { + "epoch": 2.3039533376539207, + "grad_norm": 0.514461100101471, + "learning_rate": 1.3465470033751393e-06, + "loss": 0.0778, + "step": 7110 + }, + { + "epoch": 2.3042773817239146, + "grad_norm": 0.47707101702690125, + "learning_rate": 1.3453530216177763e-06, + "loss": 0.0736, + "step": 7111 + }, + { + "epoch": 2.304601425793908, + "grad_norm": 0.45344793796539307, + "learning_rate": 1.3441594871538221e-06, + "loss": 0.0648, + "step": 7112 + }, + { + "epoch": 2.3049254698639015, + "grad_norm": 0.47207412123680115, + "learning_rate": 1.3429664001293557e-06, + "loss": 0.0752, + "step": 7113 + }, + { + "epoch": 2.305249513933895, + "grad_norm": 0.5023806095123291, + "learning_rate": 1.3417737606903985e-06, + "loss": 0.0731, + "step": 7114 + }, + { + "epoch": 2.3055735580038883, + "grad_norm": 0.4673008918762207, + "learning_rate": 1.3405815689829195e-06, + "loss": 0.0706, + "step": 7115 + }, + { + "epoch": 2.305897602073882, + "grad_norm": 0.4746420085430145, + "learning_rate": 1.3393898251528298e-06, + "loss": 0.0678, + "step": 7116 + }, + { + "epoch": 2.3062216461438756, + "grad_norm": 0.487405925989151, + "learning_rate": 1.3381985293459899e-06, + "loss": 0.0753, + "step": 7117 + }, + { + "epoch": 2.306545690213869, + "grad_norm": 0.49059462547302246, + "learning_rate": 1.3370076817081978e-06, + "loss": 0.0784, + "step": 7118 + }, + { + "epoch": 2.3068697342838624, + "grad_norm": 0.5020880699157715, + "learning_rate": 1.3358172823852077e-06, + "loss": 0.079, + "step": 7119 + }, + { + "epoch": 2.3071937783538563, + "grad_norm": 0.4636348783969879, + "learning_rate": 1.3346273315227094e-06, + "loss": 0.0702, + "step": 7120 + }, + { + "epoch": 2.3075178224238497, + "grad_norm": 0.5022696852684021, + "learning_rate": 1.3334378292663414e-06, + "loss": 0.0747, + "step": 7121 + }, + { + "epoch": 2.307841866493843, + "grad_norm": 0.5041515231132507, + "learning_rate": 1.3322487757616886e-06, + "loss": 0.0776, + "step": 7122 + }, + { + "epoch": 2.3081659105638366, + "grad_norm": 0.45286890864372253, + "learning_rate": 1.3310601711542787e-06, + "loss": 0.0664, + "step": 7123 + }, + { + "epoch": 2.3084899546338304, + "grad_norm": 0.4740108847618103, + "learning_rate": 1.3298720155895879e-06, + "loss": 0.0704, + "step": 7124 + }, + { + "epoch": 2.308813998703824, + "grad_norm": 0.4879896342754364, + "learning_rate": 1.3286843092130292e-06, + "loss": 0.0721, + "step": 7125 + }, + { + "epoch": 2.3091380427738173, + "grad_norm": 0.4944480061531067, + "learning_rate": 1.3274970521699731e-06, + "loss": 0.0775, + "step": 7126 + }, + { + "epoch": 2.3094620868438107, + "grad_norm": 0.47421595454216003, + "learning_rate": 1.326310244605722e-06, + "loss": 0.0689, + "step": 7127 + }, + { + "epoch": 2.309786130913804, + "grad_norm": 0.47668230533599854, + "learning_rate": 1.3251238866655363e-06, + "loss": 0.0697, + "step": 7128 + }, + { + "epoch": 2.310110174983798, + "grad_norm": 0.48620912432670593, + "learning_rate": 1.3239379784946093e-06, + "loss": 0.0712, + "step": 7129 + }, + { + "epoch": 2.3104342190537914, + "grad_norm": 0.5364415645599365, + "learning_rate": 1.322752520238087e-06, + "loss": 0.0757, + "step": 7130 + }, + { + "epoch": 2.310758263123785, + "grad_norm": 0.5277668833732605, + "learning_rate": 1.321567512041058e-06, + "loss": 0.0783, + "step": 7131 + }, + { + "epoch": 2.3110823071937783, + "grad_norm": 0.4718340337276459, + "learning_rate": 1.3203829540485552e-06, + "loss": 0.0689, + "step": 7132 + }, + { + "epoch": 2.3114063512637717, + "grad_norm": 0.4828222393989563, + "learning_rate": 1.3191988464055588e-06, + "loss": 0.0733, + "step": 7133 + }, + { + "epoch": 2.3117303953337656, + "grad_norm": 0.4246669113636017, + "learning_rate": 1.3180151892569882e-06, + "loss": 0.0604, + "step": 7134 + }, + { + "epoch": 2.312054439403759, + "grad_norm": 0.4550812542438507, + "learning_rate": 1.3168319827477166e-06, + "loss": 0.063, + "step": 7135 + }, + { + "epoch": 2.3123784834737524, + "grad_norm": 0.5037466883659363, + "learning_rate": 1.315649227022553e-06, + "loss": 0.0802, + "step": 7136 + }, + { + "epoch": 2.312702527543746, + "grad_norm": 0.43731677532196045, + "learning_rate": 1.3144669222262568e-06, + "loss": 0.0631, + "step": 7137 + }, + { + "epoch": 2.3130265716137393, + "grad_norm": 0.49847137928009033, + "learning_rate": 1.3132850685035304e-06, + "loss": 0.0774, + "step": 7138 + }, + { + "epoch": 2.313350615683733, + "grad_norm": 0.459532767534256, + "learning_rate": 1.3121036659990215e-06, + "loss": 0.0703, + "step": 7139 + }, + { + "epoch": 2.3136746597537265, + "grad_norm": 0.5136400461196899, + "learning_rate": 1.3109227148573227e-06, + "loss": 0.0796, + "step": 7140 + }, + { + "epoch": 2.31399870382372, + "grad_norm": 0.4838277995586395, + "learning_rate": 1.3097422152229715e-06, + "loss": 0.0728, + "step": 7141 + }, + { + "epoch": 2.3143227478937134, + "grad_norm": 0.5083605051040649, + "learning_rate": 1.3085621672404474e-06, + "loss": 0.0819, + "step": 7142 + }, + { + "epoch": 2.314646791963707, + "grad_norm": 0.475130558013916, + "learning_rate": 1.3073825710541787e-06, + "loss": 0.0722, + "step": 7143 + }, + { + "epoch": 2.3149708360337007, + "grad_norm": 0.5326772332191467, + "learning_rate": 1.3062034268085355e-06, + "loss": 0.0801, + "step": 7144 + }, + { + "epoch": 2.315294880103694, + "grad_norm": 0.497607558965683, + "learning_rate": 1.305024734647834e-06, + "loss": 0.0699, + "step": 7145 + }, + { + "epoch": 2.3156189241736875, + "grad_norm": 0.47802019119262695, + "learning_rate": 1.303846494716335e-06, + "loss": 0.0678, + "step": 7146 + }, + { + "epoch": 2.315942968243681, + "grad_norm": 0.4831261932849884, + "learning_rate": 1.3026687071582432e-06, + "loss": 0.0768, + "step": 7147 + }, + { + "epoch": 2.316267012313675, + "grad_norm": 0.5034778118133545, + "learning_rate": 1.3014913721177109e-06, + "loss": 0.0724, + "step": 7148 + }, + { + "epoch": 2.3165910563836682, + "grad_norm": 0.49378588795661926, + "learning_rate": 1.300314489738827e-06, + "loss": 0.0747, + "step": 7149 + }, + { + "epoch": 2.3169151004536617, + "grad_norm": 0.4527478814125061, + "learning_rate": 1.2991380601656366e-06, + "loss": 0.0662, + "step": 7150 + }, + { + "epoch": 2.317239144523655, + "grad_norm": 0.524268388748169, + "learning_rate": 1.2979620835421192e-06, + "loss": 0.073, + "step": 7151 + }, + { + "epoch": 2.317563188593649, + "grad_norm": 0.47499552369117737, + "learning_rate": 1.2967865600122042e-06, + "loss": 0.0692, + "step": 7152 + }, + { + "epoch": 2.3178872326636424, + "grad_norm": 0.4901551306247711, + "learning_rate": 1.2956114897197641e-06, + "loss": 0.0709, + "step": 7153 + }, + { + "epoch": 2.318211276733636, + "grad_norm": 0.5022875666618347, + "learning_rate": 1.294436872808617e-06, + "loss": 0.077, + "step": 7154 + }, + { + "epoch": 2.3185353208036292, + "grad_norm": 0.5198050141334534, + "learning_rate": 1.2932627094225253e-06, + "loss": 0.0775, + "step": 7155 + }, + { + "epoch": 2.3188593648736227, + "grad_norm": 0.46014443039894104, + "learning_rate": 1.2920889997051906e-06, + "loss": 0.0681, + "step": 7156 + }, + { + "epoch": 2.3191834089436165, + "grad_norm": 0.48765018582344055, + "learning_rate": 1.2909157438002706e-06, + "loss": 0.0738, + "step": 7157 + }, + { + "epoch": 2.31950745301361, + "grad_norm": 0.495532751083374, + "learning_rate": 1.2897429418513536e-06, + "loss": 0.0756, + "step": 7158 + }, + { + "epoch": 2.3198314970836034, + "grad_norm": 0.5322849750518799, + "learning_rate": 1.288570594001985e-06, + "loss": 0.0773, + "step": 7159 + }, + { + "epoch": 2.320155541153597, + "grad_norm": 0.5115591883659363, + "learning_rate": 1.2873987003956452e-06, + "loss": 0.0796, + "step": 7160 + }, + { + "epoch": 2.32047958522359, + "grad_norm": 0.4776346981525421, + "learning_rate": 1.2862272611757637e-06, + "loss": 0.069, + "step": 7161 + }, + { + "epoch": 2.320803629293584, + "grad_norm": 0.4989016056060791, + "learning_rate": 1.2850562764857132e-06, + "loss": 0.0713, + "step": 7162 + }, + { + "epoch": 2.3211276733635775, + "grad_norm": 0.4744539260864258, + "learning_rate": 1.283885746468811e-06, + "loss": 0.074, + "step": 7163 + }, + { + "epoch": 2.321451717433571, + "grad_norm": 0.4889492690563202, + "learning_rate": 1.2827156712683204e-06, + "loss": 0.0731, + "step": 7164 + }, + { + "epoch": 2.3217757615035644, + "grad_norm": 0.48890548944473267, + "learning_rate": 1.2815460510274424e-06, + "loss": 0.073, + "step": 7165 + }, + { + "epoch": 2.3220998055735578, + "grad_norm": 0.5087918639183044, + "learning_rate": 1.2803768858893333e-06, + "loss": 0.0793, + "step": 7166 + }, + { + "epoch": 2.3224238496435516, + "grad_norm": 0.49258387088775635, + "learning_rate": 1.2792081759970832e-06, + "loss": 0.0757, + "step": 7167 + }, + { + "epoch": 2.322747893713545, + "grad_norm": 0.4936928451061249, + "learning_rate": 1.2780399214937323e-06, + "loss": 0.0744, + "step": 7168 + }, + { + "epoch": 2.3230719377835385, + "grad_norm": 0.44905778765678406, + "learning_rate": 1.2768721225222635e-06, + "loss": 0.0667, + "step": 7169 + }, + { + "epoch": 2.323395981853532, + "grad_norm": 0.49809473752975464, + "learning_rate": 1.2757047792256045e-06, + "loss": 0.0762, + "step": 7170 + }, + { + "epoch": 2.323720025923526, + "grad_norm": 0.4669495224952698, + "learning_rate": 1.274537891746626e-06, + "loss": 0.0694, + "step": 7171 + }, + { + "epoch": 2.324044069993519, + "grad_norm": 0.47357267141342163, + "learning_rate": 1.273371460228146e-06, + "loss": 0.068, + "step": 7172 + }, + { + "epoch": 2.3243681140635126, + "grad_norm": 0.503506600856781, + "learning_rate": 1.2722054848129217e-06, + "loss": 0.0742, + "step": 7173 + }, + { + "epoch": 2.324692158133506, + "grad_norm": 0.5232956409454346, + "learning_rate": 1.2710399656436578e-06, + "loss": 0.0825, + "step": 7174 + }, + { + "epoch": 2.3250162022035, + "grad_norm": 0.4886522889137268, + "learning_rate": 1.269874902863003e-06, + "loss": 0.0708, + "step": 7175 + }, + { + "epoch": 2.3253402462734933, + "grad_norm": 0.4887654781341553, + "learning_rate": 1.2687102966135501e-06, + "loss": 0.075, + "step": 7176 + }, + { + "epoch": 2.3256642903434868, + "grad_norm": 0.4615480899810791, + "learning_rate": 1.2675461470378348e-06, + "loss": 0.067, + "step": 7177 + }, + { + "epoch": 2.32598833441348, + "grad_norm": 0.5080332159996033, + "learning_rate": 1.2663824542783375e-06, + "loss": 0.0723, + "step": 7178 + }, + { + "epoch": 2.3263123784834736, + "grad_norm": 0.5074442028999329, + "learning_rate": 1.2652192184774858e-06, + "loss": 0.0771, + "step": 7179 + }, + { + "epoch": 2.3266364225534675, + "grad_norm": 0.45548608899116516, + "learning_rate": 1.2640564397776433e-06, + "loss": 0.0698, + "step": 7180 + }, + { + "epoch": 2.326960466623461, + "grad_norm": 0.45758867263793945, + "learning_rate": 1.262894118321129e-06, + "loss": 0.0692, + "step": 7181 + }, + { + "epoch": 2.3272845106934543, + "grad_norm": 0.5235854983329773, + "learning_rate": 1.2617322542501947e-06, + "loss": 0.0762, + "step": 7182 + }, + { + "epoch": 2.3276085547634477, + "grad_norm": 0.4564163386821747, + "learning_rate": 1.2605708477070439e-06, + "loss": 0.0714, + "step": 7183 + }, + { + "epoch": 2.327932598833441, + "grad_norm": 0.5101970434188843, + "learning_rate": 1.25940989883382e-06, + "loss": 0.079, + "step": 7184 + }, + { + "epoch": 2.328256642903435, + "grad_norm": 0.501133918762207, + "learning_rate": 1.2582494077726131e-06, + "loss": 0.0759, + "step": 7185 + }, + { + "epoch": 2.3285806869734285, + "grad_norm": 0.4815445840358734, + "learning_rate": 1.2570893746654579e-06, + "loss": 0.0756, + "step": 7186 + }, + { + "epoch": 2.328904731043422, + "grad_norm": 0.4634091258049011, + "learning_rate": 1.2559297996543252e-06, + "loss": 0.0689, + "step": 7187 + }, + { + "epoch": 2.3292287751134153, + "grad_norm": 0.4817907512187958, + "learning_rate": 1.254770682881143e-06, + "loss": 0.0729, + "step": 7188 + }, + { + "epoch": 2.3295528191834087, + "grad_norm": 0.48379072546958923, + "learning_rate": 1.2536120244877692e-06, + "loss": 0.0719, + "step": 7189 + }, + { + "epoch": 2.3298768632534026, + "grad_norm": 0.4735885560512543, + "learning_rate": 1.252453824616019e-06, + "loss": 0.0724, + "step": 7190 + }, + { + "epoch": 2.330200907323396, + "grad_norm": 0.48822975158691406, + "learning_rate": 1.2512960834076404e-06, + "loss": 0.075, + "step": 7191 + }, + { + "epoch": 2.3305249513933894, + "grad_norm": 0.4866192042827606, + "learning_rate": 1.2501388010043302e-06, + "loss": 0.073, + "step": 7192 + }, + { + "epoch": 2.330848995463383, + "grad_norm": 0.47962766885757446, + "learning_rate": 1.2489819775477302e-06, + "loss": 0.0706, + "step": 7193 + }, + { + "epoch": 2.3311730395333763, + "grad_norm": 0.5421980023384094, + "learning_rate": 1.2478256131794225e-06, + "loss": 0.0795, + "step": 7194 + }, + { + "epoch": 2.33149708360337, + "grad_norm": 0.47424715757369995, + "learning_rate": 1.2466697080409378e-06, + "loss": 0.0719, + "step": 7195 + }, + { + "epoch": 2.3318211276733636, + "grad_norm": 0.47859951853752136, + "learning_rate": 1.2455142622737448e-06, + "loss": 0.0778, + "step": 7196 + }, + { + "epoch": 2.332145171743357, + "grad_norm": 0.47769349813461304, + "learning_rate": 1.2443592760192596e-06, + "loss": 0.0753, + "step": 7197 + }, + { + "epoch": 2.3324692158133504, + "grad_norm": 0.4621089994907379, + "learning_rate": 1.2432047494188415e-06, + "loss": 0.0679, + "step": 7198 + }, + { + "epoch": 2.3327932598833443, + "grad_norm": 0.4695514142513275, + "learning_rate": 1.2420506826137929e-06, + "loss": 0.0706, + "step": 7199 + }, + { + "epoch": 2.3331173039533377, + "grad_norm": 0.46257010102272034, + "learning_rate": 1.240897075745362e-06, + "loss": 0.0657, + "step": 7200 + }, + { + "epoch": 2.333441348023331, + "grad_norm": 0.5223076343536377, + "learning_rate": 1.2397439289547375e-06, + "loss": 0.0798, + "step": 7201 + }, + { + "epoch": 2.3337653920933246, + "grad_norm": 0.49697694182395935, + "learning_rate": 1.2385912423830538e-06, + "loss": 0.0708, + "step": 7202 + }, + { + "epoch": 2.3340894361633184, + "grad_norm": 0.47291916608810425, + "learning_rate": 1.2374390161713906e-06, + "loss": 0.0723, + "step": 7203 + }, + { + "epoch": 2.334413480233312, + "grad_norm": 0.4870017468929291, + "learning_rate": 1.2362872504607659e-06, + "loss": 0.0722, + "step": 7204 + }, + { + "epoch": 2.3347375243033053, + "grad_norm": 0.508466362953186, + "learning_rate": 1.2351359453921463e-06, + "loss": 0.0757, + "step": 7205 + }, + { + "epoch": 2.3350615683732987, + "grad_norm": 0.4754007160663605, + "learning_rate": 1.2339851011064403e-06, + "loss": 0.0714, + "step": 7206 + }, + { + "epoch": 2.335385612443292, + "grad_norm": 0.4860064387321472, + "learning_rate": 1.2328347177444993e-06, + "loss": 0.0749, + "step": 7207 + }, + { + "epoch": 2.335709656513286, + "grad_norm": 0.5065357685089111, + "learning_rate": 1.2316847954471222e-06, + "loss": 0.068, + "step": 7208 + }, + { + "epoch": 2.3360337005832794, + "grad_norm": 0.514007031917572, + "learning_rate": 1.230535334355043e-06, + "loss": 0.0761, + "step": 7209 + }, + { + "epoch": 2.336357744653273, + "grad_norm": 0.48480892181396484, + "learning_rate": 1.2293863346089502e-06, + "loss": 0.0676, + "step": 7210 + }, + { + "epoch": 2.3366817887232663, + "grad_norm": 0.49941307306289673, + "learning_rate": 1.2282377963494647e-06, + "loss": 0.0734, + "step": 7211 + }, + { + "epoch": 2.3370058327932597, + "grad_norm": 0.4956761598587036, + "learning_rate": 1.2270897197171628e-06, + "loss": 0.0735, + "step": 7212 + }, + { + "epoch": 2.3373298768632536, + "grad_norm": 0.5128060579299927, + "learning_rate": 1.2259421048525516e-06, + "loss": 0.0753, + "step": 7213 + }, + { + "epoch": 2.337653920933247, + "grad_norm": 0.46151235699653625, + "learning_rate": 1.2247949518960938e-06, + "loss": 0.0676, + "step": 7214 + }, + { + "epoch": 2.3379779650032404, + "grad_norm": 0.5013076663017273, + "learning_rate": 1.2236482609881857e-06, + "loss": 0.0733, + "step": 7215 + }, + { + "epoch": 2.338302009073234, + "grad_norm": 0.47443968057632446, + "learning_rate": 1.2225020322691721e-06, + "loss": 0.0695, + "step": 7216 + }, + { + "epoch": 2.3386260531432272, + "grad_norm": 0.4645536541938782, + "learning_rate": 1.2213562658793427e-06, + "loss": 0.0695, + "step": 7217 + }, + { + "epoch": 2.338950097213221, + "grad_norm": 0.4841172695159912, + "learning_rate": 1.220210961958923e-06, + "loss": 0.0673, + "step": 7218 + }, + { + "epoch": 2.3392741412832145, + "grad_norm": 0.4824322760105133, + "learning_rate": 1.2190661206480935e-06, + "loss": 0.0712, + "step": 7219 + }, + { + "epoch": 2.339598185353208, + "grad_norm": 0.4811258912086487, + "learning_rate": 1.217921742086967e-06, + "loss": 0.0704, + "step": 7220 + }, + { + "epoch": 2.3399222294232014, + "grad_norm": 0.4667803943157196, + "learning_rate": 1.2167778264156066e-06, + "loss": 0.0675, + "step": 7221 + }, + { + "epoch": 2.3402462734931953, + "grad_norm": 0.45968520641326904, + "learning_rate": 1.215634373774015e-06, + "loss": 0.0703, + "step": 7222 + }, + { + "epoch": 2.3405703175631887, + "grad_norm": 0.5023238062858582, + "learning_rate": 1.2144913843021405e-06, + "loss": 0.0745, + "step": 7223 + }, + { + "epoch": 2.340894361633182, + "grad_norm": 0.4562709629535675, + "learning_rate": 1.2133488581398745e-06, + "loss": 0.0645, + "step": 7224 + }, + { + "epoch": 2.3412184057031755, + "grad_norm": 0.517026960849762, + "learning_rate": 1.2122067954270505e-06, + "loss": 0.075, + "step": 7225 + }, + { + "epoch": 2.3415424497731694, + "grad_norm": 0.45475083589553833, + "learning_rate": 1.2110651963034475e-06, + "loss": 0.0697, + "step": 7226 + }, + { + "epoch": 2.341866493843163, + "grad_norm": 0.527498722076416, + "learning_rate": 1.2099240609087832e-06, + "loss": 0.08, + "step": 7227 + }, + { + "epoch": 2.3421905379131562, + "grad_norm": 0.48453056812286377, + "learning_rate": 1.2087833893827227e-06, + "loss": 0.0697, + "step": 7228 + }, + { + "epoch": 2.3425145819831497, + "grad_norm": 0.5300154089927673, + "learning_rate": 1.2076431818648744e-06, + "loss": 0.0837, + "step": 7229 + }, + { + "epoch": 2.342838626053143, + "grad_norm": 0.45958855748176575, + "learning_rate": 1.206503438494787e-06, + "loss": 0.0695, + "step": 7230 + }, + { + "epoch": 2.343162670123137, + "grad_norm": 0.4899384379386902, + "learning_rate": 1.2053641594119554e-06, + "loss": 0.0711, + "step": 7231 + }, + { + "epoch": 2.3434867141931304, + "grad_norm": 0.47130003571510315, + "learning_rate": 1.204225344755815e-06, + "loss": 0.0715, + "step": 7232 + }, + { + "epoch": 2.343810758263124, + "grad_norm": 0.5333791971206665, + "learning_rate": 1.203086994665747e-06, + "loss": 0.083, + "step": 7233 + }, + { + "epoch": 2.344134802333117, + "grad_norm": 0.4682372212409973, + "learning_rate": 1.2019491092810754e-06, + "loss": 0.0708, + "step": 7234 + }, + { + "epoch": 2.3444588464031106, + "grad_norm": 0.44578856229782104, + "learning_rate": 1.200811688741062e-06, + "loss": 0.0649, + "step": 7235 + }, + { + "epoch": 2.3447828904731045, + "grad_norm": 0.4772430658340454, + "learning_rate": 1.1996747331849211e-06, + "loss": 0.067, + "step": 7236 + }, + { + "epoch": 2.345106934543098, + "grad_norm": 0.48849186301231384, + "learning_rate": 1.1985382427518022e-06, + "loss": 0.0751, + "step": 7237 + }, + { + "epoch": 2.3454309786130914, + "grad_norm": 0.43585848808288574, + "learning_rate": 1.1974022175808014e-06, + "loss": 0.065, + "step": 7238 + }, + { + "epoch": 2.345755022683085, + "grad_norm": 0.4897010326385498, + "learning_rate": 1.1962666578109584e-06, + "loss": 0.072, + "step": 7239 + }, + { + "epoch": 2.346079066753078, + "grad_norm": 0.4719051718711853, + "learning_rate": 1.1951315635812506e-06, + "loss": 0.0684, + "step": 7240 + }, + { + "epoch": 2.346403110823072, + "grad_norm": 0.48736074566841125, + "learning_rate": 1.193996935030608e-06, + "loss": 0.071, + "step": 7241 + }, + { + "epoch": 2.3467271548930655, + "grad_norm": 0.5279738306999207, + "learning_rate": 1.1928627722978931e-06, + "loss": 0.0776, + "step": 7242 + }, + { + "epoch": 2.347051198963059, + "grad_norm": 0.472042441368103, + "learning_rate": 1.1917290755219212e-06, + "loss": 0.067, + "step": 7243 + }, + { + "epoch": 2.3473752430330523, + "grad_norm": 0.5123307704925537, + "learning_rate": 1.190595844841441e-06, + "loss": 0.0765, + "step": 7244 + }, + { + "epoch": 2.347699287103046, + "grad_norm": 0.49297240376472473, + "learning_rate": 1.1894630803951545e-06, + "loss": 0.0745, + "step": 7245 + }, + { + "epoch": 2.3480233311730396, + "grad_norm": 0.43049800395965576, + "learning_rate": 1.1883307823216972e-06, + "loss": 0.0657, + "step": 7246 + }, + { + "epoch": 2.348347375243033, + "grad_norm": 0.4899204671382904, + "learning_rate": 1.1871989507596516e-06, + "loss": 0.0754, + "step": 7247 + }, + { + "epoch": 2.3486714193130265, + "grad_norm": 0.47282499074935913, + "learning_rate": 1.1860675858475452e-06, + "loss": 0.0722, + "step": 7248 + }, + { + "epoch": 2.34899546338302, + "grad_norm": 0.4940069317817688, + "learning_rate": 1.1849366877238416e-06, + "loss": 0.0732, + "step": 7249 + }, + { + "epoch": 2.3493195074530138, + "grad_norm": 0.5003806948661804, + "learning_rate": 1.183806256526958e-06, + "loss": 0.0739, + "step": 7250 + }, + { + "epoch": 2.349643551523007, + "grad_norm": 0.47378280758857727, + "learning_rate": 1.1826762923952435e-06, + "loss": 0.0649, + "step": 7251 + }, + { + "epoch": 2.3499675955930006, + "grad_norm": 0.48774582147598267, + "learning_rate": 1.1815467954669956e-06, + "loss": 0.0698, + "step": 7252 + }, + { + "epoch": 2.350291639662994, + "grad_norm": 0.5039346218109131, + "learning_rate": 1.1804177658804549e-06, + "loss": 0.0737, + "step": 7253 + }, + { + "epoch": 2.350615683732988, + "grad_norm": 0.47579720616340637, + "learning_rate": 1.1792892037738035e-06, + "loss": 0.0662, + "step": 7254 + }, + { + "epoch": 2.3509397278029813, + "grad_norm": 0.474889874458313, + "learning_rate": 1.1781611092851664e-06, + "loss": 0.0677, + "step": 7255 + }, + { + "epoch": 2.3512637718729748, + "grad_norm": 0.4891919493675232, + "learning_rate": 1.1770334825526103e-06, + "loss": 0.0712, + "step": 7256 + }, + { + "epoch": 2.351587815942968, + "grad_norm": 0.5071465373039246, + "learning_rate": 1.1759063237141477e-06, + "loss": 0.0736, + "step": 7257 + }, + { + "epoch": 2.3519118600129616, + "grad_norm": 0.4828701615333557, + "learning_rate": 1.1747796329077315e-06, + "loss": 0.0761, + "step": 7258 + }, + { + "epoch": 2.3522359040829555, + "grad_norm": 0.517517626285553, + "learning_rate": 1.1736534102712566e-06, + "loss": 0.0747, + "step": 7259 + }, + { + "epoch": 2.352559948152949, + "grad_norm": 0.4479466676712036, + "learning_rate": 1.172527655942562e-06, + "loss": 0.0672, + "step": 7260 + }, + { + "epoch": 2.3528839922229423, + "grad_norm": 0.4755881130695343, + "learning_rate": 1.1714023700594296e-06, + "loss": 0.0688, + "step": 7261 + }, + { + "epoch": 2.3532080362929357, + "grad_norm": 0.4864247441291809, + "learning_rate": 1.1702775527595833e-06, + "loss": 0.0754, + "step": 7262 + }, + { + "epoch": 2.353532080362929, + "grad_norm": 0.47295811772346497, + "learning_rate": 1.1691532041806919e-06, + "loss": 0.0725, + "step": 7263 + }, + { + "epoch": 2.353856124432923, + "grad_norm": 0.49665603041648865, + "learning_rate": 1.1680293244603592e-06, + "loss": 0.0745, + "step": 7264 + }, + { + "epoch": 2.3541801685029164, + "grad_norm": 0.5112475156784058, + "learning_rate": 1.1669059137361444e-06, + "loss": 0.0797, + "step": 7265 + }, + { + "epoch": 2.35450421257291, + "grad_norm": 0.5126915574073792, + "learning_rate": 1.1657829721455349e-06, + "loss": 0.0762, + "step": 7266 + }, + { + "epoch": 2.3548282566429033, + "grad_norm": 0.5092496275901794, + "learning_rate": 1.1646604998259747e-06, + "loss": 0.0729, + "step": 7267 + }, + { + "epoch": 2.3551523007128967, + "grad_norm": 0.46421167254447937, + "learning_rate": 1.1635384969148395e-06, + "loss": 0.0711, + "step": 7268 + }, + { + "epoch": 2.3554763447828906, + "grad_norm": 0.47837281227111816, + "learning_rate": 1.162416963549452e-06, + "loss": 0.0694, + "step": 7269 + }, + { + "epoch": 2.355800388852884, + "grad_norm": 0.48247772455215454, + "learning_rate": 1.161295899867077e-06, + "loss": 0.0707, + "step": 7270 + }, + { + "epoch": 2.3561244329228774, + "grad_norm": 0.48974937200546265, + "learning_rate": 1.160175306004923e-06, + "loss": 0.0711, + "step": 7271 + }, + { + "epoch": 2.356448476992871, + "grad_norm": 0.5030304193496704, + "learning_rate": 1.1590551821001406e-06, + "loss": 0.0754, + "step": 7272 + }, + { + "epoch": 2.3567725210628647, + "grad_norm": 0.4614565074443817, + "learning_rate": 1.1579355282898175e-06, + "loss": 0.0662, + "step": 7273 + }, + { + "epoch": 2.357096565132858, + "grad_norm": 0.46911752223968506, + "learning_rate": 1.1568163447109942e-06, + "loss": 0.0711, + "step": 7274 + }, + { + "epoch": 2.3574206092028516, + "grad_norm": 0.463739812374115, + "learning_rate": 1.1556976315006445e-06, + "loss": 0.0708, + "step": 7275 + }, + { + "epoch": 2.357744653272845, + "grad_norm": 0.5395808815956116, + "learning_rate": 1.154579388795689e-06, + "loss": 0.0782, + "step": 7276 + }, + { + "epoch": 2.358068697342839, + "grad_norm": 0.4603893458843231, + "learning_rate": 1.1534616167329899e-06, + "loss": 0.0702, + "step": 7277 + }, + { + "epoch": 2.3583927414128323, + "grad_norm": 0.48242759704589844, + "learning_rate": 1.1523443154493509e-06, + "loss": 0.0726, + "step": 7278 + }, + { + "epoch": 2.3587167854828257, + "grad_norm": 0.5345384478569031, + "learning_rate": 1.1512274850815197e-06, + "loss": 0.0765, + "step": 7279 + }, + { + "epoch": 2.359040829552819, + "grad_norm": 0.48294004797935486, + "learning_rate": 1.1501111257661856e-06, + "loss": 0.076, + "step": 7280 + }, + { + "epoch": 2.3593648736228126, + "grad_norm": 0.5108845233917236, + "learning_rate": 1.148995237639981e-06, + "loss": 0.0756, + "step": 7281 + }, + { + "epoch": 2.3596889176928064, + "grad_norm": 0.49022331833839417, + "learning_rate": 1.1478798208394775e-06, + "loss": 0.0713, + "step": 7282 + }, + { + "epoch": 2.3600129617628, + "grad_norm": 0.5263938903808594, + "learning_rate": 1.146764875501193e-06, + "loss": 0.0773, + "step": 7283 + }, + { + "epoch": 2.3603370058327933, + "grad_norm": 0.4493767023086548, + "learning_rate": 1.145650401761585e-06, + "loss": 0.0687, + "step": 7284 + }, + { + "epoch": 2.3606610499027867, + "grad_norm": 0.4451209306716919, + "learning_rate": 1.1445363997570546e-06, + "loss": 0.067, + "step": 7285 + }, + { + "epoch": 2.36098509397278, + "grad_norm": 0.48385268449783325, + "learning_rate": 1.1434228696239452e-06, + "loss": 0.0726, + "step": 7286 + }, + { + "epoch": 2.361309138042774, + "grad_norm": 0.5328781604766846, + "learning_rate": 1.1423098114985437e-06, + "loss": 0.0794, + "step": 7287 + }, + { + "epoch": 2.3616331821127674, + "grad_norm": 0.46778643131256104, + "learning_rate": 1.1411972255170727e-06, + "loss": 0.0648, + "step": 7288 + }, + { + "epoch": 2.361957226182761, + "grad_norm": 0.49069154262542725, + "learning_rate": 1.1400851118157086e-06, + "loss": 0.0691, + "step": 7289 + }, + { + "epoch": 2.3622812702527543, + "grad_norm": 0.4818800091743469, + "learning_rate": 1.1389734705305583e-06, + "loss": 0.0677, + "step": 7290 + }, + { + "epoch": 2.3626053143227477, + "grad_norm": 0.49379661679267883, + "learning_rate": 1.137862301797677e-06, + "loss": 0.077, + "step": 7291 + }, + { + "epoch": 2.3629293583927415, + "grad_norm": 0.47791507840156555, + "learning_rate": 1.136751605753062e-06, + "loss": 0.0728, + "step": 7292 + }, + { + "epoch": 2.363253402462735, + "grad_norm": 0.48617789149284363, + "learning_rate": 1.1356413825326518e-06, + "loss": 0.0687, + "step": 7293 + }, + { + "epoch": 2.3635774465327284, + "grad_norm": 0.4953933656215668, + "learning_rate": 1.134531632272327e-06, + "loss": 0.0738, + "step": 7294 + }, + { + "epoch": 2.363901490602722, + "grad_norm": 0.5016205310821533, + "learning_rate": 1.1334223551079077e-06, + "loss": 0.0768, + "step": 7295 + }, + { + "epoch": 2.3642255346727157, + "grad_norm": 0.4939001202583313, + "learning_rate": 1.132313551175163e-06, + "loss": 0.0764, + "step": 7296 + }, + { + "epoch": 2.364549578742709, + "grad_norm": 0.5102739334106445, + "learning_rate": 1.131205220609795e-06, + "loss": 0.0762, + "step": 7297 + }, + { + "epoch": 2.3648736228127025, + "grad_norm": 0.520470917224884, + "learning_rate": 1.1300973635474582e-06, + "loss": 0.0735, + "step": 7298 + }, + { + "epoch": 2.365197666882696, + "grad_norm": 0.4633205235004425, + "learning_rate": 1.1289899801237392e-06, + "loss": 0.0688, + "step": 7299 + }, + { + "epoch": 2.3655217109526894, + "grad_norm": 0.4387751519680023, + "learning_rate": 1.1278830704741716e-06, + "loss": 0.0624, + "step": 7300 + }, + { + "epoch": 2.3658457550226832, + "grad_norm": 0.4592018723487854, + "learning_rate": 1.1267766347342318e-06, + "loss": 0.0683, + "step": 7301 + }, + { + "epoch": 2.3661697990926767, + "grad_norm": 0.4675895571708679, + "learning_rate": 1.1256706730393363e-06, + "loss": 0.0678, + "step": 7302 + }, + { + "epoch": 2.36649384316267, + "grad_norm": 0.5069913268089294, + "learning_rate": 1.1245651855248451e-06, + "loss": 0.0771, + "step": 7303 + }, + { + "epoch": 2.3668178872326635, + "grad_norm": 0.4496638774871826, + "learning_rate": 1.1234601723260552e-06, + "loss": 0.0675, + "step": 7304 + }, + { + "epoch": 2.3671419313026574, + "grad_norm": 0.4736442565917969, + "learning_rate": 1.1223556335782153e-06, + "loss": 0.0722, + "step": 7305 + }, + { + "epoch": 2.367465975372651, + "grad_norm": 0.4914408028125763, + "learning_rate": 1.121251569416506e-06, + "loss": 0.0746, + "step": 7306 + }, + { + "epoch": 2.3677900194426442, + "grad_norm": 0.5002232193946838, + "learning_rate": 1.120147979976055e-06, + "loss": 0.0766, + "step": 7307 + }, + { + "epoch": 2.3681140635126376, + "grad_norm": 0.47254279255867004, + "learning_rate": 1.1190448653919323e-06, + "loss": 0.0734, + "step": 7308 + }, + { + "epoch": 2.368438107582631, + "grad_norm": 0.45834389328956604, + "learning_rate": 1.1179422257991469e-06, + "loss": 0.0701, + "step": 7309 + }, + { + "epoch": 2.368762151652625, + "grad_norm": 0.4369758069515228, + "learning_rate": 1.1168400613326519e-06, + "loss": 0.0672, + "step": 7310 + }, + { + "epoch": 2.3690861957226184, + "grad_norm": 0.4970327913761139, + "learning_rate": 1.1157383721273413e-06, + "loss": 0.074, + "step": 7311 + }, + { + "epoch": 2.369410239792612, + "grad_norm": 0.4383191764354706, + "learning_rate": 1.1146371583180532e-06, + "loss": 0.0685, + "step": 7312 + }, + { + "epoch": 2.369734283862605, + "grad_norm": 0.4854133129119873, + "learning_rate": 1.1135364200395615e-06, + "loss": 0.0707, + "step": 7313 + }, + { + "epoch": 2.3700583279325986, + "grad_norm": 0.4635353684425354, + "learning_rate": 1.112436157426589e-06, + "loss": 0.0681, + "step": 7314 + }, + { + "epoch": 2.3703823720025925, + "grad_norm": 0.4272409975528717, + "learning_rate": 1.111336370613796e-06, + "loss": 0.0595, + "step": 7315 + }, + { + "epoch": 2.370706416072586, + "grad_norm": 0.4680071175098419, + "learning_rate": 1.1102370597357858e-06, + "loss": 0.0714, + "step": 7316 + }, + { + "epoch": 2.3710304601425793, + "grad_norm": 0.49491339921951294, + "learning_rate": 1.1091382249271037e-06, + "loss": 0.0736, + "step": 7317 + }, + { + "epoch": 2.3713545042125728, + "grad_norm": 0.4780588150024414, + "learning_rate": 1.108039866322238e-06, + "loss": 0.0718, + "step": 7318 + }, + { + "epoch": 2.371678548282566, + "grad_norm": 0.5072447657585144, + "learning_rate": 1.1069419840556128e-06, + "loss": 0.0769, + "step": 7319 + }, + { + "epoch": 2.37200259235256, + "grad_norm": 0.486572802066803, + "learning_rate": 1.105844578261604e-06, + "loss": 0.0731, + "step": 7320 + }, + { + "epoch": 2.3723266364225535, + "grad_norm": 0.4918803572654724, + "learning_rate": 1.1047476490745191e-06, + "loss": 0.0775, + "step": 7321 + }, + { + "epoch": 2.372650680492547, + "grad_norm": 0.48813650012016296, + "learning_rate": 1.1036511966286123e-06, + "loss": 0.0724, + "step": 7322 + }, + { + "epoch": 2.3729747245625403, + "grad_norm": 0.4994529187679291, + "learning_rate": 1.1025552210580803e-06, + "loss": 0.0773, + "step": 7323 + }, + { + "epoch": 2.373298768632534, + "grad_norm": 0.5022187232971191, + "learning_rate": 1.1014597224970586e-06, + "loss": 0.0747, + "step": 7324 + }, + { + "epoch": 2.3736228127025276, + "grad_norm": 0.48002323508262634, + "learning_rate": 1.1003647010796275e-06, + "loss": 0.073, + "step": 7325 + }, + { + "epoch": 2.373946856772521, + "grad_norm": 0.4666433334350586, + "learning_rate": 1.099270156939803e-06, + "loss": 0.0665, + "step": 7326 + }, + { + "epoch": 2.3742709008425145, + "grad_norm": 0.4680749177932739, + "learning_rate": 1.0981760902115518e-06, + "loss": 0.068, + "step": 7327 + }, + { + "epoch": 2.3745949449125083, + "grad_norm": 0.4894528090953827, + "learning_rate": 1.0970825010287716e-06, + "loss": 0.0742, + "step": 7328 + }, + { + "epoch": 2.3749189889825018, + "grad_norm": 0.48894360661506653, + "learning_rate": 1.0959893895253132e-06, + "loss": 0.0713, + "step": 7329 + }, + { + "epoch": 2.375243033052495, + "grad_norm": 0.49191632866859436, + "learning_rate": 1.0948967558349581e-06, + "loss": 0.0763, + "step": 7330 + }, + { + "epoch": 2.3755670771224886, + "grad_norm": 0.48632246255874634, + "learning_rate": 1.0938046000914365e-06, + "loss": 0.0723, + "step": 7331 + }, + { + "epoch": 2.375891121192482, + "grad_norm": 0.4645938575267792, + "learning_rate": 1.0927129224284166e-06, + "loss": 0.0713, + "step": 7332 + }, + { + "epoch": 2.376215165262476, + "grad_norm": 0.4937532842159271, + "learning_rate": 1.091621722979509e-06, + "loss": 0.0726, + "step": 7333 + }, + { + "epoch": 2.3765392093324693, + "grad_norm": 0.4999132454395294, + "learning_rate": 1.0905310018782682e-06, + "loss": 0.0781, + "step": 7334 + }, + { + "epoch": 2.3768632534024627, + "grad_norm": 0.47685500979423523, + "learning_rate": 1.0894407592581835e-06, + "loss": 0.0714, + "step": 7335 + }, + { + "epoch": 2.377187297472456, + "grad_norm": 0.4990130662918091, + "learning_rate": 1.0883509952526956e-06, + "loss": 0.0758, + "step": 7336 + }, + { + "epoch": 2.3775113415424496, + "grad_norm": 0.507454514503479, + "learning_rate": 1.0872617099951765e-06, + "loss": 0.0792, + "step": 7337 + }, + { + "epoch": 2.3778353856124435, + "grad_norm": 0.4756607115268707, + "learning_rate": 1.0861729036189462e-06, + "loss": 0.0714, + "step": 7338 + }, + { + "epoch": 2.378159429682437, + "grad_norm": 0.4667145311832428, + "learning_rate": 1.0850845762572638e-06, + "loss": 0.0664, + "step": 7339 + }, + { + "epoch": 2.3784834737524303, + "grad_norm": 0.4863603711128235, + "learning_rate": 1.0839967280433294e-06, + "loss": 0.0776, + "step": 7340 + }, + { + "epoch": 2.3788075178224237, + "grad_norm": 0.490573525428772, + "learning_rate": 1.0829093591102858e-06, + "loss": 0.0795, + "step": 7341 + }, + { + "epoch": 2.379131561892417, + "grad_norm": 0.48397448658943176, + "learning_rate": 1.0818224695912178e-06, + "loss": 0.0702, + "step": 7342 + }, + { + "epoch": 2.379455605962411, + "grad_norm": 0.47108104825019836, + "learning_rate": 1.0807360596191473e-06, + "loss": 0.0706, + "step": 7343 + }, + { + "epoch": 2.3797796500324044, + "grad_norm": 0.5208635926246643, + "learning_rate": 1.0796501293270418e-06, + "loss": 0.0744, + "step": 7344 + }, + { + "epoch": 2.380103694102398, + "grad_norm": 0.5025206804275513, + "learning_rate": 1.0785646788478083e-06, + "loss": 0.0774, + "step": 7345 + }, + { + "epoch": 2.3804277381723913, + "grad_norm": 0.5211962461471558, + "learning_rate": 1.0774797083142957e-06, + "loss": 0.0806, + "step": 7346 + }, + { + "epoch": 2.380751782242385, + "grad_norm": 0.49372220039367676, + "learning_rate": 1.0763952178592934e-06, + "loss": 0.0758, + "step": 7347 + }, + { + "epoch": 2.3810758263123786, + "grad_norm": 0.5154086351394653, + "learning_rate": 1.0753112076155335e-06, + "loss": 0.0788, + "step": 7348 + }, + { + "epoch": 2.381399870382372, + "grad_norm": 0.49311190843582153, + "learning_rate": 1.0742276777156896e-06, + "loss": 0.0739, + "step": 7349 + }, + { + "epoch": 2.3817239144523654, + "grad_norm": 0.5144601464271545, + "learning_rate": 1.0731446282923702e-06, + "loss": 0.0743, + "step": 7350 + }, + { + "epoch": 2.3820479585223593, + "grad_norm": 0.510590136051178, + "learning_rate": 1.0720620594781361e-06, + "loss": 0.0779, + "step": 7351 + }, + { + "epoch": 2.3823720025923527, + "grad_norm": 0.4440791606903076, + "learning_rate": 1.0709799714054796e-06, + "loss": 0.0671, + "step": 7352 + }, + { + "epoch": 2.382696046662346, + "grad_norm": 0.46642884612083435, + "learning_rate": 1.0698983642068384e-06, + "loss": 0.0719, + "step": 7353 + }, + { + "epoch": 2.3830200907323396, + "grad_norm": 0.4936695098876953, + "learning_rate": 1.068817238014591e-06, + "loss": 0.074, + "step": 7354 + }, + { + "epoch": 2.383344134802333, + "grad_norm": 0.49078696966171265, + "learning_rate": 1.0677365929610573e-06, + "loss": 0.0715, + "step": 7355 + }, + { + "epoch": 2.383668178872327, + "grad_norm": 0.4980461895465851, + "learning_rate": 1.0666564291784985e-06, + "loss": 0.076, + "step": 7356 + }, + { + "epoch": 2.3839922229423203, + "grad_norm": 0.47679758071899414, + "learning_rate": 1.0655767467991124e-06, + "loss": 0.0687, + "step": 7357 + }, + { + "epoch": 2.3843162670123137, + "grad_norm": 0.4713037312030792, + "learning_rate": 1.0644975459550466e-06, + "loss": 0.0733, + "step": 7358 + }, + { + "epoch": 2.384640311082307, + "grad_norm": 0.482093870639801, + "learning_rate": 1.0634188267783807e-06, + "loss": 0.0742, + "step": 7359 + }, + { + "epoch": 2.3849643551523005, + "grad_norm": 0.507264256477356, + "learning_rate": 1.0623405894011435e-06, + "loss": 0.0747, + "step": 7360 + }, + { + "epoch": 2.3852883992222944, + "grad_norm": 0.48084744811058044, + "learning_rate": 1.0612628339552972e-06, + "loss": 0.0723, + "step": 7361 + }, + { + "epoch": 2.385612443292288, + "grad_norm": 0.4838222563266754, + "learning_rate": 1.06018556057275e-06, + "loss": 0.0691, + "step": 7362 + }, + { + "epoch": 2.3859364873622813, + "grad_norm": 0.47494423389434814, + "learning_rate": 1.0591087693853503e-06, + "loss": 0.0727, + "step": 7363 + }, + { + "epoch": 2.3862605314322747, + "grad_norm": 0.4764205813407898, + "learning_rate": 1.0580324605248865e-06, + "loss": 0.0713, + "step": 7364 + }, + { + "epoch": 2.386584575502268, + "grad_norm": 0.48809897899627686, + "learning_rate": 1.0569566341230892e-06, + "loss": 0.073, + "step": 7365 + }, + { + "epoch": 2.386908619572262, + "grad_norm": 0.5656302571296692, + "learning_rate": 1.0558812903116273e-06, + "loss": 0.0796, + "step": 7366 + }, + { + "epoch": 2.3872326636422554, + "grad_norm": 0.4584273397922516, + "learning_rate": 1.0548064292221134e-06, + "loss": 0.0688, + "step": 7367 + }, + { + "epoch": 2.387556707712249, + "grad_norm": 0.45454642176628113, + "learning_rate": 1.0537320509860998e-06, + "loss": 0.0687, + "step": 7368 + }, + { + "epoch": 2.3878807517822422, + "grad_norm": 0.501089334487915, + "learning_rate": 1.0526581557350802e-06, + "loss": 0.0705, + "step": 7369 + }, + { + "epoch": 2.3882047958522357, + "grad_norm": 0.4731651842594147, + "learning_rate": 1.0515847436004894e-06, + "loss": 0.0705, + "step": 7370 + }, + { + "epoch": 2.3885288399222295, + "grad_norm": 0.46863463521003723, + "learning_rate": 1.0505118147137028e-06, + "loss": 0.0721, + "step": 7371 + }, + { + "epoch": 2.388852883992223, + "grad_norm": 0.5042905211448669, + "learning_rate": 1.0494393692060355e-06, + "loss": 0.0705, + "step": 7372 + }, + { + "epoch": 2.3891769280622164, + "grad_norm": 0.47537532448768616, + "learning_rate": 1.0483674072087462e-06, + "loss": 0.0658, + "step": 7373 + }, + { + "epoch": 2.38950097213221, + "grad_norm": 0.5079628825187683, + "learning_rate": 1.0472959288530305e-06, + "loss": 0.0747, + "step": 7374 + }, + { + "epoch": 2.3898250162022037, + "grad_norm": 0.46602755784988403, + "learning_rate": 1.0462249342700282e-06, + "loss": 0.0699, + "step": 7375 + }, + { + "epoch": 2.390149060272197, + "grad_norm": 0.49709802865982056, + "learning_rate": 1.0451544235908179e-06, + "loss": 0.0683, + "step": 7376 + }, + { + "epoch": 2.3904731043421905, + "grad_norm": 0.4795989394187927, + "learning_rate": 1.0440843969464209e-06, + "loss": 0.0715, + "step": 7377 + }, + { + "epoch": 2.390797148412184, + "grad_norm": 0.48927298188209534, + "learning_rate": 1.0430148544677971e-06, + "loss": 0.0726, + "step": 7378 + }, + { + "epoch": 2.391121192482178, + "grad_norm": 0.4737934172153473, + "learning_rate": 1.041945796285848e-06, + "loss": 0.07, + "step": 7379 + }, + { + "epoch": 2.3914452365521712, + "grad_norm": 0.4971829354763031, + "learning_rate": 1.040877222531419e-06, + "loss": 0.072, + "step": 7380 + }, + { + "epoch": 2.3917692806221647, + "grad_norm": 0.4656839668750763, + "learning_rate": 1.0398091333352872e-06, + "loss": 0.073, + "step": 7381 + }, + { + "epoch": 2.392093324692158, + "grad_norm": 0.47131016850471497, + "learning_rate": 1.0387415288281826e-06, + "loss": 0.068, + "step": 7382 + }, + { + "epoch": 2.3924173687621515, + "grad_norm": 0.5292643904685974, + "learning_rate": 1.0376744091407649e-06, + "loss": 0.0816, + "step": 7383 + }, + { + "epoch": 2.3927414128321454, + "grad_norm": 0.46024826169013977, + "learning_rate": 1.036607774403643e-06, + "loss": 0.0672, + "step": 7384 + }, + { + "epoch": 2.393065456902139, + "grad_norm": 0.498965322971344, + "learning_rate": 1.03554162474736e-06, + "loss": 0.0725, + "step": 7385 + }, + { + "epoch": 2.393389500972132, + "grad_norm": 0.5312982797622681, + "learning_rate": 1.0344759603024029e-06, + "loss": 0.0756, + "step": 7386 + }, + { + "epoch": 2.3937135450421256, + "grad_norm": 0.46625569462776184, + "learning_rate": 1.0334107811992005e-06, + "loss": 0.0721, + "step": 7387 + }, + { + "epoch": 2.394037589112119, + "grad_norm": 0.4639773666858673, + "learning_rate": 1.032346087568117e-06, + "loss": 0.0724, + "step": 7388 + }, + { + "epoch": 2.394361633182113, + "grad_norm": 0.4938350021839142, + "learning_rate": 1.031281879539464e-06, + "loss": 0.0746, + "step": 7389 + }, + { + "epoch": 2.3946856772521063, + "grad_norm": 0.5003551244735718, + "learning_rate": 1.0302181572434866e-06, + "loss": 0.0743, + "step": 7390 + }, + { + "epoch": 2.3950097213220998, + "grad_norm": 0.45628494024276733, + "learning_rate": 1.029154920810379e-06, + "loss": 0.0714, + "step": 7391 + }, + { + "epoch": 2.395333765392093, + "grad_norm": 0.46569904685020447, + "learning_rate": 1.0280921703702672e-06, + "loss": 0.0702, + "step": 7392 + }, + { + "epoch": 2.3956578094620866, + "grad_norm": 0.5279820561408997, + "learning_rate": 1.0270299060532224e-06, + "loss": 0.076, + "step": 7393 + }, + { + "epoch": 2.3959818535320805, + "grad_norm": 0.48654380440711975, + "learning_rate": 1.0259681279892558e-06, + "loss": 0.0702, + "step": 7394 + }, + { + "epoch": 2.396305897602074, + "grad_norm": 0.47849327325820923, + "learning_rate": 1.0249068363083193e-06, + "loss": 0.0684, + "step": 7395 + }, + { + "epoch": 2.3966299416720673, + "grad_norm": 0.49411529302597046, + "learning_rate": 1.023846031140303e-06, + "loss": 0.0707, + "step": 7396 + }, + { + "epoch": 2.3969539857420608, + "grad_norm": 0.5243039131164551, + "learning_rate": 1.0227857126150425e-06, + "loss": 0.0747, + "step": 7397 + }, + { + "epoch": 2.3972780298120546, + "grad_norm": 0.5061764121055603, + "learning_rate": 1.021725880862307e-06, + "loss": 0.0743, + "step": 7398 + }, + { + "epoch": 2.397602073882048, + "grad_norm": 0.5003470182418823, + "learning_rate": 1.0206665360118106e-06, + "loss": 0.0756, + "step": 7399 + }, + { + "epoch": 2.3979261179520415, + "grad_norm": 0.44817647337913513, + "learning_rate": 1.0196076781932078e-06, + "loss": 0.0671, + "step": 7400 + }, + { + "epoch": 2.398250162022035, + "grad_norm": 0.5085217952728271, + "learning_rate": 1.018549307536092e-06, + "loss": 0.0757, + "step": 7401 + }, + { + "epoch": 2.3985742060920288, + "grad_norm": 0.47145208716392517, + "learning_rate": 1.0174914241699968e-06, + "loss": 0.0721, + "step": 7402 + }, + { + "epoch": 2.398898250162022, + "grad_norm": 0.45912501215934753, + "learning_rate": 1.0164340282243984e-06, + "loss": 0.0664, + "step": 7403 + }, + { + "epoch": 2.3992222942320156, + "grad_norm": 0.506362795829773, + "learning_rate": 1.0153771198287116e-06, + "loss": 0.0748, + "step": 7404 + }, + { + "epoch": 2.399546338302009, + "grad_norm": 0.47258928418159485, + "learning_rate": 1.0143206991122888e-06, + "loss": 0.0699, + "step": 7405 + }, + { + "epoch": 2.3998703823720025, + "grad_norm": 0.5271346569061279, + "learning_rate": 1.013264766204431e-06, + "loss": 0.078, + "step": 7406 + }, + { + "epoch": 2.4001944264419963, + "grad_norm": 0.4933684170246124, + "learning_rate": 1.0122093212343698e-06, + "loss": 0.0729, + "step": 7407 + }, + { + "epoch": 2.4005184705119897, + "grad_norm": 0.47689729928970337, + "learning_rate": 1.0111543643312833e-06, + "loss": 0.0666, + "step": 7408 + }, + { + "epoch": 2.400842514581983, + "grad_norm": 0.4799298048019409, + "learning_rate": 1.010099895624288e-06, + "loss": 0.0708, + "step": 7409 + }, + { + "epoch": 2.4011665586519766, + "grad_norm": 0.4732402563095093, + "learning_rate": 1.0090459152424382e-06, + "loss": 0.0699, + "step": 7410 + }, + { + "epoch": 2.40149060272197, + "grad_norm": 0.49609676003456116, + "learning_rate": 1.0079924233147353e-06, + "loss": 0.0752, + "step": 7411 + }, + { + "epoch": 2.401814646791964, + "grad_norm": 0.49415668845176697, + "learning_rate": 1.0069394199701115e-06, + "loss": 0.0711, + "step": 7412 + }, + { + "epoch": 2.4021386908619573, + "grad_norm": 0.4801064729690552, + "learning_rate": 1.0058869053374499e-06, + "loss": 0.0702, + "step": 7413 + }, + { + "epoch": 2.4024627349319507, + "grad_norm": 0.5357898473739624, + "learning_rate": 1.004834879545562e-06, + "loss": 0.0803, + "step": 7414 + }, + { + "epoch": 2.402786779001944, + "grad_norm": 0.48465752601623535, + "learning_rate": 1.003783342723212e-06, + "loss": 0.0696, + "step": 7415 + }, + { + "epoch": 2.4031108230719376, + "grad_norm": 0.5104695558547974, + "learning_rate": 1.0027322949990925e-06, + "loss": 0.0765, + "step": 7416 + }, + { + "epoch": 2.4034348671419314, + "grad_norm": 0.510945200920105, + "learning_rate": 1.0016817365018438e-06, + "loss": 0.0756, + "step": 7417 + }, + { + "epoch": 2.403758911211925, + "grad_norm": 0.4508892893791199, + "learning_rate": 1.0006316673600436e-06, + "loss": 0.0676, + "step": 7418 + }, + { + "epoch": 2.4040829552819183, + "grad_norm": 0.5109551548957825, + "learning_rate": 9.995820877022105e-07, + "loss": 0.076, + "step": 7419 + }, + { + "epoch": 2.4044069993519117, + "grad_norm": 0.4841241240501404, + "learning_rate": 9.985329976568042e-07, + "loss": 0.075, + "step": 7420 + }, + { + "epoch": 2.404731043421905, + "grad_norm": 0.4857771396636963, + "learning_rate": 9.974843973522203e-07, + "loss": 0.0721, + "step": 7421 + }, + { + "epoch": 2.405055087491899, + "grad_norm": 0.4629274308681488, + "learning_rate": 9.964362869167993e-07, + "loss": 0.0699, + "step": 7422 + }, + { + "epoch": 2.4053791315618924, + "grad_norm": 0.5079348087310791, + "learning_rate": 9.953886664788186e-07, + "loss": 0.0766, + "step": 7423 + }, + { + "epoch": 2.405703175631886, + "grad_norm": 0.4648328423500061, + "learning_rate": 9.943415361664982e-07, + "loss": 0.0662, + "step": 7424 + }, + { + "epoch": 2.4060272197018793, + "grad_norm": 0.4789850115776062, + "learning_rate": 9.932948961079952e-07, + "loss": 0.0696, + "step": 7425 + }, + { + "epoch": 2.406351263771873, + "grad_norm": 0.47770291566848755, + "learning_rate": 9.922487464314096e-07, + "loss": 0.0696, + "step": 7426 + }, + { + "epoch": 2.4066753078418666, + "grad_norm": 0.47265005111694336, + "learning_rate": 9.912030872647793e-07, + "loss": 0.0681, + "step": 7427 + }, + { + "epoch": 2.40699935191186, + "grad_norm": 0.4607384502887726, + "learning_rate": 9.901579187360844e-07, + "loss": 0.0731, + "step": 7428 + }, + { + "epoch": 2.4073233959818534, + "grad_norm": 0.5032811760902405, + "learning_rate": 9.891132409732402e-07, + "loss": 0.0709, + "step": 7429 + }, + { + "epoch": 2.4076474400518473, + "grad_norm": 0.49219810962677, + "learning_rate": 9.880690541041072e-07, + "loss": 0.0709, + "step": 7430 + }, + { + "epoch": 2.4079714841218407, + "grad_norm": 0.5096017122268677, + "learning_rate": 9.870253582564838e-07, + "loss": 0.079, + "step": 7431 + }, + { + "epoch": 2.408295528191834, + "grad_norm": 0.48637381196022034, + "learning_rate": 9.859821535581072e-07, + "loss": 0.0695, + "step": 7432 + }, + { + "epoch": 2.4086195722618275, + "grad_norm": 0.42044156789779663, + "learning_rate": 9.84939440136658e-07, + "loss": 0.06, + "step": 7433 + }, + { + "epoch": 2.408943616331821, + "grad_norm": 0.5293217301368713, + "learning_rate": 9.838972181197498e-07, + "loss": 0.0775, + "step": 7434 + }, + { + "epoch": 2.409267660401815, + "grad_norm": 0.49000880122184753, + "learning_rate": 9.82855487634946e-07, + "loss": 0.0677, + "step": 7435 + }, + { + "epoch": 2.4095917044718083, + "grad_norm": 0.4933015704154968, + "learning_rate": 9.818142488097388e-07, + "loss": 0.0683, + "step": 7436 + }, + { + "epoch": 2.4099157485418017, + "grad_norm": 0.4835686683654785, + "learning_rate": 9.807735017715713e-07, + "loss": 0.0741, + "step": 7437 + }, + { + "epoch": 2.410239792611795, + "grad_norm": 0.4612524211406708, + "learning_rate": 9.797332466478165e-07, + "loss": 0.0726, + "step": 7438 + }, + { + "epoch": 2.4105638366817885, + "grad_norm": 0.592125654220581, + "learning_rate": 9.786934835657935e-07, + "loss": 0.0808, + "step": 7439 + }, + { + "epoch": 2.4108878807517824, + "grad_norm": 0.4904743731021881, + "learning_rate": 9.776542126527582e-07, + "loss": 0.0746, + "step": 7440 + }, + { + "epoch": 2.411211924821776, + "grad_norm": 0.499237596988678, + "learning_rate": 9.766154340359085e-07, + "loss": 0.0733, + "step": 7441 + }, + { + "epoch": 2.4115359688917692, + "grad_norm": 0.4838772416114807, + "learning_rate": 9.755771478423815e-07, + "loss": 0.0703, + "step": 7442 + }, + { + "epoch": 2.4118600129617627, + "grad_norm": 0.481381893157959, + "learning_rate": 9.745393541992492e-07, + "loss": 0.0694, + "step": 7443 + }, + { + "epoch": 2.412184057031756, + "grad_norm": 0.4760250747203827, + "learning_rate": 9.735020532335338e-07, + "loss": 0.0725, + "step": 7444 + }, + { + "epoch": 2.41250810110175, + "grad_norm": 0.4487675726413727, + "learning_rate": 9.724652450721855e-07, + "loss": 0.0671, + "step": 7445 + }, + { + "epoch": 2.4128321451717434, + "grad_norm": 0.4938701093196869, + "learning_rate": 9.71428929842102e-07, + "loss": 0.0761, + "step": 7446 + }, + { + "epoch": 2.413156189241737, + "grad_norm": 0.49953126907348633, + "learning_rate": 9.703931076701178e-07, + "loss": 0.0661, + "step": 7447 + }, + { + "epoch": 2.4134802333117302, + "grad_norm": 0.46972179412841797, + "learning_rate": 9.693577786830077e-07, + "loss": 0.0726, + "step": 7448 + }, + { + "epoch": 2.413804277381724, + "grad_norm": 0.4710189700126648, + "learning_rate": 9.683229430074859e-07, + "loss": 0.0735, + "step": 7449 + }, + { + "epoch": 2.4141283214517175, + "grad_norm": 0.5037508010864258, + "learning_rate": 9.67288600770206e-07, + "loss": 0.0794, + "step": 7450 + }, + { + "epoch": 2.414452365521711, + "grad_norm": 0.49730008840560913, + "learning_rate": 9.662547520977632e-07, + "loss": 0.0756, + "step": 7451 + }, + { + "epoch": 2.4147764095917044, + "grad_norm": 0.5043180584907532, + "learning_rate": 9.65221397116688e-07, + "loss": 0.0784, + "step": 7452 + }, + { + "epoch": 2.4151004536616982, + "grad_norm": 0.4728096127510071, + "learning_rate": 9.641885359534536e-07, + "loss": 0.0665, + "step": 7453 + }, + { + "epoch": 2.4154244977316917, + "grad_norm": 0.5002241134643555, + "learning_rate": 9.631561687344733e-07, + "loss": 0.0695, + "step": 7454 + }, + { + "epoch": 2.415748541801685, + "grad_norm": 0.4322189390659332, + "learning_rate": 9.621242955860977e-07, + "loss": 0.0659, + "step": 7455 + }, + { + "epoch": 2.4160725858716785, + "grad_norm": 0.4593190848827362, + "learning_rate": 9.610929166346188e-07, + "loss": 0.0672, + "step": 7456 + }, + { + "epoch": 2.416396629941672, + "grad_norm": 0.5169757008552551, + "learning_rate": 9.60062032006267e-07, + "loss": 0.0748, + "step": 7457 + }, + { + "epoch": 2.416720674011666, + "grad_norm": 0.46424371004104614, + "learning_rate": 9.590316418272134e-07, + "loss": 0.0673, + "step": 7458 + }, + { + "epoch": 2.417044718081659, + "grad_norm": 0.46779513359069824, + "learning_rate": 9.58001746223568e-07, + "loss": 0.0676, + "step": 7459 + }, + { + "epoch": 2.4173687621516526, + "grad_norm": 0.4487968981266022, + "learning_rate": 9.569723453213785e-07, + "loss": 0.0645, + "step": 7460 + }, + { + "epoch": 2.417692806221646, + "grad_norm": 0.44602566957473755, + "learning_rate": 9.559434392466337e-07, + "loss": 0.0657, + "step": 7461 + }, + { + "epoch": 2.4180168502916395, + "grad_norm": 0.46878647804260254, + "learning_rate": 9.549150281252633e-07, + "loss": 0.072, + "step": 7462 + }, + { + "epoch": 2.4183408943616334, + "grad_norm": 0.48305004835128784, + "learning_rate": 9.538871120831332e-07, + "loss": 0.0699, + "step": 7463 + }, + { + "epoch": 2.4186649384316268, + "grad_norm": 0.4601181149482727, + "learning_rate": 9.52859691246053e-07, + "loss": 0.0674, + "step": 7464 + }, + { + "epoch": 2.41898898250162, + "grad_norm": 0.5139188170433044, + "learning_rate": 9.518327657397647e-07, + "loss": 0.0735, + "step": 7465 + }, + { + "epoch": 2.4193130265716136, + "grad_norm": 0.48624053597450256, + "learning_rate": 9.508063356899588e-07, + "loss": 0.0701, + "step": 7466 + }, + { + "epoch": 2.419637070641607, + "grad_norm": 0.5249286890029907, + "learning_rate": 9.497804012222561e-07, + "loss": 0.0762, + "step": 7467 + }, + { + "epoch": 2.419961114711601, + "grad_norm": 0.45333951711654663, + "learning_rate": 9.48754962462225e-07, + "loss": 0.0659, + "step": 7468 + }, + { + "epoch": 2.4202851587815943, + "grad_norm": 0.44248318672180176, + "learning_rate": 9.477300195353667e-07, + "loss": 0.063, + "step": 7469 + }, + { + "epoch": 2.4206092028515878, + "grad_norm": 0.4752964377403259, + "learning_rate": 9.467055725671248e-07, + "loss": 0.0727, + "step": 7470 + }, + { + "epoch": 2.420933246921581, + "grad_norm": 0.49173787236213684, + "learning_rate": 9.456816216828818e-07, + "loss": 0.0747, + "step": 7471 + }, + { + "epoch": 2.4212572909915746, + "grad_norm": 0.5055178999900818, + "learning_rate": 9.446581670079597e-07, + "loss": 0.0785, + "step": 7472 + }, + { + "epoch": 2.4215813350615685, + "grad_norm": 0.45134517550468445, + "learning_rate": 9.436352086676203e-07, + "loss": 0.0674, + "step": 7473 + }, + { + "epoch": 2.421905379131562, + "grad_norm": 0.515214204788208, + "learning_rate": 9.426127467870599e-07, + "loss": 0.077, + "step": 7474 + }, + { + "epoch": 2.4222294232015553, + "grad_norm": 0.4785626232624054, + "learning_rate": 9.415907814914238e-07, + "loss": 0.0697, + "step": 7475 + }, + { + "epoch": 2.4225534672715487, + "grad_norm": 0.4874456524848938, + "learning_rate": 9.405693129057858e-07, + "loss": 0.0756, + "step": 7476 + }, + { + "epoch": 2.4228775113415426, + "grad_norm": 0.44153448939323425, + "learning_rate": 9.395483411551659e-07, + "loss": 0.0646, + "step": 7477 + }, + { + "epoch": 2.423201555411536, + "grad_norm": 0.47840017080307007, + "learning_rate": 9.385278663645209e-07, + "loss": 0.0679, + "step": 7478 + }, + { + "epoch": 2.4235255994815295, + "grad_norm": 0.5270043015480042, + "learning_rate": 9.375078886587469e-07, + "loss": 0.0784, + "step": 7479 + }, + { + "epoch": 2.423849643551523, + "grad_norm": 0.47315293550491333, + "learning_rate": 9.364884081626791e-07, + "loss": 0.0707, + "step": 7480 + }, + { + "epoch": 2.4241736876215167, + "grad_norm": 0.5027167201042175, + "learning_rate": 9.354694250010926e-07, + "loss": 0.0786, + "step": 7481 + }, + { + "epoch": 2.42449773169151, + "grad_norm": 0.516635537147522, + "learning_rate": 9.344509392987023e-07, + "loss": 0.0767, + "step": 7482 + }, + { + "epoch": 2.4248217757615036, + "grad_norm": 0.5132464170455933, + "learning_rate": 9.334329511801577e-07, + "loss": 0.0734, + "step": 7483 + }, + { + "epoch": 2.425145819831497, + "grad_norm": 0.46370166540145874, + "learning_rate": 9.324154607700525e-07, + "loss": 0.0741, + "step": 7484 + }, + { + "epoch": 2.4254698639014904, + "grad_norm": 0.5340031385421753, + "learning_rate": 9.313984681929178e-07, + "loss": 0.0734, + "step": 7485 + }, + { + "epoch": 2.4257939079714843, + "grad_norm": 0.45933422446250916, + "learning_rate": 9.303819735732234e-07, + "loss": 0.0708, + "step": 7486 + }, + { + "epoch": 2.4261179520414777, + "grad_norm": 0.4411062002182007, + "learning_rate": 9.29365977035378e-07, + "loss": 0.0656, + "step": 7487 + }, + { + "epoch": 2.426441996111471, + "grad_norm": 0.4746846556663513, + "learning_rate": 9.283504787037322e-07, + "loss": 0.0683, + "step": 7488 + }, + { + "epoch": 2.4267660401814646, + "grad_norm": 0.4785168170928955, + "learning_rate": 9.273354787025685e-07, + "loss": 0.0698, + "step": 7489 + }, + { + "epoch": 2.427090084251458, + "grad_norm": 0.4767061471939087, + "learning_rate": 9.263209771561182e-07, + "loss": 0.0694, + "step": 7490 + }, + { + "epoch": 2.427414128321452, + "grad_norm": 0.47101208567619324, + "learning_rate": 9.253069741885429e-07, + "loss": 0.07, + "step": 7491 + }, + { + "epoch": 2.4277381723914453, + "grad_norm": 0.47950929403305054, + "learning_rate": 9.242934699239476e-07, + "loss": 0.0669, + "step": 7492 + }, + { + "epoch": 2.4280622164614387, + "grad_norm": 0.47910645604133606, + "learning_rate": 9.232804644863757e-07, + "loss": 0.0686, + "step": 7493 + }, + { + "epoch": 2.428386260531432, + "grad_norm": 0.480186402797699, + "learning_rate": 9.222679579998095e-07, + "loss": 0.0678, + "step": 7494 + }, + { + "epoch": 2.4287103046014256, + "grad_norm": 0.5079746842384338, + "learning_rate": 9.212559505881707e-07, + "loss": 0.0784, + "step": 7495 + }, + { + "epoch": 2.4290343486714194, + "grad_norm": 0.49334797263145447, + "learning_rate": 9.202444423753159e-07, + "loss": 0.0729, + "step": 7496 + }, + { + "epoch": 2.429358392741413, + "grad_norm": 0.544931948184967, + "learning_rate": 9.192334334850489e-07, + "loss": 0.0851, + "step": 7497 + }, + { + "epoch": 2.4296824368114063, + "grad_norm": 0.5725739002227783, + "learning_rate": 9.182229240411023e-07, + "loss": 0.0811, + "step": 7498 + }, + { + "epoch": 2.4300064808813997, + "grad_norm": 0.47585466504096985, + "learning_rate": 9.172129141671571e-07, + "loss": 0.071, + "step": 7499 + }, + { + "epoch": 2.4303305249513936, + "grad_norm": 0.4769103229045868, + "learning_rate": 9.162034039868262e-07, + "loss": 0.0703, + "step": 7500 + }, + { + "epoch": 2.430654569021387, + "grad_norm": 0.47839346528053284, + "learning_rate": 9.15194393623664e-07, + "loss": 0.0741, + "step": 7501 + }, + { + "epoch": 2.4309786130913804, + "grad_norm": 0.46446266770362854, + "learning_rate": 9.141858832011641e-07, + "loss": 0.0692, + "step": 7502 + }, + { + "epoch": 2.431302657161374, + "grad_norm": 0.5156646370887756, + "learning_rate": 9.131778728427582e-07, + "loss": 0.0802, + "step": 7503 + }, + { + "epoch": 2.4316267012313677, + "grad_norm": 0.5064626932144165, + "learning_rate": 9.121703626718187e-07, + "loss": 0.0749, + "step": 7504 + }, + { + "epoch": 2.431950745301361, + "grad_norm": 0.45814090967178345, + "learning_rate": 9.111633528116509e-07, + "loss": 0.0686, + "step": 7505 + }, + { + "epoch": 2.4322747893713546, + "grad_norm": 0.516158938407898, + "learning_rate": 9.101568433855084e-07, + "loss": 0.0751, + "step": 7506 + }, + { + "epoch": 2.432598833441348, + "grad_norm": 0.5115140676498413, + "learning_rate": 9.091508345165739e-07, + "loss": 0.0768, + "step": 7507 + }, + { + "epoch": 2.4329228775113414, + "grad_norm": 0.4667341709136963, + "learning_rate": 9.081453263279749e-07, + "loss": 0.0741, + "step": 7508 + }, + { + "epoch": 2.4332469215813353, + "grad_norm": 0.5099531412124634, + "learning_rate": 9.071403189427757e-07, + "loss": 0.0712, + "step": 7509 + }, + { + "epoch": 2.4335709656513287, + "grad_norm": 0.4739290773868561, + "learning_rate": 9.061358124839798e-07, + "loss": 0.0677, + "step": 7510 + }, + { + "epoch": 2.433895009721322, + "grad_norm": 0.4962294101715088, + "learning_rate": 9.051318070745285e-07, + "loss": 0.0782, + "step": 7511 + }, + { + "epoch": 2.4342190537913155, + "grad_norm": 0.4872860014438629, + "learning_rate": 9.041283028373044e-07, + "loss": 0.0712, + "step": 7512 + }, + { + "epoch": 2.434543097861309, + "grad_norm": 0.4947209358215332, + "learning_rate": 9.031252998951229e-07, + "loss": 0.0765, + "step": 7513 + }, + { + "epoch": 2.434867141931303, + "grad_norm": 0.4560416340827942, + "learning_rate": 9.021227983707442e-07, + "loss": 0.0742, + "step": 7514 + }, + { + "epoch": 2.4351911860012962, + "grad_norm": 0.4807344079017639, + "learning_rate": 9.011207983868647e-07, + "loss": 0.0703, + "step": 7515 + }, + { + "epoch": 2.4355152300712897, + "grad_norm": 0.5113034844398499, + "learning_rate": 9.001193000661191e-07, + "loss": 0.0791, + "step": 7516 + }, + { + "epoch": 2.435839274141283, + "grad_norm": 0.4940589964389801, + "learning_rate": 8.991183035310813e-07, + "loss": 0.0709, + "step": 7517 + }, + { + "epoch": 2.4361633182112765, + "grad_norm": 0.4884170889854431, + "learning_rate": 8.98117808904263e-07, + "loss": 0.0708, + "step": 7518 + }, + { + "epoch": 2.4364873622812704, + "grad_norm": 0.47042766213417053, + "learning_rate": 8.971178163081173e-07, + "loss": 0.0704, + "step": 7519 + }, + { + "epoch": 2.436811406351264, + "grad_norm": 0.4856167733669281, + "learning_rate": 8.961183258650297e-07, + "loss": 0.0732, + "step": 7520 + }, + { + "epoch": 2.4371354504212572, + "grad_norm": 0.4780847430229187, + "learning_rate": 8.951193376973321e-07, + "loss": 0.071, + "step": 7521 + }, + { + "epoch": 2.4374594944912507, + "grad_norm": 0.46283453702926636, + "learning_rate": 8.941208519272876e-07, + "loss": 0.0684, + "step": 7522 + }, + { + "epoch": 2.4377835385612445, + "grad_norm": 0.4621603488922119, + "learning_rate": 8.931228686771048e-07, + "loss": 0.0681, + "step": 7523 + }, + { + "epoch": 2.438107582631238, + "grad_norm": 0.4744093716144562, + "learning_rate": 8.92125388068924e-07, + "loss": 0.0697, + "step": 7524 + }, + { + "epoch": 2.4384316267012314, + "grad_norm": 0.47922131419181824, + "learning_rate": 8.911284102248286e-07, + "loss": 0.0729, + "step": 7525 + }, + { + "epoch": 2.438755670771225, + "grad_norm": 0.4861237704753876, + "learning_rate": 8.901319352668397e-07, + "loss": 0.0723, + "step": 7526 + }, + { + "epoch": 2.439079714841218, + "grad_norm": 0.46727433800697327, + "learning_rate": 8.891359633169134e-07, + "loss": 0.0657, + "step": 7527 + }, + { + "epoch": 2.439403758911212, + "grad_norm": 0.4871031641960144, + "learning_rate": 8.881404944969507e-07, + "loss": 0.0725, + "step": 7528 + }, + { + "epoch": 2.4397278029812055, + "grad_norm": 0.5125278830528259, + "learning_rate": 8.871455289287839e-07, + "loss": 0.0765, + "step": 7529 + }, + { + "epoch": 2.440051847051199, + "grad_norm": 0.5060690641403198, + "learning_rate": 8.861510667341905e-07, + "loss": 0.0721, + "step": 7530 + }, + { + "epoch": 2.4403758911211924, + "grad_norm": 0.48133599758148193, + "learning_rate": 8.851571080348809e-07, + "loss": 0.0716, + "step": 7531 + }, + { + "epoch": 2.440699935191186, + "grad_norm": 0.4717944264411926, + "learning_rate": 8.84163652952506e-07, + "loss": 0.0696, + "step": 7532 + }, + { + "epoch": 2.4410239792611796, + "grad_norm": 0.4690879285335541, + "learning_rate": 8.831707016086561e-07, + "loss": 0.0698, + "step": 7533 + }, + { + "epoch": 2.441348023331173, + "grad_norm": 0.4657110273838043, + "learning_rate": 8.821782541248575e-07, + "loss": 0.0654, + "step": 7534 + }, + { + "epoch": 2.4416720674011665, + "grad_norm": 0.44634830951690674, + "learning_rate": 8.811863106225788e-07, + "loss": 0.0632, + "step": 7535 + }, + { + "epoch": 2.44199611147116, + "grad_norm": 0.43520429730415344, + "learning_rate": 8.8019487122322e-07, + "loss": 0.0632, + "step": 7536 + }, + { + "epoch": 2.442320155541154, + "grad_norm": 0.4770278334617615, + "learning_rate": 8.792039360481286e-07, + "loss": 0.0706, + "step": 7537 + }, + { + "epoch": 2.442644199611147, + "grad_norm": 0.48332685232162476, + "learning_rate": 8.782135052185819e-07, + "loss": 0.0689, + "step": 7538 + }, + { + "epoch": 2.4429682436811406, + "grad_norm": 0.4374450445175171, + "learning_rate": 8.772235788557998e-07, + "loss": 0.0665, + "step": 7539 + }, + { + "epoch": 2.443292287751134, + "grad_norm": 0.5159647464752197, + "learning_rate": 8.762341570809408e-07, + "loss": 0.076, + "step": 7540 + }, + { + "epoch": 2.4436163318211275, + "grad_norm": 0.47382551431655884, + "learning_rate": 8.75245240015099e-07, + "loss": 0.0703, + "step": 7541 + }, + { + "epoch": 2.4439403758911213, + "grad_norm": 0.5286172032356262, + "learning_rate": 8.742568277793095e-07, + "loss": 0.0835, + "step": 7542 + }, + { + "epoch": 2.4442644199611148, + "grad_norm": 0.5125248432159424, + "learning_rate": 8.732689204945449e-07, + "loss": 0.0755, + "step": 7543 + }, + { + "epoch": 2.444588464031108, + "grad_norm": 0.4747607409954071, + "learning_rate": 8.722815182817123e-07, + "loss": 0.0683, + "step": 7544 + }, + { + "epoch": 2.4449125081011016, + "grad_norm": 0.5194625854492188, + "learning_rate": 8.712946212616652e-07, + "loss": 0.0718, + "step": 7545 + }, + { + "epoch": 2.445236552171095, + "grad_norm": 0.5028043985366821, + "learning_rate": 8.703082295551862e-07, + "loss": 0.0797, + "step": 7546 + }, + { + "epoch": 2.445560596241089, + "grad_norm": 0.5011937022209167, + "learning_rate": 8.693223432830012e-07, + "loss": 0.0743, + "step": 7547 + }, + { + "epoch": 2.4458846403110823, + "grad_norm": 0.4265819191932678, + "learning_rate": 8.683369625657734e-07, + "loss": 0.0619, + "step": 7548 + }, + { + "epoch": 2.4462086843810757, + "grad_norm": 0.5144858956336975, + "learning_rate": 8.673520875241037e-07, + "loss": 0.0725, + "step": 7549 + }, + { + "epoch": 2.446532728451069, + "grad_norm": 0.48485949635505676, + "learning_rate": 8.663677182785324e-07, + "loss": 0.0689, + "step": 7550 + }, + { + "epoch": 2.446856772521063, + "grad_norm": 0.4591423273086548, + "learning_rate": 8.653838549495336e-07, + "loss": 0.0658, + "step": 7551 + }, + { + "epoch": 2.4471808165910565, + "grad_norm": 0.5020585656166077, + "learning_rate": 8.64400497657527e-07, + "loss": 0.072, + "step": 7552 + }, + { + "epoch": 2.44750486066105, + "grad_norm": 0.46597227454185486, + "learning_rate": 8.634176465228616e-07, + "loss": 0.0708, + "step": 7553 + }, + { + "epoch": 2.4478289047310433, + "grad_norm": 0.47457510232925415, + "learning_rate": 8.624353016658333e-07, + "loss": 0.0711, + "step": 7554 + }, + { + "epoch": 2.448152948801037, + "grad_norm": 0.4893776774406433, + "learning_rate": 8.614534632066684e-07, + "loss": 0.0755, + "step": 7555 + }, + { + "epoch": 2.4484769928710306, + "grad_norm": 0.5376947522163391, + "learning_rate": 8.604721312655351e-07, + "loss": 0.0772, + "step": 7556 + }, + { + "epoch": 2.448801036941024, + "grad_norm": 0.5056323409080505, + "learning_rate": 8.594913059625404e-07, + "loss": 0.0779, + "step": 7557 + }, + { + "epoch": 2.4491250810110174, + "grad_norm": 0.4631410539150238, + "learning_rate": 8.585109874177244e-07, + "loss": 0.0693, + "step": 7558 + }, + { + "epoch": 2.449449125081011, + "grad_norm": 0.4900587499141693, + "learning_rate": 8.57531175751073e-07, + "loss": 0.0691, + "step": 7559 + }, + { + "epoch": 2.4497731691510047, + "grad_norm": 0.48433420062065125, + "learning_rate": 8.56551871082501e-07, + "loss": 0.0697, + "step": 7560 + }, + { + "epoch": 2.450097213220998, + "grad_norm": 0.49739065766334534, + "learning_rate": 8.555730735318707e-07, + "loss": 0.0707, + "step": 7561 + }, + { + "epoch": 2.4504212572909916, + "grad_norm": 0.4808198809623718, + "learning_rate": 8.545947832189744e-07, + "loss": 0.0693, + "step": 7562 + }, + { + "epoch": 2.450745301360985, + "grad_norm": 0.5187966823577881, + "learning_rate": 8.536170002635452e-07, + "loss": 0.0784, + "step": 7563 + }, + { + "epoch": 2.4510693454309784, + "grad_norm": 0.5044239163398743, + "learning_rate": 8.526397247852558e-07, + "loss": 0.0755, + "step": 7564 + }, + { + "epoch": 2.4513933895009723, + "grad_norm": 0.502315878868103, + "learning_rate": 8.516629569037138e-07, + "loss": 0.0755, + "step": 7565 + }, + { + "epoch": 2.4517174335709657, + "grad_norm": 0.46958866715431213, + "learning_rate": 8.506866967384674e-07, + "loss": 0.0711, + "step": 7566 + }, + { + "epoch": 2.452041477640959, + "grad_norm": 0.5248948335647583, + "learning_rate": 8.497109444090018e-07, + "loss": 0.0769, + "step": 7567 + }, + { + "epoch": 2.4523655217109526, + "grad_norm": 0.4852147102355957, + "learning_rate": 8.487357000347379e-07, + "loss": 0.0659, + "step": 7568 + }, + { + "epoch": 2.452689565780946, + "grad_norm": 0.4870324432849884, + "learning_rate": 8.477609637350365e-07, + "loss": 0.0692, + "step": 7569 + }, + { + "epoch": 2.45301360985094, + "grad_norm": 0.46902889013290405, + "learning_rate": 8.467867356291964e-07, + "loss": 0.0689, + "step": 7570 + }, + { + "epoch": 2.4533376539209333, + "grad_norm": 0.5177121162414551, + "learning_rate": 8.458130158364536e-07, + "loss": 0.0748, + "step": 7571 + }, + { + "epoch": 2.4536616979909267, + "grad_norm": 0.4790119528770447, + "learning_rate": 8.448398044759826e-07, + "loss": 0.0708, + "step": 7572 + }, + { + "epoch": 2.45398574206092, + "grad_norm": 0.49437493085861206, + "learning_rate": 8.438671016668937e-07, + "loss": 0.0709, + "step": 7573 + }, + { + "epoch": 2.454309786130914, + "grad_norm": 0.4641510248184204, + "learning_rate": 8.428949075282389e-07, + "loss": 0.0662, + "step": 7574 + }, + { + "epoch": 2.4546338302009074, + "grad_norm": 0.48109662532806396, + "learning_rate": 8.419232221790003e-07, + "loss": 0.0664, + "step": 7575 + }, + { + "epoch": 2.454957874270901, + "grad_norm": 0.5049029588699341, + "learning_rate": 8.409520457381093e-07, + "loss": 0.0723, + "step": 7576 + }, + { + "epoch": 2.4552819183408943, + "grad_norm": 0.4541318714618683, + "learning_rate": 8.399813783244237e-07, + "loss": 0.0677, + "step": 7577 + }, + { + "epoch": 2.4556059624108877, + "grad_norm": 0.42951998114585876, + "learning_rate": 8.390112200567451e-07, + "loss": 0.0628, + "step": 7578 + }, + { + "epoch": 2.4559300064808816, + "grad_norm": 0.4754899740219116, + "learning_rate": 8.380415710538115e-07, + "loss": 0.0698, + "step": 7579 + }, + { + "epoch": 2.456254050550875, + "grad_norm": 0.48631545901298523, + "learning_rate": 8.370724314342993e-07, + "loss": 0.0704, + "step": 7580 + }, + { + "epoch": 2.4565780946208684, + "grad_norm": 0.4732912480831146, + "learning_rate": 8.361038013168221e-07, + "loss": 0.0713, + "step": 7581 + }, + { + "epoch": 2.456902138690862, + "grad_norm": 0.48035740852355957, + "learning_rate": 8.351356808199274e-07, + "loss": 0.0694, + "step": 7582 + }, + { + "epoch": 2.4572261827608557, + "grad_norm": 0.5123336315155029, + "learning_rate": 8.341680700621091e-07, + "loss": 0.0764, + "step": 7583 + }, + { + "epoch": 2.457550226830849, + "grad_norm": 0.47391778230667114, + "learning_rate": 8.332009691617882e-07, + "loss": 0.0703, + "step": 7584 + }, + { + "epoch": 2.4578742709008425, + "grad_norm": 0.4784128963947296, + "learning_rate": 8.322343782373333e-07, + "loss": 0.0678, + "step": 7585 + }, + { + "epoch": 2.458198314970836, + "grad_norm": 0.4773249626159668, + "learning_rate": 8.312682974070419e-07, + "loss": 0.0679, + "step": 7586 + }, + { + "epoch": 2.4585223590408294, + "grad_norm": 0.45103058218955994, + "learning_rate": 8.303027267891545e-07, + "loss": 0.068, + "step": 7587 + }, + { + "epoch": 2.4588464031108233, + "grad_norm": 0.49889039993286133, + "learning_rate": 8.293376665018482e-07, + "loss": 0.0719, + "step": 7588 + }, + { + "epoch": 2.4591704471808167, + "grad_norm": 0.49812307953834534, + "learning_rate": 8.283731166632359e-07, + "loss": 0.077, + "step": 7589 + }, + { + "epoch": 2.45949449125081, + "grad_norm": 0.4281612038612366, + "learning_rate": 8.274090773913706e-07, + "loss": 0.0591, + "step": 7590 + }, + { + "epoch": 2.4598185353208035, + "grad_norm": 0.473037451505661, + "learning_rate": 8.264455488042395e-07, + "loss": 0.0721, + "step": 7591 + }, + { + "epoch": 2.460142579390797, + "grad_norm": 0.49381524324417114, + "learning_rate": 8.254825310197701e-07, + "loss": 0.0709, + "step": 7592 + }, + { + "epoch": 2.460466623460791, + "grad_norm": 0.550028383731842, + "learning_rate": 8.245200241558265e-07, + "loss": 0.0735, + "step": 7593 + }, + { + "epoch": 2.4607906675307842, + "grad_norm": 0.49053382873535156, + "learning_rate": 8.235580283302097e-07, + "loss": 0.0738, + "step": 7594 + }, + { + "epoch": 2.4611147116007777, + "grad_norm": 0.5001011490821838, + "learning_rate": 8.225965436606598e-07, + "loss": 0.0758, + "step": 7595 + }, + { + "epoch": 2.461438755670771, + "grad_norm": 0.5266391038894653, + "learning_rate": 8.216355702648521e-07, + "loss": 0.0774, + "step": 7596 + }, + { + "epoch": 2.4617627997407645, + "grad_norm": 0.4827960431575775, + "learning_rate": 8.206751082604014e-07, + "loss": 0.0685, + "step": 7597 + }, + { + "epoch": 2.4620868438107584, + "grad_norm": 0.504069983959198, + "learning_rate": 8.197151577648593e-07, + "loss": 0.0751, + "step": 7598 + }, + { + "epoch": 2.462410887880752, + "grad_norm": 0.48526352643966675, + "learning_rate": 8.187557188957123e-07, + "loss": 0.0732, + "step": 7599 + }, + { + "epoch": 2.462734931950745, + "grad_norm": 0.46831855177879333, + "learning_rate": 8.177967917703877e-07, + "loss": 0.0673, + "step": 7600 + }, + { + "epoch": 2.4630589760207386, + "grad_norm": 0.4636510908603668, + "learning_rate": 8.168383765062493e-07, + "loss": 0.0701, + "step": 7601 + }, + { + "epoch": 2.4633830200907325, + "grad_norm": 0.5034536719322205, + "learning_rate": 8.158804732205971e-07, + "loss": 0.0767, + "step": 7602 + }, + { + "epoch": 2.463707064160726, + "grad_norm": 0.5184887647628784, + "learning_rate": 8.149230820306697e-07, + "loss": 0.078, + "step": 7603 + }, + { + "epoch": 2.4640311082307194, + "grad_norm": 0.4871300160884857, + "learning_rate": 8.139662030536421e-07, + "loss": 0.0757, + "step": 7604 + }, + { + "epoch": 2.464355152300713, + "grad_norm": 0.4584501385688782, + "learning_rate": 8.130098364066292e-07, + "loss": 0.0675, + "step": 7605 + }, + { + "epoch": 2.4646791963707066, + "grad_norm": 0.44841256737709045, + "learning_rate": 8.120539822066759e-07, + "loss": 0.0649, + "step": 7606 + }, + { + "epoch": 2.4650032404407, + "grad_norm": 0.46680083870887756, + "learning_rate": 8.110986405707755e-07, + "loss": 0.0669, + "step": 7607 + }, + { + "epoch": 2.4653272845106935, + "grad_norm": 0.49121201038360596, + "learning_rate": 8.101438116158488e-07, + "loss": 0.0745, + "step": 7608 + }, + { + "epoch": 2.465651328580687, + "grad_norm": 0.48516207933425903, + "learning_rate": 8.091894954587582e-07, + "loss": 0.0702, + "step": 7609 + }, + { + "epoch": 2.4659753726506803, + "grad_norm": 0.438103049993515, + "learning_rate": 8.082356922163038e-07, + "loss": 0.0648, + "step": 7610 + }, + { + "epoch": 2.466299416720674, + "grad_norm": 0.47541922330856323, + "learning_rate": 8.072824020052206e-07, + "loss": 0.069, + "step": 7611 + }, + { + "epoch": 2.4666234607906676, + "grad_norm": 0.4582546651363373, + "learning_rate": 8.063296249421843e-07, + "loss": 0.0688, + "step": 7612 + }, + { + "epoch": 2.466947504860661, + "grad_norm": 0.48613741993904114, + "learning_rate": 8.053773611438015e-07, + "loss": 0.0701, + "step": 7613 + }, + { + "epoch": 2.4672715489306545, + "grad_norm": 0.4982259273529053, + "learning_rate": 8.044256107266246e-07, + "loss": 0.0776, + "step": 7614 + }, + { + "epoch": 2.467595593000648, + "grad_norm": 0.46958404779434204, + "learning_rate": 8.034743738071349e-07, + "loss": 0.0637, + "step": 7615 + }, + { + "epoch": 2.4679196370706418, + "grad_norm": 0.48396363854408264, + "learning_rate": 8.02523650501758e-07, + "loss": 0.0725, + "step": 7616 + }, + { + "epoch": 2.468243681140635, + "grad_norm": 0.46536555886268616, + "learning_rate": 8.015734409268511e-07, + "loss": 0.07, + "step": 7617 + }, + { + "epoch": 2.4685677252106286, + "grad_norm": 0.5036252737045288, + "learning_rate": 8.006237451987109e-07, + "loss": 0.0704, + "step": 7618 + }, + { + "epoch": 2.468891769280622, + "grad_norm": 0.524920642375946, + "learning_rate": 7.996745634335712e-07, + "loss": 0.0761, + "step": 7619 + }, + { + "epoch": 2.4692158133506155, + "grad_norm": 0.47863340377807617, + "learning_rate": 7.987258957476024e-07, + "loss": 0.0677, + "step": 7620 + }, + { + "epoch": 2.4695398574206093, + "grad_norm": 0.47143682837486267, + "learning_rate": 7.977777422569138e-07, + "loss": 0.0688, + "step": 7621 + }, + { + "epoch": 2.4698639014906028, + "grad_norm": 0.4850553572177887, + "learning_rate": 7.968301030775477e-07, + "loss": 0.0723, + "step": 7622 + }, + { + "epoch": 2.470187945560596, + "grad_norm": 0.5007749795913696, + "learning_rate": 7.958829783254873e-07, + "loss": 0.0726, + "step": 7623 + }, + { + "epoch": 2.4705119896305896, + "grad_norm": 0.4891062378883362, + "learning_rate": 7.94936368116651e-07, + "loss": 0.0708, + "step": 7624 + }, + { + "epoch": 2.4708360337005835, + "grad_norm": 0.541865348815918, + "learning_rate": 7.939902725668952e-07, + "loss": 0.0716, + "step": 7625 + }, + { + "epoch": 2.471160077770577, + "grad_norm": 0.4788265824317932, + "learning_rate": 7.930446917920126e-07, + "loss": 0.0719, + "step": 7626 + }, + { + "epoch": 2.4714841218405703, + "grad_norm": 0.5064488053321838, + "learning_rate": 7.920996259077335e-07, + "loss": 0.072, + "step": 7627 + }, + { + "epoch": 2.4718081659105637, + "grad_norm": 0.4691329896450043, + "learning_rate": 7.911550750297247e-07, + "loss": 0.0711, + "step": 7628 + }, + { + "epoch": 2.4721322099805576, + "grad_norm": 0.47548046708106995, + "learning_rate": 7.902110392735907e-07, + "loss": 0.0695, + "step": 7629 + }, + { + "epoch": 2.472456254050551, + "grad_norm": 0.5309733748435974, + "learning_rate": 7.892675187548709e-07, + "loss": 0.076, + "step": 7630 + }, + { + "epoch": 2.4727802981205445, + "grad_norm": 0.4893459379673004, + "learning_rate": 7.883245135890432e-07, + "loss": 0.0728, + "step": 7631 + }, + { + "epoch": 2.473104342190538, + "grad_norm": 0.49500328302383423, + "learning_rate": 7.873820238915231e-07, + "loss": 0.0735, + "step": 7632 + }, + { + "epoch": 2.4734283862605313, + "grad_norm": 0.4524442255496979, + "learning_rate": 7.86440049777662e-07, + "loss": 0.0626, + "step": 7633 + }, + { + "epoch": 2.473752430330525, + "grad_norm": 0.491973876953125, + "learning_rate": 7.854985913627494e-07, + "loss": 0.0724, + "step": 7634 + }, + { + "epoch": 2.4740764744005186, + "grad_norm": 0.4897155165672302, + "learning_rate": 7.845576487620076e-07, + "loss": 0.0702, + "step": 7635 + }, + { + "epoch": 2.474400518470512, + "grad_norm": 0.5308480858802795, + "learning_rate": 7.83617222090603e-07, + "loss": 0.082, + "step": 7636 + }, + { + "epoch": 2.4747245625405054, + "grad_norm": 0.5133422017097473, + "learning_rate": 7.826773114636305e-07, + "loss": 0.0741, + "step": 7637 + }, + { + "epoch": 2.475048606610499, + "grad_norm": 0.5054247379302979, + "learning_rate": 7.817379169961309e-07, + "loss": 0.0762, + "step": 7638 + }, + { + "epoch": 2.4753726506804927, + "grad_norm": 0.4538639783859253, + "learning_rate": 7.807990388030728e-07, + "loss": 0.069, + "step": 7639 + }, + { + "epoch": 2.475696694750486, + "grad_norm": 0.470773845911026, + "learning_rate": 7.798606769993672e-07, + "loss": 0.0686, + "step": 7640 + }, + { + "epoch": 2.4760207388204796, + "grad_norm": 0.45735427737236023, + "learning_rate": 7.789228316998604e-07, + "loss": 0.0706, + "step": 7641 + }, + { + "epoch": 2.476344782890473, + "grad_norm": 0.5463230013847351, + "learning_rate": 7.779855030193362e-07, + "loss": 0.0793, + "step": 7642 + }, + { + "epoch": 2.4766688269604664, + "grad_norm": 0.4804290235042572, + "learning_rate": 7.770486910725156e-07, + "loss": 0.0677, + "step": 7643 + }, + { + "epoch": 2.4769928710304603, + "grad_norm": 0.5150229334831238, + "learning_rate": 7.761123959740513e-07, + "loss": 0.078, + "step": 7644 + }, + { + "epoch": 2.4773169151004537, + "grad_norm": 0.5204079747200012, + "learning_rate": 7.751766178385411e-07, + "loss": 0.0691, + "step": 7645 + }, + { + "epoch": 2.477640959170447, + "grad_norm": 0.49729999899864197, + "learning_rate": 7.742413567805129e-07, + "loss": 0.0742, + "step": 7646 + }, + { + "epoch": 2.4779650032404406, + "grad_norm": 0.5113031268119812, + "learning_rate": 7.73306612914434e-07, + "loss": 0.0714, + "step": 7647 + }, + { + "epoch": 2.478289047310434, + "grad_norm": 0.5141634345054626, + "learning_rate": 7.723723863547084e-07, + "loss": 0.0732, + "step": 7648 + }, + { + "epoch": 2.478613091380428, + "grad_norm": 0.48891061544418335, + "learning_rate": 7.714386772156757e-07, + "loss": 0.0705, + "step": 7649 + }, + { + "epoch": 2.4789371354504213, + "grad_norm": 0.46272414922714233, + "learning_rate": 7.705054856116129e-07, + "loss": 0.0641, + "step": 7650 + }, + { + "epoch": 2.4792611795204147, + "grad_norm": 0.48565956950187683, + "learning_rate": 7.695728116567347e-07, + "loss": 0.0705, + "step": 7651 + }, + { + "epoch": 2.479585223590408, + "grad_norm": 0.5136396288871765, + "learning_rate": 7.686406554651915e-07, + "loss": 0.0711, + "step": 7652 + }, + { + "epoch": 2.479909267660402, + "grad_norm": 0.49589043855667114, + "learning_rate": 7.677090171510682e-07, + "loss": 0.0735, + "step": 7653 + }, + { + "epoch": 2.4802333117303954, + "grad_norm": 0.5121117234230042, + "learning_rate": 7.667778968283895e-07, + "loss": 0.0745, + "step": 7654 + }, + { + "epoch": 2.480557355800389, + "grad_norm": 0.5081775784492493, + "learning_rate": 7.658472946111151e-07, + "loss": 0.0687, + "step": 7655 + }, + { + "epoch": 2.4808813998703823, + "grad_norm": 0.43997034430503845, + "learning_rate": 7.649172106131425e-07, + "loss": 0.0656, + "step": 7656 + }, + { + "epoch": 2.481205443940376, + "grad_norm": 0.5061107277870178, + "learning_rate": 7.639876449483047e-07, + "loss": 0.0696, + "step": 7657 + }, + { + "epoch": 2.4815294880103695, + "grad_norm": 0.5054954886436462, + "learning_rate": 7.630585977303717e-07, + "loss": 0.0782, + "step": 7658 + }, + { + "epoch": 2.481853532080363, + "grad_norm": 0.44806042313575745, + "learning_rate": 7.621300690730482e-07, + "loss": 0.0605, + "step": 7659 + }, + { + "epoch": 2.4821775761503564, + "grad_norm": 0.49718645215034485, + "learning_rate": 7.612020590899805e-07, + "loss": 0.0704, + "step": 7660 + }, + { + "epoch": 2.48250162022035, + "grad_norm": 0.5014523267745972, + "learning_rate": 7.602745678947443e-07, + "loss": 0.0706, + "step": 7661 + }, + { + "epoch": 2.4828256642903437, + "grad_norm": 0.5038493871688843, + "learning_rate": 7.593475956008578e-07, + "loss": 0.076, + "step": 7662 + }, + { + "epoch": 2.483149708360337, + "grad_norm": 0.5001026391983032, + "learning_rate": 7.58421142321773e-07, + "loss": 0.0725, + "step": 7663 + }, + { + "epoch": 2.4834737524303305, + "grad_norm": 0.4998968541622162, + "learning_rate": 7.574952081708787e-07, + "loss": 0.0744, + "step": 7664 + }, + { + "epoch": 2.483797796500324, + "grad_norm": 0.46559011936187744, + "learning_rate": 7.565697932615013e-07, + "loss": 0.071, + "step": 7665 + }, + { + "epoch": 2.4841218405703174, + "grad_norm": 0.547747790813446, + "learning_rate": 7.556448977068992e-07, + "loss": 0.0786, + "step": 7666 + }, + { + "epoch": 2.4844458846403112, + "grad_norm": 0.4957687258720398, + "learning_rate": 7.547205216202752e-07, + "loss": 0.0722, + "step": 7667 + }, + { + "epoch": 2.4847699287103047, + "grad_norm": 0.4725322425365448, + "learning_rate": 7.537966651147599e-07, + "loss": 0.0687, + "step": 7668 + }, + { + "epoch": 2.485093972780298, + "grad_norm": 0.4966736435890198, + "learning_rate": 7.52873328303429e-07, + "loss": 0.0736, + "step": 7669 + }, + { + "epoch": 2.4854180168502915, + "grad_norm": 0.4968201220035553, + "learning_rate": 7.519505112992842e-07, + "loss": 0.0739, + "step": 7670 + }, + { + "epoch": 2.485742060920285, + "grad_norm": 0.473723441362381, + "learning_rate": 7.510282142152753e-07, + "loss": 0.0668, + "step": 7671 + }, + { + "epoch": 2.486066104990279, + "grad_norm": 0.49232056736946106, + "learning_rate": 7.501064371642785e-07, + "loss": 0.072, + "step": 7672 + }, + { + "epoch": 2.4863901490602722, + "grad_norm": 0.4883209466934204, + "learning_rate": 7.491851802591121e-07, + "loss": 0.0735, + "step": 7673 + }, + { + "epoch": 2.4867141931302656, + "grad_norm": 0.4559668302536011, + "learning_rate": 7.482644436125291e-07, + "loss": 0.0713, + "step": 7674 + }, + { + "epoch": 2.487038237200259, + "grad_norm": 0.492357462644577, + "learning_rate": 7.473442273372162e-07, + "loss": 0.0728, + "step": 7675 + }, + { + "epoch": 2.487362281270253, + "grad_norm": 0.4837523400783539, + "learning_rate": 7.464245315458029e-07, + "loss": 0.0712, + "step": 7676 + }, + { + "epoch": 2.4876863253402464, + "grad_norm": 0.47218117117881775, + "learning_rate": 7.455053563508485e-07, + "loss": 0.0686, + "step": 7677 + }, + { + "epoch": 2.48801036941024, + "grad_norm": 0.42852234840393066, + "learning_rate": 7.445867018648517e-07, + "loss": 0.0609, + "step": 7678 + }, + { + "epoch": 2.488334413480233, + "grad_norm": 0.5153549313545227, + "learning_rate": 7.436685682002465e-07, + "loss": 0.0771, + "step": 7679 + }, + { + "epoch": 2.488658457550227, + "grad_norm": 0.4937134087085724, + "learning_rate": 7.427509554694046e-07, + "loss": 0.0732, + "step": 7680 + }, + { + "epoch": 2.4889825016202205, + "grad_norm": 0.4915093779563904, + "learning_rate": 7.418338637846323e-07, + "loss": 0.0712, + "step": 7681 + }, + { + "epoch": 2.489306545690214, + "grad_norm": 0.46614953875541687, + "learning_rate": 7.409172932581726e-07, + "loss": 0.0686, + "step": 7682 + }, + { + "epoch": 2.4896305897602073, + "grad_norm": 0.5098023414611816, + "learning_rate": 7.400012440022053e-07, + "loss": 0.0732, + "step": 7683 + }, + { + "epoch": 2.4899546338302008, + "grad_norm": 0.5139278769493103, + "learning_rate": 7.390857161288467e-07, + "loss": 0.0735, + "step": 7684 + }, + { + "epoch": 2.4902786779001946, + "grad_norm": 0.49719876050949097, + "learning_rate": 7.381707097501467e-07, + "loss": 0.0753, + "step": 7685 + }, + { + "epoch": 2.490602721970188, + "grad_norm": 0.4695081114768982, + "learning_rate": 7.37256224978094e-07, + "loss": 0.0681, + "step": 7686 + }, + { + "epoch": 2.4909267660401815, + "grad_norm": 0.4486734867095947, + "learning_rate": 7.36342261924613e-07, + "loss": 0.0668, + "step": 7687 + }, + { + "epoch": 2.491250810110175, + "grad_norm": 0.4767226278781891, + "learning_rate": 7.354288207015636e-07, + "loss": 0.0687, + "step": 7688 + }, + { + "epoch": 2.4915748541801683, + "grad_norm": 0.45571380853652954, + "learning_rate": 7.345159014207432e-07, + "loss": 0.0677, + "step": 7689 + }, + { + "epoch": 2.491898898250162, + "grad_norm": 0.49520525336265564, + "learning_rate": 7.33603504193881e-07, + "loss": 0.0717, + "step": 7690 + }, + { + "epoch": 2.4922229423201556, + "grad_norm": 0.4969030022621155, + "learning_rate": 7.326916291326508e-07, + "loss": 0.0767, + "step": 7691 + }, + { + "epoch": 2.492546986390149, + "grad_norm": 0.5097898840904236, + "learning_rate": 7.31780276348652e-07, + "loss": 0.0773, + "step": 7692 + }, + { + "epoch": 2.4928710304601425, + "grad_norm": 0.49189767241477966, + "learning_rate": 7.308694459534299e-07, + "loss": 0.0721, + "step": 7693 + }, + { + "epoch": 2.493195074530136, + "grad_norm": 0.5023943185806274, + "learning_rate": 7.299591380584581e-07, + "loss": 0.0729, + "step": 7694 + }, + { + "epoch": 2.4935191186001298, + "grad_norm": 0.45873549580574036, + "learning_rate": 7.290493527751508e-07, + "loss": 0.0647, + "step": 7695 + }, + { + "epoch": 2.493843162670123, + "grad_norm": 0.45283275842666626, + "learning_rate": 7.281400902148578e-07, + "loss": 0.0654, + "step": 7696 + }, + { + "epoch": 2.4941672067401166, + "grad_norm": 0.5040050148963928, + "learning_rate": 7.272313504888606e-07, + "loss": 0.0744, + "step": 7697 + }, + { + "epoch": 2.49449125081011, + "grad_norm": 0.4800432324409485, + "learning_rate": 7.263231337083842e-07, + "loss": 0.0641, + "step": 7698 + }, + { + "epoch": 2.4948152948801035, + "grad_norm": 0.49461984634399414, + "learning_rate": 7.254154399845825e-07, + "loss": 0.0718, + "step": 7699 + }, + { + "epoch": 2.4951393389500973, + "grad_norm": 0.4502789378166199, + "learning_rate": 7.245082694285516e-07, + "loss": 0.0668, + "step": 7700 + }, + { + "epoch": 2.4954633830200907, + "grad_norm": 0.5190999507904053, + "learning_rate": 7.236016221513176e-07, + "loss": 0.0787, + "step": 7701 + }, + { + "epoch": 2.495787427090084, + "grad_norm": 0.518627405166626, + "learning_rate": 7.226954982638463e-07, + "loss": 0.0796, + "step": 7702 + }, + { + "epoch": 2.4961114711600776, + "grad_norm": 0.5438644886016846, + "learning_rate": 7.217898978770382e-07, + "loss": 0.0752, + "step": 7703 + }, + { + "epoch": 2.4964355152300715, + "grad_norm": 0.444247305393219, + "learning_rate": 7.208848211017305e-07, + "loss": 0.0639, + "step": 7704 + }, + { + "epoch": 2.496759559300065, + "grad_norm": 0.484861820936203, + "learning_rate": 7.199802680486956e-07, + "loss": 0.0714, + "step": 7705 + }, + { + "epoch": 2.4970836033700583, + "grad_norm": 0.47993481159210205, + "learning_rate": 7.190762388286421e-07, + "loss": 0.0697, + "step": 7706 + }, + { + "epoch": 2.4974076474400517, + "grad_norm": 0.5076626539230347, + "learning_rate": 7.181727335522154e-07, + "loss": 0.0693, + "step": 7707 + }, + { + "epoch": 2.4977316915100456, + "grad_norm": 0.510184109210968, + "learning_rate": 7.172697523299943e-07, + "loss": 0.0725, + "step": 7708 + }, + { + "epoch": 2.498055735580039, + "grad_norm": 0.46183472871780396, + "learning_rate": 7.163672952724948e-07, + "loss": 0.0663, + "step": 7709 + }, + { + "epoch": 2.4983797796500324, + "grad_norm": 0.5151199698448181, + "learning_rate": 7.154653624901697e-07, + "loss": 0.0771, + "step": 7710 + }, + { + "epoch": 2.498703823720026, + "grad_norm": 0.4487936198711395, + "learning_rate": 7.145639540934069e-07, + "loss": 0.0624, + "step": 7711 + }, + { + "epoch": 2.4990278677900193, + "grad_norm": 0.4931066036224365, + "learning_rate": 7.136630701925301e-07, + "loss": 0.075, + "step": 7712 + }, + { + "epoch": 2.499351911860013, + "grad_norm": 0.48808082938194275, + "learning_rate": 7.127627108977991e-07, + "loss": 0.0742, + "step": 7713 + }, + { + "epoch": 2.4996759559300066, + "grad_norm": 0.5426340103149414, + "learning_rate": 7.118628763194068e-07, + "loss": 0.0763, + "step": 7714 + }, + { + "epoch": 2.5, + "grad_norm": 0.458568274974823, + "learning_rate": 7.10963566567488e-07, + "loss": 0.0659, + "step": 7715 + }, + { + "epoch": 2.5003240440699934, + "grad_norm": 0.5094156265258789, + "learning_rate": 7.100647817521067e-07, + "loss": 0.0748, + "step": 7716 + }, + { + "epoch": 2.500648088139987, + "grad_norm": 0.47651737928390503, + "learning_rate": 7.09166521983266e-07, + "loss": 0.0687, + "step": 7717 + }, + { + "epoch": 2.5009721322099807, + "grad_norm": 0.5027143359184265, + "learning_rate": 7.082687873709048e-07, + "loss": 0.071, + "step": 7718 + }, + { + "epoch": 2.501296176279974, + "grad_norm": 0.5082246661186218, + "learning_rate": 7.073715780248969e-07, + "loss": 0.0727, + "step": 7719 + }, + { + "epoch": 2.5016202203499676, + "grad_norm": 0.47659170627593994, + "learning_rate": 7.064748940550531e-07, + "loss": 0.0672, + "step": 7720 + }, + { + "epoch": 2.501944264419961, + "grad_norm": 0.47924235463142395, + "learning_rate": 7.055787355711153e-07, + "loss": 0.0711, + "step": 7721 + }, + { + "epoch": 2.5022683084899544, + "grad_norm": 0.4786684811115265, + "learning_rate": 7.046831026827694e-07, + "loss": 0.0708, + "step": 7722 + }, + { + "epoch": 2.5025923525599483, + "grad_norm": 0.4756191372871399, + "learning_rate": 7.037879954996274e-07, + "loss": 0.073, + "step": 7723 + }, + { + "epoch": 2.5029163966299417, + "grad_norm": 0.495451420545578, + "learning_rate": 7.028934141312466e-07, + "loss": 0.0723, + "step": 7724 + }, + { + "epoch": 2.503240440699935, + "grad_norm": 0.47573190927505493, + "learning_rate": 7.019993586871116e-07, + "loss": 0.0725, + "step": 7725 + }, + { + "epoch": 2.5035644847699285, + "grad_norm": 0.5132853388786316, + "learning_rate": 7.01105829276647e-07, + "loss": 0.0754, + "step": 7726 + }, + { + "epoch": 2.503888528839922, + "grad_norm": 0.46969881653785706, + "learning_rate": 7.002128260092128e-07, + "loss": 0.068, + "step": 7727 + }, + { + "epoch": 2.504212572909916, + "grad_norm": 0.4756503999233246, + "learning_rate": 6.993203489941036e-07, + "loss": 0.0671, + "step": 7728 + }, + { + "epoch": 2.5045366169799093, + "grad_norm": 0.4660719037055969, + "learning_rate": 6.984283983405504e-07, + "loss": 0.0683, + "step": 7729 + }, + { + "epoch": 2.5048606610499027, + "grad_norm": 0.45893529057502747, + "learning_rate": 6.975369741577171e-07, + "loss": 0.0679, + "step": 7730 + }, + { + "epoch": 2.5051847051198965, + "grad_norm": 0.5016549229621887, + "learning_rate": 6.96646076554709e-07, + "loss": 0.0715, + "step": 7731 + }, + { + "epoch": 2.50550874918989, + "grad_norm": 0.4874078929424286, + "learning_rate": 6.957557056405606e-07, + "loss": 0.0704, + "step": 7732 + }, + { + "epoch": 2.5058327932598834, + "grad_norm": 0.46628010272979736, + "learning_rate": 6.948658615242454e-07, + "loss": 0.071, + "step": 7733 + }, + { + "epoch": 2.506156837329877, + "grad_norm": 0.4942968487739563, + "learning_rate": 6.939765443146712e-07, + "loss": 0.0715, + "step": 7734 + }, + { + "epoch": 2.5064808813998702, + "grad_norm": 0.4972381591796875, + "learning_rate": 6.930877541206832e-07, + "loss": 0.074, + "step": 7735 + }, + { + "epoch": 2.506804925469864, + "grad_norm": 0.49421438574790955, + "learning_rate": 6.921994910510599e-07, + "loss": 0.0718, + "step": 7736 + }, + { + "epoch": 2.5071289695398575, + "grad_norm": 0.5161775946617126, + "learning_rate": 6.91311755214517e-07, + "loss": 0.0766, + "step": 7737 + }, + { + "epoch": 2.507453013609851, + "grad_norm": 0.4510141611099243, + "learning_rate": 6.904245467197029e-07, + "loss": 0.0671, + "step": 7738 + }, + { + "epoch": 2.5077770576798444, + "grad_norm": 0.5020382404327393, + "learning_rate": 6.895378656752044e-07, + "loss": 0.0714, + "step": 7739 + }, + { + "epoch": 2.508101101749838, + "grad_norm": 0.4951009154319763, + "learning_rate": 6.886517121895425e-07, + "loss": 0.0765, + "step": 7740 + }, + { + "epoch": 2.5084251458198317, + "grad_norm": 0.4750005900859833, + "learning_rate": 6.877660863711744e-07, + "loss": 0.067, + "step": 7741 + }, + { + "epoch": 2.508749189889825, + "grad_norm": 0.48607584834098816, + "learning_rate": 6.86880988328491e-07, + "loss": 0.0734, + "step": 7742 + }, + { + "epoch": 2.5090732339598185, + "grad_norm": 0.49443286657333374, + "learning_rate": 6.859964181698209e-07, + "loss": 0.0723, + "step": 7743 + }, + { + "epoch": 2.509397278029812, + "grad_norm": 0.45458024740219116, + "learning_rate": 6.851123760034273e-07, + "loss": 0.0671, + "step": 7744 + }, + { + "epoch": 2.5097213220998054, + "grad_norm": 0.5252810120582581, + "learning_rate": 6.842288619375054e-07, + "loss": 0.0797, + "step": 7745 + }, + { + "epoch": 2.5100453661697992, + "grad_norm": 0.5204702019691467, + "learning_rate": 6.833458760801931e-07, + "loss": 0.0783, + "step": 7746 + }, + { + "epoch": 2.5103694102397927, + "grad_norm": 0.49748295545578003, + "learning_rate": 6.824634185395562e-07, + "loss": 0.0703, + "step": 7747 + }, + { + "epoch": 2.510693454309786, + "grad_norm": 0.49332040548324585, + "learning_rate": 6.815814894235994e-07, + "loss": 0.0734, + "step": 7748 + }, + { + "epoch": 2.5110174983797795, + "grad_norm": 0.5083654522895813, + "learning_rate": 6.807000888402631e-07, + "loss": 0.0751, + "step": 7749 + }, + { + "epoch": 2.511341542449773, + "grad_norm": 0.4894023835659027, + "learning_rate": 6.798192168974216e-07, + "loss": 0.0704, + "step": 7750 + }, + { + "epoch": 2.511665586519767, + "grad_norm": 0.4570043683052063, + "learning_rate": 6.789388737028868e-07, + "loss": 0.069, + "step": 7751 + }, + { + "epoch": 2.51198963058976, + "grad_norm": 0.44119545817375183, + "learning_rate": 6.780590593644004e-07, + "loss": 0.0645, + "step": 7752 + }, + { + "epoch": 2.5123136746597536, + "grad_norm": 0.48286327719688416, + "learning_rate": 6.771797739896479e-07, + "loss": 0.0737, + "step": 7753 + }, + { + "epoch": 2.5126377187297475, + "grad_norm": 0.48988592624664307, + "learning_rate": 6.763010176862405e-07, + "loss": 0.0723, + "step": 7754 + }, + { + "epoch": 2.512961762799741, + "grad_norm": 0.5176308751106262, + "learning_rate": 6.754227905617338e-07, + "loss": 0.074, + "step": 7755 + }, + { + "epoch": 2.5132858068697344, + "grad_norm": 0.5146079063415527, + "learning_rate": 6.745450927236119e-07, + "loss": 0.0762, + "step": 7756 + }, + { + "epoch": 2.5136098509397278, + "grad_norm": 0.5212789177894592, + "learning_rate": 6.736679242792965e-07, + "loss": 0.0735, + "step": 7757 + }, + { + "epoch": 2.513933895009721, + "grad_norm": 0.4908018708229065, + "learning_rate": 6.727912853361456e-07, + "loss": 0.07, + "step": 7758 + }, + { + "epoch": 2.514257939079715, + "grad_norm": 0.4421240985393524, + "learning_rate": 6.719151760014503e-07, + "loss": 0.0676, + "step": 7759 + }, + { + "epoch": 2.5145819831497085, + "grad_norm": 0.43792641162872314, + "learning_rate": 6.710395963824396e-07, + "loss": 0.0654, + "step": 7760 + }, + { + "epoch": 2.514906027219702, + "grad_norm": 0.4864790141582489, + "learning_rate": 6.701645465862721e-07, + "loss": 0.0687, + "step": 7761 + }, + { + "epoch": 2.5152300712896953, + "grad_norm": 0.46703585982322693, + "learning_rate": 6.692900267200509e-07, + "loss": 0.0681, + "step": 7762 + }, + { + "epoch": 2.5155541153596888, + "grad_norm": 0.44520705938339233, + "learning_rate": 6.684160368908044e-07, + "loss": 0.0639, + "step": 7763 + }, + { + "epoch": 2.5158781594296826, + "grad_norm": 0.47705909609794617, + "learning_rate": 6.675425772055022e-07, + "loss": 0.0721, + "step": 7764 + }, + { + "epoch": 2.516202203499676, + "grad_norm": 0.4667569398880005, + "learning_rate": 6.666696477710471e-07, + "loss": 0.0662, + "step": 7765 + }, + { + "epoch": 2.5165262475696695, + "grad_norm": 0.5014678239822388, + "learning_rate": 6.657972486942771e-07, + "loss": 0.0741, + "step": 7766 + }, + { + "epoch": 2.516850291639663, + "grad_norm": 0.5059800744056702, + "learning_rate": 6.649253800819655e-07, + "loss": 0.0787, + "step": 7767 + }, + { + "epoch": 2.5171743357096563, + "grad_norm": 0.5104771852493286, + "learning_rate": 6.640540420408214e-07, + "loss": 0.077, + "step": 7768 + }, + { + "epoch": 2.51749837977965, + "grad_norm": 0.44101908802986145, + "learning_rate": 6.631832346774869e-07, + "loss": 0.0642, + "step": 7769 + }, + { + "epoch": 2.5178224238496436, + "grad_norm": 0.4786303639411926, + "learning_rate": 6.623129580985404e-07, + "loss": 0.0688, + "step": 7770 + }, + { + "epoch": 2.518146467919637, + "grad_norm": 0.48153790831565857, + "learning_rate": 6.614432124104958e-07, + "loss": 0.0752, + "step": 7771 + }, + { + "epoch": 2.5184705119896305, + "grad_norm": 0.4702111780643463, + "learning_rate": 6.605739977198017e-07, + "loss": 0.0711, + "step": 7772 + }, + { + "epoch": 2.518794556059624, + "grad_norm": 0.4766141176223755, + "learning_rate": 6.597053141328414e-07, + "loss": 0.0688, + "step": 7773 + }, + { + "epoch": 2.5191186001296177, + "grad_norm": 0.5064833760261536, + "learning_rate": 6.58837161755933e-07, + "loss": 0.0755, + "step": 7774 + }, + { + "epoch": 2.519442644199611, + "grad_norm": 0.4854080379009247, + "learning_rate": 6.579695406953318e-07, + "loss": 0.0737, + "step": 7775 + }, + { + "epoch": 2.5197666882696046, + "grad_norm": 0.49226248264312744, + "learning_rate": 6.571024510572222e-07, + "loss": 0.0748, + "step": 7776 + }, + { + "epoch": 2.5200907323395985, + "grad_norm": 0.4566308856010437, + "learning_rate": 6.562358929477325e-07, + "loss": 0.0698, + "step": 7777 + }, + { + "epoch": 2.5204147764095914, + "grad_norm": 0.45967721939086914, + "learning_rate": 6.553698664729174e-07, + "loss": 0.0694, + "step": 7778 + }, + { + "epoch": 2.5207388204795853, + "grad_norm": 0.6135513782501221, + "learning_rate": 6.545043717387717e-07, + "loss": 0.0671, + "step": 7779 + }, + { + "epoch": 2.5210628645495787, + "grad_norm": 0.464465469121933, + "learning_rate": 6.536394088512227e-07, + "loss": 0.0687, + "step": 7780 + }, + { + "epoch": 2.521386908619572, + "grad_norm": 0.4690185785293579, + "learning_rate": 6.527749779161341e-07, + "loss": 0.0693, + "step": 7781 + }, + { + "epoch": 2.521710952689566, + "grad_norm": 0.5188164710998535, + "learning_rate": 6.519110790393052e-07, + "loss": 0.0748, + "step": 7782 + }, + { + "epoch": 2.5220349967595594, + "grad_norm": 0.522108256816864, + "learning_rate": 6.510477123264652e-07, + "loss": 0.0767, + "step": 7783 + }, + { + "epoch": 2.522359040829553, + "grad_norm": 0.5431333184242249, + "learning_rate": 6.501848778832864e-07, + "loss": 0.0754, + "step": 7784 + }, + { + "epoch": 2.5226830848995463, + "grad_norm": 0.503311038017273, + "learning_rate": 6.493225758153665e-07, + "loss": 0.0742, + "step": 7785 + }, + { + "epoch": 2.5230071289695397, + "grad_norm": 0.5031965374946594, + "learning_rate": 6.484608062282477e-07, + "loss": 0.0747, + "step": 7786 + }, + { + "epoch": 2.5233311730395336, + "grad_norm": 0.4736836552619934, + "learning_rate": 6.475995692273995e-07, + "loss": 0.07, + "step": 7787 + }, + { + "epoch": 2.523655217109527, + "grad_norm": 0.5007358193397522, + "learning_rate": 6.467388649182288e-07, + "loss": 0.0753, + "step": 7788 + }, + { + "epoch": 2.5239792611795204, + "grad_norm": 0.49653831124305725, + "learning_rate": 6.45878693406079e-07, + "loss": 0.0771, + "step": 7789 + }, + { + "epoch": 2.524303305249514, + "grad_norm": 0.4510502219200134, + "learning_rate": 6.450190547962254e-07, + "loss": 0.0684, + "step": 7790 + }, + { + "epoch": 2.5246273493195073, + "grad_norm": 0.4857436418533325, + "learning_rate": 6.441599491938811e-07, + "loss": 0.0701, + "step": 7791 + }, + { + "epoch": 2.524951393389501, + "grad_norm": 0.49388056993484497, + "learning_rate": 6.433013767041901e-07, + "loss": 0.073, + "step": 7792 + }, + { + "epoch": 2.5252754374594946, + "grad_norm": 0.5577260851860046, + "learning_rate": 6.424433374322347e-07, + "loss": 0.0812, + "step": 7793 + }, + { + "epoch": 2.525599481529488, + "grad_norm": 0.4625173509120941, + "learning_rate": 6.415858314830304e-07, + "loss": 0.0659, + "step": 7794 + }, + { + "epoch": 2.5259235255994814, + "grad_norm": 0.4894951283931732, + "learning_rate": 6.407288589615279e-07, + "loss": 0.0704, + "step": 7795 + }, + { + "epoch": 2.526247569669475, + "grad_norm": 0.4609914720058441, + "learning_rate": 6.398724199726114e-07, + "loss": 0.0711, + "step": 7796 + }, + { + "epoch": 2.5265716137394687, + "grad_norm": 0.49507227540016174, + "learning_rate": 6.39016514621102e-07, + "loss": 0.0731, + "step": 7797 + }, + { + "epoch": 2.526895657809462, + "grad_norm": 0.5095334053039551, + "learning_rate": 6.38161143011753e-07, + "loss": 0.0722, + "step": 7798 + }, + { + "epoch": 2.5272197018794555, + "grad_norm": 0.5216260552406311, + "learning_rate": 6.373063052492557e-07, + "loss": 0.0731, + "step": 7799 + }, + { + "epoch": 2.527543745949449, + "grad_norm": 0.44932428002357483, + "learning_rate": 6.364520014382314e-07, + "loss": 0.0667, + "step": 7800 + }, + { + "epoch": 2.5278677900194424, + "grad_norm": 0.5045766234397888, + "learning_rate": 6.355982316832393e-07, + "loss": 0.0738, + "step": 7801 + }, + { + "epoch": 2.5281918340894363, + "grad_norm": 0.46739453077316284, + "learning_rate": 6.347449960887736e-07, + "loss": 0.0658, + "step": 7802 + }, + { + "epoch": 2.5285158781594297, + "grad_norm": 0.4505472779273987, + "learning_rate": 6.338922947592607e-07, + "loss": 0.0689, + "step": 7803 + }, + { + "epoch": 2.528839922229423, + "grad_norm": 0.49780258536338806, + "learning_rate": 6.330401277990656e-07, + "loss": 0.0681, + "step": 7804 + }, + { + "epoch": 2.529163966299417, + "grad_norm": 0.46541908383369446, + "learning_rate": 6.321884953124808e-07, + "loss": 0.0649, + "step": 7805 + }, + { + "epoch": 2.5294880103694104, + "grad_norm": 0.4666377604007721, + "learning_rate": 6.313373974037423e-07, + "loss": 0.066, + "step": 7806 + }, + { + "epoch": 2.529812054439404, + "grad_norm": 0.4736870229244232, + "learning_rate": 6.304868341770127e-07, + "loss": 0.0697, + "step": 7807 + }, + { + "epoch": 2.5301360985093972, + "grad_norm": 0.5035609602928162, + "learning_rate": 6.296368057363966e-07, + "loss": 0.0786, + "step": 7808 + }, + { + "epoch": 2.5304601425793907, + "grad_norm": 0.45186638832092285, + "learning_rate": 6.287873121859251e-07, + "loss": 0.0662, + "step": 7809 + }, + { + "epoch": 2.5307841866493845, + "grad_norm": 0.4887215197086334, + "learning_rate": 6.279383536295719e-07, + "loss": 0.074, + "step": 7810 + }, + { + "epoch": 2.531108230719378, + "grad_norm": 0.4621054232120514, + "learning_rate": 6.27089930171238e-07, + "loss": 0.0661, + "step": 7811 + }, + { + "epoch": 2.5314322747893714, + "grad_norm": 0.4519784152507782, + "learning_rate": 6.262420419147641e-07, + "loss": 0.063, + "step": 7812 + }, + { + "epoch": 2.531756318859365, + "grad_norm": 0.47850343585014343, + "learning_rate": 6.25394688963924e-07, + "loss": 0.0697, + "step": 7813 + }, + { + "epoch": 2.5320803629293582, + "grad_norm": 0.4962216019630432, + "learning_rate": 6.245478714224223e-07, + "loss": 0.0735, + "step": 7814 + }, + { + "epoch": 2.532404406999352, + "grad_norm": 0.5007613301277161, + "learning_rate": 6.237015893939053e-07, + "loss": 0.0721, + "step": 7815 + }, + { + "epoch": 2.5327284510693455, + "grad_norm": 0.493139386177063, + "learning_rate": 6.228558429819459e-07, + "loss": 0.0737, + "step": 7816 + }, + { + "epoch": 2.533052495139339, + "grad_norm": 0.5349908471107483, + "learning_rate": 6.220106322900598e-07, + "loss": 0.0792, + "step": 7817 + }, + { + "epoch": 2.5333765392093324, + "grad_norm": 0.5205992460250854, + "learning_rate": 6.211659574216888e-07, + "loss": 0.0742, + "step": 7818 + }, + { + "epoch": 2.533700583279326, + "grad_norm": 0.5088434815406799, + "learning_rate": 6.20321818480214e-07, + "loss": 0.0716, + "step": 7819 + }, + { + "epoch": 2.5340246273493197, + "grad_norm": 0.44300928711891174, + "learning_rate": 6.194782155689505e-07, + "loss": 0.0666, + "step": 7820 + }, + { + "epoch": 2.534348671419313, + "grad_norm": 0.48843711614608765, + "learning_rate": 6.186351487911463e-07, + "loss": 0.068, + "step": 7821 + }, + { + "epoch": 2.5346727154893065, + "grad_norm": 0.4515663981437683, + "learning_rate": 6.17792618249986e-07, + "loss": 0.0637, + "step": 7822 + }, + { + "epoch": 2.5349967595593, + "grad_norm": 0.4966040253639221, + "learning_rate": 6.169506240485856e-07, + "loss": 0.0738, + "step": 7823 + }, + { + "epoch": 2.5353208036292934, + "grad_norm": 0.4954109787940979, + "learning_rate": 6.161091662899971e-07, + "loss": 0.0723, + "step": 7824 + }, + { + "epoch": 2.535644847699287, + "grad_norm": 0.4602881371974945, + "learning_rate": 6.152682450772074e-07, + "loss": 0.0667, + "step": 7825 + }, + { + "epoch": 2.5359688917692806, + "grad_norm": 0.4792567789554596, + "learning_rate": 6.144278605131371e-07, + "loss": 0.0699, + "step": 7826 + }, + { + "epoch": 2.536292935839274, + "grad_norm": 0.4837825298309326, + "learning_rate": 6.135880127006411e-07, + "loss": 0.0692, + "step": 7827 + }, + { + "epoch": 2.536616979909268, + "grad_norm": 0.48817309737205505, + "learning_rate": 6.127487017425088e-07, + "loss": 0.0731, + "step": 7828 + }, + { + "epoch": 2.536941023979261, + "grad_norm": 0.46039432287216187, + "learning_rate": 6.11909927741463e-07, + "loss": 0.0694, + "step": 7829 + }, + { + "epoch": 2.537265068049255, + "grad_norm": 0.5014173984527588, + "learning_rate": 6.110716908001635e-07, + "loss": 0.0774, + "step": 7830 + }, + { + "epoch": 2.537589112119248, + "grad_norm": 0.5282461047172546, + "learning_rate": 6.102339910211985e-07, + "loss": 0.0802, + "step": 7831 + }, + { + "epoch": 2.5379131561892416, + "grad_norm": 0.48522257804870605, + "learning_rate": 6.093968285070989e-07, + "loss": 0.0729, + "step": 7832 + }, + { + "epoch": 2.5382372002592355, + "grad_norm": 0.49142035841941833, + "learning_rate": 6.085602033603221e-07, + "loss": 0.0702, + "step": 7833 + }, + { + "epoch": 2.538561244329229, + "grad_norm": 0.4869705140590668, + "learning_rate": 6.077241156832641e-07, + "loss": 0.0711, + "step": 7834 + }, + { + "epoch": 2.5388852883992223, + "grad_norm": 0.49308985471725464, + "learning_rate": 6.068885655782553e-07, + "loss": 0.0686, + "step": 7835 + }, + { + "epoch": 2.5392093324692158, + "grad_norm": 0.47202742099761963, + "learning_rate": 6.060535531475548e-07, + "loss": 0.069, + "step": 7836 + }, + { + "epoch": 2.539533376539209, + "grad_norm": 0.47661447525024414, + "learning_rate": 6.052190784933648e-07, + "loss": 0.0689, + "step": 7837 + }, + { + "epoch": 2.539857420609203, + "grad_norm": 0.5208913087844849, + "learning_rate": 6.043851417178132e-07, + "loss": 0.0796, + "step": 7838 + }, + { + "epoch": 2.5401814646791965, + "grad_norm": 0.472245454788208, + "learning_rate": 6.035517429229687e-07, + "loss": 0.0729, + "step": 7839 + }, + { + "epoch": 2.54050550874919, + "grad_norm": 0.4980803430080414, + "learning_rate": 6.027188822108288e-07, + "loss": 0.0736, + "step": 7840 + }, + { + "epoch": 2.5408295528191833, + "grad_norm": 0.4821029007434845, + "learning_rate": 6.018865596833301e-07, + "loss": 0.0713, + "step": 7841 + }, + { + "epoch": 2.5411535968891767, + "grad_norm": 0.4771307706832886, + "learning_rate": 6.010547754423385e-07, + "loss": 0.0742, + "step": 7842 + }, + { + "epoch": 2.5414776409591706, + "grad_norm": 0.49014055728912354, + "learning_rate": 6.002235295896574e-07, + "loss": 0.073, + "step": 7843 + }, + { + "epoch": 2.541801685029164, + "grad_norm": 0.45903417468070984, + "learning_rate": 5.993928222270246e-07, + "loss": 0.0694, + "step": 7844 + }, + { + "epoch": 2.5421257290991575, + "grad_norm": 0.483227014541626, + "learning_rate": 5.985626534561062e-07, + "loss": 0.0713, + "step": 7845 + }, + { + "epoch": 2.542449773169151, + "grad_norm": 0.4825564920902252, + "learning_rate": 5.977330233785128e-07, + "loss": 0.0718, + "step": 7846 + }, + { + "epoch": 2.5427738172391443, + "grad_norm": 0.4853704571723938, + "learning_rate": 5.969039320957787e-07, + "loss": 0.0756, + "step": 7847 + }, + { + "epoch": 2.543097861309138, + "grad_norm": 0.4904906749725342, + "learning_rate": 5.960753797093776e-07, + "loss": 0.0702, + "step": 7848 + }, + { + "epoch": 2.5434219053791316, + "grad_norm": 0.4905846416950226, + "learning_rate": 5.952473663207176e-07, + "loss": 0.0718, + "step": 7849 + }, + { + "epoch": 2.543745949449125, + "grad_norm": 0.510238766670227, + "learning_rate": 5.944198920311378e-07, + "loss": 0.0714, + "step": 7850 + }, + { + "epoch": 2.5440699935191184, + "grad_norm": 0.5145969986915588, + "learning_rate": 5.935929569419147e-07, + "loss": 0.0745, + "step": 7851 + }, + { + "epoch": 2.544394037589112, + "grad_norm": 0.4890921711921692, + "learning_rate": 5.927665611542555e-07, + "loss": 0.0768, + "step": 7852 + }, + { + "epoch": 2.5447180816591057, + "grad_norm": 0.4760702848434448, + "learning_rate": 5.919407047693043e-07, + "loss": 0.0675, + "step": 7853 + }, + { + "epoch": 2.545042125729099, + "grad_norm": 0.4983026385307312, + "learning_rate": 5.911153878881387e-07, + "loss": 0.0748, + "step": 7854 + }, + { + "epoch": 2.5453661697990926, + "grad_norm": 0.48434123396873474, + "learning_rate": 5.902906106117673e-07, + "loss": 0.0745, + "step": 7855 + }, + { + "epoch": 2.5456902138690864, + "grad_norm": 0.47780078649520874, + "learning_rate": 5.894663730411354e-07, + "loss": 0.0673, + "step": 7856 + }, + { + "epoch": 2.54601425793908, + "grad_norm": 0.5478381514549255, + "learning_rate": 5.886426752771224e-07, + "loss": 0.0742, + "step": 7857 + }, + { + "epoch": 2.5463383020090733, + "grad_norm": 0.4975893497467041, + "learning_rate": 5.878195174205409e-07, + "loss": 0.0695, + "step": 7858 + }, + { + "epoch": 2.5466623460790667, + "grad_norm": 0.4923208951950073, + "learning_rate": 5.869968995721382e-07, + "loss": 0.0751, + "step": 7859 + }, + { + "epoch": 2.54698639014906, + "grad_norm": 0.4692155420780182, + "learning_rate": 5.861748218325919e-07, + "loss": 0.074, + "step": 7860 + }, + { + "epoch": 2.547310434219054, + "grad_norm": 0.4731508791446686, + "learning_rate": 5.8535328430252e-07, + "loss": 0.0725, + "step": 7861 + }, + { + "epoch": 2.5476344782890474, + "grad_norm": 0.4851173162460327, + "learning_rate": 5.845322870824671e-07, + "loss": 0.0687, + "step": 7862 + }, + { + "epoch": 2.547958522359041, + "grad_norm": 0.5180558562278748, + "learning_rate": 5.837118302729189e-07, + "loss": 0.077, + "step": 7863 + }, + { + "epoch": 2.5482825664290343, + "grad_norm": 0.48749464750289917, + "learning_rate": 5.828919139742894e-07, + "loss": 0.0724, + "step": 7864 + }, + { + "epoch": 2.5486066104990277, + "grad_norm": 0.48550012707710266, + "learning_rate": 5.82072538286928e-07, + "loss": 0.0689, + "step": 7865 + }, + { + "epoch": 2.5489306545690216, + "grad_norm": 0.5006141066551208, + "learning_rate": 5.812537033111193e-07, + "loss": 0.072, + "step": 7866 + }, + { + "epoch": 2.549254698639015, + "grad_norm": 0.4635670781135559, + "learning_rate": 5.804354091470809e-07, + "loss": 0.0678, + "step": 7867 + }, + { + "epoch": 2.5495787427090084, + "grad_norm": 0.5107144713401794, + "learning_rate": 5.796176558949645e-07, + "loss": 0.0728, + "step": 7868 + }, + { + "epoch": 2.549902786779002, + "grad_norm": 0.4777069687843323, + "learning_rate": 5.788004436548522e-07, + "loss": 0.0725, + "step": 7869 + }, + { + "epoch": 2.5502268308489953, + "grad_norm": 0.5170052647590637, + "learning_rate": 5.779837725267673e-07, + "loss": 0.0743, + "step": 7870 + }, + { + "epoch": 2.550550874918989, + "grad_norm": 0.5209857821464539, + "learning_rate": 5.771676426106593e-07, + "loss": 0.0756, + "step": 7871 + }, + { + "epoch": 2.5508749189889826, + "grad_norm": 0.4826715886592865, + "learning_rate": 5.763520540064149e-07, + "loss": 0.0721, + "step": 7872 + }, + { + "epoch": 2.551198963058976, + "grad_norm": 0.4898536801338196, + "learning_rate": 5.755370068138555e-07, + "loss": 0.0722, + "step": 7873 + }, + { + "epoch": 2.5515230071289694, + "grad_norm": 0.4834844470024109, + "learning_rate": 5.74722501132734e-07, + "loss": 0.0702, + "step": 7874 + }, + { + "epoch": 2.551847051198963, + "grad_norm": 0.4997037947177887, + "learning_rate": 5.739085370627384e-07, + "loss": 0.0733, + "step": 7875 + }, + { + "epoch": 2.5521710952689567, + "grad_norm": 0.5183606147766113, + "learning_rate": 5.730951147034902e-07, + "loss": 0.0755, + "step": 7876 + }, + { + "epoch": 2.55249513933895, + "grad_norm": 0.5163745880126953, + "learning_rate": 5.722822341545453e-07, + "loss": 0.0679, + "step": 7877 + }, + { + "epoch": 2.5528191834089435, + "grad_norm": 0.4591270089149475, + "learning_rate": 5.714698955153897e-07, + "loss": 0.069, + "step": 7878 + }, + { + "epoch": 2.5531432274789374, + "grad_norm": 0.5224382877349854, + "learning_rate": 5.706580988854476e-07, + "loss": 0.0775, + "step": 7879 + }, + { + "epoch": 2.5534672715489304, + "grad_norm": 0.4884330928325653, + "learning_rate": 5.698468443640753e-07, + "loss": 0.077, + "step": 7880 + }, + { + "epoch": 2.5537913156189243, + "grad_norm": 0.4916975796222687, + "learning_rate": 5.690361320505616e-07, + "loss": 0.0765, + "step": 7881 + }, + { + "epoch": 2.5541153596889177, + "grad_norm": 0.4709622859954834, + "learning_rate": 5.682259620441305e-07, + "loss": 0.069, + "step": 7882 + }, + { + "epoch": 2.554439403758911, + "grad_norm": 0.4830895960330963, + "learning_rate": 5.674163344439388e-07, + "loss": 0.0746, + "step": 7883 + }, + { + "epoch": 2.554763447828905, + "grad_norm": 0.48182427883148193, + "learning_rate": 5.66607249349077e-07, + "loss": 0.0654, + "step": 7884 + }, + { + "epoch": 2.5550874918988984, + "grad_norm": 0.4703971743583679, + "learning_rate": 5.657987068585702e-07, + "loss": 0.071, + "step": 7885 + }, + { + "epoch": 2.555411535968892, + "grad_norm": 0.5118240118026733, + "learning_rate": 5.649907070713744e-07, + "loss": 0.0697, + "step": 7886 + }, + { + "epoch": 2.5557355800388852, + "grad_norm": 0.4816184639930725, + "learning_rate": 5.641832500863814e-07, + "loss": 0.0717, + "step": 7887 + }, + { + "epoch": 2.5560596241088787, + "grad_norm": 0.4615735113620758, + "learning_rate": 5.633763360024169e-07, + "loss": 0.0636, + "step": 7888 + }, + { + "epoch": 2.5563836681788725, + "grad_norm": 0.4630856215953827, + "learning_rate": 5.625699649182392e-07, + "loss": 0.0671, + "step": 7889 + }, + { + "epoch": 2.556707712248866, + "grad_norm": 0.4773900508880615, + "learning_rate": 5.61764136932541e-07, + "loss": 0.0726, + "step": 7890 + }, + { + "epoch": 2.5570317563188594, + "grad_norm": 0.4864650368690491, + "learning_rate": 5.609588521439452e-07, + "loss": 0.0717, + "step": 7891 + }, + { + "epoch": 2.557355800388853, + "grad_norm": 0.47471946477890015, + "learning_rate": 5.601541106510144e-07, + "loss": 0.0718, + "step": 7892 + }, + { + "epoch": 2.557679844458846, + "grad_norm": 0.484334260225296, + "learning_rate": 5.593499125522372e-07, + "loss": 0.0707, + "step": 7893 + }, + { + "epoch": 2.55800388852884, + "grad_norm": 0.5048068761825562, + "learning_rate": 5.585462579460443e-07, + "loss": 0.0733, + "step": 7894 + }, + { + "epoch": 2.5583279325988335, + "grad_norm": 0.5181866884231567, + "learning_rate": 5.577431469307915e-07, + "loss": 0.0738, + "step": 7895 + }, + { + "epoch": 2.558651976668827, + "grad_norm": 0.5335702896118164, + "learning_rate": 5.569405796047733e-07, + "loss": 0.08, + "step": 7896 + }, + { + "epoch": 2.5589760207388204, + "grad_norm": 0.49718958139419556, + "learning_rate": 5.561385560662158e-07, + "loss": 0.0719, + "step": 7897 + }, + { + "epoch": 2.559300064808814, + "grad_norm": 0.46473929286003113, + "learning_rate": 5.553370764132793e-07, + "loss": 0.0647, + "step": 7898 + }, + { + "epoch": 2.5596241088788076, + "grad_norm": 0.5155867338180542, + "learning_rate": 5.545361407440581e-07, + "loss": 0.0772, + "step": 7899 + }, + { + "epoch": 2.559948152948801, + "grad_norm": 0.460317462682724, + "learning_rate": 5.537357491565759e-07, + "loss": 0.0667, + "step": 7900 + }, + { + "epoch": 2.5602721970187945, + "grad_norm": 0.4972425699234009, + "learning_rate": 5.529359017487962e-07, + "loss": 0.0748, + "step": 7901 + }, + { + "epoch": 2.560596241088788, + "grad_norm": 0.4749511480331421, + "learning_rate": 5.521365986186111e-07, + "loss": 0.0693, + "step": 7902 + }, + { + "epoch": 2.5609202851587813, + "grad_norm": 0.4954456686973572, + "learning_rate": 5.513378398638469e-07, + "loss": 0.0673, + "step": 7903 + }, + { + "epoch": 2.561244329228775, + "grad_norm": 0.475407212972641, + "learning_rate": 5.505396255822654e-07, + "loss": 0.0703, + "step": 7904 + }, + { + "epoch": 2.5615683732987686, + "grad_norm": 0.4594283401966095, + "learning_rate": 5.497419558715588e-07, + "loss": 0.0674, + "step": 7905 + }, + { + "epoch": 2.561892417368762, + "grad_norm": 0.45197737216949463, + "learning_rate": 5.489448308293554e-07, + "loss": 0.0653, + "step": 7906 + }, + { + "epoch": 2.562216461438756, + "grad_norm": 0.5138728022575378, + "learning_rate": 5.481482505532154e-07, + "loss": 0.0777, + "step": 7907 + }, + { + "epoch": 2.5625405055087493, + "grad_norm": 0.49609872698783875, + "learning_rate": 5.47352215140633e-07, + "loss": 0.0721, + "step": 7908 + }, + { + "epoch": 2.5628645495787428, + "grad_norm": 0.47917693853378296, + "learning_rate": 5.465567246890336e-07, + "loss": 0.0686, + "step": 7909 + }, + { + "epoch": 2.563188593648736, + "grad_norm": 0.48315778374671936, + "learning_rate": 5.457617792957782e-07, + "loss": 0.0702, + "step": 7910 + }, + { + "epoch": 2.5635126377187296, + "grad_norm": 0.5028765201568604, + "learning_rate": 5.449673790581611e-07, + "loss": 0.0744, + "step": 7911 + }, + { + "epoch": 2.5638366817887235, + "grad_norm": 0.48444709181785583, + "learning_rate": 5.441735240734081e-07, + "loss": 0.0714, + "step": 7912 + }, + { + "epoch": 2.564160725858717, + "grad_norm": 0.4968181252479553, + "learning_rate": 5.433802144386808e-07, + "loss": 0.0762, + "step": 7913 + }, + { + "epoch": 2.5644847699287103, + "grad_norm": 0.46512550115585327, + "learning_rate": 5.42587450251072e-07, + "loss": 0.0641, + "step": 7914 + }, + { + "epoch": 2.5648088139987038, + "grad_norm": 0.5445234179496765, + "learning_rate": 5.417952316076069e-07, + "loss": 0.0805, + "step": 7915 + }, + { + "epoch": 2.565132858068697, + "grad_norm": 0.5239993929862976, + "learning_rate": 5.410035586052481e-07, + "loss": 0.0711, + "step": 7916 + }, + { + "epoch": 2.565456902138691, + "grad_norm": 0.46684685349464417, + "learning_rate": 5.402124313408868e-07, + "loss": 0.0678, + "step": 7917 + }, + { + "epoch": 2.5657809462086845, + "grad_norm": 0.46517103910446167, + "learning_rate": 5.394218499113496e-07, + "loss": 0.0673, + "step": 7918 + }, + { + "epoch": 2.566104990278678, + "grad_norm": 0.4972522556781769, + "learning_rate": 5.386318144133961e-07, + "loss": 0.0729, + "step": 7919 + }, + { + "epoch": 2.5664290343486713, + "grad_norm": 0.5065658688545227, + "learning_rate": 5.378423249437193e-07, + "loss": 0.0784, + "step": 7920 + }, + { + "epoch": 2.5667530784186647, + "grad_norm": 0.477154940366745, + "learning_rate": 5.370533815989459e-07, + "loss": 0.0635, + "step": 7921 + }, + { + "epoch": 2.5670771224886586, + "grad_norm": 0.5431901812553406, + "learning_rate": 5.362649844756318e-07, + "loss": 0.0738, + "step": 7922 + }, + { + "epoch": 2.567401166558652, + "grad_norm": 0.4951257109642029, + "learning_rate": 5.354771336702735e-07, + "loss": 0.0708, + "step": 7923 + }, + { + "epoch": 2.5677252106286454, + "grad_norm": 0.4686160087585449, + "learning_rate": 5.346898292792919e-07, + "loss": 0.0678, + "step": 7924 + }, + { + "epoch": 2.568049254698639, + "grad_norm": 0.5159788131713867, + "learning_rate": 5.339030713990495e-07, + "loss": 0.0744, + "step": 7925 + }, + { + "epoch": 2.5683732987686323, + "grad_norm": 0.5049765706062317, + "learning_rate": 5.331168601258352e-07, + "loss": 0.0774, + "step": 7926 + }, + { + "epoch": 2.568697342838626, + "grad_norm": 0.4748237729072571, + "learning_rate": 5.323311955558746e-07, + "loss": 0.0686, + "step": 7927 + }, + { + "epoch": 2.5690213869086196, + "grad_norm": 0.4686489701271057, + "learning_rate": 5.315460777853249e-07, + "loss": 0.0702, + "step": 7928 + }, + { + "epoch": 2.569345430978613, + "grad_norm": 0.5975189805030823, + "learning_rate": 5.307615069102773e-07, + "loss": 0.0668, + "step": 7929 + }, + { + "epoch": 2.569669475048607, + "grad_norm": 0.49249792098999023, + "learning_rate": 5.299774830267573e-07, + "loss": 0.0753, + "step": 7930 + }, + { + "epoch": 2.5699935191186, + "grad_norm": 0.4822183847427368, + "learning_rate": 5.291940062307177e-07, + "loss": 0.0718, + "step": 7931 + }, + { + "epoch": 2.5703175631885937, + "grad_norm": 0.5218304395675659, + "learning_rate": 5.284110766180528e-07, + "loss": 0.0716, + "step": 7932 + }, + { + "epoch": 2.570641607258587, + "grad_norm": 0.4451057016849518, + "learning_rate": 5.276286942845832e-07, + "loss": 0.0644, + "step": 7933 + }, + { + "epoch": 2.5709656513285806, + "grad_norm": 0.45064833760261536, + "learning_rate": 5.268468593260656e-07, + "loss": 0.0674, + "step": 7934 + }, + { + "epoch": 2.5712896953985744, + "grad_norm": 0.5059220790863037, + "learning_rate": 5.260655718381885e-07, + "loss": 0.0714, + "step": 7935 + }, + { + "epoch": 2.571613739468568, + "grad_norm": 0.48486247658729553, + "learning_rate": 5.252848319165744e-07, + "loss": 0.0703, + "step": 7936 + }, + { + "epoch": 2.5719377835385613, + "grad_norm": 0.5442525744438171, + "learning_rate": 5.245046396567788e-07, + "loss": 0.0773, + "step": 7937 + }, + { + "epoch": 2.5722618276085547, + "grad_norm": 0.4933522939682007, + "learning_rate": 5.237249951542895e-07, + "loss": 0.0727, + "step": 7938 + }, + { + "epoch": 2.572585871678548, + "grad_norm": 0.49625441431999207, + "learning_rate": 5.229458985045265e-07, + "loss": 0.0754, + "step": 7939 + }, + { + "epoch": 2.572909915748542, + "grad_norm": 0.4622628390789032, + "learning_rate": 5.22167349802844e-07, + "loss": 0.0646, + "step": 7940 + }, + { + "epoch": 2.5732339598185354, + "grad_norm": 0.5263717174530029, + "learning_rate": 5.213893491445293e-07, + "loss": 0.0758, + "step": 7941 + }, + { + "epoch": 2.573558003888529, + "grad_norm": 0.46194687485694885, + "learning_rate": 5.206118966248019e-07, + "loss": 0.0685, + "step": 7942 + }, + { + "epoch": 2.5738820479585223, + "grad_norm": 0.5024887323379517, + "learning_rate": 5.198349923388146e-07, + "loss": 0.0741, + "step": 7943 + }, + { + "epoch": 2.5742060920285157, + "grad_norm": 0.48845016956329346, + "learning_rate": 5.19058636381653e-07, + "loss": 0.0737, + "step": 7944 + }, + { + "epoch": 2.5745301360985096, + "grad_norm": 0.451090544462204, + "learning_rate": 5.18282828848336e-07, + "loss": 0.0654, + "step": 7945 + }, + { + "epoch": 2.574854180168503, + "grad_norm": 0.5033639669418335, + "learning_rate": 5.175075698338128e-07, + "loss": 0.0735, + "step": 7946 + }, + { + "epoch": 2.5751782242384964, + "grad_norm": 0.47307324409484863, + "learning_rate": 5.167328594329707e-07, + "loss": 0.0698, + "step": 7947 + }, + { + "epoch": 2.57550226830849, + "grad_norm": 0.48564478754997253, + "learning_rate": 5.159586977406244e-07, + "loss": 0.0706, + "step": 7948 + }, + { + "epoch": 2.5758263123784833, + "grad_norm": 0.4771139621734619, + "learning_rate": 5.151850848515249e-07, + "loss": 0.0665, + "step": 7949 + }, + { + "epoch": 2.576150356448477, + "grad_norm": 0.4719444811344147, + "learning_rate": 5.144120208603542e-07, + "loss": 0.0702, + "step": 7950 + }, + { + "epoch": 2.5764744005184705, + "grad_norm": 0.4773831367492676, + "learning_rate": 5.136395058617289e-07, + "loss": 0.072, + "step": 7951 + }, + { + "epoch": 2.576798444588464, + "grad_norm": 0.4886631667613983, + "learning_rate": 5.12867539950197e-07, + "loss": 0.0686, + "step": 7952 + }, + { + "epoch": 2.5771224886584574, + "grad_norm": 0.48384395241737366, + "learning_rate": 5.120961232202382e-07, + "loss": 0.072, + "step": 7953 + }, + { + "epoch": 2.577446532728451, + "grad_norm": 0.5068090558052063, + "learning_rate": 5.11325255766269e-07, + "loss": 0.0766, + "step": 7954 + }, + { + "epoch": 2.5777705767984447, + "grad_norm": 0.43233636021614075, + "learning_rate": 5.10554937682633e-07, + "loss": 0.0612, + "step": 7955 + }, + { + "epoch": 2.578094620868438, + "grad_norm": 0.5227373242378235, + "learning_rate": 5.097851690636135e-07, + "loss": 0.0731, + "step": 7956 + }, + { + "epoch": 2.5784186649384315, + "grad_norm": 0.4451727271080017, + "learning_rate": 5.090159500034198e-07, + "loss": 0.0621, + "step": 7957 + }, + { + "epoch": 2.5787427090084254, + "grad_norm": 0.4760894477367401, + "learning_rate": 5.082472805961974e-07, + "loss": 0.0677, + "step": 7958 + }, + { + "epoch": 2.579066753078419, + "grad_norm": 0.5404555797576904, + "learning_rate": 5.074791609360241e-07, + "loss": 0.0773, + "step": 7959 + }, + { + "epoch": 2.5793907971484122, + "grad_norm": 0.44913235306739807, + "learning_rate": 5.067115911169113e-07, + "loss": 0.0683, + "step": 7960 + }, + { + "epoch": 2.5797148412184057, + "grad_norm": 0.5068316459655762, + "learning_rate": 5.059445712328015e-07, + "loss": 0.076, + "step": 7961 + }, + { + "epoch": 2.580038885288399, + "grad_norm": 0.4982582926750183, + "learning_rate": 5.051781013775687e-07, + "loss": 0.0736, + "step": 7962 + }, + { + "epoch": 2.580362929358393, + "grad_norm": 0.4836603105068207, + "learning_rate": 5.044121816450254e-07, + "loss": 0.0739, + "step": 7963 + }, + { + "epoch": 2.5806869734283864, + "grad_norm": 0.4805230498313904, + "learning_rate": 5.03646812128909e-07, + "loss": 0.0673, + "step": 7964 + }, + { + "epoch": 2.58101101749838, + "grad_norm": 0.45795536041259766, + "learning_rate": 5.028819929228945e-07, + "loss": 0.0672, + "step": 7965 + }, + { + "epoch": 2.5813350615683732, + "grad_norm": 0.5124770998954773, + "learning_rate": 5.021177241205894e-07, + "loss": 0.0724, + "step": 7966 + }, + { + "epoch": 2.5816591056383666, + "grad_norm": 0.4507506489753723, + "learning_rate": 5.013540058155314e-07, + "loss": 0.0649, + "step": 7967 + }, + { + "epoch": 2.5819831497083605, + "grad_norm": 0.4851083755493164, + "learning_rate": 5.005908381011926e-07, + "loss": 0.0685, + "step": 7968 + }, + { + "epoch": 2.582307193778354, + "grad_norm": 0.4840677082538605, + "learning_rate": 4.998282210709788e-07, + "loss": 0.0716, + "step": 7969 + }, + { + "epoch": 2.5826312378483474, + "grad_norm": 0.48498862981796265, + "learning_rate": 4.990661548182252e-07, + "loss": 0.0751, + "step": 7970 + }, + { + "epoch": 2.582955281918341, + "grad_norm": 0.4641924500465393, + "learning_rate": 4.983046394362013e-07, + "loss": 0.0707, + "step": 7971 + }, + { + "epoch": 2.583279325988334, + "grad_norm": 0.48013532161712646, + "learning_rate": 4.975436750181095e-07, + "loss": 0.065, + "step": 7972 + }, + { + "epoch": 2.583603370058328, + "grad_norm": 0.4640522599220276, + "learning_rate": 4.96783261657085e-07, + "loss": 0.0677, + "step": 7973 + }, + { + "epoch": 2.5839274141283215, + "grad_norm": 0.4950551688671112, + "learning_rate": 4.960233994461949e-07, + "loss": 0.0713, + "step": 7974 + }, + { + "epoch": 2.584251458198315, + "grad_norm": 0.45873478055000305, + "learning_rate": 4.952640884784387e-07, + "loss": 0.0626, + "step": 7975 + }, + { + "epoch": 2.5845755022683083, + "grad_norm": 0.5119264721870422, + "learning_rate": 4.9450532884675e-07, + "loss": 0.0767, + "step": 7976 + }, + { + "epoch": 2.5848995463383018, + "grad_norm": 0.5016889572143555, + "learning_rate": 4.937471206439903e-07, + "loss": 0.0724, + "step": 7977 + }, + { + "epoch": 2.5852235904082956, + "grad_norm": 0.4209315776824951, + "learning_rate": 4.929894639629612e-07, + "loss": 0.0608, + "step": 7978 + }, + { + "epoch": 2.585547634478289, + "grad_norm": 0.46673160791397095, + "learning_rate": 4.92232358896389e-07, + "loss": 0.0661, + "step": 7979 + }, + { + "epoch": 2.5858716785482825, + "grad_norm": 0.5124037861824036, + "learning_rate": 4.914758055369389e-07, + "loss": 0.0793, + "step": 7980 + }, + { + "epoch": 2.5861957226182763, + "grad_norm": 0.5049538612365723, + "learning_rate": 4.907198039772032e-07, + "loss": 0.076, + "step": 7981 + }, + { + "epoch": 2.5865197666882693, + "grad_norm": 0.4571637809276581, + "learning_rate": 4.899643543097104e-07, + "loss": 0.0637, + "step": 7982 + }, + { + "epoch": 2.586843810758263, + "grad_norm": 0.4785764813423157, + "learning_rate": 4.892094566269212e-07, + "loss": 0.0713, + "step": 7983 + }, + { + "epoch": 2.5871678548282566, + "grad_norm": 0.48923173546791077, + "learning_rate": 4.884551110212249e-07, + "loss": 0.0677, + "step": 7984 + }, + { + "epoch": 2.58749189889825, + "grad_norm": 0.48202604055404663, + "learning_rate": 4.877013175849493e-07, + "loss": 0.0677, + "step": 7985 + }, + { + "epoch": 2.587815942968244, + "grad_norm": 0.48565518856048584, + "learning_rate": 4.869480764103485e-07, + "loss": 0.0699, + "step": 7986 + }, + { + "epoch": 2.5881399870382373, + "grad_norm": 0.44332680106163025, + "learning_rate": 4.861953875896153e-07, + "loss": 0.0672, + "step": 7987 + }, + { + "epoch": 2.5884640311082308, + "grad_norm": 0.48864465951919556, + "learning_rate": 4.854432512148682e-07, + "loss": 0.0652, + "step": 7988 + }, + { + "epoch": 2.588788075178224, + "grad_norm": 0.5096166133880615, + "learning_rate": 4.846916673781632e-07, + "loss": 0.0732, + "step": 7989 + }, + { + "epoch": 2.5891121192482176, + "grad_norm": 0.5203856229782104, + "learning_rate": 4.839406361714865e-07, + "loss": 0.0804, + "step": 7990 + }, + { + "epoch": 2.5894361633182115, + "grad_norm": 0.45725682377815247, + "learning_rate": 4.831901576867575e-07, + "loss": 0.0688, + "step": 7991 + }, + { + "epoch": 2.589760207388205, + "grad_norm": 0.4365238845348358, + "learning_rate": 4.824402320158267e-07, + "loss": 0.0669, + "step": 7992 + }, + { + "epoch": 2.5900842514581983, + "grad_norm": 0.5002686977386475, + "learning_rate": 4.816908592504794e-07, + "loss": 0.0752, + "step": 7993 + }, + { + "epoch": 2.5904082955281917, + "grad_norm": 0.5027570128440857, + "learning_rate": 4.809420394824288e-07, + "loss": 0.0771, + "step": 7994 + }, + { + "epoch": 2.590732339598185, + "grad_norm": 0.48780351877212524, + "learning_rate": 4.801937728033251e-07, + "loss": 0.0705, + "step": 7995 + }, + { + "epoch": 2.591056383668179, + "grad_norm": 0.5296880006790161, + "learning_rate": 4.794460593047484e-07, + "loss": 0.078, + "step": 7996 + }, + { + "epoch": 2.5913804277381725, + "grad_norm": 0.5127536654472351, + "learning_rate": 4.786988990782115e-07, + "loss": 0.0731, + "step": 7997 + }, + { + "epoch": 2.591704471808166, + "grad_norm": 0.46108219027519226, + "learning_rate": 4.779522922151597e-07, + "loss": 0.0672, + "step": 7998 + }, + { + "epoch": 2.5920285158781593, + "grad_norm": 0.48438650369644165, + "learning_rate": 4.77206238806971e-07, + "loss": 0.0699, + "step": 7999 + }, + { + "epoch": 2.5923525599481527, + "grad_norm": 0.4760129153728485, + "learning_rate": 4.7646073894495546e-07, + "loss": 0.0659, + "step": 8000 + }, + { + "epoch": 2.5926766040181466, + "grad_norm": 0.495082825422287, + "learning_rate": 4.757157927203521e-07, + "loss": 0.0712, + "step": 8001 + }, + { + "epoch": 2.59300064808814, + "grad_norm": 0.4683179557323456, + "learning_rate": 4.749714002243394e-07, + "loss": 0.0674, + "step": 8002 + }, + { + "epoch": 2.5933246921581334, + "grad_norm": 0.4620020389556885, + "learning_rate": 4.742275615480202e-07, + "loss": 0.0688, + "step": 8003 + }, + { + "epoch": 2.593648736228127, + "grad_norm": 0.49512195587158203, + "learning_rate": 4.734842767824349e-07, + "loss": 0.0727, + "step": 8004 + }, + { + "epoch": 2.5939727802981203, + "grad_norm": 0.5007457137107849, + "learning_rate": 4.7274154601855524e-07, + "loss": 0.0729, + "step": 8005 + }, + { + "epoch": 2.594296824368114, + "grad_norm": 0.4940263032913208, + "learning_rate": 4.7199936934728073e-07, + "loss": 0.0749, + "step": 8006 + }, + { + "epoch": 2.5946208684381076, + "grad_norm": 0.4439502954483032, + "learning_rate": 4.712577468594515e-07, + "loss": 0.0619, + "step": 8007 + }, + { + "epoch": 2.594944912508101, + "grad_norm": 0.5139973163604736, + "learning_rate": 4.7051667864582983e-07, + "loss": 0.0727, + "step": 8008 + }, + { + "epoch": 2.595268956578095, + "grad_norm": 0.48900654911994934, + "learning_rate": 4.6977616479711997e-07, + "loss": 0.0689, + "step": 8009 + }, + { + "epoch": 2.5955930006480883, + "grad_norm": 0.5214628577232361, + "learning_rate": 4.690362054039499e-07, + "loss": 0.0753, + "step": 8010 + }, + { + "epoch": 2.5959170447180817, + "grad_norm": 0.47031134366989136, + "learning_rate": 4.682968005568872e-07, + "loss": 0.0654, + "step": 8011 + }, + { + "epoch": 2.596241088788075, + "grad_norm": 0.5013682246208191, + "learning_rate": 4.6755795034642447e-07, + "loss": 0.0743, + "step": 8012 + }, + { + "epoch": 2.5965651328580686, + "grad_norm": 0.48858991265296936, + "learning_rate": 4.6681965486299164e-07, + "loss": 0.0691, + "step": 8013 + }, + { + "epoch": 2.5968891769280624, + "grad_norm": 0.4517333209514618, + "learning_rate": 4.6608191419694803e-07, + "loss": 0.0619, + "step": 8014 + }, + { + "epoch": 2.597213220998056, + "grad_norm": 0.4958752691745758, + "learning_rate": 4.6534472843858647e-07, + "loss": 0.0731, + "step": 8015 + }, + { + "epoch": 2.5975372650680493, + "grad_norm": 0.487255722284317, + "learning_rate": 4.646080976781325e-07, + "loss": 0.067, + "step": 8016 + }, + { + "epoch": 2.5978613091380427, + "grad_norm": 0.5102446675300598, + "learning_rate": 4.638720220057402e-07, + "loss": 0.0723, + "step": 8017 + }, + { + "epoch": 2.598185353208036, + "grad_norm": 0.4882889986038208, + "learning_rate": 4.631365015114991e-07, + "loss": 0.0713, + "step": 8018 + }, + { + "epoch": 2.59850939727803, + "grad_norm": 0.4748913049697876, + "learning_rate": 4.624015362854306e-07, + "loss": 0.0638, + "step": 8019 + }, + { + "epoch": 2.5988334413480234, + "grad_norm": 0.5041070580482483, + "learning_rate": 4.61667126417486e-07, + "loss": 0.0793, + "step": 8020 + }, + { + "epoch": 2.599157485418017, + "grad_norm": 0.4840593636035919, + "learning_rate": 4.609332719975512e-07, + "loss": 0.0708, + "step": 8021 + }, + { + "epoch": 2.5994815294880103, + "grad_norm": 0.4602346420288086, + "learning_rate": 4.601999731154422e-07, + "loss": 0.0683, + "step": 8022 + }, + { + "epoch": 2.5998055735580037, + "grad_norm": 0.48712125420570374, + "learning_rate": 4.5946722986090764e-07, + "loss": 0.0751, + "step": 8023 + }, + { + "epoch": 2.6001296176279975, + "grad_norm": 0.5014486312866211, + "learning_rate": 4.587350423236292e-07, + "loss": 0.0761, + "step": 8024 + }, + { + "epoch": 2.600453661697991, + "grad_norm": 0.4699751138687134, + "learning_rate": 4.5800341059321797e-07, + "loss": 0.0704, + "step": 8025 + }, + { + "epoch": 2.6007777057679844, + "grad_norm": 0.4659191071987152, + "learning_rate": 4.572723347592195e-07, + "loss": 0.0678, + "step": 8026 + }, + { + "epoch": 2.601101749837978, + "grad_norm": 0.4622977077960968, + "learning_rate": 4.5654181491111004e-07, + "loss": 0.0683, + "step": 8027 + }, + { + "epoch": 2.6014257939079712, + "grad_norm": 0.49469050765037537, + "learning_rate": 4.558118511382986e-07, + "loss": 0.0721, + "step": 8028 + }, + { + "epoch": 2.601749837977965, + "grad_norm": 0.4948549270629883, + "learning_rate": 4.550824435301249e-07, + "loss": 0.0746, + "step": 8029 + }, + { + "epoch": 2.6020738820479585, + "grad_norm": 0.46329209208488464, + "learning_rate": 4.543535921758624e-07, + "loss": 0.0666, + "step": 8030 + }, + { + "epoch": 2.602397926117952, + "grad_norm": 0.45832377672195435, + "learning_rate": 4.5362529716471594e-07, + "loss": 0.0662, + "step": 8031 + }, + { + "epoch": 2.602721970187946, + "grad_norm": 0.46872031688690186, + "learning_rate": 4.5289755858581865e-07, + "loss": 0.0684, + "step": 8032 + }, + { + "epoch": 2.6030460142579392, + "grad_norm": 0.46378257870674133, + "learning_rate": 4.5217037652824256e-07, + "loss": 0.0694, + "step": 8033 + }, + { + "epoch": 2.6033700583279327, + "grad_norm": 0.4749624729156494, + "learning_rate": 4.514437510809855e-07, + "loss": 0.0671, + "step": 8034 + }, + { + "epoch": 2.603694102397926, + "grad_norm": 0.5038319826126099, + "learning_rate": 4.507176823329795e-07, + "loss": 0.0703, + "step": 8035 + }, + { + "epoch": 2.6040181464679195, + "grad_norm": 0.4624274969100952, + "learning_rate": 4.4999217037308864e-07, + "loss": 0.0684, + "step": 8036 + }, + { + "epoch": 2.6043421905379134, + "grad_norm": 0.46600112318992615, + "learning_rate": 4.4926721529010895e-07, + "loss": 0.0695, + "step": 8037 + }, + { + "epoch": 2.604666234607907, + "grad_norm": 0.4593249559402466, + "learning_rate": 4.485428171727685e-07, + "loss": 0.0673, + "step": 8038 + }, + { + "epoch": 2.6049902786779002, + "grad_norm": 0.46818676590919495, + "learning_rate": 4.4781897610972347e-07, + "loss": 0.0695, + "step": 8039 + }, + { + "epoch": 2.6053143227478937, + "grad_norm": 0.48558634519577026, + "learning_rate": 4.470956921895697e-07, + "loss": 0.07, + "step": 8040 + }, + { + "epoch": 2.605638366817887, + "grad_norm": 0.4467635750770569, + "learning_rate": 4.4637296550082533e-07, + "loss": 0.0635, + "step": 8041 + }, + { + "epoch": 2.605962410887881, + "grad_norm": 0.47603222727775574, + "learning_rate": 4.456507961319495e-07, + "loss": 0.0687, + "step": 8042 + }, + { + "epoch": 2.6062864549578744, + "grad_norm": 0.4740268290042877, + "learning_rate": 4.44929184171326e-07, + "loss": 0.0671, + "step": 8043 + }, + { + "epoch": 2.606610499027868, + "grad_norm": 0.45359960198402405, + "learning_rate": 4.442081297072731e-07, + "loss": 0.0666, + "step": 8044 + }, + { + "epoch": 2.606934543097861, + "grad_norm": 0.527958333492279, + "learning_rate": 4.434876328280424e-07, + "loss": 0.0725, + "step": 8045 + }, + { + "epoch": 2.6072585871678546, + "grad_norm": 0.5175161957740784, + "learning_rate": 4.42767693621815e-07, + "loss": 0.0733, + "step": 8046 + }, + { + "epoch": 2.6075826312378485, + "grad_norm": 0.4354493021965027, + "learning_rate": 4.42048312176705e-07, + "loss": 0.0649, + "step": 8047 + }, + { + "epoch": 2.607906675307842, + "grad_norm": 0.5269905924797058, + "learning_rate": 4.413294885807562e-07, + "loss": 0.0789, + "step": 8048 + }, + { + "epoch": 2.6082307193778353, + "grad_norm": 0.5139177441596985, + "learning_rate": 4.4061122292194725e-07, + "loss": 0.072, + "step": 8049 + }, + { + "epoch": 2.6085547634478288, + "grad_norm": 0.4772281348705292, + "learning_rate": 4.398935152881856e-07, + "loss": 0.07, + "step": 8050 + }, + { + "epoch": 2.608878807517822, + "grad_norm": 0.4745042026042938, + "learning_rate": 4.391763657673126e-07, + "loss": 0.0686, + "step": 8051 + }, + { + "epoch": 2.609202851587816, + "grad_norm": 0.512739896774292, + "learning_rate": 4.384597744471009e-07, + "loss": 0.0761, + "step": 8052 + }, + { + "epoch": 2.6095268956578095, + "grad_norm": 0.49782106280326843, + "learning_rate": 4.37743741415253e-07, + "loss": 0.0722, + "step": 8053 + }, + { + "epoch": 2.609850939727803, + "grad_norm": 0.5118449330329895, + "learning_rate": 4.37028266759405e-07, + "loss": 0.0726, + "step": 8054 + }, + { + "epoch": 2.6101749837977968, + "grad_norm": 0.5125095844268799, + "learning_rate": 4.363133505671252e-07, + "loss": 0.0733, + "step": 8055 + }, + { + "epoch": 2.6104990278677898, + "grad_norm": 0.48923948407173157, + "learning_rate": 4.355989929259108e-07, + "loss": 0.069, + "step": 8056 + }, + { + "epoch": 2.6108230719377836, + "grad_norm": 0.49759066104888916, + "learning_rate": 4.348851939231924e-07, + "loss": 0.0753, + "step": 8057 + }, + { + "epoch": 2.611147116007777, + "grad_norm": 0.4820518493652344, + "learning_rate": 4.341719536463329e-07, + "loss": 0.0719, + "step": 8058 + }, + { + "epoch": 2.6114711600777705, + "grad_norm": 0.49614572525024414, + "learning_rate": 4.3345927218262583e-07, + "loss": 0.0686, + "step": 8059 + }, + { + "epoch": 2.6117952041477643, + "grad_norm": 0.48075005412101746, + "learning_rate": 4.3274714961929643e-07, + "loss": 0.0714, + "step": 8060 + }, + { + "epoch": 2.6121192482177578, + "grad_norm": 0.5026248693466187, + "learning_rate": 4.320355860435005e-07, + "loss": 0.0741, + "step": 8061 + }, + { + "epoch": 2.612443292287751, + "grad_norm": 0.47251930832862854, + "learning_rate": 4.313245815423289e-07, + "loss": 0.071, + "step": 8062 + }, + { + "epoch": 2.6127673363577446, + "grad_norm": 0.5004166960716248, + "learning_rate": 4.3061413620279825e-07, + "loss": 0.0743, + "step": 8063 + }, + { + "epoch": 2.613091380427738, + "grad_norm": 0.5230023264884949, + "learning_rate": 4.2990425011186443e-07, + "loss": 0.0813, + "step": 8064 + }, + { + "epoch": 2.613415424497732, + "grad_norm": 0.4912107586860657, + "learning_rate": 4.2919492335640744e-07, + "loss": 0.0718, + "step": 8065 + }, + { + "epoch": 2.6137394685677253, + "grad_norm": 0.47911426424980164, + "learning_rate": 4.284861560232428e-07, + "loss": 0.0704, + "step": 8066 + }, + { + "epoch": 2.6140635126377187, + "grad_norm": 0.5090230703353882, + "learning_rate": 4.2777794819911733e-07, + "loss": 0.0757, + "step": 8067 + }, + { + "epoch": 2.614387556707712, + "grad_norm": 0.45187073945999146, + "learning_rate": 4.270702999707083e-07, + "loss": 0.0649, + "step": 8068 + }, + { + "epoch": 2.6147116007777056, + "grad_norm": 0.488574355840683, + "learning_rate": 4.263632114246263e-07, + "loss": 0.0716, + "step": 8069 + }, + { + "epoch": 2.6150356448476995, + "grad_norm": 0.48780202865600586, + "learning_rate": 4.25656682647409e-07, + "loss": 0.0708, + "step": 8070 + }, + { + "epoch": 2.615359688917693, + "grad_norm": 0.4608360230922699, + "learning_rate": 4.2495071372553263e-07, + "loss": 0.0722, + "step": 8071 + }, + { + "epoch": 2.6156837329876863, + "grad_norm": 0.5059040188789368, + "learning_rate": 4.242453047453976e-07, + "loss": 0.075, + "step": 8072 + }, + { + "epoch": 2.6160077770576797, + "grad_norm": 0.47739937901496887, + "learning_rate": 4.235404557933409e-07, + "loss": 0.0655, + "step": 8073 + }, + { + "epoch": 2.616331821127673, + "grad_norm": 0.47227174043655396, + "learning_rate": 4.2283616695562856e-07, + "loss": 0.0666, + "step": 8074 + }, + { + "epoch": 2.616655865197667, + "grad_norm": 0.4968879222869873, + "learning_rate": 4.221324383184594e-07, + "loss": 0.0717, + "step": 8075 + }, + { + "epoch": 2.6169799092676604, + "grad_norm": 0.48942700028419495, + "learning_rate": 4.214292699679623e-07, + "loss": 0.0737, + "step": 8076 + }, + { + "epoch": 2.617303953337654, + "grad_norm": 0.47408753633499146, + "learning_rate": 4.20726661990199e-07, + "loss": 0.0686, + "step": 8077 + }, + { + "epoch": 2.6176279974076473, + "grad_norm": 0.5070276856422424, + "learning_rate": 4.2002461447116174e-07, + "loss": 0.0726, + "step": 8078 + }, + { + "epoch": 2.6179520414776407, + "grad_norm": 0.47806498408317566, + "learning_rate": 4.1932312749677353e-07, + "loss": 0.0693, + "step": 8079 + }, + { + "epoch": 2.6182760855476346, + "grad_norm": 0.47707250714302063, + "learning_rate": 4.186222011528901e-07, + "loss": 0.0685, + "step": 8080 + }, + { + "epoch": 2.618600129617628, + "grad_norm": 0.5167728066444397, + "learning_rate": 4.179218355252984e-07, + "loss": 0.0764, + "step": 8081 + }, + { + "epoch": 2.6189241736876214, + "grad_norm": 0.49237996339797974, + "learning_rate": 4.1722203069971547e-07, + "loss": 0.0696, + "step": 8082 + }, + { + "epoch": 2.6192482177576153, + "grad_norm": 0.4985239505767822, + "learning_rate": 4.165227867617916e-07, + "loss": 0.0772, + "step": 8083 + }, + { + "epoch": 2.6195722618276087, + "grad_norm": 0.4949018359184265, + "learning_rate": 4.158241037971078e-07, + "loss": 0.071, + "step": 8084 + }, + { + "epoch": 2.619896305897602, + "grad_norm": 0.44384217262268066, + "learning_rate": 4.15125981891174e-07, + "loss": 0.0629, + "step": 8085 + }, + { + "epoch": 2.6202203499675956, + "grad_norm": 0.45753324031829834, + "learning_rate": 4.1442842112943635e-07, + "loss": 0.0643, + "step": 8086 + }, + { + "epoch": 2.620544394037589, + "grad_norm": 0.4839048981666565, + "learning_rate": 4.1373142159726766e-07, + "loss": 0.0717, + "step": 8087 + }, + { + "epoch": 2.620868438107583, + "grad_norm": 0.49162375926971436, + "learning_rate": 4.1303498337997407e-07, + "loss": 0.076, + "step": 8088 + }, + { + "epoch": 2.6211924821775763, + "grad_norm": 0.5309725999832153, + "learning_rate": 4.123391065627935e-07, + "loss": 0.0811, + "step": 8089 + }, + { + "epoch": 2.6215165262475697, + "grad_norm": 0.45731958746910095, + "learning_rate": 4.116437912308946e-07, + "loss": 0.0646, + "step": 8090 + }, + { + "epoch": 2.621840570317563, + "grad_norm": 0.5169887542724609, + "learning_rate": 4.1094903746937755e-07, + "loss": 0.0755, + "step": 8091 + }, + { + "epoch": 2.6221646143875565, + "grad_norm": 0.4816588759422302, + "learning_rate": 4.10254845363271e-07, + "loss": 0.0692, + "step": 8092 + }, + { + "epoch": 2.6224886584575504, + "grad_norm": 0.48074889183044434, + "learning_rate": 4.095612149975409e-07, + "loss": 0.0708, + "step": 8093 + }, + { + "epoch": 2.622812702527544, + "grad_norm": 0.5060257315635681, + "learning_rate": 4.0886814645707765e-07, + "loss": 0.0706, + "step": 8094 + }, + { + "epoch": 2.6231367465975373, + "grad_norm": 0.49364620447158813, + "learning_rate": 4.081756398267089e-07, + "loss": 0.067, + "step": 8095 + }, + { + "epoch": 2.6234607906675307, + "grad_norm": 0.4569275677204132, + "learning_rate": 4.0748369519118926e-07, + "loss": 0.0688, + "step": 8096 + }, + { + "epoch": 2.623784834737524, + "grad_norm": 0.47600606083869934, + "learning_rate": 4.067923126352058e-07, + "loss": 0.0685, + "step": 8097 + }, + { + "epoch": 2.624108878807518, + "grad_norm": 0.491263210773468, + "learning_rate": 4.061014922433781e-07, + "loss": 0.0678, + "step": 8098 + }, + { + "epoch": 2.6244329228775114, + "grad_norm": 0.47192656993865967, + "learning_rate": 4.054112341002553e-07, + "loss": 0.0696, + "step": 8099 + }, + { + "epoch": 2.624756966947505, + "grad_norm": 0.511062502861023, + "learning_rate": 4.047215382903191e-07, + "loss": 0.0808, + "step": 8100 + }, + { + "epoch": 2.6250810110174982, + "grad_norm": 0.5204436779022217, + "learning_rate": 4.040324048979788e-07, + "loss": 0.076, + "step": 8101 + }, + { + "epoch": 2.6254050550874917, + "grad_norm": 0.4860605001449585, + "learning_rate": 4.0334383400758184e-07, + "loss": 0.0718, + "step": 8102 + }, + { + "epoch": 2.6257290991574855, + "grad_norm": 0.48059844970703125, + "learning_rate": 4.026558257033997e-07, + "loss": 0.0709, + "step": 8103 + }, + { + "epoch": 2.626053143227479, + "grad_norm": 0.48472920060157776, + "learning_rate": 4.019683800696389e-07, + "loss": 0.064, + "step": 8104 + }, + { + "epoch": 2.6263771872974724, + "grad_norm": 0.48184579610824585, + "learning_rate": 4.0128149719043554e-07, + "loss": 0.0717, + "step": 8105 + }, + { + "epoch": 2.6267012313674662, + "grad_norm": 0.5004919171333313, + "learning_rate": 4.0059517714985786e-07, + "loss": 0.071, + "step": 8106 + }, + { + "epoch": 2.6270252754374592, + "grad_norm": 0.4796049892902374, + "learning_rate": 3.9990942003190535e-07, + "loss": 0.0696, + "step": 8107 + }, + { + "epoch": 2.627349319507453, + "grad_norm": 0.4539622366428375, + "learning_rate": 3.9922422592050704e-07, + "loss": 0.067, + "step": 8108 + }, + { + "epoch": 2.6276733635774465, + "grad_norm": 0.5464158654212952, + "learning_rate": 3.985395948995258e-07, + "loss": 0.0761, + "step": 8109 + }, + { + "epoch": 2.62799740764744, + "grad_norm": 0.4742254316806793, + "learning_rate": 3.978555270527512e-07, + "loss": 0.0722, + "step": 8110 + }, + { + "epoch": 2.628321451717434, + "grad_norm": 0.45420706272125244, + "learning_rate": 3.9717202246390807e-07, + "loss": 0.0678, + "step": 8111 + }, + { + "epoch": 2.6286454957874272, + "grad_norm": 0.4992641508579254, + "learning_rate": 3.964890812166505e-07, + "loss": 0.0748, + "step": 8112 + }, + { + "epoch": 2.6289695398574207, + "grad_norm": 0.48898693919181824, + "learning_rate": 3.9580670339456393e-07, + "loss": 0.0748, + "step": 8113 + }, + { + "epoch": 2.629293583927414, + "grad_norm": 0.519314706325531, + "learning_rate": 3.951248890811649e-07, + "loss": 0.0756, + "step": 8114 + }, + { + "epoch": 2.6296176279974075, + "grad_norm": 0.4789257049560547, + "learning_rate": 3.9444363835990207e-07, + "loss": 0.0706, + "step": 8115 + }, + { + "epoch": 2.6299416720674014, + "grad_norm": 0.5213720202445984, + "learning_rate": 3.9376295131415056e-07, + "loss": 0.0741, + "step": 8116 + }, + { + "epoch": 2.630265716137395, + "grad_norm": 0.4598376452922821, + "learning_rate": 3.9308282802722365e-07, + "loss": 0.0639, + "step": 8117 + }, + { + "epoch": 2.630589760207388, + "grad_norm": 0.48026615381240845, + "learning_rate": 3.924032685823581e-07, + "loss": 0.0698, + "step": 8118 + }, + { + "epoch": 2.6309138042773816, + "grad_norm": 0.4799683392047882, + "learning_rate": 3.917242730627296e-07, + "loss": 0.0684, + "step": 8119 + }, + { + "epoch": 2.631237848347375, + "grad_norm": 0.46403399109840393, + "learning_rate": 3.910458415514379e-07, + "loss": 0.0709, + "step": 8120 + }, + { + "epoch": 2.631561892417369, + "grad_norm": 0.5058795809745789, + "learning_rate": 3.9036797413151693e-07, + "loss": 0.0774, + "step": 8121 + }, + { + "epoch": 2.6318859364873624, + "grad_norm": 0.4726886749267578, + "learning_rate": 3.896906708859322e-07, + "loss": 0.0653, + "step": 8122 + }, + { + "epoch": 2.6322099805573558, + "grad_norm": 0.49544644355773926, + "learning_rate": 3.8901393189757607e-07, + "loss": 0.07, + "step": 8123 + }, + { + "epoch": 2.632534024627349, + "grad_norm": 0.490854948759079, + "learning_rate": 3.883377572492786e-07, + "loss": 0.0727, + "step": 8124 + }, + { + "epoch": 2.6328580686973426, + "grad_norm": 0.456512451171875, + "learning_rate": 3.8766214702379344e-07, + "loss": 0.0669, + "step": 8125 + }, + { + "epoch": 2.6331821127673365, + "grad_norm": 0.4973542392253876, + "learning_rate": 3.8698710130381237e-07, + "loss": 0.0727, + "step": 8126 + }, + { + "epoch": 2.63350615683733, + "grad_norm": 0.49359431862831116, + "learning_rate": 3.863126201719519e-07, + "loss": 0.0714, + "step": 8127 + }, + { + "epoch": 2.6338302009073233, + "grad_norm": 0.46149107813835144, + "learning_rate": 3.8563870371076283e-07, + "loss": 0.066, + "step": 8128 + }, + { + "epoch": 2.6341542449773168, + "grad_norm": 0.4933355748653412, + "learning_rate": 3.8496535200272635e-07, + "loss": 0.0693, + "step": 8129 + }, + { + "epoch": 2.63447828904731, + "grad_norm": 0.5131371021270752, + "learning_rate": 3.842925651302531e-07, + "loss": 0.0802, + "step": 8130 + }, + { + "epoch": 2.634802333117304, + "grad_norm": 0.5407600998878479, + "learning_rate": 3.836203431756874e-07, + "loss": 0.0793, + "step": 8131 + }, + { + "epoch": 2.6351263771872975, + "grad_norm": 0.48129767179489136, + "learning_rate": 3.8294868622130056e-07, + "loss": 0.0675, + "step": 8132 + }, + { + "epoch": 2.635450421257291, + "grad_norm": 0.5019082427024841, + "learning_rate": 3.82277594349299e-07, + "loss": 0.0712, + "step": 8133 + }, + { + "epoch": 2.6357744653272848, + "grad_norm": 0.47515133023262024, + "learning_rate": 3.8160706764181596e-07, + "loss": 0.0664, + "step": 8134 + }, + { + "epoch": 2.636098509397278, + "grad_norm": 0.49677157402038574, + "learning_rate": 3.8093710618091915e-07, + "loss": 0.0709, + "step": 8135 + }, + { + "epoch": 2.6364225534672716, + "grad_norm": 0.450471967458725, + "learning_rate": 3.802677100486035e-07, + "loss": 0.0704, + "step": 8136 + }, + { + "epoch": 2.636746597537265, + "grad_norm": 0.5206382870674133, + "learning_rate": 3.795988793267985e-07, + "loss": 0.0744, + "step": 8137 + }, + { + "epoch": 2.6370706416072585, + "grad_norm": 0.5079112648963928, + "learning_rate": 3.7893061409736143e-07, + "loss": 0.0708, + "step": 8138 + }, + { + "epoch": 2.6373946856772523, + "grad_norm": 0.444327712059021, + "learning_rate": 3.782629144420824e-07, + "loss": 0.065, + "step": 8139 + }, + { + "epoch": 2.6377187297472457, + "grad_norm": 0.4883805811405182, + "learning_rate": 3.775957804426794e-07, + "loss": 0.0723, + "step": 8140 + }, + { + "epoch": 2.638042773817239, + "grad_norm": 0.511352002620697, + "learning_rate": 3.7692921218080604e-07, + "loss": 0.0783, + "step": 8141 + }, + { + "epoch": 2.6383668178872326, + "grad_norm": 0.4845573306083679, + "learning_rate": 3.762632097380414e-07, + "loss": 0.0685, + "step": 8142 + }, + { + "epoch": 2.638690861957226, + "grad_norm": 0.46998390555381775, + "learning_rate": 3.7559777319589873e-07, + "loss": 0.0694, + "step": 8143 + }, + { + "epoch": 2.63901490602722, + "grad_norm": 0.4673968553543091, + "learning_rate": 3.749329026358212e-07, + "loss": 0.0666, + "step": 8144 + }, + { + "epoch": 2.6393389500972133, + "grad_norm": 0.48253533244132996, + "learning_rate": 3.7426859813918194e-07, + "loss": 0.0725, + "step": 8145 + }, + { + "epoch": 2.6396629941672067, + "grad_norm": 0.48341622948646545, + "learning_rate": 3.7360485978728653e-07, + "loss": 0.0734, + "step": 8146 + }, + { + "epoch": 2.6399870382372, + "grad_norm": 0.465791791677475, + "learning_rate": 3.7294168766136786e-07, + "loss": 0.0655, + "step": 8147 + }, + { + "epoch": 2.6403110823071936, + "grad_norm": 0.5187360644340515, + "learning_rate": 3.7227908184259476e-07, + "loss": 0.0767, + "step": 8148 + }, + { + "epoch": 2.6406351263771874, + "grad_norm": 0.4869755804538727, + "learning_rate": 3.716170424120608e-07, + "loss": 0.0703, + "step": 8149 + }, + { + "epoch": 2.640959170447181, + "grad_norm": 0.4809172749519348, + "learning_rate": 3.70955569450796e-07, + "loss": 0.0704, + "step": 8150 + }, + { + "epoch": 2.6412832145171743, + "grad_norm": 0.4652535319328308, + "learning_rate": 3.702946630397564e-07, + "loss": 0.068, + "step": 8151 + }, + { + "epoch": 2.6416072585871677, + "grad_norm": 0.507232129573822, + "learning_rate": 3.696343232598304e-07, + "loss": 0.0731, + "step": 8152 + }, + { + "epoch": 2.641931302657161, + "grad_norm": 0.4860020577907562, + "learning_rate": 3.6897455019183903e-07, + "loss": 0.071, + "step": 8153 + }, + { + "epoch": 2.642255346727155, + "grad_norm": 0.4917910099029541, + "learning_rate": 3.6831534391652935e-07, + "loss": 0.0679, + "step": 8154 + }, + { + "epoch": 2.6425793907971484, + "grad_norm": 0.5022282004356384, + "learning_rate": 3.676567045145851e-07, + "loss": 0.0706, + "step": 8155 + }, + { + "epoch": 2.642903434867142, + "grad_norm": 0.44981980323791504, + "learning_rate": 3.669986320666136e-07, + "loss": 0.0684, + "step": 8156 + }, + { + "epoch": 2.6432274789371357, + "grad_norm": 0.5006513595581055, + "learning_rate": 3.663411266531608e-07, + "loss": 0.0726, + "step": 8157 + }, + { + "epoch": 2.6435515230071287, + "grad_norm": 0.5052984356880188, + "learning_rate": 3.6568418835469523e-07, + "loss": 0.0732, + "step": 8158 + }, + { + "epoch": 2.6438755670771226, + "grad_norm": 0.5216200947761536, + "learning_rate": 3.6502781725162194e-07, + "loss": 0.0751, + "step": 8159 + }, + { + "epoch": 2.644199611147116, + "grad_norm": 0.4888712763786316, + "learning_rate": 3.6437201342427396e-07, + "loss": 0.0681, + "step": 8160 + }, + { + "epoch": 2.6445236552171094, + "grad_norm": 0.480238139629364, + "learning_rate": 3.6371677695291485e-07, + "loss": 0.0677, + "step": 8161 + }, + { + "epoch": 2.6448476992871033, + "grad_norm": 0.5127687454223633, + "learning_rate": 3.6306210791773933e-07, + "loss": 0.076, + "step": 8162 + }, + { + "epoch": 2.6451717433570967, + "grad_norm": 0.4455661177635193, + "learning_rate": 3.6240800639887384e-07, + "loss": 0.0652, + "step": 8163 + }, + { + "epoch": 2.64549578742709, + "grad_norm": 0.5006566643714905, + "learning_rate": 3.6175447247637217e-07, + "loss": 0.0706, + "step": 8164 + }, + { + "epoch": 2.6458198314970836, + "grad_norm": 0.476716011762619, + "learning_rate": 3.611015062302214e-07, + "loss": 0.0699, + "step": 8165 + }, + { + "epoch": 2.646143875567077, + "grad_norm": 0.4877867102622986, + "learning_rate": 3.6044910774033826e-07, + "loss": 0.0664, + "step": 8166 + }, + { + "epoch": 2.646467919637071, + "grad_norm": 0.46837756037712097, + "learning_rate": 3.5979727708656984e-07, + "loss": 0.0636, + "step": 8167 + }, + { + "epoch": 2.6467919637070643, + "grad_norm": 0.47957879304885864, + "learning_rate": 3.591460143486941e-07, + "loss": 0.0679, + "step": 8168 + }, + { + "epoch": 2.6471160077770577, + "grad_norm": 0.4841154217720032, + "learning_rate": 3.584953196064195e-07, + "loss": 0.0719, + "step": 8169 + }, + { + "epoch": 2.647440051847051, + "grad_norm": 0.5195610523223877, + "learning_rate": 3.5784519293938555e-07, + "loss": 0.0712, + "step": 8170 + }, + { + "epoch": 2.6477640959170445, + "grad_norm": 0.4716542959213257, + "learning_rate": 3.571956344271582e-07, + "loss": 0.0694, + "step": 8171 + }, + { + "epoch": 2.6480881399870384, + "grad_norm": 0.4820329546928406, + "learning_rate": 3.56546644149241e-07, + "loss": 0.0683, + "step": 8172 + }, + { + "epoch": 2.648412184057032, + "grad_norm": 0.5184935331344604, + "learning_rate": 3.558982221850621e-07, + "loss": 0.081, + "step": 8173 + }, + { + "epoch": 2.6487362281270252, + "grad_norm": 0.4963172376155853, + "learning_rate": 3.5525036861398244e-07, + "loss": 0.0699, + "step": 8174 + }, + { + "epoch": 2.6490602721970187, + "grad_norm": 0.499603271484375, + "learning_rate": 3.5460308351529247e-07, + "loss": 0.074, + "step": 8175 + }, + { + "epoch": 2.649384316267012, + "grad_norm": 0.48043859004974365, + "learning_rate": 3.5395636696821443e-07, + "loss": 0.0697, + "step": 8176 + }, + { + "epoch": 2.649708360337006, + "grad_norm": 0.5435832738876343, + "learning_rate": 3.5331021905190055e-07, + "loss": 0.0798, + "step": 8177 + }, + { + "epoch": 2.6500324044069994, + "grad_norm": 0.5596851706504822, + "learning_rate": 3.5266463984543145e-07, + "loss": 0.0686, + "step": 8178 + }, + { + "epoch": 2.650356448476993, + "grad_norm": 0.4937089681625366, + "learning_rate": 3.5201962942782165e-07, + "loss": 0.0711, + "step": 8179 + }, + { + "epoch": 2.6506804925469862, + "grad_norm": 0.4810419976711273, + "learning_rate": 3.5137518787801193e-07, + "loss": 0.0665, + "step": 8180 + }, + { + "epoch": 2.6510045366169797, + "grad_norm": 0.5074557662010193, + "learning_rate": 3.507313152748787e-07, + "loss": 0.0773, + "step": 8181 + }, + { + "epoch": 2.6513285806869735, + "grad_norm": 0.47159427404403687, + "learning_rate": 3.5008801169722275e-07, + "loss": 0.0709, + "step": 8182 + }, + { + "epoch": 2.651652624756967, + "grad_norm": 0.4898209273815155, + "learning_rate": 3.4944527722378e-07, + "loss": 0.0722, + "step": 8183 + }, + { + "epoch": 2.6519766688269604, + "grad_norm": 0.46519505977630615, + "learning_rate": 3.488031119332147e-07, + "loss": 0.0649, + "step": 8184 + }, + { + "epoch": 2.6523007128969542, + "grad_norm": 0.5056127309799194, + "learning_rate": 3.4816151590412075e-07, + "loss": 0.0744, + "step": 8185 + }, + { + "epoch": 2.6526247569669477, + "grad_norm": 0.550579309463501, + "learning_rate": 3.4752048921502525e-07, + "loss": 0.0801, + "step": 8186 + }, + { + "epoch": 2.652948801036941, + "grad_norm": 0.47031712532043457, + "learning_rate": 3.468800319443805e-07, + "loss": 0.0693, + "step": 8187 + }, + { + "epoch": 2.6532728451069345, + "grad_norm": 0.5020474791526794, + "learning_rate": 3.462401441705759e-07, + "loss": 0.0738, + "step": 8188 + }, + { + "epoch": 2.653596889176928, + "grad_norm": 0.49726903438568115, + "learning_rate": 3.4560082597192515e-07, + "loss": 0.0734, + "step": 8189 + }, + { + "epoch": 2.653920933246922, + "grad_norm": 0.4900558590888977, + "learning_rate": 3.4496207742667485e-07, + "loss": 0.071, + "step": 8190 + }, + { + "epoch": 2.654244977316915, + "grad_norm": 0.4857633411884308, + "learning_rate": 3.443238986130021e-07, + "loss": 0.0692, + "step": 8191 + }, + { + "epoch": 2.6545690213869086, + "grad_norm": 0.5156083106994629, + "learning_rate": 3.4368628960901427e-07, + "loss": 0.0737, + "step": 8192 + }, + { + "epoch": 2.654893065456902, + "grad_norm": 0.5220831036567688, + "learning_rate": 3.430492504927474e-07, + "loss": 0.0785, + "step": 8193 + }, + { + "epoch": 2.6552171095268955, + "grad_norm": 0.5222134590148926, + "learning_rate": 3.4241278134217017e-07, + "loss": 0.0737, + "step": 8194 + }, + { + "epoch": 2.6555411535968894, + "grad_norm": 0.5155477523803711, + "learning_rate": 3.417768822351791e-07, + "loss": 0.0743, + "step": 8195 + }, + { + "epoch": 2.655865197666883, + "grad_norm": 0.4736745059490204, + "learning_rate": 3.4114155324960263e-07, + "loss": 0.0689, + "step": 8196 + }, + { + "epoch": 2.656189241736876, + "grad_norm": 0.5249689817428589, + "learning_rate": 3.4050679446319847e-07, + "loss": 0.0729, + "step": 8197 + }, + { + "epoch": 2.6565132858068696, + "grad_norm": 0.49144700169563293, + "learning_rate": 3.3987260595365556e-07, + "loss": 0.0701, + "step": 8198 + }, + { + "epoch": 2.656837329876863, + "grad_norm": 0.46199938654899597, + "learning_rate": 3.3923898779859186e-07, + "loss": 0.0647, + "step": 8199 + }, + { + "epoch": 2.657161373946857, + "grad_norm": 0.48296552896499634, + "learning_rate": 3.38605940075557e-07, + "loss": 0.0729, + "step": 8200 + }, + { + "epoch": 2.6574854180168503, + "grad_norm": 0.48330217599868774, + "learning_rate": 3.3797346286202957e-07, + "loss": 0.0689, + "step": 8201 + }, + { + "epoch": 2.6578094620868438, + "grad_norm": 0.48326781392097473, + "learning_rate": 3.373415562354165e-07, + "loss": 0.0712, + "step": 8202 + }, + { + "epoch": 2.658133506156837, + "grad_norm": 0.48442551493644714, + "learning_rate": 3.36710220273061e-07, + "loss": 0.0728, + "step": 8203 + }, + { + "epoch": 2.6584575502268306, + "grad_norm": 0.44781193137168884, + "learning_rate": 3.360794550522295e-07, + "loss": 0.0632, + "step": 8204 + }, + { + "epoch": 2.6587815942968245, + "grad_norm": 0.5006281733512878, + "learning_rate": 3.3544926065012253e-07, + "loss": 0.0711, + "step": 8205 + }, + { + "epoch": 2.659105638366818, + "grad_norm": 0.4574485123157501, + "learning_rate": 3.3481963714386943e-07, + "loss": 0.0668, + "step": 8206 + }, + { + "epoch": 2.6594296824368113, + "grad_norm": 0.46295037865638733, + "learning_rate": 3.3419058461053087e-07, + "loss": 0.0637, + "step": 8207 + }, + { + "epoch": 2.659753726506805, + "grad_norm": 0.6541327238082886, + "learning_rate": 3.335621031270964e-07, + "loss": 0.0779, + "step": 8208 + }, + { + "epoch": 2.660077770576798, + "grad_norm": 0.4976052939891815, + "learning_rate": 3.329341927704843e-07, + "loss": 0.0729, + "step": 8209 + }, + { + "epoch": 2.660401814646792, + "grad_norm": 0.5218827724456787, + "learning_rate": 3.3230685361754833e-07, + "loss": 0.0747, + "step": 8210 + }, + { + "epoch": 2.6607258587167855, + "grad_norm": 0.4913540780544281, + "learning_rate": 3.316800857450647e-07, + "loss": 0.0704, + "step": 8211 + }, + { + "epoch": 2.661049902786779, + "grad_norm": 0.5734509825706482, + "learning_rate": 3.310538892297477e-07, + "loss": 0.0847, + "step": 8212 + }, + { + "epoch": 2.6613739468567728, + "grad_norm": 0.4933965504169464, + "learning_rate": 3.304282641482348e-07, + "loss": 0.0725, + "step": 8213 + }, + { + "epoch": 2.661697990926766, + "grad_norm": 0.49286365509033203, + "learning_rate": 3.298032105770971e-07, + "loss": 0.0736, + "step": 8214 + }, + { + "epoch": 2.6620220349967596, + "grad_norm": 0.5159487724304199, + "learning_rate": 3.2917872859283606e-07, + "loss": 0.0752, + "step": 8215 + }, + { + "epoch": 2.662346079066753, + "grad_norm": 0.4922940135002136, + "learning_rate": 3.285548182718812e-07, + "loss": 0.0717, + "step": 8216 + }, + { + "epoch": 2.6626701231367464, + "grad_norm": 0.523423969745636, + "learning_rate": 3.2793147969059413e-07, + "loss": 0.079, + "step": 8217 + }, + { + "epoch": 2.6629941672067403, + "grad_norm": 0.4571373760700226, + "learning_rate": 3.2730871292526446e-07, + "loss": 0.0692, + "step": 8218 + }, + { + "epoch": 2.6633182112767337, + "grad_norm": 0.5341159701347351, + "learning_rate": 3.2668651805211285e-07, + "loss": 0.0738, + "step": 8219 + }, + { + "epoch": 2.663642255346727, + "grad_norm": 0.4489809572696686, + "learning_rate": 3.2606489514729e-07, + "loss": 0.0666, + "step": 8220 + }, + { + "epoch": 2.6639662994167206, + "grad_norm": 0.47798335552215576, + "learning_rate": 3.2544384428687736e-07, + "loss": 0.0703, + "step": 8221 + }, + { + "epoch": 2.664290343486714, + "grad_norm": 0.49070990085601807, + "learning_rate": 3.2482336554688465e-07, + "loss": 0.0717, + "step": 8222 + }, + { + "epoch": 2.664614387556708, + "grad_norm": 0.501501202583313, + "learning_rate": 3.2420345900325277e-07, + "loss": 0.0746, + "step": 8223 + }, + { + "epoch": 2.6649384316267013, + "grad_norm": 0.525124192237854, + "learning_rate": 3.235841247318522e-07, + "loss": 0.0718, + "step": 8224 + }, + { + "epoch": 2.6652624756966947, + "grad_norm": 0.4553992748260498, + "learning_rate": 3.229653628084845e-07, + "loss": 0.0676, + "step": 8225 + }, + { + "epoch": 2.665586519766688, + "grad_norm": 0.5091498494148254, + "learning_rate": 3.2234717330887844e-07, + "loss": 0.0738, + "step": 8226 + }, + { + "epoch": 2.6659105638366816, + "grad_norm": 0.5092705488204956, + "learning_rate": 3.2172955630869527e-07, + "loss": 0.0766, + "step": 8227 + }, + { + "epoch": 2.6662346079066754, + "grad_norm": 0.5618413686752319, + "learning_rate": 3.211125118835251e-07, + "loss": 0.0806, + "step": 8228 + }, + { + "epoch": 2.666558651976669, + "grad_norm": 0.5213515162467957, + "learning_rate": 3.204960401088886e-07, + "loss": 0.0792, + "step": 8229 + }, + { + "epoch": 2.6668826960466623, + "grad_norm": 0.5249233841896057, + "learning_rate": 3.198801410602359e-07, + "loss": 0.0776, + "step": 8230 + }, + { + "epoch": 2.6672067401166557, + "grad_norm": 0.47902482748031616, + "learning_rate": 3.192648148129457e-07, + "loss": 0.071, + "step": 8231 + }, + { + "epoch": 2.667530784186649, + "grad_norm": 0.5230550169944763, + "learning_rate": 3.1865006144233047e-07, + "loss": 0.075, + "step": 8232 + }, + { + "epoch": 2.667854828256643, + "grad_norm": 0.5067039728164673, + "learning_rate": 3.1803588102362724e-07, + "loss": 0.0729, + "step": 8233 + }, + { + "epoch": 2.6681788723266364, + "grad_norm": 0.46065452694892883, + "learning_rate": 3.1742227363200927e-07, + "loss": 0.0675, + "step": 8234 + }, + { + "epoch": 2.66850291639663, + "grad_norm": 0.4955891966819763, + "learning_rate": 3.1680923934257256e-07, + "loss": 0.0702, + "step": 8235 + }, + { + "epoch": 2.6688269604666237, + "grad_norm": 0.5458259582519531, + "learning_rate": 3.1619677823034875e-07, + "loss": 0.0796, + "step": 8236 + }, + { + "epoch": 2.669151004536617, + "grad_norm": 0.5087775588035583, + "learning_rate": 3.1558489037029626e-07, + "loss": 0.0716, + "step": 8237 + }, + { + "epoch": 2.6694750486066106, + "grad_norm": 0.4729219377040863, + "learning_rate": 3.149735758373046e-07, + "loss": 0.0679, + "step": 8238 + }, + { + "epoch": 2.669799092676604, + "grad_norm": 0.4986790418624878, + "learning_rate": 3.143628347061939e-07, + "loss": 0.0724, + "step": 8239 + }, + { + "epoch": 2.6701231367465974, + "grad_norm": 0.4468283951282501, + "learning_rate": 3.1375266705170935e-07, + "loss": 0.065, + "step": 8240 + }, + { + "epoch": 2.6704471808165913, + "grad_norm": 0.5075255632400513, + "learning_rate": 3.1314307294853405e-07, + "loss": 0.0789, + "step": 8241 + }, + { + "epoch": 2.6707712248865847, + "grad_norm": 0.4599548876285553, + "learning_rate": 3.1253405247127387e-07, + "loss": 0.0657, + "step": 8242 + }, + { + "epoch": 2.671095268956578, + "grad_norm": 0.4865701496601105, + "learning_rate": 3.1192560569446697e-07, + "loss": 0.0707, + "step": 8243 + }, + { + "epoch": 2.6714193130265715, + "grad_norm": 0.5049477815628052, + "learning_rate": 3.1131773269258204e-07, + "loss": 0.0748, + "step": 8244 + }, + { + "epoch": 2.671743357096565, + "grad_norm": 0.4652611315250397, + "learning_rate": 3.1071043354001626e-07, + "loss": 0.0669, + "step": 8245 + }, + { + "epoch": 2.672067401166559, + "grad_norm": 0.48552364110946655, + "learning_rate": 3.1010370831109806e-07, + "loss": 0.0734, + "step": 8246 + }, + { + "epoch": 2.6723914452365523, + "grad_norm": 0.5223255157470703, + "learning_rate": 3.09497557080084e-07, + "loss": 0.0742, + "step": 8247 + }, + { + "epoch": 2.6727154893065457, + "grad_norm": 0.4967799782752991, + "learning_rate": 3.088919799211626e-07, + "loss": 0.0751, + "step": 8248 + }, + { + "epoch": 2.673039533376539, + "grad_norm": 0.47687217593193054, + "learning_rate": 3.0828697690844787e-07, + "loss": 0.0653, + "step": 8249 + }, + { + "epoch": 2.6733635774465325, + "grad_norm": 0.534401535987854, + "learning_rate": 3.076825481159884e-07, + "loss": 0.0766, + "step": 8250 + }, + { + "epoch": 2.6736876215165264, + "grad_norm": 0.4932291507720947, + "learning_rate": 3.0707869361776e-07, + "loss": 0.0724, + "step": 8251 + }, + { + "epoch": 2.67401166558652, + "grad_norm": 0.5111968517303467, + "learning_rate": 3.0647541348766796e-07, + "loss": 0.072, + "step": 8252 + }, + { + "epoch": 2.6743357096565132, + "grad_norm": 0.4867503046989441, + "learning_rate": 3.058727077995488e-07, + "loss": 0.0682, + "step": 8253 + }, + { + "epoch": 2.6746597537265067, + "grad_norm": 0.4913567304611206, + "learning_rate": 3.052705766271674e-07, + "loss": 0.0723, + "step": 8254 + }, + { + "epoch": 2.6749837977965, + "grad_norm": 0.4733397960662842, + "learning_rate": 3.046690200442193e-07, + "loss": 0.0675, + "step": 8255 + }, + { + "epoch": 2.675307841866494, + "grad_norm": 0.48818185925483704, + "learning_rate": 3.040680381243294e-07, + "loss": 0.0729, + "step": 8256 + }, + { + "epoch": 2.6756318859364874, + "grad_norm": 0.44772180914878845, + "learning_rate": 3.0346763094105057e-07, + "loss": 0.0678, + "step": 8257 + }, + { + "epoch": 2.675955930006481, + "grad_norm": 0.4892347753047943, + "learning_rate": 3.0286779856786795e-07, + "loss": 0.0697, + "step": 8258 + }, + { + "epoch": 2.6762799740764747, + "grad_norm": 0.49410268664360046, + "learning_rate": 3.022685410781945e-07, + "loss": 0.073, + "step": 8259 + }, + { + "epoch": 2.6766040181464676, + "grad_norm": 0.5007535219192505, + "learning_rate": 3.016698585453748e-07, + "loss": 0.0717, + "step": 8260 + }, + { + "epoch": 2.6769280622164615, + "grad_norm": 0.4490124583244324, + "learning_rate": 3.010717510426814e-07, + "loss": 0.0611, + "step": 8261 + }, + { + "epoch": 2.677252106286455, + "grad_norm": 0.4742376506328583, + "learning_rate": 3.0047421864331516e-07, + "loss": 0.065, + "step": 8262 + }, + { + "epoch": 2.6775761503564484, + "grad_norm": 0.4849986732006073, + "learning_rate": 2.9987726142041096e-07, + "loss": 0.0707, + "step": 8263 + }, + { + "epoch": 2.6779001944264422, + "grad_norm": 0.5073363780975342, + "learning_rate": 2.9928087944702754e-07, + "loss": 0.0708, + "step": 8264 + }, + { + "epoch": 2.6782242384964356, + "grad_norm": 0.4616087079048157, + "learning_rate": 2.986850727961599e-07, + "loss": 0.0656, + "step": 8265 + }, + { + "epoch": 2.678548282566429, + "grad_norm": 0.5197352170944214, + "learning_rate": 2.980898415407257e-07, + "loss": 0.0719, + "step": 8266 + }, + { + "epoch": 2.6788723266364225, + "grad_norm": 0.46726688742637634, + "learning_rate": 2.9749518575357796e-07, + "loss": 0.064, + "step": 8267 + }, + { + "epoch": 2.679196370706416, + "grad_norm": 0.4749407470226288, + "learning_rate": 2.96901105507495e-07, + "loss": 0.0665, + "step": 8268 + }, + { + "epoch": 2.67952041477641, + "grad_norm": 0.4720465838909149, + "learning_rate": 2.963076008751875e-07, + "loss": 0.0684, + "step": 8269 + }, + { + "epoch": 2.679844458846403, + "grad_norm": 0.46473428606987, + "learning_rate": 2.957146719292947e-07, + "loss": 0.0676, + "step": 8270 + }, + { + "epoch": 2.6801685029163966, + "grad_norm": 0.4980733394622803, + "learning_rate": 2.9512231874238404e-07, + "loss": 0.0741, + "step": 8271 + }, + { + "epoch": 2.68049254698639, + "grad_norm": 0.48748525977134705, + "learning_rate": 2.945305413869559e-07, + "loss": 0.0674, + "step": 8272 + }, + { + "epoch": 2.6808165910563835, + "grad_norm": 0.47855398058891296, + "learning_rate": 2.9393933993543675e-07, + "loss": 0.0706, + "step": 8273 + }, + { + "epoch": 2.6811406351263773, + "grad_norm": 0.475934237241745, + "learning_rate": 2.9334871446018375e-07, + "loss": 0.0691, + "step": 8274 + }, + { + "epoch": 2.6814646791963708, + "grad_norm": 0.47559282183647156, + "learning_rate": 2.927586650334846e-07, + "loss": 0.0702, + "step": 8275 + }, + { + "epoch": 2.681788723266364, + "grad_norm": 0.5388659834861755, + "learning_rate": 2.9216919172755485e-07, + "loss": 0.0819, + "step": 8276 + }, + { + "epoch": 2.6821127673363576, + "grad_norm": 0.4871777892112732, + "learning_rate": 2.9158029461454075e-07, + "loss": 0.0722, + "step": 8277 + }, + { + "epoch": 2.682436811406351, + "grad_norm": 0.4801523983478546, + "learning_rate": 2.909919737665179e-07, + "loss": 0.0706, + "step": 8278 + }, + { + "epoch": 2.682760855476345, + "grad_norm": 0.5030548572540283, + "learning_rate": 2.9040422925549097e-07, + "loss": 0.074, + "step": 8279 + }, + { + "epoch": 2.6830848995463383, + "grad_norm": 0.49741634726524353, + "learning_rate": 2.898170611533935e-07, + "loss": 0.0729, + "step": 8280 + }, + { + "epoch": 2.6834089436163318, + "grad_norm": 0.4476093351840973, + "learning_rate": 2.8923046953208964e-07, + "loss": 0.0647, + "step": 8281 + }, + { + "epoch": 2.683732987686325, + "grad_norm": 0.45768454670906067, + "learning_rate": 2.8864445446337264e-07, + "loss": 0.0721, + "step": 8282 + }, + { + "epoch": 2.6840570317563186, + "grad_norm": 0.49686095118522644, + "learning_rate": 2.8805901601896446e-07, + "loss": 0.0685, + "step": 8283 + }, + { + "epoch": 2.6843810758263125, + "grad_norm": 0.44313284754753113, + "learning_rate": 2.874741542705178e-07, + "loss": 0.063, + "step": 8284 + }, + { + "epoch": 2.684705119896306, + "grad_norm": 0.5290567278862, + "learning_rate": 2.868898692896149e-07, + "loss": 0.0732, + "step": 8285 + }, + { + "epoch": 2.6850291639662993, + "grad_norm": 0.5113586783409119, + "learning_rate": 2.8630616114776413e-07, + "loss": 0.0752, + "step": 8286 + }, + { + "epoch": 2.685353208036293, + "grad_norm": 0.47347286343574524, + "learning_rate": 2.857230299164082e-07, + "loss": 0.0649, + "step": 8287 + }, + { + "epoch": 2.6856772521062866, + "grad_norm": 0.4577503800392151, + "learning_rate": 2.851404756669146e-07, + "loss": 0.0634, + "step": 8288 + }, + { + "epoch": 2.68600129617628, + "grad_norm": 0.48355886340141296, + "learning_rate": 2.8455849847058457e-07, + "loss": 0.0718, + "step": 8289 + }, + { + "epoch": 2.6863253402462735, + "grad_norm": 0.515175998210907, + "learning_rate": 2.839770983986445e-07, + "loss": 0.0788, + "step": 8290 + }, + { + "epoch": 2.686649384316267, + "grad_norm": 0.4759681224822998, + "learning_rate": 2.8339627552225304e-07, + "loss": 0.071, + "step": 8291 + }, + { + "epoch": 2.6869734283862607, + "grad_norm": 0.5212849378585815, + "learning_rate": 2.8281602991249825e-07, + "loss": 0.0756, + "step": 8292 + }, + { + "epoch": 2.687297472456254, + "grad_norm": 0.5105800628662109, + "learning_rate": 2.822363616403939e-07, + "loss": 0.0703, + "step": 8293 + }, + { + "epoch": 2.6876215165262476, + "grad_norm": 0.4698103368282318, + "learning_rate": 2.8165727077688887e-07, + "loss": 0.0706, + "step": 8294 + }, + { + "epoch": 2.687945560596241, + "grad_norm": 0.5046882033348083, + "learning_rate": 2.8107875739285474e-07, + "loss": 0.0749, + "step": 8295 + }, + { + "epoch": 2.6882696046662344, + "grad_norm": 0.4877139925956726, + "learning_rate": 2.805008215591004e-07, + "loss": 0.072, + "step": 8296 + }, + { + "epoch": 2.6885936487362283, + "grad_norm": 0.5047754049301147, + "learning_rate": 2.79923463346356e-07, + "loss": 0.0734, + "step": 8297 + }, + { + "epoch": 2.6889176928062217, + "grad_norm": 0.48555463552474976, + "learning_rate": 2.7934668282528554e-07, + "loss": 0.069, + "step": 8298 + }, + { + "epoch": 2.689241736876215, + "grad_norm": 0.5083166360855103, + "learning_rate": 2.78770480066482e-07, + "loss": 0.0747, + "step": 8299 + }, + { + "epoch": 2.6895657809462086, + "grad_norm": 0.47517675161361694, + "learning_rate": 2.781948551404667e-07, + "loss": 0.0701, + "step": 8300 + }, + { + "epoch": 2.689889825016202, + "grad_norm": 0.4648648798465729, + "learning_rate": 2.7761980811769063e-07, + "loss": 0.0688, + "step": 8301 + }, + { + "epoch": 2.690213869086196, + "grad_norm": 0.4912257492542267, + "learning_rate": 2.770453390685335e-07, + "loss": 0.0716, + "step": 8302 + }, + { + "epoch": 2.6905379131561893, + "grad_norm": 0.468051940202713, + "learning_rate": 2.764714480633057e-07, + "loss": 0.0738, + "step": 8303 + }, + { + "epoch": 2.6908619572261827, + "grad_norm": 0.433032363653183, + "learning_rate": 2.7589813517224504e-07, + "loss": 0.0663, + "step": 8304 + }, + { + "epoch": 2.691186001296176, + "grad_norm": 0.49341756105422974, + "learning_rate": 2.753254004655198e-07, + "loss": 0.0732, + "step": 8305 + }, + { + "epoch": 2.6915100453661696, + "grad_norm": 0.4939447045326233, + "learning_rate": 2.747532440132272e-07, + "loss": 0.075, + "step": 8306 + }, + { + "epoch": 2.6918340894361634, + "grad_norm": 0.4643614888191223, + "learning_rate": 2.741816658853935e-07, + "loss": 0.0675, + "step": 8307 + }, + { + "epoch": 2.692158133506157, + "grad_norm": 0.5280596613883972, + "learning_rate": 2.736106661519744e-07, + "loss": 0.077, + "step": 8308 + }, + { + "epoch": 2.6924821775761503, + "grad_norm": 0.5050312280654907, + "learning_rate": 2.730402448828551e-07, + "loss": 0.0751, + "step": 8309 + }, + { + "epoch": 2.692806221646144, + "grad_norm": 0.4410359859466553, + "learning_rate": 2.724704021478486e-07, + "loss": 0.0645, + "step": 8310 + }, + { + "epoch": 2.693130265716137, + "grad_norm": 0.50943523645401, + "learning_rate": 2.719011380166997e-07, + "loss": 0.0777, + "step": 8311 + }, + { + "epoch": 2.693454309786131, + "grad_norm": 0.5053391456604004, + "learning_rate": 2.7133245255907937e-07, + "loss": 0.073, + "step": 8312 + }, + { + "epoch": 2.6937783538561244, + "grad_norm": 0.4879259467124939, + "learning_rate": 2.7076434584458964e-07, + "loss": 0.0719, + "step": 8313 + }, + { + "epoch": 2.694102397926118, + "grad_norm": 0.5183456540107727, + "learning_rate": 2.7019681794276166e-07, + "loss": 0.0746, + "step": 8314 + }, + { + "epoch": 2.6944264419961117, + "grad_norm": 0.46866390109062195, + "learning_rate": 2.6962986892305533e-07, + "loss": 0.0697, + "step": 8315 + }, + { + "epoch": 2.694750486066105, + "grad_norm": 0.5022212862968445, + "learning_rate": 2.6906349885486015e-07, + "loss": 0.071, + "step": 8316 + }, + { + "epoch": 2.6950745301360985, + "grad_norm": 0.5619996190071106, + "learning_rate": 2.6849770780749186e-07, + "loss": 0.0739, + "step": 8317 + }, + { + "epoch": 2.695398574206092, + "grad_norm": 0.4827101528644562, + "learning_rate": 2.6793249585020163e-07, + "loss": 0.071, + "step": 8318 + }, + { + "epoch": 2.6957226182760854, + "grad_norm": 0.5225281119346619, + "learning_rate": 2.67367863052162e-07, + "loss": 0.0777, + "step": 8319 + }, + { + "epoch": 2.6960466623460793, + "grad_norm": 0.5356566309928894, + "learning_rate": 2.6680380948248207e-07, + "loss": 0.0765, + "step": 8320 + }, + { + "epoch": 2.6963707064160727, + "grad_norm": 0.5039767026901245, + "learning_rate": 2.6624033521019443e-07, + "loss": 0.0732, + "step": 8321 + }, + { + "epoch": 2.696694750486066, + "grad_norm": 0.45116445422172546, + "learning_rate": 2.6567744030426335e-07, + "loss": 0.0641, + "step": 8322 + }, + { + "epoch": 2.6970187945560595, + "grad_norm": 0.4902065396308899, + "learning_rate": 2.6511512483358204e-07, + "loss": 0.0717, + "step": 8323 + }, + { + "epoch": 2.697342838626053, + "grad_norm": 0.47688916325569153, + "learning_rate": 2.6455338886697155e-07, + "loss": 0.0712, + "step": 8324 + }, + { + "epoch": 2.697666882696047, + "grad_norm": 0.5470159649848938, + "learning_rate": 2.639922324731847e-07, + "loss": 0.077, + "step": 8325 + }, + { + "epoch": 2.6979909267660402, + "grad_norm": 0.46849918365478516, + "learning_rate": 2.6343165572089936e-07, + "loss": 0.0709, + "step": 8326 + }, + { + "epoch": 2.6983149708360337, + "grad_norm": 0.4926462173461914, + "learning_rate": 2.6287165867872666e-07, + "loss": 0.0685, + "step": 8327 + }, + { + "epoch": 2.698639014906027, + "grad_norm": 0.4713653028011322, + "learning_rate": 2.623122414152035e-07, + "loss": 0.0684, + "step": 8328 + }, + { + "epoch": 2.6989630589760205, + "grad_norm": 0.49385493993759155, + "learning_rate": 2.617534039987979e-07, + "loss": 0.0718, + "step": 8329 + }, + { + "epoch": 2.6992871030460144, + "grad_norm": 0.47010338306427, + "learning_rate": 2.6119514649790566e-07, + "loss": 0.0692, + "step": 8330 + }, + { + "epoch": 2.699611147116008, + "grad_norm": 0.49910011887550354, + "learning_rate": 2.606374689808522e-07, + "loss": 0.0716, + "step": 8331 + }, + { + "epoch": 2.6999351911860012, + "grad_norm": 0.49807921051979065, + "learning_rate": 2.600803715158917e-07, + "loss": 0.0721, + "step": 8332 + }, + { + "epoch": 2.7002592352559946, + "grad_norm": 0.4971463084220886, + "learning_rate": 2.5952385417120864e-07, + "loss": 0.0752, + "step": 8333 + }, + { + "epoch": 2.700583279325988, + "grad_norm": 0.5131683945655823, + "learning_rate": 2.589679170149145e-07, + "loss": 0.0728, + "step": 8334 + }, + { + "epoch": 2.700907323395982, + "grad_norm": 0.49370115995407104, + "learning_rate": 2.5841256011505e-07, + "loss": 0.0714, + "step": 8335 + }, + { + "epoch": 2.7012313674659754, + "grad_norm": 0.46200138330459595, + "learning_rate": 2.578577835395857e-07, + "loss": 0.0659, + "step": 8336 + }, + { + "epoch": 2.701555411535969, + "grad_norm": 0.47938281297683716, + "learning_rate": 2.5730358735642167e-07, + "loss": 0.0712, + "step": 8337 + }, + { + "epoch": 2.7018794556059627, + "grad_norm": 0.4824425280094147, + "learning_rate": 2.567499716333854e-07, + "loss": 0.0709, + "step": 8338 + }, + { + "epoch": 2.702203499675956, + "grad_norm": 0.47668445110321045, + "learning_rate": 2.561969364382344e-07, + "loss": 0.0707, + "step": 8339 + }, + { + "epoch": 2.7025275437459495, + "grad_norm": 0.5181357860565186, + "learning_rate": 2.556444818386555e-07, + "loss": 0.069, + "step": 8340 + }, + { + "epoch": 2.702851587815943, + "grad_norm": 0.4696907103061676, + "learning_rate": 2.5509260790226195e-07, + "loss": 0.0706, + "step": 8341 + }, + { + "epoch": 2.7031756318859363, + "grad_norm": 0.5065804719924927, + "learning_rate": 2.5454131469660027e-07, + "loss": 0.074, + "step": 8342 + }, + { + "epoch": 2.70349967595593, + "grad_norm": 0.4924456477165222, + "learning_rate": 2.539906022891414e-07, + "loss": 0.0714, + "step": 8343 + }, + { + "epoch": 2.7038237200259236, + "grad_norm": 0.48366984724998474, + "learning_rate": 2.534404707472876e-07, + "loss": 0.0749, + "step": 8344 + }, + { + "epoch": 2.704147764095917, + "grad_norm": 0.48156607151031494, + "learning_rate": 2.5289092013837e-07, + "loss": 0.0647, + "step": 8345 + }, + { + "epoch": 2.7044718081659105, + "grad_norm": 0.49813225865364075, + "learning_rate": 2.5234195052964814e-07, + "loss": 0.0717, + "step": 8346 + }, + { + "epoch": 2.704795852235904, + "grad_norm": 0.5039176940917969, + "learning_rate": 2.5179356198831164e-07, + "loss": 0.0767, + "step": 8347 + }, + { + "epoch": 2.7051198963058978, + "grad_norm": 0.49406805634498596, + "learning_rate": 2.512457545814756e-07, + "loss": 0.0698, + "step": 8348 + }, + { + "epoch": 2.705443940375891, + "grad_norm": 0.5206747055053711, + "learning_rate": 2.5069852837618866e-07, + "loss": 0.0771, + "step": 8349 + }, + { + "epoch": 2.7057679844458846, + "grad_norm": 0.4882453978061676, + "learning_rate": 2.5015188343942397e-07, + "loss": 0.0715, + "step": 8350 + }, + { + "epoch": 2.706092028515878, + "grad_norm": 0.49470728635787964, + "learning_rate": 2.4960581983808796e-07, + "loss": 0.0739, + "step": 8351 + }, + { + "epoch": 2.7064160725858715, + "grad_norm": 0.4713766574859619, + "learning_rate": 2.490603376390116e-07, + "loss": 0.0692, + "step": 8352 + }, + { + "epoch": 2.7067401166558653, + "grad_norm": 0.5327262282371521, + "learning_rate": 2.4851543690895706e-07, + "loss": 0.069, + "step": 8353 + }, + { + "epoch": 2.7070641607258588, + "grad_norm": 0.5107653141021729, + "learning_rate": 2.479711177146155e-07, + "loss": 0.075, + "step": 8354 + }, + { + "epoch": 2.707388204795852, + "grad_norm": 0.5149986743927002, + "learning_rate": 2.474273801226051e-07, + "loss": 0.0793, + "step": 8355 + }, + { + "epoch": 2.7077122488658456, + "grad_norm": 0.4663291275501251, + "learning_rate": 2.4688422419947623e-07, + "loss": 0.0712, + "step": 8356 + }, + { + "epoch": 2.708036292935839, + "grad_norm": 0.49470773339271545, + "learning_rate": 2.4634165001170327e-07, + "loss": 0.0696, + "step": 8357 + }, + { + "epoch": 2.708360337005833, + "grad_norm": 0.5378709435462952, + "learning_rate": 2.4579965762569436e-07, + "loss": 0.0801, + "step": 8358 + }, + { + "epoch": 2.7086843810758263, + "grad_norm": 0.4698738157749176, + "learning_rate": 2.45258247107783e-07, + "loss": 0.0709, + "step": 8359 + }, + { + "epoch": 2.7090084251458197, + "grad_norm": 0.5029922723770142, + "learning_rate": 2.447174185242324e-07, + "loss": 0.074, + "step": 8360 + }, + { + "epoch": 2.7093324692158136, + "grad_norm": 0.45825907588005066, + "learning_rate": 2.4417717194123504e-07, + "loss": 0.0686, + "step": 8361 + }, + { + "epoch": 2.709656513285807, + "grad_norm": 0.47053924202919006, + "learning_rate": 2.43637507424912e-07, + "loss": 0.0706, + "step": 8362 + }, + { + "epoch": 2.7099805573558005, + "grad_norm": 0.48093780875205994, + "learning_rate": 2.4309842504131266e-07, + "loss": 0.0733, + "step": 8363 + }, + { + "epoch": 2.710304601425794, + "grad_norm": 0.4751356244087219, + "learning_rate": 2.4255992485641644e-07, + "loss": 0.0667, + "step": 8364 + }, + { + "epoch": 2.7106286454957873, + "grad_norm": 0.5154885053634644, + "learning_rate": 2.4202200693612955e-07, + "loss": 0.0762, + "step": 8365 + }, + { + "epoch": 2.710952689565781, + "grad_norm": 0.484735906124115, + "learning_rate": 2.4148467134628816e-07, + "loss": 0.0711, + "step": 8366 + }, + { + "epoch": 2.7112767336357746, + "grad_norm": 0.4831327199935913, + "learning_rate": 2.4094791815265637e-07, + "loss": 0.0698, + "step": 8367 + }, + { + "epoch": 2.711600777705768, + "grad_norm": 0.5157594680786133, + "learning_rate": 2.404117474209289e-07, + "loss": 0.0777, + "step": 8368 + }, + { + "epoch": 2.7119248217757614, + "grad_norm": 0.5025253891944885, + "learning_rate": 2.3987615921672645e-07, + "loss": 0.0759, + "step": 8369 + }, + { + "epoch": 2.712248865845755, + "grad_norm": 0.505915105342865, + "learning_rate": 2.3934115360560116e-07, + "loss": 0.0746, + "step": 8370 + }, + { + "epoch": 2.7125729099157487, + "grad_norm": 0.5080195665359497, + "learning_rate": 2.388067306530323e-07, + "loss": 0.0707, + "step": 8371 + }, + { + "epoch": 2.712896953985742, + "grad_norm": 0.4918041527271271, + "learning_rate": 2.382728904244269e-07, + "loss": 0.0711, + "step": 8372 + }, + { + "epoch": 2.7132209980557356, + "grad_norm": 0.5486384630203247, + "learning_rate": 2.3773963298512338e-07, + "loss": 0.0787, + "step": 8373 + }, + { + "epoch": 2.713545042125729, + "grad_norm": 0.47039252519607544, + "learning_rate": 2.372069584003861e-07, + "loss": 0.0696, + "step": 8374 + }, + { + "epoch": 2.7138690861957224, + "grad_norm": 0.47299376130104065, + "learning_rate": 2.3667486673540963e-07, + "loss": 0.0719, + "step": 8375 + }, + { + "epoch": 2.7141931302657163, + "grad_norm": 0.4835331439971924, + "learning_rate": 2.3614335805531686e-07, + "loss": 0.0684, + "step": 8376 + }, + { + "epoch": 2.7145171743357097, + "grad_norm": 0.47559183835983276, + "learning_rate": 2.3561243242515907e-07, + "loss": 0.0712, + "step": 8377 + }, + { + "epoch": 2.714841218405703, + "grad_norm": 0.49402961134910583, + "learning_rate": 2.3508208990991764e-07, + "loss": 0.069, + "step": 8378 + }, + { + "epoch": 2.7151652624756966, + "grad_norm": 0.4470484256744385, + "learning_rate": 2.3455233057449899e-07, + "loss": 0.0668, + "step": 8379 + }, + { + "epoch": 2.71548930654569, + "grad_norm": 0.46793654561042786, + "learning_rate": 2.3402315448374346e-07, + "loss": 0.0647, + "step": 8380 + }, + { + "epoch": 2.715813350615684, + "grad_norm": 0.5528791546821594, + "learning_rate": 2.3349456170241426e-07, + "loss": 0.0765, + "step": 8381 + }, + { + "epoch": 2.7161373946856773, + "grad_norm": 0.5298764705657959, + "learning_rate": 2.3296655229520905e-07, + "loss": 0.0738, + "step": 8382 + }, + { + "epoch": 2.7164614387556707, + "grad_norm": 0.48637211322784424, + "learning_rate": 2.3243912632674782e-07, + "loss": 0.0742, + "step": 8383 + }, + { + "epoch": 2.7167854828256646, + "grad_norm": 0.48481112718582153, + "learning_rate": 2.3191228386158448e-07, + "loss": 0.0698, + "step": 8384 + }, + { + "epoch": 2.7171095268956575, + "grad_norm": 0.43704310059547424, + "learning_rate": 2.3138602496419916e-07, + "loss": 0.0625, + "step": 8385 + }, + { + "epoch": 2.7174335709656514, + "grad_norm": 0.4682506024837494, + "learning_rate": 2.308603496990003e-07, + "loss": 0.0664, + "step": 8386 + }, + { + "epoch": 2.717757615035645, + "grad_norm": 0.5053842067718506, + "learning_rate": 2.3033525813032644e-07, + "loss": 0.0735, + "step": 8387 + }, + { + "epoch": 2.7180816591056383, + "grad_norm": 0.4989098906517029, + "learning_rate": 2.2981075032244282e-07, + "loss": 0.0717, + "step": 8388 + }, + { + "epoch": 2.718405703175632, + "grad_norm": 0.4849379062652588, + "learning_rate": 2.2928682633954368e-07, + "loss": 0.0732, + "step": 8389 + }, + { + "epoch": 2.7187297472456255, + "grad_norm": 0.5042644739151001, + "learning_rate": 2.2876348624575328e-07, + "loss": 0.0685, + "step": 8390 + }, + { + "epoch": 2.719053791315619, + "grad_norm": 0.5008777976036072, + "learning_rate": 2.2824073010512315e-07, + "loss": 0.0751, + "step": 8391 + }, + { + "epoch": 2.7193778353856124, + "grad_norm": 0.4873231053352356, + "learning_rate": 2.2771855798163322e-07, + "loss": 0.0727, + "step": 8392 + }, + { + "epoch": 2.719701879455606, + "grad_norm": 0.47231167554855347, + "learning_rate": 2.2719696993919237e-07, + "loss": 0.0703, + "step": 8393 + }, + { + "epoch": 2.7200259235255997, + "grad_norm": 0.48164358735084534, + "learning_rate": 2.2667596604163844e-07, + "loss": 0.0685, + "step": 8394 + }, + { + "epoch": 2.720349967595593, + "grad_norm": 0.4553871750831604, + "learning_rate": 2.2615554635273763e-07, + "loss": 0.0656, + "step": 8395 + }, + { + "epoch": 2.7206740116655865, + "grad_norm": 0.5053251385688782, + "learning_rate": 2.256357109361823e-07, + "loss": 0.077, + "step": 8396 + }, + { + "epoch": 2.72099805573558, + "grad_norm": 0.45735135674476624, + "learning_rate": 2.2511645985559715e-07, + "loss": 0.0658, + "step": 8397 + }, + { + "epoch": 2.7213220998055734, + "grad_norm": 0.4907129406929016, + "learning_rate": 2.2459779317453246e-07, + "loss": 0.0677, + "step": 8398 + }, + { + "epoch": 2.7216461438755672, + "grad_norm": 0.47853198647499084, + "learning_rate": 2.2407971095646853e-07, + "loss": 0.0702, + "step": 8399 + }, + { + "epoch": 2.7219701879455607, + "grad_norm": 0.5116216540336609, + "learning_rate": 2.2356221326481353e-07, + "loss": 0.0754, + "step": 8400 + }, + { + "epoch": 2.722294232015554, + "grad_norm": 0.4731382131576538, + "learning_rate": 2.2304530016290405e-07, + "loss": 0.0718, + "step": 8401 + }, + { + "epoch": 2.7226182760855475, + "grad_norm": 0.47295665740966797, + "learning_rate": 2.2252897171400613e-07, + "loss": 0.0693, + "step": 8402 + }, + { + "epoch": 2.722942320155541, + "grad_norm": 0.4782242476940155, + "learning_rate": 2.220132279813114e-07, + "loss": 0.0712, + "step": 8403 + }, + { + "epoch": 2.723266364225535, + "grad_norm": 0.4973825514316559, + "learning_rate": 2.214980690279439e-07, + "loss": 0.0692, + "step": 8404 + }, + { + "epoch": 2.7235904082955282, + "grad_norm": 0.4872826039791107, + "learning_rate": 2.2098349491695314e-07, + "loss": 0.0723, + "step": 8405 + }, + { + "epoch": 2.7239144523655217, + "grad_norm": 0.462805837392807, + "learning_rate": 2.2046950571131764e-07, + "loss": 0.065, + "step": 8406 + }, + { + "epoch": 2.724238496435515, + "grad_norm": 0.46699562668800354, + "learning_rate": 2.199561014739454e-07, + "loss": 0.0671, + "step": 8407 + }, + { + "epoch": 2.7245625405055085, + "grad_norm": 0.5196148157119751, + "learning_rate": 2.1944328226767232e-07, + "loss": 0.0757, + "step": 8408 + }, + { + "epoch": 2.7248865845755024, + "grad_norm": 0.5027263760566711, + "learning_rate": 2.18931048155262e-07, + "loss": 0.0717, + "step": 8409 + }, + { + "epoch": 2.725210628645496, + "grad_norm": 0.4779914915561676, + "learning_rate": 2.1841939919940602e-07, + "loss": 0.0695, + "step": 8410 + }, + { + "epoch": 2.725534672715489, + "grad_norm": 0.53648442029953, + "learning_rate": 2.1790833546272816e-07, + "loss": 0.0751, + "step": 8411 + }, + { + "epoch": 2.725858716785483, + "grad_norm": 0.4998103678226471, + "learning_rate": 2.1739785700777395e-07, + "loss": 0.079, + "step": 8412 + }, + { + "epoch": 2.7261827608554765, + "grad_norm": 0.49417588114738464, + "learning_rate": 2.1688796389702393e-07, + "loss": 0.0721, + "step": 8413 + }, + { + "epoch": 2.72650680492547, + "grad_norm": 0.49027085304260254, + "learning_rate": 2.1637865619288322e-07, + "loss": 0.0716, + "step": 8414 + }, + { + "epoch": 2.7268308489954634, + "grad_norm": 0.4957401156425476, + "learning_rate": 2.158699339576853e-07, + "loss": 0.0711, + "step": 8415 + }, + { + "epoch": 2.7271548930654568, + "grad_norm": 0.5493224859237671, + "learning_rate": 2.1536179725369367e-07, + "loss": 0.0747, + "step": 8416 + }, + { + "epoch": 2.7274789371354506, + "grad_norm": 0.46308425068855286, + "learning_rate": 2.1485424614309914e-07, + "loss": 0.0692, + "step": 8417 + }, + { + "epoch": 2.727802981205444, + "grad_norm": 0.48021116852760315, + "learning_rate": 2.1434728068802145e-07, + "loss": 0.0724, + "step": 8418 + }, + { + "epoch": 2.7281270252754375, + "grad_norm": 0.4874224364757538, + "learning_rate": 2.1384090095050768e-07, + "loss": 0.0699, + "step": 8419 + }, + { + "epoch": 2.728451069345431, + "grad_norm": 0.5004766583442688, + "learning_rate": 2.133351069925338e-07, + "loss": 0.0692, + "step": 8420 + }, + { + "epoch": 2.7287751134154243, + "grad_norm": 0.47774559259414673, + "learning_rate": 2.1282989887600468e-07, + "loss": 0.0681, + "step": 8421 + }, + { + "epoch": 2.729099157485418, + "grad_norm": 0.520897626876831, + "learning_rate": 2.1232527666275205e-07, + "loss": 0.0766, + "step": 8422 + }, + { + "epoch": 2.7294232015554116, + "grad_norm": 0.510453999042511, + "learning_rate": 2.1182124041453755e-07, + "loss": 0.0688, + "step": 8423 + }, + { + "epoch": 2.729747245625405, + "grad_norm": 0.49708041548728943, + "learning_rate": 2.1131779019304965e-07, + "loss": 0.0691, + "step": 8424 + }, + { + "epoch": 2.7300712896953985, + "grad_norm": 0.48128974437713623, + "learning_rate": 2.1081492605990682e-07, + "loss": 0.0728, + "step": 8425 + }, + { + "epoch": 2.730395333765392, + "grad_norm": 0.4515824317932129, + "learning_rate": 2.103126480766543e-07, + "loss": 0.0628, + "step": 8426 + }, + { + "epoch": 2.7307193778353858, + "grad_norm": 0.46619004011154175, + "learning_rate": 2.0981095630476457e-07, + "loss": 0.066, + "step": 8427 + }, + { + "epoch": 2.731043421905379, + "grad_norm": 0.5173163414001465, + "learning_rate": 2.0930985080564292e-07, + "loss": 0.0738, + "step": 8428 + }, + { + "epoch": 2.7313674659753726, + "grad_norm": 0.5011807084083557, + "learning_rate": 2.08809331640617e-07, + "loss": 0.075, + "step": 8429 + }, + { + "epoch": 2.731691510045366, + "grad_norm": 0.4686156213283539, + "learning_rate": 2.083093988709467e-07, + "loss": 0.0669, + "step": 8430 + }, + { + "epoch": 2.7320155541153595, + "grad_norm": 0.4761897623538971, + "learning_rate": 2.0781005255781972e-07, + "loss": 0.0714, + "step": 8431 + }, + { + "epoch": 2.7323395981853533, + "grad_norm": 0.4683878421783447, + "learning_rate": 2.0731129276234884e-07, + "loss": 0.0713, + "step": 8432 + }, + { + "epoch": 2.7326636422553467, + "grad_norm": 0.459830105304718, + "learning_rate": 2.0681311954558024e-07, + "loss": 0.0675, + "step": 8433 + }, + { + "epoch": 2.73298768632534, + "grad_norm": 0.4843185544013977, + "learning_rate": 2.0631553296848239e-07, + "loss": 0.0739, + "step": 8434 + }, + { + "epoch": 2.733311730395334, + "grad_norm": 0.4653462767601013, + "learning_rate": 2.0581853309195877e-07, + "loss": 0.0683, + "step": 8435 + }, + { + "epoch": 2.733635774465327, + "grad_norm": 0.4834546744823456, + "learning_rate": 2.0532211997683405e-07, + "loss": 0.0695, + "step": 8436 + }, + { + "epoch": 2.733959818535321, + "grad_norm": 0.45494353771209717, + "learning_rate": 2.0482629368386686e-07, + "loss": 0.0668, + "step": 8437 + }, + { + "epoch": 2.7342838626053143, + "grad_norm": 0.47521457076072693, + "learning_rate": 2.0433105427373978e-07, + "loss": 0.0707, + "step": 8438 + }, + { + "epoch": 2.7346079066753077, + "grad_norm": 0.48294797539711, + "learning_rate": 2.03836401807066e-07, + "loss": 0.0685, + "step": 8439 + }, + { + "epoch": 2.7349319507453016, + "grad_norm": 0.5113422870635986, + "learning_rate": 2.033423363443865e-07, + "loss": 0.076, + "step": 8440 + }, + { + "epoch": 2.735255994815295, + "grad_norm": 0.5801029801368713, + "learning_rate": 2.0284885794616905e-07, + "loss": 0.0818, + "step": 8441 + }, + { + "epoch": 2.7355800388852884, + "grad_norm": 0.5278540849685669, + "learning_rate": 2.0235596667281254e-07, + "loss": 0.0738, + "step": 8442 + }, + { + "epoch": 2.735904082955282, + "grad_norm": 0.5190832614898682, + "learning_rate": 2.0186366258463985e-07, + "loss": 0.0742, + "step": 8443 + }, + { + "epoch": 2.7362281270252753, + "grad_norm": 0.5184222459793091, + "learning_rate": 2.013719457419061e-07, + "loss": 0.0682, + "step": 8444 + }, + { + "epoch": 2.736552171095269, + "grad_norm": 0.47110626101493835, + "learning_rate": 2.0088081620479095e-07, + "loss": 0.0697, + "step": 8445 + }, + { + "epoch": 2.7368762151652626, + "grad_norm": 0.45322057604789734, + "learning_rate": 2.003902740334057e-07, + "loss": 0.0703, + "step": 8446 + }, + { + "epoch": 2.737200259235256, + "grad_norm": 0.474938303232193, + "learning_rate": 1.999003192877863e-07, + "loss": 0.0693, + "step": 8447 + }, + { + "epoch": 2.7375243033052494, + "grad_norm": 0.5155289173126221, + "learning_rate": 1.9941095202789973e-07, + "loss": 0.0731, + "step": 8448 + }, + { + "epoch": 2.737848347375243, + "grad_norm": 0.4893336892127991, + "learning_rate": 1.989221723136392e-07, + "loss": 0.0737, + "step": 8449 + }, + { + "epoch": 2.7381723914452367, + "grad_norm": 0.4888181984424591, + "learning_rate": 1.984339802048274e-07, + "loss": 0.0738, + "step": 8450 + }, + { + "epoch": 2.73849643551523, + "grad_norm": 0.49995023012161255, + "learning_rate": 1.9794637576121324e-07, + "loss": 0.0724, + "step": 8451 + }, + { + "epoch": 2.7388204795852236, + "grad_norm": 0.46177735924720764, + "learning_rate": 1.9745935904247505e-07, + "loss": 0.0679, + "step": 8452 + }, + { + "epoch": 2.739144523655217, + "grad_norm": 0.46565553545951843, + "learning_rate": 1.9697293010821906e-07, + "loss": 0.068, + "step": 8453 + }, + { + "epoch": 2.7394685677252104, + "grad_norm": 0.4851975440979004, + "learning_rate": 1.9648708901797932e-07, + "loss": 0.0706, + "step": 8454 + }, + { + "epoch": 2.7397926117952043, + "grad_norm": 0.44226428866386414, + "learning_rate": 1.9600183583121878e-07, + "loss": 0.0638, + "step": 8455 + }, + { + "epoch": 2.7401166558651977, + "grad_norm": 0.5337146520614624, + "learning_rate": 1.9551717060732667e-07, + "loss": 0.0771, + "step": 8456 + }, + { + "epoch": 2.740440699935191, + "grad_norm": 0.47819724678993225, + "learning_rate": 1.950330934056227e-07, + "loss": 0.0664, + "step": 8457 + }, + { + "epoch": 2.7407647440051845, + "grad_norm": 0.4902496635913849, + "learning_rate": 1.9454960428535118e-07, + "loss": 0.0747, + "step": 8458 + }, + { + "epoch": 2.741088788075178, + "grad_norm": 0.4811018407344818, + "learning_rate": 1.940667033056892e-07, + "loss": 0.0727, + "step": 8459 + }, + { + "epoch": 2.741412832145172, + "grad_norm": 0.5350942611694336, + "learning_rate": 1.9358439052573673e-07, + "loss": 0.0783, + "step": 8460 + }, + { + "epoch": 2.7417368762151653, + "grad_norm": 0.5199815630912781, + "learning_rate": 1.9310266600452542e-07, + "loss": 0.0755, + "step": 8461 + }, + { + "epoch": 2.7420609202851587, + "grad_norm": 0.45431849360466003, + "learning_rate": 1.9262152980101368e-07, + "loss": 0.0665, + "step": 8462 + }, + { + "epoch": 2.7423849643551526, + "grad_norm": 0.5301962494850159, + "learning_rate": 1.921409819740866e-07, + "loss": 0.0772, + "step": 8463 + }, + { + "epoch": 2.742709008425146, + "grad_norm": 0.5110758543014526, + "learning_rate": 1.9166102258256103e-07, + "loss": 0.0734, + "step": 8464 + }, + { + "epoch": 2.7430330524951394, + "grad_norm": 0.4723348319530487, + "learning_rate": 1.9118165168517665e-07, + "loss": 0.0665, + "step": 8465 + }, + { + "epoch": 2.743357096565133, + "grad_norm": 0.5000332593917847, + "learning_rate": 1.9070286934060654e-07, + "loss": 0.0763, + "step": 8466 + }, + { + "epoch": 2.7436811406351262, + "grad_norm": 0.5045048594474792, + "learning_rate": 1.9022467560744606e-07, + "loss": 0.0736, + "step": 8467 + }, + { + "epoch": 2.74400518470512, + "grad_norm": 0.463461309671402, + "learning_rate": 1.8974707054422447e-07, + "loss": 0.0657, + "step": 8468 + }, + { + "epoch": 2.7443292287751135, + "grad_norm": 0.4715588688850403, + "learning_rate": 1.8927005420939394e-07, + "loss": 0.0736, + "step": 8469 + }, + { + "epoch": 2.744653272845107, + "grad_norm": 0.524459719657898, + "learning_rate": 1.8879362666133716e-07, + "loss": 0.0756, + "step": 8470 + }, + { + "epoch": 2.7449773169151004, + "grad_norm": 0.5173646211624146, + "learning_rate": 1.8831778795836476e-07, + "loss": 0.0695, + "step": 8471 + }, + { + "epoch": 2.745301360985094, + "grad_norm": 0.45598724484443665, + "learning_rate": 1.87842538158714e-07, + "loss": 0.066, + "step": 8472 + }, + { + "epoch": 2.7456254050550877, + "grad_norm": 0.4994466304779053, + "learning_rate": 1.8736787732055228e-07, + "loss": 0.0754, + "step": 8473 + }, + { + "epoch": 2.745949449125081, + "grad_norm": 0.45296186208724976, + "learning_rate": 1.8689380550197146e-07, + "loss": 0.0634, + "step": 8474 + }, + { + "epoch": 2.7462734931950745, + "grad_norm": 0.4605759084224701, + "learning_rate": 1.8642032276099454e-07, + "loss": 0.0682, + "step": 8475 + }, + { + "epoch": 2.746597537265068, + "grad_norm": 0.5162789225578308, + "learning_rate": 1.859474291555713e-07, + "loss": 0.0677, + "step": 8476 + }, + { + "epoch": 2.7469215813350614, + "grad_norm": 0.5212785601615906, + "learning_rate": 1.8547512474357876e-07, + "loss": 0.0764, + "step": 8477 + }, + { + "epoch": 2.7472456254050552, + "grad_norm": 0.5051805973052979, + "learning_rate": 1.8500340958282292e-07, + "loss": 0.0763, + "step": 8478 + }, + { + "epoch": 2.7475696694750487, + "grad_norm": 0.4878946840763092, + "learning_rate": 1.8453228373103705e-07, + "loss": 0.0722, + "step": 8479 + }, + { + "epoch": 2.747893713545042, + "grad_norm": 0.46743884682655334, + "learning_rate": 1.840617472458822e-07, + "loss": 0.0694, + "step": 8480 + }, + { + "epoch": 2.7482177576150355, + "grad_norm": 0.4815881848335266, + "learning_rate": 1.8359180018494793e-07, + "loss": 0.0692, + "step": 8481 + }, + { + "epoch": 2.748541801685029, + "grad_norm": 0.5062234401702881, + "learning_rate": 1.831224426057504e-07, + "loss": 0.0741, + "step": 8482 + }, + { + "epoch": 2.748865845755023, + "grad_norm": 0.4984256625175476, + "learning_rate": 1.8265367456573534e-07, + "loss": 0.0702, + "step": 8483 + }, + { + "epoch": 2.749189889825016, + "grad_norm": 0.473633736371994, + "learning_rate": 1.8218549612227464e-07, + "loss": 0.0681, + "step": 8484 + }, + { + "epoch": 2.7495139338950096, + "grad_norm": 0.5147225260734558, + "learning_rate": 1.8171790733266914e-07, + "loss": 0.0726, + "step": 8485 + }, + { + "epoch": 2.7498379779650035, + "grad_norm": 0.4694821536540985, + "learning_rate": 1.812509082541475e-07, + "loss": 0.0719, + "step": 8486 + }, + { + "epoch": 2.7501620220349965, + "grad_norm": 0.4427323341369629, + "learning_rate": 1.8078449894386508e-07, + "loss": 0.0649, + "step": 8487 + }, + { + "epoch": 2.7504860661049904, + "grad_norm": 0.5387277603149414, + "learning_rate": 1.803186794589068e-07, + "loss": 0.079, + "step": 8488 + }, + { + "epoch": 2.750810110174984, + "grad_norm": 0.5092546343803406, + "learning_rate": 1.7985344985628316e-07, + "loss": 0.0701, + "step": 8489 + }, + { + "epoch": 2.751134154244977, + "grad_norm": 0.5002366900444031, + "learning_rate": 1.7938881019293642e-07, + "loss": 0.0743, + "step": 8490 + }, + { + "epoch": 2.751458198314971, + "grad_norm": 0.5159744620323181, + "learning_rate": 1.7892476052573104e-07, + "loss": 0.0761, + "step": 8491 + }, + { + "epoch": 2.7517822423849645, + "grad_norm": 0.4766468107700348, + "learning_rate": 1.784613009114633e-07, + "loss": 0.0678, + "step": 8492 + }, + { + "epoch": 2.752106286454958, + "grad_norm": 0.4794914126396179, + "learning_rate": 1.7799843140685613e-07, + "loss": 0.0685, + "step": 8493 + }, + { + "epoch": 2.7524303305249513, + "grad_norm": 0.44357404112815857, + "learning_rate": 1.7753615206856033e-07, + "loss": 0.0637, + "step": 8494 + }, + { + "epoch": 2.7527543745949448, + "grad_norm": 0.43333783745765686, + "learning_rate": 1.770744629531551e-07, + "loss": 0.0626, + "step": 8495 + }, + { + "epoch": 2.7530784186649386, + "grad_norm": 0.5045689940452576, + "learning_rate": 1.7661336411714526e-07, + "loss": 0.0752, + "step": 8496 + }, + { + "epoch": 2.753402462734932, + "grad_norm": 0.545109748840332, + "learning_rate": 1.761528556169667e-07, + "loss": 0.0779, + "step": 8497 + }, + { + "epoch": 2.7537265068049255, + "grad_norm": 0.47431567311286926, + "learning_rate": 1.7569293750897942e-07, + "loss": 0.0703, + "step": 8498 + }, + { + "epoch": 2.754050550874919, + "grad_norm": 0.4464452862739563, + "learning_rate": 1.7523360984947336e-07, + "loss": 0.0649, + "step": 8499 + }, + { + "epoch": 2.7543745949449123, + "grad_norm": 0.4776403307914734, + "learning_rate": 1.7477487269466632e-07, + "loss": 0.0708, + "step": 8500 + }, + { + "epoch": 2.754698639014906, + "grad_norm": 0.43605953454971313, + "learning_rate": 1.7431672610070337e-07, + "loss": 0.0651, + "step": 8501 + }, + { + "epoch": 2.7550226830848996, + "grad_norm": 0.49093151092529297, + "learning_rate": 1.7385917012365694e-07, + "loss": 0.0658, + "step": 8502 + }, + { + "epoch": 2.755346727154893, + "grad_norm": 0.49357640743255615, + "learning_rate": 1.734022048195272e-07, + "loss": 0.0733, + "step": 8503 + }, + { + "epoch": 2.7556707712248865, + "grad_norm": 0.4901171326637268, + "learning_rate": 1.7294583024424273e-07, + "loss": 0.0659, + "step": 8504 + }, + { + "epoch": 2.75599481529488, + "grad_norm": 0.5076462030410767, + "learning_rate": 1.7249004645365884e-07, + "loss": 0.0754, + "step": 8505 + }, + { + "epoch": 2.7563188593648738, + "grad_norm": 0.47774630784988403, + "learning_rate": 1.7203485350355986e-07, + "loss": 0.0654, + "step": 8506 + }, + { + "epoch": 2.756642903434867, + "grad_norm": 0.4688727855682373, + "learning_rate": 1.715802514496556e-07, + "loss": 0.0663, + "step": 8507 + }, + { + "epoch": 2.7569669475048606, + "grad_norm": 0.45370423793792725, + "learning_rate": 1.7112624034758663e-07, + "loss": 0.0637, + "step": 8508 + }, + { + "epoch": 2.757290991574854, + "grad_norm": 0.4860289990901947, + "learning_rate": 1.7067282025291842e-07, + "loss": 0.0693, + "step": 8509 + }, + { + "epoch": 2.7576150356448474, + "grad_norm": 0.4762254059314728, + "learning_rate": 1.7021999122114552e-07, + "loss": 0.0726, + "step": 8510 + }, + { + "epoch": 2.7579390797148413, + "grad_norm": 0.4850431978702545, + "learning_rate": 1.6976775330768913e-07, + "loss": 0.0708, + "step": 8511 + }, + { + "epoch": 2.7582631237848347, + "grad_norm": 0.4895291030406952, + "learning_rate": 1.693161065679011e-07, + "loss": 0.0703, + "step": 8512 + }, + { + "epoch": 2.758587167854828, + "grad_norm": 0.4404808580875397, + "learning_rate": 1.6886505105705553e-07, + "loss": 0.0638, + "step": 8513 + }, + { + "epoch": 2.758911211924822, + "grad_norm": 0.48556792736053467, + "learning_rate": 1.684145868303594e-07, + "loss": 0.0704, + "step": 8514 + }, + { + "epoch": 2.7592352559948155, + "grad_norm": 0.5026125907897949, + "learning_rate": 1.679647139429441e-07, + "loss": 0.0699, + "step": 8515 + }, + { + "epoch": 2.759559300064809, + "grad_norm": 0.47186747193336487, + "learning_rate": 1.675154324498701e-07, + "loss": 0.0666, + "step": 8516 + }, + { + "epoch": 2.7598833441348023, + "grad_norm": 0.4909684658050537, + "learning_rate": 1.6706674240612563e-07, + "loss": 0.0727, + "step": 8517 + }, + { + "epoch": 2.7602073882047957, + "grad_norm": 0.5015256404876709, + "learning_rate": 1.6661864386662452e-07, + "loss": 0.0729, + "step": 8518 + }, + { + "epoch": 2.7605314322747896, + "grad_norm": 0.5276386141777039, + "learning_rate": 1.6617113688621177e-07, + "loss": 0.0787, + "step": 8519 + }, + { + "epoch": 2.760855476344783, + "grad_norm": 0.47311532497406006, + "learning_rate": 1.6572422151965529e-07, + "loss": 0.0648, + "step": 8520 + }, + { + "epoch": 2.7611795204147764, + "grad_norm": 0.48931893706321716, + "learning_rate": 1.6527789782165627e-07, + "loss": 0.0714, + "step": 8521 + }, + { + "epoch": 2.76150356448477, + "grad_norm": 0.4892522990703583, + "learning_rate": 1.6483216584683825e-07, + "loss": 0.071, + "step": 8522 + }, + { + "epoch": 2.7618276085547633, + "grad_norm": 0.478360116481781, + "learning_rate": 1.6438702564975483e-07, + "loss": 0.0682, + "step": 8523 + }, + { + "epoch": 2.762151652624757, + "grad_norm": 0.48810848593711853, + "learning_rate": 1.639424772848869e-07, + "loss": 0.0719, + "step": 8524 + }, + { + "epoch": 2.7624756966947506, + "grad_norm": 0.4530242681503296, + "learning_rate": 1.6349852080664364e-07, + "loss": 0.0644, + "step": 8525 + }, + { + "epoch": 2.762799740764744, + "grad_norm": 0.49164706468582153, + "learning_rate": 1.6305515626936054e-07, + "loss": 0.0703, + "step": 8526 + }, + { + "epoch": 2.7631237848347374, + "grad_norm": 0.47457531094551086, + "learning_rate": 1.6261238372730025e-07, + "loss": 0.0636, + "step": 8527 + }, + { + "epoch": 2.763447828904731, + "grad_norm": 0.4845804274082184, + "learning_rate": 1.621702032346556e-07, + "loss": 0.0739, + "step": 8528 + }, + { + "epoch": 2.7637718729747247, + "grad_norm": 0.4832020699977875, + "learning_rate": 1.6172861484554382e-07, + "loss": 0.0723, + "step": 8529 + }, + { + "epoch": 2.764095917044718, + "grad_norm": 0.5220732688903809, + "learning_rate": 1.612876186140111e-07, + "loss": 0.071, + "step": 8530 + }, + { + "epoch": 2.7644199611147116, + "grad_norm": 0.46765822172164917, + "learning_rate": 1.608472145940321e-07, + "loss": 0.0671, + "step": 8531 + }, + { + "epoch": 2.764744005184705, + "grad_norm": 0.4872879981994629, + "learning_rate": 1.6040740283950694e-07, + "loss": 0.0755, + "step": 8532 + }, + { + "epoch": 2.7650680492546984, + "grad_norm": 0.4944052994251251, + "learning_rate": 1.599681834042649e-07, + "loss": 0.0732, + "step": 8533 + }, + { + "epoch": 2.7653920933246923, + "grad_norm": 0.49354100227355957, + "learning_rate": 1.5952955634206235e-07, + "loss": 0.0701, + "step": 8534 + }, + { + "epoch": 2.7657161373946857, + "grad_norm": 0.4873565137386322, + "learning_rate": 1.5909152170658304e-07, + "loss": 0.0712, + "step": 8535 + }, + { + "epoch": 2.766040181464679, + "grad_norm": 0.4880267381668091, + "learning_rate": 1.586540795514374e-07, + "loss": 0.0672, + "step": 8536 + }, + { + "epoch": 2.766364225534673, + "grad_norm": 0.4742276072502136, + "learning_rate": 1.582172299301643e-07, + "loss": 0.0697, + "step": 8537 + }, + { + "epoch": 2.766688269604666, + "grad_norm": 0.4935052692890167, + "learning_rate": 1.577809728962304e-07, + "loss": 0.0743, + "step": 8538 + }, + { + "epoch": 2.76701231367466, + "grad_norm": 0.49048107862472534, + "learning_rate": 1.573453085030291e-07, + "loss": 0.0693, + "step": 8539 + }, + { + "epoch": 2.7673363577446533, + "grad_norm": 0.5113756060600281, + "learning_rate": 1.5691023680388162e-07, + "loss": 0.0769, + "step": 8540 + }, + { + "epoch": 2.7676604018146467, + "grad_norm": 0.49390682578086853, + "learning_rate": 1.564757578520365e-07, + "loss": 0.0727, + "step": 8541 + }, + { + "epoch": 2.7679844458846405, + "grad_norm": 0.5174000263214111, + "learning_rate": 1.5604187170066899e-07, + "loss": 0.0784, + "step": 8542 + }, + { + "epoch": 2.768308489954634, + "grad_norm": 0.5169310569763184, + "learning_rate": 1.5560857840288434e-07, + "loss": 0.0757, + "step": 8543 + }, + { + "epoch": 2.7686325340246274, + "grad_norm": 0.48713600635528564, + "learning_rate": 1.551758780117113e-07, + "loss": 0.0686, + "step": 8544 + }, + { + "epoch": 2.768956578094621, + "grad_norm": 0.45703360438346863, + "learning_rate": 1.547437705801097e-07, + "loss": 0.0671, + "step": 8545 + }, + { + "epoch": 2.7692806221646142, + "grad_norm": 0.5233086347579956, + "learning_rate": 1.5431225616096502e-07, + "loss": 0.0724, + "step": 8546 + }, + { + "epoch": 2.769604666234608, + "grad_norm": 0.5077247619628906, + "learning_rate": 1.5388133480709e-07, + "loss": 0.0733, + "step": 8547 + }, + { + "epoch": 2.7699287103046015, + "grad_norm": 0.4755188822746277, + "learning_rate": 1.5345100657122635e-07, + "loss": 0.07, + "step": 8548 + }, + { + "epoch": 2.770252754374595, + "grad_norm": 0.4951607584953308, + "learning_rate": 1.5302127150603973e-07, + "loss": 0.0671, + "step": 8549 + }, + { + "epoch": 2.7705767984445884, + "grad_norm": 0.4756135642528534, + "learning_rate": 1.525921296641286e-07, + "loss": 0.0725, + "step": 8550 + }, + { + "epoch": 2.770900842514582, + "grad_norm": 0.4744229316711426, + "learning_rate": 1.5216358109801267e-07, + "loss": 0.071, + "step": 8551 + }, + { + "epoch": 2.7712248865845757, + "grad_norm": 0.5493775010108948, + "learning_rate": 1.5173562586014546e-07, + "loss": 0.082, + "step": 8552 + }, + { + "epoch": 2.771548930654569, + "grad_norm": 0.510632336139679, + "learning_rate": 1.5130826400290177e-07, + "loss": 0.0707, + "step": 8553 + }, + { + "epoch": 2.7718729747245625, + "grad_norm": 0.47911614179611206, + "learning_rate": 1.5088149557858757e-07, + "loss": 0.0736, + "step": 8554 + }, + { + "epoch": 2.772197018794556, + "grad_norm": 0.5228129625320435, + "learning_rate": 1.5045532063943547e-07, + "loss": 0.0756, + "step": 8555 + }, + { + "epoch": 2.7725210628645494, + "grad_norm": 0.5336810350418091, + "learning_rate": 1.500297392376049e-07, + "loss": 0.073, + "step": 8556 + }, + { + "epoch": 2.7728451069345432, + "grad_norm": 0.4823746383190155, + "learning_rate": 1.4960475142518306e-07, + "loss": 0.069, + "step": 8557 + }, + { + "epoch": 2.7731691510045366, + "grad_norm": 0.45536088943481445, + "learning_rate": 1.4918035725418388e-07, + "loss": 0.0651, + "step": 8558 + }, + { + "epoch": 2.77349319507453, + "grad_norm": 0.5071476101875305, + "learning_rate": 1.4875655677654976e-07, + "loss": 0.0716, + "step": 8559 + }, + { + "epoch": 2.7738172391445235, + "grad_norm": 0.48243248462677, + "learning_rate": 1.4833335004414917e-07, + "loss": 0.0684, + "step": 8560 + }, + { + "epoch": 2.774141283214517, + "grad_norm": 0.4552246332168579, + "learning_rate": 1.4791073710877846e-07, + "loss": 0.0645, + "step": 8561 + }, + { + "epoch": 2.774465327284511, + "grad_norm": 0.47968533635139465, + "learning_rate": 1.474887180221618e-07, + "loss": 0.0681, + "step": 8562 + }, + { + "epoch": 2.774789371354504, + "grad_norm": 0.47891002893447876, + "learning_rate": 1.4706729283595066e-07, + "loss": 0.0713, + "step": 8563 + }, + { + "epoch": 2.7751134154244976, + "grad_norm": 0.5036091804504395, + "learning_rate": 1.4664646160172213e-07, + "loss": 0.0707, + "step": 8564 + }, + { + "epoch": 2.7754374594944915, + "grad_norm": 0.4715958833694458, + "learning_rate": 1.4622622437098333e-07, + "loss": 0.0694, + "step": 8565 + }, + { + "epoch": 2.775761503564485, + "grad_norm": 0.5018330812454224, + "learning_rate": 1.4580658119516589e-07, + "loss": 0.0744, + "step": 8566 + }, + { + "epoch": 2.7760855476344783, + "grad_norm": 0.5229835510253906, + "learning_rate": 1.4538753212563095e-07, + "loss": 0.0786, + "step": 8567 + }, + { + "epoch": 2.7764095917044718, + "grad_norm": 0.4672481119632721, + "learning_rate": 1.4496907721366583e-07, + "loss": 0.0664, + "step": 8568 + }, + { + "epoch": 2.776733635774465, + "grad_norm": 0.4925411641597748, + "learning_rate": 1.445512165104851e-07, + "loss": 0.0753, + "step": 8569 + }, + { + "epoch": 2.777057679844459, + "grad_norm": 0.5130710601806641, + "learning_rate": 1.441339500672312e-07, + "loss": 0.0768, + "step": 8570 + }, + { + "epoch": 2.7773817239144525, + "grad_norm": 0.4710983633995056, + "learning_rate": 1.4371727793497325e-07, + "loss": 0.0682, + "step": 8571 + }, + { + "epoch": 2.777705767984446, + "grad_norm": 0.5014706254005432, + "learning_rate": 1.433012001647083e-07, + "loss": 0.075, + "step": 8572 + }, + { + "epoch": 2.7780298120544393, + "grad_norm": 0.49718159437179565, + "learning_rate": 1.428857168073594e-07, + "loss": 0.0695, + "step": 8573 + }, + { + "epoch": 2.7783538561244328, + "grad_norm": 0.4719870984554291, + "learning_rate": 1.4247082791377932e-07, + "loss": 0.0661, + "step": 8574 + }, + { + "epoch": 2.7786779001944266, + "grad_norm": 0.47441691160202026, + "learning_rate": 1.4205653353474404e-07, + "loss": 0.0652, + "step": 8575 + }, + { + "epoch": 2.77900194426442, + "grad_norm": 0.4929604232311249, + "learning_rate": 1.4164283372096133e-07, + "loss": 0.0721, + "step": 8576 + }, + { + "epoch": 2.7793259883344135, + "grad_norm": 0.5177626013755798, + "learning_rate": 1.4122972852306293e-07, + "loss": 0.0765, + "step": 8577 + }, + { + "epoch": 2.779650032404407, + "grad_norm": 0.5820122361183167, + "learning_rate": 1.408172179916095e-07, + "loss": 0.0749, + "step": 8578 + }, + { + "epoch": 2.7799740764744003, + "grad_norm": 0.46356290578842163, + "learning_rate": 1.4040530217708847e-07, + "loss": 0.0668, + "step": 8579 + }, + { + "epoch": 2.780298120544394, + "grad_norm": 0.5057440400123596, + "learning_rate": 1.399939811299128e-07, + "loss": 0.0726, + "step": 8580 + }, + { + "epoch": 2.7806221646143876, + "grad_norm": 0.4511764645576477, + "learning_rate": 1.3958325490042613e-07, + "loss": 0.0619, + "step": 8581 + }, + { + "epoch": 2.780946208684381, + "grad_norm": 0.49743250012397766, + "learning_rate": 1.3917312353889601e-07, + "loss": 0.0713, + "step": 8582 + }, + { + "epoch": 2.7812702527543745, + "grad_norm": 0.48972567915916443, + "learning_rate": 1.3876358709552006e-07, + "loss": 0.0689, + "step": 8583 + }, + { + "epoch": 2.781594296824368, + "grad_norm": 0.4920805096626282, + "learning_rate": 1.383546456204199e-07, + "loss": 0.071, + "step": 8584 + }, + { + "epoch": 2.7819183408943617, + "grad_norm": 0.43759530782699585, + "learning_rate": 1.3794629916364654e-07, + "loss": 0.0649, + "step": 8585 + }, + { + "epoch": 2.782242384964355, + "grad_norm": 0.48497021198272705, + "learning_rate": 1.375385477751784e-07, + "loss": 0.07, + "step": 8586 + }, + { + "epoch": 2.7825664290343486, + "grad_norm": 0.4718039333820343, + "learning_rate": 1.3713139150491938e-07, + "loss": 0.069, + "step": 8587 + }, + { + "epoch": 2.7828904731043425, + "grad_norm": 0.4685593545436859, + "learning_rate": 1.3672483040270246e-07, + "loss": 0.0701, + "step": 8588 + }, + { + "epoch": 2.7832145171743354, + "grad_norm": 0.4797540307044983, + "learning_rate": 1.3631886451828556e-07, + "loss": 0.0709, + "step": 8589 + }, + { + "epoch": 2.7835385612443293, + "grad_norm": 0.5307188034057617, + "learning_rate": 1.3591349390135565e-07, + "loss": 0.0765, + "step": 8590 + }, + { + "epoch": 2.7838626053143227, + "grad_norm": 0.45155444741249084, + "learning_rate": 1.355087186015258e-07, + "loss": 0.0632, + "step": 8591 + }, + { + "epoch": 2.784186649384316, + "grad_norm": 0.5059266686439514, + "learning_rate": 1.351045386683375e-07, + "loss": 0.0706, + "step": 8592 + }, + { + "epoch": 2.78451069345431, + "grad_norm": 0.5184312462806702, + "learning_rate": 1.3470095415125727e-07, + "loss": 0.0681, + "step": 8593 + }, + { + "epoch": 2.7848347375243034, + "grad_norm": 0.5246706604957581, + "learning_rate": 1.3429796509968062e-07, + "loss": 0.0808, + "step": 8594 + }, + { + "epoch": 2.785158781594297, + "grad_norm": 0.4852238595485687, + "learning_rate": 1.338955715629292e-07, + "loss": 0.066, + "step": 8595 + }, + { + "epoch": 2.7854828256642903, + "grad_norm": 0.5338213443756104, + "learning_rate": 1.3349377359025307e-07, + "loss": 0.0776, + "step": 8596 + }, + { + "epoch": 2.7858068697342837, + "grad_norm": 0.5105713605880737, + "learning_rate": 1.3309257123082674e-07, + "loss": 0.0754, + "step": 8597 + }, + { + "epoch": 2.7861309138042776, + "grad_norm": 0.5126920342445374, + "learning_rate": 1.3269196453375544e-07, + "loss": 0.0745, + "step": 8598 + }, + { + "epoch": 2.786454957874271, + "grad_norm": 0.5031168460845947, + "learning_rate": 1.3229195354806768e-07, + "loss": 0.0731, + "step": 8599 + }, + { + "epoch": 2.7867790019442644, + "grad_norm": 0.47806215286254883, + "learning_rate": 1.3189253832272153e-07, + "loss": 0.0687, + "step": 8600 + }, + { + "epoch": 2.787103046014258, + "grad_norm": 0.5075884461402893, + "learning_rate": 1.314937189066029e-07, + "loss": 0.0731, + "step": 8601 + }, + { + "epoch": 2.7874270900842513, + "grad_norm": 0.48211807012557983, + "learning_rate": 1.310954953485205e-07, + "loss": 0.0712, + "step": 8602 + }, + { + "epoch": 2.787751134154245, + "grad_norm": 0.48329949378967285, + "learning_rate": 1.3069786769721647e-07, + "loss": 0.0757, + "step": 8603 + }, + { + "epoch": 2.7880751782242386, + "grad_norm": 0.48496365547180176, + "learning_rate": 1.3030083600135357e-07, + "loss": 0.0723, + "step": 8604 + }, + { + "epoch": 2.788399222294232, + "grad_norm": 0.4774136543273926, + "learning_rate": 1.2990440030952732e-07, + "loss": 0.0686, + "step": 8605 + }, + { + "epoch": 2.7887232663642254, + "grad_norm": 0.5031121373176575, + "learning_rate": 1.2950856067025507e-07, + "loss": 0.0704, + "step": 8606 + }, + { + "epoch": 2.789047310434219, + "grad_norm": 0.4636278450489044, + "learning_rate": 1.2911331713198582e-07, + "loss": 0.0641, + "step": 8607 + }, + { + "epoch": 2.7893713545042127, + "grad_norm": 0.5307515263557434, + "learning_rate": 1.2871866974309255e-07, + "loss": 0.0775, + "step": 8608 + }, + { + "epoch": 2.789695398574206, + "grad_norm": 0.5113701224327087, + "learning_rate": 1.2832461855187605e-07, + "loss": 0.0738, + "step": 8609 + }, + { + "epoch": 2.7900194426441995, + "grad_norm": 0.4879836440086365, + "learning_rate": 1.27931163606565e-07, + "loss": 0.0701, + "step": 8610 + }, + { + "epoch": 2.790343486714193, + "grad_norm": 0.5034375786781311, + "learning_rate": 1.275383049553147e-07, + "loss": 0.0712, + "step": 8611 + }, + { + "epoch": 2.7906675307841864, + "grad_norm": 0.4796959459781647, + "learning_rate": 1.2714604264620677e-07, + "loss": 0.0658, + "step": 8612 + }, + { + "epoch": 2.7909915748541803, + "grad_norm": 0.47571125626564026, + "learning_rate": 1.2675437672724945e-07, + "loss": 0.0678, + "step": 8613 + }, + { + "epoch": 2.7913156189241737, + "grad_norm": 0.4868853688240051, + "learning_rate": 1.2636330724638045e-07, + "loss": 0.0706, + "step": 8614 + }, + { + "epoch": 2.791639662994167, + "grad_norm": 0.47460609674453735, + "learning_rate": 1.259728342514621e-07, + "loss": 0.0689, + "step": 8615 + }, + { + "epoch": 2.791963707064161, + "grad_norm": 0.4783695340156555, + "learning_rate": 1.2558295779028452e-07, + "loss": 0.0646, + "step": 8616 + }, + { + "epoch": 2.7922877511341544, + "grad_norm": 0.4867151379585266, + "learning_rate": 1.2519367791056502e-07, + "loss": 0.0715, + "step": 8617 + }, + { + "epoch": 2.792611795204148, + "grad_norm": 0.5441122055053711, + "learning_rate": 1.248049946599472e-07, + "loss": 0.0746, + "step": 8618 + }, + { + "epoch": 2.7929358392741412, + "grad_norm": 0.533957839012146, + "learning_rate": 1.244169080860025e-07, + "loss": 0.0832, + "step": 8619 + }, + { + "epoch": 2.7932598833441347, + "grad_norm": 0.4913591742515564, + "learning_rate": 1.2402941823622948e-07, + "loss": 0.0733, + "step": 8620 + }, + { + "epoch": 2.7935839274141285, + "grad_norm": 0.49308526515960693, + "learning_rate": 1.2364252515805252e-07, + "loss": 0.0728, + "step": 8621 + }, + { + "epoch": 2.793907971484122, + "grad_norm": 0.5083343386650085, + "learning_rate": 1.2325622889882317e-07, + "loss": 0.0724, + "step": 8622 + }, + { + "epoch": 2.7942320155541154, + "grad_norm": 0.436282753944397, + "learning_rate": 1.2287052950582134e-07, + "loss": 0.0621, + "step": 8623 + }, + { + "epoch": 2.794556059624109, + "grad_norm": 0.48184362053871155, + "learning_rate": 1.224854270262521e-07, + "loss": 0.071, + "step": 8624 + }, + { + "epoch": 2.7948801036941022, + "grad_norm": 0.46548590064048767, + "learning_rate": 1.2210092150724882e-07, + "loss": 0.0691, + "step": 8625 + }, + { + "epoch": 2.795204147764096, + "grad_norm": 0.5401601791381836, + "learning_rate": 1.2171701299587058e-07, + "loss": 0.0717, + "step": 8626 + }, + { + "epoch": 2.7955281918340895, + "grad_norm": 0.48991283774375916, + "learning_rate": 1.2133370153910528e-07, + "loss": 0.0728, + "step": 8627 + }, + { + "epoch": 2.795852235904083, + "grad_norm": 0.500863254070282, + "learning_rate": 1.209509871838649e-07, + "loss": 0.0693, + "step": 8628 + }, + { + "epoch": 2.7961762799740764, + "grad_norm": 0.48141273856163025, + "learning_rate": 1.2056886997699192e-07, + "loss": 0.0686, + "step": 8629 + }, + { + "epoch": 2.79650032404407, + "grad_norm": 0.4619733989238739, + "learning_rate": 1.2018734996525173e-07, + "loss": 0.0664, + "step": 8630 + }, + { + "epoch": 2.7968243681140637, + "grad_norm": 0.4864651560783386, + "learning_rate": 1.198064271953403e-07, + "loss": 0.0694, + "step": 8631 + }, + { + "epoch": 2.797148412184057, + "grad_norm": 0.5243645906448364, + "learning_rate": 1.194261017138776e-07, + "loss": 0.0721, + "step": 8632 + }, + { + "epoch": 2.7974724562540505, + "grad_norm": 0.4923947751522064, + "learning_rate": 1.19046373567413e-07, + "loss": 0.0679, + "step": 8633 + }, + { + "epoch": 2.797796500324044, + "grad_norm": 0.5136322975158691, + "learning_rate": 1.1866724280242104e-07, + "loss": 0.0743, + "step": 8634 + }, + { + "epoch": 2.7981205443940373, + "grad_norm": 0.47235724329948425, + "learning_rate": 1.1828870946530291e-07, + "loss": 0.0711, + "step": 8635 + }, + { + "epoch": 2.798444588464031, + "grad_norm": 0.5212631225585938, + "learning_rate": 1.1791077360238879e-07, + "loss": 0.0774, + "step": 8636 + }, + { + "epoch": 2.7987686325340246, + "grad_norm": 0.4773974120616913, + "learning_rate": 1.1753343525993277e-07, + "loss": 0.0671, + "step": 8637 + }, + { + "epoch": 2.799092676604018, + "grad_norm": 0.4766692519187927, + "learning_rate": 1.1715669448411959e-07, + "loss": 0.0715, + "step": 8638 + }, + { + "epoch": 2.799416720674012, + "grad_norm": 0.4838842749595642, + "learning_rate": 1.1678055132105682e-07, + "loss": 0.0705, + "step": 8639 + }, + { + "epoch": 2.7997407647440054, + "grad_norm": 0.47889941930770874, + "learning_rate": 1.1640500581678093e-07, + "loss": 0.0687, + "step": 8640 + }, + { + "epoch": 2.8000648088139988, + "grad_norm": 0.49360138177871704, + "learning_rate": 1.1603005801725575e-07, + "loss": 0.0713, + "step": 8641 + }, + { + "epoch": 2.800388852883992, + "grad_norm": 0.4900329113006592, + "learning_rate": 1.1565570796837122e-07, + "loss": 0.0688, + "step": 8642 + }, + { + "epoch": 2.8007128969539856, + "grad_norm": 0.5061361789703369, + "learning_rate": 1.1528195571594403e-07, + "loss": 0.0794, + "step": 8643 + }, + { + "epoch": 2.8010369410239795, + "grad_norm": 0.48438015580177307, + "learning_rate": 1.1490880130571758e-07, + "loss": 0.0728, + "step": 8644 + }, + { + "epoch": 2.801360985093973, + "grad_norm": 0.49202558398246765, + "learning_rate": 1.1453624478336256e-07, + "loss": 0.0782, + "step": 8645 + }, + { + "epoch": 2.8016850291639663, + "grad_norm": 0.4822075664997101, + "learning_rate": 1.1416428619447583e-07, + "loss": 0.0713, + "step": 8646 + }, + { + "epoch": 2.8020090732339598, + "grad_norm": 0.4483577311038971, + "learning_rate": 1.137929255845821e-07, + "loss": 0.0669, + "step": 8647 + }, + { + "epoch": 2.802333117303953, + "grad_norm": 0.42313840985298157, + "learning_rate": 1.1342216299913222e-07, + "loss": 0.0586, + "step": 8648 + }, + { + "epoch": 2.802657161373947, + "grad_norm": 0.4894741177558899, + "learning_rate": 1.1305199848350379e-07, + "loss": 0.0657, + "step": 8649 + }, + { + "epoch": 2.8029812054439405, + "grad_norm": 0.5330514311790466, + "learning_rate": 1.1268243208300111e-07, + "loss": 0.0761, + "step": 8650 + }, + { + "epoch": 2.803305249513934, + "grad_norm": 0.4709966778755188, + "learning_rate": 1.1231346384285691e-07, + "loss": 0.068, + "step": 8651 + }, + { + "epoch": 2.8036292935839273, + "grad_norm": 0.4735042452812195, + "learning_rate": 1.1194509380822727e-07, + "loss": 0.0711, + "step": 8652 + }, + { + "epoch": 2.8039533376539207, + "grad_norm": 0.4827316105365753, + "learning_rate": 1.1157732202419835e-07, + "loss": 0.0709, + "step": 8653 + }, + { + "epoch": 2.8042773817239146, + "grad_norm": 0.46700969338417053, + "learning_rate": 1.1121014853578138e-07, + "loss": 0.0688, + "step": 8654 + }, + { + "epoch": 2.804601425793908, + "grad_norm": 0.4579877257347107, + "learning_rate": 1.1084357338791541e-07, + "loss": 0.0665, + "step": 8655 + }, + { + "epoch": 2.8049254698639015, + "grad_norm": 0.5440908074378967, + "learning_rate": 1.1047759662546564e-07, + "loss": 0.0749, + "step": 8656 + }, + { + "epoch": 2.805249513933895, + "grad_norm": 0.5029870867729187, + "learning_rate": 1.1011221829322294e-07, + "loss": 0.0701, + "step": 8657 + }, + { + "epoch": 2.8055735580038883, + "grad_norm": 0.5622695088386536, + "learning_rate": 1.0974743843590762e-07, + "loss": 0.0785, + "step": 8658 + }, + { + "epoch": 2.805897602073882, + "grad_norm": 0.4912731647491455, + "learning_rate": 1.09383257098164e-07, + "loss": 0.072, + "step": 8659 + }, + { + "epoch": 2.8062216461438756, + "grad_norm": 0.4591299891471863, + "learning_rate": 1.0901967432456583e-07, + "loss": 0.0684, + "step": 8660 + }, + { + "epoch": 2.806545690213869, + "grad_norm": 0.4708253741264343, + "learning_rate": 1.0865669015961033e-07, + "loss": 0.0672, + "step": 8661 + }, + { + "epoch": 2.806869734283863, + "grad_norm": 0.5100660920143127, + "learning_rate": 1.0829430464772417e-07, + "loss": 0.0758, + "step": 8662 + }, + { + "epoch": 2.807193778353856, + "grad_norm": 0.45392340421676636, + "learning_rate": 1.0793251783325965e-07, + "loss": 0.066, + "step": 8663 + }, + { + "epoch": 2.8075178224238497, + "grad_norm": 0.5311992168426514, + "learning_rate": 1.0757132976049634e-07, + "loss": 0.0756, + "step": 8664 + }, + { + "epoch": 2.807841866493843, + "grad_norm": 0.4701693058013916, + "learning_rate": 1.0721074047364055e-07, + "loss": 0.0646, + "step": 8665 + }, + { + "epoch": 2.8081659105638366, + "grad_norm": 0.534783124923706, + "learning_rate": 1.0685075001682255e-07, + "loss": 0.0801, + "step": 8666 + }, + { + "epoch": 2.8084899546338304, + "grad_norm": 0.45467719435691833, + "learning_rate": 1.0649135843410485e-07, + "loss": 0.0658, + "step": 8667 + }, + { + "epoch": 2.808813998703824, + "grad_norm": 0.4908236861228943, + "learning_rate": 1.0613256576947173e-07, + "loss": 0.0693, + "step": 8668 + }, + { + "epoch": 2.8091380427738173, + "grad_norm": 0.5250102281570435, + "learning_rate": 1.0577437206683583e-07, + "loss": 0.0786, + "step": 8669 + }, + { + "epoch": 2.8094620868438107, + "grad_norm": 0.46399766206741333, + "learning_rate": 1.0541677737003709e-07, + "loss": 0.0696, + "step": 8670 + }, + { + "epoch": 2.809786130913804, + "grad_norm": 0.4993619918823242, + "learning_rate": 1.0505978172284214e-07, + "loss": 0.0687, + "step": 8671 + }, + { + "epoch": 2.810110174983798, + "grad_norm": 0.5269954800605774, + "learning_rate": 1.0470338516894273e-07, + "loss": 0.0733, + "step": 8672 + }, + { + "epoch": 2.8104342190537914, + "grad_norm": 0.5174999237060547, + "learning_rate": 1.0434758775195841e-07, + "loss": 0.0782, + "step": 8673 + }, + { + "epoch": 2.810758263123785, + "grad_norm": 0.527529776096344, + "learning_rate": 1.0399238951543712e-07, + "loss": 0.0787, + "step": 8674 + }, + { + "epoch": 2.8110823071937783, + "grad_norm": 0.4615003168582916, + "learning_rate": 1.0363779050284906e-07, + "loss": 0.0614, + "step": 8675 + }, + { + "epoch": 2.8114063512637717, + "grad_norm": 0.5183756947517395, + "learning_rate": 1.0328379075759564e-07, + "loss": 0.0749, + "step": 8676 + }, + { + "epoch": 2.8117303953337656, + "grad_norm": 0.4968220293521881, + "learning_rate": 1.0293039032300168e-07, + "loss": 0.0711, + "step": 8677 + }, + { + "epoch": 2.812054439403759, + "grad_norm": 0.4645611643791199, + "learning_rate": 1.0257758924232142e-07, + "loss": 0.0693, + "step": 8678 + }, + { + "epoch": 2.8123784834737524, + "grad_norm": 0.5472391843795776, + "learning_rate": 1.0222538755873313e-07, + "loss": 0.0807, + "step": 8679 + }, + { + "epoch": 2.812702527543746, + "grad_norm": 0.48376771807670593, + "learning_rate": 1.0187378531534287e-07, + "loss": 0.0707, + "step": 8680 + }, + { + "epoch": 2.8130265716137393, + "grad_norm": 0.4975831210613251, + "learning_rate": 1.0152278255518399e-07, + "loss": 0.0709, + "step": 8681 + }, + { + "epoch": 2.813350615683733, + "grad_norm": 0.5038923621177673, + "learning_rate": 1.011723793212166e-07, + "loss": 0.0711, + "step": 8682 + }, + { + "epoch": 2.8136746597537265, + "grad_norm": 0.4740455150604248, + "learning_rate": 1.0082257565632469e-07, + "loss": 0.0704, + "step": 8683 + }, + { + "epoch": 2.81399870382372, + "grad_norm": 0.4767991602420807, + "learning_rate": 1.0047337160332182e-07, + "loss": 0.0654, + "step": 8684 + }, + { + "epoch": 2.8143227478937134, + "grad_norm": 0.49966102838516235, + "learning_rate": 1.0012476720494713e-07, + "loss": 0.0739, + "step": 8685 + }, + { + "epoch": 2.814646791963707, + "grad_norm": 0.47208863496780396, + "learning_rate": 9.977676250386647e-08, + "loss": 0.0722, + "step": 8686 + }, + { + "epoch": 2.8149708360337007, + "grad_norm": 0.4928867220878601, + "learning_rate": 9.942935754267303e-08, + "loss": 0.0684, + "step": 8687 + }, + { + "epoch": 2.815294880103694, + "grad_norm": 0.5016983151435852, + "learning_rate": 9.908255236388386e-08, + "loss": 0.0723, + "step": 8688 + }, + { + "epoch": 2.8156189241736875, + "grad_norm": 0.4868336319923401, + "learning_rate": 9.873634700994671e-08, + "loss": 0.0701, + "step": 8689 + }, + { + "epoch": 2.8159429682436814, + "grad_norm": 0.4551834762096405, + "learning_rate": 9.839074152323159e-08, + "loss": 0.0671, + "step": 8690 + }, + { + "epoch": 2.816267012313675, + "grad_norm": 0.4637860059738159, + "learning_rate": 9.80457359460396e-08, + "loss": 0.0712, + "step": 8691 + }, + { + "epoch": 2.8165910563836682, + "grad_norm": 0.47512730956077576, + "learning_rate": 9.770133032059481e-08, + "loss": 0.066, + "step": 8692 + }, + { + "epoch": 2.8169151004536617, + "grad_norm": 0.46657752990722656, + "learning_rate": 9.735752468904846e-08, + "loss": 0.0653, + "step": 8693 + }, + { + "epoch": 2.817239144523655, + "grad_norm": 0.5024873614311218, + "learning_rate": 9.70143190934808e-08, + "loss": 0.0728, + "step": 8694 + }, + { + "epoch": 2.817563188593649, + "grad_norm": 0.4788472652435303, + "learning_rate": 9.667171357589489e-08, + "loss": 0.0679, + "step": 8695 + }, + { + "epoch": 2.8178872326636424, + "grad_norm": 0.4856061041355133, + "learning_rate": 9.63297081782244e-08, + "loss": 0.0702, + "step": 8696 + }, + { + "epoch": 2.818211276733636, + "grad_norm": 0.47453123331069946, + "learning_rate": 9.59883029423253e-08, + "loss": 0.0659, + "step": 8697 + }, + { + "epoch": 2.8185353208036292, + "grad_norm": 0.5024823546409607, + "learning_rate": 9.564749790998473e-08, + "loss": 0.076, + "step": 8698 + }, + { + "epoch": 2.8188593648736227, + "grad_norm": 0.4959494173526764, + "learning_rate": 9.530729312291153e-08, + "loss": 0.0715, + "step": 8699 + }, + { + "epoch": 2.8191834089436165, + "grad_norm": 0.500286340713501, + "learning_rate": 9.496768862274519e-08, + "loss": 0.0764, + "step": 8700 + }, + { + "epoch": 2.81950745301361, + "grad_norm": 0.4606480598449707, + "learning_rate": 9.462868445104912e-08, + "loss": 0.0675, + "step": 8701 + }, + { + "epoch": 2.8198314970836034, + "grad_norm": 0.4570009112358093, + "learning_rate": 9.4290280649314e-08, + "loss": 0.0682, + "step": 8702 + }, + { + "epoch": 2.820155541153597, + "grad_norm": 0.46508264541625977, + "learning_rate": 9.395247725895784e-08, + "loss": 0.0662, + "step": 8703 + }, + { + "epoch": 2.82047958522359, + "grad_norm": 0.46841660141944885, + "learning_rate": 9.361527432132478e-08, + "loss": 0.0668, + "step": 8704 + }, + { + "epoch": 2.820803629293584, + "grad_norm": 0.5106972455978394, + "learning_rate": 9.327867187768458e-08, + "loss": 0.073, + "step": 8705 + }, + { + "epoch": 2.8211276733635775, + "grad_norm": 0.484240859746933, + "learning_rate": 9.294266996923373e-08, + "loss": 0.0691, + "step": 8706 + }, + { + "epoch": 2.821451717433571, + "grad_norm": 0.4603111147880554, + "learning_rate": 9.260726863709601e-08, + "loss": 0.0663, + "step": 8707 + }, + { + "epoch": 2.8217757615035644, + "grad_norm": 0.5028695464134216, + "learning_rate": 9.227246792232136e-08, + "loss": 0.0706, + "step": 8708 + }, + { + "epoch": 2.8220998055735578, + "grad_norm": 0.5131447911262512, + "learning_rate": 9.193826786588645e-08, + "loss": 0.0803, + "step": 8709 + }, + { + "epoch": 2.8224238496435516, + "grad_norm": 0.4943181276321411, + "learning_rate": 9.160466850869354e-08, + "loss": 0.0728, + "step": 8710 + }, + { + "epoch": 2.822747893713545, + "grad_norm": 0.49125006794929504, + "learning_rate": 9.127166989157276e-08, + "loss": 0.0719, + "step": 8711 + }, + { + "epoch": 2.8230719377835385, + "grad_norm": 0.4617706537246704, + "learning_rate": 9.093927205527875e-08, + "loss": 0.0669, + "step": 8712 + }, + { + "epoch": 2.8233959818535324, + "grad_norm": 0.5301855802536011, + "learning_rate": 9.060747504049506e-08, + "loss": 0.0729, + "step": 8713 + }, + { + "epoch": 2.8237200259235253, + "grad_norm": 0.4995158314704895, + "learning_rate": 9.02762788878292e-08, + "loss": 0.0723, + "step": 8714 + }, + { + "epoch": 2.824044069993519, + "grad_norm": 0.4507756233215332, + "learning_rate": 8.994568363781764e-08, + "loss": 0.0615, + "step": 8715 + }, + { + "epoch": 2.8243681140635126, + "grad_norm": 0.47523167729377747, + "learning_rate": 8.961568933092136e-08, + "loss": 0.0694, + "step": 8716 + }, + { + "epoch": 2.824692158133506, + "grad_norm": 0.48729458451271057, + "learning_rate": 8.928629600752803e-08, + "loss": 0.0704, + "step": 8717 + }, + { + "epoch": 2.8250162022035, + "grad_norm": 0.5317953824996948, + "learning_rate": 8.89575037079532e-08, + "loss": 0.0749, + "step": 8718 + }, + { + "epoch": 2.8253402462734933, + "grad_norm": 0.48030078411102295, + "learning_rate": 8.862931247243689e-08, + "loss": 0.0694, + "step": 8719 + }, + { + "epoch": 2.8256642903434868, + "grad_norm": 0.4619617462158203, + "learning_rate": 8.830172234114754e-08, + "loss": 0.0645, + "step": 8720 + }, + { + "epoch": 2.82598833441348, + "grad_norm": 0.4722529649734497, + "learning_rate": 8.79747333541775e-08, + "loss": 0.0697, + "step": 8721 + }, + { + "epoch": 2.8263123784834736, + "grad_norm": 0.48667117953300476, + "learning_rate": 8.764834555154867e-08, + "loss": 0.0758, + "step": 8722 + }, + { + "epoch": 2.8266364225534675, + "grad_norm": 0.49473825097084045, + "learning_rate": 8.732255897320685e-08, + "loss": 0.0708, + "step": 8723 + }, + { + "epoch": 2.826960466623461, + "grad_norm": 0.5213518738746643, + "learning_rate": 8.699737365902572e-08, + "loss": 0.0742, + "step": 8724 + }, + { + "epoch": 2.8272845106934543, + "grad_norm": 0.5019205808639526, + "learning_rate": 8.667278964880398e-08, + "loss": 0.0739, + "step": 8725 + }, + { + "epoch": 2.8276085547634477, + "grad_norm": 0.5222662091255188, + "learning_rate": 8.634880698226877e-08, + "loss": 0.0747, + "step": 8726 + }, + { + "epoch": 2.827932598833441, + "grad_norm": 0.4360904395580292, + "learning_rate": 8.602542569907168e-08, + "loss": 0.062, + "step": 8727 + }, + { + "epoch": 2.828256642903435, + "grad_norm": 0.4444887936115265, + "learning_rate": 8.570264583879052e-08, + "loss": 0.0649, + "step": 8728 + }, + { + "epoch": 2.8285806869734285, + "grad_norm": 0.48832395672798157, + "learning_rate": 8.538046744093253e-08, + "loss": 0.0709, + "step": 8729 + }, + { + "epoch": 2.828904731043422, + "grad_norm": 0.46723636984825134, + "learning_rate": 8.505889054492789e-08, + "loss": 0.0675, + "step": 8730 + }, + { + "epoch": 2.8292287751134153, + "grad_norm": 0.5746234655380249, + "learning_rate": 8.473791519013453e-08, + "loss": 0.0764, + "step": 8731 + }, + { + "epoch": 2.8295528191834087, + "grad_norm": 0.4938589632511139, + "learning_rate": 8.441754141583714e-08, + "loss": 0.0716, + "step": 8732 + }, + { + "epoch": 2.8298768632534026, + "grad_norm": 0.5008729696273804, + "learning_rate": 8.409776926124546e-08, + "loss": 0.0666, + "step": 8733 + }, + { + "epoch": 2.830200907323396, + "grad_norm": 0.4394344985485077, + "learning_rate": 8.377859876549821e-08, + "loss": 0.0642, + "step": 8734 + }, + { + "epoch": 2.8305249513933894, + "grad_norm": 0.447689414024353, + "learning_rate": 8.346002996765745e-08, + "loss": 0.0667, + "step": 8735 + }, + { + "epoch": 2.830848995463383, + "grad_norm": 0.5375357866287231, + "learning_rate": 8.314206290671256e-08, + "loss": 0.0815, + "step": 8736 + }, + { + "epoch": 2.8311730395333763, + "grad_norm": 0.5031763911247253, + "learning_rate": 8.28246976215813e-08, + "loss": 0.0695, + "step": 8737 + }, + { + "epoch": 2.83149708360337, + "grad_norm": 0.542471706867218, + "learning_rate": 8.250793415110426e-08, + "loss": 0.0764, + "step": 8738 + }, + { + "epoch": 2.8318211276733636, + "grad_norm": 0.5349651575088501, + "learning_rate": 8.219177253405153e-08, + "loss": 0.0751, + "step": 8739 + }, + { + "epoch": 2.832145171743357, + "grad_norm": 0.46823248267173767, + "learning_rate": 8.187621280911773e-08, + "loss": 0.0685, + "step": 8740 + }, + { + "epoch": 2.832469215813351, + "grad_norm": 0.4961474537849426, + "learning_rate": 8.156125501492417e-08, + "loss": 0.0707, + "step": 8741 + }, + { + "epoch": 2.8327932598833443, + "grad_norm": 0.48361262679100037, + "learning_rate": 8.124689919001894e-08, + "loss": 0.0721, + "step": 8742 + }, + { + "epoch": 2.8331173039533377, + "grad_norm": 0.5136740207672119, + "learning_rate": 8.093314537287567e-08, + "loss": 0.0777, + "step": 8743 + }, + { + "epoch": 2.833441348023331, + "grad_norm": 0.5095680356025696, + "learning_rate": 8.061999360189587e-08, + "loss": 0.0742, + "step": 8744 + }, + { + "epoch": 2.8337653920933246, + "grad_norm": 0.4728745222091675, + "learning_rate": 8.030744391540501e-08, + "loss": 0.0709, + "step": 8745 + }, + { + "epoch": 2.8340894361633184, + "grad_norm": 0.48748818039894104, + "learning_rate": 7.999549635165693e-08, + "loss": 0.0705, + "step": 8746 + }, + { + "epoch": 2.834413480233312, + "grad_norm": 0.4940890967845917, + "learning_rate": 7.968415094883109e-08, + "loss": 0.0704, + "step": 8747 + }, + { + "epoch": 2.8347375243033053, + "grad_norm": 0.5177623629570007, + "learning_rate": 7.9373407745032e-08, + "loss": 0.0746, + "step": 8748 + }, + { + "epoch": 2.8350615683732987, + "grad_norm": 0.4558980464935303, + "learning_rate": 7.906326677829312e-08, + "loss": 0.067, + "step": 8749 + }, + { + "epoch": 2.835385612443292, + "grad_norm": 0.4997522532939911, + "learning_rate": 7.875372808657189e-08, + "loss": 0.0719, + "step": 8750 + }, + { + "epoch": 2.835709656513286, + "grad_norm": 0.4721794128417969, + "learning_rate": 7.844479170775299e-08, + "loss": 0.0701, + "step": 8751 + }, + { + "epoch": 2.8360337005832794, + "grad_norm": 0.4765371084213257, + "learning_rate": 7.813645767964673e-08, + "loss": 0.0708, + "step": 8752 + }, + { + "epoch": 2.836357744653273, + "grad_norm": 0.4879765212535858, + "learning_rate": 7.782872603999126e-08, + "loss": 0.0728, + "step": 8753 + }, + { + "epoch": 2.8366817887232663, + "grad_norm": 0.5061075687408447, + "learning_rate": 7.752159682644921e-08, + "loss": 0.0681, + "step": 8754 + }, + { + "epoch": 2.8370058327932597, + "grad_norm": 0.46989887952804565, + "learning_rate": 7.721507007661055e-08, + "loss": 0.0699, + "step": 8755 + }, + { + "epoch": 2.8373298768632536, + "grad_norm": 0.47777748107910156, + "learning_rate": 7.69091458279908e-08, + "loss": 0.0679, + "step": 8756 + }, + { + "epoch": 2.837653920933247, + "grad_norm": 0.46506989002227783, + "learning_rate": 7.66038241180328e-08, + "loss": 0.0642, + "step": 8757 + }, + { + "epoch": 2.8379779650032404, + "grad_norm": 0.4758833050727844, + "learning_rate": 7.629910498410442e-08, + "loss": 0.0701, + "step": 8758 + }, + { + "epoch": 2.838302009073234, + "grad_norm": 0.4828174114227295, + "learning_rate": 7.599498846350029e-08, + "loss": 0.0741, + "step": 8759 + }, + { + "epoch": 2.8386260531432272, + "grad_norm": 0.4962548613548279, + "learning_rate": 7.569147459344172e-08, + "loss": 0.0694, + "step": 8760 + }, + { + "epoch": 2.838950097213221, + "grad_norm": 0.5564802885055542, + "learning_rate": 7.53885634110757e-08, + "loss": 0.0823, + "step": 8761 + }, + { + "epoch": 2.8392741412832145, + "grad_norm": 0.46635982394218445, + "learning_rate": 7.508625495347533e-08, + "loss": 0.0692, + "step": 8762 + }, + { + "epoch": 2.839598185353208, + "grad_norm": 0.49309298396110535, + "learning_rate": 7.478454925764045e-08, + "loss": 0.0712, + "step": 8763 + }, + { + "epoch": 2.839922229423202, + "grad_norm": 0.5187074542045593, + "learning_rate": 7.448344636049709e-08, + "loss": 0.0742, + "step": 8764 + }, + { + "epoch": 2.840246273493195, + "grad_norm": 0.4811423718929291, + "learning_rate": 7.418294629889744e-08, + "loss": 0.0702, + "step": 8765 + }, + { + "epoch": 2.8405703175631887, + "grad_norm": 0.5099913477897644, + "learning_rate": 7.388304910961985e-08, + "loss": 0.0753, + "step": 8766 + }, + { + "epoch": 2.840894361633182, + "grad_norm": 0.5018423199653625, + "learning_rate": 7.358375482936719e-08, + "loss": 0.0684, + "step": 8767 + }, + { + "epoch": 2.8412184057031755, + "grad_norm": 0.49818873405456543, + "learning_rate": 7.328506349477294e-08, + "loss": 0.0726, + "step": 8768 + }, + { + "epoch": 2.8415424497731694, + "grad_norm": 0.47550129890441895, + "learning_rate": 7.298697514239228e-08, + "loss": 0.0677, + "step": 8769 + }, + { + "epoch": 2.841866493843163, + "grad_norm": 0.4928217828273773, + "learning_rate": 7.268948980870826e-08, + "loss": 0.0699, + "step": 8770 + }, + { + "epoch": 2.8421905379131562, + "grad_norm": 0.46691080927848816, + "learning_rate": 7.239260753013067e-08, + "loss": 0.0693, + "step": 8771 + }, + { + "epoch": 2.8425145819831497, + "grad_norm": 0.49736487865448, + "learning_rate": 7.20963283429954e-08, + "loss": 0.0737, + "step": 8772 + }, + { + "epoch": 2.842838626053143, + "grad_norm": 0.5066372752189636, + "learning_rate": 7.180065228356347e-08, + "loss": 0.0771, + "step": 8773 + }, + { + "epoch": 2.843162670123137, + "grad_norm": 0.47119608521461487, + "learning_rate": 7.15055793880226e-08, + "loss": 0.0709, + "step": 8774 + }, + { + "epoch": 2.8434867141931304, + "grad_norm": 0.4811091125011444, + "learning_rate": 7.121110969248834e-08, + "loss": 0.0706, + "step": 8775 + }, + { + "epoch": 2.843810758263124, + "grad_norm": 0.5038498044013977, + "learning_rate": 7.091724323299853e-08, + "loss": 0.0695, + "step": 8776 + }, + { + "epoch": 2.844134802333117, + "grad_norm": 0.5106398463249207, + "learning_rate": 7.062398004552218e-08, + "loss": 0.0789, + "step": 8777 + }, + { + "epoch": 2.8444588464031106, + "grad_norm": 0.46302106976509094, + "learning_rate": 7.033132016595001e-08, + "loss": 0.0657, + "step": 8778 + }, + { + "epoch": 2.8447828904731045, + "grad_norm": 0.46196722984313965, + "learning_rate": 7.003926363010116e-08, + "loss": 0.0669, + "step": 8779 + }, + { + "epoch": 2.845106934543098, + "grad_norm": 0.48582008481025696, + "learning_rate": 6.974781047372148e-08, + "loss": 0.071, + "step": 8780 + }, + { + "epoch": 2.8454309786130914, + "grad_norm": 0.4900462329387665, + "learning_rate": 6.945696073248077e-08, + "loss": 0.0735, + "step": 8781 + }, + { + "epoch": 2.845755022683085, + "grad_norm": 0.5111523270606995, + "learning_rate": 6.916671444197665e-08, + "loss": 0.0725, + "step": 8782 + }, + { + "epoch": 2.846079066753078, + "grad_norm": 0.44137054681777954, + "learning_rate": 6.887707163773238e-08, + "loss": 0.0596, + "step": 8783 + }, + { + "epoch": 2.846403110823072, + "grad_norm": 0.4662524163722992, + "learning_rate": 6.858803235519795e-08, + "loss": 0.0694, + "step": 8784 + }, + { + "epoch": 2.8467271548930655, + "grad_norm": 0.5039228200912476, + "learning_rate": 6.829959662974839e-08, + "loss": 0.074, + "step": 8785 + }, + { + "epoch": 2.847051198963059, + "grad_norm": 0.47388955950737, + "learning_rate": 6.8011764496686e-08, + "loss": 0.0736, + "step": 8786 + }, + { + "epoch": 2.8473752430330523, + "grad_norm": 0.5250383019447327, + "learning_rate": 6.772453599123763e-08, + "loss": 0.0734, + "step": 8787 + }, + { + "epoch": 2.8476992871030458, + "grad_norm": 0.49519991874694824, + "learning_rate": 6.743791114855847e-08, + "loss": 0.0732, + "step": 8788 + }, + { + "epoch": 2.8480233311730396, + "grad_norm": 0.45928895473480225, + "learning_rate": 6.715189000372768e-08, + "loss": 0.066, + "step": 8789 + }, + { + "epoch": 2.848347375243033, + "grad_norm": 0.4567415714263916, + "learning_rate": 6.686647259175227e-08, + "loss": 0.0629, + "step": 8790 + }, + { + "epoch": 2.8486714193130265, + "grad_norm": 0.4983983039855957, + "learning_rate": 6.65816589475643e-08, + "loss": 0.0717, + "step": 8791 + }, + { + "epoch": 2.8489954633830203, + "grad_norm": 0.5043929219245911, + "learning_rate": 6.629744910602142e-08, + "loss": 0.0747, + "step": 8792 + }, + { + "epoch": 2.8493195074530138, + "grad_norm": 0.45701977610588074, + "learning_rate": 6.601384310190917e-08, + "loss": 0.0683, + "step": 8793 + }, + { + "epoch": 2.849643551523007, + "grad_norm": 0.48810067772865295, + "learning_rate": 6.573084096993809e-08, + "loss": 0.0705, + "step": 8794 + }, + { + "epoch": 2.8499675955930006, + "grad_norm": 0.4656825661659241, + "learning_rate": 6.544844274474438e-08, + "loss": 0.0664, + "step": 8795 + }, + { + "epoch": 2.850291639662994, + "grad_norm": 0.4611987769603729, + "learning_rate": 6.516664846089094e-08, + "loss": 0.0651, + "step": 8796 + }, + { + "epoch": 2.850615683732988, + "grad_norm": 0.5130912065505981, + "learning_rate": 6.488545815286739e-08, + "loss": 0.0766, + "step": 8797 + }, + { + "epoch": 2.8509397278029813, + "grad_norm": 0.49199214577674866, + "learning_rate": 6.460487185508735e-08, + "loss": 0.0715, + "step": 8798 + }, + { + "epoch": 2.8512637718729748, + "grad_norm": 0.4907855987548828, + "learning_rate": 6.432488960189331e-08, + "loss": 0.0737, + "step": 8799 + }, + { + "epoch": 2.851587815942968, + "grad_norm": 0.47007766366004944, + "learning_rate": 6.404551142755178e-08, + "loss": 0.0661, + "step": 8800 + }, + { + "epoch": 2.8519118600129616, + "grad_norm": 0.49704840779304504, + "learning_rate": 6.376673736625538e-08, + "loss": 0.0734, + "step": 8801 + }, + { + "epoch": 2.8522359040829555, + "grad_norm": 0.4636077284812927, + "learning_rate": 6.348856745212461e-08, + "loss": 0.0691, + "step": 8802 + }, + { + "epoch": 2.852559948152949, + "grad_norm": 0.46470972895622253, + "learning_rate": 6.321100171920335e-08, + "loss": 0.0712, + "step": 8803 + }, + { + "epoch": 2.8528839922229423, + "grad_norm": 0.47441092133522034, + "learning_rate": 6.293404020146443e-08, + "loss": 0.0694, + "step": 8804 + }, + { + "epoch": 2.8532080362929357, + "grad_norm": 0.4719110131263733, + "learning_rate": 6.265768293280349e-08, + "loss": 0.0698, + "step": 8805 + }, + { + "epoch": 2.853532080362929, + "grad_norm": 0.4885626435279846, + "learning_rate": 6.238192994704573e-08, + "loss": 0.0672, + "step": 8806 + }, + { + "epoch": 2.853856124432923, + "grad_norm": 0.5021069049835205, + "learning_rate": 6.210678127793912e-08, + "loss": 0.0729, + "step": 8807 + }, + { + "epoch": 2.8541801685029164, + "grad_norm": 0.5236930847167969, + "learning_rate": 6.183223695916119e-08, + "loss": 0.0786, + "step": 8808 + }, + { + "epoch": 2.85450421257291, + "grad_norm": 0.4771360754966736, + "learning_rate": 6.15582970243117e-08, + "loss": 0.0678, + "step": 8809 + }, + { + "epoch": 2.8548282566429033, + "grad_norm": 0.48816344141960144, + "learning_rate": 6.128496150691832e-08, + "loss": 0.0706, + "step": 8810 + }, + { + "epoch": 2.8551523007128967, + "grad_norm": 0.5284494757652283, + "learning_rate": 6.101223044043592e-08, + "loss": 0.0735, + "step": 8811 + }, + { + "epoch": 2.8554763447828906, + "grad_norm": 0.5202637910842896, + "learning_rate": 6.074010385824281e-08, + "loss": 0.0767, + "step": 8812 + }, + { + "epoch": 2.855800388852884, + "grad_norm": 0.49742886424064636, + "learning_rate": 6.046858179364568e-08, + "loss": 0.0739, + "step": 8813 + }, + { + "epoch": 2.8561244329228774, + "grad_norm": 0.5342046618461609, + "learning_rate": 6.019766427987572e-08, + "loss": 0.0672, + "step": 8814 + }, + { + "epoch": 2.8564484769928713, + "grad_norm": 0.4571148753166199, + "learning_rate": 5.992735135009087e-08, + "loss": 0.0674, + "step": 8815 + }, + { + "epoch": 2.8567725210628643, + "grad_norm": 0.5210033655166626, + "learning_rate": 5.965764303737409e-08, + "loss": 0.074, + "step": 8816 + }, + { + "epoch": 2.857096565132858, + "grad_norm": 0.4892861247062683, + "learning_rate": 5.938853937473565e-08, + "loss": 0.0715, + "step": 8817 + }, + { + "epoch": 2.8574206092028516, + "grad_norm": 0.5085569024085999, + "learning_rate": 5.912004039511143e-08, + "loss": 0.0727, + "step": 8818 + }, + { + "epoch": 2.857744653272845, + "grad_norm": 0.5078155398368835, + "learning_rate": 5.8852146131362366e-08, + "loss": 0.0724, + "step": 8819 + }, + { + "epoch": 2.858068697342839, + "grad_norm": 0.49197226762771606, + "learning_rate": 5.858485661627722e-08, + "loss": 0.0717, + "step": 8820 + }, + { + "epoch": 2.8583927414128323, + "grad_norm": 0.4717177450656891, + "learning_rate": 5.831817188256872e-08, + "loss": 0.0679, + "step": 8821 + }, + { + "epoch": 2.8587167854828257, + "grad_norm": 0.5008045434951782, + "learning_rate": 5.805209196287687e-08, + "loss": 0.0718, + "step": 8822 + }, + { + "epoch": 2.859040829552819, + "grad_norm": 0.49238985776901245, + "learning_rate": 5.778661688976728e-08, + "loss": 0.0708, + "step": 8823 + }, + { + "epoch": 2.8593648736228126, + "grad_norm": 0.4850955605506897, + "learning_rate": 5.7521746695731186e-08, + "loss": 0.0706, + "step": 8824 + }, + { + "epoch": 2.8596889176928064, + "grad_norm": 0.4857136309146881, + "learning_rate": 5.725748141318654e-08, + "loss": 0.0705, + "step": 8825 + }, + { + "epoch": 2.8600129617628, + "grad_norm": 0.497732549905777, + "learning_rate": 5.699382107447637e-08, + "loss": 0.0738, + "step": 8826 + }, + { + "epoch": 2.8603370058327933, + "grad_norm": 0.4750259816646576, + "learning_rate": 5.6730765711870975e-08, + "loss": 0.0672, + "step": 8827 + }, + { + "epoch": 2.8606610499027867, + "grad_norm": 0.4785563051700592, + "learning_rate": 5.646831535756569e-08, + "loss": 0.0699, + "step": 8828 + }, + { + "epoch": 2.86098509397278, + "grad_norm": 0.4761826992034912, + "learning_rate": 5.620647004368041e-08, + "loss": 0.0663, + "step": 8829 + }, + { + "epoch": 2.861309138042774, + "grad_norm": 0.46488818526268005, + "learning_rate": 5.594522980226447e-08, + "loss": 0.0659, + "step": 8830 + }, + { + "epoch": 2.8616331821127674, + "grad_norm": 0.5118567943572998, + "learning_rate": 5.568459466529008e-08, + "loss": 0.0744, + "step": 8831 + }, + { + "epoch": 2.861957226182761, + "grad_norm": 0.5114924907684326, + "learning_rate": 5.542456466465618e-08, + "loss": 0.079, + "step": 8832 + }, + { + "epoch": 2.8622812702527543, + "grad_norm": 0.4875848889350891, + "learning_rate": 5.516513983218841e-08, + "loss": 0.0714, + "step": 8833 + }, + { + "epoch": 2.8626053143227477, + "grad_norm": 0.44935011863708496, + "learning_rate": 5.490632019963804e-08, + "loss": 0.0645, + "step": 8834 + }, + { + "epoch": 2.8629293583927415, + "grad_norm": 0.5156768560409546, + "learning_rate": 5.464810579868196e-08, + "loss": 0.0719, + "step": 8835 + }, + { + "epoch": 2.863253402462735, + "grad_norm": 0.5234912037849426, + "learning_rate": 5.439049666092266e-08, + "loss": 0.0735, + "step": 8836 + }, + { + "epoch": 2.8635774465327284, + "grad_norm": 0.5177478194236755, + "learning_rate": 5.4133492817889935e-08, + "loss": 0.0749, + "step": 8837 + }, + { + "epoch": 2.863901490602722, + "grad_norm": 0.47083693742752075, + "learning_rate": 5.387709430103749e-08, + "loss": 0.0665, + "step": 8838 + }, + { + "epoch": 2.8642255346727152, + "grad_norm": 0.4397895038127899, + "learning_rate": 5.362130114174691e-08, + "loss": 0.0616, + "step": 8839 + }, + { + "epoch": 2.864549578742709, + "grad_norm": 0.512173056602478, + "learning_rate": 5.3366113371324245e-08, + "loss": 0.0781, + "step": 8840 + }, + { + "epoch": 2.8648736228127025, + "grad_norm": 0.4663113057613373, + "learning_rate": 5.311153102100175e-08, + "loss": 0.0685, + "step": 8841 + }, + { + "epoch": 2.865197666882696, + "grad_norm": 0.4491751492023468, + "learning_rate": 5.2857554121938935e-08, + "loss": 0.0675, + "step": 8842 + }, + { + "epoch": 2.86552171095269, + "grad_norm": 0.49800777435302734, + "learning_rate": 5.2604182705219274e-08, + "loss": 0.0752, + "step": 8843 + }, + { + "epoch": 2.8658457550226832, + "grad_norm": 0.46831420063972473, + "learning_rate": 5.235141680185296e-08, + "loss": 0.0712, + "step": 8844 + }, + { + "epoch": 2.8661697990926767, + "grad_norm": 0.4684108793735504, + "learning_rate": 5.209925644277636e-08, + "loss": 0.068, + "step": 8845 + }, + { + "epoch": 2.86649384316267, + "grad_norm": 0.5100395083427429, + "learning_rate": 5.1847701658851445e-08, + "loss": 0.0746, + "step": 8846 + }, + { + "epoch": 2.8668178872326635, + "grad_norm": 0.5008055567741394, + "learning_rate": 5.159675248086582e-08, + "loss": 0.0747, + "step": 8847 + }, + { + "epoch": 2.8671419313026574, + "grad_norm": 0.48468106985092163, + "learning_rate": 5.1346408939533795e-08, + "loss": 0.0746, + "step": 8848 + }, + { + "epoch": 2.867465975372651, + "grad_norm": 0.5062259435653687, + "learning_rate": 5.109667106549421e-08, + "loss": 0.0732, + "step": 8849 + }, + { + "epoch": 2.8677900194426442, + "grad_norm": 0.5236890316009521, + "learning_rate": 5.084753888931315e-08, + "loss": 0.0749, + "step": 8850 + }, + { + "epoch": 2.8681140635126376, + "grad_norm": 0.46542856097221375, + "learning_rate": 5.05990124414818e-08, + "loss": 0.0693, + "step": 8851 + }, + { + "epoch": 2.868438107582631, + "grad_norm": 0.4806134104728699, + "learning_rate": 5.035109175241748e-08, + "loss": 0.0691, + "step": 8852 + }, + { + "epoch": 2.868762151652625, + "grad_norm": 0.486376017332077, + "learning_rate": 5.010377685246315e-08, + "loss": 0.0695, + "step": 8853 + }, + { + "epoch": 2.8690861957226184, + "grad_norm": 0.47897252440452576, + "learning_rate": 4.985706777188792e-08, + "loss": 0.0672, + "step": 8854 + }, + { + "epoch": 2.869410239792612, + "grad_norm": 0.4908261299133301, + "learning_rate": 4.961096454088654e-08, + "loss": 0.0729, + "step": 8855 + }, + { + "epoch": 2.869734283862605, + "grad_norm": 0.5017130970954895, + "learning_rate": 4.936546718957935e-08, + "loss": 0.0742, + "step": 8856 + }, + { + "epoch": 2.8700583279325986, + "grad_norm": 0.5411863923072815, + "learning_rate": 4.912057574801343e-08, + "loss": 0.0796, + "step": 8857 + }, + { + "epoch": 2.8703823720025925, + "grad_norm": 0.5324139595031738, + "learning_rate": 4.887629024616036e-08, + "loss": 0.0675, + "step": 8858 + }, + { + "epoch": 2.870706416072586, + "grad_norm": 0.4880392551422119, + "learning_rate": 4.863261071391956e-08, + "loss": 0.0703, + "step": 8859 + }, + { + "epoch": 2.8710304601425793, + "grad_norm": 0.5131406784057617, + "learning_rate": 4.838953718111328e-08, + "loss": 0.0708, + "step": 8860 + }, + { + "epoch": 2.8713545042125728, + "grad_norm": 0.4706338942050934, + "learning_rate": 4.8147069677493274e-08, + "loss": 0.0668, + "step": 8861 + }, + { + "epoch": 2.871678548282566, + "grad_norm": 0.518531084060669, + "learning_rate": 4.790520823273359e-08, + "loss": 0.0758, + "step": 8862 + }, + { + "epoch": 2.87200259235256, + "grad_norm": 0.520061194896698, + "learning_rate": 4.766395287643666e-08, + "loss": 0.0714, + "step": 8863 + }, + { + "epoch": 2.8723266364225535, + "grad_norm": 0.4812447130680084, + "learning_rate": 4.742330363812997e-08, + "loss": 0.0704, + "step": 8864 + }, + { + "epoch": 2.872650680492547, + "grad_norm": 0.4664628803730011, + "learning_rate": 4.718326054726552e-08, + "loss": 0.0707, + "step": 8865 + }, + { + "epoch": 2.8729747245625408, + "grad_norm": 0.48417264223098755, + "learning_rate": 4.694382363322369e-08, + "loss": 0.0739, + "step": 8866 + }, + { + "epoch": 2.8732987686325338, + "grad_norm": 0.45417407155036926, + "learning_rate": 4.670499292530828e-08, + "loss": 0.0672, + "step": 8867 + }, + { + "epoch": 2.8736228127025276, + "grad_norm": 0.5107606649398804, + "learning_rate": 4.6466768452750334e-08, + "loss": 0.0746, + "step": 8868 + }, + { + "epoch": 2.873946856772521, + "grad_norm": 0.5404432415962219, + "learning_rate": 4.622915024470542e-08, + "loss": 0.076, + "step": 8869 + }, + { + "epoch": 2.8742709008425145, + "grad_norm": 0.4897291660308838, + "learning_rate": 4.5992138330256396e-08, + "loss": 0.0709, + "step": 8870 + }, + { + "epoch": 2.8745949449125083, + "grad_norm": 0.5036388039588928, + "learning_rate": 4.5755732738411715e-08, + "loss": 0.0706, + "step": 8871 + }, + { + "epoch": 2.8749189889825018, + "grad_norm": 0.4615165889263153, + "learning_rate": 4.5519933498103795e-08, + "loss": 0.0632, + "step": 8872 + }, + { + "epoch": 2.875243033052495, + "grad_norm": 0.513347327709198, + "learning_rate": 4.5284740638193435e-08, + "loss": 0.0761, + "step": 8873 + }, + { + "epoch": 2.8755670771224886, + "grad_norm": 0.512653648853302, + "learning_rate": 4.505015418746539e-08, + "loss": 0.0733, + "step": 8874 + }, + { + "epoch": 2.875891121192482, + "grad_norm": 0.486761212348938, + "learning_rate": 4.481617417463113e-08, + "loss": 0.0726, + "step": 8875 + }, + { + "epoch": 2.876215165262476, + "grad_norm": 0.49732506275177, + "learning_rate": 4.458280062832665e-08, + "loss": 0.0755, + "step": 8876 + }, + { + "epoch": 2.8765392093324693, + "grad_norm": 0.47248610854148865, + "learning_rate": 4.435003357711576e-08, + "loss": 0.0706, + "step": 8877 + }, + { + "epoch": 2.8768632534024627, + "grad_norm": 0.45865169167518616, + "learning_rate": 4.411787304948567e-08, + "loss": 0.0688, + "step": 8878 + }, + { + "epoch": 2.877187297472456, + "grad_norm": 0.4855770766735077, + "learning_rate": 4.388631907385199e-08, + "loss": 0.0696, + "step": 8879 + }, + { + "epoch": 2.8775113415424496, + "grad_norm": 0.5271292924880981, + "learning_rate": 4.365537167855371e-08, + "loss": 0.0724, + "step": 8880 + }, + { + "epoch": 2.8778353856124435, + "grad_norm": 0.4924415349960327, + "learning_rate": 4.342503089185657e-08, + "loss": 0.0706, + "step": 8881 + }, + { + "epoch": 2.878159429682437, + "grad_norm": 0.5239536762237549, + "learning_rate": 4.319529674195244e-08, + "loss": 0.0747, + "step": 8882 + }, + { + "epoch": 2.8784834737524303, + "grad_norm": 0.5411633849143982, + "learning_rate": 4.296616925695829e-08, + "loss": 0.0772, + "step": 8883 + }, + { + "epoch": 2.8788075178224237, + "grad_norm": 0.49615591764450073, + "learning_rate": 4.2737648464917236e-08, + "loss": 0.066, + "step": 8884 + }, + { + "epoch": 2.879131561892417, + "grad_norm": 0.4745228588581085, + "learning_rate": 4.250973439379858e-08, + "loss": 0.0678, + "step": 8885 + }, + { + "epoch": 2.879455605962411, + "grad_norm": 0.5283622741699219, + "learning_rate": 4.2282427071495545e-08, + "loss": 0.0759, + "step": 8886 + }, + { + "epoch": 2.8797796500324044, + "grad_norm": 0.5028887391090393, + "learning_rate": 4.2055726525829234e-08, + "loss": 0.0761, + "step": 8887 + }, + { + "epoch": 2.880103694102398, + "grad_norm": 0.4696975648403168, + "learning_rate": 4.1829632784545216e-08, + "loss": 0.0653, + "step": 8888 + }, + { + "epoch": 2.8804277381723913, + "grad_norm": 0.45641934871673584, + "learning_rate": 4.160414587531525e-08, + "loss": 0.0637, + "step": 8889 + }, + { + "epoch": 2.8807517822423847, + "grad_norm": 0.4784345030784607, + "learning_rate": 4.137926582573726e-08, + "loss": 0.0674, + "step": 8890 + }, + { + "epoch": 2.8810758263123786, + "grad_norm": 0.4932958781719208, + "learning_rate": 4.1154992663333674e-08, + "loss": 0.0719, + "step": 8891 + }, + { + "epoch": 2.881399870382372, + "grad_norm": 0.4853772521018982, + "learning_rate": 4.093132641555364e-08, + "loss": 0.0695, + "step": 8892 + }, + { + "epoch": 2.8817239144523654, + "grad_norm": 0.4972860515117645, + "learning_rate": 4.0708267109771935e-08, + "loss": 0.0758, + "step": 8893 + }, + { + "epoch": 2.8820479585223593, + "grad_norm": 0.4952300190925598, + "learning_rate": 4.048581477328839e-08, + "loss": 0.067, + "step": 8894 + }, + { + "epoch": 2.8823720025923527, + "grad_norm": 0.4682835638523102, + "learning_rate": 4.026396943332955e-08, + "loss": 0.0673, + "step": 8895 + }, + { + "epoch": 2.882696046662346, + "grad_norm": 0.4913991093635559, + "learning_rate": 4.004273111704704e-08, + "loss": 0.0758, + "step": 8896 + }, + { + "epoch": 2.8830200907323396, + "grad_norm": 0.47748857736587524, + "learning_rate": 3.982209985151753e-08, + "loss": 0.0676, + "step": 8897 + }, + { + "epoch": 2.883344134802333, + "grad_norm": 0.4662216007709503, + "learning_rate": 3.9602075663744964e-08, + "loss": 0.0677, + "step": 8898 + }, + { + "epoch": 2.883668178872327, + "grad_norm": 0.4768160581588745, + "learning_rate": 3.938265858065837e-08, + "loss": 0.0682, + "step": 8899 + }, + { + "epoch": 2.8839922229423203, + "grad_norm": 0.4909741282463074, + "learning_rate": 3.916384862911182e-08, + "loss": 0.0725, + "step": 8900 + }, + { + "epoch": 2.8843162670123137, + "grad_norm": 0.5086570978164673, + "learning_rate": 3.8945645835885556e-08, + "loss": 0.0754, + "step": 8901 + }, + { + "epoch": 2.884640311082307, + "grad_norm": 0.4623122215270996, + "learning_rate": 3.872805022768489e-08, + "loss": 0.0677, + "step": 8902 + }, + { + "epoch": 2.8849643551523005, + "grad_norm": 0.487517774105072, + "learning_rate": 3.8511061831142394e-08, + "loss": 0.0717, + "step": 8903 + }, + { + "epoch": 2.8852883992222944, + "grad_norm": 0.4888652563095093, + "learning_rate": 3.829468067281517e-08, + "loss": 0.0725, + "step": 8904 + }, + { + "epoch": 2.885612443292288, + "grad_norm": 0.4961828291416168, + "learning_rate": 3.807890677918591e-08, + "loss": 0.0769, + "step": 8905 + }, + { + "epoch": 2.8859364873622813, + "grad_norm": 0.48514750599861145, + "learning_rate": 3.78637401766635e-08, + "loss": 0.0725, + "step": 8906 + }, + { + "epoch": 2.8862605314322747, + "grad_norm": 0.4731883704662323, + "learning_rate": 3.764918089158187e-08, + "loss": 0.0716, + "step": 8907 + }, + { + "epoch": 2.886584575502268, + "grad_norm": 0.5104616284370422, + "learning_rate": 3.743522895020168e-08, + "loss": 0.0723, + "step": 8908 + }, + { + "epoch": 2.886908619572262, + "grad_norm": 0.4929802119731903, + "learning_rate": 3.7221884378707554e-08, + "loss": 0.0673, + "step": 8909 + }, + { + "epoch": 2.8872326636422554, + "grad_norm": 0.4420555830001831, + "learning_rate": 3.700914720321136e-08, + "loss": 0.0649, + "step": 8910 + }, + { + "epoch": 2.887556707712249, + "grad_norm": 0.507371187210083, + "learning_rate": 3.679701744975006e-08, + "loss": 0.0692, + "step": 8911 + }, + { + "epoch": 2.8878807517822422, + "grad_norm": 0.5148362517356873, + "learning_rate": 3.658549514428678e-08, + "loss": 0.069, + "step": 8912 + }, + { + "epoch": 2.8882047958522357, + "grad_norm": 0.49123647809028625, + "learning_rate": 3.637458031270913e-08, + "loss": 0.0703, + "step": 8913 + }, + { + "epoch": 2.8885288399222295, + "grad_norm": 0.49537959694862366, + "learning_rate": 3.616427298083092e-08, + "loss": 0.0701, + "step": 8914 + }, + { + "epoch": 2.888852883992223, + "grad_norm": 0.4905968904495239, + "learning_rate": 3.5954573174392106e-08, + "loss": 0.0704, + "step": 8915 + }, + { + "epoch": 2.8891769280622164, + "grad_norm": 0.49671250581741333, + "learning_rate": 3.574548091905827e-08, + "loss": 0.0738, + "step": 8916 + }, + { + "epoch": 2.8895009721322102, + "grad_norm": 0.5079336166381836, + "learning_rate": 3.553699624041951e-08, + "loss": 0.0715, + "step": 8917 + }, + { + "epoch": 2.8898250162022032, + "grad_norm": 0.48482534289360046, + "learning_rate": 3.532911916399262e-08, + "loss": 0.0709, + "step": 8918 + }, + { + "epoch": 2.890149060272197, + "grad_norm": 0.4966800808906555, + "learning_rate": 3.512184971522003e-08, + "loss": 0.0719, + "step": 8919 + }, + { + "epoch": 2.8904731043421905, + "grad_norm": 0.490303635597229, + "learning_rate": 3.491518791946924e-08, + "loss": 0.0677, + "step": 8920 + }, + { + "epoch": 2.890797148412184, + "grad_norm": 0.4899630546569824, + "learning_rate": 3.470913380203389e-08, + "loss": 0.073, + "step": 8921 + }, + { + "epoch": 2.891121192482178, + "grad_norm": 0.4699013829231262, + "learning_rate": 3.450368738813215e-08, + "loss": 0.0659, + "step": 8922 + }, + { + "epoch": 2.8914452365521712, + "grad_norm": 0.4546189606189728, + "learning_rate": 3.4298848702910006e-08, + "loss": 0.0655, + "step": 8923 + }, + { + "epoch": 2.8917692806221647, + "grad_norm": 0.5081414580345154, + "learning_rate": 3.4094617771436854e-08, + "loss": 0.0733, + "step": 8924 + }, + { + "epoch": 2.892093324692158, + "grad_norm": 0.5270804166793823, + "learning_rate": 3.389099461870882e-08, + "loss": 0.0806, + "step": 8925 + }, + { + "epoch": 2.8924173687621515, + "grad_norm": 0.5022570490837097, + "learning_rate": 3.368797926964762e-08, + "loss": 0.0727, + "step": 8926 + }, + { + "epoch": 2.8927414128321454, + "grad_norm": 0.5113040804862976, + "learning_rate": 3.34855717490995e-08, + "loss": 0.0735, + "step": 8927 + }, + { + "epoch": 2.893065456902139, + "grad_norm": 0.5260816812515259, + "learning_rate": 3.3283772081838526e-08, + "loss": 0.0732, + "step": 8928 + }, + { + "epoch": 2.893389500972132, + "grad_norm": 0.5216476321220398, + "learning_rate": 3.308258029256162e-08, + "loss": 0.0672, + "step": 8929 + }, + { + "epoch": 2.8937135450421256, + "grad_norm": 0.5061860680580139, + "learning_rate": 3.288199640589407e-08, + "loss": 0.0724, + "step": 8930 + }, + { + "epoch": 2.894037589112119, + "grad_norm": 0.4715794622898102, + "learning_rate": 3.268202044638458e-08, + "loss": 0.0712, + "step": 8931 + }, + { + "epoch": 2.894361633182113, + "grad_norm": 0.5052686333656311, + "learning_rate": 3.248265243850801e-08, + "loss": 0.0696, + "step": 8932 + }, + { + "epoch": 2.8946856772521063, + "grad_norm": 0.5165843963623047, + "learning_rate": 3.228389240666541e-08, + "loss": 0.0732, + "step": 8933 + }, + { + "epoch": 2.8950097213220998, + "grad_norm": 0.509536623954773, + "learning_rate": 3.208574037518397e-08, + "loss": 0.0747, + "step": 8934 + }, + { + "epoch": 2.895333765392093, + "grad_norm": 0.5373370051383972, + "learning_rate": 3.188819636831375e-08, + "loss": 0.0739, + "step": 8935 + }, + { + "epoch": 2.8956578094620866, + "grad_norm": 0.4681432545185089, + "learning_rate": 3.1691260410234295e-08, + "loss": 0.0673, + "step": 8936 + }, + { + "epoch": 2.8959818535320805, + "grad_norm": 0.5055105090141296, + "learning_rate": 3.1494932525046875e-08, + "loss": 0.0731, + "step": 8937 + }, + { + "epoch": 2.896305897602074, + "grad_norm": 0.4752630293369293, + "learning_rate": 3.1299212736781156e-08, + "loss": 0.0633, + "step": 8938 + }, + { + "epoch": 2.8966299416720673, + "grad_norm": 0.4678106904029846, + "learning_rate": 3.1104101069390766e-08, + "loss": 0.0673, + "step": 8939 + }, + { + "epoch": 2.8969539857420608, + "grad_norm": 0.4853931963443756, + "learning_rate": 3.0909597546756046e-08, + "loss": 0.0713, + "step": 8940 + }, + { + "epoch": 2.897278029812054, + "grad_norm": 0.5015696287155151, + "learning_rate": 3.071570219268183e-08, + "loss": 0.0719, + "step": 8941 + }, + { + "epoch": 2.897602073882048, + "grad_norm": 0.49477383494377136, + "learning_rate": 3.05224150308997e-08, + "loss": 0.0721, + "step": 8942 + }, + { + "epoch": 2.8979261179520415, + "grad_norm": 0.4835866391658783, + "learning_rate": 3.032973608506573e-08, + "loss": 0.0705, + "step": 8943 + }, + { + "epoch": 2.898250162022035, + "grad_norm": 0.5060580968856812, + "learning_rate": 3.013766537876106e-08, + "loss": 0.0744, + "step": 8944 + }, + { + "epoch": 2.8985742060920288, + "grad_norm": 0.4551170766353607, + "learning_rate": 2.9946202935495216e-08, + "loss": 0.0646, + "step": 8945 + }, + { + "epoch": 2.898898250162022, + "grad_norm": 0.5011014342308044, + "learning_rate": 2.9755348778699457e-08, + "loss": 0.07, + "step": 8946 + }, + { + "epoch": 2.8992222942320156, + "grad_norm": 0.45690640807151794, + "learning_rate": 2.9565102931733426e-08, + "loss": 0.0669, + "step": 8947 + }, + { + "epoch": 2.899546338302009, + "grad_norm": 0.46726682782173157, + "learning_rate": 2.937546541788183e-08, + "loss": 0.0657, + "step": 8948 + }, + { + "epoch": 2.8998703823720025, + "grad_norm": 0.49788811802864075, + "learning_rate": 2.9186436260353335e-08, + "loss": 0.0686, + "step": 8949 + }, + { + "epoch": 2.9001944264419963, + "grad_norm": 0.4802239239215851, + "learning_rate": 2.899801548228387e-08, + "loss": 0.0683, + "step": 8950 + }, + { + "epoch": 2.9005184705119897, + "grad_norm": 0.48868319392204285, + "learning_rate": 2.8810203106734436e-08, + "loss": 0.0712, + "step": 8951 + }, + { + "epoch": 2.900842514581983, + "grad_norm": 0.510743260383606, + "learning_rate": 2.8622999156691643e-08, + "loss": 0.0686, + "step": 8952 + }, + { + "epoch": 2.9011665586519766, + "grad_norm": 0.45631998777389526, + "learning_rate": 2.843640365506606e-08, + "loss": 0.0637, + "step": 8953 + }, + { + "epoch": 2.90149060272197, + "grad_norm": 0.5396420955657959, + "learning_rate": 2.8250416624697186e-08, + "loss": 0.0779, + "step": 8954 + }, + { + "epoch": 2.901814646791964, + "grad_norm": 0.519350528717041, + "learning_rate": 2.806503808834682e-08, + "loss": 0.0756, + "step": 8955 + }, + { + "epoch": 2.9021386908619573, + "grad_norm": 0.4911840260028839, + "learning_rate": 2.7880268068703476e-08, + "loss": 0.0702, + "step": 8956 + }, + { + "epoch": 2.9024627349319507, + "grad_norm": 0.5336769819259644, + "learning_rate": 2.7696106588381844e-08, + "loss": 0.0759, + "step": 8957 + }, + { + "epoch": 2.902786779001944, + "grad_norm": 0.45938992500305176, + "learning_rate": 2.7512553669921117e-08, + "loss": 0.0674, + "step": 8958 + }, + { + "epoch": 2.9031108230719376, + "grad_norm": 0.47019660472869873, + "learning_rate": 2.73296093357861e-08, + "loss": 0.0696, + "step": 8959 + }, + { + "epoch": 2.9034348671419314, + "grad_norm": 0.48484107851982117, + "learning_rate": 2.7147273608367775e-08, + "loss": 0.0688, + "step": 8960 + }, + { + "epoch": 2.903758911211925, + "grad_norm": 0.4778381884098053, + "learning_rate": 2.696554650998273e-08, + "loss": 0.07, + "step": 8961 + }, + { + "epoch": 2.9040829552819183, + "grad_norm": 0.49290984869003296, + "learning_rate": 2.6784428062871514e-08, + "loss": 0.0714, + "step": 8962 + }, + { + "epoch": 2.9044069993519117, + "grad_norm": 0.48641347885131836, + "learning_rate": 2.6603918289201948e-08, + "loss": 0.07, + "step": 8963 + }, + { + "epoch": 2.904731043421905, + "grad_norm": 0.46779337525367737, + "learning_rate": 2.6424017211066354e-08, + "loss": 0.0655, + "step": 8964 + }, + { + "epoch": 2.905055087491899, + "grad_norm": 0.48739469051361084, + "learning_rate": 2.624472485048324e-08, + "loss": 0.0723, + "step": 8965 + }, + { + "epoch": 2.9053791315618924, + "grad_norm": 0.4942921996116638, + "learning_rate": 2.6066041229396156e-08, + "loss": 0.0716, + "step": 8966 + }, + { + "epoch": 2.905703175631886, + "grad_norm": 0.5472116470336914, + "learning_rate": 2.5887966369674833e-08, + "loss": 0.0757, + "step": 8967 + }, + { + "epoch": 2.9060272197018797, + "grad_norm": 0.4734596014022827, + "learning_rate": 2.5710500293112394e-08, + "loss": 0.0676, + "step": 8968 + }, + { + "epoch": 2.906351263771873, + "grad_norm": 0.4330841898918152, + "learning_rate": 2.5533643021430355e-08, + "loss": 0.0603, + "step": 8969 + }, + { + "epoch": 2.9066753078418666, + "grad_norm": 0.48858642578125, + "learning_rate": 2.5357394576273618e-08, + "loss": 0.0725, + "step": 8970 + }, + { + "epoch": 2.90699935191186, + "grad_norm": 0.4687802791595459, + "learning_rate": 2.5181754979213823e-08, + "loss": 0.0674, + "step": 8971 + }, + { + "epoch": 2.9073233959818534, + "grad_norm": 0.4720984101295471, + "learning_rate": 2.5006724251747104e-08, + "loss": 0.0685, + "step": 8972 + }, + { + "epoch": 2.9076474400518473, + "grad_norm": 0.5154309868812561, + "learning_rate": 2.483230241529522e-08, + "loss": 0.0749, + "step": 8973 + }, + { + "epoch": 2.9079714841218407, + "grad_norm": 0.48242926597595215, + "learning_rate": 2.4658489491207193e-08, + "loss": 0.0701, + "step": 8974 + }, + { + "epoch": 2.908295528191834, + "grad_norm": 0.49187803268432617, + "learning_rate": 2.4485285500753797e-08, + "loss": 0.0698, + "step": 8975 + }, + { + "epoch": 2.9086195722618275, + "grad_norm": 0.4813985228538513, + "learning_rate": 2.4312690465135846e-08, + "loss": 0.0717, + "step": 8976 + }, + { + "epoch": 2.908943616331821, + "grad_norm": 0.4330715537071228, + "learning_rate": 2.4140704405475336e-08, + "loss": 0.0595, + "step": 8977 + }, + { + "epoch": 2.909267660401815, + "grad_norm": 0.5288068056106567, + "learning_rate": 2.39693273428232e-08, + "loss": 0.0798, + "step": 8978 + }, + { + "epoch": 2.9095917044718083, + "grad_norm": 0.5089949369430542, + "learning_rate": 2.3798559298153224e-08, + "loss": 0.0753, + "step": 8979 + }, + { + "epoch": 2.9099157485418017, + "grad_norm": 0.46731987595558167, + "learning_rate": 2.362840029236646e-08, + "loss": 0.0715, + "step": 8980 + }, + { + "epoch": 2.910239792611795, + "grad_norm": 0.4632418751716614, + "learning_rate": 2.345885034628792e-08, + "loss": 0.0685, + "step": 8981 + }, + { + "epoch": 2.9105638366817885, + "grad_norm": 0.4734707474708557, + "learning_rate": 2.3289909480669892e-08, + "loss": 0.0641, + "step": 8982 + }, + { + "epoch": 2.9108878807517824, + "grad_norm": 0.4756975471973419, + "learning_rate": 2.3121577716189168e-08, + "loss": 0.0696, + "step": 8983 + }, + { + "epoch": 2.911211924821776, + "grad_norm": 0.47261470556259155, + "learning_rate": 2.2953855073446497e-08, + "loss": 0.0648, + "step": 8984 + }, + { + "epoch": 2.9115359688917692, + "grad_norm": 0.5152508616447449, + "learning_rate": 2.2786741572971004e-08, + "loss": 0.0744, + "step": 8985 + }, + { + "epoch": 2.9118600129617627, + "grad_norm": 0.5035756826400757, + "learning_rate": 2.2620237235215226e-08, + "loss": 0.0723, + "step": 8986 + }, + { + "epoch": 2.912184057031756, + "grad_norm": 0.48373931646347046, + "learning_rate": 2.24543420805573e-08, + "loss": 0.0669, + "step": 8987 + }, + { + "epoch": 2.91250810110175, + "grad_norm": 0.49522700905799866, + "learning_rate": 2.2289056129301545e-08, + "loss": 0.0699, + "step": 8988 + }, + { + "epoch": 2.9128321451717434, + "grad_norm": 0.4791494607925415, + "learning_rate": 2.2124379401677888e-08, + "loss": 0.0704, + "step": 8989 + }, + { + "epoch": 2.913156189241737, + "grad_norm": 0.4697430431842804, + "learning_rate": 2.1960311917840206e-08, + "loss": 0.0697, + "step": 8990 + }, + { + "epoch": 2.9134802333117307, + "grad_norm": 0.4840279221534729, + "learning_rate": 2.17968536978691e-08, + "loss": 0.0694, + "step": 8991 + }, + { + "epoch": 2.9138042773817237, + "grad_norm": 0.4704105257987976, + "learning_rate": 2.1634004761770245e-08, + "loss": 0.0733, + "step": 8992 + }, + { + "epoch": 2.9141283214517175, + "grad_norm": 0.49104562401771545, + "learning_rate": 2.1471765129475464e-08, + "loss": 0.0693, + "step": 8993 + }, + { + "epoch": 2.914452365521711, + "grad_norm": 0.5004822611808777, + "learning_rate": 2.131013482083999e-08, + "loss": 0.0682, + "step": 8994 + }, + { + "epoch": 2.9147764095917044, + "grad_norm": 0.5043914318084717, + "learning_rate": 2.114911385564744e-08, + "loss": 0.0714, + "step": 8995 + }, + { + "epoch": 2.9151004536616982, + "grad_norm": 0.5104712247848511, + "learning_rate": 2.098870225360372e-08, + "loss": 0.0728, + "step": 8996 + }, + { + "epoch": 2.9154244977316917, + "grad_norm": 0.4805237650871277, + "learning_rate": 2.0828900034342013e-08, + "loss": 0.0694, + "step": 8997 + }, + { + "epoch": 2.915748541801685, + "grad_norm": 0.48914143443107605, + "learning_rate": 2.0669707217421676e-08, + "loss": 0.0724, + "step": 8998 + }, + { + "epoch": 2.9160725858716785, + "grad_norm": 0.5172355771064758, + "learning_rate": 2.0511123822324897e-08, + "loss": 0.0739, + "step": 8999 + }, + { + "epoch": 2.916396629941672, + "grad_norm": 0.4897490441799164, + "learning_rate": 2.0353149868461708e-08, + "loss": 0.0692, + "step": 9000 + }, + { + "epoch": 2.916720674011666, + "grad_norm": 0.44778183102607727, + "learning_rate": 2.0195785375166088e-08, + "loss": 0.0662, + "step": 9001 + }, + { + "epoch": 2.917044718081659, + "grad_norm": 0.48703789710998535, + "learning_rate": 2.0039030361698187e-08, + "loss": 0.0685, + "step": 9002 + }, + { + "epoch": 2.9173687621516526, + "grad_norm": 0.45660385489463806, + "learning_rate": 1.9882884847243213e-08, + "loss": 0.0636, + "step": 9003 + }, + { + "epoch": 2.917692806221646, + "grad_norm": 0.4920264184474945, + "learning_rate": 1.9727348850911432e-08, + "loss": 0.0702, + "step": 9004 + }, + { + "epoch": 2.9180168502916395, + "grad_norm": 0.4781172573566437, + "learning_rate": 1.957242239173984e-08, + "loss": 0.0677, + "step": 9005 + }, + { + "epoch": 2.9183408943616334, + "grad_norm": 0.5161155462265015, + "learning_rate": 1.9418105488689388e-08, + "loss": 0.0761, + "step": 9006 + }, + { + "epoch": 2.9186649384316268, + "grad_norm": 0.45751678943634033, + "learning_rate": 1.926439816064718e-08, + "loss": 0.0665, + "step": 9007 + }, + { + "epoch": 2.91898898250162, + "grad_norm": 0.4571021497249603, + "learning_rate": 1.911130042642484e-08, + "loss": 0.0667, + "step": 9008 + }, + { + "epoch": 2.9193130265716136, + "grad_norm": 0.4877769649028778, + "learning_rate": 1.8958812304761264e-08, + "loss": 0.0676, + "step": 9009 + }, + { + "epoch": 2.919637070641607, + "grad_norm": 0.5144590139389038, + "learning_rate": 1.880693381431875e-08, + "loss": 0.0726, + "step": 9010 + }, + { + "epoch": 2.919961114711601, + "grad_norm": 0.5145933032035828, + "learning_rate": 1.8655664973685205e-08, + "loss": 0.0713, + "step": 9011 + }, + { + "epoch": 2.9202851587815943, + "grad_norm": 0.5353699326515198, + "learning_rate": 1.850500580137582e-08, + "loss": 0.074, + "step": 9012 + }, + { + "epoch": 2.9206092028515878, + "grad_norm": 0.4878300130367279, + "learning_rate": 1.835495631582862e-08, + "loss": 0.0665, + "step": 9013 + }, + { + "epoch": 2.920933246921581, + "grad_norm": 0.4564133584499359, + "learning_rate": 1.8205516535409472e-08, + "loss": 0.0691, + "step": 9014 + }, + { + "epoch": 2.9212572909915746, + "grad_norm": 0.5551131963729858, + "learning_rate": 1.805668647840708e-08, + "loss": 0.0759, + "step": 9015 + }, + { + "epoch": 2.9215813350615685, + "grad_norm": 0.47328266501426697, + "learning_rate": 1.7908466163036875e-08, + "loss": 0.0693, + "step": 9016 + }, + { + "epoch": 2.921905379131562, + "grad_norm": 0.46604713797569275, + "learning_rate": 1.7760855607440453e-08, + "loss": 0.0686, + "step": 9017 + }, + { + "epoch": 2.9222294232015553, + "grad_norm": 0.4848031997680664, + "learning_rate": 1.7613854829683917e-08, + "loss": 0.066, + "step": 9018 + }, + { + "epoch": 2.922553467271549, + "grad_norm": 0.4901368319988251, + "learning_rate": 1.746746384775788e-08, + "loss": 0.0714, + "step": 9019 + }, + { + "epoch": 2.9228775113415426, + "grad_norm": 0.5199681520462036, + "learning_rate": 1.7321682679579122e-08, + "loss": 0.0753, + "step": 9020 + }, + { + "epoch": 2.923201555411536, + "grad_norm": 0.4540145993232727, + "learning_rate": 1.717651134299114e-08, + "loss": 0.0639, + "step": 9021 + }, + { + "epoch": 2.9235255994815295, + "grad_norm": 0.4961194097995758, + "learning_rate": 1.703194985576029e-08, + "loss": 0.0703, + "step": 9022 + }, + { + "epoch": 2.923849643551523, + "grad_norm": 0.4682721793651581, + "learning_rate": 1.6887998235580183e-08, + "loss": 0.0669, + "step": 9023 + }, + { + "epoch": 2.9241736876215167, + "grad_norm": 0.49916213750839233, + "learning_rate": 1.67446565000684e-08, + "loss": 0.0723, + "step": 9024 + }, + { + "epoch": 2.92449773169151, + "grad_norm": 0.4951789081096649, + "learning_rate": 1.66019246667698e-08, + "loss": 0.0728, + "step": 9025 + }, + { + "epoch": 2.9248217757615036, + "grad_norm": 0.48040568828582764, + "learning_rate": 1.645980275315151e-08, + "loss": 0.0677, + "step": 9026 + }, + { + "epoch": 2.925145819831497, + "grad_norm": 0.4653061330318451, + "learning_rate": 1.631829077661018e-08, + "loss": 0.0656, + "step": 9027 + }, + { + "epoch": 2.9254698639014904, + "grad_norm": 0.477243036031723, + "learning_rate": 1.6177388754463063e-08, + "loss": 0.0697, + "step": 9028 + }, + { + "epoch": 2.9257939079714843, + "grad_norm": 0.4760611951351166, + "learning_rate": 1.6037096703957476e-08, + "loss": 0.0726, + "step": 9029 + }, + { + "epoch": 2.9261179520414777, + "grad_norm": 0.5183154940605164, + "learning_rate": 1.589741464226191e-08, + "loss": 0.0716, + "step": 9030 + }, + { + "epoch": 2.926441996111471, + "grad_norm": 0.4699106812477112, + "learning_rate": 1.5758342586473242e-08, + "loss": 0.0643, + "step": 9031 + }, + { + "epoch": 2.9267660401814646, + "grad_norm": 0.47856253385543823, + "learning_rate": 1.56198805536123e-08, + "loss": 0.0713, + "step": 9032 + }, + { + "epoch": 2.927090084251458, + "grad_norm": 0.4797256886959076, + "learning_rate": 1.548202856062553e-08, + "loss": 0.0679, + "step": 9033 + }, + { + "epoch": 2.927414128321452, + "grad_norm": 0.46025872230529785, + "learning_rate": 1.5344786624384435e-08, + "loss": 0.0696, + "step": 9034 + }, + { + "epoch": 2.9277381723914453, + "grad_norm": 0.47484642267227173, + "learning_rate": 1.5208154761686135e-08, + "loss": 0.0652, + "step": 9035 + }, + { + "epoch": 2.9280622164614387, + "grad_norm": 0.4732259213924408, + "learning_rate": 1.5072132989253362e-08, + "loss": 0.0673, + "step": 9036 + }, + { + "epoch": 2.928386260531432, + "grad_norm": 0.5131701231002808, + "learning_rate": 1.4936721323733915e-08, + "loss": 0.0725, + "step": 9037 + }, + { + "epoch": 2.9287103046014256, + "grad_norm": 0.4793781638145447, + "learning_rate": 1.4801919781700091e-08, + "loss": 0.0702, + "step": 9038 + }, + { + "epoch": 2.9290343486714194, + "grad_norm": 0.5629575252532959, + "learning_rate": 1.466772837965147e-08, + "loss": 0.0783, + "step": 9039 + }, + { + "epoch": 2.929358392741413, + "grad_norm": 0.4751351773738861, + "learning_rate": 1.4534147134010467e-08, + "loss": 0.0694, + "step": 9040 + }, + { + "epoch": 2.9296824368114063, + "grad_norm": 0.48215755820274353, + "learning_rate": 1.4401176061127343e-08, + "loss": 0.0701, + "step": 9041 + }, + { + "epoch": 2.9300064808814, + "grad_norm": 0.5063463449478149, + "learning_rate": 1.4268815177275741e-08, + "loss": 0.0713, + "step": 9042 + }, + { + "epoch": 2.930330524951393, + "grad_norm": 0.4733906686306, + "learning_rate": 1.4137064498655484e-08, + "loss": 0.0708, + "step": 9043 + }, + { + "epoch": 2.930654569021387, + "grad_norm": 0.48800525069236755, + "learning_rate": 1.400592404139145e-08, + "loss": 0.0733, + "step": 9044 + }, + { + "epoch": 2.9309786130913804, + "grad_norm": 0.5243503451347351, + "learning_rate": 1.3875393821534133e-08, + "loss": 0.0762, + "step": 9045 + }, + { + "epoch": 2.931302657161374, + "grad_norm": 0.5137089490890503, + "learning_rate": 1.3745473855059643e-08, + "loss": 0.0728, + "step": 9046 + }, + { + "epoch": 2.9316267012313677, + "grad_norm": 0.4924151301383972, + "learning_rate": 1.3616164157868039e-08, + "loss": 0.0704, + "step": 9047 + }, + { + "epoch": 2.931950745301361, + "grad_norm": 0.4642469882965088, + "learning_rate": 1.3487464745786106e-08, + "loss": 0.0652, + "step": 9048 + }, + { + "epoch": 2.9322747893713546, + "grad_norm": 0.44174107909202576, + "learning_rate": 1.3359375634565685e-08, + "loss": 0.0619, + "step": 9049 + }, + { + "epoch": 2.932598833441348, + "grad_norm": 0.46095919609069824, + "learning_rate": 1.323189683988313e-08, + "loss": 0.0683, + "step": 9050 + }, + { + "epoch": 2.9329228775113414, + "grad_norm": 0.448514461517334, + "learning_rate": 1.3105028377340401e-08, + "loss": 0.0646, + "step": 9051 + }, + { + "epoch": 2.9332469215813353, + "grad_norm": 0.5289997458457947, + "learning_rate": 1.2978770262465634e-08, + "loss": 0.0824, + "step": 9052 + }, + { + "epoch": 2.9335709656513287, + "grad_norm": 0.49298954010009766, + "learning_rate": 1.2853122510710914e-08, + "loss": 0.0714, + "step": 9053 + }, + { + "epoch": 2.933895009721322, + "grad_norm": 0.4439584016799927, + "learning_rate": 1.2728085137455048e-08, + "loss": 0.061, + "step": 9054 + }, + { + "epoch": 2.9342190537913155, + "grad_norm": 0.5283500552177429, + "learning_rate": 1.2603658158000798e-08, + "loss": 0.0794, + "step": 9055 + }, + { + "epoch": 2.934543097861309, + "grad_norm": 0.48534640669822693, + "learning_rate": 1.2479841587577091e-08, + "loss": 0.0689, + "step": 9056 + }, + { + "epoch": 2.934867141931303, + "grad_norm": 0.5000969767570496, + "learning_rate": 1.2356635441337917e-08, + "loss": 0.0745, + "step": 9057 + }, + { + "epoch": 2.9351911860012962, + "grad_norm": 0.47974875569343567, + "learning_rate": 1.2234039734362323e-08, + "loss": 0.0691, + "step": 9058 + }, + { + "epoch": 2.9355152300712897, + "grad_norm": 0.5280157923698425, + "learning_rate": 1.2112054481654977e-08, + "loss": 0.0778, + "step": 9059 + }, + { + "epoch": 2.935839274141283, + "grad_norm": 0.49620354175567627, + "learning_rate": 1.1990679698146158e-08, + "loss": 0.0739, + "step": 9060 + }, + { + "epoch": 2.9361633182112765, + "grad_norm": 0.5137200951576233, + "learning_rate": 1.1869915398689535e-08, + "loss": 0.075, + "step": 9061 + }, + { + "epoch": 2.9364873622812704, + "grad_norm": 0.5006195902824402, + "learning_rate": 1.1749761598067178e-08, + "loss": 0.0723, + "step": 9062 + }, + { + "epoch": 2.936811406351264, + "grad_norm": 0.5092834830284119, + "learning_rate": 1.1630218310983432e-08, + "loss": 0.0712, + "step": 9063 + }, + { + "epoch": 2.9371354504212572, + "grad_norm": 0.5249757170677185, + "learning_rate": 1.1511285552070483e-08, + "loss": 0.0754, + "step": 9064 + }, + { + "epoch": 2.9374594944912507, + "grad_norm": 0.47460222244262695, + "learning_rate": 1.1392963335883356e-08, + "loss": 0.0695, + "step": 9065 + }, + { + "epoch": 2.937783538561244, + "grad_norm": 0.4928366243839264, + "learning_rate": 1.1275251676904352e-08, + "loss": 0.0769, + "step": 9066 + }, + { + "epoch": 2.938107582631238, + "grad_norm": 0.5491718649864197, + "learning_rate": 1.1158150589539729e-08, + "loss": 0.0686, + "step": 9067 + }, + { + "epoch": 2.9384316267012314, + "grad_norm": 0.4939388334751129, + "learning_rate": 1.1041660088121354e-08, + "loss": 0.0699, + "step": 9068 + }, + { + "epoch": 2.938755670771225, + "grad_norm": 0.4884641468524933, + "learning_rate": 1.092578018690782e-08, + "loss": 0.0684, + "step": 9069 + }, + { + "epoch": 2.9390797148412187, + "grad_norm": 0.47037163376808167, + "learning_rate": 1.0810510900080006e-08, + "loss": 0.0681, + "step": 9070 + }, + { + "epoch": 2.939403758911212, + "grad_norm": 0.5198776721954346, + "learning_rate": 1.0695852241747185e-08, + "loss": 0.0753, + "step": 9071 + }, + { + "epoch": 2.9397278029812055, + "grad_norm": 0.5155799388885498, + "learning_rate": 1.0581804225941462e-08, + "loss": 0.0741, + "step": 9072 + }, + { + "epoch": 2.940051847051199, + "grad_norm": 0.49065136909484863, + "learning_rate": 1.046836686662167e-08, + "loss": 0.0716, + "step": 9073 + }, + { + "epoch": 2.9403758911211924, + "grad_norm": 0.5279356837272644, + "learning_rate": 1.0355540177671708e-08, + "loss": 0.0713, + "step": 9074 + }, + { + "epoch": 2.940699935191186, + "grad_norm": 0.5159909725189209, + "learning_rate": 1.0243324172899416e-08, + "loss": 0.0753, + "step": 9075 + }, + { + "epoch": 2.9410239792611796, + "grad_norm": 0.48105043172836304, + "learning_rate": 1.0131718866039919e-08, + "loss": 0.071, + "step": 9076 + }, + { + "epoch": 2.941348023331173, + "grad_norm": 0.48522862792015076, + "learning_rate": 1.0020724270752846e-08, + "loss": 0.0686, + "step": 9077 + }, + { + "epoch": 2.9416720674011665, + "grad_norm": 0.5026848912239075, + "learning_rate": 9.910340400621777e-09, + "loss": 0.0736, + "step": 9078 + }, + { + "epoch": 2.94199611147116, + "grad_norm": 0.48883044719696045, + "learning_rate": 9.800567269157569e-09, + "loss": 0.0701, + "step": 9079 + }, + { + "epoch": 2.942320155541154, + "grad_norm": 0.4957825243473053, + "learning_rate": 9.69140488979503e-09, + "loss": 0.0715, + "step": 9080 + }, + { + "epoch": 2.942644199611147, + "grad_norm": 0.48720213770866394, + "learning_rate": 9.582853275894587e-09, + "loss": 0.0683, + "step": 9081 + }, + { + "epoch": 2.9429682436811406, + "grad_norm": 0.5106403231620789, + "learning_rate": 9.474912440741723e-09, + "loss": 0.0794, + "step": 9082 + }, + { + "epoch": 2.943292287751134, + "grad_norm": 0.48066288232803345, + "learning_rate": 9.367582397547536e-09, + "loss": 0.0683, + "step": 9083 + }, + { + "epoch": 2.9436163318211275, + "grad_norm": 0.45802873373031616, + "learning_rate": 9.260863159448741e-09, + "loss": 0.066, + "step": 9084 + }, + { + "epoch": 2.9439403758911213, + "grad_norm": 0.4963512718677521, + "learning_rate": 9.154754739505444e-09, + "loss": 0.0733, + "step": 9085 + }, + { + "epoch": 2.9442644199611148, + "grad_norm": 0.5030404925346375, + "learning_rate": 9.049257150705592e-09, + "loss": 0.0675, + "step": 9086 + }, + { + "epoch": 2.944588464031108, + "grad_norm": 0.5117946267127991, + "learning_rate": 8.944370405960522e-09, + "loss": 0.0763, + "step": 9087 + }, + { + "epoch": 2.9449125081011016, + "grad_norm": 0.4487900137901306, + "learning_rate": 8.84009451810719e-09, + "loss": 0.0664, + "step": 9088 + }, + { + "epoch": 2.945236552171095, + "grad_norm": 0.5242525339126587, + "learning_rate": 8.73642949990816e-09, + "loss": 0.0773, + "step": 9089 + }, + { + "epoch": 2.945560596241089, + "grad_norm": 0.48610663414001465, + "learning_rate": 8.633375364050511e-09, + "loss": 0.0688, + "step": 9090 + }, + { + "epoch": 2.9458846403110823, + "grad_norm": 0.4557715356349945, + "learning_rate": 8.53093212314804e-09, + "loss": 0.0655, + "step": 9091 + }, + { + "epoch": 2.9462086843810757, + "grad_norm": 0.440714955329895, + "learning_rate": 8.429099789738493e-09, + "loss": 0.0648, + "step": 9092 + }, + { + "epoch": 2.9465327284510696, + "grad_norm": 0.47908642888069153, + "learning_rate": 8.327878376284682e-09, + "loss": 0.0658, + "step": 9093 + }, + { + "epoch": 2.9468567725210626, + "grad_norm": 0.4888572692871094, + "learning_rate": 8.227267895175584e-09, + "loss": 0.0713, + "step": 9094 + }, + { + "epoch": 2.9471808165910565, + "grad_norm": 0.42246827483177185, + "learning_rate": 8.127268358724682e-09, + "loss": 0.0571, + "step": 9095 + }, + { + "epoch": 2.94750486066105, + "grad_norm": 0.49386322498321533, + "learning_rate": 8.027879779171077e-09, + "loss": 0.0744, + "step": 9096 + }, + { + "epoch": 2.9478289047310433, + "grad_norm": 0.517785906791687, + "learning_rate": 7.929102168678926e-09, + "loss": 0.0779, + "step": 9097 + }, + { + "epoch": 2.948152948801037, + "grad_norm": 0.4539172053337097, + "learning_rate": 7.830935539337448e-09, + "loss": 0.0654, + "step": 9098 + }, + { + "epoch": 2.9484769928710306, + "grad_norm": 0.47268781065940857, + "learning_rate": 7.733379903161475e-09, + "loss": 0.0671, + "step": 9099 + }, + { + "epoch": 2.948801036941024, + "grad_norm": 0.4961428642272949, + "learning_rate": 7.636435272091458e-09, + "loss": 0.0675, + "step": 9100 + }, + { + "epoch": 2.9491250810110174, + "grad_norm": 0.48894017934799194, + "learning_rate": 7.540101657991794e-09, + "loss": 0.0692, + "step": 9101 + }, + { + "epoch": 2.949449125081011, + "grad_norm": 0.4910293221473694, + "learning_rate": 7.444379072652497e-09, + "loss": 0.071, + "step": 9102 + }, + { + "epoch": 2.9497731691510047, + "grad_norm": 0.49810612201690674, + "learning_rate": 7.34926752778975e-09, + "loss": 0.0733, + "step": 9103 + }, + { + "epoch": 2.950097213220998, + "grad_norm": 0.4806065261363983, + "learning_rate": 7.254767035044241e-09, + "loss": 0.0667, + "step": 9104 + }, + { + "epoch": 2.9504212572909916, + "grad_norm": 0.5162319540977478, + "learning_rate": 7.16087760598172e-09, + "loss": 0.0771, + "step": 9105 + }, + { + "epoch": 2.950745301360985, + "grad_norm": 0.4736173450946808, + "learning_rate": 7.067599252092994e-09, + "loss": 0.0724, + "step": 9106 + }, + { + "epoch": 2.9510693454309784, + "grad_norm": 0.489728182554245, + "learning_rate": 6.974931984795042e-09, + "loss": 0.0749, + "step": 9107 + }, + { + "epoch": 2.9513933895009723, + "grad_norm": 0.49041807651519775, + "learning_rate": 6.882875815429347e-09, + "loss": 0.0686, + "step": 9108 + }, + { + "epoch": 2.9517174335709657, + "grad_norm": 0.4952937066555023, + "learning_rate": 6.791430755262451e-09, + "loss": 0.0716, + "step": 9109 + }, + { + "epoch": 2.952041477640959, + "grad_norm": 0.48909682035446167, + "learning_rate": 6.7005968154859605e-09, + "loss": 0.0706, + "step": 9110 + }, + { + "epoch": 2.9523655217109526, + "grad_norm": 0.5242758989334106, + "learning_rate": 6.610374007218201e-09, + "loss": 0.0752, + "step": 9111 + }, + { + "epoch": 2.952689565780946, + "grad_norm": 0.49710774421691895, + "learning_rate": 6.520762341500342e-09, + "loss": 0.0685, + "step": 9112 + }, + { + "epoch": 2.95301360985094, + "grad_norm": 0.46828755736351013, + "learning_rate": 6.431761829301386e-09, + "loss": 0.0648, + "step": 9113 + }, + { + "epoch": 2.9533376539209333, + "grad_norm": 0.5269603729248047, + "learning_rate": 6.343372481512066e-09, + "loss": 0.0716, + "step": 9114 + }, + { + "epoch": 2.9536616979909267, + "grad_norm": 0.5137341618537903, + "learning_rate": 6.2555943089526176e-09, + "loss": 0.0709, + "step": 9115 + }, + { + "epoch": 2.95398574206092, + "grad_norm": 0.48662707209587097, + "learning_rate": 6.168427322365001e-09, + "loss": 0.07, + "step": 9116 + }, + { + "epoch": 2.9543097861309136, + "grad_norm": 0.49718114733695984, + "learning_rate": 6.0818715324173495e-09, + "loss": 0.0746, + "step": 9117 + }, + { + "epoch": 2.9546338302009074, + "grad_norm": 0.46967118978500366, + "learning_rate": 5.995926949704522e-09, + "loss": 0.064, + "step": 9118 + }, + { + "epoch": 2.954957874270901, + "grad_norm": 0.515914797782898, + "learning_rate": 5.910593584744217e-09, + "loss": 0.0693, + "step": 9119 + }, + { + "epoch": 2.9552819183408943, + "grad_norm": 0.4651590585708618, + "learning_rate": 5.825871447980303e-09, + "loss": 0.0644, + "step": 9120 + }, + { + "epoch": 2.955605962410888, + "grad_norm": 0.4563072621822357, + "learning_rate": 5.7417605497828155e-09, + "loss": 0.0655, + "step": 9121 + }, + { + "epoch": 2.9559300064808816, + "grad_norm": 0.5311094522476196, + "learning_rate": 5.658260900445744e-09, + "loss": 0.0721, + "step": 9122 + }, + { + "epoch": 2.956254050550875, + "grad_norm": 0.5263093709945679, + "learning_rate": 5.575372510188137e-09, + "loss": 0.0768, + "step": 9123 + }, + { + "epoch": 2.9565780946208684, + "grad_norm": 0.4999168813228607, + "learning_rate": 5.493095389155767e-09, + "loss": 0.0699, + "step": 9124 + }, + { + "epoch": 2.956902138690862, + "grad_norm": 0.4963438808917999, + "learning_rate": 5.411429547417246e-09, + "loss": 0.0731, + "step": 9125 + }, + { + "epoch": 2.9572261827608557, + "grad_norm": 0.46697449684143066, + "learning_rate": 5.330374994969023e-09, + "loss": 0.0664, + "step": 9126 + }, + { + "epoch": 2.957550226830849, + "grad_norm": 0.49287012219429016, + "learning_rate": 5.2499317417303855e-09, + "loss": 0.0714, + "step": 9127 + }, + { + "epoch": 2.9578742709008425, + "grad_norm": 0.49234694242477417, + "learning_rate": 5.1700997975467904e-09, + "loss": 0.0727, + "step": 9128 + }, + { + "epoch": 2.958198314970836, + "grad_norm": 0.5060413479804993, + "learning_rate": 5.090879172189866e-09, + "loss": 0.0747, + "step": 9129 + }, + { + "epoch": 2.9585223590408294, + "grad_norm": 0.49898648262023926, + "learning_rate": 5.012269875354636e-09, + "loss": 0.0695, + "step": 9130 + }, + { + "epoch": 2.9588464031108233, + "grad_norm": 0.47904735803604126, + "learning_rate": 4.934271916662847e-09, + "loss": 0.069, + "step": 9131 + }, + { + "epoch": 2.9591704471808167, + "grad_norm": 0.508891224861145, + "learning_rate": 4.8568853056596425e-09, + "loss": 0.0721, + "step": 9132 + }, + { + "epoch": 2.95949449125081, + "grad_norm": 0.47566792368888855, + "learning_rate": 4.780110051816889e-09, + "loss": 0.0692, + "step": 9133 + }, + { + "epoch": 2.9598185353208035, + "grad_norm": 0.48462051153182983, + "learning_rate": 4.703946164531514e-09, + "loss": 0.0685, + "step": 9134 + }, + { + "epoch": 2.960142579390797, + "grad_norm": 0.511398196220398, + "learning_rate": 4.628393653124952e-09, + "loss": 0.0727, + "step": 9135 + }, + { + "epoch": 2.960466623460791, + "grad_norm": 0.4798050820827484, + "learning_rate": 4.553452526843693e-09, + "loss": 0.0699, + "step": 9136 + }, + { + "epoch": 2.9607906675307842, + "grad_norm": 0.4587969183921814, + "learning_rate": 4.479122794860402e-09, + "loss": 0.0641, + "step": 9137 + }, + { + "epoch": 2.9611147116007777, + "grad_norm": 0.47840559482574463, + "learning_rate": 4.405404466272245e-09, + "loss": 0.0677, + "step": 9138 + }, + { + "epoch": 2.961438755670771, + "grad_norm": 0.4877968430519104, + "learning_rate": 4.332297550100895e-09, + "loss": 0.0721, + "step": 9139 + }, + { + "epoch": 2.9617627997407645, + "grad_norm": 0.514536440372467, + "learning_rate": 4.259802055295304e-09, + "loss": 0.0727, + "step": 9140 + }, + { + "epoch": 2.9620868438107584, + "grad_norm": 0.4699980318546295, + "learning_rate": 4.1879179907267085e-09, + "loss": 0.0661, + "step": 9141 + }, + { + "epoch": 2.962410887880752, + "grad_norm": 0.5246437191963196, + "learning_rate": 4.116645365194183e-09, + "loss": 0.075, + "step": 9142 + }, + { + "epoch": 2.962734931950745, + "grad_norm": 0.5276486873626709, + "learning_rate": 4.045984187420194e-09, + "loss": 0.075, + "step": 9143 + }, + { + "epoch": 2.963058976020739, + "grad_norm": 0.5244945883750916, + "learning_rate": 3.975934466053377e-09, + "loss": 0.0703, + "step": 9144 + }, + { + "epoch": 2.963383020090732, + "grad_norm": 0.5038921236991882, + "learning_rate": 3.9064962096668766e-09, + "loss": 0.0722, + "step": 9145 + }, + { + "epoch": 2.963707064160726, + "grad_norm": 0.4944347143173218, + "learning_rate": 3.837669426758894e-09, + "loss": 0.0686, + "step": 9146 + }, + { + "epoch": 2.9640311082307194, + "grad_norm": 0.467185378074646, + "learning_rate": 3.769454125753802e-09, + "loss": 0.073, + "step": 9147 + }, + { + "epoch": 2.964355152300713, + "grad_norm": 0.5159200429916382, + "learning_rate": 3.701850315000477e-09, + "loss": 0.0746, + "step": 9148 + }, + { + "epoch": 2.9646791963707066, + "grad_norm": 0.4407907724380493, + "learning_rate": 3.6348580027728564e-09, + "loss": 0.0651, + "step": 9149 + }, + { + "epoch": 2.9650032404407, + "grad_norm": 0.47313445806503296, + "learning_rate": 3.568477197269382e-09, + "loss": 0.0692, + "step": 9150 + }, + { + "epoch": 2.9653272845106935, + "grad_norm": 0.5187781453132629, + "learning_rate": 3.5027079066157764e-09, + "loss": 0.08, + "step": 9151 + }, + { + "epoch": 2.965651328580687, + "grad_norm": 0.4831237494945526, + "learning_rate": 3.4375501388606015e-09, + "loss": 0.0681, + "step": 9152 + }, + { + "epoch": 2.9659753726506803, + "grad_norm": 0.47124168276786804, + "learning_rate": 3.373003901979144e-09, + "loss": 0.0686, + "step": 9153 + }, + { + "epoch": 2.966299416720674, + "grad_norm": 0.5350909233093262, + "learning_rate": 3.3090692038700855e-09, + "loss": 0.0735, + "step": 9154 + }, + { + "epoch": 2.9666234607906676, + "grad_norm": 0.5062280893325806, + "learning_rate": 3.2457460523599437e-09, + "loss": 0.0741, + "step": 9155 + }, + { + "epoch": 2.966947504860661, + "grad_norm": 0.4573480784893036, + "learning_rate": 3.183034455198075e-09, + "loss": 0.0628, + "step": 9156 + }, + { + "epoch": 2.9672715489306545, + "grad_norm": 0.5378585457801819, + "learning_rate": 3.1209344200594517e-09, + "loss": 0.0761, + "step": 9157 + }, + { + "epoch": 2.967595593000648, + "grad_norm": 0.47644490003585815, + "learning_rate": 3.059445954545215e-09, + "loss": 0.0706, + "step": 9158 + }, + { + "epoch": 2.9679196370706418, + "grad_norm": 0.5022232532501221, + "learning_rate": 2.998569066181012e-09, + "loss": 0.071, + "step": 9159 + }, + { + "epoch": 2.968243681140635, + "grad_norm": 0.46460914611816406, + "learning_rate": 2.938303762416994e-09, + "loss": 0.0663, + "step": 9160 + }, + { + "epoch": 2.9685677252106286, + "grad_norm": 0.4863564670085907, + "learning_rate": 2.8786500506289284e-09, + "loss": 0.0682, + "step": 9161 + }, + { + "epoch": 2.968891769280622, + "grad_norm": 0.45481154322624207, + "learning_rate": 2.8196079381187513e-09, + "loss": 0.0656, + "step": 9162 + }, + { + "epoch": 2.9692158133506155, + "grad_norm": 0.4881567656993866, + "learning_rate": 2.7611774321117947e-09, + "loss": 0.0697, + "step": 9163 + }, + { + "epoch": 2.9695398574206093, + "grad_norm": 0.49049463868141174, + "learning_rate": 2.7033585397595595e-09, + "loss": 0.0669, + "step": 9164 + }, + { + "epoch": 2.9698639014906028, + "grad_norm": 0.4622054398059845, + "learning_rate": 2.646151268138608e-09, + "loss": 0.069, + "step": 9165 + }, + { + "epoch": 2.970187945560596, + "grad_norm": 0.5025736093521118, + "learning_rate": 2.5895556242511167e-09, + "loss": 0.0726, + "step": 9166 + }, + { + "epoch": 2.9705119896305896, + "grad_norm": 0.4582308232784271, + "learning_rate": 2.5335716150226563e-09, + "loss": 0.0642, + "step": 9167 + }, + { + "epoch": 2.970836033700583, + "grad_norm": 0.46986815333366394, + "learning_rate": 2.478199247306634e-09, + "loss": 0.0704, + "step": 9168 + }, + { + "epoch": 2.971160077770577, + "grad_norm": 0.4792775809764862, + "learning_rate": 2.4234385278787407e-09, + "loss": 0.0658, + "step": 9169 + }, + { + "epoch": 2.9714841218405703, + "grad_norm": 0.4750364124774933, + "learning_rate": 2.3692894634413934e-09, + "loss": 0.0663, + "step": 9170 + }, + { + "epoch": 2.9718081659105637, + "grad_norm": 0.5023250579833984, + "learning_rate": 2.3157520606226226e-09, + "loss": 0.0707, + "step": 9171 + }, + { + "epoch": 2.9721322099805576, + "grad_norm": 0.4666900634765625, + "learning_rate": 2.26282632597441e-09, + "loss": 0.0668, + "step": 9172 + }, + { + "epoch": 2.972456254050551, + "grad_norm": 0.5010154247283936, + "learning_rate": 2.2105122659743515e-09, + "loss": 0.0713, + "step": 9173 + }, + { + "epoch": 2.9727802981205445, + "grad_norm": 0.47971847653388977, + "learning_rate": 2.158809887025659e-09, + "loss": 0.0685, + "step": 9174 + }, + { + "epoch": 2.973104342190538, + "grad_norm": 0.4968419373035431, + "learning_rate": 2.1077191954554933e-09, + "loss": 0.0723, + "step": 9175 + }, + { + "epoch": 2.9734283862605313, + "grad_norm": 0.48627862334251404, + "learning_rate": 2.05724019751663e-09, + "loss": 0.0703, + "step": 9176 + }, + { + "epoch": 2.973752430330525, + "grad_norm": 0.5221262574195862, + "learning_rate": 2.0073728993885712e-09, + "loss": 0.0772, + "step": 9177 + }, + { + "epoch": 2.9740764744005186, + "grad_norm": 0.5231309533119202, + "learning_rate": 1.958117307173102e-09, + "loss": 0.0728, + "step": 9178 + }, + { + "epoch": 2.974400518470512, + "grad_norm": 0.47055307030677795, + "learning_rate": 1.909473426899844e-09, + "loss": 0.064, + "step": 9179 + }, + { + "epoch": 2.9747245625405054, + "grad_norm": 0.44478094577789307, + "learning_rate": 1.8614412645212575e-09, + "loss": 0.0648, + "step": 9180 + }, + { + "epoch": 2.975048606610499, + "grad_norm": 0.43987661600112915, + "learning_rate": 1.8140208259165293e-09, + "loss": 0.0659, + "step": 9181 + }, + { + "epoch": 2.9753726506804927, + "grad_norm": 0.537038266658783, + "learning_rate": 1.7672121168899048e-09, + "loss": 0.0733, + "step": 9182 + }, + { + "epoch": 2.975696694750486, + "grad_norm": 0.46762892603874207, + "learning_rate": 1.72101514316958e-09, + "loss": 0.0666, + "step": 9183 + }, + { + "epoch": 2.9760207388204796, + "grad_norm": 0.5129542946815491, + "learning_rate": 1.6754299104099204e-09, + "loss": 0.0712, + "step": 9184 + }, + { + "epoch": 2.976344782890473, + "grad_norm": 0.4832301735877991, + "learning_rate": 1.630456424190352e-09, + "loss": 0.07, + "step": 9185 + }, + { + "epoch": 2.9766688269604664, + "grad_norm": 0.48235058784484863, + "learning_rate": 1.5860946900148056e-09, + "loss": 0.0749, + "step": 9186 + }, + { + "epoch": 2.9769928710304603, + "grad_norm": 0.4909639358520508, + "learning_rate": 1.5423447133128267e-09, + "loss": 0.0726, + "step": 9187 + }, + { + "epoch": 2.9773169151004537, + "grad_norm": 0.48089101910591125, + "learning_rate": 1.499206499439021e-09, + "loss": 0.0727, + "step": 9188 + }, + { + "epoch": 2.977640959170447, + "grad_norm": 0.44641274213790894, + "learning_rate": 1.4566800536730541e-09, + "loss": 0.0635, + "step": 9189 + }, + { + "epoch": 2.9779650032404406, + "grad_norm": 0.48997771739959717, + "learning_rate": 1.4147653812196515e-09, + "loss": 0.0711, + "step": 9190 + }, + { + "epoch": 2.978289047310434, + "grad_norm": 0.46297505497932434, + "learning_rate": 1.3734624872091539e-09, + "loss": 0.0703, + "step": 9191 + }, + { + "epoch": 2.978613091380428, + "grad_norm": 0.49232834577560425, + "learning_rate": 1.3327713766964068e-09, + "loss": 0.0718, + "step": 9192 + }, + { + "epoch": 2.9789371354504213, + "grad_norm": 0.5025762319564819, + "learning_rate": 1.2926920546613154e-09, + "loss": 0.0788, + "step": 9193 + }, + { + "epoch": 2.9792611795204147, + "grad_norm": 0.46873152256011963, + "learning_rate": 1.2532245260099552e-09, + "loss": 0.0674, + "step": 9194 + }, + { + "epoch": 2.9795852235904086, + "grad_norm": 0.47924181818962097, + "learning_rate": 1.2143687955723516e-09, + "loss": 0.0702, + "step": 9195 + }, + { + "epoch": 2.9799092676604015, + "grad_norm": 0.5147335529327393, + "learning_rate": 1.1761248681035896e-09, + "loss": 0.0725, + "step": 9196 + }, + { + "epoch": 2.9802333117303954, + "grad_norm": 0.5970619916915894, + "learning_rate": 1.138492748284925e-09, + "loss": 0.0666, + "step": 9197 + }, + { + "epoch": 2.980557355800389, + "grad_norm": 0.4947203993797302, + "learning_rate": 1.1014724407215627e-09, + "loss": 0.0714, + "step": 9198 + }, + { + "epoch": 2.9808813998703823, + "grad_norm": 0.48733168840408325, + "learning_rate": 1.065063949945433e-09, + "loss": 0.0715, + "step": 9199 + }, + { + "epoch": 2.981205443940376, + "grad_norm": 0.5004407167434692, + "learning_rate": 1.0292672804118609e-09, + "loss": 0.075, + "step": 9200 + }, + { + "epoch": 2.9815294880103695, + "grad_norm": 0.46872594952583313, + "learning_rate": 9.940824365023417e-10, + "loss": 0.0662, + "step": 9201 + }, + { + "epoch": 2.981853532080363, + "grad_norm": 0.4934064447879791, + "learning_rate": 9.595094225228753e-10, + "loss": 0.0721, + "step": 9202 + }, + { + "epoch": 2.9821775761503564, + "grad_norm": 0.5108233094215393, + "learning_rate": 9.255482427050766e-10, + "loss": 0.0714, + "step": 9203 + }, + { + "epoch": 2.98250162022035, + "grad_norm": 0.48761728405952454, + "learning_rate": 8.92198901205621e-10, + "loss": 0.0679, + "step": 9204 + }, + { + "epoch": 2.9828256642903437, + "grad_norm": 0.4855932891368866, + "learning_rate": 8.594614021051329e-10, + "loss": 0.0725, + "step": 9205 + }, + { + "epoch": 2.983149708360337, + "grad_norm": 0.48095831274986267, + "learning_rate": 8.273357494120726e-10, + "loss": 0.0707, + "step": 9206 + }, + { + "epoch": 2.9834737524303305, + "grad_norm": 0.46818315982818604, + "learning_rate": 7.958219470566297e-10, + "loss": 0.067, + "step": 9207 + }, + { + "epoch": 2.983797796500324, + "grad_norm": 0.4704946279525757, + "learning_rate": 7.649199988968292e-10, + "loss": 0.0665, + "step": 9208 + }, + { + "epoch": 2.9841218405703174, + "grad_norm": 0.5369235873222351, + "learning_rate": 7.346299087146458e-10, + "loss": 0.0787, + "step": 9209 + }, + { + "epoch": 2.9844458846403112, + "grad_norm": 0.510699987411499, + "learning_rate": 7.049516802165591e-10, + "loss": 0.0787, + "step": 9210 + }, + { + "epoch": 2.9847699287103047, + "grad_norm": 0.45229703187942505, + "learning_rate": 6.758853170363289e-10, + "loss": 0.0655, + "step": 9211 + }, + { + "epoch": 2.985093972780298, + "grad_norm": 0.5180689692497253, + "learning_rate": 6.474308227299996e-10, + "loss": 0.07, + "step": 9212 + }, + { + "epoch": 2.9854180168502915, + "grad_norm": 0.4732065796852112, + "learning_rate": 6.195882007803411e-10, + "loss": 0.0659, + "step": 9213 + }, + { + "epoch": 2.985742060920285, + "grad_norm": 0.49241015315055847, + "learning_rate": 5.923574545957378e-10, + "loss": 0.0733, + "step": 9214 + }, + { + "epoch": 2.986066104990279, + "grad_norm": 0.5081177949905396, + "learning_rate": 5.657385875085242e-10, + "loss": 0.0702, + "step": 9215 + }, + { + "epoch": 2.9863901490602722, + "grad_norm": 0.5168772339820862, + "learning_rate": 5.397316027766497e-10, + "loss": 0.0716, + "step": 9216 + }, + { + "epoch": 2.9867141931302656, + "grad_norm": 0.4934365451335907, + "learning_rate": 5.143365035831238e-10, + "loss": 0.0717, + "step": 9217 + }, + { + "epoch": 2.987038237200259, + "grad_norm": 0.4696829915046692, + "learning_rate": 4.895532930360158e-10, + "loss": 0.0642, + "step": 9218 + }, + { + "epoch": 2.9873622812702525, + "grad_norm": 0.47430771589279175, + "learning_rate": 4.653819741684551e-10, + "loss": 0.0627, + "step": 9219 + }, + { + "epoch": 2.9876863253402464, + "grad_norm": 0.45280787348747253, + "learning_rate": 4.4182254993918596e-10, + "loss": 0.0672, + "step": 9220 + }, + { + "epoch": 2.98801036941024, + "grad_norm": 0.4795883595943451, + "learning_rate": 4.1887502323090244e-10, + "loss": 0.0673, + "step": 9221 + }, + { + "epoch": 2.988334413480233, + "grad_norm": 0.49811699986457825, + "learning_rate": 3.96539396853024e-10, + "loss": 0.0723, + "step": 9222 + }, + { + "epoch": 2.988658457550227, + "grad_norm": 0.45733726024627686, + "learning_rate": 3.748156735389197e-10, + "loss": 0.0715, + "step": 9223 + }, + { + "epoch": 2.9889825016202205, + "grad_norm": 0.4766613245010376, + "learning_rate": 3.537038559464634e-10, + "loss": 0.0698, + "step": 9224 + }, + { + "epoch": 2.989306545690214, + "grad_norm": 0.5184661149978638, + "learning_rate": 3.332039466613646e-10, + "loss": 0.0742, + "step": 9225 + }, + { + "epoch": 2.9896305897602073, + "grad_norm": 0.4830702841281891, + "learning_rate": 3.1331594819106194e-10, + "loss": 0.0691, + "step": 9226 + }, + { + "epoch": 2.9899546338302008, + "grad_norm": 0.5173357129096985, + "learning_rate": 2.9403986296971943e-10, + "loss": 0.0688, + "step": 9227 + }, + { + "epoch": 2.9902786779001946, + "grad_norm": 0.48872441053390503, + "learning_rate": 2.7537569335767124e-10, + "loss": 0.0709, + "step": 9228 + }, + { + "epoch": 2.990602721970188, + "grad_norm": 0.466548353433609, + "learning_rate": 2.5732344163809096e-10, + "loss": 0.0702, + "step": 9229 + }, + { + "epoch": 2.9909267660401815, + "grad_norm": 0.48582354187965393, + "learning_rate": 2.398831100214327e-10, + "loss": 0.0695, + "step": 9230 + }, + { + "epoch": 2.991250810110175, + "grad_norm": 0.5102851986885071, + "learning_rate": 2.230547006415451e-10, + "loss": 0.0727, + "step": 9231 + }, + { + "epoch": 2.9915748541801683, + "grad_norm": 0.4786846339702606, + "learning_rate": 2.0683821555789185e-10, + "loss": 0.071, + "step": 9232 + }, + { + "epoch": 2.991898898250162, + "grad_norm": 0.4758155643939972, + "learning_rate": 1.9123365675555172e-10, + "loss": 0.0665, + "step": 9233 + }, + { + "epoch": 2.9922229423201556, + "grad_norm": 0.4775131046772003, + "learning_rate": 1.7624102614410832e-10, + "loss": 0.0699, + "step": 9234 + }, + { + "epoch": 2.992546986390149, + "grad_norm": 0.5171465873718262, + "learning_rate": 1.618603255587603e-10, + "loss": 0.0743, + "step": 9235 + }, + { + "epoch": 2.9928710304601425, + "grad_norm": 0.5166998505592346, + "learning_rate": 1.4809155675976627e-10, + "loss": 0.0732, + "step": 9236 + }, + { + "epoch": 2.993195074530136, + "grad_norm": 0.5328329801559448, + "learning_rate": 1.3493472143188968e-10, + "loss": 0.0712, + "step": 9237 + }, + { + "epoch": 2.9935191186001298, + "grad_norm": 0.4790295958518982, + "learning_rate": 1.2238982118606412e-10, + "loss": 0.0741, + "step": 9238 + }, + { + "epoch": 2.993843162670123, + "grad_norm": 0.4853643476963043, + "learning_rate": 1.1045685755661784e-10, + "loss": 0.0687, + "step": 9239 + }, + { + "epoch": 2.9941672067401166, + "grad_norm": 0.46291592717170715, + "learning_rate": 9.91358320046043e-11, + "loss": 0.0652, + "step": 9240 + }, + { + "epoch": 2.99449125081011, + "grad_norm": 0.49624311923980713, + "learning_rate": 8.842674591558187e-11, + "loss": 0.0691, + "step": 9241 + }, + { + "epoch": 2.9948152948801035, + "grad_norm": 0.4871693253517151, + "learning_rate": 7.832960060016881e-11, + "loss": 0.0697, + "step": 9242 + }, + { + "epoch": 2.9951393389500973, + "grad_norm": 0.4829963445663452, + "learning_rate": 6.884439729459847e-11, + "loss": 0.0723, + "step": 9243 + }, + { + "epoch": 2.9954633830200907, + "grad_norm": 0.4893077611923218, + "learning_rate": 5.99711371590539e-11, + "loss": 0.0718, + "step": 9244 + }, + { + "epoch": 2.995787427090084, + "grad_norm": 0.5166180729866028, + "learning_rate": 5.170982127988833e-11, + "loss": 0.0742, + "step": 9245 + }, + { + "epoch": 2.996111471160078, + "grad_norm": 0.4631396532058716, + "learning_rate": 4.406045066851494e-11, + "loss": 0.0642, + "step": 9246 + }, + { + "epoch": 2.9964355152300715, + "grad_norm": 0.46219220757484436, + "learning_rate": 3.7023026260296633e-11, + "loss": 0.0689, + "step": 9247 + }, + { + "epoch": 2.996759559300065, + "grad_norm": 0.5113931894302368, + "learning_rate": 3.059754891732158e-11, + "loss": 0.0783, + "step": 9248 + }, + { + "epoch": 2.9970836033700583, + "grad_norm": 0.4924919307231903, + "learning_rate": 2.4784019426182804e-11, + "loss": 0.0701, + "step": 9249 + }, + { + "epoch": 2.9974076474400517, + "grad_norm": 0.48740407824516296, + "learning_rate": 1.958243849742303e-11, + "loss": 0.0722, + "step": 9250 + }, + { + "epoch": 2.9977316915100456, + "grad_norm": 0.492016077041626, + "learning_rate": 1.4992806768310274e-11, + "loss": 0.0721, + "step": 9251 + }, + { + "epoch": 2.998055735580039, + "grad_norm": 0.5209946036338806, + "learning_rate": 1.1015124800617395e-11, + "loss": 0.0813, + "step": 9252 + }, + { + "epoch": 2.9983797796500324, + "grad_norm": 0.5001586675643921, + "learning_rate": 7.649393080622069e-12, + "loss": 0.0705, + "step": 9253 + }, + { + "epoch": 2.998703823720026, + "grad_norm": 0.4529384672641754, + "learning_rate": 4.8956120207721554e-12, + "loss": 0.0628, + "step": 9254 + }, + { + "epoch": 2.9990278677900193, + "grad_norm": 0.4866239130496979, + "learning_rate": 2.753781958575452e-12, + "loss": 0.0698, + "step": 9255 + }, + { + "epoch": 2.999351911860013, + "grad_norm": 0.4864363372325897, + "learning_rate": 1.2239031549343693e-12, + "loss": 0.0658, + "step": 9256 + }, + { + "epoch": 2.9996759559300066, + "grad_norm": 0.49378660321235657, + "learning_rate": 3.059757980317102e-13, + "loss": 0.073, + "step": 9257 + }, + { + "epoch": 3.0, + "grad_norm": 0.49170607328414917, + "learning_rate": 0.0, + "loss": 0.0689, + "step": 9258 + } + ], + "logging_steps": 1.0, + "max_steps": 9258, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.880459018641395e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}