{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9983193277310924, "eval_steps": 500, "global_step": 594, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005042016806722689, "grad_norm": 9.45597365399993, "learning_rate": 0.0, "loss": 1.7242, "step": 1 }, { "epoch": 0.010084033613445379, "grad_norm": 9.218921810032594, "learning_rate": 1.6666666666666668e-07, "loss": 1.9603, "step": 2 }, { "epoch": 0.015126050420168067, "grad_norm": 9.19364568473009, "learning_rate": 3.3333333333333335e-07, "loss": 1.7815, "step": 3 }, { "epoch": 0.020168067226890758, "grad_norm": 9.753359655679406, "learning_rate": 5.000000000000001e-07, "loss": 1.8671, "step": 4 }, { "epoch": 0.025210084033613446, "grad_norm": 10.188684139684757, "learning_rate": 6.666666666666667e-07, "loss": 1.8868, "step": 5 }, { "epoch": 0.030252100840336135, "grad_norm": 9.253535763532076, "learning_rate": 8.333333333333333e-07, "loss": 1.8821, "step": 6 }, { "epoch": 0.03529411764705882, "grad_norm": 9.452472463389428, "learning_rate": 1.0000000000000002e-06, "loss": 1.8398, "step": 7 }, { "epoch": 0.040336134453781515, "grad_norm": 8.338459992866273, "learning_rate": 1.1666666666666668e-06, "loss": 1.7522, "step": 8 }, { "epoch": 0.0453781512605042, "grad_norm": 8.599040436901118, "learning_rate": 1.3333333333333334e-06, "loss": 1.7879, "step": 9 }, { "epoch": 0.05042016806722689, "grad_norm": 9.204139051227466, "learning_rate": 1.5e-06, "loss": 1.8949, "step": 10 }, { "epoch": 0.05546218487394958, "grad_norm": 8.383986517840034, "learning_rate": 1.6666666666666667e-06, "loss": 1.7568, "step": 11 }, { "epoch": 0.06050420168067227, "grad_norm": 6.14215523192106, "learning_rate": 1.8333333333333333e-06, "loss": 1.6243, "step": 12 }, { "epoch": 0.06554621848739496, "grad_norm": 5.998914335428499, "learning_rate": 2.0000000000000003e-06, "loss": 1.5973, "step": 13 }, { "epoch": 0.07058823529411765, "grad_norm": 5.047474738743573, "learning_rate": 2.166666666666667e-06, "loss": 1.3774, "step": 14 }, { "epoch": 0.07563025210084033, "grad_norm": 5.330740621399064, "learning_rate": 2.3333333333333336e-06, "loss": 1.5953, "step": 15 }, { "epoch": 0.08067226890756303, "grad_norm": 3.3659526026887012, "learning_rate": 2.5e-06, "loss": 1.3746, "step": 16 }, { "epoch": 0.08571428571428572, "grad_norm": 3.639732034816691, "learning_rate": 2.666666666666667e-06, "loss": 1.4698, "step": 17 }, { "epoch": 0.0907563025210084, "grad_norm": 3.461514147091586, "learning_rate": 2.8333333333333335e-06, "loss": 1.4229, "step": 18 }, { "epoch": 0.0957983193277311, "grad_norm": 3.765309579932919, "learning_rate": 3e-06, "loss": 1.3948, "step": 19 }, { "epoch": 0.10084033613445378, "grad_norm": 2.825230202760748, "learning_rate": 3.1666666666666667e-06, "loss": 1.3286, "step": 20 }, { "epoch": 0.10588235294117647, "grad_norm": 2.387015147619193, "learning_rate": 3.3333333333333333e-06, "loss": 1.2574, "step": 21 }, { "epoch": 0.11092436974789915, "grad_norm": 2.6592293064240176, "learning_rate": 3.5e-06, "loss": 1.2994, "step": 22 }, { "epoch": 0.11596638655462185, "grad_norm": 2.9338685422018163, "learning_rate": 3.6666666666666666e-06, "loss": 1.271, "step": 23 }, { "epoch": 0.12100840336134454, "grad_norm": 2.8053283243940923, "learning_rate": 3.833333333333334e-06, "loss": 1.239, "step": 24 }, { "epoch": 0.12605042016806722, "grad_norm": 2.4764651014882673, "learning_rate": 4.000000000000001e-06, "loss": 1.2632, "step": 25 }, { "epoch": 0.13109243697478992, "grad_norm": 4.193230652323676, "learning_rate": 4.166666666666667e-06, "loss": 1.2494, "step": 26 }, { "epoch": 0.1361344537815126, "grad_norm": 2.166632601601999, "learning_rate": 4.333333333333334e-06, "loss": 1.1772, "step": 27 }, { "epoch": 0.1411764705882353, "grad_norm": 2.0456983888545133, "learning_rate": 4.5e-06, "loss": 1.3323, "step": 28 }, { "epoch": 0.146218487394958, "grad_norm": 1.9041534025850353, "learning_rate": 4.666666666666667e-06, "loss": 1.123, "step": 29 }, { "epoch": 0.15126050420168066, "grad_norm": 1.7473372136225975, "learning_rate": 4.833333333333333e-06, "loss": 1.1116, "step": 30 }, { "epoch": 0.15630252100840336, "grad_norm": 1.9237786068741898, "learning_rate": 5e-06, "loss": 1.2038, "step": 31 }, { "epoch": 0.16134453781512606, "grad_norm": 1.9862371515679214, "learning_rate": 5.1666666666666675e-06, "loss": 1.2171, "step": 32 }, { "epoch": 0.16638655462184873, "grad_norm": 1.5922593116941988, "learning_rate": 5.333333333333334e-06, "loss": 1.0193, "step": 33 }, { "epoch": 0.17142857142857143, "grad_norm": 1.6830455258736572, "learning_rate": 5.500000000000001e-06, "loss": 1.0761, "step": 34 }, { "epoch": 0.17647058823529413, "grad_norm": 1.594143028453368, "learning_rate": 5.666666666666667e-06, "loss": 1.1126, "step": 35 }, { "epoch": 0.1815126050420168, "grad_norm": 1.9420003685481775, "learning_rate": 5.833333333333334e-06, "loss": 1.1203, "step": 36 }, { "epoch": 0.1865546218487395, "grad_norm": 1.5815112240806883, "learning_rate": 6e-06, "loss": 1.0293, "step": 37 }, { "epoch": 0.1915966386554622, "grad_norm": 1.4697006996217221, "learning_rate": 6.166666666666667e-06, "loss": 0.995, "step": 38 }, { "epoch": 0.19663865546218487, "grad_norm": 1.5886739084366435, "learning_rate": 6.333333333333333e-06, "loss": 1.1051, "step": 39 }, { "epoch": 0.20168067226890757, "grad_norm": 1.3717225438634324, "learning_rate": 6.5000000000000004e-06, "loss": 1.0817, "step": 40 }, { "epoch": 0.20672268907563024, "grad_norm": 1.4586233032739204, "learning_rate": 6.666666666666667e-06, "loss": 0.9949, "step": 41 }, { "epoch": 0.21176470588235294, "grad_norm": 1.4404526895251804, "learning_rate": 6.833333333333334e-06, "loss": 1.0369, "step": 42 }, { "epoch": 0.21680672268907564, "grad_norm": 1.5011071614715905, "learning_rate": 7e-06, "loss": 1.0126, "step": 43 }, { "epoch": 0.2218487394957983, "grad_norm": 1.446801500279163, "learning_rate": 7.166666666666667e-06, "loss": 0.9829, "step": 44 }, { "epoch": 0.226890756302521, "grad_norm": 1.3157845464395648, "learning_rate": 7.333333333333333e-06, "loss": 0.9432, "step": 45 }, { "epoch": 0.2319327731092437, "grad_norm": 1.3291092123967403, "learning_rate": 7.500000000000001e-06, "loss": 0.9518, "step": 46 }, { "epoch": 0.23697478991596638, "grad_norm": 1.5105509029003468, "learning_rate": 7.666666666666667e-06, "loss": 1.0235, "step": 47 }, { "epoch": 0.24201680672268908, "grad_norm": 1.420355667391472, "learning_rate": 7.833333333333333e-06, "loss": 0.9567, "step": 48 }, { "epoch": 0.24705882352941178, "grad_norm": 1.463732709856337, "learning_rate": 8.000000000000001e-06, "loss": 1.0417, "step": 49 }, { "epoch": 0.25210084033613445, "grad_norm": 1.4275241446789713, "learning_rate": 8.166666666666668e-06, "loss": 1.0347, "step": 50 }, { "epoch": 0.2571428571428571, "grad_norm": 1.309592587931707, "learning_rate": 8.333333333333334e-06, "loss": 0.9524, "step": 51 }, { "epoch": 0.26218487394957984, "grad_norm": 1.3344872488030621, "learning_rate": 8.5e-06, "loss": 1.0684, "step": 52 }, { "epoch": 0.2672268907563025, "grad_norm": 1.3533956797177575, "learning_rate": 8.666666666666668e-06, "loss": 0.9501, "step": 53 }, { "epoch": 0.2722689075630252, "grad_norm": 1.4422509166091777, "learning_rate": 8.833333333333334e-06, "loss": 0.9452, "step": 54 }, { "epoch": 0.2773109243697479, "grad_norm": 1.3534627088209181, "learning_rate": 9e-06, "loss": 0.9243, "step": 55 }, { "epoch": 0.2823529411764706, "grad_norm": 1.370929089587996, "learning_rate": 9.166666666666666e-06, "loss": 0.9577, "step": 56 }, { "epoch": 0.28739495798319326, "grad_norm": 1.34141912977082, "learning_rate": 9.333333333333334e-06, "loss": 0.9216, "step": 57 }, { "epoch": 0.292436974789916, "grad_norm": 1.437190020022949, "learning_rate": 9.5e-06, "loss": 0.986, "step": 58 }, { "epoch": 0.29747899159663865, "grad_norm": 1.3190591357074484, "learning_rate": 9.666666666666667e-06, "loss": 1.0163, "step": 59 }, { "epoch": 0.3025210084033613, "grad_norm": 1.3230400720636633, "learning_rate": 9.833333333333333e-06, "loss": 0.9071, "step": 60 }, { "epoch": 0.30756302521008405, "grad_norm": 1.570821042981294, "learning_rate": 1e-05, "loss": 1.0532, "step": 61 }, { "epoch": 0.3126050420168067, "grad_norm": 1.3817712282096664, "learning_rate": 9.999913472135126e-06, "loss": 0.9497, "step": 62 }, { "epoch": 0.3176470588235294, "grad_norm": 1.3461235016869455, "learning_rate": 9.99965389153533e-06, "loss": 0.9656, "step": 63 }, { "epoch": 0.3226890756302521, "grad_norm": 1.2703045215015534, "learning_rate": 9.999221267184993e-06, "loss": 0.8563, "step": 64 }, { "epoch": 0.3277310924369748, "grad_norm": 1.4463044763025328, "learning_rate": 9.998615614057743e-06, "loss": 0.9743, "step": 65 }, { "epoch": 0.33277310924369746, "grad_norm": 1.2126520135581191, "learning_rate": 9.997836953115927e-06, "loss": 0.8256, "step": 66 }, { "epoch": 0.3378151260504202, "grad_norm": 1.465456256707118, "learning_rate": 9.996885311309892e-06, "loss": 0.9112, "step": 67 }, { "epoch": 0.34285714285714286, "grad_norm": 1.3774012861831768, "learning_rate": 9.995760721577053e-06, "loss": 1.0031, "step": 68 }, { "epoch": 0.34789915966386553, "grad_norm": 1.214727510886685, "learning_rate": 9.994463222840748e-06, "loss": 0.8777, "step": 69 }, { "epoch": 0.35294117647058826, "grad_norm": 1.3372556283226344, "learning_rate": 9.992992860008893e-06, "loss": 0.9503, "step": 70 }, { "epoch": 0.35798319327731093, "grad_norm": 1.2629663699758409, "learning_rate": 9.991349683972435e-06, "loss": 0.9707, "step": 71 }, { "epoch": 0.3630252100840336, "grad_norm": 1.2961666438854509, "learning_rate": 9.989533751603578e-06, "loss": 0.8987, "step": 72 }, { "epoch": 0.3680672268907563, "grad_norm": 1.3451690514655665, "learning_rate": 9.987545125753818e-06, "loss": 0.9614, "step": 73 }, { "epoch": 0.373109243697479, "grad_norm": 1.3824819884360038, "learning_rate": 9.985383875251783e-06, "loss": 0.9101, "step": 74 }, { "epoch": 0.37815126050420167, "grad_norm": 1.290324816657544, "learning_rate": 9.983050074900824e-06, "loss": 0.8901, "step": 75 }, { "epoch": 0.3831932773109244, "grad_norm": 1.3785449206810632, "learning_rate": 9.980543805476447e-06, "loss": 0.9305, "step": 76 }, { "epoch": 0.38823529411764707, "grad_norm": 1.2723741333137952, "learning_rate": 9.977865153723508e-06, "loss": 0.9145, "step": 77 }, { "epoch": 0.39327731092436974, "grad_norm": 1.3277787150964286, "learning_rate": 9.975014212353212e-06, "loss": 0.9386, "step": 78 }, { "epoch": 0.3983193277310924, "grad_norm": 1.300378629259356, "learning_rate": 9.971991080039912e-06, "loss": 0.9072, "step": 79 }, { "epoch": 0.40336134453781514, "grad_norm": 1.3180887220440103, "learning_rate": 9.968795861417676e-06, "loss": 0.8538, "step": 80 }, { "epoch": 0.4084033613445378, "grad_norm": 1.2852565908527667, "learning_rate": 9.965428667076687e-06, "loss": 0.8625, "step": 81 }, { "epoch": 0.4134453781512605, "grad_norm": 1.22082061679436, "learning_rate": 9.961889613559396e-06, "loss": 0.8002, "step": 82 }, { "epoch": 0.4184873949579832, "grad_norm": 1.3948047447367582, "learning_rate": 9.958178823356503e-06, "loss": 0.9563, "step": 83 }, { "epoch": 0.4235294117647059, "grad_norm": 1.32125427246041, "learning_rate": 9.954296424902709e-06, "loss": 0.9009, "step": 84 }, { "epoch": 0.42857142857142855, "grad_norm": 1.2664915782700163, "learning_rate": 9.950242552572272e-06, "loss": 0.8489, "step": 85 }, { "epoch": 0.4336134453781513, "grad_norm": 1.273298827077617, "learning_rate": 9.946017346674362e-06, "loss": 0.847, "step": 86 }, { "epoch": 0.43865546218487395, "grad_norm": 1.328680054216705, "learning_rate": 9.941620953448195e-06, "loss": 0.9382, "step": 87 }, { "epoch": 0.4436974789915966, "grad_norm": 1.263646905073375, "learning_rate": 9.937053525057977e-06, "loss": 0.8991, "step": 88 }, { "epoch": 0.44873949579831934, "grad_norm": 1.209796673070386, "learning_rate": 9.932315219587641e-06, "loss": 0.8611, "step": 89 }, { "epoch": 0.453781512605042, "grad_norm": 1.1317133515894529, "learning_rate": 9.927406201035368e-06, "loss": 0.8254, "step": 90 }, { "epoch": 0.4588235294117647, "grad_norm": 1.2581352252268798, "learning_rate": 9.922326639307918e-06, "loss": 0.8186, "step": 91 }, { "epoch": 0.4638655462184874, "grad_norm": 1.1615726675287243, "learning_rate": 9.917076710214739e-06, "loss": 0.8217, "step": 92 }, { "epoch": 0.4689075630252101, "grad_norm": 1.3906544125113194, "learning_rate": 9.911656595461899e-06, "loss": 0.9606, "step": 93 }, { "epoch": 0.47394957983193275, "grad_norm": 1.3491688269700184, "learning_rate": 9.906066482645774e-06, "loss": 0.8865, "step": 94 }, { "epoch": 0.4789915966386555, "grad_norm": 1.2884319333617182, "learning_rate": 9.900306565246579e-06, "loss": 0.8608, "step": 95 }, { "epoch": 0.48403361344537815, "grad_norm": 1.332999472417029, "learning_rate": 9.894377042621654e-06, "loss": 0.8476, "step": 96 }, { "epoch": 0.4890756302521008, "grad_norm": 1.3206768360556793, "learning_rate": 9.888278119998573e-06, "loss": 0.898, "step": 97 }, { "epoch": 0.49411764705882355, "grad_norm": 1.3732673184556148, "learning_rate": 9.882010008468038e-06, "loss": 0.9482, "step": 98 }, { "epoch": 0.4991596638655462, "grad_norm": 1.4284063475101123, "learning_rate": 9.875572924976568e-06, "loss": 0.8932, "step": 99 }, { "epoch": 0.5042016806722689, "grad_norm": 1.249757410129038, "learning_rate": 9.868967092319003e-06, "loss": 0.9113, "step": 100 }, { "epoch": 0.5092436974789916, "grad_norm": 1.2033755235104269, "learning_rate": 9.86219273913078e-06, "loss": 0.8373, "step": 101 }, { "epoch": 0.5142857142857142, "grad_norm": 1.3285676372655046, "learning_rate": 9.855250099880026e-06, "loss": 0.82, "step": 102 }, { "epoch": 0.519327731092437, "grad_norm": 1.280372963776325, "learning_rate": 9.848139414859441e-06, "loss": 0.9269, "step": 103 }, { "epoch": 0.5243697478991597, "grad_norm": 1.3597201294098022, "learning_rate": 9.840860930177984e-06, "loss": 0.8917, "step": 104 }, { "epoch": 0.5294117647058824, "grad_norm": 1.3044841757394627, "learning_rate": 9.833414897752346e-06, "loss": 0.8242, "step": 105 }, { "epoch": 0.534453781512605, "grad_norm": 1.2237707733265701, "learning_rate": 9.825801575298248e-06, "loss": 0.8369, "step": 106 }, { "epoch": 0.5394957983193277, "grad_norm": 1.2984723776565605, "learning_rate": 9.818021226321502e-06, "loss": 0.8687, "step": 107 }, { "epoch": 0.5445378151260504, "grad_norm": 1.3966505679016854, "learning_rate": 9.8100741201089e-06, "loss": 0.8698, "step": 108 }, { "epoch": 0.5495798319327732, "grad_norm": 1.3695596995593027, "learning_rate": 9.801960531718898e-06, "loss": 0.9224, "step": 109 }, { "epoch": 0.5546218487394958, "grad_norm": 1.2219956732497297, "learning_rate": 9.793680741972084e-06, "loss": 0.7909, "step": 110 }, { "epoch": 0.5596638655462185, "grad_norm": 1.1958717679101365, "learning_rate": 9.785235037441473e-06, "loss": 0.8222, "step": 111 }, { "epoch": 0.5647058823529412, "grad_norm": 1.3284406137942217, "learning_rate": 9.77662371044258e-06, "loss": 0.9698, "step": 112 }, { "epoch": 0.5697478991596638, "grad_norm": 1.4005342916908725, "learning_rate": 9.767847059023292e-06, "loss": 0.8141, "step": 113 }, { "epoch": 0.5747899159663865, "grad_norm": 1.3280058867861344, "learning_rate": 9.75890538695358e-06, "loss": 0.8281, "step": 114 }, { "epoch": 0.5798319327731093, "grad_norm": 1.348332178712391, "learning_rate": 9.749799003714954e-06, "loss": 0.8174, "step": 115 }, { "epoch": 0.584873949579832, "grad_norm": 1.345901958116435, "learning_rate": 9.74052822448978e-06, "loss": 0.8662, "step": 116 }, { "epoch": 0.5899159663865546, "grad_norm": 1.4938772005815362, "learning_rate": 9.731093370150349e-06, "loss": 0.9227, "step": 117 }, { "epoch": 0.5949579831932773, "grad_norm": 1.5782055001938107, "learning_rate": 9.721494767247779e-06, "loss": 0.9292, "step": 118 }, { "epoch": 0.6, "grad_norm": 1.2813061736782214, "learning_rate": 9.71173274800072e-06, "loss": 0.808, "step": 119 }, { "epoch": 0.6050420168067226, "grad_norm": 1.3387521092808896, "learning_rate": 9.70180765028384e-06, "loss": 0.8052, "step": 120 }, { "epoch": 0.6100840336134454, "grad_norm": 1.1971567112258479, "learning_rate": 9.691719817616148e-06, "loss": 0.8321, "step": 121 }, { "epoch": 0.6151260504201681, "grad_norm": 1.4022847044925355, "learning_rate": 9.681469599149093e-06, "loss": 0.8362, "step": 122 }, { "epoch": 0.6201680672268908, "grad_norm": 1.4458562904255674, "learning_rate": 9.671057349654481e-06, "loss": 0.8753, "step": 123 }, { "epoch": 0.6252100840336134, "grad_norm": 1.3489812277335955, "learning_rate": 9.660483429512198e-06, "loss": 0.8406, "step": 124 }, { "epoch": 0.6302521008403361, "grad_norm": 1.2541520148654464, "learning_rate": 9.649748204697741e-06, "loss": 0.8096, "step": 125 }, { "epoch": 0.6352941176470588, "grad_norm": 1.4166136476450861, "learning_rate": 9.63885204676954e-06, "loss": 0.9279, "step": 126 }, { "epoch": 0.6403361344537815, "grad_norm": 1.2096305649684784, "learning_rate": 9.627795332856107e-06, "loss": 0.8668, "step": 127 }, { "epoch": 0.6453781512605042, "grad_norm": 1.0817129947497557, "learning_rate": 9.616578445642982e-06, "loss": 0.8021, "step": 128 }, { "epoch": 0.6504201680672269, "grad_norm": 1.2857282530529068, "learning_rate": 9.605201773359485e-06, "loss": 0.9031, "step": 129 }, { "epoch": 0.6554621848739496, "grad_norm": 1.2909981390159206, "learning_rate": 9.59366570976528e-06, "loss": 0.9028, "step": 130 }, { "epoch": 0.6605042016806723, "grad_norm": 1.277642300275485, "learning_rate": 9.581970654136752e-06, "loss": 0.8206, "step": 131 }, { "epoch": 0.6655462184873949, "grad_norm": 1.2618202348884826, "learning_rate": 9.570117011253173e-06, "loss": 0.8038, "step": 132 }, { "epoch": 0.6705882352941176, "grad_norm": 1.3158796346136465, "learning_rate": 9.55810519138271e-06, "loss": 0.8594, "step": 133 }, { "epoch": 0.6756302521008404, "grad_norm": 1.464049668724664, "learning_rate": 9.545935610268213e-06, "loss": 0.8946, "step": 134 }, { "epoch": 0.680672268907563, "grad_norm": 1.3568598282729065, "learning_rate": 9.533608689112827e-06, "loss": 0.8747, "step": 135 }, { "epoch": 0.6857142857142857, "grad_norm": 1.459842199207566, "learning_rate": 9.521124854565425e-06, "loss": 0.8665, "step": 136 }, { "epoch": 0.6907563025210084, "grad_norm": 1.2651754016717647, "learning_rate": 9.508484538705823e-06, "loss": 0.8172, "step": 137 }, { "epoch": 0.6957983193277311, "grad_norm": 1.3148283789857567, "learning_rate": 9.495688179029838e-06, "loss": 0.8159, "step": 138 }, { "epoch": 0.7008403361344537, "grad_norm": 1.3062514406684878, "learning_rate": 9.482736218434144e-06, "loss": 0.772, "step": 139 }, { "epoch": 0.7058823529411765, "grad_norm": 1.233357901449911, "learning_rate": 9.469629105200937e-06, "loss": 0.812, "step": 140 }, { "epoch": 0.7109243697478992, "grad_norm": 1.4036092051385856, "learning_rate": 9.45636729298243e-06, "loss": 0.9176, "step": 141 }, { "epoch": 0.7159663865546219, "grad_norm": 1.2475986918890871, "learning_rate": 9.442951240785135e-06, "loss": 0.9227, "step": 142 }, { "epoch": 0.7210084033613445, "grad_norm": 1.33327258291273, "learning_rate": 9.429381412954e-06, "loss": 0.8406, "step": 143 }, { "epoch": 0.7260504201680672, "grad_norm": 1.2457766641422836, "learning_rate": 9.415658279156312e-06, "loss": 0.7944, "step": 144 }, { "epoch": 0.7310924369747899, "grad_norm": 1.214604972950531, "learning_rate": 9.401782314365458e-06, "loss": 0.7889, "step": 145 }, { "epoch": 0.7361344537815127, "grad_norm": 1.4091496584822034, "learning_rate": 9.387753998844482e-06, "loss": 0.8542, "step": 146 }, { "epoch": 0.7411764705882353, "grad_norm": 1.336371637577696, "learning_rate": 9.37357381812946e-06, "loss": 0.8713, "step": 147 }, { "epoch": 0.746218487394958, "grad_norm": 1.2559095107113698, "learning_rate": 9.359242263012693e-06, "loss": 0.8405, "step": 148 }, { "epoch": 0.7512605042016807, "grad_norm": 1.371982879040437, "learning_rate": 9.344759829525734e-06, "loss": 0.8666, "step": 149 }, { "epoch": 0.7563025210084033, "grad_norm": 1.23974913873784, "learning_rate": 9.330127018922195e-06, "loss": 0.7429, "step": 150 }, { "epoch": 0.761344537815126, "grad_norm": 1.3741045518217379, "learning_rate": 9.315344337660422e-06, "loss": 0.8649, "step": 151 }, { "epoch": 0.7663865546218488, "grad_norm": 1.348659089360585, "learning_rate": 9.300412297385954e-06, "loss": 0.8614, "step": 152 }, { "epoch": 0.7714285714285715, "grad_norm": 1.199362811459465, "learning_rate": 9.285331414913816e-06, "loss": 0.837, "step": 153 }, { "epoch": 0.7764705882352941, "grad_norm": 1.2184218309322916, "learning_rate": 9.270102212210632e-06, "loss": 0.8404, "step": 154 }, { "epoch": 0.7815126050420168, "grad_norm": 1.386612554465055, "learning_rate": 9.254725216376562e-06, "loss": 0.9221, "step": 155 }, { "epoch": 0.7865546218487395, "grad_norm": 1.3380478699356555, "learning_rate": 9.239200959627048e-06, "loss": 0.8627, "step": 156 }, { "epoch": 0.7915966386554621, "grad_norm": 1.4014570562834296, "learning_rate": 9.223529979274411e-06, "loss": 0.8525, "step": 157 }, { "epoch": 0.7966386554621848, "grad_norm": 1.3172489244042282, "learning_rate": 9.207712817709237e-06, "loss": 0.7901, "step": 158 }, { "epoch": 0.8016806722689076, "grad_norm": 1.354483035270781, "learning_rate": 9.191750022381613e-06, "loss": 0.865, "step": 159 }, { "epoch": 0.8067226890756303, "grad_norm": 1.2415343975219086, "learning_rate": 9.175642145782179e-06, "loss": 0.7898, "step": 160 }, { "epoch": 0.8117647058823529, "grad_norm": 1.2532359973917484, "learning_rate": 9.159389745423003e-06, "loss": 0.8372, "step": 161 }, { "epoch": 0.8168067226890756, "grad_norm": 1.2390725118364732, "learning_rate": 9.142993383818284e-06, "loss": 0.8383, "step": 162 }, { "epoch": 0.8218487394957983, "grad_norm": 1.3766117307822159, "learning_rate": 9.126453628464889e-06, "loss": 0.8151, "step": 163 }, { "epoch": 0.826890756302521, "grad_norm": 1.3256804846243377, "learning_rate": 9.109771051822702e-06, "loss": 0.8444, "step": 164 }, { "epoch": 0.8319327731092437, "grad_norm": 1.3520618668694473, "learning_rate": 9.09294623129482e-06, "loss": 0.8672, "step": 165 }, { "epoch": 0.8369747899159664, "grad_norm": 1.329653882039925, "learning_rate": 9.07597974920756e-06, "loss": 0.8168, "step": 166 }, { "epoch": 0.8420168067226891, "grad_norm": 1.3543281390803807, "learning_rate": 9.058872192790314e-06, "loss": 0.9118, "step": 167 }, { "epoch": 0.8470588235294118, "grad_norm": 1.3456977881970305, "learning_rate": 9.041624154155208e-06, "loss": 0.8515, "step": 168 }, { "epoch": 0.8521008403361344, "grad_norm": 1.297767613562501, "learning_rate": 9.02423623027663e-06, "loss": 0.7417, "step": 169 }, { "epoch": 0.8571428571428571, "grad_norm": 1.2894576740180352, "learning_rate": 9.006709022970547e-06, "loss": 0.8408, "step": 170 }, { "epoch": 0.8621848739495799, "grad_norm": 1.2240598626483896, "learning_rate": 8.98904313887369e-06, "loss": 0.7358, "step": 171 }, { "epoch": 0.8672268907563025, "grad_norm": 1.1890744366393113, "learning_rate": 8.971239189422555e-06, "loss": 0.8322, "step": 172 }, { "epoch": 0.8722689075630252, "grad_norm": 1.3386067991043302, "learning_rate": 8.953297790832231e-06, "loss": 0.8411, "step": 173 }, { "epoch": 0.8773109243697479, "grad_norm": 1.408000314117784, "learning_rate": 8.935219564075087e-06, "loss": 0.8036, "step": 174 }, { "epoch": 0.8823529411764706, "grad_norm": 1.3426412490545896, "learning_rate": 8.917005134859263e-06, "loss": 0.8035, "step": 175 }, { "epoch": 0.8873949579831932, "grad_norm": 1.4645291848377162, "learning_rate": 8.89865513360703e-06, "loss": 0.8392, "step": 176 }, { "epoch": 0.892436974789916, "grad_norm": 1.2117719390717796, "learning_rate": 8.88017019543296e-06, "loss": 0.8328, "step": 177 }, { "epoch": 0.8974789915966387, "grad_norm": 1.3344830085574295, "learning_rate": 8.861550960121946e-06, "loss": 0.8543, "step": 178 }, { "epoch": 0.9025210084033614, "grad_norm": 1.4853304361578643, "learning_rate": 8.842798072107055e-06, "loss": 0.8512, "step": 179 }, { "epoch": 0.907563025210084, "grad_norm": 1.2284352653979531, "learning_rate": 8.823912180447237e-06, "loss": 0.8598, "step": 180 }, { "epoch": 0.9126050420168067, "grad_norm": 1.37221802812512, "learning_rate": 8.804893938804839e-06, "loss": 0.8613, "step": 181 }, { "epoch": 0.9176470588235294, "grad_norm": 1.4397712752139291, "learning_rate": 8.785744005423003e-06, "loss": 0.8192, "step": 182 }, { "epoch": 0.9226890756302522, "grad_norm": 1.4307484306743805, "learning_rate": 8.766463043102864e-06, "loss": 0.8114, "step": 183 }, { "epoch": 0.9277310924369748, "grad_norm": 1.4036453214728524, "learning_rate": 8.747051719180626e-06, "loss": 0.8922, "step": 184 }, { "epoch": 0.9327731092436975, "grad_norm": 1.4752551479904314, "learning_rate": 8.727510705504453e-06, "loss": 0.8932, "step": 185 }, { "epoch": 0.9378151260504202, "grad_norm": 1.322337640774981, "learning_rate": 8.707840678411223e-06, "loss": 0.7998, "step": 186 }, { "epoch": 0.9428571428571428, "grad_norm": 1.2136277321616975, "learning_rate": 8.688042318703111e-06, "loss": 0.7416, "step": 187 }, { "epoch": 0.9478991596638655, "grad_norm": 1.342849040104635, "learning_rate": 8.66811631162404e-06, "loss": 0.8685, "step": 188 }, { "epoch": 0.9529411764705882, "grad_norm": 1.5250386207067939, "learning_rate": 8.648063346835943e-06, "loss": 0.8485, "step": 189 }, { "epoch": 0.957983193277311, "grad_norm": 1.3173191874193797, "learning_rate": 8.627884118394913e-06, "loss": 0.8286, "step": 190 }, { "epoch": 0.9630252100840336, "grad_norm": 1.32796081599915, "learning_rate": 8.607579324727175e-06, "loss": 0.8544, "step": 191 }, { "epoch": 0.9680672268907563, "grad_norm": 1.350363153783161, "learning_rate": 8.5871496686049e-06, "loss": 0.8102, "step": 192 }, { "epoch": 0.973109243697479, "grad_norm": 1.3655669107662696, "learning_rate": 8.566595857121902e-06, "loss": 0.8122, "step": 193 }, { "epoch": 0.9781512605042016, "grad_norm": 1.3452211499259599, "learning_rate": 8.545918601669147e-06, "loss": 0.8834, "step": 194 }, { "epoch": 0.9831932773109243, "grad_norm": 1.3376410418915317, "learning_rate": 8.525118617910144e-06, "loss": 0.8148, "step": 195 }, { "epoch": 0.9882352941176471, "grad_norm": 1.2489273918302621, "learning_rate": 8.504196625756166e-06, "loss": 0.8271, "step": 196 }, { "epoch": 0.9932773109243698, "grad_norm": 1.4139088289405872, "learning_rate": 8.483153349341336e-06, "loss": 0.845, "step": 197 }, { "epoch": 0.9983193277310924, "grad_norm": 1.384588034693747, "learning_rate": 8.461989516997565e-06, "loss": 0.8312, "step": 198 }, { "epoch": 1.0050420168067227, "grad_norm": 2.499955060770187, "learning_rate": 8.440705861229344e-06, "loss": 1.4381, "step": 199 }, { "epoch": 1.0100840336134453, "grad_norm": 1.413536932523174, "learning_rate": 8.41930311868839e-06, "loss": 0.713, "step": 200 }, { "epoch": 1.015126050420168, "grad_norm": 1.3570359586304308, "learning_rate": 8.397782030148147e-06, "loss": 0.716, "step": 201 }, { "epoch": 1.0201680672268907, "grad_norm": 1.187974845871534, "learning_rate": 8.376143340478153e-06, "loss": 0.6197, "step": 202 }, { "epoch": 1.0252100840336134, "grad_norm": 1.1805636492053666, "learning_rate": 8.354387798618254e-06, "loss": 0.6082, "step": 203 }, { "epoch": 1.030252100840336, "grad_norm": 1.3319326327566277, "learning_rate": 8.332516157552684e-06, "loss": 0.6667, "step": 204 }, { "epoch": 1.035294117647059, "grad_norm": 1.3080442340316867, "learning_rate": 8.310529174284004e-06, "loss": 0.6438, "step": 205 }, { "epoch": 1.0403361344537816, "grad_norm": 1.360919752940988, "learning_rate": 8.288427609806899e-06, "loss": 0.6931, "step": 206 }, { "epoch": 1.0453781512605043, "grad_norm": 1.2928882019326107, "learning_rate": 8.266212229081846e-06, "loss": 0.6571, "step": 207 }, { "epoch": 1.050420168067227, "grad_norm": 1.279346131512037, "learning_rate": 8.243883801008632e-06, "loss": 0.6105, "step": 208 }, { "epoch": 1.0554621848739496, "grad_norm": 1.3976246828088796, "learning_rate": 8.221443098399733e-06, "loss": 0.633, "step": 209 }, { "epoch": 1.0605042016806723, "grad_norm": 1.4051676037106482, "learning_rate": 8.198890897953586e-06, "loss": 0.631, "step": 210 }, { "epoch": 1.065546218487395, "grad_norm": 1.4026478680925658, "learning_rate": 8.176227980227693e-06, "loss": 0.646, "step": 211 }, { "epoch": 1.0705882352941176, "grad_norm": 1.4783461586544826, "learning_rate": 8.153455129611605e-06, "loss": 0.6341, "step": 212 }, { "epoch": 1.0756302521008403, "grad_norm": 1.2992917788523406, "learning_rate": 8.130573134299782e-06, "loss": 0.7027, "step": 213 }, { "epoch": 1.080672268907563, "grad_norm": 1.4403523864907255, "learning_rate": 8.107582786264299e-06, "loss": 0.6745, "step": 214 }, { "epoch": 1.0857142857142856, "grad_norm": 1.2904789259135272, "learning_rate": 8.084484881227449e-06, "loss": 0.6278, "step": 215 }, { "epoch": 1.0907563025210083, "grad_norm": 1.3928383691850674, "learning_rate": 8.061280218634192e-06, "loss": 0.665, "step": 216 }, { "epoch": 1.0957983193277312, "grad_norm": 1.3355440702392616, "learning_rate": 8.037969601624495e-06, "loss": 0.6095, "step": 217 }, { "epoch": 1.1008403361344539, "grad_norm": 1.3135802297885384, "learning_rate": 8.014553837005527e-06, "loss": 0.7134, "step": 218 }, { "epoch": 1.1058823529411765, "grad_norm": 1.3334358438044307, "learning_rate": 7.99103373522373e-06, "loss": 0.6149, "step": 219 }, { "epoch": 1.1109243697478992, "grad_norm": 1.3855125872698653, "learning_rate": 7.967410110336782e-06, "loss": 0.6709, "step": 220 }, { "epoch": 1.1159663865546219, "grad_norm": 1.4082439279428, "learning_rate": 7.943683779985412e-06, "loss": 0.6665, "step": 221 }, { "epoch": 1.1210084033613446, "grad_norm": 1.3849413150174785, "learning_rate": 7.919855565365102e-06, "loss": 0.6698, "step": 222 }, { "epoch": 1.1260504201680672, "grad_norm": 1.3025006342892487, "learning_rate": 7.895926291197667e-06, "loss": 0.6726, "step": 223 }, { "epoch": 1.13109243697479, "grad_norm": 1.3438499346918609, "learning_rate": 7.871896785702707e-06, "loss": 0.6361, "step": 224 }, { "epoch": 1.1361344537815126, "grad_norm": 1.252763414951386, "learning_rate": 7.847767880568944e-06, "loss": 0.6534, "step": 225 }, { "epoch": 1.1411764705882352, "grad_norm": 1.4594024040073388, "learning_rate": 7.823540410925434e-06, "loss": 0.7176, "step": 226 }, { "epoch": 1.146218487394958, "grad_norm": 1.3020082357416656, "learning_rate": 7.799215215312667e-06, "loss": 0.6117, "step": 227 }, { "epoch": 1.1512605042016806, "grad_norm": 1.3344891922181583, "learning_rate": 7.774793135653537e-06, "loss": 0.6502, "step": 228 }, { "epoch": 1.1563025210084033, "grad_norm": 1.1931020476239522, "learning_rate": 7.750275017224208e-06, "loss": 0.5864, "step": 229 }, { "epoch": 1.1613445378151261, "grad_norm": 1.3817137725123274, "learning_rate": 7.725661708624855e-06, "loss": 0.6845, "step": 230 }, { "epoch": 1.1663865546218488, "grad_norm": 1.3718851116188664, "learning_rate": 7.700954061750295e-06, "loss": 0.6666, "step": 231 }, { "epoch": 1.1714285714285715, "grad_norm": 1.3538961263237106, "learning_rate": 7.676152931760496e-06, "loss": 0.6815, "step": 232 }, { "epoch": 1.1764705882352942, "grad_norm": 1.3576998269549865, "learning_rate": 7.651259177050996e-06, "loss": 0.6169, "step": 233 }, { "epoch": 1.1815126050420168, "grad_norm": 1.3317040137841496, "learning_rate": 7.626273659223166e-06, "loss": 0.8546, "step": 234 }, { "epoch": 1.1865546218487395, "grad_norm": 1.368524911957153, "learning_rate": 7.601197243054411e-06, "loss": 0.6168, "step": 235 }, { "epoch": 1.1915966386554622, "grad_norm": 1.3058914037226665, "learning_rate": 7.576030796468233e-06, "loss": 0.7452, "step": 236 }, { "epoch": 1.1966386554621848, "grad_norm": 1.5392470830352827, "learning_rate": 7.5507751905041885e-06, "loss": 0.6195, "step": 237 }, { "epoch": 1.2016806722689075, "grad_norm": 1.4102673119306182, "learning_rate": 7.525431299287737e-06, "loss": 0.6523, "step": 238 }, { "epoch": 1.2067226890756302, "grad_norm": 1.4511322902886419, "learning_rate": 7.500000000000001e-06, "loss": 0.6862, "step": 239 }, { "epoch": 1.2117647058823529, "grad_norm": 1.2661930310847365, "learning_rate": 7.474482172847391e-06, "loss": 0.6528, "step": 240 }, { "epoch": 1.2168067226890757, "grad_norm": 1.3307860380456358, "learning_rate": 7.4488787010311425e-06, "loss": 0.6602, "step": 241 }, { "epoch": 1.2218487394957984, "grad_norm": 1.3750585055686875, "learning_rate": 7.423190470716761e-06, "loss": 0.6432, "step": 242 }, { "epoch": 1.226890756302521, "grad_norm": 1.2979245099980825, "learning_rate": 7.3974183710033334e-06, "loss": 0.6288, "step": 243 }, { "epoch": 1.2319327731092438, "grad_norm": 1.2999814021886877, "learning_rate": 7.371563293892761e-06, "loss": 0.6119, "step": 244 }, { "epoch": 1.2369747899159664, "grad_norm": 1.2917976929827104, "learning_rate": 7.345626134258897e-06, "loss": 0.6657, "step": 245 }, { "epoch": 1.242016806722689, "grad_norm": 1.4010288472470998, "learning_rate": 7.319607789816555e-06, "loss": 0.6586, "step": 246 }, { "epoch": 1.2470588235294118, "grad_norm": 1.4146400942510136, "learning_rate": 7.293509161090453e-06, "loss": 0.6595, "step": 247 }, { "epoch": 1.2521008403361344, "grad_norm": 1.2728109027093242, "learning_rate": 7.2673311513840395e-06, "loss": 0.6353, "step": 248 }, { "epoch": 1.2571428571428571, "grad_norm": 1.3471043709018875, "learning_rate": 7.241074666748228e-06, "loss": 0.6713, "step": 249 }, { "epoch": 1.2621848739495798, "grad_norm": 1.353231427350053, "learning_rate": 7.214740615950041e-06, "loss": 0.6102, "step": 250 }, { "epoch": 1.2672268907563025, "grad_norm": 1.337514944324046, "learning_rate": 7.188329910441154e-06, "loss": 0.6282, "step": 251 }, { "epoch": 1.2722689075630251, "grad_norm": 1.362404295247445, "learning_rate": 7.161843464326349e-06, "loss": 0.6072, "step": 252 }, { "epoch": 1.2773109243697478, "grad_norm": 1.1818447088372563, "learning_rate": 7.135282194331881e-06, "loss": 0.6057, "step": 253 }, { "epoch": 1.2823529411764705, "grad_norm": 1.4982822435126113, "learning_rate": 7.1086470197737405e-06, "loss": 0.6803, "step": 254 }, { "epoch": 1.2873949579831931, "grad_norm": 1.4344811997979932, "learning_rate": 7.0819388625258385e-06, "loss": 0.8567, "step": 255 }, { "epoch": 1.292436974789916, "grad_norm": 1.3859091438882214, "learning_rate": 7.05515864698811e-06, "loss": 0.7355, "step": 256 }, { "epoch": 1.2974789915966387, "grad_norm": 1.1626254136263392, "learning_rate": 7.028307300054499e-06, "loss": 0.5839, "step": 257 }, { "epoch": 1.3025210084033614, "grad_norm": 1.3552944579781003, "learning_rate": 7.0013857510808934e-06, "loss": 0.6836, "step": 258 }, { "epoch": 1.307563025210084, "grad_norm": 1.3028817545835125, "learning_rate": 6.974394931852957e-06, "loss": 0.6284, "step": 259 }, { "epoch": 1.3126050420168067, "grad_norm": 1.5434124541373508, "learning_rate": 6.94733577655387e-06, "loss": 0.7012, "step": 260 }, { "epoch": 1.3176470588235294, "grad_norm": 1.303474015679206, "learning_rate": 6.920209221732007e-06, "loss": 0.5703, "step": 261 }, { "epoch": 1.322689075630252, "grad_norm": 1.3348450903633984, "learning_rate": 6.893016206268518e-06, "loss": 0.5917, "step": 262 }, { "epoch": 1.3277310924369747, "grad_norm": 1.3433706513738732, "learning_rate": 6.865757671344827e-06, "loss": 0.6672, "step": 263 }, { "epoch": 1.3327731092436974, "grad_norm": 1.2935787672149481, "learning_rate": 6.838434560410064e-06, "loss": 0.6701, "step": 264 }, { "epoch": 1.3378151260504203, "grad_norm": 1.3458569492608534, "learning_rate": 6.811047819148413e-06, "loss": 0.6647, "step": 265 }, { "epoch": 1.342857142857143, "grad_norm": 1.3814097147596185, "learning_rate": 6.783598395446371e-06, "loss": 0.6866, "step": 266 }, { "epoch": 1.3478991596638656, "grad_norm": 1.384769236934002, "learning_rate": 6.756087239359948e-06, "loss": 0.6058, "step": 267 }, { "epoch": 1.3529411764705883, "grad_norm": 1.4299755108319103, "learning_rate": 6.728515303081782e-06, "loss": 0.6608, "step": 268 }, { "epoch": 1.357983193277311, "grad_norm": 1.6844501725850975, "learning_rate": 6.700883540908185e-06, "loss": 0.6902, "step": 269 }, { "epoch": 1.3630252100840337, "grad_norm": 1.490837215727114, "learning_rate": 6.673192909206109e-06, "loss": 0.6622, "step": 270 }, { "epoch": 1.3680672268907563, "grad_norm": 1.5025542365103597, "learning_rate": 6.64544436638005e-06, "loss": 0.7318, "step": 271 }, { "epoch": 1.373109243697479, "grad_norm": 1.368007843570876, "learning_rate": 6.617638872838874e-06, "loss": 0.6616, "step": 272 }, { "epoch": 1.3781512605042017, "grad_norm": 1.3302784390410516, "learning_rate": 6.589777390962575e-06, "loss": 0.5837, "step": 273 }, { "epoch": 1.3831932773109243, "grad_norm": 1.3818583989196362, "learning_rate": 6.561860885068972e-06, "loss": 0.7319, "step": 274 }, { "epoch": 1.388235294117647, "grad_norm": 1.3678970576063487, "learning_rate": 6.53389032138032e-06, "loss": 0.6479, "step": 275 }, { "epoch": 1.3932773109243697, "grad_norm": 1.3918528373329961, "learning_rate": 6.505866667989884e-06, "loss": 0.6657, "step": 276 }, { "epoch": 1.3983193277310924, "grad_norm": 1.3578596611461975, "learning_rate": 6.477790894828422e-06, "loss": 0.6227, "step": 277 }, { "epoch": 1.403361344537815, "grad_norm": 1.37442116613121, "learning_rate": 6.449663973630613e-06, "loss": 0.668, "step": 278 }, { "epoch": 1.4084033613445377, "grad_norm": 1.251535744853749, "learning_rate": 6.421486877901436e-06, "loss": 0.6394, "step": 279 }, { "epoch": 1.4134453781512604, "grad_norm": 1.3817098557899696, "learning_rate": 6.393260582882462e-06, "loss": 0.7289, "step": 280 }, { "epoch": 1.4184873949579833, "grad_norm": 1.3924770743130575, "learning_rate": 6.364986065518106e-06, "loss": 0.6632, "step": 281 }, { "epoch": 1.423529411764706, "grad_norm": 1.3388647960669742, "learning_rate": 6.336664304421818e-06, "loss": 0.6445, "step": 282 }, { "epoch": 1.4285714285714286, "grad_norm": 1.3627824010774807, "learning_rate": 6.308296279842204e-06, "loss": 0.6785, "step": 283 }, { "epoch": 1.4336134453781513, "grad_norm": 1.2353887841733255, "learning_rate": 6.279882973629101e-06, "loss": 0.5987, "step": 284 }, { "epoch": 1.438655462184874, "grad_norm": 1.2803646798399686, "learning_rate": 6.2514253691996e-06, "loss": 0.6593, "step": 285 }, { "epoch": 1.4436974789915966, "grad_norm": 1.3106097252223476, "learning_rate": 6.222924451504001e-06, "loss": 0.6612, "step": 286 }, { "epoch": 1.4487394957983193, "grad_norm": 1.491149138722541, "learning_rate": 6.194381206991723e-06, "loss": 0.6603, "step": 287 }, { "epoch": 1.453781512605042, "grad_norm": 1.4729722170121724, "learning_rate": 6.165796623577171e-06, "loss": 0.6458, "step": 288 }, { "epoch": 1.4588235294117646, "grad_norm": 1.2583772868484708, "learning_rate": 6.1371716906055336e-06, "loss": 0.6571, "step": 289 }, { "epoch": 1.4638655462184875, "grad_norm": 1.6484902113991295, "learning_rate": 6.10850739881854e-06, "loss": 0.8048, "step": 290 }, { "epoch": 1.4689075630252102, "grad_norm": 1.1293948636395863, "learning_rate": 6.079804740320181e-06, "loss": 0.631, "step": 291 }, { "epoch": 1.4739495798319329, "grad_norm": 1.357543211738453, "learning_rate": 6.051064708542357e-06, "loss": 0.6834, "step": 292 }, { "epoch": 1.4789915966386555, "grad_norm": 1.422094283192291, "learning_rate": 6.022288298210502e-06, "loss": 0.7688, "step": 293 }, { "epoch": 1.4840336134453782, "grad_norm": 1.3320687626409005, "learning_rate": 5.993476505309154e-06, "loss": 0.6438, "step": 294 }, { "epoch": 1.4890756302521009, "grad_norm": 1.479155880731166, "learning_rate": 5.964630327047485e-06, "loss": 0.6983, "step": 295 }, { "epoch": 1.4941176470588236, "grad_norm": 1.4751670026359378, "learning_rate": 5.935750761824777e-06, "loss": 0.6784, "step": 296 }, { "epoch": 1.4991596638655462, "grad_norm": 1.3971166152312533, "learning_rate": 5.906838809195879e-06, "loss": 0.7934, "step": 297 }, { "epoch": 1.504201680672269, "grad_norm": 1.486282793941636, "learning_rate": 5.877895469836604e-06, "loss": 0.7149, "step": 298 }, { "epoch": 1.5092436974789916, "grad_norm": 1.3831360984251488, "learning_rate": 5.848921745509094e-06, "loss": 0.6853, "step": 299 }, { "epoch": 1.5142857142857142, "grad_norm": 1.373255418518971, "learning_rate": 5.819918639027149e-06, "loss": 0.6262, "step": 300 }, { "epoch": 1.519327731092437, "grad_norm": 1.398139776725886, "learning_rate": 5.790887154221521e-06, "loss": 0.6682, "step": 301 }, { "epoch": 1.5243697478991596, "grad_norm": 1.459786025141565, "learning_rate": 5.7618282959051685e-06, "loss": 0.6596, "step": 302 }, { "epoch": 1.5294117647058822, "grad_norm": 1.386843554966046, "learning_rate": 5.7327430698384775e-06, "loss": 0.662, "step": 303 }, { "epoch": 1.534453781512605, "grad_norm": 1.334093052658649, "learning_rate": 5.703632482694453e-06, "loss": 0.5642, "step": 304 }, { "epoch": 1.5394957983193276, "grad_norm": 1.394936799748242, "learning_rate": 5.674497542023875e-06, "loss": 0.6785, "step": 305 }, { "epoch": 1.5445378151260503, "grad_norm": 1.2487045092120568, "learning_rate": 5.645339256220427e-06, "loss": 0.6405, "step": 306 }, { "epoch": 1.5495798319327732, "grad_norm": 1.449626002944486, "learning_rate": 5.616158634485793e-06, "loss": 0.7186, "step": 307 }, { "epoch": 1.5546218487394958, "grad_norm": 1.3148115913009149, "learning_rate": 5.5869566867947344e-06, "loss": 0.6689, "step": 308 }, { "epoch": 1.5596638655462185, "grad_norm": 1.3031066852612374, "learning_rate": 5.557734423860122e-06, "loss": 0.6865, "step": 309 }, { "epoch": 1.5647058823529412, "grad_norm": 1.4070190634154978, "learning_rate": 5.528492857097966e-06, "loss": 0.692, "step": 310 }, { "epoch": 1.5697478991596638, "grad_norm": 1.424416347019562, "learning_rate": 5.499232998592399e-06, "loss": 0.6712, "step": 311 }, { "epoch": 1.5747899159663865, "grad_norm": 1.4045930546601455, "learning_rate": 5.469955861060653e-06, "loss": 0.692, "step": 312 }, { "epoch": 1.5798319327731094, "grad_norm": 1.4633924161825607, "learning_rate": 5.44066245781801e-06, "loss": 0.6972, "step": 313 }, { "epoch": 1.584873949579832, "grad_norm": 1.3419059215183884, "learning_rate": 5.4113538027427245e-06, "loss": 0.5832, "step": 314 }, { "epoch": 1.5899159663865547, "grad_norm": 1.4651690425379238, "learning_rate": 5.382030910240936e-06, "loss": 0.7263, "step": 315 }, { "epoch": 1.5949579831932774, "grad_norm": 1.3544416080791692, "learning_rate": 5.352694795211555e-06, "loss": 0.6693, "step": 316 }, { "epoch": 1.6, "grad_norm": 1.3796831843734638, "learning_rate": 5.3233464730111426e-06, "loss": 0.6843, "step": 317 }, { "epoch": 1.6050420168067228, "grad_norm": 1.3756368583869594, "learning_rate": 5.29398695941876e-06, "loss": 0.6956, "step": 318 }, { "epoch": 1.6100840336134454, "grad_norm": 1.354906917799083, "learning_rate": 5.2646172706008154e-06, "loss": 0.5865, "step": 319 }, { "epoch": 1.615126050420168, "grad_norm": 1.283604806155226, "learning_rate": 5.235238423075899e-06, "loss": 0.6476, "step": 320 }, { "epoch": 1.6201680672268908, "grad_norm": 1.3323430668544856, "learning_rate": 5.20585143367959e-06, "loss": 0.5978, "step": 321 }, { "epoch": 1.6252100840336134, "grad_norm": 1.4432636768429228, "learning_rate": 5.176457319529264e-06, "loss": 0.7229, "step": 322 }, { "epoch": 1.6302521008403361, "grad_norm": 1.3389659599587687, "learning_rate": 5.147057097988898e-06, "loss": 0.7036, "step": 323 }, { "epoch": 1.6352941176470588, "grad_norm": 1.40224689957347, "learning_rate": 5.1176517866338495e-06, "loss": 0.6524, "step": 324 }, { "epoch": 1.6403361344537815, "grad_norm": 1.448948508673923, "learning_rate": 5.088242403215644e-06, "loss": 0.6574, "step": 325 }, { "epoch": 1.6453781512605041, "grad_norm": 1.4336192786572701, "learning_rate": 5.058829965626742e-06, "loss": 0.6649, "step": 326 }, { "epoch": 1.6504201680672268, "grad_norm": 1.1551398885920936, "learning_rate": 5.029415491865311e-06, "loss": 0.6607, "step": 327 }, { "epoch": 1.6554621848739495, "grad_norm": 1.4081755117550179, "learning_rate": 5e-06, "loss": 0.6308, "step": 328 }, { "epoch": 1.6605042016806721, "grad_norm": 1.2962293823552042, "learning_rate": 4.97058450813469e-06, "loss": 0.6315, "step": 329 }, { "epoch": 1.6655462184873948, "grad_norm": 1.2609233329938516, "learning_rate": 4.94117003437326e-06, "loss": 0.6453, "step": 330 }, { "epoch": 1.6705882352941175, "grad_norm": 1.4395586718171531, "learning_rate": 4.911757596784358e-06, "loss": 0.7056, "step": 331 }, { "epoch": 1.6756302521008404, "grad_norm": 1.490647265803814, "learning_rate": 4.882348213366152e-06, "loss": 0.7463, "step": 332 }, { "epoch": 1.680672268907563, "grad_norm": 1.4744084173114673, "learning_rate": 4.8529429020111035e-06, "loss": 0.6518, "step": 333 }, { "epoch": 1.6857142857142857, "grad_norm": 1.3256051086606053, "learning_rate": 4.823542680470738e-06, "loss": 0.6322, "step": 334 }, { "epoch": 1.6907563025210084, "grad_norm": 1.4043201154667322, "learning_rate": 4.794148566320412e-06, "loss": 0.6623, "step": 335 }, { "epoch": 1.695798319327731, "grad_norm": 1.3058283187944708, "learning_rate": 4.7647615769241e-06, "loss": 0.7233, "step": 336 }, { "epoch": 1.7008403361344537, "grad_norm": 1.3709304051984876, "learning_rate": 4.7353827293991845e-06, "loss": 0.7237, "step": 337 }, { "epoch": 1.7058823529411766, "grad_norm": 1.3476441152074792, "learning_rate": 4.706013040581242e-06, "loss": 0.6408, "step": 338 }, { "epoch": 1.7109243697478993, "grad_norm": 1.4435937624188804, "learning_rate": 4.676653526988858e-06, "loss": 0.6647, "step": 339 }, { "epoch": 1.715966386554622, "grad_norm": 1.3226553142476545, "learning_rate": 4.647305204788445e-06, "loss": 0.6489, "step": 340 }, { "epoch": 1.7210084033613446, "grad_norm": 1.3388051536697478, "learning_rate": 4.617969089759066e-06, "loss": 0.6414, "step": 341 }, { "epoch": 1.7260504201680673, "grad_norm": 1.369018029455846, "learning_rate": 4.588646197257278e-06, "loss": 0.6535, "step": 342 }, { "epoch": 1.73109243697479, "grad_norm": 1.4137443784434733, "learning_rate": 4.559337542181993e-06, "loss": 0.6446, "step": 343 }, { "epoch": 1.7361344537815127, "grad_norm": 1.3718987426836817, "learning_rate": 4.53004413893935e-06, "loss": 0.6477, "step": 344 }, { "epoch": 1.7411764705882353, "grad_norm": 1.262236928246166, "learning_rate": 4.500767001407604e-06, "loss": 0.6059, "step": 345 }, { "epoch": 1.746218487394958, "grad_norm": 1.3613528737566392, "learning_rate": 4.471507142902036e-06, "loss": 0.6545, "step": 346 }, { "epoch": 1.7512605042016807, "grad_norm": 1.303211681985445, "learning_rate": 4.4422655761398785e-06, "loss": 0.633, "step": 347 }, { "epoch": 1.7563025210084033, "grad_norm": 1.3262900181605304, "learning_rate": 4.413043313205266e-06, "loss": 0.6873, "step": 348 }, { "epoch": 1.761344537815126, "grad_norm": 1.5014706286550592, "learning_rate": 4.383841365514208e-06, "loss": 0.6715, "step": 349 }, { "epoch": 1.7663865546218487, "grad_norm": 1.3748458240376293, "learning_rate": 4.354660743779575e-06, "loss": 0.6322, "step": 350 }, { "epoch": 1.7714285714285714, "grad_norm": 1.3200606309946945, "learning_rate": 4.325502457976126e-06, "loss": 0.6468, "step": 351 }, { "epoch": 1.776470588235294, "grad_norm": 1.4363798100469027, "learning_rate": 4.296367517305548e-06, "loss": 0.6424, "step": 352 }, { "epoch": 1.7815126050420167, "grad_norm": 1.3665833844005753, "learning_rate": 4.267256930161523e-06, "loss": 0.6895, "step": 353 }, { "epoch": 1.7865546218487394, "grad_norm": 1.3126702843544444, "learning_rate": 4.238171704094833e-06, "loss": 0.6766, "step": 354 }, { "epoch": 1.791596638655462, "grad_norm": 1.3931998076257006, "learning_rate": 4.209112845778481e-06, "loss": 0.7165, "step": 355 }, { "epoch": 1.7966386554621847, "grad_norm": 1.4120182498478362, "learning_rate": 4.180081360972852e-06, "loss": 0.6909, "step": 356 }, { "epoch": 1.8016806722689076, "grad_norm": 1.3825157448385343, "learning_rate": 4.151078254490908e-06, "loss": 0.6634, "step": 357 }, { "epoch": 1.8067226890756303, "grad_norm": 1.2976324503271779, "learning_rate": 4.122104530163397e-06, "loss": 0.6482, "step": 358 }, { "epoch": 1.811764705882353, "grad_norm": 1.3371821093594873, "learning_rate": 4.09316119080412e-06, "loss": 0.5939, "step": 359 }, { "epoch": 1.8168067226890756, "grad_norm": 1.2815723486743216, "learning_rate": 4.064249238175223e-06, "loss": 0.5873, "step": 360 }, { "epoch": 1.8218487394957983, "grad_norm": 1.2598876616725718, "learning_rate": 4.035369672952516e-06, "loss": 0.6211, "step": 361 }, { "epoch": 1.826890756302521, "grad_norm": 1.3775558524100238, "learning_rate": 4.0065234946908456e-06, "loss": 0.6362, "step": 362 }, { "epoch": 1.8319327731092439, "grad_norm": 1.3605455122282684, "learning_rate": 3.977711701789499e-06, "loss": 0.6173, "step": 363 }, { "epoch": 1.8369747899159665, "grad_norm": 1.2800072707024852, "learning_rate": 3.948935291457645e-06, "loss": 0.6325, "step": 364 }, { "epoch": 1.8420168067226892, "grad_norm": 1.3258336050686086, "learning_rate": 3.920195259679822e-06, "loss": 0.653, "step": 365 }, { "epoch": 1.8470588235294119, "grad_norm": 1.3413446326047822, "learning_rate": 3.891492601181462e-06, "loss": 0.651, "step": 366 }, { "epoch": 1.8521008403361345, "grad_norm": 1.41115994835795, "learning_rate": 3.862828309394469e-06, "loss": 0.6292, "step": 367 }, { "epoch": 1.8571428571428572, "grad_norm": 1.3205359045412157, "learning_rate": 3.834203376422831e-06, "loss": 0.6064, "step": 368 }, { "epoch": 1.8621848739495799, "grad_norm": 1.271016774529, "learning_rate": 3.805618793008279e-06, "loss": 0.6503, "step": 369 }, { "epoch": 1.8672268907563025, "grad_norm": 1.38208148943542, "learning_rate": 3.777075548496001e-06, "loss": 0.673, "step": 370 }, { "epoch": 1.8722689075630252, "grad_norm": 1.4627608316199674, "learning_rate": 3.7485746308004013e-06, "loss": 0.6853, "step": 371 }, { "epoch": 1.877310924369748, "grad_norm": 1.2952312321525565, "learning_rate": 3.7201170263709004e-06, "loss": 0.6164, "step": 372 }, { "epoch": 1.8823529411764706, "grad_norm": 1.4840833764786416, "learning_rate": 3.6917037201577977e-06, "loss": 0.6935, "step": 373 }, { "epoch": 1.8873949579831932, "grad_norm": 1.371096887673559, "learning_rate": 3.6633356955781827e-06, "loss": 0.6571, "step": 374 }, { "epoch": 1.892436974789916, "grad_norm": 1.1787569156110669, "learning_rate": 3.635013934481895e-06, "loss": 0.5976, "step": 375 }, { "epoch": 1.8974789915966386, "grad_norm": 1.292415912438797, "learning_rate": 3.6067394171175397e-06, "loss": 0.662, "step": 376 }, { "epoch": 1.9025210084033612, "grad_norm": 1.4004270726912136, "learning_rate": 3.578513122098566e-06, "loss": 0.6902, "step": 377 }, { "epoch": 1.907563025210084, "grad_norm": 1.3676893820953542, "learning_rate": 3.5503360263693887e-06, "loss": 0.6736, "step": 378 }, { "epoch": 1.9126050420168066, "grad_norm": 1.5497019666472422, "learning_rate": 3.5222091051715803e-06, "loss": 0.6474, "step": 379 }, { "epoch": 1.9176470588235293, "grad_norm": 1.4107058784966016, "learning_rate": 3.4941333320101173e-06, "loss": 0.6214, "step": 380 }, { "epoch": 1.9226890756302522, "grad_norm": 1.3074693513299003, "learning_rate": 3.466109678619681e-06, "loss": 0.5863, "step": 381 }, { "epoch": 1.9277310924369748, "grad_norm": 1.2533065740051568, "learning_rate": 3.4381391149310294e-06, "loss": 0.6145, "step": 382 }, { "epoch": 1.9327731092436975, "grad_norm": 1.279932965905714, "learning_rate": 3.4102226090374246e-06, "loss": 0.6138, "step": 383 }, { "epoch": 1.9378151260504202, "grad_norm": 1.279194036152673, "learning_rate": 3.3823611271611266e-06, "loss": 0.6051, "step": 384 }, { "epoch": 1.9428571428571428, "grad_norm": 1.4523883672700335, "learning_rate": 3.35455563361995e-06, "loss": 0.6475, "step": 385 }, { "epoch": 1.9478991596638655, "grad_norm": 1.319917640705539, "learning_rate": 3.3268070907938915e-06, "loss": 0.575, "step": 386 }, { "epoch": 1.9529411764705882, "grad_norm": 1.356219744351625, "learning_rate": 3.2991164590918162e-06, "loss": 0.6707, "step": 387 }, { "epoch": 1.957983193277311, "grad_norm": 1.3980927144998019, "learning_rate": 3.271484696918218e-06, "loss": 0.62, "step": 388 }, { "epoch": 1.9630252100840337, "grad_norm": 1.3412194145756722, "learning_rate": 3.2439127606400546e-06, "loss": 0.6249, "step": 389 }, { "epoch": 1.9680672268907564, "grad_norm": 1.231905550971943, "learning_rate": 3.2164016045536306e-06, "loss": 0.6542, "step": 390 }, { "epoch": 1.973109243697479, "grad_norm": 1.3549695794420435, "learning_rate": 3.1889521808515888e-06, "loss": 0.6176, "step": 391 }, { "epoch": 1.9781512605042018, "grad_norm": 1.415166811994311, "learning_rate": 3.1615654395899377e-06, "loss": 0.6593, "step": 392 }, { "epoch": 1.9831932773109244, "grad_norm": 1.3126591809141124, "learning_rate": 3.1342423286551756e-06, "loss": 0.6891, "step": 393 }, { "epoch": 1.988235294117647, "grad_norm": 1.3842054436860431, "learning_rate": 3.1069837937314846e-06, "loss": 0.6342, "step": 394 }, { "epoch": 1.9932773109243698, "grad_norm": 1.4424046044230687, "learning_rate": 3.0797907782679944e-06, "loss": 0.6461, "step": 395 }, { "epoch": 1.9983193277310924, "grad_norm": 1.3718751038472339, "learning_rate": 3.0526642234461313e-06, "loss": 0.6338, "step": 396 }, { "epoch": 2.0050420168067227, "grad_norm": 3.363833604785768, "learning_rate": 3.0256050681470446e-06, "loss": 1.2006, "step": 397 }, { "epoch": 2.0100840336134453, "grad_norm": 1.410375521884215, "learning_rate": 2.9986142489191074e-06, "loss": 0.5121, "step": 398 }, { "epoch": 2.015126050420168, "grad_norm": 1.463355598251907, "learning_rate": 2.971692699945502e-06, "loss": 0.4394, "step": 399 }, { "epoch": 2.0201680672268907, "grad_norm": 1.2914998337098158, "learning_rate": 2.9448413530118912e-06, "loss": 0.4978, "step": 400 }, { "epoch": 2.0252100840336134, "grad_norm": 1.3604150815997402, "learning_rate": 2.9180611374741623e-06, "loss": 0.4689, "step": 401 }, { "epoch": 2.030252100840336, "grad_norm": 1.1964953052023972, "learning_rate": 2.891352980226262e-06, "loss": 0.5015, "step": 402 }, { "epoch": 2.0352941176470587, "grad_norm": 1.1694739760631343, "learning_rate": 2.8647178056681197e-06, "loss": 0.447, "step": 403 }, { "epoch": 2.0403361344537814, "grad_norm": 1.3174590682003549, "learning_rate": 2.838156535673652e-06, "loss": 0.414, "step": 404 }, { "epoch": 2.045378151260504, "grad_norm": 1.2140198128144435, "learning_rate": 2.8116700895588473e-06, "loss": 0.4505, "step": 405 }, { "epoch": 2.0504201680672267, "grad_norm": 1.3398119898455612, "learning_rate": 2.785259384049959e-06, "loss": 0.4532, "step": 406 }, { "epoch": 2.0554621848739494, "grad_norm": 1.4229930176202614, "learning_rate": 2.7589253332517736e-06, "loss": 0.5546, "step": 407 }, { "epoch": 2.060504201680672, "grad_norm": 1.4684509907326317, "learning_rate": 2.7326688486159613e-06, "loss": 0.5254, "step": 408 }, { "epoch": 2.065546218487395, "grad_norm": 1.4962520925453975, "learning_rate": 2.706490838909547e-06, "loss": 0.4673, "step": 409 }, { "epoch": 2.070588235294118, "grad_norm": 1.3630229586386085, "learning_rate": 2.680392210183446e-06, "loss": 0.4473, "step": 410 }, { "epoch": 2.0756302521008405, "grad_norm": 1.38978907137299, "learning_rate": 2.6543738657411033e-06, "loss": 0.5159, "step": 411 }, { "epoch": 2.080672268907563, "grad_norm": 1.429662885547244, "learning_rate": 2.628436706107238e-06, "loss": 0.5161, "step": 412 }, { "epoch": 2.085714285714286, "grad_norm": 1.394356185017467, "learning_rate": 2.6025816289966703e-06, "loss": 0.5032, "step": 413 }, { "epoch": 2.0907563025210085, "grad_norm": 1.480088664868798, "learning_rate": 2.5768095292832412e-06, "loss": 0.4802, "step": 414 }, { "epoch": 2.095798319327731, "grad_norm": 1.3859048551297604, "learning_rate": 2.5511212989688587e-06, "loss": 0.4993, "step": 415 }, { "epoch": 2.100840336134454, "grad_norm": 1.440430022618694, "learning_rate": 2.525517827152614e-06, "loss": 0.4551, "step": 416 }, { "epoch": 2.1058823529411765, "grad_norm": 1.4332550806993916, "learning_rate": 2.5000000000000015e-06, "loss": 0.5611, "step": 417 }, { "epoch": 2.110924369747899, "grad_norm": 1.3161188350792523, "learning_rate": 2.4745687007122636e-06, "loss": 0.4602, "step": 418 }, { "epoch": 2.115966386554622, "grad_norm": 1.4145836319136063, "learning_rate": 2.449224809495815e-06, "loss": 0.4464, "step": 419 }, { "epoch": 2.1210084033613446, "grad_norm": 1.3638972016864883, "learning_rate": 2.423969203531768e-06, "loss": 0.4625, "step": 420 }, { "epoch": 2.1260504201680672, "grad_norm": 1.4282920146552893, "learning_rate": 2.3988027569455895e-06, "loss": 0.4809, "step": 421 }, { "epoch": 2.13109243697479, "grad_norm": 1.452704091304085, "learning_rate": 2.373726340776837e-06, "loss": 0.4959, "step": 422 }, { "epoch": 2.1361344537815126, "grad_norm": 1.4474065940760683, "learning_rate": 2.348740822949006e-06, "loss": 0.4557, "step": 423 }, { "epoch": 2.1411764705882352, "grad_norm": 1.406883162238408, "learning_rate": 2.323847068239504e-06, "loss": 0.5069, "step": 424 }, { "epoch": 2.146218487394958, "grad_norm": 1.4713827636564831, "learning_rate": 2.2990459382497086e-06, "loss": 0.4813, "step": 425 }, { "epoch": 2.1512605042016806, "grad_norm": 1.4582227343532888, "learning_rate": 2.274338291375147e-06, "loss": 0.462, "step": 426 }, { "epoch": 2.1563025210084033, "grad_norm": 1.353197229608169, "learning_rate": 2.2497249827757933e-06, "loss": 0.4658, "step": 427 }, { "epoch": 2.161344537815126, "grad_norm": 1.3550947330778897, "learning_rate": 2.225206864346465e-06, "loss": 0.5794, "step": 428 }, { "epoch": 2.1663865546218486, "grad_norm": 1.4137143069445475, "learning_rate": 2.2007847846873342e-06, "loss": 0.4722, "step": 429 }, { "epoch": 2.1714285714285713, "grad_norm": 1.2932234077066185, "learning_rate": 2.176459589074566e-06, "loss": 0.4369, "step": 430 }, { "epoch": 2.176470588235294, "grad_norm": 1.3725308971047603, "learning_rate": 2.1522321194310577e-06, "loss": 0.4958, "step": 431 }, { "epoch": 2.1815126050420166, "grad_norm": 1.4324324040918073, "learning_rate": 2.1281032142972933e-06, "loss": 0.4954, "step": 432 }, { "epoch": 2.1865546218487397, "grad_norm": 1.4153168395436235, "learning_rate": 2.1040737088023323e-06, "loss": 0.4457, "step": 433 }, { "epoch": 2.1915966386554624, "grad_norm": 1.3341155055487035, "learning_rate": 2.080144434634898e-06, "loss": 0.5017, "step": 434 }, { "epoch": 2.196638655462185, "grad_norm": 1.352939614197411, "learning_rate": 2.056316220014588e-06, "loss": 0.4553, "step": 435 }, { "epoch": 2.2016806722689077, "grad_norm": 1.393182470026338, "learning_rate": 2.0325898896632178e-06, "loss": 0.4448, "step": 436 }, { "epoch": 2.2067226890756304, "grad_norm": 1.4033955608191793, "learning_rate": 2.0089662647762716e-06, "loss": 0.441, "step": 437 }, { "epoch": 2.211764705882353, "grad_norm": 1.41226298350313, "learning_rate": 1.9854461629944764e-06, "loss": 0.4656, "step": 438 }, { "epoch": 2.2168067226890757, "grad_norm": 1.3512621478929514, "learning_rate": 1.962030398375506e-06, "loss": 0.5245, "step": 439 }, { "epoch": 2.2218487394957984, "grad_norm": 1.3932479184910864, "learning_rate": 1.9387197813658092e-06, "loss": 0.456, "step": 440 }, { "epoch": 2.226890756302521, "grad_norm": 1.3400595100259751, "learning_rate": 1.915515118772555e-06, "loss": 0.4622, "step": 441 }, { "epoch": 2.2319327731092438, "grad_norm": 1.3239101426319217, "learning_rate": 1.8924172137357038e-06, "loss": 0.4821, "step": 442 }, { "epoch": 2.2369747899159664, "grad_norm": 1.4028557110251756, "learning_rate": 1.8694268657002197e-06, "loss": 0.4592, "step": 443 }, { "epoch": 2.242016806722689, "grad_norm": 1.4043326661254716, "learning_rate": 1.8465448703883959e-06, "loss": 0.4642, "step": 444 }, { "epoch": 2.2470588235294118, "grad_norm": 1.4748018123002309, "learning_rate": 1.8237720197723075e-06, "loss": 0.5244, "step": 445 }, { "epoch": 2.2521008403361344, "grad_norm": 1.3653204295657917, "learning_rate": 1.8011091020464138e-06, "loss": 0.5117, "step": 446 }, { "epoch": 2.257142857142857, "grad_norm": 1.4578979263769525, "learning_rate": 1.7785569016002686e-06, "loss": 0.4622, "step": 447 }, { "epoch": 2.26218487394958, "grad_norm": 1.4739147697577966, "learning_rate": 1.75611619899137e-06, "loss": 0.4524, "step": 448 }, { "epoch": 2.2672268907563025, "grad_norm": 1.3465934593186815, "learning_rate": 1.7337877709181527e-06, "loss": 0.4616, "step": 449 }, { "epoch": 2.272268907563025, "grad_norm": 1.4287084373091115, "learning_rate": 1.711572390193102e-06, "loss": 0.6594, "step": 450 }, { "epoch": 2.277310924369748, "grad_norm": 1.3274840093520053, "learning_rate": 1.689470825715998e-06, "loss": 0.4529, "step": 451 }, { "epoch": 2.2823529411764705, "grad_norm": 1.4216422105253623, "learning_rate": 1.6674838424473172e-06, "loss": 0.4655, "step": 452 }, { "epoch": 2.287394957983193, "grad_norm": 1.452303728671861, "learning_rate": 1.6456122013817477e-06, "loss": 0.4625, "step": 453 }, { "epoch": 2.292436974789916, "grad_norm": 1.4369743256615972, "learning_rate": 1.6238566595218475e-06, "loss": 0.4761, "step": 454 }, { "epoch": 2.2974789915966385, "grad_norm": 1.407023006658543, "learning_rate": 1.6022179698518525e-06, "loss": 0.4505, "step": 455 }, { "epoch": 2.302521008403361, "grad_norm": 1.391039540718536, "learning_rate": 1.580696881311611e-06, "loss": 0.4894, "step": 456 }, { "epoch": 2.307563025210084, "grad_norm": 1.3557281771597436, "learning_rate": 1.5592941387706562e-06, "loss": 0.4108, "step": 457 }, { "epoch": 2.3126050420168065, "grad_norm": 1.3010131467886796, "learning_rate": 1.538010483002435e-06, "loss": 0.425, "step": 458 }, { "epoch": 2.317647058823529, "grad_norm": 1.3625069219769537, "learning_rate": 1.5168466506586654e-06, "loss": 0.4431, "step": 459 }, { "epoch": 2.3226890756302523, "grad_norm": 1.2997097389936179, "learning_rate": 1.4958033742438348e-06, "loss": 0.4058, "step": 460 }, { "epoch": 2.327731092436975, "grad_norm": 1.3546221586310845, "learning_rate": 1.4748813820898554e-06, "loss": 0.5043, "step": 461 }, { "epoch": 2.3327731092436976, "grad_norm": 1.3503940282999218, "learning_rate": 1.454081398330855e-06, "loss": 0.5015, "step": 462 }, { "epoch": 2.3378151260504203, "grad_norm": 1.2879127697899735, "learning_rate": 1.4334041428781003e-06, "loss": 0.4219, "step": 463 }, { "epoch": 2.342857142857143, "grad_norm": 1.5900890446730591, "learning_rate": 1.4128503313951008e-06, "loss": 0.5508, "step": 464 }, { "epoch": 2.3478991596638656, "grad_norm": 1.4693275041182954, "learning_rate": 1.3924206752728282e-06, "loss": 0.5196, "step": 465 }, { "epoch": 2.3529411764705883, "grad_norm": 1.3739526563603481, "learning_rate": 1.3721158816050872e-06, "loss": 0.5223, "step": 466 }, { "epoch": 2.357983193277311, "grad_norm": 1.2888756368302696, "learning_rate": 1.3519366531640589e-06, "loss": 0.4745, "step": 467 }, { "epoch": 2.3630252100840337, "grad_norm": 1.3646861171520672, "learning_rate": 1.3318836883759634e-06, "loss": 0.4765, "step": 468 }, { "epoch": 2.3680672268907563, "grad_norm": 1.3876282049959663, "learning_rate": 1.3119576812968893e-06, "loss": 0.4552, "step": 469 }, { "epoch": 2.373109243697479, "grad_norm": 1.3212811305037033, "learning_rate": 1.292159321588778e-06, "loss": 0.4444, "step": 470 }, { "epoch": 2.3781512605042017, "grad_norm": 1.4025656868262555, "learning_rate": 1.272489294495548e-06, "loss": 0.5373, "step": 471 }, { "epoch": 2.3831932773109243, "grad_norm": 1.3992039142572703, "learning_rate": 1.252948280819375e-06, "loss": 0.4297, "step": 472 }, { "epoch": 2.388235294117647, "grad_norm": 1.438194701698973, "learning_rate": 1.2335369568971362e-06, "loss": 0.4577, "step": 473 }, { "epoch": 2.3932773109243697, "grad_norm": 1.3560235059252677, "learning_rate": 1.2142559945769995e-06, "loss": 0.4576, "step": 474 }, { "epoch": 2.3983193277310924, "grad_norm": 1.357949004614199, "learning_rate": 1.1951060611951615e-06, "loss": 0.5944, "step": 475 }, { "epoch": 2.403361344537815, "grad_norm": 1.2895013043643404, "learning_rate": 1.1760878195527642e-06, "loss": 0.4192, "step": 476 }, { "epoch": 2.4084033613445377, "grad_norm": 1.2608640104913673, "learning_rate": 1.1572019278929457e-06, "loss": 0.4431, "step": 477 }, { "epoch": 2.4134453781512604, "grad_norm": 1.4235058216914491, "learning_rate": 1.1384490398780563e-06, "loss": 0.4835, "step": 478 }, { "epoch": 2.418487394957983, "grad_norm": 1.3849158950764375, "learning_rate": 1.1198298045670402e-06, "loss": 0.4497, "step": 479 }, { "epoch": 2.4235294117647057, "grad_norm": 1.4243621054419897, "learning_rate": 1.1013448663929704e-06, "loss": 0.5031, "step": 480 }, { "epoch": 2.4285714285714284, "grad_norm": 1.2997464135987702, "learning_rate": 1.0829948651407374e-06, "loss": 0.483, "step": 481 }, { "epoch": 2.4336134453781515, "grad_norm": 1.2887117802326669, "learning_rate": 1.0647804359249143e-06, "loss": 0.4424, "step": 482 }, { "epoch": 2.438655462184874, "grad_norm": 1.2955280324064098, "learning_rate": 1.0467022091677692e-06, "loss": 0.4963, "step": 483 }, { "epoch": 2.443697478991597, "grad_norm": 1.5695989821047664, "learning_rate": 1.0287608105774456e-06, "loss": 0.512, "step": 484 }, { "epoch": 2.4487394957983195, "grad_norm": 1.3900121464168351, "learning_rate": 1.0109568611263094e-06, "loss": 0.4418, "step": 485 }, { "epoch": 2.453781512605042, "grad_norm": 1.443290081700745, "learning_rate": 9.932909770294542e-07, "loss": 0.4439, "step": 486 }, { "epoch": 2.458823529411765, "grad_norm": 1.3476484251272791, "learning_rate": 9.757637697233723e-07, "loss": 0.4885, "step": 487 }, { "epoch": 2.4638655462184875, "grad_norm": 1.3389474168899225, "learning_rate": 9.58375845844793e-07, "loss": 0.4486, "step": 488 }, { "epoch": 2.46890756302521, "grad_norm": 1.2353966317116258, "learning_rate": 9.41127807209688e-07, "loss": 0.4321, "step": 489 }, { "epoch": 2.473949579831933, "grad_norm": 1.2849383161233021, "learning_rate": 9.240202507924412e-07, "loss": 0.433, "step": 490 }, { "epoch": 2.4789915966386555, "grad_norm": 1.3336087651970685, "learning_rate": 9.070537687051817e-07, "loss": 0.4516, "step": 491 }, { "epoch": 2.484033613445378, "grad_norm": 1.3550057200939567, "learning_rate": 8.902289481772996e-07, "loss": 0.4616, "step": 492 }, { "epoch": 2.489075630252101, "grad_norm": 1.3590095983206505, "learning_rate": 8.735463715351139e-07, "loss": 0.4203, "step": 493 }, { "epoch": 2.4941176470588236, "grad_norm": 1.2915320514796769, "learning_rate": 8.570066161817176e-07, "loss": 0.4503, "step": 494 }, { "epoch": 2.499159663865546, "grad_norm": 1.2679676777389248, "learning_rate": 8.406102545769989e-07, "loss": 0.4566, "step": 495 }, { "epoch": 2.504201680672269, "grad_norm": 1.426642729326135, "learning_rate": 8.243578542178227e-07, "loss": 0.4707, "step": 496 }, { "epoch": 2.5092436974789916, "grad_norm": 1.4592108582229681, "learning_rate": 8.082499776183883e-07, "loss": 0.4845, "step": 497 }, { "epoch": 2.5142857142857142, "grad_norm": 1.5266839034291377, "learning_rate": 7.922871822907641e-07, "loss": 0.5228, "step": 498 }, { "epoch": 2.519327731092437, "grad_norm": 1.471645595600825, "learning_rate": 7.764700207255904e-07, "loss": 0.4173, "step": 499 }, { "epoch": 2.5243697478991596, "grad_norm": 1.3871858021840573, "learning_rate": 7.607990403729526e-07, "loss": 0.4601, "step": 500 }, { "epoch": 2.5294117647058822, "grad_norm": 1.3138350820905274, "learning_rate": 7.452747836234392e-07, "loss": 0.4504, "step": 501 }, { "epoch": 2.534453781512605, "grad_norm": 1.2975304324598231, "learning_rate": 7.298977877893688e-07, "loss": 0.4265, "step": 502 }, { "epoch": 2.5394957983193276, "grad_norm": 1.3447001192643702, "learning_rate": 7.146685850861851e-07, "loss": 0.466, "step": 503 }, { "epoch": 2.5445378151260503, "grad_norm": 1.3862420743153665, "learning_rate": 6.995877026140468e-07, "loss": 0.4884, "step": 504 }, { "epoch": 2.549579831932773, "grad_norm": 1.4032983423284162, "learning_rate": 6.846556623395795e-07, "loss": 0.4948, "step": 505 }, { "epoch": 2.5546218487394956, "grad_norm": 1.362120295068725, "learning_rate": 6.698729810778065e-07, "loss": 0.4702, "step": 506 }, { "epoch": 2.5596638655462183, "grad_norm": 1.389808913275814, "learning_rate": 6.552401704742678e-07, "loss": 0.4825, "step": 507 }, { "epoch": 2.564705882352941, "grad_norm": 1.2860994495581453, "learning_rate": 6.40757736987307e-07, "loss": 0.4321, "step": 508 }, { "epoch": 2.5697478991596636, "grad_norm": 1.212606448511892, "learning_rate": 6.26426181870542e-07, "loss": 0.3868, "step": 509 }, { "epoch": 2.5747899159663863, "grad_norm": 1.2670489383748516, "learning_rate": 6.122460011555187e-07, "loss": 0.4532, "step": 510 }, { "epoch": 2.5798319327731094, "grad_norm": 1.3801554590726837, "learning_rate": 5.982176856345445e-07, "loss": 0.4263, "step": 511 }, { "epoch": 2.584873949579832, "grad_norm": 1.3394504151016333, "learning_rate": 5.843417208436908e-07, "loss": 0.496, "step": 512 }, { "epoch": 2.5899159663865547, "grad_norm": 1.2955707760211432, "learning_rate": 5.706185870460018e-07, "loss": 0.4253, "step": 513 }, { "epoch": 2.5949579831932774, "grad_norm": 1.289481906227215, "learning_rate": 5.570487592148666e-07, "loss": 0.4035, "step": 514 }, { "epoch": 2.6, "grad_norm": 1.3376266312340062, "learning_rate": 5.436327070175729e-07, "loss": 0.4545, "step": 515 }, { "epoch": 2.6050420168067228, "grad_norm": 1.4001675009701846, "learning_rate": 5.303708947990638e-07, "loss": 0.4684, "step": 516 }, { "epoch": 2.6100840336134454, "grad_norm": 1.4896915805848956, "learning_rate": 5.172637815658583e-07, "loss": 0.4704, "step": 517 }, { "epoch": 2.615126050420168, "grad_norm": 1.430686916061002, "learning_rate": 5.04311820970163e-07, "loss": 0.4782, "step": 518 }, { "epoch": 2.6201680672268908, "grad_norm": 1.3676105828350056, "learning_rate": 4.915154612941781e-07, "loss": 0.5979, "step": 519 }, { "epoch": 2.6252100840336134, "grad_norm": 1.3552413071380474, "learning_rate": 4.788751454345763e-07, "loss": 0.4405, "step": 520 }, { "epoch": 2.630252100840336, "grad_norm": 1.320913107468769, "learning_rate": 4.663913108871726e-07, "loss": 0.4105, "step": 521 }, { "epoch": 2.635294117647059, "grad_norm": 1.2848967010536776, "learning_rate": 4.540643897317887e-07, "loss": 0.3934, "step": 522 }, { "epoch": 2.6403361344537815, "grad_norm": 1.3500509189164658, "learning_rate": 4.4189480861729137e-07, "loss": 0.4339, "step": 523 }, { "epoch": 2.645378151260504, "grad_norm": 1.3387080610453355, "learning_rate": 4.2988298874682754e-07, "loss": 0.4552, "step": 524 }, { "epoch": 2.650420168067227, "grad_norm": 1.3397812410356982, "learning_rate": 4.1802934586324897e-07, "loss": 0.5401, "step": 525 }, { "epoch": 2.6554621848739495, "grad_norm": 1.446011629760243, "learning_rate": 4.0633429023472004e-07, "loss": 0.5409, "step": 526 }, { "epoch": 2.660504201680672, "grad_norm": 1.3710949034220614, "learning_rate": 3.947982266405159e-07, "loss": 0.501, "step": 527 }, { "epoch": 2.665546218487395, "grad_norm": 1.5073033115483478, "learning_rate": 3.834215543570191e-07, "loss": 0.5156, "step": 528 }, { "epoch": 2.6705882352941175, "grad_norm": 1.3549599833015573, "learning_rate": 3.72204667143895e-07, "loss": 0.4667, "step": 529 }, { "epoch": 2.6756302521008406, "grad_norm": 1.368632751852017, "learning_rate": 3.611479532304618e-07, "loss": 0.4596, "step": 530 }, { "epoch": 2.6806722689075633, "grad_norm": 1.3310734620781681, "learning_rate": 3.5025179530225995e-07, "loss": 0.4248, "step": 531 }, { "epoch": 2.685714285714286, "grad_norm": 1.429961991715737, "learning_rate": 3.395165704878023e-07, "loss": 0.4921, "step": 532 }, { "epoch": 2.6907563025210086, "grad_norm": 1.3220689464603654, "learning_rate": 3.289426503455201e-07, "loss": 0.4686, "step": 533 }, { "epoch": 2.6957983193277313, "grad_norm": 1.3596446823078556, "learning_rate": 3.185304008509077e-07, "loss": 0.4692, "step": 534 }, { "epoch": 2.700840336134454, "grad_norm": 1.2664017870580138, "learning_rate": 3.082801823838527e-07, "loss": 0.4792, "step": 535 }, { "epoch": 2.7058823529411766, "grad_norm": 1.277008676617942, "learning_rate": 2.9819234971616154e-07, "loss": 0.4496, "step": 536 }, { "epoch": 2.7109243697478993, "grad_norm": 1.3031675483473417, "learning_rate": 2.882672519992824e-07, "loss": 0.4599, "step": 537 }, { "epoch": 2.715966386554622, "grad_norm": 1.475285425023621, "learning_rate": 2.785052327522214e-07, "loss": 0.5562, "step": 538 }, { "epoch": 2.7210084033613446, "grad_norm": 1.2387397112349467, "learning_rate": 2.6890662984965234e-07, "loss": 0.4508, "step": 539 }, { "epoch": 2.7260504201680673, "grad_norm": 1.2769755883493084, "learning_rate": 2.594717755102205e-07, "loss": 0.4497, "step": 540 }, { "epoch": 2.73109243697479, "grad_norm": 1.4117553058680856, "learning_rate": 2.5020099628504603e-07, "loss": 0.4176, "step": 541 }, { "epoch": 2.7361344537815127, "grad_norm": 1.3430474164461437, "learning_rate": 2.4109461304642254e-07, "loss": 0.61, "step": 542 }, { "epoch": 2.7411764705882353, "grad_norm": 1.319429861827343, "learning_rate": 2.3215294097670927e-07, "loss": 0.4451, "step": 543 }, { "epoch": 2.746218487394958, "grad_norm": 1.436920605125832, "learning_rate": 2.2337628955742263e-07, "loss": 0.4874, "step": 544 }, { "epoch": 2.7512605042016807, "grad_norm": 1.3812471581213166, "learning_rate": 2.1476496255852685e-07, "loss": 0.382, "step": 545 }, { "epoch": 2.7563025210084033, "grad_norm": 1.205494792014491, "learning_rate": 2.0631925802791608e-07, "loss": 0.5224, "step": 546 }, { "epoch": 2.761344537815126, "grad_norm": 1.3083334014447827, "learning_rate": 1.9803946828110376e-07, "loss": 0.5117, "step": 547 }, { "epoch": 2.7663865546218487, "grad_norm": 1.3758887119834913, "learning_rate": 1.8992587989110133e-07, "loss": 0.4898, "step": 548 }, { "epoch": 2.7714285714285714, "grad_norm": 1.3436017213466456, "learning_rate": 1.8197877367849948e-07, "loss": 0.5596, "step": 549 }, { "epoch": 2.776470588235294, "grad_norm": 1.4507659924194913, "learning_rate": 1.7419842470175196e-07, "loss": 0.4889, "step": 550 }, { "epoch": 2.7815126050420167, "grad_norm": 1.5070411133243147, "learning_rate": 1.6658510224765333e-07, "loss": 0.47, "step": 551 }, { "epoch": 2.7865546218487394, "grad_norm": 1.3934953281445221, "learning_rate": 1.5913906982201744e-07, "loss": 0.4626, "step": 552 }, { "epoch": 2.791596638655462, "grad_norm": 1.4300047982632422, "learning_rate": 1.5186058514055912e-07, "loss": 0.4808, "step": 553 }, { "epoch": 2.7966386554621847, "grad_norm": 1.3007207174809041, "learning_rate": 1.447499001199748e-07, "loss": 0.5228, "step": 554 }, { "epoch": 2.8016806722689074, "grad_norm": 1.335166451449638, "learning_rate": 1.3780726086922103e-07, "loss": 0.5314, "step": 555 }, { "epoch": 2.80672268907563, "grad_norm": 1.2727049723883297, "learning_rate": 1.3103290768099796e-07, "loss": 0.4538, "step": 556 }, { "epoch": 2.8117647058823527, "grad_norm": 1.4233653924829766, "learning_rate": 1.244270750234333e-07, "loss": 0.4768, "step": 557 }, { "epoch": 2.8168067226890754, "grad_norm": 1.4089563114452142, "learning_rate": 1.1798999153196433e-07, "loss": 0.4543, "step": 558 }, { "epoch": 2.821848739495798, "grad_norm": 1.3596745441590257, "learning_rate": 1.1172188000142803e-07, "loss": 0.5016, "step": 559 }, { "epoch": 2.8268907563025207, "grad_norm": 1.3375081145484837, "learning_rate": 1.0562295737834738e-07, "loss": 0.47, "step": 560 }, { "epoch": 2.831932773109244, "grad_norm": 1.3797076618818533, "learning_rate": 9.969343475342285e-08, "loss": 0.4762, "step": 561 }, { "epoch": 2.8369747899159665, "grad_norm": 1.4014527371585839, "learning_rate": 9.393351735422773e-08, "loss": 0.4606, "step": 562 }, { "epoch": 2.842016806722689, "grad_norm": 1.317969883356561, "learning_rate": 8.834340453810375e-08, "loss": 0.4353, "step": 563 }, { "epoch": 2.847058823529412, "grad_norm": 1.3062183016322855, "learning_rate": 8.29232897852611e-08, "loss": 0.3857, "step": 564 }, { "epoch": 2.8521008403361345, "grad_norm": 1.3280320137002732, "learning_rate": 7.76733606920832e-08, "loss": 0.4572, "step": 565 }, { "epoch": 2.857142857142857, "grad_norm": 1.4128418670110612, "learning_rate": 7.259379896463248e-08, "loss": 0.4476, "step": 566 }, { "epoch": 2.86218487394958, "grad_norm": 1.3977595292294513, "learning_rate": 6.768478041236037e-08, "loss": 0.4436, "step": 567 }, { "epoch": 2.8672268907563025, "grad_norm": 1.3855652086248782, "learning_rate": 6.294647494202444e-08, "loss": 0.4346, "step": 568 }, { "epoch": 2.872268907563025, "grad_norm": 1.3251986287781006, "learning_rate": 5.8379046551807486e-08, "loss": 0.493, "step": 569 }, { "epoch": 2.877310924369748, "grad_norm": 1.32087943884219, "learning_rate": 5.398265332563935e-08, "loss": 0.4551, "step": 570 }, { "epoch": 2.8823529411764706, "grad_norm": 1.2437729277991256, "learning_rate": 4.975744742772848e-08, "loss": 0.4098, "step": 571 }, { "epoch": 2.8873949579831932, "grad_norm": 1.340919476266603, "learning_rate": 4.5703575097292286e-08, "loss": 0.4726, "step": 572 }, { "epoch": 2.892436974789916, "grad_norm": 1.2461844948007363, "learning_rate": 4.182117664349783e-08, "loss": 0.449, "step": 573 }, { "epoch": 2.8974789915966386, "grad_norm": 1.3240662502351237, "learning_rate": 3.8110386440605164e-08, "loss": 0.4603, "step": 574 }, { "epoch": 2.9025210084033612, "grad_norm": 1.3494315545656852, "learning_rate": 3.457133292331494e-08, "loss": 0.5058, "step": 575 }, { "epoch": 2.907563025210084, "grad_norm": 1.3389143724686245, "learning_rate": 3.120413858232474e-08, "loss": 0.4578, "step": 576 }, { "epoch": 2.9126050420168066, "grad_norm": 1.344475790060752, "learning_rate": 2.8008919960090253e-08, "loss": 0.5347, "step": 577 }, { "epoch": 2.9176470588235293, "grad_norm": 1.388286539991785, "learning_rate": 2.4985787646788497e-08, "loss": 0.4792, "step": 578 }, { "epoch": 2.9226890756302524, "grad_norm": 1.4667343155241181, "learning_rate": 2.2134846276494205e-08, "loss": 0.4854, "step": 579 }, { "epoch": 2.927731092436975, "grad_norm": 1.393293250138424, "learning_rate": 1.9456194523554404e-08, "loss": 0.4796, "step": 580 }, { "epoch": 2.9327731092436977, "grad_norm": 1.3210976282362301, "learning_rate": 1.69499250991767e-08, "loss": 0.4465, "step": 581 }, { "epoch": 2.9378151260504204, "grad_norm": 1.3544687735071852, "learning_rate": 1.4616124748217387e-08, "loss": 0.5223, "step": 582 }, { "epoch": 2.942857142857143, "grad_norm": 1.467595755846224, "learning_rate": 1.2454874246181081e-08, "loss": 0.6671, "step": 583 }, { "epoch": 2.9478991596638657, "grad_norm": 1.3671723526105932, "learning_rate": 1.0466248396424072e-08, "loss": 0.4499, "step": 584 }, { "epoch": 2.9529411764705884, "grad_norm": 1.4167636187504142, "learning_rate": 8.650316027566386e-09, "loss": 0.4873, "step": 585 }, { "epoch": 2.957983193277311, "grad_norm": 1.220474765102595, "learning_rate": 7.007139991108136e-09, "loss": 0.4043, "step": 586 }, { "epoch": 2.9630252100840337, "grad_norm": 1.3733660106334655, "learning_rate": 5.536777159254603e-09, "loss": 0.4793, "step": 587 }, { "epoch": 2.9680672268907564, "grad_norm": 1.3544611708705747, "learning_rate": 4.239278422948911e-09, "loss": 0.4953, "step": 588 }, { "epoch": 2.973109243697479, "grad_norm": 1.4589364978859505, "learning_rate": 3.1146886901090024e-09, "loss": 0.4547, "step": 589 }, { "epoch": 2.9781512605042018, "grad_norm": 1.3938123480231057, "learning_rate": 2.1630468840738716e-09, "loss": 0.4115, "step": 590 }, { "epoch": 2.9831932773109244, "grad_norm": 1.3511479563562372, "learning_rate": 1.3843859422574269e-09, "loss": 0.4926, "step": 591 }, { "epoch": 2.988235294117647, "grad_norm": 1.445464043641677, "learning_rate": 7.787328150071771e-10, "loss": 0.5346, "step": 592 }, { "epoch": 2.9932773109243698, "grad_norm": 1.5785257738352532, "learning_rate": 3.4610846467109106e-10, "loss": 0.5032, "step": 593 }, { "epoch": 2.9983193277310924, "grad_norm": 1.305339383484568, "learning_rate": 8.652786487484133e-11, "loss": 0.4666, "step": 594 }, { "epoch": 2.9983193277310924, "step": 594, "total_flos": 4.726427205490442e+17, "train_loss": 0.7082312573688199, "train_runtime": 63951.2458, "train_samples_per_second": 0.447, "train_steps_per_second": 0.009 } ], "logging_steps": 1, "max_steps": 594, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.726427205490442e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }