{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 48.231511254019296, "eval_steps": 100, "global_step": 15000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03215434083601286, "grad_norm": 38.097843170166016, "learning_rate": 3.5661234229400034e-06, "loss": 27.7904, "step": 10 }, { "epoch": 0.06430868167202572, "grad_norm": 14.835160255432129, "learning_rate": 4.5493245427856596e-06, "loss": 26.0862, "step": 20 }, { "epoch": 0.09646302250803858, "grad_norm": 12.339645385742188, "learning_rate": 5.124460328562416e-06, "loss": 25.8888, "step": 30 }, { "epoch": 0.12861736334405144, "grad_norm": 12.710750579833984, "learning_rate": 5.532525662631316e-06, "loss": 25.6062, "step": 40 }, { "epoch": 0.1607717041800643, "grad_norm": 9.90512466430664, "learning_rate": 5.849045726034349e-06, "loss": 25.4166, "step": 50 }, { "epoch": 0.19292604501607716, "grad_norm": 10.169076919555664, "learning_rate": 6.107661448408072e-06, "loss": 25.1822, "step": 60 }, { "epoch": 0.22508038585209003, "grad_norm": 10.793392181396484, "learning_rate": 6.326317926111255e-06, "loss": 25.2167, "step": 70 }, { "epoch": 0.2572347266881029, "grad_norm": 10.246559143066406, "learning_rate": 6.515726782476972e-06, "loss": 25.2609, "step": 80 }, { "epoch": 0.28938906752411575, "grad_norm": 10.341804504394531, "learning_rate": 6.682797234184829e-06, "loss": 25.0394, "step": 90 }, { "epoch": 0.3215434083601286, "grad_norm": 10.818121910095215, "learning_rate": 6.832246845880006e-06, "loss": 25.1674, "step": 100 }, { "epoch": 0.3215434083601286, "eval_runtime": 6.1809, "eval_samples_per_second": 8.089, "eval_steps_per_second": 1.133, "step": 100 }, { "epoch": 0.3536977491961415, "grad_norm": 10.39718246459961, "learning_rate": 6.967440464413666e-06, "loss": 25.2442, "step": 110 }, { "epoch": 0.3858520900321543, "grad_norm": 10.767495155334473, "learning_rate": 7.0908625682537295e-06, "loss": 24.8408, "step": 120 }, { "epoch": 0.4180064308681672, "grad_norm": 11.213605880737305, "learning_rate": 7.204399897737671e-06, "loss": 24.9617, "step": 130 }, { "epoch": 0.45016077170418006, "grad_norm": 10.485893249511719, "learning_rate": 7.309519045956911e-06, "loss": 25.0119, "step": 140 }, { "epoch": 0.48231511254019294, "grad_norm": 10.696779251098633, "learning_rate": 7.407382631656762e-06, "loss": 25.0381, "step": 150 }, { "epoch": 0.5144694533762058, "grad_norm": 10.078024864196777, "learning_rate": 7.498927902322629e-06, "loss": 24.9003, "step": 160 }, { "epoch": 0.5466237942122186, "grad_norm": 11.380358695983887, "learning_rate": 7.584921465784847e-06, "loss": 25.0449, "step": 170 }, { "epoch": 0.5787781350482315, "grad_norm": 10.402059555053711, "learning_rate": 7.665998354030485e-06, "loss": 24.8217, "step": 180 }, { "epoch": 0.6109324758842444, "grad_norm": 10.424239158630371, "learning_rate": 7.742690511180911e-06, "loss": 24.7757, "step": 190 }, { "epoch": 0.6430868167202572, "grad_norm": 9.530282974243164, "learning_rate": 7.815447965725662e-06, "loss": 24.6992, "step": 200 }, { "epoch": 0.6430868167202572, "eval_runtime": 5.8091, "eval_samples_per_second": 8.607, "eval_steps_per_second": 1.205, "step": 200 }, { "epoch": 0.6752411575562701, "grad_norm": 11.20760726928711, "learning_rate": 7.884654831733667e-06, "loss": 24.8028, "step": 210 }, { "epoch": 0.707395498392283, "grad_norm": 10.629549026489258, "learning_rate": 7.950641584259324e-06, "loss": 24.7763, "step": 220 }, { "epoch": 0.7395498392282959, "grad_norm": 10.574455261230469, "learning_rate": 8.013694603826467e-06, "loss": 24.8667, "step": 230 }, { "epoch": 0.7717041800643086, "grad_norm": 10.120210647583008, "learning_rate": 8.074063688099385e-06, "loss": 24.693, "step": 240 }, { "epoch": 0.8038585209003215, "grad_norm": 10.61245346069336, "learning_rate": 8.131968029128696e-06, "loss": 24.8847, "step": 250 }, { "epoch": 0.8360128617363344, "grad_norm": 9.841835021972656, "learning_rate": 8.187601017583327e-06, "loss": 24.69, "step": 260 }, { "epoch": 0.8681672025723473, "grad_norm": 10.24456787109375, "learning_rate": 8.241134139807243e-06, "loss": 24.5931, "step": 270 }, { "epoch": 0.9003215434083601, "grad_norm": 10.852150917053223, "learning_rate": 8.292720165802567e-06, "loss": 24.599, "step": 280 }, { "epoch": 0.932475884244373, "grad_norm": 10.495942115783691, "learning_rate": 8.342495777538349e-06, "loss": 24.7865, "step": 290 }, { "epoch": 0.9646302250803859, "grad_norm": 10.313858032226562, "learning_rate": 8.390583751502419e-06, "loss": 24.6473, "step": 300 }, { "epoch": 0.9646302250803859, "eval_runtime": 5.7286, "eval_samples_per_second": 8.728, "eval_steps_per_second": 1.222, "step": 300 }, { "epoch": 0.9967845659163987, "grad_norm": 10.438772201538086, "learning_rate": 8.437094783247598e-06, "loss": 24.5871, "step": 310 }, { "epoch": 1.0289389067524115, "grad_norm": 11.074353218078613, "learning_rate": 8.482129022168284e-06, "loss": 24.3994, "step": 320 }, { "epoch": 1.0610932475884245, "grad_norm": 9.956706047058105, "learning_rate": 8.525777370036078e-06, "loss": 24.3766, "step": 330 }, { "epoch": 1.0932475884244373, "grad_norm": 10.8497314453125, "learning_rate": 8.568122585630503e-06, "loss": 24.4433, "step": 340 }, { "epoch": 1.1254019292604502, "grad_norm": 10.27452564239502, "learning_rate": 8.6092402292056e-06, "loss": 24.3096, "step": 350 }, { "epoch": 1.157556270096463, "grad_norm": 11.772820472717285, "learning_rate": 8.649199473876142e-06, "loss": 24.5275, "step": 360 }, { "epoch": 1.189710610932476, "grad_norm": 9.97986888885498, "learning_rate": 8.68806380581011e-06, "loss": 24.3227, "step": 370 }, { "epoch": 1.2218649517684887, "grad_norm": 12.976365089416504, "learning_rate": 8.725891631026568e-06, "loss": 24.1092, "step": 380 }, { "epoch": 1.2540192926045015, "grad_norm": 11.314805030822754, "learning_rate": 8.762736803360083e-06, "loss": 24.3022, "step": 390 }, { "epoch": 1.2861736334405145, "grad_norm": 10.567892074584961, "learning_rate": 8.798649085571319e-06, "loss": 24.2889, "step": 400 }, { "epoch": 1.2861736334405145, "eval_runtime": 5.3583, "eval_samples_per_second": 9.331, "eval_steps_per_second": 1.306, "step": 400 }, { "epoch": 1.3183279742765273, "grad_norm": 10.870924949645996, "learning_rate": 8.833674553511845e-06, "loss": 24.3052, "step": 410 }, { "epoch": 1.3504823151125402, "grad_norm": 13.34801959991455, "learning_rate": 8.867855951579324e-06, "loss": 24.2141, "step": 420 }, { "epoch": 1.382636655948553, "grad_norm": 11.00822925567627, "learning_rate": 8.901233006342121e-06, "loss": 24.0611, "step": 430 }, { "epoch": 1.414790996784566, "grad_norm": 10.667072296142578, "learning_rate": 8.933842704104978e-06, "loss": 24.2253, "step": 440 }, { "epoch": 1.4469453376205788, "grad_norm": 10.923507690429688, "learning_rate": 8.965719537279175e-06, "loss": 24.3291, "step": 450 }, { "epoch": 1.4790996784565915, "grad_norm": 10.045744895935059, "learning_rate": 8.996895723672124e-06, "loss": 24.2715, "step": 460 }, { "epoch": 1.5112540192926045, "grad_norm": 12.88364028930664, "learning_rate": 9.027401402191656e-06, "loss": 23.9277, "step": 470 }, { "epoch": 1.5434083601286175, "grad_norm": 11.655719757080078, "learning_rate": 9.057264807945042e-06, "loss": 24.1366, "step": 480 }, { "epoch": 1.5755627009646302, "grad_norm": 11.226310729980469, "learning_rate": 9.086512429282506e-06, "loss": 24.1303, "step": 490 }, { "epoch": 1.607717041800643, "grad_norm": 11.92202377319336, "learning_rate": 9.115169148974353e-06, "loss": 24.2135, "step": 500 }, { "epoch": 1.607717041800643, "eval_runtime": 5.3308, "eval_samples_per_second": 9.379, "eval_steps_per_second": 1.313, "step": 500 }, { "epoch": 1.639871382636656, "grad_norm": 11.367260932922363, "learning_rate": 9.143258371407259e-06, "loss": 24.2118, "step": 510 }, { "epoch": 1.6720257234726688, "grad_norm": 12.469697952270508, "learning_rate": 9.170802137428984e-06, "loss": 24.1187, "step": 520 }, { "epoch": 1.7041800643086815, "grad_norm": 12.268674850463867, "learning_rate": 9.197821228253381e-06, "loss": 23.9118, "step": 530 }, { "epoch": 1.7363344051446945, "grad_norm": 13.215527534484863, "learning_rate": 9.2243352596529e-06, "loss": 24.0405, "step": 540 }, { "epoch": 1.7684887459807075, "grad_norm": 11.49039363861084, "learning_rate": 9.250362767508012e-06, "loss": 24.0138, "step": 550 }, { "epoch": 1.8006430868167203, "grad_norm": 12.058365821838379, "learning_rate": 9.275921285648224e-06, "loss": 24.0462, "step": 560 }, { "epoch": 1.832797427652733, "grad_norm": 12.134169578552246, "learning_rate": 9.301027416803324e-06, "loss": 23.9408, "step": 570 }, { "epoch": 1.864951768488746, "grad_norm": 11.846548080444336, "learning_rate": 9.325696897384006e-06, "loss": 23.9263, "step": 580 }, { "epoch": 1.897106109324759, "grad_norm": 11.323081016540527, "learning_rate": 9.349944656724833e-06, "loss": 24.0672, "step": 590 }, { "epoch": 1.9292604501607717, "grad_norm": 11.653038024902344, "learning_rate": 9.373784871348074e-06, "loss": 23.9623, "step": 600 }, { "epoch": 1.9292604501607717, "eval_runtime": 5.313, "eval_samples_per_second": 9.411, "eval_steps_per_second": 1.318, "step": 600 }, { "epoch": 1.9614147909967845, "grad_norm": 12.331990242004395, "learning_rate": 9.39723101474228e-06, "loss": 23.8344, "step": 610 }, { "epoch": 1.9935691318327975, "grad_norm": 10.339744567871094, "learning_rate": 9.420295903093257e-06, "loss": 23.6735, "step": 620 }, { "epoch": 2.0257234726688105, "grad_norm": 10.611045837402344, "learning_rate": 9.442991737356082e-06, "loss": 23.7053, "step": 630 }, { "epoch": 2.057877813504823, "grad_norm": 11.86598014831543, "learning_rate": 9.465330142013941e-06, "loss": 23.4927, "step": 640 }, { "epoch": 2.090032154340836, "grad_norm": 12.124993324279785, "learning_rate": 9.487322200832017e-06, "loss": 23.6022, "step": 650 }, { "epoch": 2.122186495176849, "grad_norm": 13.272892951965332, "learning_rate": 9.508978489881735e-06, "loss": 23.8272, "step": 660 }, { "epoch": 2.154340836012862, "grad_norm": 12.380485534667969, "learning_rate": 9.530309108081716e-06, "loss": 23.5429, "step": 670 }, { "epoch": 2.1864951768488745, "grad_norm": 14.054014205932617, "learning_rate": 9.551323705476158e-06, "loss": 23.5747, "step": 680 }, { "epoch": 2.2186495176848875, "grad_norm": 11.828348159790039, "learning_rate": 9.57203150944888e-06, "loss": 23.3238, "step": 690 }, { "epoch": 2.2508038585209005, "grad_norm": 11.95064926147461, "learning_rate": 9.592441349051258e-06, "loss": 23.4155, "step": 700 }, { "epoch": 2.2508038585209005, "eval_runtime": 5.3225, "eval_samples_per_second": 9.394, "eval_steps_per_second": 1.315, "step": 700 }, { "epoch": 2.282958199356913, "grad_norm": 13.41793155670166, "learning_rate": 9.612561677604608e-06, "loss": 23.4627, "step": 710 }, { "epoch": 2.315112540192926, "grad_norm": 12.131778717041016, "learning_rate": 9.6324005937218e-06, "loss": 23.4932, "step": 720 }, { "epoch": 2.347266881028939, "grad_norm": 11.272132873535156, "learning_rate": 9.651965860878983e-06, "loss": 23.34, "step": 730 }, { "epoch": 2.379421221864952, "grad_norm": 11.798440933227539, "learning_rate": 9.671264925655767e-06, "loss": 23.5724, "step": 740 }, { "epoch": 2.4115755627009645, "grad_norm": 12.789968490600586, "learning_rate": 9.690304934751109e-06, "loss": 23.5121, "step": 750 }, { "epoch": 2.4437299035369775, "grad_norm": 13.960710525512695, "learning_rate": 9.709092750872224e-06, "loss": 23.355, "step": 760 }, { "epoch": 2.4758842443729905, "grad_norm": 14.689282417297363, "learning_rate": 9.727634967584917e-06, "loss": 23.3292, "step": 770 }, { "epoch": 2.508038585209003, "grad_norm": 11.865299224853516, "learning_rate": 9.74593792320574e-06, "loss": 23.7079, "step": 780 }, { "epoch": 2.540192926045016, "grad_norm": 12.899320602416992, "learning_rate": 9.764007713809221e-06, "loss": 23.2868, "step": 790 }, { "epoch": 2.572347266881029, "grad_norm": 12.573016166687012, "learning_rate": 9.781850205416976e-06, "loss": 23.3866, "step": 800 }, { "epoch": 2.572347266881029, "eval_runtime": 5.3296, "eval_samples_per_second": 9.382, "eval_steps_per_second": 1.313, "step": 800 }, { "epoch": 2.604501607717042, "grad_norm": 12.247105598449707, "learning_rate": 9.799471045429654e-06, "loss": 23.3941, "step": 810 }, { "epoch": 2.6366559485530545, "grad_norm": 13.487316131591797, "learning_rate": 9.8168756733575e-06, "loss": 23.2919, "step": 820 }, { "epoch": 2.6688102893890675, "grad_norm": 12.495790481567383, "learning_rate": 9.834069330900517e-06, "loss": 23.3929, "step": 830 }, { "epoch": 2.7009646302250805, "grad_norm": 13.778923988342285, "learning_rate": 9.851057071424981e-06, "loss": 23.2486, "step": 840 }, { "epoch": 2.733118971061093, "grad_norm": 13.809144020080566, "learning_rate": 9.867843768879192e-06, "loss": 23.3308, "step": 850 }, { "epoch": 2.765273311897106, "grad_norm": 12.897558212280273, "learning_rate": 9.884434126187778e-06, "loss": 23.4699, "step": 860 }, { "epoch": 2.797427652733119, "grad_norm": 14.339113235473633, "learning_rate": 9.900832683160762e-06, "loss": 23.3836, "step": 870 }, { "epoch": 2.829581993569132, "grad_norm": 11.934818267822266, "learning_rate": 9.917043823950635e-06, "loss": 23.1259, "step": 880 }, { "epoch": 2.861736334405145, "grad_norm": 13.076687812805176, "learning_rate": 9.93307178408812e-06, "loss": 23.11, "step": 890 }, { "epoch": 2.8938906752411575, "grad_norm": 14.203890800476074, "learning_rate": 9.948920657124832e-06, "loss": 23.2333, "step": 900 }, { "epoch": 2.8938906752411575, "eval_runtime": 5.3867, "eval_samples_per_second": 9.282, "eval_steps_per_second": 1.3, "step": 900 }, { "epoch": 2.9260450160771705, "grad_norm": 15.171213150024414, "learning_rate": 9.964594400908922e-06, "loss": 23.3824, "step": 910 }, { "epoch": 2.958199356913183, "grad_norm": 11.40628719329834, "learning_rate": 9.980096843517781e-06, "loss": 23.1912, "step": 920 }, { "epoch": 2.990353697749196, "grad_norm": 13.028314590454102, "learning_rate": 9.995431688870011e-06, "loss": 23.1345, "step": 930 }, { "epoch": 3.022508038585209, "grad_norm": 12.914365768432617, "learning_rate": 9.999998804322445e-06, "loss": 23.0992, "step": 940 }, { "epoch": 3.054662379421222, "grad_norm": 12.187694549560547, "learning_rate": 9.9999929479441e-06, "loss": 22.8279, "step": 950 }, { "epoch": 3.0868167202572345, "grad_norm": 13.822305679321289, "learning_rate": 9.99998221125707e-06, "loss": 22.6484, "step": 960 }, { "epoch": 3.1189710610932475, "grad_norm": 14.562980651855469, "learning_rate": 9.999966594272994e-06, "loss": 22.9072, "step": 970 }, { "epoch": 3.1511254019292605, "grad_norm": 12.987993240356445, "learning_rate": 9.999946097008813e-06, "loss": 22.6164, "step": 980 }, { "epoch": 3.1832797427652735, "grad_norm": 15.733577728271484, "learning_rate": 9.999920719486753e-06, "loss": 22.8216, "step": 990 }, { "epoch": 3.215434083601286, "grad_norm": 16.308366775512695, "learning_rate": 9.99989046173434e-06, "loss": 22.7655, "step": 1000 }, { "epoch": 3.215434083601286, "eval_runtime": 5.3251, "eval_samples_per_second": 9.389, "eval_steps_per_second": 1.315, "step": 1000 }, { "epoch": 3.247588424437299, "grad_norm": 15.490066528320312, "learning_rate": 9.999855323784386e-06, "loss": 22.5306, "step": 1010 }, { "epoch": 3.279742765273312, "grad_norm": 16.54895782470703, "learning_rate": 9.999815305674998e-06, "loss": 22.3788, "step": 1020 }, { "epoch": 3.311897106109325, "grad_norm": 15.616133689880371, "learning_rate": 9.999770407449582e-06, "loss": 22.5517, "step": 1030 }, { "epoch": 3.3440514469453375, "grad_norm": 16.866737365722656, "learning_rate": 9.999720629156823e-06, "loss": 22.5158, "step": 1040 }, { "epoch": 3.3762057877813505, "grad_norm": 18.696901321411133, "learning_rate": 9.99966597085071e-06, "loss": 22.698, "step": 1050 }, { "epoch": 3.4083601286173635, "grad_norm": 15.379310607910156, "learning_rate": 9.999606432590522e-06, "loss": 22.7503, "step": 1060 }, { "epoch": 3.440514469453376, "grad_norm": 14.11539077758789, "learning_rate": 9.999542014440829e-06, "loss": 22.7458, "step": 1070 }, { "epoch": 3.472668810289389, "grad_norm": 13.95258903503418, "learning_rate": 9.99947271647149e-06, "loss": 22.5253, "step": 1080 }, { "epoch": 3.504823151125402, "grad_norm": 13.543705940246582, "learning_rate": 9.999398538757665e-06, "loss": 22.7168, "step": 1090 }, { "epoch": 3.536977491961415, "grad_norm": 18.485353469848633, "learning_rate": 9.999319481379793e-06, "loss": 22.5193, "step": 1100 }, { "epoch": 3.536977491961415, "eval_runtime": 5.3399, "eval_samples_per_second": 9.363, "eval_steps_per_second": 1.311, "step": 1100 }, { "epoch": 3.5691318327974275, "grad_norm": 17.137102127075195, "learning_rate": 9.99923554442362e-06, "loss": 22.4322, "step": 1110 }, { "epoch": 3.6012861736334405, "grad_norm": 16.949480056762695, "learning_rate": 9.999146727980174e-06, "loss": 22.7255, "step": 1120 }, { "epoch": 3.6334405144694535, "grad_norm": 14.135492324829102, "learning_rate": 9.99905303214578e-06, "loss": 22.505, "step": 1130 }, { "epoch": 3.665594855305466, "grad_norm": 15.640557289123535, "learning_rate": 9.99895445702205e-06, "loss": 22.6428, "step": 1140 }, { "epoch": 3.697749196141479, "grad_norm": 15.869978904724121, "learning_rate": 9.998851002715889e-06, "loss": 22.346, "step": 1150 }, { "epoch": 3.729903536977492, "grad_norm": 16.153425216674805, "learning_rate": 9.998742669339499e-06, "loss": 22.4214, "step": 1160 }, { "epoch": 3.762057877813505, "grad_norm": 16.749225616455078, "learning_rate": 9.998629457010366e-06, "loss": 22.5522, "step": 1170 }, { "epoch": 3.7942122186495175, "grad_norm": 15.06389331817627, "learning_rate": 9.998511365851271e-06, "loss": 22.4205, "step": 1180 }, { "epoch": 3.8263665594855305, "grad_norm": 19.347835540771484, "learning_rate": 9.998388395990284e-06, "loss": 22.1512, "step": 1190 }, { "epoch": 3.8585209003215435, "grad_norm": 14.581829071044922, "learning_rate": 9.998260547560774e-06, "loss": 22.2989, "step": 1200 }, { "epoch": 3.8585209003215435, "eval_runtime": 5.3228, "eval_samples_per_second": 9.394, "eval_steps_per_second": 1.315, "step": 1200 }, { "epoch": 3.890675241157556, "grad_norm": 16.318355560302734, "learning_rate": 9.998127820701387e-06, "loss": 22.3571, "step": 1210 }, { "epoch": 3.922829581993569, "grad_norm": 15.69788932800293, "learning_rate": 9.997990215556069e-06, "loss": 22.2864, "step": 1220 }, { "epoch": 3.954983922829582, "grad_norm": 16.747121810913086, "learning_rate": 9.997847732274057e-06, "loss": 21.9838, "step": 1230 }, { "epoch": 3.987138263665595, "grad_norm": 17.960826873779297, "learning_rate": 9.997700371009876e-06, "loss": 22.7356, "step": 1240 }, { "epoch": 4.019292604501608, "grad_norm": 15.899002075195312, "learning_rate": 9.997548131923342e-06, "loss": 22.1146, "step": 1250 }, { "epoch": 4.051446945337621, "grad_norm": 18.336225509643555, "learning_rate": 9.997391015179558e-06, "loss": 21.8025, "step": 1260 }, { "epoch": 4.083601286173633, "grad_norm": 14.990306854248047, "learning_rate": 9.99722902094892e-06, "loss": 21.9162, "step": 1270 }, { "epoch": 4.115755627009646, "grad_norm": 16.47388458251953, "learning_rate": 9.997062149407114e-06, "loss": 21.7433, "step": 1280 }, { "epoch": 4.147909967845659, "grad_norm": 15.472047805786133, "learning_rate": 9.996890400735113e-06, "loss": 21.7712, "step": 1290 }, { "epoch": 4.180064308681672, "grad_norm": 15.434989929199219, "learning_rate": 9.996713775119185e-06, "loss": 21.6379, "step": 1300 }, { "epoch": 4.180064308681672, "eval_runtime": 5.3105, "eval_samples_per_second": 9.415, "eval_steps_per_second": 1.318, "step": 1300 }, { "epoch": 4.212218649517685, "grad_norm": 17.2169132232666, "learning_rate": 9.996532272750878e-06, "loss": 21.8952, "step": 1310 }, { "epoch": 4.244372990353698, "grad_norm": 19.28879737854004, "learning_rate": 9.99634589382704e-06, "loss": 21.8384, "step": 1320 }, { "epoch": 4.276527331189711, "grad_norm": 18.723726272583008, "learning_rate": 9.996154638549794e-06, "loss": 21.4172, "step": 1330 }, { "epoch": 4.308681672025724, "grad_norm": 17.209087371826172, "learning_rate": 9.995958507126565e-06, "loss": 21.4505, "step": 1340 }, { "epoch": 4.340836012861736, "grad_norm": 18.480688095092773, "learning_rate": 9.99575749977006e-06, "loss": 21.6339, "step": 1350 }, { "epoch": 4.372990353697749, "grad_norm": 24.204160690307617, "learning_rate": 9.99555161669827e-06, "loss": 21.9256, "step": 1360 }, { "epoch": 4.405144694533762, "grad_norm": 23.987192153930664, "learning_rate": 9.995340858134486e-06, "loss": 21.9207, "step": 1370 }, { "epoch": 4.437299035369775, "grad_norm": 15.931623458862305, "learning_rate": 9.99512522430727e-06, "loss": 21.711, "step": 1380 }, { "epoch": 4.469453376205788, "grad_norm": 25.17464828491211, "learning_rate": 9.994904715450488e-06, "loss": 21.5192, "step": 1390 }, { "epoch": 4.501607717041801, "grad_norm": 17.540313720703125, "learning_rate": 9.994679331803277e-06, "loss": 21.9733, "step": 1400 }, { "epoch": 4.501607717041801, "eval_runtime": 5.3465, "eval_samples_per_second": 9.352, "eval_steps_per_second": 1.309, "step": 1400 }, { "epoch": 4.533762057877814, "grad_norm": 18.941497802734375, "learning_rate": 9.994449073610076e-06, "loss": 21.6435, "step": 1410 }, { "epoch": 4.565916398713826, "grad_norm": 21.45750617980957, "learning_rate": 9.9942139411206e-06, "loss": 21.5423, "step": 1420 }, { "epoch": 4.598070739549839, "grad_norm": 17.793384552001953, "learning_rate": 9.993973934589856e-06, "loss": 21.7526, "step": 1430 }, { "epoch": 4.630225080385852, "grad_norm": 20.084718704223633, "learning_rate": 9.99372905427813e-06, "loss": 21.5571, "step": 1440 }, { "epoch": 4.662379421221865, "grad_norm": 18.12001609802246, "learning_rate": 9.993479300451004e-06, "loss": 21.6229, "step": 1450 }, { "epoch": 4.694533762057878, "grad_norm": 21.047819137573242, "learning_rate": 9.993224673379335e-06, "loss": 21.4278, "step": 1460 }, { "epoch": 4.726688102893891, "grad_norm": 17.669565200805664, "learning_rate": 9.992965173339272e-06, "loss": 21.6965, "step": 1470 }, { "epoch": 4.758842443729904, "grad_norm": 20.821794509887695, "learning_rate": 9.992700800612247e-06, "loss": 21.8465, "step": 1480 }, { "epoch": 4.790996784565916, "grad_norm": 18.82571792602539, "learning_rate": 9.992431555484973e-06, "loss": 21.7016, "step": 1490 }, { "epoch": 4.823151125401929, "grad_norm": 17.934131622314453, "learning_rate": 9.992157438249456e-06, "loss": 21.2573, "step": 1500 }, { "epoch": 4.823151125401929, "eval_runtime": 5.3001, "eval_samples_per_second": 9.434, "eval_steps_per_second": 1.321, "step": 1500 }, { "epoch": 4.855305466237942, "grad_norm": 19.788393020629883, "learning_rate": 9.991878449202976e-06, "loss": 21.5668, "step": 1510 }, { "epoch": 4.887459807073955, "grad_norm": 25.959495544433594, "learning_rate": 9.991594588648101e-06, "loss": 21.5135, "step": 1520 }, { "epoch": 4.919614147909968, "grad_norm": 21.621047973632812, "learning_rate": 9.991305856892684e-06, "loss": 21.3383, "step": 1530 }, { "epoch": 4.951768488745981, "grad_norm": 25.18577003479004, "learning_rate": 9.991012254249857e-06, "loss": 21.3393, "step": 1540 }, { "epoch": 4.983922829581994, "grad_norm": 30.819293975830078, "learning_rate": 9.990713781038037e-06, "loss": 21.0999, "step": 1550 }, { "epoch": 5.016077170418006, "grad_norm": 19.34333610534668, "learning_rate": 9.990410437580921e-06, "loss": 21.352, "step": 1560 }, { "epoch": 5.048231511254019, "grad_norm": 22.16535758972168, "learning_rate": 9.990102224207492e-06, "loss": 20.6181, "step": 1570 }, { "epoch": 5.080385852090032, "grad_norm": 21.072885513305664, "learning_rate": 9.989789141252011e-06, "loss": 20.8434, "step": 1580 }, { "epoch": 5.112540192926045, "grad_norm": 23.740102767944336, "learning_rate": 9.98947118905402e-06, "loss": 20.8399, "step": 1590 }, { "epoch": 5.144694533762058, "grad_norm": 25.014060974121094, "learning_rate": 9.989148367958346e-06, "loss": 20.9832, "step": 1600 }, { "epoch": 5.144694533762058, "eval_runtime": 5.3247, "eval_samples_per_second": 9.39, "eval_steps_per_second": 1.315, "step": 1600 }, { "epoch": 5.176848874598071, "grad_norm": 23.70455551147461, "learning_rate": 9.98882067831509e-06, "loss": 20.9883, "step": 1610 }, { "epoch": 5.209003215434084, "grad_norm": 22.214468002319336, "learning_rate": 9.988488120479637e-06, "loss": 20.7647, "step": 1620 }, { "epoch": 5.241157556270096, "grad_norm": 28.82222557067871, "learning_rate": 9.988150694812651e-06, "loss": 20.7777, "step": 1630 }, { "epoch": 5.273311897106109, "grad_norm": 18.030961990356445, "learning_rate": 9.987808401680077e-06, "loss": 20.6907, "step": 1640 }, { "epoch": 5.305466237942122, "grad_norm": 19.381826400756836, "learning_rate": 9.987461241453132e-06, "loss": 20.909, "step": 1650 }, { "epoch": 5.337620578778135, "grad_norm": 22.835229873657227, "learning_rate": 9.987109214508323e-06, "loss": 20.7824, "step": 1660 }, { "epoch": 5.369774919614148, "grad_norm": 26.361066818237305, "learning_rate": 9.986752321227422e-06, "loss": 20.5155, "step": 1670 }, { "epoch": 5.401929260450161, "grad_norm": 22.57075309753418, "learning_rate": 9.98639056199749e-06, "loss": 20.8649, "step": 1680 }, { "epoch": 5.434083601286174, "grad_norm": 26.257099151611328, "learning_rate": 9.986023937210858e-06, "loss": 20.6359, "step": 1690 }, { "epoch": 5.466237942122186, "grad_norm": 20.32964515686035, "learning_rate": 9.985652447265136e-06, "loss": 20.7283, "step": 1700 }, { "epoch": 5.466237942122186, "eval_runtime": 5.3108, "eval_samples_per_second": 9.415, "eval_steps_per_second": 1.318, "step": 1700 }, { "epoch": 5.498392282958199, "grad_norm": 22.166868209838867, "learning_rate": 9.98527609256321e-06, "loss": 20.6549, "step": 1710 }, { "epoch": 5.530546623794212, "grad_norm": 23.852516174316406, "learning_rate": 9.984894873513244e-06, "loss": 20.8719, "step": 1720 }, { "epoch": 5.562700964630225, "grad_norm": 23.02214241027832, "learning_rate": 9.984508790528675e-06, "loss": 20.8409, "step": 1730 }, { "epoch": 5.594855305466238, "grad_norm": 24.25050163269043, "learning_rate": 9.984117844028214e-06, "loss": 20.6357, "step": 1740 }, { "epoch": 5.627009646302251, "grad_norm": 22.49781608581543, "learning_rate": 9.98372203443585e-06, "loss": 20.7227, "step": 1750 }, { "epoch": 5.659163987138264, "grad_norm": 30.93893814086914, "learning_rate": 9.983321362180842e-06, "loss": 20.3558, "step": 1760 }, { "epoch": 5.691318327974276, "grad_norm": 26.473947525024414, "learning_rate": 9.982915827697728e-06, "loss": 20.4576, "step": 1770 }, { "epoch": 5.723472668810289, "grad_norm": 23.059999465942383, "learning_rate": 9.982505431426315e-06, "loss": 20.4611, "step": 1780 }, { "epoch": 5.755627009646302, "grad_norm": 29.39781951904297, "learning_rate": 9.982090173811684e-06, "loss": 20.8373, "step": 1790 }, { "epoch": 5.787781350482315, "grad_norm": 25.880264282226562, "learning_rate": 9.981670055304186e-06, "loss": 20.3066, "step": 1800 }, { "epoch": 5.787781350482315, "eval_runtime": 5.2893, "eval_samples_per_second": 9.453, "eval_steps_per_second": 1.323, "step": 1800 }, { "epoch": 5.819935691318328, "grad_norm": 24.429786682128906, "learning_rate": 9.981245076359448e-06, "loss": 20.5217, "step": 1810 }, { "epoch": 5.852090032154341, "grad_norm": 32.27116012573242, "learning_rate": 9.980815237438367e-06, "loss": 20.527, "step": 1820 }, { "epoch": 5.884244372990354, "grad_norm": 30.01492691040039, "learning_rate": 9.980380539007107e-06, "loss": 20.0971, "step": 1830 }, { "epoch": 5.916398713826366, "grad_norm": 29.562925338745117, "learning_rate": 9.979940981537106e-06, "loss": 20.6867, "step": 1840 }, { "epoch": 5.948553054662379, "grad_norm": 26.031354904174805, "learning_rate": 9.97949656550507e-06, "loss": 20.4894, "step": 1850 }, { "epoch": 5.980707395498392, "grad_norm": 19.95577049255371, "learning_rate": 9.979047291392975e-06, "loss": 20.0435, "step": 1860 }, { "epoch": 6.012861736334405, "grad_norm": 36.58043670654297, "learning_rate": 9.978593159688066e-06, "loss": 20.2794, "step": 1870 }, { "epoch": 6.045016077170418, "grad_norm": 24.9891300201416, "learning_rate": 9.978134170882854e-06, "loss": 20.1104, "step": 1880 }, { "epoch": 6.077170418006431, "grad_norm": 21.276697158813477, "learning_rate": 9.977670325475121e-06, "loss": 19.7703, "step": 1890 }, { "epoch": 6.109324758842444, "grad_norm": 27.93838882446289, "learning_rate": 9.977201623967913e-06, "loss": 19.5878, "step": 1900 }, { "epoch": 6.109324758842444, "eval_runtime": 5.3496, "eval_samples_per_second": 9.347, "eval_steps_per_second": 1.309, "step": 1900 }, { "epoch": 6.141479099678457, "grad_norm": 24.8079776763916, "learning_rate": 9.976728066869543e-06, "loss": 19.8597, "step": 1910 }, { "epoch": 6.173633440514469, "grad_norm": 26.19797706604004, "learning_rate": 9.976249654693593e-06, "loss": 19.487, "step": 1920 }, { "epoch": 6.205787781350482, "grad_norm": 22.81288719177246, "learning_rate": 9.975766387958907e-06, "loss": 19.7653, "step": 1930 }, { "epoch": 6.237942122186495, "grad_norm": 26.222930908203125, "learning_rate": 9.975278267189593e-06, "loss": 20.1292, "step": 1940 }, { "epoch": 6.270096463022508, "grad_norm": 28.436248779296875, "learning_rate": 9.974785292915029e-06, "loss": 20.3, "step": 1950 }, { "epoch": 6.302250803858521, "grad_norm": 24.600616455078125, "learning_rate": 9.974287465669849e-06, "loss": 19.9071, "step": 1960 }, { "epoch": 6.334405144694534, "grad_norm": 26.37251091003418, "learning_rate": 9.973784785993956e-06, "loss": 19.6786, "step": 1970 }, { "epoch": 6.366559485530547, "grad_norm": 25.990135192871094, "learning_rate": 9.973277254432514e-06, "loss": 19.9117, "step": 1980 }, { "epoch": 6.39871382636656, "grad_norm": 24.333906173706055, "learning_rate": 9.972764871535947e-06, "loss": 19.6772, "step": 1990 }, { "epoch": 6.430868167202572, "grad_norm": 26.432903289794922, "learning_rate": 9.972247637859942e-06, "loss": 20.1963, "step": 2000 }, { "epoch": 6.430868167202572, "eval_runtime": 5.29, "eval_samples_per_second": 9.452, "eval_steps_per_second": 1.323, "step": 2000 }, { "epoch": 6.463022508038585, "grad_norm": 36.078643798828125, "learning_rate": 9.97172555396545e-06, "loss": 20.3001, "step": 2010 }, { "epoch": 6.495176848874598, "grad_norm": 26.97858238220215, "learning_rate": 9.97119862041867e-06, "loss": 19.5984, "step": 2020 }, { "epoch": 6.527331189710611, "grad_norm": 21.99107551574707, "learning_rate": 9.97066683779108e-06, "loss": 20.0637, "step": 2030 }, { "epoch": 6.559485530546624, "grad_norm": 27.621002197265625, "learning_rate": 9.970130206659398e-06, "loss": 19.8864, "step": 2040 }, { "epoch": 6.591639871382637, "grad_norm": 27.518064498901367, "learning_rate": 9.969588727605614e-06, "loss": 19.4799, "step": 2050 }, { "epoch": 6.62379421221865, "grad_norm": 31.95291519165039, "learning_rate": 9.969042401216965e-06, "loss": 20.0065, "step": 2060 }, { "epoch": 6.655948553054662, "grad_norm": 34.73428726196289, "learning_rate": 9.968491228085954e-06, "loss": 19.663, "step": 2070 }, { "epoch": 6.688102893890675, "grad_norm": 25.29422378540039, "learning_rate": 9.967935208810332e-06, "loss": 20.0892, "step": 2080 }, { "epoch": 6.720257234726688, "grad_norm": 30.417152404785156, "learning_rate": 9.967374343993115e-06, "loss": 19.7689, "step": 2090 }, { "epoch": 6.752411575562701, "grad_norm": 31.431406021118164, "learning_rate": 9.966808634242568e-06, "loss": 19.4306, "step": 2100 }, { "epoch": 6.752411575562701, "eval_runtime": 5.2722, "eval_samples_per_second": 9.484, "eval_steps_per_second": 1.328, "step": 2100 }, { "epoch": 6.784565916398714, "grad_norm": 30.45334815979004, "learning_rate": 9.966238080172209e-06, "loss": 19.6493, "step": 2110 }, { "epoch": 6.816720257234727, "grad_norm": 32.87178039550781, "learning_rate": 9.965662682400813e-06, "loss": 19.5532, "step": 2120 }, { "epoch": 6.84887459807074, "grad_norm": 34.73257827758789, "learning_rate": 9.96508244155241e-06, "loss": 19.6102, "step": 2130 }, { "epoch": 6.881028938906752, "grad_norm": 23.309972763061523, "learning_rate": 9.964497358256279e-06, "loss": 19.4626, "step": 2140 }, { "epoch": 6.913183279742765, "grad_norm": 30.528322219848633, "learning_rate": 9.963907433146947e-06, "loss": 19.8002, "step": 2150 }, { "epoch": 6.945337620578778, "grad_norm": 30.773754119873047, "learning_rate": 9.963312666864202e-06, "loss": 19.5852, "step": 2160 }, { "epoch": 6.977491961414791, "grad_norm": 27.484689712524414, "learning_rate": 9.962713060053075e-06, "loss": 19.514, "step": 2170 }, { "epoch": 7.009646302250804, "grad_norm": 29.755083084106445, "learning_rate": 9.96210861336385e-06, "loss": 19.5801, "step": 2180 }, { "epoch": 7.041800643086817, "grad_norm": 33.91703414916992, "learning_rate": 9.961499327452051e-06, "loss": 19.0605, "step": 2190 }, { "epoch": 7.07395498392283, "grad_norm": 32.00695037841797, "learning_rate": 9.960885202978468e-06, "loss": 19.1987, "step": 2200 }, { "epoch": 7.07395498392283, "eval_runtime": 5.5043, "eval_samples_per_second": 9.084, "eval_steps_per_second": 1.272, "step": 2200 }, { "epoch": 7.106109324758842, "grad_norm": 34.1601448059082, "learning_rate": 9.960266240609121e-06, "loss": 19.378, "step": 2210 }, { "epoch": 7.138263665594855, "grad_norm": 34.37297821044922, "learning_rate": 9.959642441015285e-06, "loss": 18.7165, "step": 2220 }, { "epoch": 7.170418006430868, "grad_norm": 29.401077270507812, "learning_rate": 9.959013804873484e-06, "loss": 18.7618, "step": 2230 }, { "epoch": 7.202572347266881, "grad_norm": 32.18341064453125, "learning_rate": 9.958380332865478e-06, "loss": 18.8383, "step": 2240 }, { "epoch": 7.234726688102894, "grad_norm": 26.608930587768555, "learning_rate": 9.95774202567828e-06, "loss": 18.7965, "step": 2250 }, { "epoch": 7.266881028938907, "grad_norm": 34.67891311645508, "learning_rate": 9.957098884004139e-06, "loss": 19.5027, "step": 2260 }, { "epoch": 7.29903536977492, "grad_norm": 31.889785766601562, "learning_rate": 9.956450908540558e-06, "loss": 19.1395, "step": 2270 }, { "epoch": 7.331189710610932, "grad_norm": 35.200927734375, "learning_rate": 9.955798099990271e-06, "loss": 18.63, "step": 2280 }, { "epoch": 7.363344051446945, "grad_norm": 27.889312744140625, "learning_rate": 9.955140459061263e-06, "loss": 19.2963, "step": 2290 }, { "epoch": 7.395498392282958, "grad_norm": 25.917516708374023, "learning_rate": 9.954477986466749e-06, "loss": 18.9285, "step": 2300 }, { "epoch": 7.395498392282958, "eval_runtime": 5.3961, "eval_samples_per_second": 9.266, "eval_steps_per_second": 1.297, "step": 2300 }, { "epoch": 7.427652733118971, "grad_norm": 37.95915222167969, "learning_rate": 9.953810682925196e-06, "loss": 18.6655, "step": 2310 }, { "epoch": 7.459807073954984, "grad_norm": 32.404884338378906, "learning_rate": 9.953138549160301e-06, "loss": 19.0886, "step": 2320 }, { "epoch": 7.491961414790997, "grad_norm": 41.5395622253418, "learning_rate": 9.952461585901006e-06, "loss": 18.9648, "step": 2330 }, { "epoch": 7.52411575562701, "grad_norm": 38.982093811035156, "learning_rate": 9.951779793881483e-06, "loss": 18.9497, "step": 2340 }, { "epoch": 7.556270096463022, "grad_norm": 27.97321319580078, "learning_rate": 9.951093173841154e-06, "loss": 18.2329, "step": 2350 }, { "epoch": 7.588424437299035, "grad_norm": 38.88495635986328, "learning_rate": 9.950401726524658e-06, "loss": 18.7051, "step": 2360 }, { "epoch": 7.620578778135048, "grad_norm": 32.95913314819336, "learning_rate": 9.949705452681887e-06, "loss": 19.1657, "step": 2370 }, { "epoch": 7.652733118971061, "grad_norm": 34.08066177368164, "learning_rate": 9.949004353067957e-06, "loss": 18.8268, "step": 2380 }, { "epoch": 7.684887459807074, "grad_norm": 31.087087631225586, "learning_rate": 9.948298428443224e-06, "loss": 18.9917, "step": 2390 }, { "epoch": 7.717041800643087, "grad_norm": 39.72461700439453, "learning_rate": 9.94758767957327e-06, "loss": 18.2326, "step": 2400 }, { "epoch": 7.717041800643087, "eval_runtime": 5.3216, "eval_samples_per_second": 9.396, "eval_steps_per_second": 1.315, "step": 2400 }, { "epoch": 7.7491961414791, "grad_norm": 40.55718231201172, "learning_rate": 9.946872107228916e-06, "loss": 18.7288, "step": 2410 }, { "epoch": 7.781350482315112, "grad_norm": 31.80949592590332, "learning_rate": 9.946151712186206e-06, "loss": 18.4089, "step": 2420 }, { "epoch": 7.813504823151125, "grad_norm": 43.62948226928711, "learning_rate": 9.945426495226424e-06, "loss": 18.5257, "step": 2430 }, { "epoch": 7.845659163987138, "grad_norm": 39.61101150512695, "learning_rate": 9.944696457136078e-06, "loss": 19.1786, "step": 2440 }, { "epoch": 7.877813504823151, "grad_norm": 34.469852447509766, "learning_rate": 9.943961598706902e-06, "loss": 18.5711, "step": 2450 }, { "epoch": 7.909967845659164, "grad_norm": 32.7941780090332, "learning_rate": 9.94322192073586e-06, "loss": 18.7481, "step": 2460 }, { "epoch": 7.942122186495177, "grad_norm": 30.207687377929688, "learning_rate": 9.94247742402515e-06, "loss": 18.6683, "step": 2470 }, { "epoch": 7.97427652733119, "grad_norm": 40.51539611816406, "learning_rate": 9.941728109382186e-06, "loss": 18.6317, "step": 2480 }, { "epoch": 8.006430868167202, "grad_norm": 28.78127098083496, "learning_rate": 9.940973977619608e-06, "loss": 18.176, "step": 2490 }, { "epoch": 8.038585209003216, "grad_norm": 35.019752502441406, "learning_rate": 9.940215029555285e-06, "loss": 18.1559, "step": 2500 }, { "epoch": 8.038585209003216, "eval_runtime": 5.2601, "eval_samples_per_second": 9.505, "eval_steps_per_second": 1.331, "step": 2500 }, { "epoch": 8.070739549839228, "grad_norm": 34.426753997802734, "learning_rate": 9.93945126601231e-06, "loss": 18.2816, "step": 2510 }, { "epoch": 8.102893890675242, "grad_norm": 39.0522575378418, "learning_rate": 9.938682687818993e-06, "loss": 18.0547, "step": 2520 }, { "epoch": 8.135048231511254, "grad_norm": 25.532230377197266, "learning_rate": 9.937909295808869e-06, "loss": 17.9425, "step": 2530 }, { "epoch": 8.167202572347266, "grad_norm": 32.825111389160156, "learning_rate": 9.937131090820692e-06, "loss": 18.079, "step": 2540 }, { "epoch": 8.19935691318328, "grad_norm": 37.0963249206543, "learning_rate": 9.936348073698439e-06, "loss": 18.3732, "step": 2550 }, { "epoch": 8.231511254019292, "grad_norm": 40.5250358581543, "learning_rate": 9.935560245291303e-06, "loss": 18.4048, "step": 2560 }, { "epoch": 8.263665594855306, "grad_norm": 39.9486198425293, "learning_rate": 9.934767606453694e-06, "loss": 17.8479, "step": 2570 }, { "epoch": 8.295819935691318, "grad_norm": 28.41617774963379, "learning_rate": 9.933970158045243e-06, "loss": 17.6264, "step": 2580 }, { "epoch": 8.327974276527332, "grad_norm": 38.20322036743164, "learning_rate": 9.933167900930792e-06, "loss": 17.8033, "step": 2590 }, { "epoch": 8.360128617363344, "grad_norm": 36.69364547729492, "learning_rate": 9.932360835980403e-06, "loss": 18.7213, "step": 2600 }, { "epoch": 8.360128617363344, "eval_runtime": 5.2849, "eval_samples_per_second": 9.461, "eval_steps_per_second": 1.325, "step": 2600 }, { "epoch": 8.392282958199356, "grad_norm": 38.89975357055664, "learning_rate": 9.931548964069346e-06, "loss": 17.2497, "step": 2610 }, { "epoch": 8.42443729903537, "grad_norm": 30.84475326538086, "learning_rate": 9.930732286078113e-06, "loss": 17.6923, "step": 2620 }, { "epoch": 8.456591639871382, "grad_norm": 33.5067024230957, "learning_rate": 9.9299108028924e-06, "loss": 17.7442, "step": 2630 }, { "epoch": 8.488745980707396, "grad_norm": 38.898868560791016, "learning_rate": 9.929084515403118e-06, "loss": 18.3503, "step": 2640 }, { "epoch": 8.520900321543408, "grad_norm": 36.928062438964844, "learning_rate": 9.928253424506392e-06, "loss": 18.0561, "step": 2650 }, { "epoch": 8.553054662379422, "grad_norm": 40.83977508544922, "learning_rate": 9.927417531103547e-06, "loss": 17.3311, "step": 2660 }, { "epoch": 8.585209003215434, "grad_norm": 31.670196533203125, "learning_rate": 9.926576836101122e-06, "loss": 18.351, "step": 2670 }, { "epoch": 8.617363344051448, "grad_norm": 40.12260818481445, "learning_rate": 9.925731340410865e-06, "loss": 17.8751, "step": 2680 }, { "epoch": 8.64951768488746, "grad_norm": 30.877674102783203, "learning_rate": 9.92488104494973e-06, "loss": 17.7256, "step": 2690 }, { "epoch": 8.681672025723472, "grad_norm": 32.23291778564453, "learning_rate": 9.924025950639871e-06, "loss": 17.9651, "step": 2700 }, { "epoch": 8.681672025723472, "eval_runtime": 5.309, "eval_samples_per_second": 9.418, "eval_steps_per_second": 1.319, "step": 2700 }, { "epoch": 8.713826366559486, "grad_norm": 36.37550735473633, "learning_rate": 9.923166058408655e-06, "loss": 17.6134, "step": 2710 }, { "epoch": 8.745980707395498, "grad_norm": 37.35031509399414, "learning_rate": 9.922301369188644e-06, "loss": 17.5018, "step": 2720 }, { "epoch": 8.778135048231512, "grad_norm": 42.36457443237305, "learning_rate": 9.921431883917608e-06, "loss": 18.0929, "step": 2730 }, { "epoch": 8.810289389067524, "grad_norm": 38.09732437133789, "learning_rate": 9.920557603538517e-06, "loss": 17.8352, "step": 2740 }, { "epoch": 8.842443729903536, "grad_norm": 43.581661224365234, "learning_rate": 9.919678528999539e-06, "loss": 17.8803, "step": 2750 }, { "epoch": 8.87459807073955, "grad_norm": 32.51306915283203, "learning_rate": 9.918794661254045e-06, "loss": 17.4394, "step": 2760 }, { "epoch": 8.906752411575562, "grad_norm": 28.124265670776367, "learning_rate": 9.9179060012606e-06, "loss": 17.7948, "step": 2770 }, { "epoch": 8.938906752411576, "grad_norm": 33.57081604003906, "learning_rate": 9.917012549982975e-06, "loss": 16.9722, "step": 2780 }, { "epoch": 8.971061093247588, "grad_norm": 42.00601577758789, "learning_rate": 9.916114308390125e-06, "loss": 17.7023, "step": 2790 }, { "epoch": 9.003215434083602, "grad_norm": 34.284671783447266, "learning_rate": 9.915211277456207e-06, "loss": 17.8052, "step": 2800 }, { "epoch": 9.003215434083602, "eval_runtime": 5.284, "eval_samples_per_second": 9.462, "eval_steps_per_second": 1.325, "step": 2800 }, { "epoch": 9.035369774919614, "grad_norm": 32.29022979736328, "learning_rate": 9.914303458160574e-06, "loss": 17.3868, "step": 2810 }, { "epoch": 9.067524115755628, "grad_norm": 44.101993560791016, "learning_rate": 9.913390851487768e-06, "loss": 17.2221, "step": 2820 }, { "epoch": 9.09967845659164, "grad_norm": 36.17911911010742, "learning_rate": 9.912473458427524e-06, "loss": 17.3468, "step": 2830 }, { "epoch": 9.131832797427652, "grad_norm": 45.87486267089844, "learning_rate": 9.91155127997477e-06, "loss": 16.406, "step": 2840 }, { "epoch": 9.163987138263666, "grad_norm": 35.22056198120117, "learning_rate": 9.910624317129619e-06, "loss": 16.6738, "step": 2850 }, { "epoch": 9.196141479099678, "grad_norm": 41.093963623046875, "learning_rate": 9.90969257089738e-06, "loss": 17.1796, "step": 2860 }, { "epoch": 9.228295819935692, "grad_norm": 32.463504791259766, "learning_rate": 9.908756042288542e-06, "loss": 17.5413, "step": 2870 }, { "epoch": 9.260450160771704, "grad_norm": 34.884971618652344, "learning_rate": 9.907814732318786e-06, "loss": 16.8466, "step": 2880 }, { "epoch": 9.292604501607718, "grad_norm": 47.55032730102539, "learning_rate": 9.906868642008974e-06, "loss": 17.1988, "step": 2890 }, { "epoch": 9.32475884244373, "grad_norm": 34.37715530395508, "learning_rate": 9.905917772385158e-06, "loss": 17.245, "step": 2900 }, { "epoch": 9.32475884244373, "eval_runtime": 5.2947, "eval_samples_per_second": 9.443, "eval_steps_per_second": 1.322, "step": 2900 }, { "epoch": 9.356913183279742, "grad_norm": 53.30574417114258, "learning_rate": 9.904962124478568e-06, "loss": 16.6431, "step": 2910 }, { "epoch": 9.389067524115756, "grad_norm": 39.47657012939453, "learning_rate": 9.90400169932562e-06, "loss": 16.7345, "step": 2920 }, { "epoch": 9.421221864951768, "grad_norm": 46.4093017578125, "learning_rate": 9.903036497967908e-06, "loss": 17.2429, "step": 2930 }, { "epoch": 9.453376205787782, "grad_norm": 39.42192459106445, "learning_rate": 9.90206652145221e-06, "loss": 17.0855, "step": 2940 }, { "epoch": 9.485530546623794, "grad_norm": 31.1597900390625, "learning_rate": 9.901091770830473e-06, "loss": 17.1141, "step": 2950 }, { "epoch": 9.517684887459808, "grad_norm": 44.61915969848633, "learning_rate": 9.900112247159835e-06, "loss": 17.155, "step": 2960 }, { "epoch": 9.54983922829582, "grad_norm": 60.631805419921875, "learning_rate": 9.899127951502601e-06, "loss": 17.0669, "step": 2970 }, { "epoch": 9.581993569131832, "grad_norm": 28.94301986694336, "learning_rate": 9.898138884926255e-06, "loss": 17.3093, "step": 2980 }, { "epoch": 9.614147909967846, "grad_norm": 40.43606185913086, "learning_rate": 9.897145048503455e-06, "loss": 16.9892, "step": 2990 }, { "epoch": 9.646302250803858, "grad_norm": 46.10663604736328, "learning_rate": 9.89614644331203e-06, "loss": 16.3563, "step": 3000 }, { "epoch": 9.646302250803858, "eval_runtime": 5.269, "eval_samples_per_second": 9.489, "eval_steps_per_second": 1.329, "step": 3000 }, { "epoch": 9.678456591639872, "grad_norm": 45.46335983276367, "learning_rate": 9.895143070434981e-06, "loss": 16.6485, "step": 3010 }, { "epoch": 9.710610932475884, "grad_norm": 38.375057220458984, "learning_rate": 9.894134930960482e-06, "loss": 16.6654, "step": 3020 }, { "epoch": 9.742765273311898, "grad_norm": 33.574764251708984, "learning_rate": 9.893122025981875e-06, "loss": 16.8412, "step": 3030 }, { "epoch": 9.77491961414791, "grad_norm": 38.5945930480957, "learning_rate": 9.892104356597671e-06, "loss": 17.0105, "step": 3040 }, { "epoch": 9.807073954983922, "grad_norm": 42.92488098144531, "learning_rate": 9.891081923911547e-06, "loss": 16.6836, "step": 3050 }, { "epoch": 9.839228295819936, "grad_norm": 37.13010787963867, "learning_rate": 9.890054729032348e-06, "loss": 17.3576, "step": 3060 }, { "epoch": 9.871382636655948, "grad_norm": 57.497440338134766, "learning_rate": 9.889022773074077e-06, "loss": 16.7957, "step": 3070 }, { "epoch": 9.903536977491962, "grad_norm": 41.097190856933594, "learning_rate": 9.887986057155912e-06, "loss": 17.3542, "step": 3080 }, { "epoch": 9.935691318327974, "grad_norm": 38.25314712524414, "learning_rate": 9.886944582402181e-06, "loss": 16.5293, "step": 3090 }, { "epoch": 9.967845659163988, "grad_norm": 42.60857391357422, "learning_rate": 9.885898349942383e-06, "loss": 16.8708, "step": 3100 }, { "epoch": 9.967845659163988, "eval_runtime": 5.3578, "eval_samples_per_second": 9.332, "eval_steps_per_second": 1.306, "step": 3100 }, { "epoch": 10.0, "grad_norm": 45.5200309753418, "learning_rate": 9.884847360911168e-06, "loss": 16.9047, "step": 3110 }, { "epoch": 10.032154340836012, "grad_norm": 32.950439453125, "learning_rate": 9.883791616448353e-06, "loss": 15.7675, "step": 3120 }, { "epoch": 10.064308681672026, "grad_norm": 37.04921340942383, "learning_rate": 9.882731117698907e-06, "loss": 15.8269, "step": 3130 }, { "epoch": 10.096463022508038, "grad_norm": 37.445804595947266, "learning_rate": 9.881665865812957e-06, "loss": 16.113, "step": 3140 }, { "epoch": 10.128617363344052, "grad_norm": 34.36590576171875, "learning_rate": 9.880595861945783e-06, "loss": 16.0043, "step": 3150 }, { "epoch": 10.160771704180064, "grad_norm": 33.16121292114258, "learning_rate": 9.879521107257824e-06, "loss": 16.8634, "step": 3160 }, { "epoch": 10.192926045016078, "grad_norm": 41.15790939331055, "learning_rate": 9.878441602914663e-06, "loss": 16.4276, "step": 3170 }, { "epoch": 10.22508038585209, "grad_norm": 36.52079391479492, "learning_rate": 9.877357350087039e-06, "loss": 16.4466, "step": 3180 }, { "epoch": 10.257234726688102, "grad_norm": 36.07787322998047, "learning_rate": 9.87626834995084e-06, "loss": 16.0052, "step": 3190 }, { "epoch": 10.289389067524116, "grad_norm": 44.88103485107422, "learning_rate": 9.875174603687104e-06, "loss": 16.2346, "step": 3200 }, { "epoch": 10.289389067524116, "eval_runtime": 5.3206, "eval_samples_per_second": 9.397, "eval_steps_per_second": 1.316, "step": 3200 }, { "epoch": 10.321543408360128, "grad_norm": 47.590553283691406, "learning_rate": 9.874076112482017e-06, "loss": 16.1779, "step": 3210 }, { "epoch": 10.353697749196142, "grad_norm": 45.31756591796875, "learning_rate": 9.872972877526904e-06, "loss": 15.9693, "step": 3220 }, { "epoch": 10.385852090032154, "grad_norm": 37.529563903808594, "learning_rate": 9.871864900018242e-06, "loss": 16.7452, "step": 3230 }, { "epoch": 10.418006430868168, "grad_norm": 41.66072463989258, "learning_rate": 9.87075218115765e-06, "loss": 16.7178, "step": 3240 }, { "epoch": 10.45016077170418, "grad_norm": 38.12178421020508, "learning_rate": 9.869634722151886e-06, "loss": 17.1137, "step": 3250 }, { "epoch": 10.482315112540192, "grad_norm": 46.55085754394531, "learning_rate": 9.868512524212851e-06, "loss": 15.8753, "step": 3260 }, { "epoch": 10.514469453376206, "grad_norm": 39.947059631347656, "learning_rate": 9.86738558855759e-06, "loss": 16.5416, "step": 3270 }, { "epoch": 10.546623794212218, "grad_norm": 36.83174514770508, "learning_rate": 9.866253916408273e-06, "loss": 16.469, "step": 3280 }, { "epoch": 10.578778135048232, "grad_norm": 43.83608627319336, "learning_rate": 9.86511750899222e-06, "loss": 16.6846, "step": 3290 }, { "epoch": 10.610932475884244, "grad_norm": 42.55561065673828, "learning_rate": 9.863976367541884e-06, "loss": 16.4142, "step": 3300 }, { "epoch": 10.610932475884244, "eval_runtime": 5.332, "eval_samples_per_second": 9.377, "eval_steps_per_second": 1.313, "step": 3300 }, { "epoch": 10.643086816720258, "grad_norm": 37.118675231933594, "learning_rate": 9.862830493294843e-06, "loss": 16.72, "step": 3310 }, { "epoch": 10.67524115755627, "grad_norm": 46.181678771972656, "learning_rate": 9.861679887493819e-06, "loss": 16.2448, "step": 3320 }, { "epoch": 10.707395498392284, "grad_norm": 40.46521759033203, "learning_rate": 9.860524551386662e-06, "loss": 15.9411, "step": 3330 }, { "epoch": 10.739549839228296, "grad_norm": 43.16596221923828, "learning_rate": 9.859364486226349e-06, "loss": 16.2274, "step": 3340 }, { "epoch": 10.771704180064308, "grad_norm": 38.23724365234375, "learning_rate": 9.858199693270986e-06, "loss": 16.1474, "step": 3350 }, { "epoch": 10.803858520900322, "grad_norm": 41.1743049621582, "learning_rate": 9.857030173783811e-06, "loss": 15.8649, "step": 3360 }, { "epoch": 10.836012861736334, "grad_norm": 45.38727569580078, "learning_rate": 9.855855929033184e-06, "loss": 15.3542, "step": 3370 }, { "epoch": 10.868167202572348, "grad_norm": 45.04948806762695, "learning_rate": 9.854676960292592e-06, "loss": 15.9345, "step": 3380 }, { "epoch": 10.90032154340836, "grad_norm": 40.88465118408203, "learning_rate": 9.853493268840643e-06, "loss": 15.7595, "step": 3390 }, { "epoch": 10.932475884244372, "grad_norm": 37.035762786865234, "learning_rate": 9.852304855961067e-06, "loss": 15.9212, "step": 3400 }, { "epoch": 10.932475884244372, "eval_runtime": 5.2679, "eval_samples_per_second": 9.491, "eval_steps_per_second": 1.329, "step": 3400 }, { "epoch": 10.964630225080386, "grad_norm": 45.86964416503906, "learning_rate": 9.851111722942715e-06, "loss": 16.0053, "step": 3410 }, { "epoch": 10.996784565916398, "grad_norm": 52.381107330322266, "learning_rate": 9.849913871079561e-06, "loss": 16.107, "step": 3420 }, { "epoch": 11.028938906752412, "grad_norm": 41.03367233276367, "learning_rate": 9.848711301670689e-06, "loss": 16.1092, "step": 3430 }, { "epoch": 11.061093247588424, "grad_norm": 49.59638214111328, "learning_rate": 9.847504016020305e-06, "loss": 14.8743, "step": 3440 }, { "epoch": 11.093247588424438, "grad_norm": 48.12381362915039, "learning_rate": 9.846292015437727e-06, "loss": 15.1856, "step": 3450 }, { "epoch": 11.12540192926045, "grad_norm": 42.135231018066406, "learning_rate": 9.845075301237388e-06, "loss": 15.1894, "step": 3460 }, { "epoch": 11.157556270096464, "grad_norm": 43.23624038696289, "learning_rate": 9.843853874738833e-06, "loss": 15.3851, "step": 3470 }, { "epoch": 11.189710610932476, "grad_norm": 39.43941879272461, "learning_rate": 9.842627737266719e-06, "loss": 15.961, "step": 3480 }, { "epoch": 11.221864951768488, "grad_norm": 41.22692108154297, "learning_rate": 9.841396890150805e-06, "loss": 15.2711, "step": 3490 }, { "epoch": 11.254019292604502, "grad_norm": 29.887779235839844, "learning_rate": 9.840161334725966e-06, "loss": 15.8929, "step": 3500 }, { "epoch": 11.254019292604502, "eval_runtime": 5.3868, "eval_samples_per_second": 9.282, "eval_steps_per_second": 1.299, "step": 3500 }, { "epoch": 11.286173633440514, "grad_norm": 38.214500427246094, "learning_rate": 9.838921072332181e-06, "loss": 15.6409, "step": 3510 }, { "epoch": 11.318327974276528, "grad_norm": 47.688682556152344, "learning_rate": 9.837676104314532e-06, "loss": 15.9269, "step": 3520 }, { "epoch": 11.35048231511254, "grad_norm": 35.883243560791016, "learning_rate": 9.836426432023204e-06, "loss": 15.767, "step": 3530 }, { "epoch": 11.382636655948554, "grad_norm": 33.941200256347656, "learning_rate": 9.835172056813488e-06, "loss": 15.379, "step": 3540 }, { "epoch": 11.414790996784566, "grad_norm": 37.60831832885742, "learning_rate": 9.833912980045767e-06, "loss": 15.2177, "step": 3550 }, { "epoch": 11.446945337620578, "grad_norm": 43.98740005493164, "learning_rate": 9.832649203085535e-06, "loss": 15.8949, "step": 3560 }, { "epoch": 11.479099678456592, "grad_norm": 51.312965393066406, "learning_rate": 9.83138072730337e-06, "loss": 15.7196, "step": 3570 }, { "epoch": 11.511254019292604, "grad_norm": 46.85712432861328, "learning_rate": 9.830107554074958e-06, "loss": 14.4287, "step": 3580 }, { "epoch": 11.543408360128618, "grad_norm": 48.731658935546875, "learning_rate": 9.828829684781073e-06, "loss": 15.5323, "step": 3590 }, { "epoch": 11.57556270096463, "grad_norm": 45.208797454833984, "learning_rate": 9.82754712080758e-06, "loss": 15.405, "step": 3600 }, { "epoch": 11.57556270096463, "eval_runtime": 5.2766, "eval_samples_per_second": 9.476, "eval_steps_per_second": 1.327, "step": 3600 }, { "epoch": 11.607717041800644, "grad_norm": 52.0434684753418, "learning_rate": 9.826259863545444e-06, "loss": 14.99, "step": 3610 }, { "epoch": 11.639871382636656, "grad_norm": 48.86016845703125, "learning_rate": 9.824967914390708e-06, "loss": 15.741, "step": 3620 }, { "epoch": 11.672025723472668, "grad_norm": 45.55179977416992, "learning_rate": 9.823671274744514e-06, "loss": 16.0046, "step": 3630 }, { "epoch": 11.704180064308682, "grad_norm": 47.784278869628906, "learning_rate": 9.822369946013087e-06, "loss": 14.7567, "step": 3640 }, { "epoch": 11.736334405144694, "grad_norm": 38.537986755371094, "learning_rate": 9.821063929607737e-06, "loss": 15.1768, "step": 3650 }, { "epoch": 11.768488745980708, "grad_norm": 41.475242614746094, "learning_rate": 9.819753226944855e-06, "loss": 14.8819, "step": 3660 }, { "epoch": 11.80064308681672, "grad_norm": 48.3632926940918, "learning_rate": 9.818437839445921e-06, "loss": 15.2298, "step": 3670 }, { "epoch": 11.832797427652734, "grad_norm": 42.113460540771484, "learning_rate": 9.817117768537489e-06, "loss": 16.1912, "step": 3680 }, { "epoch": 11.864951768488746, "grad_norm": 46.626319885253906, "learning_rate": 9.815793015651198e-06, "loss": 14.0658, "step": 3690 }, { "epoch": 11.897106109324758, "grad_norm": 44.97504806518555, "learning_rate": 9.814463582223762e-06, "loss": 15.5536, "step": 3700 }, { "epoch": 11.897106109324758, "eval_runtime": 5.36, "eval_samples_per_second": 9.328, "eval_steps_per_second": 1.306, "step": 3700 }, { "epoch": 11.929260450160772, "grad_norm": 34.236751556396484, "learning_rate": 9.81312946969697e-06, "loss": 14.8358, "step": 3710 }, { "epoch": 11.961414790996784, "grad_norm": 48.50252151489258, "learning_rate": 9.811790679517685e-06, "loss": 14.7316, "step": 3720 }, { "epoch": 11.993569131832798, "grad_norm": 62.792503356933594, "learning_rate": 9.810447213137849e-06, "loss": 16.073, "step": 3730 }, { "epoch": 12.02572347266881, "grad_norm": 42.848289489746094, "learning_rate": 9.809099072014468e-06, "loss": 15.3286, "step": 3740 }, { "epoch": 12.057877813504824, "grad_norm": 47.60424041748047, "learning_rate": 9.807746257609621e-06, "loss": 14.7468, "step": 3750 }, { "epoch": 12.090032154340836, "grad_norm": 42.546661376953125, "learning_rate": 9.806388771390457e-06, "loss": 14.6005, "step": 3760 }, { "epoch": 12.122186495176848, "grad_norm": 59.381622314453125, "learning_rate": 9.805026614829188e-06, "loss": 15.1724, "step": 3770 }, { "epoch": 12.154340836012862, "grad_norm": 41.44083023071289, "learning_rate": 9.803659789403094e-06, "loss": 14.9266, "step": 3780 }, { "epoch": 12.186495176848874, "grad_norm": 43.43206024169922, "learning_rate": 9.802288296594516e-06, "loss": 15.3256, "step": 3790 }, { "epoch": 12.218649517684888, "grad_norm": 41.55681610107422, "learning_rate": 9.80091213789086e-06, "loss": 14.3895, "step": 3800 }, { "epoch": 12.218649517684888, "eval_runtime": 5.2817, "eval_samples_per_second": 9.467, "eval_steps_per_second": 1.325, "step": 3800 }, { "epoch": 12.2508038585209, "grad_norm": 55.414180755615234, "learning_rate": 9.799531314784589e-06, "loss": 13.7536, "step": 3810 }, { "epoch": 12.282958199356914, "grad_norm": 35.76074981689453, "learning_rate": 9.798145828773226e-06, "loss": 14.5145, "step": 3820 }, { "epoch": 12.315112540192926, "grad_norm": 44.31184005737305, "learning_rate": 9.796755681359352e-06, "loss": 14.9203, "step": 3830 }, { "epoch": 12.347266881028938, "grad_norm": 46.08373260498047, "learning_rate": 9.795360874050601e-06, "loss": 14.9923, "step": 3840 }, { "epoch": 12.379421221864952, "grad_norm": 43.940948486328125, "learning_rate": 9.793961408359664e-06, "loss": 15.0307, "step": 3850 }, { "epoch": 12.411575562700964, "grad_norm": 46.624298095703125, "learning_rate": 9.79255728580428e-06, "loss": 13.8379, "step": 3860 }, { "epoch": 12.443729903536978, "grad_norm": 45.44304656982422, "learning_rate": 9.791148507907242e-06, "loss": 14.4357, "step": 3870 }, { "epoch": 12.47588424437299, "grad_norm": 41.72871017456055, "learning_rate": 9.789735076196392e-06, "loss": 14.4999, "step": 3880 }, { "epoch": 12.508038585209004, "grad_norm": 34.370609283447266, "learning_rate": 9.788316992204612e-06, "loss": 14.2755, "step": 3890 }, { "epoch": 12.540192926045016, "grad_norm": 35.56723403930664, "learning_rate": 9.786894257469842e-06, "loss": 14.6492, "step": 3900 }, { "epoch": 12.540192926045016, "eval_runtime": 5.3086, "eval_samples_per_second": 9.419, "eval_steps_per_second": 1.319, "step": 3900 }, { "epoch": 12.572347266881028, "grad_norm": 36.72100067138672, "learning_rate": 9.785466873535053e-06, "loss": 15.0472, "step": 3910 }, { "epoch": 12.604501607717042, "grad_norm": 41.98920822143555, "learning_rate": 9.784034841948267e-06, "loss": 14.6576, "step": 3920 }, { "epoch": 12.636655948553054, "grad_norm": 39.373348236083984, "learning_rate": 9.782598164262543e-06, "loss": 15.0164, "step": 3930 }, { "epoch": 12.668810289389068, "grad_norm": 44.47653579711914, "learning_rate": 9.781156842035978e-06, "loss": 14.0708, "step": 3940 }, { "epoch": 12.70096463022508, "grad_norm": 47.652626037597656, "learning_rate": 9.779710876831708e-06, "loss": 14.8275, "step": 3950 }, { "epoch": 12.733118971061094, "grad_norm": 34.6234016418457, "learning_rate": 9.778260270217904e-06, "loss": 14.7146, "step": 3960 }, { "epoch": 12.765273311897106, "grad_norm": 44.25484085083008, "learning_rate": 9.776805023767771e-06, "loss": 14.9002, "step": 3970 }, { "epoch": 12.79742765273312, "grad_norm": 51.592872619628906, "learning_rate": 9.775345139059543e-06, "loss": 14.4535, "step": 3980 }, { "epoch": 12.829581993569132, "grad_norm": 50.952674865722656, "learning_rate": 9.773880617676486e-06, "loss": 14.316, "step": 3990 }, { "epoch": 12.861736334405144, "grad_norm": 50.37294387817383, "learning_rate": 9.772411461206899e-06, "loss": 13.771, "step": 4000 }, { "epoch": 12.861736334405144, "eval_runtime": 5.3211, "eval_samples_per_second": 9.397, "eval_steps_per_second": 1.316, "step": 4000 }, { "epoch": 12.893890675241158, "grad_norm": 45.57170486450195, "learning_rate": 9.770937671244101e-06, "loss": 14.2711, "step": 4010 }, { "epoch": 12.92604501607717, "grad_norm": 43.43727493286133, "learning_rate": 9.769459249386439e-06, "loss": 14.4719, "step": 4020 }, { "epoch": 12.958199356913184, "grad_norm": 47.96916198730469, "learning_rate": 9.767976197237283e-06, "loss": 14.1787, "step": 4030 }, { "epoch": 12.990353697749196, "grad_norm": 51.02168273925781, "learning_rate": 9.766488516405026e-06, "loss": 15.0222, "step": 4040 }, { "epoch": 13.02250803858521, "grad_norm": 39.42076110839844, "learning_rate": 9.764996208503079e-06, "loss": 14.2946, "step": 4050 }, { "epoch": 13.054662379421222, "grad_norm": 43.09572982788086, "learning_rate": 9.763499275149874e-06, "loss": 14.5018, "step": 4060 }, { "epoch": 13.086816720257234, "grad_norm": 55.481815338134766, "learning_rate": 9.761997717968852e-06, "loss": 14.1465, "step": 4070 }, { "epoch": 13.118971061093248, "grad_norm": 53.61174011230469, "learning_rate": 9.76049153858848e-06, "loss": 13.4, "step": 4080 }, { "epoch": 13.15112540192926, "grad_norm": 43.636539459228516, "learning_rate": 9.75898073864223e-06, "loss": 13.7836, "step": 4090 }, { "epoch": 13.183279742765274, "grad_norm": 42.249969482421875, "learning_rate": 9.757465319768584e-06, "loss": 13.9679, "step": 4100 }, { "epoch": 13.183279742765274, "eval_runtime": 5.3065, "eval_samples_per_second": 9.422, "eval_steps_per_second": 1.319, "step": 4100 }, { "epoch": 13.215434083601286, "grad_norm": 43.19422149658203, "learning_rate": 9.75594528361104e-06, "loss": 14.6132, "step": 4110 }, { "epoch": 13.2475884244373, "grad_norm": 47.03890609741211, "learning_rate": 9.754420631818097e-06, "loss": 13.7951, "step": 4120 }, { "epoch": 13.279742765273312, "grad_norm": 44.761940002441406, "learning_rate": 9.752891366043264e-06, "loss": 13.2658, "step": 4130 }, { "epoch": 13.311897106109324, "grad_norm": 48.1104850769043, "learning_rate": 9.751357487945049e-06, "loss": 14.0039, "step": 4140 }, { "epoch": 13.344051446945338, "grad_norm": 49.26441192626953, "learning_rate": 9.749818999186968e-06, "loss": 13.5097, "step": 4150 }, { "epoch": 13.37620578778135, "grad_norm": 54.5966911315918, "learning_rate": 9.748275901437537e-06, "loss": 13.1336, "step": 4160 }, { "epoch": 13.408360128617364, "grad_norm": 51.42368698120117, "learning_rate": 9.746728196370265e-06, "loss": 13.7998, "step": 4170 }, { "epoch": 13.440514469453376, "grad_norm": 52.13337707519531, "learning_rate": 9.74517588566366e-06, "loss": 13.7565, "step": 4180 }, { "epoch": 13.47266881028939, "grad_norm": 40.15563201904297, "learning_rate": 9.743618971001229e-06, "loss": 14.0668, "step": 4190 }, { "epoch": 13.504823151125402, "grad_norm": 41.08787536621094, "learning_rate": 9.742057454071466e-06, "loss": 13.9121, "step": 4200 }, { "epoch": 13.504823151125402, "eval_runtime": 5.2832, "eval_samples_per_second": 9.464, "eval_steps_per_second": 1.325, "step": 4200 }, { "epoch": 13.536977491961414, "grad_norm": 49.81306076049805, "learning_rate": 9.740491336567862e-06, "loss": 13.3083, "step": 4210 }, { "epoch": 13.569131832797428, "grad_norm": 49.36894607543945, "learning_rate": 9.738920620188893e-06, "loss": 14.1038, "step": 4220 }, { "epoch": 13.60128617363344, "grad_norm": 46.570045471191406, "learning_rate": 9.737345306638025e-06, "loss": 13.8598, "step": 4230 }, { "epoch": 13.633440514469454, "grad_norm": 42.7360725402832, "learning_rate": 9.735765397623707e-06, "loss": 13.9974, "step": 4240 }, { "epoch": 13.665594855305466, "grad_norm": 51.25929260253906, "learning_rate": 9.734180894859374e-06, "loss": 13.8204, "step": 4250 }, { "epoch": 13.69774919614148, "grad_norm": 43.31510543823242, "learning_rate": 9.732591800063443e-06, "loss": 13.1874, "step": 4260 }, { "epoch": 13.729903536977492, "grad_norm": 41.486324310302734, "learning_rate": 9.730998114959313e-06, "loss": 12.7544, "step": 4270 }, { "epoch": 13.762057877813504, "grad_norm": 51.980281829833984, "learning_rate": 9.729399841275355e-06, "loss": 14.0642, "step": 4280 }, { "epoch": 13.794212218649518, "grad_norm": 42.551422119140625, "learning_rate": 9.727796980744924e-06, "loss": 13.6235, "step": 4290 }, { "epoch": 13.82636655948553, "grad_norm": 49.038841247558594, "learning_rate": 9.726189535106342e-06, "loss": 13.8284, "step": 4300 }, { "epoch": 13.82636655948553, "eval_runtime": 5.3185, "eval_samples_per_second": 9.401, "eval_steps_per_second": 1.316, "step": 4300 }, { "epoch": 13.858520900321544, "grad_norm": 47.276588439941406, "learning_rate": 9.724577506102912e-06, "loss": 14.1296, "step": 4310 }, { "epoch": 13.890675241157556, "grad_norm": 40.3382682800293, "learning_rate": 9.722960895482899e-06, "loss": 13.5219, "step": 4320 }, { "epoch": 13.92282958199357, "grad_norm": 45.87374496459961, "learning_rate": 9.721339704999543e-06, "loss": 14.2253, "step": 4330 }, { "epoch": 13.954983922829582, "grad_norm": 48.910179138183594, "learning_rate": 9.71971393641105e-06, "loss": 13.7282, "step": 4340 }, { "epoch": 13.987138263665594, "grad_norm": 42.45161819458008, "learning_rate": 9.718083591480587e-06, "loss": 13.8043, "step": 4350 }, { "epoch": 14.019292604501608, "grad_norm": 51.504878997802734, "learning_rate": 9.716448671976291e-06, "loss": 13.6057, "step": 4360 }, { "epoch": 14.05144694533762, "grad_norm": 51.580989837646484, "learning_rate": 9.714809179671253e-06, "loss": 13.1361, "step": 4370 }, { "epoch": 14.083601286173634, "grad_norm": 41.577484130859375, "learning_rate": 9.713165116343529e-06, "loss": 13.2498, "step": 4380 }, { "epoch": 14.115755627009646, "grad_norm": 42.91811752319336, "learning_rate": 9.711516483776128e-06, "loss": 13.3177, "step": 4390 }, { "epoch": 14.14790996784566, "grad_norm": 55.4658203125, "learning_rate": 9.709863283757017e-06, "loss": 13.6656, "step": 4400 }, { "epoch": 14.14790996784566, "eval_runtime": 5.2969, "eval_samples_per_second": 9.439, "eval_steps_per_second": 1.322, "step": 4400 }, { "epoch": 14.180064308681672, "grad_norm": 49.35305404663086, "learning_rate": 9.708205518079117e-06, "loss": 13.3078, "step": 4410 }, { "epoch": 14.212218649517684, "grad_norm": 45.5296630859375, "learning_rate": 9.706543188540297e-06, "loss": 13.5803, "step": 4420 }, { "epoch": 14.244372990353698, "grad_norm": 48.44739532470703, "learning_rate": 9.70487629694338e-06, "loss": 12.5461, "step": 4430 }, { "epoch": 14.27652733118971, "grad_norm": 60.19660186767578, "learning_rate": 9.703204845096133e-06, "loss": 13.3434, "step": 4440 }, { "epoch": 14.308681672025724, "grad_norm": 52.12981414794922, "learning_rate": 9.70152883481127e-06, "loss": 12.9722, "step": 4450 }, { "epoch": 14.340836012861736, "grad_norm": 46.14937210083008, "learning_rate": 9.699848267906451e-06, "loss": 12.6258, "step": 4460 }, { "epoch": 14.37299035369775, "grad_norm": 53.3614616394043, "learning_rate": 9.698163146204274e-06, "loss": 13.1836, "step": 4470 }, { "epoch": 14.405144694533762, "grad_norm": 61.66213607788086, "learning_rate": 9.696473471532278e-06, "loss": 13.4708, "step": 4480 }, { "epoch": 14.437299035369774, "grad_norm": 47.458370208740234, "learning_rate": 9.69477924572294e-06, "loss": 12.8135, "step": 4490 }, { "epoch": 14.469453376205788, "grad_norm": 57.72317886352539, "learning_rate": 9.693080470613676e-06, "loss": 12.6388, "step": 4500 }, { "epoch": 14.469453376205788, "eval_runtime": 5.369, "eval_samples_per_second": 9.313, "eval_steps_per_second": 1.304, "step": 4500 }, { "epoch": 14.5016077170418, "grad_norm": 44.399166107177734, "learning_rate": 9.691377148046828e-06, "loss": 12.7872, "step": 4510 }, { "epoch": 14.533762057877814, "grad_norm": 50.65188217163086, "learning_rate": 9.689669279869677e-06, "loss": 13.7343, "step": 4520 }, { "epoch": 14.565916398713826, "grad_norm": 48.098323822021484, "learning_rate": 9.687956867934431e-06, "loss": 13.5257, "step": 4530 }, { "epoch": 14.59807073954984, "grad_norm": 56.8110466003418, "learning_rate": 9.686239914098225e-06, "loss": 12.923, "step": 4540 }, { "epoch": 14.630225080385852, "grad_norm": 48.01587677001953, "learning_rate": 9.684518420223124e-06, "loss": 13.8433, "step": 4550 }, { "epoch": 14.662379421221864, "grad_norm": 43.76228332519531, "learning_rate": 9.68279238817611e-06, "loss": 12.5531, "step": 4560 }, { "epoch": 14.694533762057878, "grad_norm": 54.67728805541992, "learning_rate": 9.68106181982909e-06, "loss": 12.6262, "step": 4570 }, { "epoch": 14.72668810289389, "grad_norm": 49.625709533691406, "learning_rate": 9.679326717058893e-06, "loss": 12.9982, "step": 4580 }, { "epoch": 14.758842443729904, "grad_norm": 46.10453414916992, "learning_rate": 9.677587081747261e-06, "loss": 12.9123, "step": 4590 }, { "epoch": 14.790996784565916, "grad_norm": 51.23402786254883, "learning_rate": 9.675842915780858e-06, "loss": 13.5719, "step": 4600 }, { "epoch": 14.790996784565916, "eval_runtime": 5.3656, "eval_samples_per_second": 9.319, "eval_steps_per_second": 1.305, "step": 4600 }, { "epoch": 14.82315112540193, "grad_norm": 43.96033477783203, "learning_rate": 9.674094221051255e-06, "loss": 13.4374, "step": 4610 }, { "epoch": 14.855305466237942, "grad_norm": 48.21479034423828, "learning_rate": 9.672340999454937e-06, "loss": 13.2253, "step": 4620 }, { "epoch": 14.887459807073956, "grad_norm": 52.33036804199219, "learning_rate": 9.6705832528933e-06, "loss": 13.3778, "step": 4630 }, { "epoch": 14.919614147909968, "grad_norm": 56.59408950805664, "learning_rate": 9.668820983272645e-06, "loss": 13.0073, "step": 4640 }, { "epoch": 14.95176848874598, "grad_norm": 44.571353912353516, "learning_rate": 9.667054192504179e-06, "loss": 12.69, "step": 4650 }, { "epoch": 14.983922829581994, "grad_norm": 42.88235855102539, "learning_rate": 9.665282882504015e-06, "loss": 12.9091, "step": 4660 }, { "epoch": 15.016077170418006, "grad_norm": 45.16333770751953, "learning_rate": 9.663507055193162e-06, "loss": 12.3788, "step": 4670 }, { "epoch": 15.04823151125402, "grad_norm": 49.29978942871094, "learning_rate": 9.661726712497532e-06, "loss": 12.2129, "step": 4680 }, { "epoch": 15.080385852090032, "grad_norm": 41.6961669921875, "learning_rate": 9.659941856347934e-06, "loss": 12.5573, "step": 4690 }, { "epoch": 15.112540192926046, "grad_norm": 47.20383834838867, "learning_rate": 9.658152488680069e-06, "loss": 12.6664, "step": 4700 }, { "epoch": 15.112540192926046, "eval_runtime": 5.3669, "eval_samples_per_second": 9.316, "eval_steps_per_second": 1.304, "step": 4700 }, { "epoch": 15.144694533762058, "grad_norm": 51.68077850341797, "learning_rate": 9.656358611434536e-06, "loss": 12.6621, "step": 4710 }, { "epoch": 15.17684887459807, "grad_norm": 46.28759002685547, "learning_rate": 9.654560226556815e-06, "loss": 12.2395, "step": 4720 }, { "epoch": 15.209003215434084, "grad_norm": 42.1481819152832, "learning_rate": 9.652757335997286e-06, "loss": 12.475, "step": 4730 }, { "epoch": 15.241157556270096, "grad_norm": 57.05293273925781, "learning_rate": 9.65094994171121e-06, "loss": 12.4885, "step": 4740 }, { "epoch": 15.27331189710611, "grad_norm": 46.24205780029297, "learning_rate": 9.649138045658731e-06, "loss": 11.8126, "step": 4750 }, { "epoch": 15.305466237942122, "grad_norm": 48.606319427490234, "learning_rate": 9.647321649804879e-06, "loss": 12.09, "step": 4760 }, { "epoch": 15.337620578778136, "grad_norm": 43.13861083984375, "learning_rate": 9.645500756119559e-06, "loss": 12.7528, "step": 4770 }, { "epoch": 15.369774919614148, "grad_norm": 55.4376335144043, "learning_rate": 9.643675366577559e-06, "loss": 13.1092, "step": 4780 }, { "epoch": 15.40192926045016, "grad_norm": 50.24001693725586, "learning_rate": 9.641845483158541e-06, "loss": 12.4875, "step": 4790 }, { "epoch": 15.434083601286174, "grad_norm": 42.600669860839844, "learning_rate": 9.640011107847043e-06, "loss": 12.5747, "step": 4800 }, { "epoch": 15.434083601286174, "eval_runtime": 5.3208, "eval_samples_per_second": 9.397, "eval_steps_per_second": 1.316, "step": 4800 }, { "epoch": 15.466237942122186, "grad_norm": 63.25564956665039, "learning_rate": 9.638172242632466e-06, "loss": 12.3523, "step": 4810 }, { "epoch": 15.4983922829582, "grad_norm": 53.0198974609375, "learning_rate": 9.636328889509091e-06, "loss": 12.9975, "step": 4820 }, { "epoch": 15.530546623794212, "grad_norm": 44.917457580566406, "learning_rate": 9.63448105047606e-06, "loss": 13.3548, "step": 4830 }, { "epoch": 15.562700964630226, "grad_norm": 47.41967010498047, "learning_rate": 9.632628727537384e-06, "loss": 12.9612, "step": 4840 }, { "epoch": 15.594855305466238, "grad_norm": 52.9598388671875, "learning_rate": 9.630771922701931e-06, "loss": 12.6249, "step": 4850 }, { "epoch": 15.62700964630225, "grad_norm": 55.8404541015625, "learning_rate": 9.628910637983436e-06, "loss": 12.0022, "step": 4860 }, { "epoch": 15.659163987138264, "grad_norm": 41.96122741699219, "learning_rate": 9.627044875400487e-06, "loss": 12.3223, "step": 4870 }, { "epoch": 15.691318327974276, "grad_norm": 42.56925582885742, "learning_rate": 9.625174636976535e-06, "loss": 12.3486, "step": 4880 }, { "epoch": 15.72347266881029, "grad_norm": 46.75522232055664, "learning_rate": 9.623299924739876e-06, "loss": 12.1986, "step": 4890 }, { "epoch": 15.755627009646302, "grad_norm": 47.262176513671875, "learning_rate": 9.621420740723667e-06, "loss": 11.8861, "step": 4900 }, { "epoch": 15.755627009646302, "eval_runtime": 5.403, "eval_samples_per_second": 9.254, "eval_steps_per_second": 1.296, "step": 4900 }, { "epoch": 15.787781350482316, "grad_norm": 48.26375198364258, "learning_rate": 9.619537086965909e-06, "loss": 13.0045, "step": 4910 }, { "epoch": 15.819935691318328, "grad_norm": 58.158538818359375, "learning_rate": 9.617648965509454e-06, "loss": 12.3026, "step": 4920 }, { "epoch": 15.85209003215434, "grad_norm": 47.09524154663086, "learning_rate": 9.615756378401994e-06, "loss": 12.3806, "step": 4930 }, { "epoch": 15.884244372990354, "grad_norm": 47.47500228881836, "learning_rate": 9.613859327696072e-06, "loss": 12.659, "step": 4940 }, { "epoch": 15.916398713826366, "grad_norm": 51.31810760498047, "learning_rate": 9.611957815449067e-06, "loss": 12.4747, "step": 4950 }, { "epoch": 15.94855305466238, "grad_norm": 57.17998123168945, "learning_rate": 9.610051843723192e-06, "loss": 12.965, "step": 4960 }, { "epoch": 15.980707395498392, "grad_norm": 50.38270568847656, "learning_rate": 9.608141414585508e-06, "loss": 12.4585, "step": 4970 }, { "epoch": 16.012861736334404, "grad_norm": 46.389408111572266, "learning_rate": 9.6062265301079e-06, "loss": 11.8497, "step": 4980 }, { "epoch": 16.04501607717042, "grad_norm": 45.911376953125, "learning_rate": 9.604307192367087e-06, "loss": 12.0841, "step": 4990 }, { "epoch": 16.077170418006432, "grad_norm": 41.935237884521484, "learning_rate": 9.602383403444622e-06, "loss": 11.6601, "step": 5000 }, { "epoch": 16.077170418006432, "eval_runtime": 5.3953, "eval_samples_per_second": 9.267, "eval_steps_per_second": 1.297, "step": 5000 }, { "epoch": 16.109324758842444, "grad_norm": 47.621158599853516, "learning_rate": 9.600455165426882e-06, "loss": 11.8249, "step": 5010 }, { "epoch": 16.141479099678456, "grad_norm": 43.822696685791016, "learning_rate": 9.59852248040507e-06, "loss": 12.4154, "step": 5020 }, { "epoch": 16.173633440514468, "grad_norm": 48.92320251464844, "learning_rate": 9.596585350475208e-06, "loss": 11.2804, "step": 5030 }, { "epoch": 16.205787781350484, "grad_norm": 51.55178451538086, "learning_rate": 9.594643777738146e-06, "loss": 11.9014, "step": 5040 }, { "epoch": 16.237942122186496, "grad_norm": 43.94123077392578, "learning_rate": 9.592697764299547e-06, "loss": 11.4866, "step": 5050 }, { "epoch": 16.270096463022508, "grad_norm": 45.77236557006836, "learning_rate": 9.59074731226989e-06, "loss": 12.157, "step": 5060 }, { "epoch": 16.30225080385852, "grad_norm": 49.68130874633789, "learning_rate": 9.58879242376447e-06, "loss": 11.3667, "step": 5070 }, { "epoch": 16.334405144694532, "grad_norm": 46.00023651123047, "learning_rate": 9.586833100903392e-06, "loss": 12.1255, "step": 5080 }, { "epoch": 16.366559485530548, "grad_norm": 47.52351379394531, "learning_rate": 9.584869345811572e-06, "loss": 11.9583, "step": 5090 }, { "epoch": 16.39871382636656, "grad_norm": 45.236446380615234, "learning_rate": 9.58290116061873e-06, "loss": 11.9231, "step": 5100 }, { "epoch": 16.39871382636656, "eval_runtime": 5.4747, "eval_samples_per_second": 9.133, "eval_steps_per_second": 1.279, "step": 5100 }, { "epoch": 16.430868167202572, "grad_norm": 60.21353530883789, "learning_rate": 9.580928547459391e-06, "loss": 12.3092, "step": 5110 }, { "epoch": 16.463022508038584, "grad_norm": 53.243038177490234, "learning_rate": 9.578951508472884e-06, "loss": 11.5951, "step": 5120 }, { "epoch": 16.4951768488746, "grad_norm": 50.102027893066406, "learning_rate": 9.576970045803338e-06, "loss": 11.5593, "step": 5130 }, { "epoch": 16.527331189710612, "grad_norm": 45.714664459228516, "learning_rate": 9.574984161599677e-06, "loss": 12.0302, "step": 5140 }, { "epoch": 16.559485530546624, "grad_norm": 50.59874725341797, "learning_rate": 9.572993858015623e-06, "loss": 12.2591, "step": 5150 }, { "epoch": 16.591639871382636, "grad_norm": 59.64794158935547, "learning_rate": 9.570999137209687e-06, "loss": 12.3109, "step": 5160 }, { "epoch": 16.623794212218648, "grad_norm": 58.02330017089844, "learning_rate": 9.569000001345177e-06, "loss": 12.4381, "step": 5170 }, { "epoch": 16.655948553054664, "grad_norm": 56.31669616699219, "learning_rate": 9.56699645259018e-06, "loss": 12.1157, "step": 5180 }, { "epoch": 16.688102893890676, "grad_norm": 51.5637321472168, "learning_rate": 9.56498849311758e-06, "loss": 11.6235, "step": 5190 }, { "epoch": 16.720257234726688, "grad_norm": 53.77766036987305, "learning_rate": 9.562976125105037e-06, "loss": 11.6102, "step": 5200 }, { "epoch": 16.720257234726688, "eval_runtime": 5.2711, "eval_samples_per_second": 9.486, "eval_steps_per_second": 1.328, "step": 5200 }, { "epoch": 16.7524115755627, "grad_norm": 47.7010383605957, "learning_rate": 9.560959350734991e-06, "loss": 12.1422, "step": 5210 }, { "epoch": 16.784565916398712, "grad_norm": 49.0406494140625, "learning_rate": 9.558938172194664e-06, "loss": 11.1553, "step": 5220 }, { "epoch": 16.816720257234728, "grad_norm": 56.12392044067383, "learning_rate": 9.556912591676057e-06, "loss": 12.2848, "step": 5230 }, { "epoch": 16.84887459807074, "grad_norm": 51.17445373535156, "learning_rate": 9.55488261137594e-06, "loss": 11.4488, "step": 5240 }, { "epoch": 16.881028938906752, "grad_norm": 46.27265167236328, "learning_rate": 9.552848233495858e-06, "loss": 11.0959, "step": 5250 }, { "epoch": 16.913183279742764, "grad_norm": 48.56627655029297, "learning_rate": 9.55080946024212e-06, "loss": 12.2105, "step": 5260 }, { "epoch": 16.94533762057878, "grad_norm": 57.16343688964844, "learning_rate": 9.548766293825808e-06, "loss": 11.4549, "step": 5270 }, { "epoch": 16.977491961414792, "grad_norm": 51.33845138549805, "learning_rate": 9.54671873646277e-06, "loss": 11.8203, "step": 5280 }, { "epoch": 17.009646302250804, "grad_norm": 57.24526596069336, "learning_rate": 9.544666790373605e-06, "loss": 11.0662, "step": 5290 }, { "epoch": 17.041800643086816, "grad_norm": 53.52762985229492, "learning_rate": 9.542610457783683e-06, "loss": 10.9998, "step": 5300 }, { "epoch": 17.041800643086816, "eval_runtime": 5.3438, "eval_samples_per_second": 9.357, "eval_steps_per_second": 1.31, "step": 5300 }, { "epoch": 17.073954983922828, "grad_norm": 54.95735168457031, "learning_rate": 9.540549740923126e-06, "loss": 11.4855, "step": 5310 }, { "epoch": 17.106109324758844, "grad_norm": 68.64714050292969, "learning_rate": 9.538484642026811e-06, "loss": 11.1134, "step": 5320 }, { "epoch": 17.138263665594856, "grad_norm": 61.943172454833984, "learning_rate": 9.536415163334371e-06, "loss": 10.9785, "step": 5330 }, { "epoch": 17.170418006430868, "grad_norm": 54.304473876953125, "learning_rate": 9.534341307090184e-06, "loss": 11.2546, "step": 5340 }, { "epoch": 17.20257234726688, "grad_norm": 42.46025848388672, "learning_rate": 9.532263075543378e-06, "loss": 11.3133, "step": 5350 }, { "epoch": 17.234726688102892, "grad_norm": 55.20064926147461, "learning_rate": 9.530180470947826e-06, "loss": 11.1336, "step": 5360 }, { "epoch": 17.266881028938908, "grad_norm": 55.996150970458984, "learning_rate": 9.528093495562144e-06, "loss": 11.5219, "step": 5370 }, { "epoch": 17.29903536977492, "grad_norm": 60.711082458496094, "learning_rate": 9.526002151649684e-06, "loss": 12.057, "step": 5380 }, { "epoch": 17.331189710610932, "grad_norm": 61.25688934326172, "learning_rate": 9.523906441478544e-06, "loss": 11.0988, "step": 5390 }, { "epoch": 17.363344051446944, "grad_norm": 62.150657653808594, "learning_rate": 9.52180636732155e-06, "loss": 11.778, "step": 5400 }, { "epoch": 17.363344051446944, "eval_runtime": 5.3248, "eval_samples_per_second": 9.39, "eval_steps_per_second": 1.315, "step": 5400 }, { "epoch": 17.39549839228296, "grad_norm": 49.9788703918457, "learning_rate": 9.519701931456264e-06, "loss": 11.3253, "step": 5410 }, { "epoch": 17.427652733118972, "grad_norm": 50.869529724121094, "learning_rate": 9.517593136164977e-06, "loss": 11.4674, "step": 5420 }, { "epoch": 17.459807073954984, "grad_norm": 58.9503059387207, "learning_rate": 9.515479983734712e-06, "loss": 11.1915, "step": 5430 }, { "epoch": 17.491961414790996, "grad_norm": 56.844329833984375, "learning_rate": 9.513362476457209e-06, "loss": 10.9972, "step": 5440 }, { "epoch": 17.524115755627008, "grad_norm": 54.84030532836914, "learning_rate": 9.511240616628938e-06, "loss": 10.8704, "step": 5450 }, { "epoch": 17.556270096463024, "grad_norm": 45.1226806640625, "learning_rate": 9.509114406551088e-06, "loss": 10.3191, "step": 5460 }, { "epoch": 17.588424437299036, "grad_norm": 49.96663284301758, "learning_rate": 9.506983848529562e-06, "loss": 10.4972, "step": 5470 }, { "epoch": 17.620578778135048, "grad_norm": 53.029090881347656, "learning_rate": 9.504848944874987e-06, "loss": 10.8375, "step": 5480 }, { "epoch": 17.65273311897106, "grad_norm": 52.83979415893555, "learning_rate": 9.502709697902696e-06, "loss": 11.5219, "step": 5490 }, { "epoch": 17.684887459807072, "grad_norm": 51.812713623046875, "learning_rate": 9.500566109932729e-06, "loss": 10.7833, "step": 5500 }, { "epoch": 17.684887459807072, "eval_runtime": 5.3239, "eval_samples_per_second": 9.392, "eval_steps_per_second": 1.315, "step": 5500 }, { "epoch": 17.717041800643088, "grad_norm": 51.39299774169922, "learning_rate": 9.498418183289844e-06, "loss": 11.7384, "step": 5510 }, { "epoch": 17.7491961414791, "grad_norm": 58.02050018310547, "learning_rate": 9.496265920303498e-06, "loss": 11.4004, "step": 5520 }, { "epoch": 17.781350482315112, "grad_norm": 59.29935836791992, "learning_rate": 9.49410932330785e-06, "loss": 10.9842, "step": 5530 }, { "epoch": 17.813504823151124, "grad_norm": 54.1755256652832, "learning_rate": 9.491948394641762e-06, "loss": 10.7042, "step": 5540 }, { "epoch": 17.84565916398714, "grad_norm": 50.73415756225586, "learning_rate": 9.489783136648795e-06, "loss": 10.4692, "step": 5550 }, { "epoch": 17.877813504823152, "grad_norm": 60.59455108642578, "learning_rate": 9.487613551677201e-06, "loss": 11.5964, "step": 5560 }, { "epoch": 17.909967845659164, "grad_norm": 48.66271209716797, "learning_rate": 9.485439642079928e-06, "loss": 11.421, "step": 5570 }, { "epoch": 17.942122186495176, "grad_norm": 48.67372131347656, "learning_rate": 9.483261410214612e-06, "loss": 11.2094, "step": 5580 }, { "epoch": 17.974276527331188, "grad_norm": 55.46310806274414, "learning_rate": 9.481078858443578e-06, "loss": 10.8483, "step": 5590 }, { "epoch": 18.006430868167204, "grad_norm": 54.21710968017578, "learning_rate": 9.478891989133836e-06, "loss": 11.4621, "step": 5600 }, { "epoch": 18.006430868167204, "eval_runtime": 5.3232, "eval_samples_per_second": 9.393, "eval_steps_per_second": 1.315, "step": 5600 }, { "epoch": 18.038585209003216, "grad_norm": 51.18660354614258, "learning_rate": 9.476700804657077e-06, "loss": 10.5623, "step": 5610 }, { "epoch": 18.070739549839228, "grad_norm": 47.81584930419922, "learning_rate": 9.474505307389675e-06, "loss": 11.1587, "step": 5620 }, { "epoch": 18.10289389067524, "grad_norm": 47.375091552734375, "learning_rate": 9.472305499712677e-06, "loss": 10.6037, "step": 5630 }, { "epoch": 18.135048231511256, "grad_norm": 60.6822395324707, "learning_rate": 9.470101384011808e-06, "loss": 11.0627, "step": 5640 }, { "epoch": 18.167202572347268, "grad_norm": 48.55972671508789, "learning_rate": 9.467892962677462e-06, "loss": 10.7964, "step": 5650 }, { "epoch": 18.19935691318328, "grad_norm": 49.35833740234375, "learning_rate": 9.465680238104709e-06, "loss": 10.2958, "step": 5660 }, { "epoch": 18.231511254019292, "grad_norm": 61.94197463989258, "learning_rate": 9.463463212693277e-06, "loss": 11.1513, "step": 5670 }, { "epoch": 18.263665594855304, "grad_norm": 48.617218017578125, "learning_rate": 9.461241888847563e-06, "loss": 10.5154, "step": 5680 }, { "epoch": 18.29581993569132, "grad_norm": 64.13507843017578, "learning_rate": 9.459016268976628e-06, "loss": 10.7931, "step": 5690 }, { "epoch": 18.327974276527332, "grad_norm": 51.7435417175293, "learning_rate": 9.456786355494189e-06, "loss": 10.4584, "step": 5700 }, { "epoch": 18.327974276527332, "eval_runtime": 5.2779, "eval_samples_per_second": 9.473, "eval_steps_per_second": 1.326, "step": 5700 }, { "epoch": 18.360128617363344, "grad_norm": 56.216087341308594, "learning_rate": 9.454552150818617e-06, "loss": 10.9169, "step": 5710 }, { "epoch": 18.392282958199356, "grad_norm": 58.084556579589844, "learning_rate": 9.452313657372944e-06, "loss": 10.2755, "step": 5720 }, { "epoch": 18.424437299035368, "grad_norm": 51.023536682128906, "learning_rate": 9.450070877584847e-06, "loss": 10.5232, "step": 5730 }, { "epoch": 18.456591639871384, "grad_norm": 49.9437370300293, "learning_rate": 9.447823813886655e-06, "loss": 10.9118, "step": 5740 }, { "epoch": 18.488745980707396, "grad_norm": 46.36228561401367, "learning_rate": 9.445572468715337e-06, "loss": 10.4203, "step": 5750 }, { "epoch": 18.520900321543408, "grad_norm": 57.828556060791016, "learning_rate": 9.443316844512518e-06, "loss": 9.9389, "step": 5760 }, { "epoch": 18.55305466237942, "grad_norm": 52.90595626831055, "learning_rate": 9.44105694372445e-06, "loss": 10.7033, "step": 5770 }, { "epoch": 18.585209003215436, "grad_norm": 74.65838623046875, "learning_rate": 9.438792768802031e-06, "loss": 10.6001, "step": 5780 }, { "epoch": 18.617363344051448, "grad_norm": 54.788700103759766, "learning_rate": 9.43652432220079e-06, "loss": 10.7125, "step": 5790 }, { "epoch": 18.64951768488746, "grad_norm": 50.16817092895508, "learning_rate": 9.434251606380893e-06, "loss": 10.4789, "step": 5800 }, { "epoch": 18.64951768488746, "eval_runtime": 5.3011, "eval_samples_per_second": 9.432, "eval_steps_per_second": 1.32, "step": 5800 }, { "epoch": 18.681672025723472, "grad_norm": 53.58561325073242, "learning_rate": 9.431974623807132e-06, "loss": 9.9671, "step": 5810 }, { "epoch": 18.713826366559484, "grad_norm": 57.45869064331055, "learning_rate": 9.42969337694893e-06, "loss": 10.2129, "step": 5820 }, { "epoch": 18.7459807073955, "grad_norm": 46.13480758666992, "learning_rate": 9.427407868280331e-06, "loss": 10.5959, "step": 5830 }, { "epoch": 18.778135048231512, "grad_norm": 72.66554260253906, "learning_rate": 9.425118100280003e-06, "loss": 10.8932, "step": 5840 }, { "epoch": 18.810289389067524, "grad_norm": 53.599971771240234, "learning_rate": 9.422824075431238e-06, "loss": 10.75, "step": 5850 }, { "epoch": 18.842443729903536, "grad_norm": 49.64387130737305, "learning_rate": 9.420525796221932e-06, "loss": 9.7145, "step": 5860 }, { "epoch": 18.874598070739548, "grad_norm": 45.53738021850586, "learning_rate": 9.418223265144608e-06, "loss": 10.5637, "step": 5870 }, { "epoch": 18.906752411575564, "grad_norm": 58.98841094970703, "learning_rate": 9.41591648469639e-06, "loss": 10.7278, "step": 5880 }, { "epoch": 18.938906752411576, "grad_norm": 61.08856964111328, "learning_rate": 9.41360545737902e-06, "loss": 10.1988, "step": 5890 }, { "epoch": 18.971061093247588, "grad_norm": 66.69367980957031, "learning_rate": 9.411290185698843e-06, "loss": 10.483, "step": 5900 }, { "epoch": 18.971061093247588, "eval_runtime": 5.3016, "eval_samples_per_second": 9.431, "eval_steps_per_second": 1.32, "step": 5900 }, { "epoch": 19.0032154340836, "grad_norm": 60.92210006713867, "learning_rate": 9.408970672166799e-06, "loss": 10.9087, "step": 5910 }, { "epoch": 19.035369774919616, "grad_norm": 57.48671340942383, "learning_rate": 9.406646919298439e-06, "loss": 10.0415, "step": 5920 }, { "epoch": 19.067524115755628, "grad_norm": 51.680564880371094, "learning_rate": 9.404318929613906e-06, "loss": 10.1596, "step": 5930 }, { "epoch": 19.09967845659164, "grad_norm": 73.64752197265625, "learning_rate": 9.401986705637938e-06, "loss": 10.3127, "step": 5940 }, { "epoch": 19.131832797427652, "grad_norm": 57.87544631958008, "learning_rate": 9.399650249899869e-06, "loss": 10.2914, "step": 5950 }, { "epoch": 19.163987138263664, "grad_norm": 57.10334396362305, "learning_rate": 9.397309564933617e-06, "loss": 10.0837, "step": 5960 }, { "epoch": 19.19614147909968, "grad_norm": 70.41741180419922, "learning_rate": 9.39496465327769e-06, "loss": 9.9919, "step": 5970 }, { "epoch": 19.228295819935692, "grad_norm": 55.16864776611328, "learning_rate": 9.39261551747518e-06, "loss": 9.5109, "step": 5980 }, { "epoch": 19.260450160771704, "grad_norm": 54.571807861328125, "learning_rate": 9.390262160073758e-06, "loss": 9.8488, "step": 5990 }, { "epoch": 19.292604501607716, "grad_norm": 57.01283264160156, "learning_rate": 9.387904583625678e-06, "loss": 10.0752, "step": 6000 }, { "epoch": 19.292604501607716, "eval_runtime": 5.2554, "eval_samples_per_second": 9.514, "eval_steps_per_second": 1.332, "step": 6000 }, { "epoch": 19.324758842443728, "grad_norm": 70.93519592285156, "learning_rate": 9.385542790687763e-06, "loss": 9.9351, "step": 6010 }, { "epoch": 19.356913183279744, "grad_norm": 56.50486373901367, "learning_rate": 9.383176783821416e-06, "loss": 9.8069, "step": 6020 }, { "epoch": 19.389067524115756, "grad_norm": 67.0200424194336, "learning_rate": 9.380806565592601e-06, "loss": 9.6098, "step": 6030 }, { "epoch": 19.421221864951768, "grad_norm": 59.967620849609375, "learning_rate": 9.378432138571858e-06, "loss": 10.527, "step": 6040 }, { "epoch": 19.45337620578778, "grad_norm": 63.3838005065918, "learning_rate": 9.376053505334288e-06, "loss": 10.1927, "step": 6050 }, { "epoch": 19.485530546623796, "grad_norm": 62.784019470214844, "learning_rate": 9.373670668459553e-06, "loss": 10.5373, "step": 6060 }, { "epoch": 19.517684887459808, "grad_norm": 55.19639205932617, "learning_rate": 9.371283630531876e-06, "loss": 9.5571, "step": 6070 }, { "epoch": 19.54983922829582, "grad_norm": 69.46549224853516, "learning_rate": 9.368892394140034e-06, "loss": 9.9361, "step": 6080 }, { "epoch": 19.581993569131832, "grad_norm": 61.49732208251953, "learning_rate": 9.366496961877355e-06, "loss": 10.3161, "step": 6090 }, { "epoch": 19.614147909967844, "grad_norm": 58.494598388671875, "learning_rate": 9.364097336341727e-06, "loss": 9.9703, "step": 6100 }, { "epoch": 19.614147909967844, "eval_runtime": 5.3482, "eval_samples_per_second": 9.349, "eval_steps_per_second": 1.309, "step": 6100 }, { "epoch": 19.64630225080386, "grad_norm": 48.6429557800293, "learning_rate": 9.361693520135575e-06, "loss": 9.8478, "step": 6110 }, { "epoch": 19.678456591639872, "grad_norm": 60.25499725341797, "learning_rate": 9.359285515865873e-06, "loss": 9.8925, "step": 6120 }, { "epoch": 19.710610932475884, "grad_norm": 52.251930236816406, "learning_rate": 9.35687332614414e-06, "loss": 10.5841, "step": 6130 }, { "epoch": 19.742765273311896, "grad_norm": 52.99293899536133, "learning_rate": 9.354456953586429e-06, "loss": 9.7042, "step": 6140 }, { "epoch": 19.77491961414791, "grad_norm": 47.859378814697266, "learning_rate": 9.35203640081333e-06, "loss": 9.8555, "step": 6150 }, { "epoch": 19.807073954983924, "grad_norm": 51.382625579833984, "learning_rate": 9.349611670449972e-06, "loss": 9.3117, "step": 6160 }, { "epoch": 19.839228295819936, "grad_norm": 49.26898193359375, "learning_rate": 9.34718276512601e-06, "loss": 9.3723, "step": 6170 }, { "epoch": 19.871382636655948, "grad_norm": 52.96788024902344, "learning_rate": 9.344749687475626e-06, "loss": 9.8715, "step": 6180 }, { "epoch": 19.90353697749196, "grad_norm": 53.44609069824219, "learning_rate": 9.342312440137528e-06, "loss": 10.2471, "step": 6190 }, { "epoch": 19.935691318327976, "grad_norm": 54.40663528442383, "learning_rate": 9.33987102575495e-06, "loss": 10.5, "step": 6200 }, { "epoch": 19.935691318327976, "eval_runtime": 5.3087, "eval_samples_per_second": 9.419, "eval_steps_per_second": 1.319, "step": 6200 }, { "epoch": 19.967845659163988, "grad_norm": 68.6010971069336, "learning_rate": 9.337425446975639e-06, "loss": 9.7353, "step": 6210 }, { "epoch": 20.0, "grad_norm": 52.1768684387207, "learning_rate": 9.334975706451863e-06, "loss": 9.9019, "step": 6220 }, { "epoch": 20.032154340836012, "grad_norm": 55.96736145019531, "learning_rate": 9.3325218068404e-06, "loss": 9.2253, "step": 6230 }, { "epoch": 20.064308681672024, "grad_norm": 62.114601135253906, "learning_rate": 9.33006375080254e-06, "loss": 9.2174, "step": 6240 }, { "epoch": 20.09646302250804, "grad_norm": 59.23468017578125, "learning_rate": 9.327601541004083e-06, "loss": 9.5407, "step": 6250 }, { "epoch": 20.128617363344052, "grad_norm": 53.87682342529297, "learning_rate": 9.325135180115331e-06, "loss": 10.0327, "step": 6260 }, { "epoch": 20.160771704180064, "grad_norm": 60.87358093261719, "learning_rate": 9.32266467081109e-06, "loss": 10.0087, "step": 6270 }, { "epoch": 20.192926045016076, "grad_norm": 50.26223373413086, "learning_rate": 9.320190015770661e-06, "loss": 9.8779, "step": 6280 }, { "epoch": 20.225080385852092, "grad_norm": 66.09937286376953, "learning_rate": 9.317711217677846e-06, "loss": 9.4498, "step": 6290 }, { "epoch": 20.257234726688104, "grad_norm": 52.84104919433594, "learning_rate": 9.315228279220937e-06, "loss": 10.0835, "step": 6300 }, { "epoch": 20.257234726688104, "eval_runtime": 5.2529, "eval_samples_per_second": 9.519, "eval_steps_per_second": 1.333, "step": 6300 }, { "epoch": 20.289389067524116, "grad_norm": 55.96133804321289, "learning_rate": 9.312741203092717e-06, "loss": 9.3506, "step": 6310 }, { "epoch": 20.321543408360128, "grad_norm": 64.66504669189453, "learning_rate": 9.310249991990458e-06, "loss": 9.2441, "step": 6320 }, { "epoch": 20.35369774919614, "grad_norm": 52.057647705078125, "learning_rate": 9.307754648615915e-06, "loss": 8.9303, "step": 6330 }, { "epoch": 20.385852090032156, "grad_norm": 57.54296875, "learning_rate": 9.305255175675323e-06, "loss": 9.8486, "step": 6340 }, { "epoch": 20.418006430868168, "grad_norm": 65.58293151855469, "learning_rate": 9.302751575879401e-06, "loss": 10.102, "step": 6350 }, { "epoch": 20.45016077170418, "grad_norm": 57.753334045410156, "learning_rate": 9.300243851943337e-06, "loss": 9.3894, "step": 6360 }, { "epoch": 20.482315112540192, "grad_norm": 62.4243049621582, "learning_rate": 9.297732006586792e-06, "loss": 9.3568, "step": 6370 }, { "epoch": 20.514469453376204, "grad_norm": 60.819114685058594, "learning_rate": 9.295216042533906e-06, "loss": 9.7041, "step": 6380 }, { "epoch": 20.54662379421222, "grad_norm": 52.42291259765625, "learning_rate": 9.292695962513274e-06, "loss": 9.4524, "step": 6390 }, { "epoch": 20.578778135048232, "grad_norm": 63.88450622558594, "learning_rate": 9.29017176925796e-06, "loss": 9.7861, "step": 6400 }, { "epoch": 20.578778135048232, "eval_runtime": 5.2676, "eval_samples_per_second": 9.492, "eval_steps_per_second": 1.329, "step": 6400 }, { "epoch": 20.610932475884244, "grad_norm": 56.74431610107422, "learning_rate": 9.287643465505493e-06, "loss": 9.0948, "step": 6410 }, { "epoch": 20.643086816720256, "grad_norm": 55.9102668762207, "learning_rate": 9.285111053997851e-06, "loss": 9.3674, "step": 6420 }, { "epoch": 20.675241157556272, "grad_norm": 56.914039611816406, "learning_rate": 9.282574537481474e-06, "loss": 9.3464, "step": 6430 }, { "epoch": 20.707395498392284, "grad_norm": 58.53620529174805, "learning_rate": 9.28003391870725e-06, "loss": 9.5911, "step": 6440 }, { "epoch": 20.739549839228296, "grad_norm": 56.804283142089844, "learning_rate": 9.27748920043052e-06, "loss": 9.3236, "step": 6450 }, { "epoch": 20.771704180064308, "grad_norm": 62.06853485107422, "learning_rate": 9.274940385411066e-06, "loss": 9.6743, "step": 6460 }, { "epoch": 20.80385852090032, "grad_norm": 56.94602966308594, "learning_rate": 9.272387476413118e-06, "loss": 9.656, "step": 6470 }, { "epoch": 20.836012861736336, "grad_norm": 64.08161163330078, "learning_rate": 9.269830476205341e-06, "loss": 9.4546, "step": 6480 }, { "epoch": 20.868167202572348, "grad_norm": 52.450836181640625, "learning_rate": 9.267269387560839e-06, "loss": 9.5842, "step": 6490 }, { "epoch": 20.90032154340836, "grad_norm": 57.53072738647461, "learning_rate": 9.264704213257154e-06, "loss": 9.0415, "step": 6500 }, { "epoch": 20.90032154340836, "eval_runtime": 5.3551, "eval_samples_per_second": 9.337, "eval_steps_per_second": 1.307, "step": 6500 }, { "epoch": 20.932475884244372, "grad_norm": 52.40592956542969, "learning_rate": 9.262134956076252e-06, "loss": 9.4433, "step": 6510 }, { "epoch": 20.964630225080384, "grad_norm": 58.261253356933594, "learning_rate": 9.25956161880453e-06, "loss": 9.3192, "step": 6520 }, { "epoch": 20.9967845659164, "grad_norm": 61.8966064453125, "learning_rate": 9.256984204232814e-06, "loss": 9.3181, "step": 6530 }, { "epoch": 21.028938906752412, "grad_norm": 50.421878814697266, "learning_rate": 9.254402715156344e-06, "loss": 8.9187, "step": 6540 }, { "epoch": 21.061093247588424, "grad_norm": 65.20902252197266, "learning_rate": 9.251817154374785e-06, "loss": 8.4673, "step": 6550 }, { "epoch": 21.093247588424436, "grad_norm": 56.05644989013672, "learning_rate": 9.249227524692216e-06, "loss": 8.9482, "step": 6560 }, { "epoch": 21.125401929260452, "grad_norm": 54.780670166015625, "learning_rate": 9.24663382891713e-06, "loss": 9.5204, "step": 6570 }, { "epoch": 21.157556270096464, "grad_norm": 57.659202575683594, "learning_rate": 9.244036069862422e-06, "loss": 8.9363, "step": 6580 }, { "epoch": 21.189710610932476, "grad_norm": 58.41425704956055, "learning_rate": 9.241434250345406e-06, "loss": 8.9345, "step": 6590 }, { "epoch": 21.221864951768488, "grad_norm": 57.45591735839844, "learning_rate": 9.238828373187792e-06, "loss": 8.952, "step": 6600 }, { "epoch": 21.221864951768488, "eval_runtime": 5.384, "eval_samples_per_second": 9.287, "eval_steps_per_second": 1.3, "step": 6600 }, { "epoch": 21.2540192926045, "grad_norm": 69.79849243164062, "learning_rate": 9.236218441215692e-06, "loss": 8.4636, "step": 6610 }, { "epoch": 21.286173633440516, "grad_norm": 63.7364616394043, "learning_rate": 9.233604457259617e-06, "loss": 8.9091, "step": 6620 }, { "epoch": 21.318327974276528, "grad_norm": 54.77615737915039, "learning_rate": 9.23098642415447e-06, "loss": 8.8375, "step": 6630 }, { "epoch": 21.35048231511254, "grad_norm": 58.79292297363281, "learning_rate": 9.228364344739543e-06, "loss": 9.0165, "step": 6640 }, { "epoch": 21.382636655948552, "grad_norm": 53.08946990966797, "learning_rate": 9.225738221858526e-06, "loss": 8.9148, "step": 6650 }, { "epoch": 21.414790996784564, "grad_norm": 61.58588790893555, "learning_rate": 9.223108058359484e-06, "loss": 8.5333, "step": 6660 }, { "epoch": 21.44694533762058, "grad_norm": 59.364498138427734, "learning_rate": 9.220473857094872e-06, "loss": 8.9808, "step": 6670 }, { "epoch": 21.479099678456592, "grad_norm": 62.607452392578125, "learning_rate": 9.217835620921514e-06, "loss": 8.7105, "step": 6680 }, { "epoch": 21.511254019292604, "grad_norm": 56.75678634643555, "learning_rate": 9.21519335270062e-06, "loss": 8.8047, "step": 6690 }, { "epoch": 21.543408360128616, "grad_norm": 62.21288299560547, "learning_rate": 9.21254705529777e-06, "loss": 9.8135, "step": 6700 }, { "epoch": 21.543408360128616, "eval_runtime": 5.2949, "eval_samples_per_second": 9.443, "eval_steps_per_second": 1.322, "step": 6700 }, { "epoch": 21.575562700964632, "grad_norm": 59.72418975830078, "learning_rate": 9.20989673158291e-06, "loss": 8.8718, "step": 6710 }, { "epoch": 21.607717041800644, "grad_norm": 58.68205261230469, "learning_rate": 9.207242384430354e-06, "loss": 8.5931, "step": 6720 }, { "epoch": 21.639871382636656, "grad_norm": 53.73439407348633, "learning_rate": 9.204584016718783e-06, "loss": 9.277, "step": 6730 }, { "epoch": 21.672025723472668, "grad_norm": 52.59233856201172, "learning_rate": 9.201921631331238e-06, "loss": 8.9232, "step": 6740 }, { "epoch": 21.70418006430868, "grad_norm": 57.419822692871094, "learning_rate": 9.19925523115511e-06, "loss": 8.9797, "step": 6750 }, { "epoch": 21.736334405144696, "grad_norm": 62.657371520996094, "learning_rate": 9.196584819082155e-06, "loss": 8.9851, "step": 6760 }, { "epoch": 21.768488745980708, "grad_norm": 64.1988754272461, "learning_rate": 9.193910398008467e-06, "loss": 9.177, "step": 6770 }, { "epoch": 21.80064308681672, "grad_norm": 64.96138763427734, "learning_rate": 9.191231970834503e-06, "loss": 8.5449, "step": 6780 }, { "epoch": 21.832797427652732, "grad_norm": 54.36065673828125, "learning_rate": 9.188549540465051e-06, "loss": 9.1973, "step": 6790 }, { "epoch": 21.864951768488744, "grad_norm": 60.866031646728516, "learning_rate": 9.185863109809248e-06, "loss": 9.193, "step": 6800 }, { "epoch": 21.864951768488744, "eval_runtime": 5.3152, "eval_samples_per_second": 9.407, "eval_steps_per_second": 1.317, "step": 6800 }, { "epoch": 21.89710610932476, "grad_norm": 57.89910125732422, "learning_rate": 9.18317268178057e-06, "loss": 9.4843, "step": 6810 }, { "epoch": 21.929260450160772, "grad_norm": 55.017269134521484, "learning_rate": 9.180478259296824e-06, "loss": 8.8671, "step": 6820 }, { "epoch": 21.961414790996784, "grad_norm": 59.19805145263672, "learning_rate": 9.17777984528015e-06, "loss": 8.6977, "step": 6830 }, { "epoch": 21.993569131832796, "grad_norm": 64.24816131591797, "learning_rate": 9.175077442657016e-06, "loss": 9.0509, "step": 6840 }, { "epoch": 22.025723472668812, "grad_norm": 74.12789154052734, "learning_rate": 9.172371054358224e-06, "loss": 8.7719, "step": 6850 }, { "epoch": 22.057877813504824, "grad_norm": 55.18266677856445, "learning_rate": 9.169660683318885e-06, "loss": 9.077, "step": 6860 }, { "epoch": 22.090032154340836, "grad_norm": 63.42824172973633, "learning_rate": 9.16694633247844e-06, "loss": 9.0524, "step": 6870 }, { "epoch": 22.122186495176848, "grad_norm": 56.45744705200195, "learning_rate": 9.164228004780643e-06, "loss": 8.941, "step": 6880 }, { "epoch": 22.15434083601286, "grad_norm": 57.74501037597656, "learning_rate": 9.16150570317356e-06, "loss": 7.9124, "step": 6890 }, { "epoch": 22.186495176848876, "grad_norm": 53.75066375732422, "learning_rate": 9.158779430609565e-06, "loss": 8.0836, "step": 6900 }, { "epoch": 22.186495176848876, "eval_runtime": 5.2992, "eval_samples_per_second": 9.435, "eval_steps_per_second": 1.321, "step": 6900 }, { "epoch": 22.218649517684888, "grad_norm": 68.46334075927734, "learning_rate": 9.156049190045343e-06, "loss": 8.4346, "step": 6910 }, { "epoch": 22.2508038585209, "grad_norm": 60.11943435668945, "learning_rate": 9.153314984441879e-06, "loss": 9.0686, "step": 6920 }, { "epoch": 22.282958199356912, "grad_norm": 51.76887512207031, "learning_rate": 9.15057681676446e-06, "loss": 8.2197, "step": 6930 }, { "epoch": 22.315112540192928, "grad_norm": 61.723548889160156, "learning_rate": 9.147834689982668e-06, "loss": 8.6972, "step": 6940 }, { "epoch": 22.34726688102894, "grad_norm": 57.37504959106445, "learning_rate": 9.145088607070382e-06, "loss": 8.3775, "step": 6950 }, { "epoch": 22.379421221864952, "grad_norm": 65.61277770996094, "learning_rate": 9.142338571005768e-06, "loss": 8.1289, "step": 6960 }, { "epoch": 22.411575562700964, "grad_norm": 47.58235168457031, "learning_rate": 9.13958458477128e-06, "loss": 8.7548, "step": 6970 }, { "epoch": 22.443729903536976, "grad_norm": 60.261863708496094, "learning_rate": 9.136826651353659e-06, "loss": 8.1523, "step": 6980 }, { "epoch": 22.475884244372992, "grad_norm": 57.03665542602539, "learning_rate": 9.134064773743924e-06, "loss": 8.4339, "step": 6990 }, { "epoch": 22.508038585209004, "grad_norm": 66.92808532714844, "learning_rate": 9.131298954937369e-06, "loss": 8.4332, "step": 7000 }, { "epoch": 22.508038585209004, "eval_runtime": 5.3196, "eval_samples_per_second": 9.399, "eval_steps_per_second": 1.316, "step": 7000 }, { "epoch": 22.540192926045016, "grad_norm": 81.61766815185547, "learning_rate": 9.128529197933568e-06, "loss": 8.1676, "step": 7010 }, { "epoch": 22.572347266881028, "grad_norm": 65.74669647216797, "learning_rate": 9.125755505736361e-06, "loss": 8.1011, "step": 7020 }, { "epoch": 22.60450160771704, "grad_norm": 64.58338165283203, "learning_rate": 9.122977881353861e-06, "loss": 8.4663, "step": 7030 }, { "epoch": 22.636655948553056, "grad_norm": 55.91627883911133, "learning_rate": 9.120196327798442e-06, "loss": 8.586, "step": 7040 }, { "epoch": 22.668810289389068, "grad_norm": 60.77627182006836, "learning_rate": 9.117410848086738e-06, "loss": 8.1005, "step": 7050 }, { "epoch": 22.70096463022508, "grad_norm": 54.508480072021484, "learning_rate": 9.114621445239645e-06, "loss": 8.4063, "step": 7060 }, { "epoch": 22.733118971061092, "grad_norm": 69.63489532470703, "learning_rate": 9.111828122282311e-06, "loss": 8.5796, "step": 7070 }, { "epoch": 22.765273311897108, "grad_norm": 56.7982063293457, "learning_rate": 9.109030882244133e-06, "loss": 7.5517, "step": 7080 }, { "epoch": 22.79742765273312, "grad_norm": 61.081565856933594, "learning_rate": 9.106229728158763e-06, "loss": 8.2897, "step": 7090 }, { "epoch": 22.829581993569132, "grad_norm": 60.39103317260742, "learning_rate": 9.103424663064092e-06, "loss": 8.8123, "step": 7100 }, { "epoch": 22.829581993569132, "eval_runtime": 5.3517, "eval_samples_per_second": 9.343, "eval_steps_per_second": 1.308, "step": 7100 }, { "epoch": 22.861736334405144, "grad_norm": 65.78849029541016, "learning_rate": 9.100615690002253e-06, "loss": 8.261, "step": 7110 }, { "epoch": 22.893890675241156, "grad_norm": 58.6796875, "learning_rate": 9.097802812019622e-06, "loss": 8.2567, "step": 7120 }, { "epoch": 22.926045016077172, "grad_norm": 66.61534118652344, "learning_rate": 9.094986032166807e-06, "loss": 8.6677, "step": 7130 }, { "epoch": 22.958199356913184, "grad_norm": 61.790279388427734, "learning_rate": 9.092165353498643e-06, "loss": 8.504, "step": 7140 }, { "epoch": 22.990353697749196, "grad_norm": 55.522769927978516, "learning_rate": 9.0893407790742e-06, "loss": 8.5342, "step": 7150 }, { "epoch": 23.022508038585208, "grad_norm": 61.51953125, "learning_rate": 9.08651231195677e-06, "loss": 7.9812, "step": 7160 }, { "epoch": 23.05466237942122, "grad_norm": 59.70973587036133, "learning_rate": 9.08367995521387e-06, "loss": 7.6476, "step": 7170 }, { "epoch": 23.086816720257236, "grad_norm": 64.99278259277344, "learning_rate": 9.080843711917228e-06, "loss": 7.8628, "step": 7180 }, { "epoch": 23.118971061093248, "grad_norm": 57.341285705566406, "learning_rate": 9.078003585142796e-06, "loss": 7.9148, "step": 7190 }, { "epoch": 23.15112540192926, "grad_norm": 56.916412353515625, "learning_rate": 9.07515957797073e-06, "loss": 8.0547, "step": 7200 }, { "epoch": 23.15112540192926, "eval_runtime": 5.3194, "eval_samples_per_second": 9.4, "eval_steps_per_second": 1.316, "step": 7200 }, { "epoch": 23.183279742765272, "grad_norm": 58.17158126831055, "learning_rate": 9.072311693485401e-06, "loss": 7.6858, "step": 7210 }, { "epoch": 23.215434083601288, "grad_norm": 64.55870056152344, "learning_rate": 9.069459934775377e-06, "loss": 8.1269, "step": 7220 }, { "epoch": 23.2475884244373, "grad_norm": 66.45418548583984, "learning_rate": 9.066604304933438e-06, "loss": 8.2135, "step": 7230 }, { "epoch": 23.279742765273312, "grad_norm": 57.9680290222168, "learning_rate": 9.063744807056553e-06, "loss": 8.3325, "step": 7240 }, { "epoch": 23.311897106109324, "grad_norm": 54.45177459716797, "learning_rate": 9.060881444245892e-06, "loss": 8.1132, "step": 7250 }, { "epoch": 23.344051446945336, "grad_norm": 61.12494659423828, "learning_rate": 9.05801421960681e-06, "loss": 7.7703, "step": 7260 }, { "epoch": 23.376205787781352, "grad_norm": 56.391151428222656, "learning_rate": 9.055143136248858e-06, "loss": 8.0131, "step": 7270 }, { "epoch": 23.408360128617364, "grad_norm": 62.924720764160156, "learning_rate": 9.052268197285768e-06, "loss": 8.4918, "step": 7280 }, { "epoch": 23.440514469453376, "grad_norm": 56.40593338012695, "learning_rate": 9.049389405835452e-06, "loss": 7.7022, "step": 7290 }, { "epoch": 23.472668810289388, "grad_norm": 71.3387680053711, "learning_rate": 9.046506765020004e-06, "loss": 8.3636, "step": 7300 }, { "epoch": 23.472668810289388, "eval_runtime": 5.3015, "eval_samples_per_second": 9.431, "eval_steps_per_second": 1.32, "step": 7300 }, { "epoch": 23.5048231511254, "grad_norm": 64.80892944335938, "learning_rate": 9.043620277965689e-06, "loss": 7.8225, "step": 7310 }, { "epoch": 23.536977491961416, "grad_norm": 61.202632904052734, "learning_rate": 9.040729947802945e-06, "loss": 7.7196, "step": 7320 }, { "epoch": 23.569131832797428, "grad_norm": 61.75710678100586, "learning_rate": 9.037835777666376e-06, "loss": 8.4728, "step": 7330 }, { "epoch": 23.60128617363344, "grad_norm": 56.47146987915039, "learning_rate": 9.034937770694755e-06, "loss": 8.4825, "step": 7340 }, { "epoch": 23.633440514469452, "grad_norm": 60.002037048339844, "learning_rate": 9.032035930031012e-06, "loss": 7.9708, "step": 7350 }, { "epoch": 23.665594855305468, "grad_norm": 62.84184265136719, "learning_rate": 9.029130258822237e-06, "loss": 7.9067, "step": 7360 }, { "epoch": 23.69774919614148, "grad_norm": 55.42881393432617, "learning_rate": 9.026220760219671e-06, "loss": 7.4969, "step": 7370 }, { "epoch": 23.729903536977492, "grad_norm": 55.9797477722168, "learning_rate": 9.02330743737871e-06, "loss": 7.7235, "step": 7380 }, { "epoch": 23.762057877813504, "grad_norm": 51.53264236450195, "learning_rate": 9.020390293458895e-06, "loss": 8.009, "step": 7390 }, { "epoch": 23.794212218649516, "grad_norm": 57.072776794433594, "learning_rate": 9.017469331623912e-06, "loss": 8.2271, "step": 7400 }, { "epoch": 23.794212218649516, "eval_runtime": 5.3296, "eval_samples_per_second": 9.382, "eval_steps_per_second": 1.313, "step": 7400 }, { "epoch": 23.826366559485532, "grad_norm": 69.68956756591797, "learning_rate": 9.014544555041585e-06, "loss": 7.8191, "step": 7410 }, { "epoch": 23.858520900321544, "grad_norm": 59.79237365722656, "learning_rate": 9.01161596688388e-06, "loss": 7.7372, "step": 7420 }, { "epoch": 23.890675241157556, "grad_norm": 49.20173263549805, "learning_rate": 9.008683570326895e-06, "loss": 8.063, "step": 7430 }, { "epoch": 23.922829581993568, "grad_norm": 56.72202682495117, "learning_rate": 9.005747368550853e-06, "loss": 7.9286, "step": 7440 }, { "epoch": 23.954983922829584, "grad_norm": 62.62885284423828, "learning_rate": 9.002807364740112e-06, "loss": 8.3231, "step": 7450 }, { "epoch": 23.987138263665596, "grad_norm": 67.47502899169922, "learning_rate": 8.999863562083148e-06, "loss": 8.275, "step": 7460 }, { "epoch": 24.019292604501608, "grad_norm": 70.99124908447266, "learning_rate": 8.996915963772557e-06, "loss": 8.3191, "step": 7470 }, { "epoch": 24.05144694533762, "grad_norm": 57.50202941894531, "learning_rate": 8.993964573005055e-06, "loss": 7.2389, "step": 7480 }, { "epoch": 24.083601286173632, "grad_norm": 60.121585845947266, "learning_rate": 8.991009392981468e-06, "loss": 7.5521, "step": 7490 }, { "epoch": 24.115755627009648, "grad_norm": 68.00171661376953, "learning_rate": 8.988050426906733e-06, "loss": 8.1091, "step": 7500 }, { "epoch": 24.115755627009648, "eval_runtime": 5.2975, "eval_samples_per_second": 9.438, "eval_steps_per_second": 1.321, "step": 7500 }, { "epoch": 24.14790996784566, "grad_norm": 57.748008728027344, "learning_rate": 8.98508767798989e-06, "loss": 7.3311, "step": 7510 }, { "epoch": 24.180064308681672, "grad_norm": 66.20043182373047, "learning_rate": 8.982121149444087e-06, "loss": 7.7674, "step": 7520 }, { "epoch": 24.212218649517684, "grad_norm": 57.10810852050781, "learning_rate": 8.979150844486567e-06, "loss": 8.0656, "step": 7530 }, { "epoch": 24.244372990353696, "grad_norm": 59.05127716064453, "learning_rate": 8.976176766338668e-06, "loss": 7.1747, "step": 7540 }, { "epoch": 24.276527331189712, "grad_norm": 61.506526947021484, "learning_rate": 8.973198918225821e-06, "loss": 7.3948, "step": 7550 }, { "epoch": 24.308681672025724, "grad_norm": 60.9382209777832, "learning_rate": 8.97021730337755e-06, "loss": 7.136, "step": 7560 }, { "epoch": 24.340836012861736, "grad_norm": 61.987548828125, "learning_rate": 8.967231925027456e-06, "loss": 7.8493, "step": 7570 }, { "epoch": 24.372990353697748, "grad_norm": 62.725799560546875, "learning_rate": 8.964242786413228e-06, "loss": 7.5542, "step": 7580 }, { "epoch": 24.405144694533764, "grad_norm": 64.25658416748047, "learning_rate": 8.961249890776633e-06, "loss": 7.4208, "step": 7590 }, { "epoch": 24.437299035369776, "grad_norm": 60.61648941040039, "learning_rate": 8.958253241363505e-06, "loss": 7.6353, "step": 7600 }, { "epoch": 24.437299035369776, "eval_runtime": 5.3012, "eval_samples_per_second": 9.432, "eval_steps_per_second": 1.32, "step": 7600 }, { "epoch": 24.469453376205788, "grad_norm": 53.20369338989258, "learning_rate": 8.95525284142376e-06, "loss": 7.8987, "step": 7610 }, { "epoch": 24.5016077170418, "grad_norm": 62.43135452270508, "learning_rate": 8.952248694211375e-06, "loss": 7.5367, "step": 7620 }, { "epoch": 24.533762057877812, "grad_norm": 57.20931625366211, "learning_rate": 8.949240802984386e-06, "loss": 7.6914, "step": 7630 }, { "epoch": 24.565916398713828, "grad_norm": 55.226558685302734, "learning_rate": 8.946229171004904e-06, "loss": 7.2811, "step": 7640 }, { "epoch": 24.59807073954984, "grad_norm": 60.629093170166016, "learning_rate": 8.943213801539082e-06, "loss": 7.2937, "step": 7650 }, { "epoch": 24.630225080385852, "grad_norm": 67.9481201171875, "learning_rate": 8.940194697857133e-06, "loss": 7.4245, "step": 7660 }, { "epoch": 24.662379421221864, "grad_norm": 62.48249816894531, "learning_rate": 8.937171863233324e-06, "loss": 7.5844, "step": 7670 }, { "epoch": 24.694533762057876, "grad_norm": 78.08454132080078, "learning_rate": 8.93414530094596e-06, "loss": 8.0408, "step": 7680 }, { "epoch": 24.726688102893892, "grad_norm": 60.88642501831055, "learning_rate": 8.931115014277394e-06, "loss": 7.2282, "step": 7690 }, { "epoch": 24.758842443729904, "grad_norm": 53.79058837890625, "learning_rate": 8.928081006514015e-06, "loss": 7.3305, "step": 7700 }, { "epoch": 24.758842443729904, "eval_runtime": 5.2529, "eval_samples_per_second": 9.519, "eval_steps_per_second": 1.333, "step": 7700 }, { "epoch": 24.790996784565916, "grad_norm": 62.66322326660156, "learning_rate": 8.92504328094625e-06, "loss": 7.9897, "step": 7710 }, { "epoch": 24.823151125401928, "grad_norm": 62.55107116699219, "learning_rate": 8.922001840868558e-06, "loss": 7.5771, "step": 7720 }, { "epoch": 24.855305466237944, "grad_norm": 65.16651916503906, "learning_rate": 8.918956689579426e-06, "loss": 7.762, "step": 7730 }, { "epoch": 24.887459807073956, "grad_norm": 58.460044860839844, "learning_rate": 8.915907830381362e-06, "loss": 7.9156, "step": 7740 }, { "epoch": 24.919614147909968, "grad_norm": 56.16514587402344, "learning_rate": 8.912855266580902e-06, "loss": 6.8966, "step": 7750 }, { "epoch": 24.95176848874598, "grad_norm": 68.46279907226562, "learning_rate": 8.909799001488596e-06, "loss": 7.3429, "step": 7760 }, { "epoch": 24.983922829581992, "grad_norm": 66.40889739990234, "learning_rate": 8.906739038419006e-06, "loss": 7.4867, "step": 7770 }, { "epoch": 25.016077170418008, "grad_norm": 54.370201110839844, "learning_rate": 8.90367538069071e-06, "loss": 7.274, "step": 7780 }, { "epoch": 25.04823151125402, "grad_norm": 57.53028106689453, "learning_rate": 8.900608031626292e-06, "loss": 6.8108, "step": 7790 }, { "epoch": 25.080385852090032, "grad_norm": 59.551612854003906, "learning_rate": 8.89753699455233e-06, "loss": 6.7061, "step": 7800 }, { "epoch": 25.080385852090032, "eval_runtime": 5.317, "eval_samples_per_second": 9.404, "eval_steps_per_second": 1.317, "step": 7800 }, { "epoch": 25.112540192926044, "grad_norm": 54.75897979736328, "learning_rate": 8.894462272799414e-06, "loss": 7.5585, "step": 7810 }, { "epoch": 25.144694533762056, "grad_norm": 59.23293685913086, "learning_rate": 8.891383869702127e-06, "loss": 7.365, "step": 7820 }, { "epoch": 25.176848874598072, "grad_norm": 51.98605728149414, "learning_rate": 8.888301788599036e-06, "loss": 7.1062, "step": 7830 }, { "epoch": 25.209003215434084, "grad_norm": 65.81130981445312, "learning_rate": 8.885216032832711e-06, "loss": 7.2293, "step": 7840 }, { "epoch": 25.241157556270096, "grad_norm": 67.55733489990234, "learning_rate": 8.882126605749695e-06, "loss": 6.9482, "step": 7850 }, { "epoch": 25.273311897106108, "grad_norm": 60.69416809082031, "learning_rate": 8.879033510700517e-06, "loss": 7.1452, "step": 7860 }, { "epoch": 25.305466237942124, "grad_norm": 61.736873626708984, "learning_rate": 8.875936751039687e-06, "loss": 6.7744, "step": 7870 }, { "epoch": 25.337620578778136, "grad_norm": 56.67613220214844, "learning_rate": 8.872836330125683e-06, "loss": 7.3829, "step": 7880 }, { "epoch": 25.369774919614148, "grad_norm": 50.431800842285156, "learning_rate": 8.869732251320959e-06, "loss": 7.1119, "step": 7890 }, { "epoch": 25.40192926045016, "grad_norm": 61.97815704345703, "learning_rate": 8.866624517991935e-06, "loss": 7.6223, "step": 7900 }, { "epoch": 25.40192926045016, "eval_runtime": 5.257, "eval_samples_per_second": 9.511, "eval_steps_per_second": 1.332, "step": 7900 }, { "epoch": 25.434083601286172, "grad_norm": 63.44896697998047, "learning_rate": 8.863513133508992e-06, "loss": 7.5214, "step": 7910 }, { "epoch": 25.466237942122188, "grad_norm": 61.347991943359375, "learning_rate": 8.860398101246475e-06, "loss": 6.9295, "step": 7920 }, { "epoch": 25.4983922829582, "grad_norm": 58.75422286987305, "learning_rate": 8.857279424582678e-06, "loss": 7.297, "step": 7930 }, { "epoch": 25.530546623794212, "grad_norm": 59.36176300048828, "learning_rate": 8.854157106899857e-06, "loss": 7.5684, "step": 7940 }, { "epoch": 25.562700964630224, "grad_norm": 60.92229461669922, "learning_rate": 8.851031151584207e-06, "loss": 7.2044, "step": 7950 }, { "epoch": 25.59485530546624, "grad_norm": 54.66810607910156, "learning_rate": 8.847901562025876e-06, "loss": 7.3577, "step": 7960 }, { "epoch": 25.627009646302252, "grad_norm": 53.53245544433594, "learning_rate": 8.844768341618946e-06, "loss": 7.1454, "step": 7970 }, { "epoch": 25.659163987138264, "grad_norm": 80.19052124023438, "learning_rate": 8.841631493761446e-06, "loss": 6.8396, "step": 7980 }, { "epoch": 25.691318327974276, "grad_norm": 62.41783142089844, "learning_rate": 8.83849102185533e-06, "loss": 6.9515, "step": 7990 }, { "epoch": 25.723472668810288, "grad_norm": 76.13848876953125, "learning_rate": 8.835346929306488e-06, "loss": 7.2933, "step": 8000 }, { "epoch": 25.723472668810288, "eval_runtime": 5.3223, "eval_samples_per_second": 9.395, "eval_steps_per_second": 1.315, "step": 8000 }, { "epoch": 25.755627009646304, "grad_norm": 62.00101852416992, "learning_rate": 8.832199219524732e-06, "loss": 7.1342, "step": 8010 }, { "epoch": 25.787781350482316, "grad_norm": 56.926849365234375, "learning_rate": 8.829047895923802e-06, "loss": 7.0949, "step": 8020 }, { "epoch": 25.819935691318328, "grad_norm": 61.58393859863281, "learning_rate": 8.825892961921355e-06, "loss": 6.9091, "step": 8030 }, { "epoch": 25.85209003215434, "grad_norm": 53.677001953125, "learning_rate": 8.822734420938963e-06, "loss": 7.1641, "step": 8040 }, { "epoch": 25.884244372990352, "grad_norm": 56.999446868896484, "learning_rate": 8.81957227640211e-06, "loss": 6.9443, "step": 8050 }, { "epoch": 25.916398713826368, "grad_norm": 61.26164627075195, "learning_rate": 8.816406531740188e-06, "loss": 7.4479, "step": 8060 }, { "epoch": 25.94855305466238, "grad_norm": 61.08005142211914, "learning_rate": 8.813237190386495e-06, "loss": 7.1974, "step": 8070 }, { "epoch": 25.980707395498392, "grad_norm": 56.71426010131836, "learning_rate": 8.810064255778226e-06, "loss": 7.0536, "step": 8080 }, { "epoch": 26.012861736334404, "grad_norm": 68.89727020263672, "learning_rate": 8.806887731356478e-06, "loss": 6.888, "step": 8090 }, { "epoch": 26.04501607717042, "grad_norm": 60.45811462402344, "learning_rate": 8.803707620566238e-06, "loss": 6.9448, "step": 8100 }, { "epoch": 26.04501607717042, "eval_runtime": 5.2486, "eval_samples_per_second": 9.526, "eval_steps_per_second": 1.334, "step": 8100 }, { "epoch": 26.077170418006432, "grad_norm": 61.41531753540039, "learning_rate": 8.80052392685638e-06, "loss": 7.3765, "step": 8110 }, { "epoch": 26.109324758842444, "grad_norm": 60.405670166015625, "learning_rate": 8.79733665367967e-06, "loss": 7.1174, "step": 8120 }, { "epoch": 26.141479099678456, "grad_norm": 74.53112030029297, "learning_rate": 8.794145804492755e-06, "loss": 6.8419, "step": 8130 }, { "epoch": 26.173633440514468, "grad_norm": 62.855682373046875, "learning_rate": 8.790951382756152e-06, "loss": 7.0322, "step": 8140 }, { "epoch": 26.205787781350484, "grad_norm": 55.47507858276367, "learning_rate": 8.787753391934265e-06, "loss": 6.6151, "step": 8150 }, { "epoch": 26.237942122186496, "grad_norm": 53.67250442504883, "learning_rate": 8.784551835495355e-06, "loss": 6.7205, "step": 8160 }, { "epoch": 26.270096463022508, "grad_norm": 59.168155670166016, "learning_rate": 8.781346716911564e-06, "loss": 6.9673, "step": 8170 }, { "epoch": 26.30225080385852, "grad_norm": 57.25756072998047, "learning_rate": 8.778138039658884e-06, "loss": 7.1881, "step": 8180 }, { "epoch": 26.334405144694532, "grad_norm": 55.55506896972656, "learning_rate": 8.774925807217177e-06, "loss": 6.3286, "step": 8190 }, { "epoch": 26.366559485530548, "grad_norm": 54.58069610595703, "learning_rate": 8.771710023070155e-06, "loss": 6.7168, "step": 8200 }, { "epoch": 26.366559485530548, "eval_runtime": 5.2613, "eval_samples_per_second": 9.503, "eval_steps_per_second": 1.33, "step": 8200 }, { "epoch": 26.39871382636656, "grad_norm": 50.808128356933594, "learning_rate": 8.768490690705385e-06, "loss": 6.8051, "step": 8210 }, { "epoch": 26.430868167202572, "grad_norm": 55.814552307128906, "learning_rate": 8.765267813614279e-06, "loss": 6.6246, "step": 8220 }, { "epoch": 26.463022508038584, "grad_norm": 52.591522216796875, "learning_rate": 8.762041395292095e-06, "loss": 6.6424, "step": 8230 }, { "epoch": 26.4951768488746, "grad_norm": 60.21788787841797, "learning_rate": 8.75881143923793e-06, "loss": 6.7071, "step": 8240 }, { "epoch": 26.527331189710612, "grad_norm": 65.58866119384766, "learning_rate": 8.755577948954723e-06, "loss": 7.1173, "step": 8250 }, { "epoch": 26.559485530546624, "grad_norm": 59.43306350708008, "learning_rate": 8.752340927949234e-06, "loss": 6.4451, "step": 8260 }, { "epoch": 26.591639871382636, "grad_norm": 53.872779846191406, "learning_rate": 8.749100379732069e-06, "loss": 6.6692, "step": 8270 }, { "epoch": 26.623794212218648, "grad_norm": 64.21817016601562, "learning_rate": 8.745856307817645e-06, "loss": 6.9, "step": 8280 }, { "epoch": 26.655948553054664, "grad_norm": 58.702049255371094, "learning_rate": 8.742608715724204e-06, "loss": 7.2242, "step": 8290 }, { "epoch": 26.688102893890676, "grad_norm": 57.790008544921875, "learning_rate": 8.739357606973814e-06, "loss": 6.3024, "step": 8300 }, { "epoch": 26.688102893890676, "eval_runtime": 5.316, "eval_samples_per_second": 9.406, "eval_steps_per_second": 1.317, "step": 8300 }, { "epoch": 26.720257234726688, "grad_norm": 53.38924789428711, "learning_rate": 8.736102985092346e-06, "loss": 6.4699, "step": 8310 }, { "epoch": 26.7524115755627, "grad_norm": 61.68867111206055, "learning_rate": 8.732844853609486e-06, "loss": 6.8254, "step": 8320 }, { "epoch": 26.784565916398712, "grad_norm": 58.17760467529297, "learning_rate": 8.729583216058727e-06, "loss": 6.4675, "step": 8330 }, { "epoch": 26.816720257234728, "grad_norm": 58.13966751098633, "learning_rate": 8.72631807597736e-06, "loss": 6.7336, "step": 8340 }, { "epoch": 26.84887459807074, "grad_norm": 67.4608154296875, "learning_rate": 8.723049436906482e-06, "loss": 6.6143, "step": 8350 }, { "epoch": 26.881028938906752, "grad_norm": 59.398338317871094, "learning_rate": 8.719777302390977e-06, "loss": 6.8047, "step": 8360 }, { "epoch": 26.913183279742764, "grad_norm": 55.392425537109375, "learning_rate": 8.716501675979522e-06, "loss": 6.5321, "step": 8370 }, { "epoch": 26.94533762057878, "grad_norm": 53.98456954956055, "learning_rate": 8.713222561224586e-06, "loss": 6.7438, "step": 8380 }, { "epoch": 26.977491961414792, "grad_norm": 56.46892547607422, "learning_rate": 8.709939961682414e-06, "loss": 7.189, "step": 8390 }, { "epoch": 27.009646302250804, "grad_norm": 63.870994567871094, "learning_rate": 8.706653880913036e-06, "loss": 6.6603, "step": 8400 }, { "epoch": 27.009646302250804, "eval_runtime": 5.3355, "eval_samples_per_second": 9.371, "eval_steps_per_second": 1.312, "step": 8400 }, { "epoch": 27.041800643086816, "grad_norm": 57.07316970825195, "learning_rate": 8.703364322480253e-06, "loss": 6.5994, "step": 8410 }, { "epoch": 27.073954983922828, "grad_norm": 59.450653076171875, "learning_rate": 8.70007128995164e-06, "loss": 6.4431, "step": 8420 }, { "epoch": 27.106109324758844, "grad_norm": 54.65387725830078, "learning_rate": 8.696774786898541e-06, "loss": 7.0494, "step": 8430 }, { "epoch": 27.138263665594856, "grad_norm": 49.9312858581543, "learning_rate": 8.69347481689606e-06, "loss": 6.6273, "step": 8440 }, { "epoch": 27.170418006430868, "grad_norm": 53.50532913208008, "learning_rate": 8.690171383523065e-06, "loss": 6.4262, "step": 8450 }, { "epoch": 27.20257234726688, "grad_norm": 62.09320068359375, "learning_rate": 8.686864490362177e-06, "loss": 6.804, "step": 8460 }, { "epoch": 27.234726688102892, "grad_norm": 55.54867172241211, "learning_rate": 8.683554140999771e-06, "loss": 6.559, "step": 8470 }, { "epoch": 27.266881028938908, "grad_norm": 56.69111633300781, "learning_rate": 8.680240339025969e-06, "loss": 6.5146, "step": 8480 }, { "epoch": 27.29903536977492, "grad_norm": 58.27489471435547, "learning_rate": 8.676923088034637e-06, "loss": 6.333, "step": 8490 }, { "epoch": 27.331189710610932, "grad_norm": 63.73439025878906, "learning_rate": 8.673602391623388e-06, "loss": 6.5041, "step": 8500 }, { "epoch": 27.331189710610932, "eval_runtime": 5.2789, "eval_samples_per_second": 9.472, "eval_steps_per_second": 1.326, "step": 8500 }, { "epoch": 27.363344051446944, "grad_norm": 60.448822021484375, "learning_rate": 8.670278253393559e-06, "loss": 5.9606, "step": 8510 }, { "epoch": 27.39549839228296, "grad_norm": 56.507816314697266, "learning_rate": 8.66695067695023e-06, "loss": 6.5239, "step": 8520 }, { "epoch": 27.427652733118972, "grad_norm": 60.00230407714844, "learning_rate": 8.663619665902207e-06, "loss": 5.9417, "step": 8530 }, { "epoch": 27.459807073954984, "grad_norm": 68.22850036621094, "learning_rate": 8.660285223862022e-06, "loss": 6.7481, "step": 8540 }, { "epoch": 27.491961414790996, "grad_norm": 62.291229248046875, "learning_rate": 8.656947354445924e-06, "loss": 6.4039, "step": 8550 }, { "epoch": 27.524115755627008, "grad_norm": 59.224117279052734, "learning_rate": 8.653606061273883e-06, "loss": 6.5571, "step": 8560 }, { "epoch": 27.556270096463024, "grad_norm": 65.90287780761719, "learning_rate": 8.650261347969579e-06, "loss": 6.499, "step": 8570 }, { "epoch": 27.588424437299036, "grad_norm": 56.30282974243164, "learning_rate": 8.646913218160407e-06, "loss": 6.6796, "step": 8580 }, { "epoch": 27.620578778135048, "grad_norm": 51.96245193481445, "learning_rate": 8.643561675477457e-06, "loss": 6.2212, "step": 8590 }, { "epoch": 27.65273311897106, "grad_norm": 57.187530517578125, "learning_rate": 8.640206723555532e-06, "loss": 6.3037, "step": 8600 }, { "epoch": 27.65273311897106, "eval_runtime": 5.3127, "eval_samples_per_second": 9.411, "eval_steps_per_second": 1.318, "step": 8600 }, { "epoch": 27.684887459807072, "grad_norm": 53.94953536987305, "learning_rate": 8.636848366033124e-06, "loss": 6.2343, "step": 8610 }, { "epoch": 27.717041800643088, "grad_norm": 44.976783752441406, "learning_rate": 8.633486606552423e-06, "loss": 6.5426, "step": 8620 }, { "epoch": 27.7491961414791, "grad_norm": 58.54126739501953, "learning_rate": 8.630121448759304e-06, "loss": 6.737, "step": 8630 }, { "epoch": 27.781350482315112, "grad_norm": 54.23971176147461, "learning_rate": 8.62675289630333e-06, "loss": 6.3428, "step": 8640 }, { "epoch": 27.813504823151124, "grad_norm": 65.50804901123047, "learning_rate": 8.62338095283775e-06, "loss": 6.7331, "step": 8650 }, { "epoch": 27.84565916398714, "grad_norm": 56.34369659423828, "learning_rate": 8.620005622019484e-06, "loss": 6.8205, "step": 8660 }, { "epoch": 27.877813504823152, "grad_norm": 70.01112365722656, "learning_rate": 8.616626907509127e-06, "loss": 6.7424, "step": 8670 }, { "epoch": 27.909967845659164, "grad_norm": 56.44263458251953, "learning_rate": 8.613244812970947e-06, "loss": 6.7214, "step": 8680 }, { "epoch": 27.942122186495176, "grad_norm": 62.97362518310547, "learning_rate": 8.609859342072873e-06, "loss": 6.653, "step": 8690 }, { "epoch": 27.974276527331188, "grad_norm": 59.825050354003906, "learning_rate": 8.606470498486498e-06, "loss": 6.2512, "step": 8700 }, { "epoch": 27.974276527331188, "eval_runtime": 5.3098, "eval_samples_per_second": 9.416, "eval_steps_per_second": 1.318, "step": 8700 }, { "epoch": 28.006430868167204, "grad_norm": 57.50807189941406, "learning_rate": 8.603078285887076e-06, "loss": 6.0714, "step": 8710 }, { "epoch": 28.038585209003216, "grad_norm": 55.356170654296875, "learning_rate": 8.59968270795351e-06, "loss": 6.0901, "step": 8720 }, { "epoch": 28.070739549839228, "grad_norm": 70.37511444091797, "learning_rate": 8.596283768368352e-06, "loss": 5.9716, "step": 8730 }, { "epoch": 28.10289389067524, "grad_norm": 52.252445220947266, "learning_rate": 8.592881470817804e-06, "loss": 5.8955, "step": 8740 }, { "epoch": 28.135048231511256, "grad_norm": 64.24617004394531, "learning_rate": 8.589475818991708e-06, "loss": 6.2642, "step": 8750 }, { "epoch": 28.167202572347268, "grad_norm": 60.256141662597656, "learning_rate": 8.586066816583544e-06, "loss": 6.1839, "step": 8760 }, { "epoch": 28.19935691318328, "grad_norm": 52.69380569458008, "learning_rate": 8.582654467290426e-06, "loss": 6.6624, "step": 8770 }, { "epoch": 28.231511254019292, "grad_norm": 59.75746536254883, "learning_rate": 8.579238774813093e-06, "loss": 6.2884, "step": 8780 }, { "epoch": 28.263665594855304, "grad_norm": 53.61122131347656, "learning_rate": 8.575819742855918e-06, "loss": 5.9441, "step": 8790 }, { "epoch": 28.29581993569132, "grad_norm": 57.29293441772461, "learning_rate": 8.572397375126892e-06, "loss": 5.9662, "step": 8800 }, { "epoch": 28.29581993569132, "eval_runtime": 5.3442, "eval_samples_per_second": 9.356, "eval_steps_per_second": 1.31, "step": 8800 }, { "epoch": 28.327974276527332, "grad_norm": 61.590003967285156, "learning_rate": 8.568971675337618e-06, "loss": 6.2483, "step": 8810 }, { "epoch": 28.360128617363344, "grad_norm": 53.30902099609375, "learning_rate": 8.565542647203324e-06, "loss": 6.2688, "step": 8820 }, { "epoch": 28.392282958199356, "grad_norm": 60.101837158203125, "learning_rate": 8.56211029444284e-06, "loss": 5.9732, "step": 8830 }, { "epoch": 28.424437299035368, "grad_norm": 61.60184097290039, "learning_rate": 8.558674620778601e-06, "loss": 6.1512, "step": 8840 }, { "epoch": 28.456591639871384, "grad_norm": 62.19864273071289, "learning_rate": 8.555235629936646e-06, "loss": 6.5352, "step": 8850 }, { "epoch": 28.488745980707396, "grad_norm": 58.276641845703125, "learning_rate": 8.551793325646612e-06, "loss": 6.5829, "step": 8860 }, { "epoch": 28.520900321543408, "grad_norm": 57.971092224121094, "learning_rate": 8.54834771164173e-06, "loss": 6.2775, "step": 8870 }, { "epoch": 28.55305466237942, "grad_norm": 59.613433837890625, "learning_rate": 8.544898791658819e-06, "loss": 6.798, "step": 8880 }, { "epoch": 28.585209003215436, "grad_norm": 60.403770446777344, "learning_rate": 8.54144656943828e-06, "loss": 6.1933, "step": 8890 }, { "epoch": 28.617363344051448, "grad_norm": 73.9917984008789, "learning_rate": 8.537991048724104e-06, "loss": 6.7451, "step": 8900 }, { "epoch": 28.617363344051448, "eval_runtime": 5.4144, "eval_samples_per_second": 9.235, "eval_steps_per_second": 1.293, "step": 8900 }, { "epoch": 28.64951768488746, "grad_norm": 53.785335540771484, "learning_rate": 8.53453223326385e-06, "loss": 5.8769, "step": 8910 }, { "epoch": 28.681672025723472, "grad_norm": 57.2869987487793, "learning_rate": 8.531070126808657e-06, "loss": 6.2429, "step": 8920 }, { "epoch": 28.713826366559484, "grad_norm": 67.60469818115234, "learning_rate": 8.52760473311323e-06, "loss": 6.5563, "step": 8930 }, { "epoch": 28.7459807073955, "grad_norm": 55.314884185791016, "learning_rate": 8.524136055935839e-06, "loss": 6.4451, "step": 8940 }, { "epoch": 28.778135048231512, "grad_norm": 57.34684371948242, "learning_rate": 8.520664099038314e-06, "loss": 6.3058, "step": 8950 }, { "epoch": 28.810289389067524, "grad_norm": 57.30896759033203, "learning_rate": 8.517188866186047e-06, "loss": 6.2964, "step": 8960 }, { "epoch": 28.842443729903536, "grad_norm": 60.07665252685547, "learning_rate": 8.513710361147976e-06, "loss": 6.1843, "step": 8970 }, { "epoch": 28.874598070739548, "grad_norm": 60.92975997924805, "learning_rate": 8.510228587696592e-06, "loss": 6.2749, "step": 8980 }, { "epoch": 28.906752411575564, "grad_norm": 57.638671875, "learning_rate": 8.506743549607931e-06, "loss": 6.126, "step": 8990 }, { "epoch": 28.938906752411576, "grad_norm": 57.3425178527832, "learning_rate": 8.503255250661568e-06, "loss": 5.7034, "step": 9000 }, { "epoch": 28.938906752411576, "eval_runtime": 5.3089, "eval_samples_per_second": 9.418, "eval_steps_per_second": 1.319, "step": 9000 }, { "epoch": 28.971061093247588, "grad_norm": 63.44777297973633, "learning_rate": 8.499763694640612e-06, "loss": 5.8814, "step": 9010 }, { "epoch": 29.0032154340836, "grad_norm": 66.48420715332031, "learning_rate": 8.496268885331709e-06, "loss": 6.4875, "step": 9020 }, { "epoch": 29.035369774919616, "grad_norm": 52.97175598144531, "learning_rate": 8.492770826525033e-06, "loss": 5.9942, "step": 9030 }, { "epoch": 29.067524115755628, "grad_norm": 62.576812744140625, "learning_rate": 8.489269522014276e-06, "loss": 5.9874, "step": 9040 }, { "epoch": 29.09967845659164, "grad_norm": 66.70442199707031, "learning_rate": 8.485764975596657e-06, "loss": 6.1161, "step": 9050 }, { "epoch": 29.131832797427652, "grad_norm": 70.01300811767578, "learning_rate": 8.482257191072909e-06, "loss": 5.8529, "step": 9060 }, { "epoch": 29.163987138263664, "grad_norm": 50.35975646972656, "learning_rate": 8.478746172247273e-06, "loss": 6.1681, "step": 9070 }, { "epoch": 29.19614147909968, "grad_norm": 61.9415397644043, "learning_rate": 8.475231922927505e-06, "loss": 5.868, "step": 9080 }, { "epoch": 29.228295819935692, "grad_norm": 52.75117492675781, "learning_rate": 8.471714446924856e-06, "loss": 5.7467, "step": 9090 }, { "epoch": 29.260450160771704, "grad_norm": 58.306583404541016, "learning_rate": 8.468193748054083e-06, "loss": 5.9785, "step": 9100 }, { "epoch": 29.260450160771704, "eval_runtime": 5.2817, "eval_samples_per_second": 9.467, "eval_steps_per_second": 1.325, "step": 9100 }, { "epoch": 29.292604501607716, "grad_norm": 60.78150939941406, "learning_rate": 8.464669830133434e-06, "loss": 6.1839, "step": 9110 }, { "epoch": 29.324758842443728, "grad_norm": 59.02619171142578, "learning_rate": 8.461142696984653e-06, "loss": 6.1791, "step": 9120 }, { "epoch": 29.356913183279744, "grad_norm": 56.96232223510742, "learning_rate": 8.457612352432967e-06, "loss": 5.8571, "step": 9130 }, { "epoch": 29.389067524115756, "grad_norm": 55.31610870361328, "learning_rate": 8.454078800307084e-06, "loss": 5.9016, "step": 9140 }, { "epoch": 29.421221864951768, "grad_norm": 56.29403305053711, "learning_rate": 8.450542044439195e-06, "loss": 6.1364, "step": 9150 }, { "epoch": 29.45337620578778, "grad_norm": 63.82276916503906, "learning_rate": 8.447002088664964e-06, "loss": 5.9842, "step": 9160 }, { "epoch": 29.485530546623796, "grad_norm": 58.416072845458984, "learning_rate": 8.443458936823525e-06, "loss": 5.9291, "step": 9170 }, { "epoch": 29.517684887459808, "grad_norm": 48.564701080322266, "learning_rate": 8.439912592757478e-06, "loss": 5.6367, "step": 9180 }, { "epoch": 29.54983922829582, "grad_norm": 60.03151321411133, "learning_rate": 8.436363060312888e-06, "loss": 6.7158, "step": 9190 }, { "epoch": 29.581993569131832, "grad_norm": 59.20677947998047, "learning_rate": 8.432810343339273e-06, "loss": 6.2729, "step": 9200 }, { "epoch": 29.581993569131832, "eval_runtime": 5.2745, "eval_samples_per_second": 9.479, "eval_steps_per_second": 1.327, "step": 9200 }, { "epoch": 29.614147909967844, "grad_norm": 63.340057373046875, "learning_rate": 8.429254445689606e-06, "loss": 5.9133, "step": 9210 }, { "epoch": 29.64630225080386, "grad_norm": 49.93601989746094, "learning_rate": 8.425695371220311e-06, "loss": 6.0413, "step": 9220 }, { "epoch": 29.678456591639872, "grad_norm": 60.493690490722656, "learning_rate": 8.42213312379126e-06, "loss": 5.7182, "step": 9230 }, { "epoch": 29.710610932475884, "grad_norm": 48.0650520324707, "learning_rate": 8.418567707265757e-06, "loss": 5.769, "step": 9240 }, { "epoch": 29.742765273311896, "grad_norm": 68.01608276367188, "learning_rate": 8.414999125510554e-06, "loss": 5.7324, "step": 9250 }, { "epoch": 29.77491961414791, "grad_norm": 65.94218444824219, "learning_rate": 8.411427382395829e-06, "loss": 5.705, "step": 9260 }, { "epoch": 29.807073954983924, "grad_norm": 59.62049865722656, "learning_rate": 8.407852481795188e-06, "loss": 6.1045, "step": 9270 }, { "epoch": 29.839228295819936, "grad_norm": 55.38057327270508, "learning_rate": 8.404274427585666e-06, "loss": 5.5096, "step": 9280 }, { "epoch": 29.871382636655948, "grad_norm": 64.40621185302734, "learning_rate": 8.400693223647713e-06, "loss": 5.9226, "step": 9290 }, { "epoch": 29.90353697749196, "grad_norm": 52.644493103027344, "learning_rate": 8.3971088738652e-06, "loss": 5.5369, "step": 9300 }, { "epoch": 29.90353697749196, "eval_runtime": 5.345, "eval_samples_per_second": 9.355, "eval_steps_per_second": 1.31, "step": 9300 }, { "epoch": 29.935691318327976, "grad_norm": 66.84420013427734, "learning_rate": 8.393521382125405e-06, "loss": 5.973, "step": 9310 }, { "epoch": 29.967845659163988, "grad_norm": 47.91856384277344, "learning_rate": 8.38993075231902e-06, "loss": 6.0168, "step": 9320 }, { "epoch": 30.0, "grad_norm": 82.2280044555664, "learning_rate": 8.38633698834013e-06, "loss": 6.1188, "step": 9330 }, { "epoch": 30.032154340836012, "grad_norm": 61.454593658447266, "learning_rate": 8.382740094086229e-06, "loss": 6.3123, "step": 9340 }, { "epoch": 30.064308681672024, "grad_norm": 54.600379943847656, "learning_rate": 8.379140073458199e-06, "loss": 5.4606, "step": 9350 }, { "epoch": 30.09646302250804, "grad_norm": 65.22834777832031, "learning_rate": 8.37553693036032e-06, "loss": 5.4302, "step": 9360 }, { "epoch": 30.128617363344052, "grad_norm": 51.56727600097656, "learning_rate": 8.371930668700249e-06, "loss": 5.6703, "step": 9370 }, { "epoch": 30.160771704180064, "grad_norm": 54.137393951416016, "learning_rate": 8.368321292389031e-06, "loss": 6.2352, "step": 9380 }, { "epoch": 30.192926045016076, "grad_norm": 57.31708908081055, "learning_rate": 8.36470880534109e-06, "loss": 5.7905, "step": 9390 }, { "epoch": 30.225080385852092, "grad_norm": 64.18864440917969, "learning_rate": 8.36109321147422e-06, "loss": 5.3087, "step": 9400 }, { "epoch": 30.225080385852092, "eval_runtime": 5.2578, "eval_samples_per_second": 9.51, "eval_steps_per_second": 1.331, "step": 9400 }, { "epoch": 30.257234726688104, "grad_norm": 48.637386322021484, "learning_rate": 8.357474514709585e-06, "loss": 6.4229, "step": 9410 }, { "epoch": 30.289389067524116, "grad_norm": 61.583038330078125, "learning_rate": 8.353852718971717e-06, "loss": 5.8695, "step": 9420 }, { "epoch": 30.321543408360128, "grad_norm": 50.925392150878906, "learning_rate": 8.350227828188504e-06, "loss": 5.7595, "step": 9430 }, { "epoch": 30.35369774919614, "grad_norm": 62.2911491394043, "learning_rate": 8.346599846291197e-06, "loss": 5.9597, "step": 9440 }, { "epoch": 30.385852090032156, "grad_norm": 55.542396545410156, "learning_rate": 8.342968777214393e-06, "loss": 5.9649, "step": 9450 }, { "epoch": 30.418006430868168, "grad_norm": 51.78948211669922, "learning_rate": 8.339334624896042e-06, "loss": 6.3714, "step": 9460 }, { "epoch": 30.45016077170418, "grad_norm": 54.41759490966797, "learning_rate": 8.335697393277435e-06, "loss": 6.0636, "step": 9470 }, { "epoch": 30.482315112540192, "grad_norm": 52.61174392700195, "learning_rate": 8.332057086303203e-06, "loss": 5.5828, "step": 9480 }, { "epoch": 30.514469453376204, "grad_norm": 49.92802047729492, "learning_rate": 8.328413707921311e-06, "loss": 5.8641, "step": 9490 }, { "epoch": 30.54662379421222, "grad_norm": 45.2222785949707, "learning_rate": 8.32476726208306e-06, "loss": 5.3849, "step": 9500 }, { "epoch": 30.54662379421222, "eval_runtime": 5.269, "eval_samples_per_second": 9.489, "eval_steps_per_second": 1.329, "step": 9500 }, { "epoch": 30.578778135048232, "grad_norm": 63.59880065917969, "learning_rate": 8.321117752743071e-06, "loss": 5.6547, "step": 9510 }, { "epoch": 30.610932475884244, "grad_norm": 48.55288314819336, "learning_rate": 8.317465183859293e-06, "loss": 5.3541, "step": 9520 }, { "epoch": 30.643086816720256, "grad_norm": 59.16263961791992, "learning_rate": 8.31380955939299e-06, "loss": 5.6051, "step": 9530 }, { "epoch": 30.675241157556272, "grad_norm": 55.41622543334961, "learning_rate": 8.310150883308741e-06, "loss": 5.3685, "step": 9540 }, { "epoch": 30.707395498392284, "grad_norm": 54.2862434387207, "learning_rate": 8.306489159574435e-06, "loss": 6.094, "step": 9550 }, { "epoch": 30.739549839228296, "grad_norm": 55.1435661315918, "learning_rate": 8.302824392161262e-06, "loss": 5.1891, "step": 9560 }, { "epoch": 30.771704180064308, "grad_norm": 50.40238571166992, "learning_rate": 8.29915658504372e-06, "loss": 5.8208, "step": 9570 }, { "epoch": 30.80385852090032, "grad_norm": 55.14621353149414, "learning_rate": 8.295485742199597e-06, "loss": 5.225, "step": 9580 }, { "epoch": 30.836012861736336, "grad_norm": 62.09653091430664, "learning_rate": 8.291811867609977e-06, "loss": 5.5442, "step": 9590 }, { "epoch": 30.868167202572348, "grad_norm": 61.922157287597656, "learning_rate": 8.288134965259234e-06, "loss": 5.7513, "step": 9600 }, { "epoch": 30.868167202572348, "eval_runtime": 5.2946, "eval_samples_per_second": 9.444, "eval_steps_per_second": 1.322, "step": 9600 }, { "epoch": 30.90032154340836, "grad_norm": 56.934730529785156, "learning_rate": 8.284455039135018e-06, "loss": 5.6947, "step": 9610 }, { "epoch": 30.932475884244372, "grad_norm": 60.78661346435547, "learning_rate": 8.280772093228267e-06, "loss": 5.8975, "step": 9620 }, { "epoch": 30.964630225080384, "grad_norm": 54.31464767456055, "learning_rate": 8.277086131533187e-06, "loss": 5.7664, "step": 9630 }, { "epoch": 30.9967845659164, "grad_norm": 56.35603332519531, "learning_rate": 8.273397158047262e-06, "loss": 5.5491, "step": 9640 }, { "epoch": 31.028938906752412, "grad_norm": 50.3165283203125, "learning_rate": 8.269705176771234e-06, "loss": 5.6563, "step": 9650 }, { "epoch": 31.061093247588424, "grad_norm": 58.48154830932617, "learning_rate": 8.266010191709113e-06, "loss": 5.4345, "step": 9660 }, { "epoch": 31.093247588424436, "grad_norm": 54.106971740722656, "learning_rate": 8.262312206868167e-06, "loss": 5.2967, "step": 9670 }, { "epoch": 31.125401929260452, "grad_norm": 51.608463287353516, "learning_rate": 8.25861122625891e-06, "loss": 5.5898, "step": 9680 }, { "epoch": 31.157556270096464, "grad_norm": 57.952877044677734, "learning_rate": 8.254907253895111e-06, "loss": 5.6342, "step": 9690 }, { "epoch": 31.189710610932476, "grad_norm": 57.91488265991211, "learning_rate": 8.251200293793786e-06, "loss": 5.4462, "step": 9700 }, { "epoch": 31.189710610932476, "eval_runtime": 5.3412, "eval_samples_per_second": 9.361, "eval_steps_per_second": 1.311, "step": 9700 }, { "epoch": 31.221864951768488, "grad_norm": 49.05908966064453, "learning_rate": 8.247490349975185e-06, "loss": 5.587, "step": 9710 }, { "epoch": 31.2540192926045, "grad_norm": 56.946937561035156, "learning_rate": 8.2437774264628e-06, "loss": 5.1922, "step": 9720 }, { "epoch": 31.286173633440516, "grad_norm": 59.365753173828125, "learning_rate": 8.240061527283349e-06, "loss": 5.8585, "step": 9730 }, { "epoch": 31.318327974276528, "grad_norm": 54.20245361328125, "learning_rate": 8.236342656466778e-06, "loss": 4.8492, "step": 9740 }, { "epoch": 31.35048231511254, "grad_norm": 61.62698745727539, "learning_rate": 8.23262081804626e-06, "loss": 5.8673, "step": 9750 }, { "epoch": 31.382636655948552, "grad_norm": 48.78109359741211, "learning_rate": 8.228896016058182e-06, "loss": 5.3508, "step": 9760 }, { "epoch": 31.414790996784564, "grad_norm": 60.31678771972656, "learning_rate": 8.225168254542147e-06, "loss": 5.2679, "step": 9770 }, { "epoch": 31.44694533762058, "grad_norm": 61.093360900878906, "learning_rate": 8.221437537540968e-06, "loss": 5.5891, "step": 9780 }, { "epoch": 31.479099678456592, "grad_norm": 57.764503479003906, "learning_rate": 8.21770386910066e-06, "loss": 5.247, "step": 9790 }, { "epoch": 31.511254019292604, "grad_norm": 61.2574348449707, "learning_rate": 8.213967253270447e-06, "loss": 4.9063, "step": 9800 }, { "epoch": 31.511254019292604, "eval_runtime": 5.3147, "eval_samples_per_second": 9.408, "eval_steps_per_second": 1.317, "step": 9800 }, { "epoch": 31.543408360128616, "grad_norm": 43.801368713378906, "learning_rate": 8.210227694102736e-06, "loss": 5.7125, "step": 9810 }, { "epoch": 31.575562700964632, "grad_norm": 57.50799560546875, "learning_rate": 8.206485195653143e-06, "loss": 5.4489, "step": 9820 }, { "epoch": 31.607717041800644, "grad_norm": 54.914920806884766, "learning_rate": 8.202739761980455e-06, "loss": 5.3613, "step": 9830 }, { "epoch": 31.639871382636656, "grad_norm": 56.83713150024414, "learning_rate": 8.198991397146655e-06, "loss": 5.5155, "step": 9840 }, { "epoch": 31.672025723472668, "grad_norm": 57.15107727050781, "learning_rate": 8.195240105216898e-06, "loss": 5.7797, "step": 9850 }, { "epoch": 31.70418006430868, "grad_norm": 50.33863830566406, "learning_rate": 8.191485890259514e-06, "loss": 5.4884, "step": 9860 }, { "epoch": 31.736334405144696, "grad_norm": 63.25849914550781, "learning_rate": 8.187728756346008e-06, "loss": 5.4765, "step": 9870 }, { "epoch": 31.768488745980708, "grad_norm": 59.00205993652344, "learning_rate": 8.183968707551043e-06, "loss": 5.504, "step": 9880 }, { "epoch": 31.80064308681672, "grad_norm": 58.43321228027344, "learning_rate": 8.180205747952451e-06, "loss": 5.4785, "step": 9890 }, { "epoch": 31.832797427652732, "grad_norm": 53.209693908691406, "learning_rate": 8.176439881631214e-06, "loss": 5.0682, "step": 9900 }, { "epoch": 31.832797427652732, "eval_runtime": 5.3565, "eval_samples_per_second": 9.335, "eval_steps_per_second": 1.307, "step": 9900 }, { "epoch": 31.864951768488744, "grad_norm": 49.74534225463867, "learning_rate": 8.172671112671473e-06, "loss": 5.3081, "step": 9910 }, { "epoch": 31.89710610932476, "grad_norm": 53.98598861694336, "learning_rate": 8.168899445160511e-06, "loss": 5.4792, "step": 9920 }, { "epoch": 31.929260450160772, "grad_norm": 51.88299560546875, "learning_rate": 8.165124883188757e-06, "loss": 5.8476, "step": 9930 }, { "epoch": 31.961414790996784, "grad_norm": 62.53742599487305, "learning_rate": 8.161347430849781e-06, "loss": 5.3401, "step": 9940 }, { "epoch": 31.993569131832796, "grad_norm": 59.32769775390625, "learning_rate": 8.157567092240285e-06, "loss": 5.8952, "step": 9950 }, { "epoch": 32.02572347266881, "grad_norm": 51.2103271484375, "learning_rate": 8.153783871460098e-06, "loss": 5.535, "step": 9960 }, { "epoch": 32.057877813504824, "grad_norm": 54.30561065673828, "learning_rate": 8.149997772612185e-06, "loss": 5.1334, "step": 9970 }, { "epoch": 32.09003215434084, "grad_norm": 54.58060073852539, "learning_rate": 8.146208799802626e-06, "loss": 5.6715, "step": 9980 }, { "epoch": 32.12218649517685, "grad_norm": 56.270145416259766, "learning_rate": 8.14241695714061e-06, "loss": 4.9399, "step": 9990 }, { "epoch": 32.154340836012864, "grad_norm": 52.211910247802734, "learning_rate": 8.138622248738452e-06, "loss": 5.0688, "step": 10000 }, { "epoch": 32.154340836012864, "eval_runtime": 5.3163, "eval_samples_per_second": 9.405, "eval_steps_per_second": 1.317, "step": 10000 }, { "epoch": 32.18649517684887, "grad_norm": 43.96849822998047, "learning_rate": 8.13482467871157e-06, "loss": 5.4072, "step": 10010 }, { "epoch": 32.21864951768489, "grad_norm": 66.8050765991211, "learning_rate": 8.131024251178484e-06, "loss": 5.2188, "step": 10020 }, { "epoch": 32.250803858520904, "grad_norm": 52.71855163574219, "learning_rate": 8.127220970260812e-06, "loss": 5.8324, "step": 10030 }, { "epoch": 32.28295819935691, "grad_norm": 63.28131103515625, "learning_rate": 8.12341484008327e-06, "loss": 5.6411, "step": 10040 }, { "epoch": 32.31511254019293, "grad_norm": 48.359554290771484, "learning_rate": 8.119605864773659e-06, "loss": 5.2204, "step": 10050 }, { "epoch": 32.347266881028936, "grad_norm": 67.4935531616211, "learning_rate": 8.115794048462871e-06, "loss": 5.1148, "step": 10060 }, { "epoch": 32.37942122186495, "grad_norm": 57.380828857421875, "learning_rate": 8.11197939528488e-06, "loss": 5.3543, "step": 10070 }, { "epoch": 32.41157556270097, "grad_norm": 59.74724197387695, "learning_rate": 8.10816190937673e-06, "loss": 5.0451, "step": 10080 }, { "epoch": 32.443729903536976, "grad_norm": 56.71868133544922, "learning_rate": 8.10434159487854e-06, "loss": 5.6177, "step": 10090 }, { "epoch": 32.47588424437299, "grad_norm": 48.64152145385742, "learning_rate": 8.100518455933501e-06, "loss": 4.9782, "step": 10100 }, { "epoch": 32.47588424437299, "eval_runtime": 5.3151, "eval_samples_per_second": 9.407, "eval_steps_per_second": 1.317, "step": 10100 }, { "epoch": 32.508038585209, "grad_norm": 57.26563262939453, "learning_rate": 8.09669249668786e-06, "loss": 5.5677, "step": 10110 }, { "epoch": 32.540192926045016, "grad_norm": 71.45044708251953, "learning_rate": 8.092863721290927e-06, "loss": 5.8921, "step": 10120 }, { "epoch": 32.57234726688103, "grad_norm": 63.97669982910156, "learning_rate": 8.08903213389507e-06, "loss": 5.0455, "step": 10130 }, { "epoch": 32.60450160771704, "grad_norm": 62.54376220703125, "learning_rate": 8.085197738655695e-06, "loss": 5.2044, "step": 10140 }, { "epoch": 32.636655948553056, "grad_norm": 64.35868072509766, "learning_rate": 8.081360539731263e-06, "loss": 5.6532, "step": 10150 }, { "epoch": 32.668810289389064, "grad_norm": 49.1938591003418, "learning_rate": 8.077520541283274e-06, "loss": 5.0363, "step": 10160 }, { "epoch": 32.70096463022508, "grad_norm": 45.296573638916016, "learning_rate": 8.073677747476261e-06, "loss": 5.1168, "step": 10170 }, { "epoch": 32.733118971061096, "grad_norm": 53.764652252197266, "learning_rate": 8.069832162477792e-06, "loss": 4.9258, "step": 10180 }, { "epoch": 32.765273311897104, "grad_norm": 56.458984375, "learning_rate": 8.065983790458462e-06, "loss": 5.2199, "step": 10190 }, { "epoch": 32.79742765273312, "grad_norm": 57.29536819458008, "learning_rate": 8.062132635591883e-06, "loss": 5.7302, "step": 10200 }, { "epoch": 32.79742765273312, "eval_runtime": 5.3126, "eval_samples_per_second": 9.412, "eval_steps_per_second": 1.318, "step": 10200 }, { "epoch": 32.82958199356913, "grad_norm": 59.477500915527344, "learning_rate": 8.058278702054693e-06, "loss": 5.0898, "step": 10210 }, { "epoch": 32.861736334405144, "grad_norm": 55.95297622680664, "learning_rate": 8.054421994026536e-06, "loss": 5.3344, "step": 10220 }, { "epoch": 32.89389067524116, "grad_norm": 51.814815521240234, "learning_rate": 8.050562515690074e-06, "loss": 5.3433, "step": 10230 }, { "epoch": 32.92604501607717, "grad_norm": 56.102012634277344, "learning_rate": 8.046700271230963e-06, "loss": 5.324, "step": 10240 }, { "epoch": 32.958199356913184, "grad_norm": 48.60362243652344, "learning_rate": 8.042835264837866e-06, "loss": 4.9038, "step": 10250 }, { "epoch": 32.9903536977492, "grad_norm": 52.753501892089844, "learning_rate": 8.038967500702441e-06, "loss": 5.3868, "step": 10260 }, { "epoch": 33.02250803858521, "grad_norm": 54.95564270019531, "learning_rate": 8.035096983019333e-06, "loss": 5.048, "step": 10270 }, { "epoch": 33.054662379421224, "grad_norm": 49.95817565917969, "learning_rate": 8.031223715986174e-06, "loss": 4.9152, "step": 10280 }, { "epoch": 33.08681672025723, "grad_norm": 52.70745086669922, "learning_rate": 8.027347703803586e-06, "loss": 5.0711, "step": 10290 }, { "epoch": 33.11897106109325, "grad_norm": 68.78199005126953, "learning_rate": 8.023468950675154e-06, "loss": 5.1001, "step": 10300 }, { "epoch": 33.11897106109325, "eval_runtime": 5.2712, "eval_samples_per_second": 9.486, "eval_steps_per_second": 1.328, "step": 10300 }, { "epoch": 33.151125401929264, "grad_norm": 61.755916595458984, "learning_rate": 8.019587460807446e-06, "loss": 5.0671, "step": 10310 }, { "epoch": 33.18327974276527, "grad_norm": 62.97221374511719, "learning_rate": 8.015703238409996e-06, "loss": 5.0072, "step": 10320 }, { "epoch": 33.21543408360129, "grad_norm": 52.16510009765625, "learning_rate": 8.011816287695298e-06, "loss": 4.7586, "step": 10330 }, { "epoch": 33.247588424437296, "grad_norm": 46.51667022705078, "learning_rate": 8.00792661287881e-06, "loss": 5.0421, "step": 10340 }, { "epoch": 33.27974276527331, "grad_norm": 60.30772018432617, "learning_rate": 8.00403421817894e-06, "loss": 5.4156, "step": 10350 }, { "epoch": 33.31189710610933, "grad_norm": 58.135738372802734, "learning_rate": 8.000139107817045e-06, "loss": 5.2792, "step": 10360 }, { "epoch": 33.344051446945336, "grad_norm": 57.31541442871094, "learning_rate": 7.996241286017436e-06, "loss": 5.3575, "step": 10370 }, { "epoch": 33.37620578778135, "grad_norm": 52.93006134033203, "learning_rate": 7.99234075700735e-06, "loss": 5.0645, "step": 10380 }, { "epoch": 33.40836012861736, "grad_norm": 51.80390167236328, "learning_rate": 7.988437525016978e-06, "loss": 5.0172, "step": 10390 }, { "epoch": 33.440514469453376, "grad_norm": 55.32362747192383, "learning_rate": 7.984531594279423e-06, "loss": 4.8687, "step": 10400 }, { "epoch": 33.440514469453376, "eval_runtime": 5.2696, "eval_samples_per_second": 9.488, "eval_steps_per_second": 1.328, "step": 10400 }, { "epoch": 33.47266881028939, "grad_norm": 47.476802825927734, "learning_rate": 7.98062296903073e-06, "loss": 5.3106, "step": 10410 }, { "epoch": 33.5048231511254, "grad_norm": 55.41559982299805, "learning_rate": 7.97671165350986e-06, "loss": 4.6407, "step": 10420 }, { "epoch": 33.536977491961416, "grad_norm": 49.87110900878906, "learning_rate": 7.972797651958692e-06, "loss": 5.1397, "step": 10430 }, { "epoch": 33.569131832797424, "grad_norm": 62.539146423339844, "learning_rate": 7.968880968622017e-06, "loss": 5.0668, "step": 10440 }, { "epoch": 33.60128617363344, "grad_norm": 66.64210510253906, "learning_rate": 7.964961607747538e-06, "loss": 5.2268, "step": 10450 }, { "epoch": 33.633440514469456, "grad_norm": 49.909088134765625, "learning_rate": 7.96103957358586e-06, "loss": 4.7176, "step": 10460 }, { "epoch": 33.665594855305464, "grad_norm": 55.02338409423828, "learning_rate": 7.957114870390483e-06, "loss": 5.1688, "step": 10470 }, { "epoch": 33.69774919614148, "grad_norm": 54.32430648803711, "learning_rate": 7.95318750241781e-06, "loss": 5.4782, "step": 10480 }, { "epoch": 33.72990353697749, "grad_norm": 55.96116256713867, "learning_rate": 7.949257473927127e-06, "loss": 5.367, "step": 10490 }, { "epoch": 33.762057877813504, "grad_norm": 62.097171783447266, "learning_rate": 7.945324789180608e-06, "loss": 5.5608, "step": 10500 }, { "epoch": 33.762057877813504, "eval_runtime": 5.2777, "eval_samples_per_second": 9.474, "eval_steps_per_second": 1.326, "step": 10500 }, { "epoch": 33.79421221864952, "grad_norm": 47.85714340209961, "learning_rate": 7.941389452443306e-06, "loss": 5.1342, "step": 10510 }, { "epoch": 33.82636655948553, "grad_norm": 55.75582504272461, "learning_rate": 7.937451467983156e-06, "loss": 5.2984, "step": 10520 }, { "epoch": 33.858520900321544, "grad_norm": 58.995609283447266, "learning_rate": 7.933510840070957e-06, "loss": 5.4034, "step": 10530 }, { "epoch": 33.89067524115756, "grad_norm": 42.02104568481445, "learning_rate": 7.92956757298038e-06, "loss": 4.9508, "step": 10540 }, { "epoch": 33.92282958199357, "grad_norm": 54.44306182861328, "learning_rate": 7.925621670987952e-06, "loss": 5.3077, "step": 10550 }, { "epoch": 33.954983922829584, "grad_norm": 51.43379592895508, "learning_rate": 7.921673138373068e-06, "loss": 5.1871, "step": 10560 }, { "epoch": 33.98713826366559, "grad_norm": 49.6999397277832, "learning_rate": 7.917721979417966e-06, "loss": 4.8582, "step": 10570 }, { "epoch": 34.01929260450161, "grad_norm": 54.68442916870117, "learning_rate": 7.913768198407737e-06, "loss": 4.9199, "step": 10580 }, { "epoch": 34.051446945337624, "grad_norm": 59.49836730957031, "learning_rate": 7.909811799630314e-06, "loss": 4.8021, "step": 10590 }, { "epoch": 34.08360128617363, "grad_norm": 56.582618713378906, "learning_rate": 7.90585278737647e-06, "loss": 4.8054, "step": 10600 }, { "epoch": 34.08360128617363, "eval_runtime": 5.3202, "eval_samples_per_second": 9.398, "eval_steps_per_second": 1.316, "step": 10600 }, { "epoch": 34.11575562700965, "grad_norm": 49.42789077758789, "learning_rate": 7.901891165939814e-06, "loss": 5.0305, "step": 10610 }, { "epoch": 34.147909967845656, "grad_norm": 61.47016525268555, "learning_rate": 7.89792693961678e-06, "loss": 5.2698, "step": 10620 }, { "epoch": 34.18006430868167, "grad_norm": 57.904640197753906, "learning_rate": 7.893960112706634e-06, "loss": 4.9296, "step": 10630 }, { "epoch": 34.21221864951769, "grad_norm": 56.29244613647461, "learning_rate": 7.889990689511453e-06, "loss": 5.5073, "step": 10640 }, { "epoch": 34.244372990353696, "grad_norm": 57.911659240722656, "learning_rate": 7.88601867433614e-06, "loss": 4.6156, "step": 10650 }, { "epoch": 34.27652733118971, "grad_norm": 55.01363754272461, "learning_rate": 7.8820440714884e-06, "loss": 4.8421, "step": 10660 }, { "epoch": 34.30868167202572, "grad_norm": 54.46140670776367, "learning_rate": 7.87806688527875e-06, "loss": 4.6285, "step": 10670 }, { "epoch": 34.340836012861736, "grad_norm": 56.07723617553711, "learning_rate": 7.874087120020509e-06, "loss": 5.2503, "step": 10680 }, { "epoch": 34.37299035369775, "grad_norm": 65.43433380126953, "learning_rate": 7.870104780029789e-06, "loss": 5.7208, "step": 10690 }, { "epoch": 34.40514469453376, "grad_norm": 59.60455322265625, "learning_rate": 7.866119869625494e-06, "loss": 4.8097, "step": 10700 }, { "epoch": 34.40514469453376, "eval_runtime": 5.3599, "eval_samples_per_second": 9.329, "eval_steps_per_second": 1.306, "step": 10700 }, { "epoch": 34.437299035369776, "grad_norm": 52.14591979980469, "learning_rate": 7.862132393129322e-06, "loss": 5.2419, "step": 10710 }, { "epoch": 34.469453376205784, "grad_norm": 55.90645217895508, "learning_rate": 7.85814235486575e-06, "loss": 5.043, "step": 10720 }, { "epoch": 34.5016077170418, "grad_norm": 59.83700942993164, "learning_rate": 7.85414975916203e-06, "loss": 4.8822, "step": 10730 }, { "epoch": 34.533762057877816, "grad_norm": 53.75102233886719, "learning_rate": 7.850154610348193e-06, "loss": 4.9036, "step": 10740 }, { "epoch": 34.565916398713824, "grad_norm": 52.06101989746094, "learning_rate": 7.84615691275704e-06, "loss": 4.4746, "step": 10750 }, { "epoch": 34.59807073954984, "grad_norm": 54.81111145019531, "learning_rate": 7.842156670724126e-06, "loss": 4.9541, "step": 10760 }, { "epoch": 34.630225080385856, "grad_norm": 58.03950881958008, "learning_rate": 7.838153888587778e-06, "loss": 4.8814, "step": 10770 }, { "epoch": 34.662379421221864, "grad_norm": 54.10479736328125, "learning_rate": 7.834148570689071e-06, "loss": 4.2837, "step": 10780 }, { "epoch": 34.69453376205788, "grad_norm": 52.66333770751953, "learning_rate": 7.830140721371828e-06, "loss": 4.8197, "step": 10790 }, { "epoch": 34.72668810289389, "grad_norm": 51.45301818847656, "learning_rate": 7.826130344982626e-06, "loss": 4.9082, "step": 10800 }, { "epoch": 34.72668810289389, "eval_runtime": 5.3145, "eval_samples_per_second": 9.408, "eval_steps_per_second": 1.317, "step": 10800 }, { "epoch": 34.758842443729904, "grad_norm": 55.49913787841797, "learning_rate": 7.822117445870773e-06, "loss": 4.1214, "step": 10810 }, { "epoch": 34.79099678456592, "grad_norm": 50.87074279785156, "learning_rate": 7.81810202838832e-06, "loss": 4.6114, "step": 10820 }, { "epoch": 34.82315112540193, "grad_norm": 73.03480529785156, "learning_rate": 7.814084096890044e-06, "loss": 4.9529, "step": 10830 }, { "epoch": 34.855305466237944, "grad_norm": 49.86583709716797, "learning_rate": 7.810063655733453e-06, "loss": 4.3955, "step": 10840 }, { "epoch": 34.88745980707395, "grad_norm": 60.34618377685547, "learning_rate": 7.806040709278777e-06, "loss": 4.5711, "step": 10850 }, { "epoch": 34.91961414790997, "grad_norm": 59.98482131958008, "learning_rate": 7.802015261888954e-06, "loss": 5.022, "step": 10860 }, { "epoch": 34.951768488745984, "grad_norm": 49.9338493347168, "learning_rate": 7.797987317929646e-06, "loss": 4.9322, "step": 10870 }, { "epoch": 34.98392282958199, "grad_norm": 51.86176681518555, "learning_rate": 7.793956881769216e-06, "loss": 4.5391, "step": 10880 }, { "epoch": 35.01607717041801, "grad_norm": 67.34719848632812, "learning_rate": 7.789923957778733e-06, "loss": 4.5859, "step": 10890 }, { "epoch": 35.048231511254016, "grad_norm": 50.80049514770508, "learning_rate": 7.785888550331961e-06, "loss": 4.7646, "step": 10900 }, { "epoch": 35.048231511254016, "eval_runtime": 5.3277, "eval_samples_per_second": 9.385, "eval_steps_per_second": 1.314, "step": 10900 }, { "epoch": 35.08038585209003, "grad_norm": 64.83521270751953, "learning_rate": 7.781850663805357e-06, "loss": 4.9858, "step": 10910 }, { "epoch": 35.11254019292605, "grad_norm": 62.404415130615234, "learning_rate": 7.777810302578074e-06, "loss": 4.9883, "step": 10920 }, { "epoch": 35.144694533762056, "grad_norm": 41.906639099121094, "learning_rate": 7.773767471031939e-06, "loss": 4.3608, "step": 10930 }, { "epoch": 35.17684887459807, "grad_norm": 48.85271072387695, "learning_rate": 7.76972217355146e-06, "loss": 4.5729, "step": 10940 }, { "epoch": 35.20900321543408, "grad_norm": 47.46480178833008, "learning_rate": 7.765674414523828e-06, "loss": 4.7279, "step": 10950 }, { "epoch": 35.241157556270096, "grad_norm": 51.648136138916016, "learning_rate": 7.761624198338895e-06, "loss": 4.7603, "step": 10960 }, { "epoch": 35.27331189710611, "grad_norm": 50.928585052490234, "learning_rate": 7.757571529389179e-06, "loss": 4.8528, "step": 10970 }, { "epoch": 35.30546623794212, "grad_norm": 52.56718063354492, "learning_rate": 7.753516412069858e-06, "loss": 4.8298, "step": 10980 }, { "epoch": 35.337620578778136, "grad_norm": 54.092002868652344, "learning_rate": 7.74945885077877e-06, "loss": 4.4937, "step": 10990 }, { "epoch": 35.369774919614144, "grad_norm": 53.386802673339844, "learning_rate": 7.7453988499164e-06, "loss": 4.5428, "step": 11000 }, { "epoch": 35.369774919614144, "eval_runtime": 5.3191, "eval_samples_per_second": 9.4, "eval_steps_per_second": 1.316, "step": 11000 }, { "epoch": 35.40192926045016, "grad_norm": 49.88675308227539, "learning_rate": 7.741336413885875e-06, "loss": 5.1831, "step": 11010 }, { "epoch": 35.434083601286176, "grad_norm": 49.13835525512695, "learning_rate": 7.73727154709297e-06, "loss": 4.6913, "step": 11020 }, { "epoch": 35.466237942122184, "grad_norm": 49.21491241455078, "learning_rate": 7.733204253946092e-06, "loss": 4.6972, "step": 11030 }, { "epoch": 35.4983922829582, "grad_norm": 49.346710205078125, "learning_rate": 7.729134538856279e-06, "loss": 4.4319, "step": 11040 }, { "epoch": 35.530546623794216, "grad_norm": 47.70076370239258, "learning_rate": 7.725062406237197e-06, "loss": 4.9937, "step": 11050 }, { "epoch": 35.562700964630224, "grad_norm": 55.57027816772461, "learning_rate": 7.720987860505137e-06, "loss": 4.4526, "step": 11060 }, { "epoch": 35.59485530546624, "grad_norm": 53.46825408935547, "learning_rate": 7.716910906079004e-06, "loss": 4.7098, "step": 11070 }, { "epoch": 35.62700964630225, "grad_norm": 57.566795349121094, "learning_rate": 7.712831547380309e-06, "loss": 5.1409, "step": 11080 }, { "epoch": 35.659163987138264, "grad_norm": 54.77958679199219, "learning_rate": 7.708749788833185e-06, "loss": 4.8542, "step": 11090 }, { "epoch": 35.69131832797428, "grad_norm": 56.45766067504883, "learning_rate": 7.704665634864354e-06, "loss": 4.5619, "step": 11100 }, { "epoch": 35.69131832797428, "eval_runtime": 5.3516, "eval_samples_per_second": 9.343, "eval_steps_per_second": 1.308, "step": 11100 }, { "epoch": 35.72347266881029, "grad_norm": 51.701271057128906, "learning_rate": 7.700579089903142e-06, "loss": 5.0409, "step": 11110 }, { "epoch": 35.755627009646304, "grad_norm": 52.01728057861328, "learning_rate": 7.696490158381466e-06, "loss": 4.4957, "step": 11120 }, { "epoch": 35.78778135048231, "grad_norm": 61.62910842895508, "learning_rate": 7.692398844733834e-06, "loss": 4.831, "step": 11130 }, { "epoch": 35.81993569131833, "grad_norm": 50.818359375, "learning_rate": 7.688305153397337e-06, "loss": 5.0025, "step": 11140 }, { "epoch": 35.852090032154344, "grad_norm": 52.49498748779297, "learning_rate": 7.684209088811643e-06, "loss": 4.4589, "step": 11150 }, { "epoch": 35.88424437299035, "grad_norm": 62.3687629699707, "learning_rate": 7.680110655418996e-06, "loss": 4.4222, "step": 11160 }, { "epoch": 35.91639871382637, "grad_norm": 50.95934295654297, "learning_rate": 7.676009857664202e-06, "loss": 5.2032, "step": 11170 }, { "epoch": 35.948553054662376, "grad_norm": 51.3870735168457, "learning_rate": 7.671906699994643e-06, "loss": 4.689, "step": 11180 }, { "epoch": 35.98070739549839, "grad_norm": 57.1351203918457, "learning_rate": 7.66780118686025e-06, "loss": 5.1134, "step": 11190 }, { "epoch": 36.01286173633441, "grad_norm": 54.10897445678711, "learning_rate": 7.663693322713514e-06, "loss": 5.0086, "step": 11200 }, { "epoch": 36.01286173633441, "eval_runtime": 5.2513, "eval_samples_per_second": 9.522, "eval_steps_per_second": 1.333, "step": 11200 }, { "epoch": 36.045016077170416, "grad_norm": 49.9356803894043, "learning_rate": 7.659583112009472e-06, "loss": 4.5061, "step": 11210 }, { "epoch": 36.07717041800643, "grad_norm": 49.30137252807617, "learning_rate": 7.655470559205712e-06, "loss": 4.8743, "step": 11220 }, { "epoch": 36.10932475884244, "grad_norm": 49.49820327758789, "learning_rate": 7.651355668762356e-06, "loss": 4.7676, "step": 11230 }, { "epoch": 36.141479099678456, "grad_norm": 54.866825103759766, "learning_rate": 7.647238445142063e-06, "loss": 5.1806, "step": 11240 }, { "epoch": 36.17363344051447, "grad_norm": 49.2337760925293, "learning_rate": 7.643118892810022e-06, "loss": 4.3792, "step": 11250 }, { "epoch": 36.20578778135048, "grad_norm": 53.35176086425781, "learning_rate": 7.638997016233952e-06, "loss": 4.3638, "step": 11260 }, { "epoch": 36.237942122186496, "grad_norm": 49.682701110839844, "learning_rate": 7.634872819884083e-06, "loss": 4.6008, "step": 11270 }, { "epoch": 36.27009646302251, "grad_norm": 58.483943939208984, "learning_rate": 7.630746308233172e-06, "loss": 5.1759, "step": 11280 }, { "epoch": 36.30225080385852, "grad_norm": 55.22038269042969, "learning_rate": 7.62661748575648e-06, "loss": 4.4753, "step": 11290 }, { "epoch": 36.334405144694536, "grad_norm": 54.23451614379883, "learning_rate": 7.622486356931774e-06, "loss": 4.3676, "step": 11300 }, { "epoch": 36.334405144694536, "eval_runtime": 5.3029, "eval_samples_per_second": 9.429, "eval_steps_per_second": 1.32, "step": 11300 }, { "epoch": 36.366559485530544, "grad_norm": 51.88092803955078, "learning_rate": 7.618352926239329e-06, "loss": 4.3776, "step": 11310 }, { "epoch": 36.39871382636656, "grad_norm": 63.947120666503906, "learning_rate": 7.614217198161905e-06, "loss": 4.4751, "step": 11320 }, { "epoch": 36.430868167202576, "grad_norm": 56.706321716308594, "learning_rate": 7.610079177184764e-06, "loss": 4.6462, "step": 11330 }, { "epoch": 36.463022508038584, "grad_norm": 52.347450256347656, "learning_rate": 7.60593886779565e-06, "loss": 4.4309, "step": 11340 }, { "epoch": 36.4951768488746, "grad_norm": 58.955604553222656, "learning_rate": 7.6017962744847905e-06, "loss": 4.8769, "step": 11350 }, { "epoch": 36.52733118971061, "grad_norm": 51.429752349853516, "learning_rate": 7.597651401744888e-06, "loss": 4.9719, "step": 11360 }, { "epoch": 36.559485530546624, "grad_norm": 48.57435989379883, "learning_rate": 7.593504254071117e-06, "loss": 4.6448, "step": 11370 }, { "epoch": 36.59163987138264, "grad_norm": 55.499603271484375, "learning_rate": 7.589354835961123e-06, "loss": 4.5167, "step": 11380 }, { "epoch": 36.62379421221865, "grad_norm": 49.66143035888672, "learning_rate": 7.585203151915007e-06, "loss": 4.2146, "step": 11390 }, { "epoch": 36.655948553054664, "grad_norm": 47.49949645996094, "learning_rate": 7.581049206435337e-06, "loss": 4.5781, "step": 11400 }, { "epoch": 36.655948553054664, "eval_runtime": 5.3061, "eval_samples_per_second": 9.423, "eval_steps_per_second": 1.319, "step": 11400 }, { "epoch": 36.68810289389067, "grad_norm": 47.65353775024414, "learning_rate": 7.576893004027125e-06, "loss": 4.9129, "step": 11410 }, { "epoch": 36.72025723472669, "grad_norm": 56.42992401123047, "learning_rate": 7.572734549197833e-06, "loss": 5.2679, "step": 11420 }, { "epoch": 36.752411575562704, "grad_norm": 50.922462463378906, "learning_rate": 7.568573846457367e-06, "loss": 4.8407, "step": 11430 }, { "epoch": 36.78456591639871, "grad_norm": 53.2662239074707, "learning_rate": 7.564410900318073e-06, "loss": 4.477, "step": 11440 }, { "epoch": 36.81672025723473, "grad_norm": 53.8399543762207, "learning_rate": 7.560245715294724e-06, "loss": 4.704, "step": 11450 }, { "epoch": 36.848874598070736, "grad_norm": 49.70867919921875, "learning_rate": 7.556078295904526e-06, "loss": 4.7144, "step": 11460 }, { "epoch": 36.88102893890675, "grad_norm": 53.347877502441406, "learning_rate": 7.551908646667108e-06, "loss": 4.7637, "step": 11470 }, { "epoch": 36.91318327974277, "grad_norm": 58.69789123535156, "learning_rate": 7.5477367721045134e-06, "loss": 4.1648, "step": 11480 }, { "epoch": 36.945337620578776, "grad_norm": 53.40052795410156, "learning_rate": 7.5435626767412044e-06, "loss": 4.6815, "step": 11490 }, { "epoch": 36.97749196141479, "grad_norm": 61.14105987548828, "learning_rate": 7.539386365104048e-06, "loss": 4.8457, "step": 11500 }, { "epoch": 36.97749196141479, "eval_runtime": 5.3057, "eval_samples_per_second": 9.424, "eval_steps_per_second": 1.319, "step": 11500 }, { "epoch": 37.0096463022508, "grad_norm": 59.53232955932617, "learning_rate": 7.535207841722314e-06, "loss": 4.7136, "step": 11510 }, { "epoch": 37.041800643086816, "grad_norm": 51.32575607299805, "learning_rate": 7.531027111127674e-06, "loss": 4.7227, "step": 11520 }, { "epoch": 37.07395498392283, "grad_norm": 51.98917007446289, "learning_rate": 7.526844177854192e-06, "loss": 4.454, "step": 11530 }, { "epoch": 37.10610932475884, "grad_norm": 63.425270080566406, "learning_rate": 7.522659046438321e-06, "loss": 4.5716, "step": 11540 }, { "epoch": 37.138263665594856, "grad_norm": 50.95613098144531, "learning_rate": 7.518471721418899e-06, "loss": 4.3262, "step": 11550 }, { "epoch": 37.17041800643087, "grad_norm": 48.976016998291016, "learning_rate": 7.514282207337138e-06, "loss": 4.1325, "step": 11560 }, { "epoch": 37.20257234726688, "grad_norm": 50.415836334228516, "learning_rate": 7.510090508736632e-06, "loss": 4.5524, "step": 11570 }, { "epoch": 37.234726688102896, "grad_norm": 54.804962158203125, "learning_rate": 7.505896630163337e-06, "loss": 4.3662, "step": 11580 }, { "epoch": 37.266881028938904, "grad_norm": 46.61585998535156, "learning_rate": 7.501700576165577e-06, "loss": 4.1974, "step": 11590 }, { "epoch": 37.29903536977492, "grad_norm": 53.02007293701172, "learning_rate": 7.497502351294035e-06, "loss": 4.2217, "step": 11600 }, { "epoch": 37.29903536977492, "eval_runtime": 5.2959, "eval_samples_per_second": 9.441, "eval_steps_per_second": 1.322, "step": 11600 }, { "epoch": 37.331189710610936, "grad_norm": 60.739479064941406, "learning_rate": 7.493301960101747e-06, "loss": 4.3619, "step": 11610 }, { "epoch": 37.363344051446944, "grad_norm": 50.937767028808594, "learning_rate": 7.489099407144098e-06, "loss": 4.3764, "step": 11620 }, { "epoch": 37.39549839228296, "grad_norm": 51.02251052856445, "learning_rate": 7.484894696978823e-06, "loss": 4.2902, "step": 11630 }, { "epoch": 37.42765273311897, "grad_norm": 58.351078033447266, "learning_rate": 7.480687834165987e-06, "loss": 4.6135, "step": 11640 }, { "epoch": 37.459807073954984, "grad_norm": 51.953086853027344, "learning_rate": 7.476478823267997e-06, "loss": 4.2574, "step": 11650 }, { "epoch": 37.491961414791, "grad_norm": 51.992427825927734, "learning_rate": 7.472267668849588e-06, "loss": 4.4152, "step": 11660 }, { "epoch": 37.52411575562701, "grad_norm": 44.269142150878906, "learning_rate": 7.468054375477818e-06, "loss": 4.3426, "step": 11670 }, { "epoch": 37.556270096463024, "grad_norm": 48.05318832397461, "learning_rate": 7.463838947722066e-06, "loss": 4.9246, "step": 11680 }, { "epoch": 37.58842443729903, "grad_norm": 51.06418228149414, "learning_rate": 7.4596213901540274e-06, "loss": 3.868, "step": 11690 }, { "epoch": 37.62057877813505, "grad_norm": 54.30795669555664, "learning_rate": 7.4554017073477e-06, "loss": 4.7488, "step": 11700 }, { "epoch": 37.62057877813505, "eval_runtime": 5.3277, "eval_samples_per_second": 9.385, "eval_steps_per_second": 1.314, "step": 11700 }, { "epoch": 37.652733118971064, "grad_norm": 47.76860046386719, "learning_rate": 7.4511799038793985e-06, "loss": 4.0585, "step": 11710 }, { "epoch": 37.68488745980707, "grad_norm": 54.083866119384766, "learning_rate": 7.446955984327729e-06, "loss": 4.0484, "step": 11720 }, { "epoch": 37.71704180064309, "grad_norm": 51.72701644897461, "learning_rate": 7.442729953273592e-06, "loss": 4.6214, "step": 11730 }, { "epoch": 37.749196141479096, "grad_norm": 61.967830657958984, "learning_rate": 7.438501815300182e-06, "loss": 5.1865, "step": 11740 }, { "epoch": 37.78135048231511, "grad_norm": 58.95353317260742, "learning_rate": 7.434271574992978e-06, "loss": 4.3511, "step": 11750 }, { "epoch": 37.81350482315113, "grad_norm": 51.34463119506836, "learning_rate": 7.430039236939738e-06, "loss": 4.5215, "step": 11760 }, { "epoch": 37.845659163987136, "grad_norm": 45.89046096801758, "learning_rate": 7.4258048057304944e-06, "loss": 4.326, "step": 11770 }, { "epoch": 37.87781350482315, "grad_norm": 59.45298767089844, "learning_rate": 7.421568285957549e-06, "loss": 4.6785, "step": 11780 }, { "epoch": 37.90996784565917, "grad_norm": 57.05272674560547, "learning_rate": 7.417329682215472e-06, "loss": 4.733, "step": 11790 }, { "epoch": 37.942122186495176, "grad_norm": 49.27427291870117, "learning_rate": 7.41308899910109e-06, "loss": 4.3005, "step": 11800 }, { "epoch": 37.942122186495176, "eval_runtime": 5.344, "eval_samples_per_second": 9.356, "eval_steps_per_second": 1.31, "step": 11800 }, { "epoch": 37.97427652733119, "grad_norm": 47.96388626098633, "learning_rate": 7.408846241213487e-06, "loss": 4.4826, "step": 11810 }, { "epoch": 38.0064308681672, "grad_norm": 50.23042678833008, "learning_rate": 7.404601413153997e-06, "loss": 4.3017, "step": 11820 }, { "epoch": 38.038585209003216, "grad_norm": 54.80693054199219, "learning_rate": 7.400354519526197e-06, "loss": 4.7194, "step": 11830 }, { "epoch": 38.07073954983923, "grad_norm": 57.706939697265625, "learning_rate": 7.396105564935908e-06, "loss": 4.4728, "step": 11840 }, { "epoch": 38.10289389067524, "grad_norm": 48.10573196411133, "learning_rate": 7.391854553991184e-06, "loss": 4.887, "step": 11850 }, { "epoch": 38.135048231511256, "grad_norm": 53.23372268676758, "learning_rate": 7.387601491302305e-06, "loss": 4.6529, "step": 11860 }, { "epoch": 38.167202572347264, "grad_norm": 63.3897590637207, "learning_rate": 7.383346381481785e-06, "loss": 4.651, "step": 11870 }, { "epoch": 38.19935691318328, "grad_norm": 54.40311813354492, "learning_rate": 7.379089229144351e-06, "loss": 4.2909, "step": 11880 }, { "epoch": 38.231511254019296, "grad_norm": 60.93667984008789, "learning_rate": 7.374830038906946e-06, "loss": 5.0164, "step": 11890 }, { "epoch": 38.263665594855304, "grad_norm": 47.537803649902344, "learning_rate": 7.37056881538873e-06, "loss": 4.1804, "step": 11900 }, { "epoch": 38.263665594855304, "eval_runtime": 5.3558, "eval_samples_per_second": 9.336, "eval_steps_per_second": 1.307, "step": 11900 }, { "epoch": 38.29581993569132, "grad_norm": 54.4545783996582, "learning_rate": 7.366305563211057e-06, "loss": 4.4244, "step": 11910 }, { "epoch": 38.32797427652733, "grad_norm": 52.25836944580078, "learning_rate": 7.362040286997487e-06, "loss": 4.4059, "step": 11920 }, { "epoch": 38.360128617363344, "grad_norm": 52.16881561279297, "learning_rate": 7.35777299137378e-06, "loss": 3.846, "step": 11930 }, { "epoch": 38.39228295819936, "grad_norm": 48.82463836669922, "learning_rate": 7.353503680967875e-06, "loss": 4.6382, "step": 11940 }, { "epoch": 38.42443729903537, "grad_norm": 57.7891960144043, "learning_rate": 7.349232360409907e-06, "loss": 4.0386, "step": 11950 }, { "epoch": 38.456591639871384, "grad_norm": 56.9764404296875, "learning_rate": 7.344959034332183e-06, "loss": 4.301, "step": 11960 }, { "epoch": 38.48874598070739, "grad_norm": 48.01116943359375, "learning_rate": 7.34068370736919e-06, "loss": 3.9257, "step": 11970 }, { "epoch": 38.52090032154341, "grad_norm": 46.03276443481445, "learning_rate": 7.336406384157582e-06, "loss": 4.5745, "step": 11980 }, { "epoch": 38.553054662379424, "grad_norm": 55.188987731933594, "learning_rate": 7.332127069336179e-06, "loss": 4.2677, "step": 11990 }, { "epoch": 38.58520900321543, "grad_norm": 65.20699310302734, "learning_rate": 7.327845767545961e-06, "loss": 4.4725, "step": 12000 }, { "epoch": 38.58520900321543, "eval_runtime": 5.2571, "eval_samples_per_second": 9.511, "eval_steps_per_second": 1.332, "step": 12000 }, { "epoch": 38.61736334405145, "grad_norm": 52.5718994140625, "learning_rate": 7.323562483430066e-06, "loss": 3.9634, "step": 12010 }, { "epoch": 38.649517684887456, "grad_norm": 70.15001678466797, "learning_rate": 7.3192772216337736e-06, "loss": 4.6852, "step": 12020 }, { "epoch": 38.68167202572347, "grad_norm": 48.11845016479492, "learning_rate": 7.314989986804516e-06, "loss": 4.5415, "step": 12030 }, { "epoch": 38.71382636655949, "grad_norm": 51.04377746582031, "learning_rate": 7.310700783591863e-06, "loss": 4.1097, "step": 12040 }, { "epoch": 38.745980707395496, "grad_norm": 52.283058166503906, "learning_rate": 7.306409616647516e-06, "loss": 4.1815, "step": 12050 }, { "epoch": 38.77813504823151, "grad_norm": 44.72518539428711, "learning_rate": 7.302116490625313e-06, "loss": 4.1274, "step": 12060 }, { "epoch": 38.81028938906753, "grad_norm": 57.12190628051758, "learning_rate": 7.29782141018121e-06, "loss": 4.1141, "step": 12070 }, { "epoch": 38.842443729903536, "grad_norm": 53.0617561340332, "learning_rate": 7.293524379973286e-06, "loss": 4.8792, "step": 12080 }, { "epoch": 38.87459807073955, "grad_norm": 51.7138786315918, "learning_rate": 7.2892254046617325e-06, "loss": 4.1189, "step": 12090 }, { "epoch": 38.90675241157556, "grad_norm": 48.76931381225586, "learning_rate": 7.284924488908853e-06, "loss": 4.2506, "step": 12100 }, { "epoch": 38.90675241157556, "eval_runtime": 5.2904, "eval_samples_per_second": 9.451, "eval_steps_per_second": 1.323, "step": 12100 }, { "epoch": 38.938906752411576, "grad_norm": 56.704402923583984, "learning_rate": 7.280621637379051e-06, "loss": 4.7519, "step": 12110 }, { "epoch": 38.97106109324759, "grad_norm": 55.45400619506836, "learning_rate": 7.276316854738836e-06, "loss": 4.0192, "step": 12120 }, { "epoch": 39.0032154340836, "grad_norm": 48.555965423583984, "learning_rate": 7.272010145656807e-06, "loss": 4.5071, "step": 12130 }, { "epoch": 39.035369774919616, "grad_norm": 43.25794982910156, "learning_rate": 7.267701514803653e-06, "loss": 4.6054, "step": 12140 }, { "epoch": 39.067524115755624, "grad_norm": 47.7982063293457, "learning_rate": 7.2633909668521484e-06, "loss": 4.0522, "step": 12150 }, { "epoch": 39.09967845659164, "grad_norm": 48.52701950073242, "learning_rate": 7.259078506477144e-06, "loss": 4.089, "step": 12160 }, { "epoch": 39.131832797427656, "grad_norm": 49.707366943359375, "learning_rate": 7.25476413835557e-06, "loss": 4.2426, "step": 12170 }, { "epoch": 39.163987138263664, "grad_norm": 49.971153259277344, "learning_rate": 7.250447867166419e-06, "loss": 4.0997, "step": 12180 }, { "epoch": 39.19614147909968, "grad_norm": 50.016578674316406, "learning_rate": 7.2461296975907525e-06, "loss": 4.465, "step": 12190 }, { "epoch": 39.22829581993569, "grad_norm": 51.3808479309082, "learning_rate": 7.241809634311689e-06, "loss": 3.8728, "step": 12200 }, { "epoch": 39.22829581993569, "eval_runtime": 5.389, "eval_samples_per_second": 9.278, "eval_steps_per_second": 1.299, "step": 12200 }, { "epoch": 39.260450160771704, "grad_norm": 50.564178466796875, "learning_rate": 7.237487682014401e-06, "loss": 4.0188, "step": 12210 }, { "epoch": 39.29260450160772, "grad_norm": 46.724815368652344, "learning_rate": 7.2331638453861095e-06, "loss": 4.1598, "step": 12220 }, { "epoch": 39.32475884244373, "grad_norm": 47.57096481323242, "learning_rate": 7.228838129116078e-06, "loss": 4.1301, "step": 12230 }, { "epoch": 39.356913183279744, "grad_norm": 53.63093566894531, "learning_rate": 7.224510537895611e-06, "loss": 3.9937, "step": 12240 }, { "epoch": 39.38906752411575, "grad_norm": 52.343746185302734, "learning_rate": 7.2201810764180416e-06, "loss": 4.2309, "step": 12250 }, { "epoch": 39.42122186495177, "grad_norm": 44.674739837646484, "learning_rate": 7.2158497493787385e-06, "loss": 4.446, "step": 12260 }, { "epoch": 39.453376205787784, "grad_norm": 50.669898986816406, "learning_rate": 7.211516561475088e-06, "loss": 4.2705, "step": 12270 }, { "epoch": 39.48553054662379, "grad_norm": 50.86297607421875, "learning_rate": 7.207181517406499e-06, "loss": 4.3653, "step": 12280 }, { "epoch": 39.51768488745981, "grad_norm": 44.99029541015625, "learning_rate": 7.202844621874385e-06, "loss": 4.0741, "step": 12290 }, { "epoch": 39.549839228295816, "grad_norm": 44.5157356262207, "learning_rate": 7.198505879582178e-06, "loss": 4.0135, "step": 12300 }, { "epoch": 39.549839228295816, "eval_runtime": 5.3429, "eval_samples_per_second": 9.358, "eval_steps_per_second": 1.31, "step": 12300 }, { "epoch": 39.58199356913183, "grad_norm": 51.87458419799805, "learning_rate": 7.194165295235307e-06, "loss": 4.1508, "step": 12310 }, { "epoch": 39.61414790996785, "grad_norm": 54.53363800048828, "learning_rate": 7.189822873541198e-06, "loss": 4.2903, "step": 12320 }, { "epoch": 39.646302250803856, "grad_norm": 42.80707550048828, "learning_rate": 7.185478619209273e-06, "loss": 3.8437, "step": 12330 }, { "epoch": 39.67845659163987, "grad_norm": 46.76114273071289, "learning_rate": 7.1811325369509385e-06, "loss": 3.8743, "step": 12340 }, { "epoch": 39.71061093247589, "grad_norm": 58.07074737548828, "learning_rate": 7.176784631479588e-06, "loss": 3.9895, "step": 12350 }, { "epoch": 39.742765273311896, "grad_norm": 56.82476806640625, "learning_rate": 7.172434907510584e-06, "loss": 4.2285, "step": 12360 }, { "epoch": 39.77491961414791, "grad_norm": 52.86793518066406, "learning_rate": 7.168083369761269e-06, "loss": 4.448, "step": 12370 }, { "epoch": 39.80707395498392, "grad_norm": 54.716407775878906, "learning_rate": 7.16373002295095e-06, "loss": 4.3565, "step": 12380 }, { "epoch": 39.839228295819936, "grad_norm": 45.35209274291992, "learning_rate": 7.159374871800897e-06, "loss": 3.9963, "step": 12390 }, { "epoch": 39.87138263665595, "grad_norm": 46.28361511230469, "learning_rate": 7.155017921034332e-06, "loss": 4.1374, "step": 12400 }, { "epoch": 39.87138263665595, "eval_runtime": 5.2464, "eval_samples_per_second": 9.53, "eval_steps_per_second": 1.334, "step": 12400 }, { "epoch": 39.90353697749196, "grad_norm": 50.316036224365234, "learning_rate": 7.150659175376436e-06, "loss": 4.0582, "step": 12410 }, { "epoch": 39.935691318327976, "grad_norm": 52.240936279296875, "learning_rate": 7.146298639554331e-06, "loss": 4.0898, "step": 12420 }, { "epoch": 39.967845659163984, "grad_norm": 58.620826721191406, "learning_rate": 7.141936318297083e-06, "loss": 4.4116, "step": 12430 }, { "epoch": 40.0, "grad_norm": 45.626583099365234, "learning_rate": 7.1375722163356945e-06, "loss": 4.2743, "step": 12440 }, { "epoch": 40.032154340836016, "grad_norm": 52.46480941772461, "learning_rate": 7.133206338403097e-06, "loss": 4.2337, "step": 12450 }, { "epoch": 40.064308681672024, "grad_norm": 48.91707229614258, "learning_rate": 7.128838689234147e-06, "loss": 3.9645, "step": 12460 }, { "epoch": 40.09646302250804, "grad_norm": 48.05337142944336, "learning_rate": 7.124469273565632e-06, "loss": 3.8868, "step": 12470 }, { "epoch": 40.12861736334405, "grad_norm": 47.3795280456543, "learning_rate": 7.120098096136241e-06, "loss": 3.9171, "step": 12480 }, { "epoch": 40.160771704180064, "grad_norm": 51.975257873535156, "learning_rate": 7.115725161686581e-06, "loss": 4.4802, "step": 12490 }, { "epoch": 40.19292604501608, "grad_norm": 47.14204788208008, "learning_rate": 7.111350474959167e-06, "loss": 4.1723, "step": 12500 }, { "epoch": 40.19292604501608, "eval_runtime": 5.3687, "eval_samples_per_second": 9.313, "eval_steps_per_second": 1.304, "step": 12500 }, { "epoch": 40.22508038585209, "grad_norm": 51.50626754760742, "learning_rate": 7.106974040698409e-06, "loss": 4.2594, "step": 12510 }, { "epoch": 40.257234726688104, "grad_norm": 49.7122688293457, "learning_rate": 7.102595863650616e-06, "loss": 4.2304, "step": 12520 }, { "epoch": 40.28938906752411, "grad_norm": 50.55232620239258, "learning_rate": 7.098215948563982e-06, "loss": 4.0336, "step": 12530 }, { "epoch": 40.32154340836013, "grad_norm": 42.135475158691406, "learning_rate": 7.0938343001885945e-06, "loss": 4.2344, "step": 12540 }, { "epoch": 40.353697749196144, "grad_norm": 54.885189056396484, "learning_rate": 7.089450923276412e-06, "loss": 4.086, "step": 12550 }, { "epoch": 40.38585209003215, "grad_norm": 52.99751663208008, "learning_rate": 7.085065822581273e-06, "loss": 3.7719, "step": 12560 }, { "epoch": 40.41800643086817, "grad_norm": 43.21663284301758, "learning_rate": 7.0806790028588835e-06, "loss": 3.9021, "step": 12570 }, { "epoch": 40.450160771704184, "grad_norm": 53.63578414916992, "learning_rate": 7.076290468866813e-06, "loss": 3.8744, "step": 12580 }, { "epoch": 40.48231511254019, "grad_norm": 50.84957504272461, "learning_rate": 7.071900225364494e-06, "loss": 4.2161, "step": 12590 }, { "epoch": 40.51446945337621, "grad_norm": 48.79473114013672, "learning_rate": 7.067508277113208e-06, "loss": 3.6981, "step": 12600 }, { "epoch": 40.51446945337621, "eval_runtime": 5.2903, "eval_samples_per_second": 9.451, "eval_steps_per_second": 1.323, "step": 12600 }, { "epoch": 40.546623794212216, "grad_norm": 38.822975158691406, "learning_rate": 7.063114628876089e-06, "loss": 4.0974, "step": 12610 }, { "epoch": 40.57877813504823, "grad_norm": 56.83395767211914, "learning_rate": 7.058719285418112e-06, "loss": 3.8108, "step": 12620 }, { "epoch": 40.61093247588425, "grad_norm": 54.07062530517578, "learning_rate": 7.05432225150609e-06, "loss": 3.9349, "step": 12630 }, { "epoch": 40.643086816720256, "grad_norm": 51.10496139526367, "learning_rate": 7.049923531908677e-06, "loss": 3.849, "step": 12640 }, { "epoch": 40.67524115755627, "grad_norm": 53.59551239013672, "learning_rate": 7.045523131396346e-06, "loss": 4.3287, "step": 12650 }, { "epoch": 40.70739549839228, "grad_norm": 46.66079330444336, "learning_rate": 7.041121054741394e-06, "loss": 4.2149, "step": 12660 }, { "epoch": 40.739549839228296, "grad_norm": 50.1739616394043, "learning_rate": 7.036717306717941e-06, "loss": 3.7173, "step": 12670 }, { "epoch": 40.77170418006431, "grad_norm": 51.96829605102539, "learning_rate": 7.032311892101915e-06, "loss": 4.5396, "step": 12680 }, { "epoch": 40.80385852090032, "grad_norm": 53.888736724853516, "learning_rate": 7.027904815671054e-06, "loss": 4.1715, "step": 12690 }, { "epoch": 40.836012861736336, "grad_norm": 60.205169677734375, "learning_rate": 7.023496082204898e-06, "loss": 3.7667, "step": 12700 }, { "epoch": 40.836012861736336, "eval_runtime": 5.318, "eval_samples_per_second": 9.402, "eval_steps_per_second": 1.316, "step": 12700 }, { "epoch": 40.868167202572344, "grad_norm": 50.36943435668945, "learning_rate": 7.019085696484781e-06, "loss": 4.8801, "step": 12710 }, { "epoch": 40.90032154340836, "grad_norm": 45.15705108642578, "learning_rate": 7.014673663293835e-06, "loss": 3.8467, "step": 12720 }, { "epoch": 40.932475884244376, "grad_norm": 53.83464813232422, "learning_rate": 7.010259987416972e-06, "loss": 4.0208, "step": 12730 }, { "epoch": 40.964630225080384, "grad_norm": 49.05274963378906, "learning_rate": 7.005844673640888e-06, "loss": 3.8908, "step": 12740 }, { "epoch": 40.9967845659164, "grad_norm": 51.81589126586914, "learning_rate": 7.001427726754059e-06, "loss": 4.6492, "step": 12750 }, { "epoch": 41.02893890675241, "grad_norm": 49.31263732910156, "learning_rate": 6.997009151546727e-06, "loss": 3.4569, "step": 12760 }, { "epoch": 41.061093247588424, "grad_norm": 55.872779846191406, "learning_rate": 6.992588952810901e-06, "loss": 3.8868, "step": 12770 }, { "epoch": 41.09324758842444, "grad_norm": 48.168521881103516, "learning_rate": 6.988167135340355e-06, "loss": 4.0076, "step": 12780 }, { "epoch": 41.12540192926045, "grad_norm": 47.07434844970703, "learning_rate": 6.983743703930612e-06, "loss": 3.7181, "step": 12790 }, { "epoch": 41.157556270096464, "grad_norm": 50.65055465698242, "learning_rate": 6.979318663378949e-06, "loss": 4.1766, "step": 12800 }, { "epoch": 41.157556270096464, "eval_runtime": 5.367, "eval_samples_per_second": 9.316, "eval_steps_per_second": 1.304, "step": 12800 }, { "epoch": 41.18971061093247, "grad_norm": 48.88496780395508, "learning_rate": 6.974892018484391e-06, "loss": 4.2327, "step": 12810 }, { "epoch": 41.22186495176849, "grad_norm": 56.517967224121094, "learning_rate": 6.970463774047694e-06, "loss": 4.3474, "step": 12820 }, { "epoch": 41.254019292604504, "grad_norm": 44.22603988647461, "learning_rate": 6.9660339348713576e-06, "loss": 4.0066, "step": 12830 }, { "epoch": 41.28617363344051, "grad_norm": 47.0938606262207, "learning_rate": 6.96160250575961e-06, "loss": 3.6856, "step": 12840 }, { "epoch": 41.31832797427653, "grad_norm": 55.668540954589844, "learning_rate": 6.957169491518397e-06, "loss": 3.7558, "step": 12850 }, { "epoch": 41.350482315112544, "grad_norm": 41.06981658935547, "learning_rate": 6.952734896955391e-06, "loss": 4.0289, "step": 12860 }, { "epoch": 41.38263665594855, "grad_norm": 44.29338455200195, "learning_rate": 6.948298726879975e-06, "loss": 3.9282, "step": 12870 }, { "epoch": 41.41479099678457, "grad_norm": 46.47079849243164, "learning_rate": 6.943860986103238e-06, "loss": 3.7237, "step": 12880 }, { "epoch": 41.446945337620576, "grad_norm": 61.39677047729492, "learning_rate": 6.939421679437979e-06, "loss": 4.1805, "step": 12890 }, { "epoch": 41.47909967845659, "grad_norm": 51.910057067871094, "learning_rate": 6.9349808116986875e-06, "loss": 4.1601, "step": 12900 }, { "epoch": 41.47909967845659, "eval_runtime": 5.3663, "eval_samples_per_second": 9.317, "eval_steps_per_second": 1.304, "step": 12900 }, { "epoch": 41.51125401929261, "grad_norm": 48.82887649536133, "learning_rate": 6.930538387701555e-06, "loss": 3.6095, "step": 12910 }, { "epoch": 41.543408360128616, "grad_norm": 51.42851257324219, "learning_rate": 6.926094412264451e-06, "loss": 3.5896, "step": 12920 }, { "epoch": 41.57556270096463, "grad_norm": 57.7702522277832, "learning_rate": 6.9216488902069344e-06, "loss": 4.3942, "step": 12930 }, { "epoch": 41.60771704180064, "grad_norm": 55.65568161010742, "learning_rate": 6.91720182635024e-06, "loss": 3.7551, "step": 12940 }, { "epoch": 41.639871382636656, "grad_norm": 55.94721984863281, "learning_rate": 6.912753225517274e-06, "loss": 3.9126, "step": 12950 }, { "epoch": 41.67202572347267, "grad_norm": 47.62665939331055, "learning_rate": 6.908303092532608e-06, "loss": 3.8385, "step": 12960 }, { "epoch": 41.70418006430868, "grad_norm": 41.36471939086914, "learning_rate": 6.903851432222478e-06, "loss": 3.9103, "step": 12970 }, { "epoch": 41.736334405144696, "grad_norm": 49.2620849609375, "learning_rate": 6.899398249414776e-06, "loss": 3.9337, "step": 12980 }, { "epoch": 41.768488745980704, "grad_norm": 43.52669143676758, "learning_rate": 6.894943548939043e-06, "loss": 4.1341, "step": 12990 }, { "epoch": 41.80064308681672, "grad_norm": 47.10542678833008, "learning_rate": 6.890487335626468e-06, "loss": 4.0335, "step": 13000 }, { "epoch": 41.80064308681672, "eval_runtime": 5.4817, "eval_samples_per_second": 9.121, "eval_steps_per_second": 1.277, "step": 13000 }, { "epoch": 41.832797427652736, "grad_norm": 44.02701950073242, "learning_rate": 6.88602961430988e-06, "loss": 4.0232, "step": 13010 }, { "epoch": 41.864951768488744, "grad_norm": 44.46939468383789, "learning_rate": 6.881570389823743e-06, "loss": 3.7755, "step": 13020 }, { "epoch": 41.89710610932476, "grad_norm": 49.459808349609375, "learning_rate": 6.877109667004152e-06, "loss": 4.0411, "step": 13030 }, { "epoch": 41.92926045016077, "grad_norm": 50.66773223876953, "learning_rate": 6.872647450688828e-06, "loss": 4.3123, "step": 13040 }, { "epoch": 41.961414790996784, "grad_norm": 46.916908264160156, "learning_rate": 6.868183745717108e-06, "loss": 3.8806, "step": 13050 }, { "epoch": 41.9935691318328, "grad_norm": 56.159976959228516, "learning_rate": 6.863718556929947e-06, "loss": 3.7034, "step": 13060 }, { "epoch": 42.02572347266881, "grad_norm": 57.16172409057617, "learning_rate": 6.859251889169908e-06, "loss": 3.3676, "step": 13070 }, { "epoch": 42.057877813504824, "grad_norm": 48.593055725097656, "learning_rate": 6.854783747281159e-06, "loss": 3.6935, "step": 13080 }, { "epoch": 42.09003215434084, "grad_norm": 44.20922088623047, "learning_rate": 6.850314136109464e-06, "loss": 3.7804, "step": 13090 }, { "epoch": 42.12218649517685, "grad_norm": 51.157691955566406, "learning_rate": 6.845843060502182e-06, "loss": 3.706, "step": 13100 }, { "epoch": 42.12218649517685, "eval_runtime": 5.4433, "eval_samples_per_second": 9.186, "eval_steps_per_second": 1.286, "step": 13100 }, { "epoch": 42.154340836012864, "grad_norm": 48.48221969604492, "learning_rate": 6.841370525308263e-06, "loss": 4.0196, "step": 13110 }, { "epoch": 42.18649517684887, "grad_norm": 55.9564094543457, "learning_rate": 6.8368965353782345e-06, "loss": 3.7177, "step": 13120 }, { "epoch": 42.21864951768489, "grad_norm": 50.058082580566406, "learning_rate": 6.832421095564208e-06, "loss": 3.5523, "step": 13130 }, { "epoch": 42.250803858520904, "grad_norm": 45.222782135009766, "learning_rate": 6.82794421071986e-06, "loss": 3.7844, "step": 13140 }, { "epoch": 42.28295819935691, "grad_norm": 47.01181411743164, "learning_rate": 6.82346588570044e-06, "loss": 3.6422, "step": 13150 }, { "epoch": 42.31511254019293, "grad_norm": 58.874839782714844, "learning_rate": 6.818986125362758e-06, "loss": 3.5839, "step": 13160 }, { "epoch": 42.347266881028936, "grad_norm": 51.676456451416016, "learning_rate": 6.814504934565179e-06, "loss": 3.9511, "step": 13170 }, { "epoch": 42.37942122186495, "grad_norm": 54.7979736328125, "learning_rate": 6.810022318167621e-06, "loss": 4.0108, "step": 13180 }, { "epoch": 42.41157556270097, "grad_norm": 53.461387634277344, "learning_rate": 6.805538281031547e-06, "loss": 3.8066, "step": 13190 }, { "epoch": 42.443729903536976, "grad_norm": 48.200599670410156, "learning_rate": 6.801052828019962e-06, "loss": 3.5466, "step": 13200 }, { "epoch": 42.443729903536976, "eval_runtime": 5.331, "eval_samples_per_second": 9.379, "eval_steps_per_second": 1.313, "step": 13200 }, { "epoch": 42.47588424437299, "grad_norm": 54.80650329589844, "learning_rate": 6.796565963997406e-06, "loss": 3.6224, "step": 13210 }, { "epoch": 42.508038585209, "grad_norm": 52.9497184753418, "learning_rate": 6.79207769382995e-06, "loss": 3.6325, "step": 13220 }, { "epoch": 42.540192926045016, "grad_norm": 45.64604568481445, "learning_rate": 6.7875880223851865e-06, "loss": 3.6963, "step": 13230 }, { "epoch": 42.57234726688103, "grad_norm": 45.026126861572266, "learning_rate": 6.783096954532233e-06, "loss": 3.8359, "step": 13240 }, { "epoch": 42.60450160771704, "grad_norm": 43.48832321166992, "learning_rate": 6.7786044951417184e-06, "loss": 3.7845, "step": 13250 }, { "epoch": 42.636655948553056, "grad_norm": 56.996578216552734, "learning_rate": 6.774110649085777e-06, "loss": 4.0855, "step": 13260 }, { "epoch": 42.668810289389064, "grad_norm": 47.228485107421875, "learning_rate": 6.769615421238058e-06, "loss": 4.0106, "step": 13270 }, { "epoch": 42.70096463022508, "grad_norm": 56.92782974243164, "learning_rate": 6.765118816473698e-06, "loss": 3.9061, "step": 13280 }, { "epoch": 42.733118971061096, "grad_norm": 62.8319206237793, "learning_rate": 6.760620839669333e-06, "loss": 3.5112, "step": 13290 }, { "epoch": 42.765273311897104, "grad_norm": 48.438560485839844, "learning_rate": 6.756121495703084e-06, "loss": 4.078, "step": 13300 }, { "epoch": 42.765273311897104, "eval_runtime": 5.3012, "eval_samples_per_second": 9.432, "eval_steps_per_second": 1.32, "step": 13300 }, { "epoch": 42.79742765273312, "grad_norm": 44.75928497314453, "learning_rate": 6.751620789454555e-06, "loss": 3.4295, "step": 13310 }, { "epoch": 42.82958199356913, "grad_norm": 50.021488189697266, "learning_rate": 6.747118725804829e-06, "loss": 3.6957, "step": 13320 }, { "epoch": 42.861736334405144, "grad_norm": 43.38002014160156, "learning_rate": 6.742615309636462e-06, "loss": 4.1265, "step": 13330 }, { "epoch": 42.89389067524116, "grad_norm": 50.49658203125, "learning_rate": 6.738110545833473e-06, "loss": 4.2925, "step": 13340 }, { "epoch": 42.92604501607717, "grad_norm": 39.69424819946289, "learning_rate": 6.733604439281346e-06, "loss": 3.5787, "step": 13350 }, { "epoch": 42.958199356913184, "grad_norm": 49.344093322753906, "learning_rate": 6.729096994867021e-06, "loss": 3.6501, "step": 13360 }, { "epoch": 42.9903536977492, "grad_norm": 63.367366790771484, "learning_rate": 6.7245882174788835e-06, "loss": 3.7006, "step": 13370 }, { "epoch": 43.02250803858521, "grad_norm": 44.297332763671875, "learning_rate": 6.720078112006775e-06, "loss": 3.3771, "step": 13380 }, { "epoch": 43.054662379421224, "grad_norm": 59.01403045654297, "learning_rate": 6.7155666833419685e-06, "loss": 3.8684, "step": 13390 }, { "epoch": 43.08681672025723, "grad_norm": 47.89314270019531, "learning_rate": 6.711053936377173e-06, "loss": 3.8693, "step": 13400 }, { "epoch": 43.08681672025723, "eval_runtime": 5.4219, "eval_samples_per_second": 9.222, "eval_steps_per_second": 1.291, "step": 13400 }, { "epoch": 43.11897106109325, "grad_norm": 50.11954116821289, "learning_rate": 6.706539876006532e-06, "loss": 3.913, "step": 13410 }, { "epoch": 43.151125401929264, "grad_norm": 49.00069808959961, "learning_rate": 6.7020245071256095e-06, "loss": 3.6763, "step": 13420 }, { "epoch": 43.18327974276527, "grad_norm": 46.35792541503906, "learning_rate": 6.697507834631389e-06, "loss": 3.8559, "step": 13430 }, { "epoch": 43.21543408360129, "grad_norm": 43.837913513183594, "learning_rate": 6.692989863422269e-06, "loss": 4.0118, "step": 13440 }, { "epoch": 43.247588424437296, "grad_norm": 48.81989288330078, "learning_rate": 6.688470598398056e-06, "loss": 3.9762, "step": 13450 }, { "epoch": 43.27974276527331, "grad_norm": 40.29865264892578, "learning_rate": 6.6839500444599596e-06, "loss": 3.815, "step": 13460 }, { "epoch": 43.31189710610933, "grad_norm": 46.04387283325195, "learning_rate": 6.679428206510589e-06, "loss": 3.5998, "step": 13470 }, { "epoch": 43.344051446945336, "grad_norm": 40.26736831665039, "learning_rate": 6.674905089453939e-06, "loss": 3.6526, "step": 13480 }, { "epoch": 43.37620578778135, "grad_norm": 45.58209228515625, "learning_rate": 6.6703806981954e-06, "loss": 3.6285, "step": 13490 }, { "epoch": 43.40836012861736, "grad_norm": 42.804473876953125, "learning_rate": 6.665855037641742e-06, "loss": 3.7106, "step": 13500 }, { "epoch": 43.40836012861736, "eval_runtime": 5.3006, "eval_samples_per_second": 9.433, "eval_steps_per_second": 1.321, "step": 13500 }, { "epoch": 43.440514469453376, "grad_norm": 43.7956428527832, "learning_rate": 6.66132811270111e-06, "loss": 4.1013, "step": 13510 }, { "epoch": 43.47266881028939, "grad_norm": 51.582008361816406, "learning_rate": 6.65679992828302e-06, "loss": 3.6988, "step": 13520 }, { "epoch": 43.5048231511254, "grad_norm": 46.648681640625, "learning_rate": 6.6522704892983545e-06, "loss": 4.114, "step": 13530 }, { "epoch": 43.536977491961416, "grad_norm": 44.82667541503906, "learning_rate": 6.647739800659358e-06, "loss": 3.4586, "step": 13540 }, { "epoch": 43.569131832797424, "grad_norm": 41.975826263427734, "learning_rate": 6.643207867279628e-06, "loss": 3.6801, "step": 13550 }, { "epoch": 43.60128617363344, "grad_norm": 53.41710662841797, "learning_rate": 6.638674694074113e-06, "loss": 3.9799, "step": 13560 }, { "epoch": 43.633440514469456, "grad_norm": 52.41001892089844, "learning_rate": 6.634140285959107e-06, "loss": 4.3491, "step": 13570 }, { "epoch": 43.665594855305464, "grad_norm": 43.59348678588867, "learning_rate": 6.6296046478522435e-06, "loss": 3.8023, "step": 13580 }, { "epoch": 43.69774919614148, "grad_norm": 54.23263168334961, "learning_rate": 6.625067784672486e-06, "loss": 3.4986, "step": 13590 }, { "epoch": 43.72990353697749, "grad_norm": 55.51115417480469, "learning_rate": 6.620529701340133e-06, "loss": 3.6358, "step": 13600 }, { "epoch": 43.72990353697749, "eval_runtime": 5.3848, "eval_samples_per_second": 9.285, "eval_steps_per_second": 1.3, "step": 13600 }, { "epoch": 43.762057877813504, "grad_norm": 45.36463165283203, "learning_rate": 6.615990402776801e-06, "loss": 3.595, "step": 13610 }, { "epoch": 43.79421221864952, "grad_norm": 45.646949768066406, "learning_rate": 6.611449893905423e-06, "loss": 3.46, "step": 13620 }, { "epoch": 43.82636655948553, "grad_norm": 58.14832305908203, "learning_rate": 6.606908179650253e-06, "loss": 3.543, "step": 13630 }, { "epoch": 43.858520900321544, "grad_norm": 49.842960357666016, "learning_rate": 6.6023652649368446e-06, "loss": 3.5958, "step": 13640 }, { "epoch": 43.89067524115756, "grad_norm": 48.7928352355957, "learning_rate": 6.597821154692056e-06, "loss": 3.1517, "step": 13650 }, { "epoch": 43.92282958199357, "grad_norm": 50.33212661743164, "learning_rate": 6.5932758538440425e-06, "loss": 3.6831, "step": 13660 }, { "epoch": 43.954983922829584, "grad_norm": 52.49909973144531, "learning_rate": 6.5887293673222465e-06, "loss": 3.7013, "step": 13670 }, { "epoch": 43.98713826366559, "grad_norm": 47.873165130615234, "learning_rate": 6.584181700057405e-06, "loss": 3.4438, "step": 13680 }, { "epoch": 44.01929260450161, "grad_norm": 45.209346771240234, "learning_rate": 6.579632856981527e-06, "loss": 4.1868, "step": 13690 }, { "epoch": 44.051446945337624, "grad_norm": 48.328453063964844, "learning_rate": 6.575082843027899e-06, "loss": 3.6362, "step": 13700 }, { "epoch": 44.051446945337624, "eval_runtime": 5.3356, "eval_samples_per_second": 9.371, "eval_steps_per_second": 1.312, "step": 13700 }, { "epoch": 44.08360128617363, "grad_norm": 47.588375091552734, "learning_rate": 6.570531663131081e-06, "loss": 4.1693, "step": 13710 }, { "epoch": 44.11575562700965, "grad_norm": 44.93582534790039, "learning_rate": 6.565979322226893e-06, "loss": 3.8178, "step": 13720 }, { "epoch": 44.147909967845656, "grad_norm": 47.445919036865234, "learning_rate": 6.561425825252416e-06, "loss": 3.5152, "step": 13730 }, { "epoch": 44.18006430868167, "grad_norm": 50.76397705078125, "learning_rate": 6.556871177145986e-06, "loss": 3.571, "step": 13740 }, { "epoch": 44.21221864951769, "grad_norm": 49.44401168823242, "learning_rate": 6.552315382847182e-06, "loss": 3.7158, "step": 13750 }, { "epoch": 44.244372990353696, "grad_norm": 48.373809814453125, "learning_rate": 6.547758447296835e-06, "loss": 3.7012, "step": 13760 }, { "epoch": 44.27652733118971, "grad_norm": 56.16640853881836, "learning_rate": 6.543200375437007e-06, "loss": 3.8704, "step": 13770 }, { "epoch": 44.30868167202572, "grad_norm": 50.02269744873047, "learning_rate": 6.538641172210991e-06, "loss": 3.4982, "step": 13780 }, { "epoch": 44.340836012861736, "grad_norm": 48.82128143310547, "learning_rate": 6.5340808425633155e-06, "loss": 3.903, "step": 13790 }, { "epoch": 44.37299035369775, "grad_norm": 49.56406021118164, "learning_rate": 6.529519391439722e-06, "loss": 4.094, "step": 13800 }, { "epoch": 44.37299035369775, "eval_runtime": 5.3698, "eval_samples_per_second": 9.311, "eval_steps_per_second": 1.304, "step": 13800 }, { "epoch": 44.40514469453376, "grad_norm": 42.36375427246094, "learning_rate": 6.524956823787172e-06, "loss": 3.2433, "step": 13810 }, { "epoch": 44.437299035369776, "grad_norm": 43.40381622314453, "learning_rate": 6.520393144553837e-06, "loss": 3.4314, "step": 13820 }, { "epoch": 44.469453376205784, "grad_norm": 45.20592498779297, "learning_rate": 6.515828358689096e-06, "loss": 3.8977, "step": 13830 }, { "epoch": 44.5016077170418, "grad_norm": 44.747493743896484, "learning_rate": 6.511262471143523e-06, "loss": 3.7857, "step": 13840 }, { "epoch": 44.533762057877816, "grad_norm": 55.7867317199707, "learning_rate": 6.506695486868894e-06, "loss": 4.2, "step": 13850 }, { "epoch": 44.565916398713824, "grad_norm": 42.33040237426758, "learning_rate": 6.5021274108181695e-06, "loss": 3.5574, "step": 13860 }, { "epoch": 44.59807073954984, "grad_norm": 44.761207580566406, "learning_rate": 6.4975582479454956e-06, "loss": 3.8456, "step": 13870 }, { "epoch": 44.630225080385856, "grad_norm": 68.4744873046875, "learning_rate": 6.492988003206195e-06, "loss": 3.6133, "step": 13880 }, { "epoch": 44.662379421221864, "grad_norm": 44.68567657470703, "learning_rate": 6.4884166815567686e-06, "loss": 3.6896, "step": 13890 }, { "epoch": 44.69453376205788, "grad_norm": 45.69990539550781, "learning_rate": 6.4838442879548805e-06, "loss": 3.9395, "step": 13900 }, { "epoch": 44.69453376205788, "eval_runtime": 5.3726, "eval_samples_per_second": 9.306, "eval_steps_per_second": 1.303, "step": 13900 }, { "epoch": 44.72668810289389, "grad_norm": 46.78415298461914, "learning_rate": 6.479270827359359e-06, "loss": 3.8031, "step": 13910 }, { "epoch": 44.758842443729904, "grad_norm": 44.30283737182617, "learning_rate": 6.474696304730187e-06, "loss": 3.495, "step": 13920 }, { "epoch": 44.79099678456592, "grad_norm": 52.217166900634766, "learning_rate": 6.470120725028506e-06, "loss": 3.9239, "step": 13930 }, { "epoch": 44.82315112540193, "grad_norm": 54.01493453979492, "learning_rate": 6.465544093216599e-06, "loss": 3.4874, "step": 13940 }, { "epoch": 44.855305466237944, "grad_norm": 47.50662612915039, "learning_rate": 6.460966414257888e-06, "loss": 3.8246, "step": 13950 }, { "epoch": 44.88745980707395, "grad_norm": 50.27058029174805, "learning_rate": 6.456387693116937e-06, "loss": 3.5592, "step": 13960 }, { "epoch": 44.91961414790997, "grad_norm": 42.38080596923828, "learning_rate": 6.451807934759431e-06, "loss": 3.3153, "step": 13970 }, { "epoch": 44.951768488745984, "grad_norm": 50.481361389160156, "learning_rate": 6.4472271441521916e-06, "loss": 3.7638, "step": 13980 }, { "epoch": 44.98392282958199, "grad_norm": 44.00603103637695, "learning_rate": 6.4426453262631505e-06, "loss": 3.3545, "step": 13990 }, { "epoch": 45.01607717041801, "grad_norm": 42.260379791259766, "learning_rate": 6.4380624860613564e-06, "loss": 3.2011, "step": 14000 }, { "epoch": 45.01607717041801, "eval_runtime": 5.4067, "eval_samples_per_second": 9.248, "eval_steps_per_second": 1.295, "step": 14000 }, { "epoch": 45.048231511254016, "grad_norm": 40.89357376098633, "learning_rate": 6.433478628516968e-06, "loss": 3.2951, "step": 14010 }, { "epoch": 45.08038585209003, "grad_norm": 42.917999267578125, "learning_rate": 6.428893758601248e-06, "loss": 3.4341, "step": 14020 }, { "epoch": 45.11254019292605, "grad_norm": 58.99101638793945, "learning_rate": 6.424307881286552e-06, "loss": 3.815, "step": 14030 }, { "epoch": 45.144694533762056, "grad_norm": 44.16122055053711, "learning_rate": 6.419721001546333e-06, "loss": 3.6844, "step": 14040 }, { "epoch": 45.17684887459807, "grad_norm": 46.57978057861328, "learning_rate": 6.4151331243551305e-06, "loss": 3.2951, "step": 14050 }, { "epoch": 45.20900321543408, "grad_norm": 46.803348541259766, "learning_rate": 6.410544254688565e-06, "loss": 3.1819, "step": 14060 }, { "epoch": 45.241157556270096, "grad_norm": 38.86985397338867, "learning_rate": 6.4059543975233315e-06, "loss": 3.4245, "step": 14070 }, { "epoch": 45.27331189710611, "grad_norm": 40.22675323486328, "learning_rate": 6.4013635578372015e-06, "loss": 3.4086, "step": 14080 }, { "epoch": 45.30546623794212, "grad_norm": 42.57950973510742, "learning_rate": 6.396771740609002e-06, "loss": 3.7607, "step": 14090 }, { "epoch": 45.337620578778136, "grad_norm": 47.24885177612305, "learning_rate": 6.392178950818632e-06, "loss": 3.8008, "step": 14100 }, { "epoch": 45.337620578778136, "eval_runtime": 5.3052, "eval_samples_per_second": 9.425, "eval_steps_per_second": 1.319, "step": 14100 }, { "epoch": 45.369774919614144, "grad_norm": 49.306373596191406, "learning_rate": 6.387585193447038e-06, "loss": 3.5793, "step": 14110 }, { "epoch": 45.40192926045016, "grad_norm": 52.76168441772461, "learning_rate": 6.382990473476218e-06, "loss": 4.2313, "step": 14120 }, { "epoch": 45.434083601286176, "grad_norm": 43.657840728759766, "learning_rate": 6.378394795889213e-06, "loss": 3.6169, "step": 14130 }, { "epoch": 45.466237942122184, "grad_norm": 43.90129852294922, "learning_rate": 6.373798165670102e-06, "loss": 3.8637, "step": 14140 }, { "epoch": 45.4983922829582, "grad_norm": 44.62062072753906, "learning_rate": 6.369200587804e-06, "loss": 3.0001, "step": 14150 }, { "epoch": 45.530546623794216, "grad_norm": 61.102718353271484, "learning_rate": 6.364602067277048e-06, "loss": 3.8044, "step": 14160 }, { "epoch": 45.562700964630224, "grad_norm": 36.800228118896484, "learning_rate": 6.360002609076408e-06, "loss": 3.7696, "step": 14170 }, { "epoch": 45.59485530546624, "grad_norm": 52.48075485229492, "learning_rate": 6.355402218190261e-06, "loss": 3.7447, "step": 14180 }, { "epoch": 45.62700964630225, "grad_norm": 52.804534912109375, "learning_rate": 6.350800899607799e-06, "loss": 3.6881, "step": 14190 }, { "epoch": 45.659163987138264, "grad_norm": 51.758460998535156, "learning_rate": 6.346198658319219e-06, "loss": 3.7239, "step": 14200 }, { "epoch": 45.659163987138264, "eval_runtime": 5.3043, "eval_samples_per_second": 9.426, "eval_steps_per_second": 1.32, "step": 14200 }, { "epoch": 45.69131832797428, "grad_norm": 43.409000396728516, "learning_rate": 6.341595499315722e-06, "loss": 3.5215, "step": 14210 }, { "epoch": 45.72347266881029, "grad_norm": 40.48189926147461, "learning_rate": 6.336991427589496e-06, "loss": 3.5506, "step": 14220 }, { "epoch": 45.755627009646304, "grad_norm": 42.237892150878906, "learning_rate": 6.33238644813373e-06, "loss": 3.8298, "step": 14230 }, { "epoch": 45.78778135048231, "grad_norm": 42.26970291137695, "learning_rate": 6.327780565942593e-06, "loss": 3.1535, "step": 14240 }, { "epoch": 45.81993569131833, "grad_norm": 49.509552001953125, "learning_rate": 6.323173786011226e-06, "loss": 3.56, "step": 14250 }, { "epoch": 45.852090032154344, "grad_norm": 38.58564376831055, "learning_rate": 6.3185661133357556e-06, "loss": 3.8886, "step": 14260 }, { "epoch": 45.88424437299035, "grad_norm": 46.30108642578125, "learning_rate": 6.3139575529132694e-06, "loss": 3.8585, "step": 14270 }, { "epoch": 45.91639871382637, "grad_norm": 51.71986389160156, "learning_rate": 6.309348109741816e-06, "loss": 3.5306, "step": 14280 }, { "epoch": 45.948553054662376, "grad_norm": 52.77317428588867, "learning_rate": 6.3047377888204076e-06, "loss": 3.4978, "step": 14290 }, { "epoch": 45.98070739549839, "grad_norm": 56.59071350097656, "learning_rate": 6.300126595149004e-06, "loss": 3.9293, "step": 14300 }, { "epoch": 45.98070739549839, "eval_runtime": 5.2751, "eval_samples_per_second": 9.478, "eval_steps_per_second": 1.327, "step": 14300 }, { "epoch": 46.01286173633441, "grad_norm": 44.007606506347656, "learning_rate": 6.295514533728514e-06, "loss": 3.5784, "step": 14310 }, { "epoch": 46.045016077170416, "grad_norm": 55.59687805175781, "learning_rate": 6.2909016095607845e-06, "loss": 3.0158, "step": 14320 }, { "epoch": 46.07717041800643, "grad_norm": 53.81737518310547, "learning_rate": 6.2862878276486005e-06, "loss": 3.3333, "step": 14330 }, { "epoch": 46.10932475884244, "grad_norm": 45.75010681152344, "learning_rate": 6.281673192995678e-06, "loss": 3.5152, "step": 14340 }, { "epoch": 46.141479099678456, "grad_norm": 52.03997039794922, "learning_rate": 6.277057710606653e-06, "loss": 3.7345, "step": 14350 }, { "epoch": 46.17363344051447, "grad_norm": 59.01860427856445, "learning_rate": 6.2724413854870846e-06, "loss": 3.1459, "step": 14360 }, { "epoch": 46.20578778135048, "grad_norm": 39.69333267211914, "learning_rate": 6.26782422264345e-06, "loss": 3.3531, "step": 14370 }, { "epoch": 46.237942122186496, "grad_norm": 50.33409118652344, "learning_rate": 6.263206227083125e-06, "loss": 3.2734, "step": 14380 }, { "epoch": 46.27009646302251, "grad_norm": 56.89069747924805, "learning_rate": 6.258587403814398e-06, "loss": 3.2888, "step": 14390 }, { "epoch": 46.30225080385852, "grad_norm": 48.22525405883789, "learning_rate": 6.253967757846447e-06, "loss": 3.2996, "step": 14400 }, { "epoch": 46.30225080385852, "eval_runtime": 5.4061, "eval_samples_per_second": 9.249, "eval_steps_per_second": 1.295, "step": 14400 }, { "epoch": 46.334405144694536, "grad_norm": 45.533573150634766, "learning_rate": 6.249347294189349e-06, "loss": 3.0609, "step": 14410 }, { "epoch": 46.366559485530544, "grad_norm": 44.181297302246094, "learning_rate": 6.2447260178540645e-06, "loss": 3.4611, "step": 14420 }, { "epoch": 46.39871382636656, "grad_norm": 51.61262130737305, "learning_rate": 6.240103933852437e-06, "loss": 3.4108, "step": 14430 }, { "epoch": 46.430868167202576, "grad_norm": 46.06563949584961, "learning_rate": 6.23548104719718e-06, "loss": 3.208, "step": 14440 }, { "epoch": 46.463022508038584, "grad_norm": 38.33654022216797, "learning_rate": 6.230857362901888e-06, "loss": 3.6308, "step": 14450 }, { "epoch": 46.4951768488746, "grad_norm": 47.99333572387695, "learning_rate": 6.22623288598101e-06, "loss": 3.7078, "step": 14460 }, { "epoch": 46.52733118971061, "grad_norm": 44.79597473144531, "learning_rate": 6.221607621449863e-06, "loss": 3.4414, "step": 14470 }, { "epoch": 46.559485530546624, "grad_norm": 47.10931396484375, "learning_rate": 6.216981574324612e-06, "loss": 3.3473, "step": 14480 }, { "epoch": 46.59163987138264, "grad_norm": 63.359012603759766, "learning_rate": 6.212354749622275e-06, "loss": 3.6452, "step": 14490 }, { "epoch": 46.62379421221865, "grad_norm": 37.793087005615234, "learning_rate": 6.207727152360711e-06, "loss": 2.9826, "step": 14500 }, { "epoch": 46.62379421221865, "eval_runtime": 5.3648, "eval_samples_per_second": 9.32, "eval_steps_per_second": 1.305, "step": 14500 }, { "epoch": 46.655948553054664, "grad_norm": 45.3447265625, "learning_rate": 6.203098787558618e-06, "loss": 3.6247, "step": 14510 }, { "epoch": 46.68810289389067, "grad_norm": 49.64666748046875, "learning_rate": 6.198469660235523e-06, "loss": 3.7759, "step": 14520 }, { "epoch": 46.72025723472669, "grad_norm": 54.8009147644043, "learning_rate": 6.1938397754117875e-06, "loss": 3.4779, "step": 14530 }, { "epoch": 46.752411575562704, "grad_norm": 39.07783889770508, "learning_rate": 6.189209138108589e-06, "loss": 2.985, "step": 14540 }, { "epoch": 46.78456591639871, "grad_norm": 48.924861907958984, "learning_rate": 6.184577753347916e-06, "loss": 3.3948, "step": 14550 }, { "epoch": 46.81672025723473, "grad_norm": 40.695377349853516, "learning_rate": 6.17994562615258e-06, "loss": 3.646, "step": 14560 }, { "epoch": 46.848874598070736, "grad_norm": 42.55186462402344, "learning_rate": 6.17531276154619e-06, "loss": 3.3653, "step": 14570 }, { "epoch": 46.88102893890675, "grad_norm": 49.43696594238281, "learning_rate": 6.170679164553153e-06, "loss": 3.6039, "step": 14580 }, { "epoch": 46.91318327974277, "grad_norm": 44.242435455322266, "learning_rate": 6.1660448401986775e-06, "loss": 3.9266, "step": 14590 }, { "epoch": 46.945337620578776, "grad_norm": 45.913108825683594, "learning_rate": 6.161409793508754e-06, "loss": 3.5294, "step": 14600 }, { "epoch": 46.945337620578776, "eval_runtime": 5.328, "eval_samples_per_second": 9.384, "eval_steps_per_second": 1.314, "step": 14600 }, { "epoch": 46.97749196141479, "grad_norm": 61.27463912963867, "learning_rate": 6.156774029510158e-06, "loss": 3.8432, "step": 14610 }, { "epoch": 47.0096463022508, "grad_norm": 43.16250228881836, "learning_rate": 6.152137553230447e-06, "loss": 3.567, "step": 14620 }, { "epoch": 47.041800643086816, "grad_norm": 55.030521392822266, "learning_rate": 6.147500369697945e-06, "loss": 3.4398, "step": 14630 }, { "epoch": 47.07395498392283, "grad_norm": 54.19996643066406, "learning_rate": 6.142862483941747e-06, "loss": 3.516, "step": 14640 }, { "epoch": 47.10610932475884, "grad_norm": 43.67913055419922, "learning_rate": 6.13822390099171e-06, "loss": 3.4482, "step": 14650 }, { "epoch": 47.138263665594856, "grad_norm": 51.55460739135742, "learning_rate": 6.133584625878442e-06, "loss": 3.3972, "step": 14660 }, { "epoch": 47.17041800643087, "grad_norm": 46.27159881591797, "learning_rate": 6.128944663633309e-06, "loss": 2.9431, "step": 14670 }, { "epoch": 47.20257234726688, "grad_norm": 43.540103912353516, "learning_rate": 6.1243040192884175e-06, "loss": 3.295, "step": 14680 }, { "epoch": 47.234726688102896, "grad_norm": 40.58637619018555, "learning_rate": 6.119662697876613e-06, "loss": 3.1671, "step": 14690 }, { "epoch": 47.266881028938904, "grad_norm": 50.803245544433594, "learning_rate": 6.115020704431478e-06, "loss": 3.4766, "step": 14700 }, { "epoch": 47.266881028938904, "eval_runtime": 5.2699, "eval_samples_per_second": 9.488, "eval_steps_per_second": 1.328, "step": 14700 }, { "epoch": 47.29903536977492, "grad_norm": 42.901222229003906, "learning_rate": 6.110378043987323e-06, "loss": 3.364, "step": 14710 }, { "epoch": 47.331189710610936, "grad_norm": 44.916481018066406, "learning_rate": 6.105734721579181e-06, "loss": 3.4608, "step": 14720 }, { "epoch": 47.363344051446944, "grad_norm": 43.07337951660156, "learning_rate": 6.101090742242804e-06, "loss": 3.4743, "step": 14730 }, { "epoch": 47.39549839228296, "grad_norm": 49.4533576965332, "learning_rate": 6.096446111014657e-06, "loss": 3.4457, "step": 14740 }, { "epoch": 47.42765273311897, "grad_norm": 45.95536422729492, "learning_rate": 6.09180083293191e-06, "loss": 3.2532, "step": 14750 }, { "epoch": 47.459807073954984, "grad_norm": 46.39643859863281, "learning_rate": 6.087154913032435e-06, "loss": 2.7945, "step": 14760 }, { "epoch": 47.491961414791, "grad_norm": 48.9324836730957, "learning_rate": 6.082508356354802e-06, "loss": 3.6404, "step": 14770 }, { "epoch": 47.52411575562701, "grad_norm": 56.64775085449219, "learning_rate": 6.07786116793827e-06, "loss": 3.4176, "step": 14780 }, { "epoch": 47.556270096463024, "grad_norm": 49.65724182128906, "learning_rate": 6.073213352822782e-06, "loss": 3.3078, "step": 14790 }, { "epoch": 47.58842443729903, "grad_norm": 40.25274658203125, "learning_rate": 6.068564916048964e-06, "loss": 3.4796, "step": 14800 }, { "epoch": 47.58842443729903, "eval_runtime": 5.2678, "eval_samples_per_second": 9.492, "eval_steps_per_second": 1.329, "step": 14800 }, { "epoch": 47.62057877813505, "grad_norm": 51.200618743896484, "learning_rate": 6.063915862658112e-06, "loss": 3.4717, "step": 14810 }, { "epoch": 47.652733118971064, "grad_norm": 56.198326110839844, "learning_rate": 6.059266197692194e-06, "loss": 3.5787, "step": 14820 }, { "epoch": 47.68488745980707, "grad_norm": 45.27773666381836, "learning_rate": 6.054615926193841e-06, "loss": 3.128, "step": 14830 }, { "epoch": 47.71704180064309, "grad_norm": 49.37786102294922, "learning_rate": 6.0499650532063405e-06, "loss": 3.4686, "step": 14840 }, { "epoch": 47.749196141479096, "grad_norm": 61.77187728881836, "learning_rate": 6.045313583773632e-06, "loss": 3.2507, "step": 14850 }, { "epoch": 47.78135048231511, "grad_norm": 48.81605911254883, "learning_rate": 6.040661522940304e-06, "loss": 3.2752, "step": 14860 }, { "epoch": 47.81350482315113, "grad_norm": 60.16655731201172, "learning_rate": 6.036008875751585e-06, "loss": 3.7235, "step": 14870 }, { "epoch": 47.845659163987136, "grad_norm": 47.13853073120117, "learning_rate": 6.0313556472533375e-06, "loss": 3.2753, "step": 14880 }, { "epoch": 47.87781350482315, "grad_norm": 53.64875030517578, "learning_rate": 6.0267018424920575e-06, "loss": 3.395, "step": 14890 }, { "epoch": 47.90996784565917, "grad_norm": 46.31193542480469, "learning_rate": 6.0220474665148666e-06, "loss": 3.2728, "step": 14900 }, { "epoch": 47.90996784565917, "eval_runtime": 5.3637, "eval_samples_per_second": 9.322, "eval_steps_per_second": 1.305, "step": 14900 }, { "epoch": 47.942122186495176, "grad_norm": 47.80056381225586, "learning_rate": 6.0173925243695e-06, "loss": 3.1878, "step": 14910 }, { "epoch": 47.97427652733119, "grad_norm": 50.2651481628418, "learning_rate": 6.012737021104316e-06, "loss": 3.6197, "step": 14920 }, { "epoch": 48.0064308681672, "grad_norm": 40.13654327392578, "learning_rate": 6.008080961768273e-06, "loss": 3.6295, "step": 14930 }, { "epoch": 48.038585209003216, "grad_norm": 41.257991790771484, "learning_rate": 6.003424351410938e-06, "loss": 3.156, "step": 14940 }, { "epoch": 48.07073954983923, "grad_norm": 50.42916488647461, "learning_rate": 5.998767195082472e-06, "loss": 3.2445, "step": 14950 }, { "epoch": 48.10289389067524, "grad_norm": 41.42873764038086, "learning_rate": 5.994109497833631e-06, "loss": 3.1038, "step": 14960 }, { "epoch": 48.135048231511256, "grad_norm": 47.499855041503906, "learning_rate": 5.989451264715756e-06, "loss": 3.4256, "step": 14970 }, { "epoch": 48.167202572347264, "grad_norm": 42.58186340332031, "learning_rate": 5.98479250078077e-06, "loss": 3.2457, "step": 14980 }, { "epoch": 48.19935691318328, "grad_norm": 47.350318908691406, "learning_rate": 5.980133211081167e-06, "loss": 3.4269, "step": 14990 }, { "epoch": 48.231511254019296, "grad_norm": 58.90053939819336, "learning_rate": 5.975473400670022e-06, "loss": 3.2995, "step": 15000 }, { "epoch": 48.231511254019296, "eval_runtime": 5.4587, "eval_samples_per_second": 9.16, "eval_steps_per_second": 1.282, "step": 15000 } ], "logging_steps": 10, "max_steps": 31100, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }