{ "best_global_step": 2000, "best_metric": 9.218317031860352, "best_model_checkpoint": "./artifacts/models/base-250725-test/checkpoint-2000", "epoch": 0.06870834556550091, "eval_steps": 1000, "global_step": 22000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 7.807766541534195e-05, "grad_norm": 1.9921875, "learning_rate": 1.499063085571518e-06, "loss": 10.8863, "step": 25 }, { "epoch": 0.0001561553308306839, "grad_norm": 2.40625, "learning_rate": 3.0605871330418487e-06, "loss": 10.8814, "step": 50 }, { "epoch": 0.00023423299624602585, "grad_norm": 2.328125, "learning_rate": 4.62211118051218e-06, "loss": 10.883, "step": 75 }, { "epoch": 0.0003123106616613678, "grad_norm": 2.03125, "learning_rate": 6.183635227982511e-06, "loss": 10.8828, "step": 100 }, { "epoch": 0.00039038832707670977, "grad_norm": 1.9765625, "learning_rate": 7.745159275452842e-06, "loss": 10.8834, "step": 125 }, { "epoch": 0.0004684659924920517, "grad_norm": 2.125, "learning_rate": 9.306683322923173e-06, "loss": 10.8796, "step": 150 }, { "epoch": 0.0005465436579073936, "grad_norm": 1.96875, "learning_rate": 1.0868207370393504e-05, "loss": 10.8798, "step": 175 }, { "epoch": 0.0006246213233227356, "grad_norm": 1.9765625, "learning_rate": 1.2429731417863835e-05, "loss": 10.8764, "step": 200 }, { "epoch": 0.0007026989887380775, "grad_norm": 2.203125, "learning_rate": 1.3991255465334166e-05, "loss": 10.8779, "step": 225 }, { "epoch": 0.0007807766541534195, "grad_norm": 2.296875, "learning_rate": 1.5552779512804497e-05, "loss": 10.8715, "step": 250 }, { "epoch": 0.0008588543195687615, "grad_norm": 2.234375, "learning_rate": 1.7114303560274827e-05, "loss": 10.8676, "step": 275 }, { "epoch": 0.0009369319849841034, "grad_norm": 2.3125, "learning_rate": 1.867582760774516e-05, "loss": 10.8621, "step": 300 }, { "epoch": 0.0010150096503994453, "grad_norm": 2.359375, "learning_rate": 2.0237351655215492e-05, "loss": 10.8566, "step": 325 }, { "epoch": 0.0010930873158147873, "grad_norm": 2.015625, "learning_rate": 2.179887570268582e-05, "loss": 10.8515, "step": 350 }, { "epoch": 0.0011711649812301292, "grad_norm": 2.21875, "learning_rate": 2.3360399750156154e-05, "loss": 10.8445, "step": 375 }, { "epoch": 0.0012492426466454711, "grad_norm": 2.09375, "learning_rate": 2.4921923797626483e-05, "loss": 10.8383, "step": 400 }, { "epoch": 0.001327320312060813, "grad_norm": 2.0625, "learning_rate": 2.6483447845096816e-05, "loss": 10.8244, "step": 425 }, { "epoch": 0.001405397977476155, "grad_norm": 1.890625, "learning_rate": 2.804497189256715e-05, "loss": 10.8193, "step": 450 }, { "epoch": 0.001483475642891497, "grad_norm": 2.21875, "learning_rate": 2.9606495940037475e-05, "loss": 10.7992, "step": 475 }, { "epoch": 0.001561553308306839, "grad_norm": 2.09375, "learning_rate": 3.116801998750781e-05, "loss": 10.7987, "step": 500 }, { "epoch": 0.001639630973722181, "grad_norm": 2.0625, "learning_rate": 3.272954403497814e-05, "loss": 10.7803, "step": 525 }, { "epoch": 0.001717708639137523, "grad_norm": 2.140625, "learning_rate": 3.429106808244847e-05, "loss": 10.7653, "step": 550 }, { "epoch": 0.0017957863045528649, "grad_norm": 2.578125, "learning_rate": 3.58525921299188e-05, "loss": 10.745, "step": 575 }, { "epoch": 0.0018738639699682068, "grad_norm": 1.8515625, "learning_rate": 3.741411617738913e-05, "loss": 10.7327, "step": 600 }, { "epoch": 0.0019519416353835487, "grad_norm": 2.15625, "learning_rate": 3.897564022485946e-05, "loss": 10.7159, "step": 625 }, { "epoch": 0.0020300193007988907, "grad_norm": 2.53125, "learning_rate": 4.053716427232979e-05, "loss": 10.6931, "step": 650 }, { "epoch": 0.0021080969662142326, "grad_norm": 2.078125, "learning_rate": 4.2098688319800126e-05, "loss": 10.6688, "step": 675 }, { "epoch": 0.0021861746316295745, "grad_norm": 2.6875, "learning_rate": 4.3660212367270456e-05, "loss": 10.6408, "step": 700 }, { "epoch": 0.0022642522970449165, "grad_norm": 2.6875, "learning_rate": 4.522173641474079e-05, "loss": 10.63, "step": 725 }, { "epoch": 0.0023423299624602584, "grad_norm": 2.8125, "learning_rate": 4.678326046221112e-05, "loss": 10.6057, "step": 750 }, { "epoch": 0.0024204076278756003, "grad_norm": 2.796875, "learning_rate": 4.834478450968145e-05, "loss": 10.5781, "step": 775 }, { "epoch": 0.0024984852932909423, "grad_norm": 2.484375, "learning_rate": 4.990630855715178e-05, "loss": 10.5501, "step": 800 }, { "epoch": 0.002576562958706284, "grad_norm": 3.171875, "learning_rate": 5.1467832604622116e-05, "loss": 10.5076, "step": 825 }, { "epoch": 0.002654640624121626, "grad_norm": 3.078125, "learning_rate": 5.3029356652092445e-05, "loss": 10.477, "step": 850 }, { "epoch": 0.002732718289536968, "grad_norm": 2.53125, "learning_rate": 5.4590880699562774e-05, "loss": 10.4528, "step": 875 }, { "epoch": 0.00281079595495231, "grad_norm": 2.90625, "learning_rate": 5.6152404747033104e-05, "loss": 10.4192, "step": 900 }, { "epoch": 0.002888873620367652, "grad_norm": 2.859375, "learning_rate": 5.771392879450343e-05, "loss": 10.3672, "step": 925 }, { "epoch": 0.002966951285782994, "grad_norm": 3.078125, "learning_rate": 5.927545284197377e-05, "loss": 10.3219, "step": 950 }, { "epoch": 0.003045028951198336, "grad_norm": 2.890625, "learning_rate": 6.08369768894441e-05, "loss": 10.3154, "step": 975 }, { "epoch": 0.003123106616613678, "grad_norm": 3.125, "learning_rate": 6.239850093691443e-05, "loss": 10.2594, "step": 1000 }, { "epoch": 0.003123106616613678, "eval_loss": 10.227066993713379, "eval_runtime": 102.2402, "eval_samples_per_second": 50.89, "eval_steps_per_second": 3.189, "step": 1000 }, { "epoch": 0.00320118428202902, "grad_norm": 2.859375, "learning_rate": 6.396002498438476e-05, "loss": 10.2088, "step": 1025 }, { "epoch": 0.003279261947444362, "grad_norm": 2.921875, "learning_rate": 6.552154903185509e-05, "loss": 10.1806, "step": 1050 }, { "epoch": 0.003357339612859704, "grad_norm": 3.015625, "learning_rate": 6.708307307932544e-05, "loss": 10.1161, "step": 1075 }, { "epoch": 0.003435417278275046, "grad_norm": 2.640625, "learning_rate": 6.864459712679575e-05, "loss": 10.0794, "step": 1100 }, { "epoch": 0.003513494943690388, "grad_norm": 2.53125, "learning_rate": 7.020612117426608e-05, "loss": 10.0233, "step": 1125 }, { "epoch": 0.0035915726091057297, "grad_norm": 2.640625, "learning_rate": 7.176764522173641e-05, "loss": 9.9989, "step": 1150 }, { "epoch": 0.0036696502745210717, "grad_norm": 2.328125, "learning_rate": 7.332916926920674e-05, "loss": 9.9246, "step": 1175 }, { "epoch": 0.0037477279399364136, "grad_norm": 2.0, "learning_rate": 7.489069331667708e-05, "loss": 9.8812, "step": 1200 }, { "epoch": 0.0038258056053517555, "grad_norm": 1.875, "learning_rate": 7.645221736414741e-05, "loss": 9.8312, "step": 1225 }, { "epoch": 0.0039038832707670975, "grad_norm": 1.953125, "learning_rate": 7.801374141161774e-05, "loss": 9.8123, "step": 1250 }, { "epoch": 0.003981960936182439, "grad_norm": 1.9296875, "learning_rate": 7.957526545908807e-05, "loss": 9.7681, "step": 1275 }, { "epoch": 0.004060038601597781, "grad_norm": 1.703125, "learning_rate": 8.113678950655841e-05, "loss": 9.7618, "step": 1300 }, { "epoch": 0.004138116267013123, "grad_norm": 1.9453125, "learning_rate": 8.269831355402874e-05, "loss": 9.6876, "step": 1325 }, { "epoch": 0.004216193932428465, "grad_norm": 1.796875, "learning_rate": 8.425983760149906e-05, "loss": 9.689, "step": 1350 }, { "epoch": 0.004294271597843807, "grad_norm": 1.6640625, "learning_rate": 8.582136164896939e-05, "loss": 9.6471, "step": 1375 }, { "epoch": 0.004372349263259149, "grad_norm": 1.65625, "learning_rate": 8.738288569643972e-05, "loss": 9.6163, "step": 1400 }, { "epoch": 0.004450426928674491, "grad_norm": 1.3515625, "learning_rate": 8.894440974391006e-05, "loss": 9.5749, "step": 1425 }, { "epoch": 0.004528504594089833, "grad_norm": 1.6171875, "learning_rate": 9.050593379138039e-05, "loss": 9.5571, "step": 1450 }, { "epoch": 0.004606582259505175, "grad_norm": 1.6015625, "learning_rate": 9.206745783885072e-05, "loss": 9.539, "step": 1475 }, { "epoch": 0.004684659924920517, "grad_norm": 1.6171875, "learning_rate": 9.362898188632105e-05, "loss": 9.5223, "step": 1500 }, { "epoch": 0.004762737590335859, "grad_norm": 1.4140625, "learning_rate": 9.519050593379139e-05, "loss": 9.4832, "step": 1525 }, { "epoch": 0.004840815255751201, "grad_norm": 1.640625, "learning_rate": 9.675202998126172e-05, "loss": 9.4502, "step": 1550 }, { "epoch": 0.004918892921166543, "grad_norm": 1.5546875, "learning_rate": 9.831355402873205e-05, "loss": 9.4381, "step": 1575 }, { "epoch": 0.0049969705865818845, "grad_norm": 1.5625, "learning_rate": 9.987507807620237e-05, "loss": 9.4403, "step": 1600 }, { "epoch": 0.0050750482519972264, "grad_norm": 1.5703125, "learning_rate": 0.0001014366021236727, "loss": 9.4185, "step": 1625 }, { "epoch": 0.005153125917412568, "grad_norm": 1.5078125, "learning_rate": 0.00010299812617114304, "loss": 9.4044, "step": 1650 }, { "epoch": 0.00523120358282791, "grad_norm": 1.484375, "learning_rate": 0.00010455965021861337, "loss": 9.3678, "step": 1675 }, { "epoch": 0.005309281248243252, "grad_norm": 1.4609375, "learning_rate": 0.0001061211742660837, "loss": 9.3262, "step": 1700 }, { "epoch": 0.005387358913658594, "grad_norm": 1.4296875, "learning_rate": 0.00010768269831355403, "loss": 9.365, "step": 1725 }, { "epoch": 0.005465436579073936, "grad_norm": 1.734375, "learning_rate": 0.00010924422236102437, "loss": 9.319, "step": 1750 }, { "epoch": 0.005543514244489278, "grad_norm": 1.4296875, "learning_rate": 0.0001108057464084947, "loss": 9.3204, "step": 1775 }, { "epoch": 0.00562159190990462, "grad_norm": 1.5078125, "learning_rate": 0.00011236727045596503, "loss": 9.2825, "step": 1800 }, { "epoch": 0.005699669575319962, "grad_norm": 1.390625, "learning_rate": 0.00011392879450343536, "loss": 9.2735, "step": 1825 }, { "epoch": 0.005777747240735304, "grad_norm": 1.359375, "learning_rate": 0.0001154903185509057, "loss": 9.264, "step": 1850 }, { "epoch": 0.005855824906150646, "grad_norm": 1.484375, "learning_rate": 0.00011705184259837602, "loss": 9.263, "step": 1875 }, { "epoch": 0.005933902571565988, "grad_norm": 1.3671875, "learning_rate": 0.00011861336664584634, "loss": 9.2572, "step": 1900 }, { "epoch": 0.0060119802369813305, "grad_norm": 1.5078125, "learning_rate": 0.00012017489069331667, "loss": 9.2461, "step": 1925 }, { "epoch": 0.006090057902396672, "grad_norm": 1.53125, "learning_rate": 0.000121736414740787, "loss": 9.2345, "step": 1950 }, { "epoch": 0.006168135567812014, "grad_norm": 1.671875, "learning_rate": 0.00012329793878825736, "loss": 9.1947, "step": 1975 }, { "epoch": 0.006246213233227356, "grad_norm": 1.671875, "learning_rate": 0.00012485946283572768, "loss": 9.2535, "step": 2000 }, { "epoch": 0.006246213233227356, "eval_loss": 9.218317031860352, "eval_runtime": 102.1917, "eval_samples_per_second": 50.914, "eval_steps_per_second": 3.19, "step": 2000 }, { "epoch": 0.006324290898642698, "grad_norm": 1.625, "learning_rate": 0.000126420986883198, "loss": 9.2323, "step": 2025 }, { "epoch": 0.00640236856405804, "grad_norm": 1.8203125, "learning_rate": 0.00012798251093066833, "loss": 9.2067, "step": 2050 }, { "epoch": 0.006480446229473382, "grad_norm": 1.734375, "learning_rate": 0.00012954403497813865, "loss": 9.2166, "step": 2075 }, { "epoch": 0.006558523894888724, "grad_norm": 2.109375, "learning_rate": 0.000131105559025609, "loss": 9.2106, "step": 2100 }, { "epoch": 0.006636601560304066, "grad_norm": 1.9375, "learning_rate": 0.0001326670830730793, "loss": 9.1922, "step": 2125 }, { "epoch": 0.006714679225719408, "grad_norm": 1.8046875, "learning_rate": 0.00013422860712054965, "loss": 9.1984, "step": 2150 }, { "epoch": 0.00679275689113475, "grad_norm": 1.8515625, "learning_rate": 0.00013579013116802, "loss": 9.1911, "step": 2175 }, { "epoch": 0.006870834556550092, "grad_norm": 2.53125, "learning_rate": 0.0001373516552154903, "loss": 9.1912, "step": 2200 }, { "epoch": 0.006948912221965434, "grad_norm": 1.9375, "learning_rate": 0.00013891317926296065, "loss": 9.1964, "step": 2225 }, { "epoch": 0.007026989887380776, "grad_norm": 2.421875, "learning_rate": 0.000140474703310431, "loss": 9.2259, "step": 2250 }, { "epoch": 0.0071050675527961175, "grad_norm": 2.390625, "learning_rate": 0.0001420362273579013, "loss": 9.1996, "step": 2275 }, { "epoch": 0.0071831452182114595, "grad_norm": 2.234375, "learning_rate": 0.00014359775140537165, "loss": 9.2401, "step": 2300 }, { "epoch": 0.007261222883626801, "grad_norm": 2.140625, "learning_rate": 0.00014515927545284197, "loss": 9.2228, "step": 2325 }, { "epoch": 0.007339300549042143, "grad_norm": 2.171875, "learning_rate": 0.0001467207995003123, "loss": 9.1847, "step": 2350 }, { "epoch": 0.007417378214457485, "grad_norm": 1.984375, "learning_rate": 0.00014828232354778266, "loss": 9.1981, "step": 2375 }, { "epoch": 0.007495455879872827, "grad_norm": 2.21875, "learning_rate": 0.00014984384759525297, "loss": 9.1894, "step": 2400 }, { "epoch": 0.007573533545288169, "grad_norm": 2.1875, "learning_rate": 0.00015140537164272331, "loss": 9.2327, "step": 2425 }, { "epoch": 0.007651611210703511, "grad_norm": 2.265625, "learning_rate": 0.00015296689569019363, "loss": 9.2134, "step": 2450 }, { "epoch": 0.007729688876118853, "grad_norm": 2.375, "learning_rate": 0.00015452841973766397, "loss": 9.2026, "step": 2475 }, { "epoch": 0.007807766541534195, "grad_norm": 2.515625, "learning_rate": 0.00015608994378513432, "loss": 9.194, "step": 2500 }, { "epoch": 0.007885844206949537, "grad_norm": 2.328125, "learning_rate": 0.0001576514678326046, "loss": 9.2258, "step": 2525 }, { "epoch": 0.007963921872364879, "grad_norm": 2.390625, "learning_rate": 0.00015921299188007495, "loss": 9.2412, "step": 2550 }, { "epoch": 0.00804199953778022, "grad_norm": 2.578125, "learning_rate": 0.0001607745159275453, "loss": 9.2441, "step": 2575 }, { "epoch": 0.008120077203195563, "grad_norm": 2.78125, "learning_rate": 0.0001623360399750156, "loss": 9.25, "step": 2600 }, { "epoch": 0.008198154868610905, "grad_norm": 2.328125, "learning_rate": 0.00016389756402248595, "loss": 9.2576, "step": 2625 }, { "epoch": 0.008276232534026247, "grad_norm": 2.6875, "learning_rate": 0.00016545908806995626, "loss": 9.2549, "step": 2650 }, { "epoch": 0.008354310199441588, "grad_norm": 2.828125, "learning_rate": 0.0001670206121174266, "loss": 9.2837, "step": 2675 }, { "epoch": 0.00843238786485693, "grad_norm": 2.96875, "learning_rate": 0.00016858213616489695, "loss": 9.2405, "step": 2700 }, { "epoch": 0.008510465530272272, "grad_norm": 2.890625, "learning_rate": 0.00017014366021236727, "loss": 9.2927, "step": 2725 }, { "epoch": 0.008588543195687614, "grad_norm": 2.953125, "learning_rate": 0.0001717051842598376, "loss": 9.2427, "step": 2750 }, { "epoch": 0.008666620861102956, "grad_norm": 3.0, "learning_rate": 0.00017326670830730792, "loss": 9.3099, "step": 2775 }, { "epoch": 0.008744698526518298, "grad_norm": 2.890625, "learning_rate": 0.00017482823235477827, "loss": 9.2786, "step": 2800 }, { "epoch": 0.00882277619193364, "grad_norm": 3.875, "learning_rate": 0.0001763897564022486, "loss": 9.2615, "step": 2825 }, { "epoch": 0.008900853857348982, "grad_norm": 2.84375, "learning_rate": 0.00017795128044971893, "loss": 9.3233, "step": 2850 }, { "epoch": 0.008978931522764324, "grad_norm": 3.359375, "learning_rate": 0.00017951280449718927, "loss": 9.2634, "step": 2875 }, { "epoch": 0.009057009188179666, "grad_norm": 3.171875, "learning_rate": 0.00018107432854465959, "loss": 9.3164, "step": 2900 }, { "epoch": 0.009135086853595008, "grad_norm": 3.15625, "learning_rate": 0.00018263585259212993, "loss": 9.3274, "step": 2925 }, { "epoch": 0.00921316451901035, "grad_norm": 4.0, "learning_rate": 0.00018419737663960027, "loss": 9.3091, "step": 2950 }, { "epoch": 0.009291242184425692, "grad_norm": 3.03125, "learning_rate": 0.0001857589006870706, "loss": 9.317, "step": 2975 }, { "epoch": 0.009369319849841034, "grad_norm": 3.109375, "learning_rate": 0.00018732042473454093, "loss": 9.3963, "step": 3000 }, { "epoch": 0.009369319849841034, "eval_loss": 9.343441009521484, "eval_runtime": 102.2757, "eval_samples_per_second": 50.872, "eval_steps_per_second": 3.187, "step": 3000 }, { "epoch": 0.009447397515256375, "grad_norm": 3.671875, "learning_rate": 0.00018888194878201127, "loss": 9.3633, "step": 3025 }, { "epoch": 0.009525475180671717, "grad_norm": 3.125, "learning_rate": 0.00019044347282948156, "loss": 9.335, "step": 3050 }, { "epoch": 0.00960355284608706, "grad_norm": 3.140625, "learning_rate": 0.0001920049968769519, "loss": 9.3406, "step": 3075 }, { "epoch": 0.009681630511502401, "grad_norm": 3.109375, "learning_rate": 0.00019356652092442222, "loss": 9.3829, "step": 3100 }, { "epoch": 0.009759708176917743, "grad_norm": 3.90625, "learning_rate": 0.00019512804497189256, "loss": 9.3734, "step": 3125 }, { "epoch": 0.009837785842333085, "grad_norm": 3.671875, "learning_rate": 0.0001966895690193629, "loss": 9.3608, "step": 3150 }, { "epoch": 0.009915863507748427, "grad_norm": 3.359375, "learning_rate": 0.00019825109306683322, "loss": 9.3771, "step": 3175 }, { "epoch": 0.009993941173163769, "grad_norm": 3.984375, "learning_rate": 0.00019981261711430356, "loss": 9.4023, "step": 3200 }, { "epoch": 0.010072018838579111, "grad_norm": 3.875, "learning_rate": 0.00020137414116177388, "loss": 9.4179, "step": 3225 }, { "epoch": 0.010150096503994453, "grad_norm": 3.765625, "learning_rate": 0.00020293566520924422, "loss": 9.407, "step": 3250 }, { "epoch": 0.010228174169409795, "grad_norm": 5.0, "learning_rate": 0.00020449718925671457, "loss": 9.4284, "step": 3275 }, { "epoch": 0.010306251834825137, "grad_norm": 3.84375, "learning_rate": 0.00020605871330418488, "loss": 9.4289, "step": 3300 }, { "epoch": 0.010384329500240479, "grad_norm": 3.90625, "learning_rate": 0.00020762023735165522, "loss": 9.409, "step": 3325 }, { "epoch": 0.01046240716565582, "grad_norm": 3.609375, "learning_rate": 0.00020918176139912557, "loss": 9.4343, "step": 3350 }, { "epoch": 0.010540484831071163, "grad_norm": 4.15625, "learning_rate": 0.00021074328544659588, "loss": 9.4513, "step": 3375 }, { "epoch": 0.010618562496486504, "grad_norm": 3.609375, "learning_rate": 0.00021230480949406623, "loss": 9.458, "step": 3400 }, { "epoch": 0.010696640161901846, "grad_norm": 4.4375, "learning_rate": 0.00021386633354153654, "loss": 9.5056, "step": 3425 }, { "epoch": 0.010774717827317188, "grad_norm": 3.859375, "learning_rate": 0.00021542785758900688, "loss": 9.4958, "step": 3450 }, { "epoch": 0.01085279549273253, "grad_norm": 3.96875, "learning_rate": 0.00021698938163647723, "loss": 9.5275, "step": 3475 }, { "epoch": 0.010930873158147872, "grad_norm": 4.46875, "learning_rate": 0.00021855090568394754, "loss": 9.4947, "step": 3500 }, { "epoch": 0.011008950823563214, "grad_norm": 4.8125, "learning_rate": 0.00022011242973141789, "loss": 9.5044, "step": 3525 }, { "epoch": 0.011087028488978556, "grad_norm": 4.5, "learning_rate": 0.00022167395377888817, "loss": 9.5233, "step": 3550 }, { "epoch": 0.011165106154393898, "grad_norm": 4.1875, "learning_rate": 0.00022323547782635852, "loss": 9.5455, "step": 3575 }, { "epoch": 0.01124318381980924, "grad_norm": 4.46875, "learning_rate": 0.00022479700187382886, "loss": 9.5962, "step": 3600 }, { "epoch": 0.011321261485224582, "grad_norm": 4.46875, "learning_rate": 0.00022635852592129918, "loss": 9.5476, "step": 3625 }, { "epoch": 0.011399339150639924, "grad_norm": 4.9375, "learning_rate": 0.00022792004996876952, "loss": 9.5762, "step": 3650 }, { "epoch": 0.011477416816055266, "grad_norm": 4.84375, "learning_rate": 0.00022948157401623983, "loss": 9.6066, "step": 3675 }, { "epoch": 0.011555494481470608, "grad_norm": 4.8125, "learning_rate": 0.00023104309806371018, "loss": 9.6445, "step": 3700 }, { "epoch": 0.01163357214688595, "grad_norm": 4.6875, "learning_rate": 0.00023260462211118052, "loss": 9.6033, "step": 3725 }, { "epoch": 0.011711649812301292, "grad_norm": 5.0, "learning_rate": 0.00023416614615865084, "loss": 9.6635, "step": 3750 }, { "epoch": 0.011789727477716633, "grad_norm": 4.46875, "learning_rate": 0.00023572767020612118, "loss": 9.6236, "step": 3775 }, { "epoch": 0.011867805143131975, "grad_norm": 5.28125, "learning_rate": 0.00023728919425359152, "loss": 9.6867, "step": 3800 }, { "epoch": 0.011945882808547317, "grad_norm": 4.5, "learning_rate": 0.00023885071830106184, "loss": 9.6757, "step": 3825 }, { "epoch": 0.012023960473962661, "grad_norm": 5.53125, "learning_rate": 0.00024041224234853218, "loss": 9.7058, "step": 3850 }, { "epoch": 0.012102038139378003, "grad_norm": 5.71875, "learning_rate": 0.0002419737663960025, "loss": 9.7057, "step": 3875 }, { "epoch": 0.012180115804793345, "grad_norm": 6.0, "learning_rate": 0.00024353529044347284, "loss": 9.7199, "step": 3900 }, { "epoch": 0.012258193470208687, "grad_norm": 5.78125, "learning_rate": 0.00024509681449094316, "loss": 9.7453, "step": 3925 }, { "epoch": 0.012336271135624029, "grad_norm": 4.96875, "learning_rate": 0.00024665833853841347, "loss": 9.7496, "step": 3950 }, { "epoch": 0.01241434880103937, "grad_norm": 4.34375, "learning_rate": 0.00024821986258588384, "loss": 9.7802, "step": 3975 }, { "epoch": 0.012492426466454713, "grad_norm": 5.5, "learning_rate": 0.00024978138663335416, "loss": 9.814, "step": 4000 }, { "epoch": 0.012492426466454713, "eval_loss": 9.791647911071777, "eval_runtime": 102.2247, "eval_samples_per_second": 50.898, "eval_steps_per_second": 3.189, "step": 4000 }, { "epoch": 0.012570504131870055, "grad_norm": 6.0625, "learning_rate": 0.00025134291068082447, "loss": 9.785, "step": 4025 }, { "epoch": 0.012648581797285396, "grad_norm": 4.875, "learning_rate": 0.00025290443472829484, "loss": 9.8043, "step": 4050 }, { "epoch": 0.012726659462700738, "grad_norm": 6.03125, "learning_rate": 0.00025446595877576516, "loss": 9.8206, "step": 4075 }, { "epoch": 0.01280473712811608, "grad_norm": 5.15625, "learning_rate": 0.0002560274828232355, "loss": 9.8267, "step": 4100 }, { "epoch": 0.012882814793531422, "grad_norm": 5.5625, "learning_rate": 0.0002575890068707058, "loss": 9.8286, "step": 4125 }, { "epoch": 0.012960892458946764, "grad_norm": 5.09375, "learning_rate": 0.00025915053091817616, "loss": 9.8346, "step": 4150 }, { "epoch": 0.013038970124362106, "grad_norm": 6.53125, "learning_rate": 0.0002607120549656465, "loss": 9.8788, "step": 4175 }, { "epoch": 0.013117047789777448, "grad_norm": 5.84375, "learning_rate": 0.00026227357901311685, "loss": 9.8549, "step": 4200 }, { "epoch": 0.01319512545519279, "grad_norm": 5.3125, "learning_rate": 0.0002638351030605871, "loss": 9.8965, "step": 4225 }, { "epoch": 0.013273203120608132, "grad_norm": 6.625, "learning_rate": 0.0002653966271080575, "loss": 9.936, "step": 4250 }, { "epoch": 0.013351280786023474, "grad_norm": 5.6875, "learning_rate": 0.0002669581511555278, "loss": 9.9048, "step": 4275 }, { "epoch": 0.013429358451438816, "grad_norm": 5.78125, "learning_rate": 0.00026851967520299816, "loss": 9.9304, "step": 4300 }, { "epoch": 0.013507436116854158, "grad_norm": 5.90625, "learning_rate": 0.0002700811992504685, "loss": 9.9668, "step": 4325 }, { "epoch": 0.0135855137822695, "grad_norm": 5.90625, "learning_rate": 0.0002716427232979388, "loss": 9.957, "step": 4350 }, { "epoch": 0.013663591447684842, "grad_norm": 6.4375, "learning_rate": 0.0002732042473454091, "loss": 9.9809, "step": 4375 }, { "epoch": 0.013741669113100183, "grad_norm": 6.96875, "learning_rate": 0.0002747657713928794, "loss": 9.9868, "step": 4400 }, { "epoch": 0.013819746778515525, "grad_norm": 5.71875, "learning_rate": 0.0002763272954403498, "loss": 10.0066, "step": 4425 }, { "epoch": 0.013897824443930867, "grad_norm": 5.46875, "learning_rate": 0.0002778888194878201, "loss": 9.9905, "step": 4450 }, { "epoch": 0.01397590210934621, "grad_norm": 5.59375, "learning_rate": 0.0002794503435352905, "loss": 10.0002, "step": 4475 }, { "epoch": 0.014053979774761551, "grad_norm": 7.5625, "learning_rate": 0.00028101186758276074, "loss": 10.0869, "step": 4500 }, { "epoch": 0.014132057440176893, "grad_norm": 5.75, "learning_rate": 0.0002825733916302311, "loss": 10.0828, "step": 4525 }, { "epoch": 0.014210135105592235, "grad_norm": 6.4375, "learning_rate": 0.00028413491567770143, "loss": 10.1158, "step": 4550 }, { "epoch": 0.014288212771007577, "grad_norm": 6.59375, "learning_rate": 0.0002856964397251718, "loss": 10.1618, "step": 4575 }, { "epoch": 0.014366290436422919, "grad_norm": 6.59375, "learning_rate": 0.0002872579637726421, "loss": 10.1651, "step": 4600 }, { "epoch": 0.014444368101838261, "grad_norm": 6.9375, "learning_rate": 0.00028881948782011243, "loss": 10.1786, "step": 4625 }, { "epoch": 0.014522445767253603, "grad_norm": 8.375, "learning_rate": 0.00029038101186758275, "loss": 10.1674, "step": 4650 }, { "epoch": 0.014600523432668945, "grad_norm": 7.625, "learning_rate": 0.0002919425359150531, "loss": 10.1869, "step": 4675 }, { "epoch": 0.014678601098084287, "grad_norm": 6.21875, "learning_rate": 0.00029350405996252343, "loss": 10.2085, "step": 4700 }, { "epoch": 0.014756678763499629, "grad_norm": 5.5625, "learning_rate": 0.0002950655840099938, "loss": 10.2231, "step": 4725 }, { "epoch": 0.01483475642891497, "grad_norm": 7.03125, "learning_rate": 0.00029662710805746406, "loss": 10.2671, "step": 4750 }, { "epoch": 0.014912834094330312, "grad_norm": 7.5625, "learning_rate": 0.00029818863210493443, "loss": 10.3177, "step": 4775 }, { "epoch": 0.014990911759745654, "grad_norm": 7.0, "learning_rate": 0.00029975015615240475, "loss": 10.3046, "step": 4800 }, { "epoch": 0.015068989425160996, "grad_norm": 8.125, "learning_rate": 0.0003013116801998751, "loss": 10.3212, "step": 4825 }, { "epoch": 0.015147067090576338, "grad_norm": 6.75, "learning_rate": 0.00030287320424734543, "loss": 10.2822, "step": 4850 }, { "epoch": 0.01522514475599168, "grad_norm": 7.03125, "learning_rate": 0.0003044347282948157, "loss": 10.3131, "step": 4875 }, { "epoch": 0.015303222421407022, "grad_norm": 7.1875, "learning_rate": 0.00030599625234228607, "loss": 10.3112, "step": 4900 }, { "epoch": 0.015381300086822364, "grad_norm": 8.3125, "learning_rate": 0.0003075577763897564, "loss": 10.3796, "step": 4925 }, { "epoch": 0.015459377752237706, "grad_norm": 6.8125, "learning_rate": 0.00030911930043722675, "loss": 10.4225, "step": 4950 }, { "epoch": 0.015537455417653048, "grad_norm": 6.28125, "learning_rate": 0.00031068082448469707, "loss": 10.435, "step": 4975 }, { "epoch": 0.01561553308306839, "grad_norm": 7.0625, "learning_rate": 0.0003122423485321674, "loss": 10.4029, "step": 5000 }, { "epoch": 0.01561553308306839, "eval_loss": 10.434911727905273, "eval_runtime": 102.2426, "eval_samples_per_second": 50.889, "eval_steps_per_second": 3.188, "step": 5000 }, { "epoch": 0.01569361074848373, "grad_norm": 7.21875, "learning_rate": 0.0003138038725796377, "loss": 10.4435, "step": 5025 }, { "epoch": 0.015771688413899074, "grad_norm": 6.53125, "learning_rate": 0.00031536539662710807, "loss": 10.4452, "step": 5050 }, { "epoch": 0.015849766079314414, "grad_norm": 6.40625, "learning_rate": 0.0003169269206745784, "loss": 10.4211, "step": 5075 }, { "epoch": 0.015927843744729758, "grad_norm": 8.25, "learning_rate": 0.00031848844472204876, "loss": 10.5069, "step": 5100 }, { "epoch": 0.0160059214101451, "grad_norm": 8.625, "learning_rate": 0.00032004996876951907, "loss": 10.5109, "step": 5125 }, { "epoch": 0.01608399907556044, "grad_norm": 7.6875, "learning_rate": 0.0003216114928169894, "loss": 10.5435, "step": 5150 }, { "epoch": 0.016162076740975785, "grad_norm": 7.46875, "learning_rate": 0.0003231730168644597, "loss": 10.5252, "step": 5175 }, { "epoch": 0.016240154406391125, "grad_norm": 9.0625, "learning_rate": 0.00032473454091193007, "loss": 10.5791, "step": 5200 }, { "epoch": 0.01631823207180647, "grad_norm": 8.5625, "learning_rate": 0.0003262960649594004, "loss": 10.5973, "step": 5225 }, { "epoch": 0.01639630973722181, "grad_norm": 8.375, "learning_rate": 0.00032785758900687076, "loss": 10.6523, "step": 5250 }, { "epoch": 0.016474387402637153, "grad_norm": 7.375, "learning_rate": 0.000329419113054341, "loss": 10.5952, "step": 5275 }, { "epoch": 0.016552465068052493, "grad_norm": 6.40625, "learning_rate": 0.0003309806371018114, "loss": 10.6144, "step": 5300 }, { "epoch": 0.016630542733467837, "grad_norm": 7.34375, "learning_rate": 0.0003325421611492817, "loss": 10.6408, "step": 5325 }, { "epoch": 0.016708620398883177, "grad_norm": 9.125, "learning_rate": 0.0003341036851967521, "loss": 10.687, "step": 5350 }, { "epoch": 0.01678669806429852, "grad_norm": 8.375, "learning_rate": 0.0003356652092442224, "loss": 10.6634, "step": 5375 }, { "epoch": 0.01686477572971386, "grad_norm": 7.65625, "learning_rate": 0.00033722673329169265, "loss": 10.7616, "step": 5400 }, { "epoch": 0.016942853395129204, "grad_norm": 9.0, "learning_rate": 0.000338788257339163, "loss": 10.7149, "step": 5425 }, { "epoch": 0.017020931060544545, "grad_norm": 9.5, "learning_rate": 0.00034034978138663334, "loss": 10.7277, "step": 5450 }, { "epoch": 0.01709900872595989, "grad_norm": 8.625, "learning_rate": 0.0003419113054341037, "loss": 10.7305, "step": 5475 }, { "epoch": 0.01717708639137523, "grad_norm": 8.25, "learning_rate": 0.000343472829481574, "loss": 10.7721, "step": 5500 }, { "epoch": 0.017255164056790572, "grad_norm": 10.0625, "learning_rate": 0.00034503435352904434, "loss": 10.832, "step": 5525 }, { "epoch": 0.017333241722205912, "grad_norm": 10.0, "learning_rate": 0.00034659587757651466, "loss": 10.8325, "step": 5550 }, { "epoch": 0.017411319387621256, "grad_norm": 9.125, "learning_rate": 0.000348157401623985, "loss": 10.8516, "step": 5575 }, { "epoch": 0.017489397053036596, "grad_norm": 9.125, "learning_rate": 0.00034971892567145534, "loss": 10.9156, "step": 5600 }, { "epoch": 0.01756747471845194, "grad_norm": 9.0625, "learning_rate": 0.0003512804497189257, "loss": 10.9185, "step": 5625 }, { "epoch": 0.01764555238386728, "grad_norm": 10.4375, "learning_rate": 0.000352841973766396, "loss": 10.9967, "step": 5650 }, { "epoch": 0.017723630049282624, "grad_norm": 12.9375, "learning_rate": 0.00035440349781386634, "loss": 11.0233, "step": 5675 }, { "epoch": 0.017801707714697964, "grad_norm": 10.0, "learning_rate": 0.00035596502186133666, "loss": 11.0479, "step": 5700 }, { "epoch": 0.017879785380113308, "grad_norm": 9.6875, "learning_rate": 0.00035752654590880703, "loss": 10.9976, "step": 5725 }, { "epoch": 0.017957863045528648, "grad_norm": 8.5625, "learning_rate": 0.00035908806995627734, "loss": 11.0774, "step": 5750 }, { "epoch": 0.01803594071094399, "grad_norm": 10.125, "learning_rate": 0.00036064959400374766, "loss": 11.1011, "step": 5775 }, { "epoch": 0.01811401837635933, "grad_norm": 8.3125, "learning_rate": 0.000362211118051218, "loss": 11.087, "step": 5800 }, { "epoch": 0.018192096041774675, "grad_norm": 9.875, "learning_rate": 0.00036377264209868835, "loss": 11.1436, "step": 5825 }, { "epoch": 0.018270173707190016, "grad_norm": 8.75, "learning_rate": 0.00036533416614615866, "loss": 11.1463, "step": 5850 }, { "epoch": 0.01834825137260536, "grad_norm": 9.75, "learning_rate": 0.00036689569019362903, "loss": 11.1615, "step": 5875 }, { "epoch": 0.0184263290380207, "grad_norm": 10.4375, "learning_rate": 0.00036845721424109935, "loss": 11.1622, "step": 5900 }, { "epoch": 0.018504406703436043, "grad_norm": 9.1875, "learning_rate": 0.0003700187382885696, "loss": 11.2235, "step": 5925 }, { "epoch": 0.018582484368851383, "grad_norm": 8.9375, "learning_rate": 0.00037158026233604, "loss": 11.2721, "step": 5950 }, { "epoch": 0.018660562034266727, "grad_norm": 8.625, "learning_rate": 0.0003731417863835103, "loss": 11.2218, "step": 5975 }, { "epoch": 0.018738639699682067, "grad_norm": 8.5, "learning_rate": 0.00037470331043098067, "loss": 11.2897, "step": 6000 }, { "epoch": 0.018738639699682067, "eval_loss": 11.259696960449219, "eval_runtime": 102.0975, "eval_samples_per_second": 50.961, "eval_steps_per_second": 3.193, "step": 6000 }, { "epoch": 0.01881671736509741, "grad_norm": 10.1875, "learning_rate": 0.000376264834478451, "loss": 11.2667, "step": 6025 }, { "epoch": 0.01889479503051275, "grad_norm": 9.4375, "learning_rate": 0.0003778263585259213, "loss": 11.2334, "step": 6050 }, { "epoch": 0.018972872695928095, "grad_norm": 8.6875, "learning_rate": 0.0003793878825733916, "loss": 11.2445, "step": 6075 }, { "epoch": 0.019050950361343435, "grad_norm": 9.0, "learning_rate": 0.000380949406620862, "loss": 11.2638, "step": 6100 }, { "epoch": 0.01912902802675878, "grad_norm": 9.3125, "learning_rate": 0.0003825109306683323, "loss": 11.2733, "step": 6125 }, { "epoch": 0.01920710569217412, "grad_norm": 8.875, "learning_rate": 0.00038407245471580267, "loss": 11.327, "step": 6150 }, { "epoch": 0.019285183357589462, "grad_norm": 9.625, "learning_rate": 0.00038563397876327293, "loss": 11.3521, "step": 6175 }, { "epoch": 0.019363261023004803, "grad_norm": 9.1875, "learning_rate": 0.0003871955028107433, "loss": 11.3203, "step": 6200 }, { "epoch": 0.019441338688420146, "grad_norm": 10.25, "learning_rate": 0.0003887570268582136, "loss": 11.4162, "step": 6225 }, { "epoch": 0.019519416353835486, "grad_norm": 11.0, "learning_rate": 0.000390318550905684, "loss": 11.4063, "step": 6250 }, { "epoch": 0.01959749401925083, "grad_norm": 9.9375, "learning_rate": 0.0003918800749531543, "loss": 11.5209, "step": 6275 }, { "epoch": 0.01967557168466617, "grad_norm": 9.25, "learning_rate": 0.0003934415990006246, "loss": 11.5018, "step": 6300 }, { "epoch": 0.019753649350081514, "grad_norm": 12.25, "learning_rate": 0.00039500312304809493, "loss": 11.5067, "step": 6325 }, { "epoch": 0.019831727015496854, "grad_norm": 10.8125, "learning_rate": 0.0003965646470955653, "loss": 11.5646, "step": 6350 }, { "epoch": 0.019909804680912198, "grad_norm": 10.1875, "learning_rate": 0.0003981261711430356, "loss": 11.5575, "step": 6375 }, { "epoch": 0.019987882346327538, "grad_norm": 10.5, "learning_rate": 0.000399687695190506, "loss": 11.634, "step": 6400 }, { "epoch": 0.02006596001174288, "grad_norm": 10.5625, "learning_rate": 0.00040124921923797625, "loss": 11.7034, "step": 6425 }, { "epoch": 0.020144037677158222, "grad_norm": 10.1875, "learning_rate": 0.00040281074328544657, "loss": 11.6978, "step": 6450 }, { "epoch": 0.020222115342573566, "grad_norm": 12.125, "learning_rate": 0.00040437226733291694, "loss": 11.7471, "step": 6475 }, { "epoch": 0.020300193007988906, "grad_norm": 10.8125, "learning_rate": 0.00040593379138038725, "loss": 11.861, "step": 6500 }, { "epoch": 0.02037827067340425, "grad_norm": 9.8125, "learning_rate": 0.0004074953154278576, "loss": 11.7507, "step": 6525 }, { "epoch": 0.02045634833881959, "grad_norm": 12.0, "learning_rate": 0.0004090568394753279, "loss": 11.8528, "step": 6550 }, { "epoch": 0.020534426004234933, "grad_norm": 11.4375, "learning_rate": 0.00041061836352279825, "loss": 11.9091, "step": 6575 }, { "epoch": 0.020612503669650274, "grad_norm": 11.1875, "learning_rate": 0.00041217988757026857, "loss": 11.8911, "step": 6600 }, { "epoch": 0.020690581335065617, "grad_norm": 10.75, "learning_rate": 0.00041374141161773894, "loss": 11.9488, "step": 6625 }, { "epoch": 0.020768659000480957, "grad_norm": 10.0, "learning_rate": 0.00041530293566520925, "loss": 12.016, "step": 6650 }, { "epoch": 0.0208467366658963, "grad_norm": 9.75, "learning_rate": 0.0004168644597126796, "loss": 11.9987, "step": 6675 }, { "epoch": 0.02092481433131164, "grad_norm": 9.875, "learning_rate": 0.0004184259837601499, "loss": 12.0069, "step": 6700 }, { "epoch": 0.021002891996726985, "grad_norm": 10.9375, "learning_rate": 0.00041998750780762026, "loss": 11.9872, "step": 6725 }, { "epoch": 0.021080969662142325, "grad_norm": 10.9375, "learning_rate": 0.00042154903185509057, "loss": 12.0362, "step": 6750 }, { "epoch": 0.02115904732755767, "grad_norm": 10.25, "learning_rate": 0.00042311055590256094, "loss": 12.1232, "step": 6775 }, { "epoch": 0.02123712499297301, "grad_norm": 10.375, "learning_rate": 0.00042467207995003126, "loss": 12.1279, "step": 6800 }, { "epoch": 0.021315202658388353, "grad_norm": 12.5, "learning_rate": 0.0004262336039975016, "loss": 12.1096, "step": 6825 }, { "epoch": 0.021393280323803693, "grad_norm": 12.125, "learning_rate": 0.0004277951280449719, "loss": 12.1568, "step": 6850 }, { "epoch": 0.021471357989219036, "grad_norm": 10.375, "learning_rate": 0.00042935665209244226, "loss": 12.191, "step": 6875 }, { "epoch": 0.021549435654634377, "grad_norm": 12.125, "learning_rate": 0.0004309181761399126, "loss": 12.3206, "step": 6900 }, { "epoch": 0.02162751332004972, "grad_norm": 10.9375, "learning_rate": 0.00043247970018738294, "loss": 12.2622, "step": 6925 }, { "epoch": 0.02170559098546506, "grad_norm": 11.8125, "learning_rate": 0.0004340412242348532, "loss": 12.2397, "step": 6950 }, { "epoch": 0.021783668650880404, "grad_norm": 10.5625, "learning_rate": 0.0004356027482823235, "loss": 12.3172, "step": 6975 }, { "epoch": 0.021861746316295744, "grad_norm": 11.5625, "learning_rate": 0.0004371642723297939, "loss": 12.3313, "step": 7000 }, { "epoch": 0.021861746316295744, "eval_loss": 12.409297943115234, "eval_runtime": 102.1563, "eval_samples_per_second": 50.932, "eval_steps_per_second": 3.191, "step": 7000 }, { "epoch": 0.021939823981711088, "grad_norm": 12.0625, "learning_rate": 0.0004387257963772642, "loss": 12.4476, "step": 7025 }, { "epoch": 0.02201790164712643, "grad_norm": 12.4375, "learning_rate": 0.0004402873204247346, "loss": 12.4722, "step": 7050 }, { "epoch": 0.022095979312541772, "grad_norm": 13.5625, "learning_rate": 0.00044184884447220484, "loss": 12.4586, "step": 7075 }, { "epoch": 0.022174056977957112, "grad_norm": 11.75, "learning_rate": 0.0004434103685196752, "loss": 12.4961, "step": 7100 }, { "epoch": 0.022252134643372456, "grad_norm": 10.375, "learning_rate": 0.0004449718925671455, "loss": 12.5456, "step": 7125 }, { "epoch": 0.022330212308787796, "grad_norm": 16.5, "learning_rate": 0.0004465334166146159, "loss": 12.5605, "step": 7150 }, { "epoch": 0.02240828997420314, "grad_norm": 13.9375, "learning_rate": 0.0004480949406620862, "loss": 12.5607, "step": 7175 }, { "epoch": 0.02248636763961848, "grad_norm": 12.25, "learning_rate": 0.0004496564647095565, "loss": 12.6064, "step": 7200 }, { "epoch": 0.022564445305033824, "grad_norm": 13.875, "learning_rate": 0.00045121798875702684, "loss": 12.6238, "step": 7225 }, { "epoch": 0.022642522970449164, "grad_norm": 12.1875, "learning_rate": 0.0004527795128044972, "loss": 12.6783, "step": 7250 }, { "epoch": 0.022720600635864507, "grad_norm": 12.0, "learning_rate": 0.00045434103685196753, "loss": 12.6747, "step": 7275 }, { "epoch": 0.022798678301279848, "grad_norm": 12.9375, "learning_rate": 0.0004559025608994379, "loss": 12.7325, "step": 7300 }, { "epoch": 0.02287675596669519, "grad_norm": 10.75, "learning_rate": 0.00045746408494690816, "loss": 12.8587, "step": 7325 }, { "epoch": 0.02295483363211053, "grad_norm": 14.5, "learning_rate": 0.00045902560899437853, "loss": 12.8184, "step": 7350 }, { "epoch": 0.023032911297525875, "grad_norm": 12.3125, "learning_rate": 0.00046058713304184885, "loss": 12.8454, "step": 7375 }, { "epoch": 0.023110988962941215, "grad_norm": 10.75, "learning_rate": 0.0004621486570893192, "loss": 12.8707, "step": 7400 }, { "epoch": 0.02318906662835656, "grad_norm": 13.5625, "learning_rate": 0.00046371018113678953, "loss": 12.9231, "step": 7425 }, { "epoch": 0.0232671442937719, "grad_norm": 12.125, "learning_rate": 0.0004652717051842599, "loss": 12.9452, "step": 7450 }, { "epoch": 0.023345221959187243, "grad_norm": 17.0, "learning_rate": 0.00046683322923173016, "loss": 13.0146, "step": 7475 }, { "epoch": 0.023423299624602583, "grad_norm": 13.4375, "learning_rate": 0.0004683947532792005, "loss": 12.9707, "step": 7500 }, { "epoch": 0.023501377290017927, "grad_norm": 13.6875, "learning_rate": 0.00046995627732667085, "loss": 12.9913, "step": 7525 }, { "epoch": 0.023579454955433267, "grad_norm": 21.125, "learning_rate": 0.00047151780137414116, "loss": 13.0038, "step": 7550 }, { "epoch": 0.02365753262084861, "grad_norm": 14.25, "learning_rate": 0.00047307932542161153, "loss": 13.0291, "step": 7575 }, { "epoch": 0.02373561028626395, "grad_norm": 13.5, "learning_rate": 0.0004746408494690818, "loss": 12.9994, "step": 7600 }, { "epoch": 0.023813687951679294, "grad_norm": 12.3125, "learning_rate": 0.00047620237351655217, "loss": 13.0487, "step": 7625 }, { "epoch": 0.023891765617094635, "grad_norm": 12.875, "learning_rate": 0.0004777638975640225, "loss": 13.0586, "step": 7650 }, { "epoch": 0.02396984328250998, "grad_norm": 12.25, "learning_rate": 0.00047932542161149285, "loss": 13.0199, "step": 7675 }, { "epoch": 0.024047920947925322, "grad_norm": 12.5625, "learning_rate": 0.00048088694565896317, "loss": 13.0435, "step": 7700 }, { "epoch": 0.024125998613340662, "grad_norm": 14.0, "learning_rate": 0.0004824484697064335, "loss": 13.1292, "step": 7725 }, { "epoch": 0.024204076278756006, "grad_norm": 17.625, "learning_rate": 0.0004840099937539038, "loss": 13.1447, "step": 7750 }, { "epoch": 0.024282153944171346, "grad_norm": 14.1875, "learning_rate": 0.00048557151780137417, "loss": 13.1884, "step": 7775 }, { "epoch": 0.02436023160958669, "grad_norm": 14.1875, "learning_rate": 0.0004871330418488445, "loss": 13.2918, "step": 7800 }, { "epoch": 0.02443830927500203, "grad_norm": 12.75, "learning_rate": 0.0004886945658963149, "loss": 13.2436, "step": 7825 }, { "epoch": 0.024516386940417374, "grad_norm": 12.875, "learning_rate": 0.0004902560899437852, "loss": 13.2654, "step": 7850 }, { "epoch": 0.024594464605832714, "grad_norm": 13.8125, "learning_rate": 0.0004918176139912555, "loss": 13.3083, "step": 7875 }, { "epoch": 0.024672542271248057, "grad_norm": 15.5, "learning_rate": 0.0004933791380387258, "loss": 13.3748, "step": 7900 }, { "epoch": 0.024750619936663398, "grad_norm": 13.0625, "learning_rate": 0.0004949406620861961, "loss": 13.3709, "step": 7925 }, { "epoch": 0.02482869760207874, "grad_norm": 14.375, "learning_rate": 0.0004965021861336665, "loss": 13.4115, "step": 7950 }, { "epoch": 0.02490677526749408, "grad_norm": 14.5625, "learning_rate": 0.0004980637101811367, "loss": 13.5171, "step": 7975 }, { "epoch": 0.024984852932909425, "grad_norm": 15.25, "learning_rate": 0.0004996252342286071, "loss": 13.4743, "step": 8000 }, { "epoch": 0.024984852932909425, "eval_loss": 13.542752265930176, "eval_runtime": 102.3748, "eval_samples_per_second": 50.823, "eval_steps_per_second": 3.184, "step": 8000 }, { "epoch": 0.025062930598324765, "grad_norm": 14.25, "learning_rate": 0.0005011867582760775, "loss": 13.5245, "step": 8025 }, { "epoch": 0.02514100826374011, "grad_norm": 15.375, "learning_rate": 0.0005027482823235477, "loss": 13.5847, "step": 8050 }, { "epoch": 0.02521908592915545, "grad_norm": 15.5, "learning_rate": 0.0005043098063710181, "loss": 13.5861, "step": 8075 }, { "epoch": 0.025297163594570793, "grad_norm": 14.0, "learning_rate": 0.0005058713304184884, "loss": 13.6261, "step": 8100 }, { "epoch": 0.025375241259986133, "grad_norm": 12.875, "learning_rate": 0.0005074328544659588, "loss": 13.6362, "step": 8125 }, { "epoch": 0.025453318925401477, "grad_norm": 15.4375, "learning_rate": 0.0005089943785134291, "loss": 13.7079, "step": 8150 }, { "epoch": 0.025531396590816817, "grad_norm": 16.375, "learning_rate": 0.0005105559025608995, "loss": 13.7344, "step": 8175 }, { "epoch": 0.02560947425623216, "grad_norm": 13.6875, "learning_rate": 0.0005121174266083698, "loss": 13.8572, "step": 8200 }, { "epoch": 0.0256875519216475, "grad_norm": 15.4375, "learning_rate": 0.0005136789506558401, "loss": 13.9229, "step": 8225 }, { "epoch": 0.025765629587062844, "grad_norm": 14.125, "learning_rate": 0.0005152404747033104, "loss": 13.976, "step": 8250 }, { "epoch": 0.025843707252478185, "grad_norm": 13.6875, "learning_rate": 0.0005168019987507809, "loss": 13.9796, "step": 8275 }, { "epoch": 0.02592178491789353, "grad_norm": 14.5625, "learning_rate": 0.0005183635227982511, "loss": 14.0409, "step": 8300 }, { "epoch": 0.02599986258330887, "grad_norm": 14.0, "learning_rate": 0.0005199250468457214, "loss": 13.9807, "step": 8325 }, { "epoch": 0.026077940248724212, "grad_norm": 15.9375, "learning_rate": 0.0005214865708931917, "loss": 14.1036, "step": 8350 }, { "epoch": 0.026156017914139552, "grad_norm": 13.9375, "learning_rate": 0.0005230480949406621, "loss": 14.1808, "step": 8375 }, { "epoch": 0.026234095579554896, "grad_norm": 14.75, "learning_rate": 0.0005246096189881324, "loss": 14.0815, "step": 8400 }, { "epoch": 0.026312173244970236, "grad_norm": 17.375, "learning_rate": 0.0005261711430356028, "loss": 14.2371, "step": 8425 }, { "epoch": 0.02639025091038558, "grad_norm": 19.0, "learning_rate": 0.0005277326670830731, "loss": 14.3598, "step": 8450 }, { "epoch": 0.02646832857580092, "grad_norm": 15.5, "learning_rate": 0.0005292941911305435, "loss": 14.395, "step": 8475 }, { "epoch": 0.026546406241216264, "grad_norm": 16.0, "learning_rate": 0.0005308557151780138, "loss": 14.4232, "step": 8500 }, { "epoch": 0.026624483906631604, "grad_norm": 16.5, "learning_rate": 0.0005324172392254841, "loss": 14.498, "step": 8525 }, { "epoch": 0.026702561572046948, "grad_norm": 15.875, "learning_rate": 0.0005339787632729543, "loss": 14.5161, "step": 8550 }, { "epoch": 0.026780639237462288, "grad_norm": 16.625, "learning_rate": 0.0005355402873204247, "loss": 14.5657, "step": 8575 }, { "epoch": 0.02685871690287763, "grad_norm": 16.25, "learning_rate": 0.0005371018113678951, "loss": 14.5677, "step": 8600 }, { "epoch": 0.02693679456829297, "grad_norm": 16.25, "learning_rate": 0.0005386633354153654, "loss": 14.7238, "step": 8625 }, { "epoch": 0.027014872233708315, "grad_norm": 19.75, "learning_rate": 0.0005402248594628357, "loss": 14.7521, "step": 8650 }, { "epoch": 0.027092949899123656, "grad_norm": 17.5, "learning_rate": 0.000541786383510306, "loss": 14.81, "step": 8675 }, { "epoch": 0.027171027564539, "grad_norm": 17.25, "learning_rate": 0.0005433479075577764, "loss": 14.8193, "step": 8700 }, { "epoch": 0.02724910522995434, "grad_norm": 19.875, "learning_rate": 0.0005449094316052468, "loss": 14.7857, "step": 8725 }, { "epoch": 0.027327182895369683, "grad_norm": 16.625, "learning_rate": 0.0005464709556527171, "loss": 14.8485, "step": 8750 }, { "epoch": 0.027405260560785023, "grad_norm": 17.75, "learning_rate": 0.0005480324797001874, "loss": 14.8765, "step": 8775 }, { "epoch": 0.027483338226200367, "grad_norm": 16.625, "learning_rate": 0.0005495940037476578, "loss": 14.8669, "step": 8800 }, { "epoch": 0.027561415891615707, "grad_norm": 18.375, "learning_rate": 0.000551155527795128, "loss": 14.9237, "step": 8825 }, { "epoch": 0.02763949355703105, "grad_norm": 18.75, "learning_rate": 0.0005527170518425983, "loss": 15.0786, "step": 8850 }, { "epoch": 0.02771757122244639, "grad_norm": 18.625, "learning_rate": 0.0005542785758900687, "loss": 14.9841, "step": 8875 }, { "epoch": 0.027795648887861735, "grad_norm": 16.375, "learning_rate": 0.0005558400999375391, "loss": 15.1071, "step": 8900 }, { "epoch": 0.027873726553277075, "grad_norm": 18.0, "learning_rate": 0.0005574016239850094, "loss": 15.0875, "step": 8925 }, { "epoch": 0.02795180421869242, "grad_norm": 16.75, "learning_rate": 0.0005589631480324797, "loss": 15.1422, "step": 8950 }, { "epoch": 0.02802988188410776, "grad_norm": 16.125, "learning_rate": 0.00056052467207995, "loss": 15.1989, "step": 8975 }, { "epoch": 0.028107959549523102, "grad_norm": 17.625, "learning_rate": 0.0005620861961274205, "loss": 15.1135, "step": 9000 }, { "epoch": 0.028107959549523102, "eval_loss": 15.24026107788086, "eval_runtime": 102.3475, "eval_samples_per_second": 50.837, "eval_steps_per_second": 3.185, "step": 9000 }, { "epoch": 0.028186037214938443, "grad_norm": 15.4375, "learning_rate": 0.0005636477201748908, "loss": 15.2318, "step": 9025 }, { "epoch": 0.028264114880353786, "grad_norm": 16.125, "learning_rate": 0.0005652092442223611, "loss": 15.1658, "step": 9050 }, { "epoch": 0.028342192545769126, "grad_norm": 16.5, "learning_rate": 0.0005667707682698313, "loss": 15.2762, "step": 9075 }, { "epoch": 0.02842027021118447, "grad_norm": 15.375, "learning_rate": 0.0005683322923173016, "loss": 15.1555, "step": 9100 }, { "epoch": 0.02849834787659981, "grad_norm": 16.75, "learning_rate": 0.000569893816364772, "loss": 15.1879, "step": 9125 }, { "epoch": 0.028576425542015154, "grad_norm": 15.75, "learning_rate": 0.0005714553404122423, "loss": 15.1291, "step": 9150 }, { "epoch": 0.028654503207430494, "grad_norm": 16.375, "learning_rate": 0.0005730168644597127, "loss": 15.2669, "step": 9175 }, { "epoch": 0.028732580872845838, "grad_norm": 15.625, "learning_rate": 0.000574578388507183, "loss": 15.2461, "step": 9200 }, { "epoch": 0.028810658538261178, "grad_norm": 16.25, "learning_rate": 0.0005761399125546534, "loss": 15.1608, "step": 9225 }, { "epoch": 0.028888736203676522, "grad_norm": 16.625, "learning_rate": 0.0005777014366021237, "loss": 15.2685, "step": 9250 }, { "epoch": 0.028966813869091862, "grad_norm": 17.375, "learning_rate": 0.000579262960649594, "loss": 15.3043, "step": 9275 }, { "epoch": 0.029044891534507206, "grad_norm": 16.625, "learning_rate": 0.0005808244846970644, "loss": 15.251, "step": 9300 }, { "epoch": 0.029122969199922546, "grad_norm": 17.875, "learning_rate": 0.0005823860087445347, "loss": 15.3825, "step": 9325 }, { "epoch": 0.02920104686533789, "grad_norm": 17.625, "learning_rate": 0.000583947532792005, "loss": 15.2931, "step": 9350 }, { "epoch": 0.02927912453075323, "grad_norm": 17.375, "learning_rate": 0.0005855090568394753, "loss": 15.3777, "step": 9375 }, { "epoch": 0.029357202196168573, "grad_norm": 17.5, "learning_rate": 0.0005870705808869456, "loss": 15.369, "step": 9400 }, { "epoch": 0.029435279861583914, "grad_norm": 17.75, "learning_rate": 0.000588632104934416, "loss": 15.2992, "step": 9425 }, { "epoch": 0.029513357526999257, "grad_norm": 15.625, "learning_rate": 0.0005901936289818864, "loss": 15.3261, "step": 9450 }, { "epoch": 0.029591435192414597, "grad_norm": 19.375, "learning_rate": 0.0005917551530293567, "loss": 15.2748, "step": 9475 }, { "epoch": 0.02966951285782994, "grad_norm": 17.875, "learning_rate": 0.000593316677076827, "loss": 15.2702, "step": 9500 }, { "epoch": 0.02974759052324528, "grad_norm": 20.625, "learning_rate": 0.0005948782011242974, "loss": 15.3905, "step": 9525 }, { "epoch": 0.029825668188660625, "grad_norm": 16.5, "learning_rate": 0.0005964397251717677, "loss": 15.3166, "step": 9550 }, { "epoch": 0.029903745854075965, "grad_norm": 17.125, "learning_rate": 0.000598001249219238, "loss": 15.3973, "step": 9575 }, { "epoch": 0.02998182351949131, "grad_norm": 15.4375, "learning_rate": 0.0005995627732667083, "loss": 15.3621, "step": 9600 }, { "epoch": 0.03005990118490665, "grad_norm": 17.375, "learning_rate": 0.0006011242973141786, "loss": 15.532, "step": 9625 }, { "epoch": 0.030137978850321993, "grad_norm": 21.125, "learning_rate": 0.000602685821361649, "loss": 15.6577, "step": 9650 }, { "epoch": 0.030216056515737333, "grad_norm": 16.5, "learning_rate": 0.0006042473454091193, "loss": 15.7684, "step": 9675 }, { "epoch": 0.030294134181152677, "grad_norm": 17.125, "learning_rate": 0.0006058088694565896, "loss": 15.8286, "step": 9700 }, { "epoch": 0.030372211846568017, "grad_norm": 17.625, "learning_rate": 0.0006073703935040599, "loss": 15.9935, "step": 9725 }, { "epoch": 0.03045028951198336, "grad_norm": 20.25, "learning_rate": 0.0006089319175515304, "loss": 16.0198, "step": 9750 }, { "epoch": 0.0305283671773987, "grad_norm": 17.875, "learning_rate": 0.0006104934415990007, "loss": 16.0932, "step": 9775 }, { "epoch": 0.030606444842814044, "grad_norm": 21.75, "learning_rate": 0.000612054965646471, "loss": 16.0459, "step": 9800 }, { "epoch": 0.030684522508229384, "grad_norm": 20.25, "learning_rate": 0.0006136164896939413, "loss": 16.0705, "step": 9825 }, { "epoch": 0.030762600173644728, "grad_norm": 18.625, "learning_rate": 0.0006151780137414116, "loss": 16.0394, "step": 9850 }, { "epoch": 0.03084067783906007, "grad_norm": 16.75, "learning_rate": 0.0006167395377888819, "loss": 16.0319, "step": 9875 }, { "epoch": 0.030918755504475412, "grad_norm": 16.375, "learning_rate": 0.0006183010618363523, "loss": 16.0178, "step": 9900 }, { "epoch": 0.030996833169890752, "grad_norm": 26.375, "learning_rate": 0.0006198625858838226, "loss": 16.1248, "step": 9925 }, { "epoch": 0.031074910835306096, "grad_norm": 18.375, "learning_rate": 0.000621424109931293, "loss": 16.1081, "step": 9950 }, { "epoch": 0.031152988500721436, "grad_norm": 18.25, "learning_rate": 0.0006229856339787633, "loss": 16.195, "step": 9975 }, { "epoch": 0.03123106616613678, "grad_norm": 19.375, "learning_rate": 0.0006245471580262336, "loss": 16.2205, "step": 10000 }, { "epoch": 0.03123106616613678, "eval_loss": 16.29703140258789, "eval_runtime": 102.3113, "eval_samples_per_second": 50.855, "eval_steps_per_second": 3.186, "step": 10000 }, { "epoch": 0.03130914383155212, "grad_norm": 17.375, "learning_rate": 0.0006261086820737039, "loss": 16.2662, "step": 10025 }, { "epoch": 0.03138722149696746, "grad_norm": 25.625, "learning_rate": 0.0006276702061211744, "loss": 16.2528, "step": 10050 }, { "epoch": 0.031465299162382804, "grad_norm": 19.5, "learning_rate": 0.0006292317301686447, "loss": 16.3618, "step": 10075 }, { "epoch": 0.03154337682779815, "grad_norm": 19.25, "learning_rate": 0.0006307932542161149, "loss": 16.4928, "step": 10100 }, { "epoch": 0.03162145449321349, "grad_norm": 20.125, "learning_rate": 0.0006323547782635852, "loss": 16.4938, "step": 10125 }, { "epoch": 0.03169953215862883, "grad_norm": 22.5, "learning_rate": 0.0006339163023110555, "loss": 16.5037, "step": 10150 }, { "epoch": 0.03177760982404417, "grad_norm": 22.0, "learning_rate": 0.000635477826358526, "loss": 16.5408, "step": 10175 }, { "epoch": 0.031855687489459515, "grad_norm": 22.875, "learning_rate": 0.0006370393504059963, "loss": 16.6573, "step": 10200 }, { "epoch": 0.03193376515487486, "grad_norm": 20.625, "learning_rate": 0.0006386008744534666, "loss": 16.608, "step": 10225 }, { "epoch": 0.0320118428202902, "grad_norm": 18.375, "learning_rate": 0.0006401623985009369, "loss": 16.6253, "step": 10250 }, { "epoch": 0.03208992048570554, "grad_norm": 22.75, "learning_rate": 0.0006417239225484073, "loss": 16.7264, "step": 10275 }, { "epoch": 0.03216799815112088, "grad_norm": 22.75, "learning_rate": 0.0006432854465958776, "loss": 16.7937, "step": 10300 }, { "epoch": 0.03224607581653623, "grad_norm": 18.375, "learning_rate": 0.000644846970643348, "loss": 16.8422, "step": 10325 }, { "epoch": 0.03232415348195157, "grad_norm": 19.625, "learning_rate": 0.0006464084946908183, "loss": 16.8992, "step": 10350 }, { "epoch": 0.03240223114736691, "grad_norm": 18.375, "learning_rate": 0.0006479700187382886, "loss": 16.7641, "step": 10375 }, { "epoch": 0.03248030881278225, "grad_norm": 19.25, "learning_rate": 0.0006495315427857589, "loss": 16.9004, "step": 10400 }, { "epoch": 0.032558386478197594, "grad_norm": 20.0, "learning_rate": 0.0006510930668332292, "loss": 16.8661, "step": 10425 }, { "epoch": 0.03263646414361294, "grad_norm": 18.5, "learning_rate": 0.0006526545908806995, "loss": 16.717, "step": 10450 }, { "epoch": 0.032714541809028275, "grad_norm": 21.25, "learning_rate": 0.00065421611492817, "loss": 16.7414, "step": 10475 }, { "epoch": 0.03279261947444362, "grad_norm": 19.0, "learning_rate": 0.0006557776389756403, "loss": 16.751, "step": 10500 }, { "epoch": 0.03287069713985896, "grad_norm": 19.625, "learning_rate": 0.0006573391630231106, "loss": 16.761, "step": 10525 }, { "epoch": 0.032948774805274306, "grad_norm": 18.625, "learning_rate": 0.0006589006870705809, "loss": 16.7134, "step": 10550 }, { "epoch": 0.03302685247068964, "grad_norm": 19.5, "learning_rate": 0.0006604622111180513, "loss": 16.8445, "step": 10575 }, { "epoch": 0.033104930136104986, "grad_norm": 18.125, "learning_rate": 0.0006620237351655216, "loss": 16.923, "step": 10600 }, { "epoch": 0.03318300780152033, "grad_norm": 21.75, "learning_rate": 0.0006635852592129918, "loss": 17.0445, "step": 10625 }, { "epoch": 0.03326108546693567, "grad_norm": 20.5, "learning_rate": 0.0006651467832604622, "loss": 17.1396, "step": 10650 }, { "epoch": 0.03333916313235101, "grad_norm": 20.5, "learning_rate": 0.0006667083073079325, "loss": 17.0307, "step": 10675 }, { "epoch": 0.033417240797766354, "grad_norm": 18.875, "learning_rate": 0.0006682698313554029, "loss": 17.0284, "step": 10700 }, { "epoch": 0.0334953184631817, "grad_norm": 18.75, "learning_rate": 0.0006698313554028732, "loss": 17.1508, "step": 10725 }, { "epoch": 0.03357339612859704, "grad_norm": 24.625, "learning_rate": 0.0006713928794503435, "loss": 17.1699, "step": 10750 }, { "epoch": 0.03365147379401238, "grad_norm": 29.0, "learning_rate": 0.0006729544034978139, "loss": 17.2827, "step": 10775 }, { "epoch": 0.03372955145942772, "grad_norm": 22.25, "learning_rate": 0.0006745159275452843, "loss": 17.4336, "step": 10800 }, { "epoch": 0.033807629124843065, "grad_norm": 19.75, "learning_rate": 0.0006760774515927546, "loss": 17.2731, "step": 10825 }, { "epoch": 0.03388570679025841, "grad_norm": 20.875, "learning_rate": 0.0006776389756402249, "loss": 17.3691, "step": 10850 }, { "epoch": 0.033963784455673746, "grad_norm": 20.875, "learning_rate": 0.0006792004996876951, "loss": 17.5287, "step": 10875 }, { "epoch": 0.03404186212108909, "grad_norm": 19.25, "learning_rate": 0.0006807620237351655, "loss": 17.5784, "step": 10900 }, { "epoch": 0.03411993978650443, "grad_norm": 26.625, "learning_rate": 0.0006823235477826359, "loss": 17.5465, "step": 10925 }, { "epoch": 0.03419801745191978, "grad_norm": 21.75, "learning_rate": 0.0006838850718301062, "loss": 17.5995, "step": 10950 }, { "epoch": 0.03427609511733511, "grad_norm": 20.625, "learning_rate": 0.0006854465958775765, "loss": 17.8194, "step": 10975 }, { "epoch": 0.03435417278275046, "grad_norm": 21.125, "learning_rate": 0.0006870081199250469, "loss": 17.8228, "step": 11000 }, { "epoch": 0.03435417278275046, "eval_loss": 17.8679141998291, "eval_runtime": 102.2521, "eval_samples_per_second": 50.884, "eval_steps_per_second": 3.188, "step": 11000 }, { "epoch": 0.0344322504481658, "grad_norm": 20.25, "learning_rate": 0.0006885696439725172, "loss": 17.8434, "step": 11025 }, { "epoch": 0.034510328113581144, "grad_norm": 21.25, "learning_rate": 0.0006901311680199875, "loss": 17.8783, "step": 11050 }, { "epoch": 0.03458840577899648, "grad_norm": 24.25, "learning_rate": 0.0006916926920674579, "loss": 18.0649, "step": 11075 }, { "epoch": 0.034666483444411825, "grad_norm": 20.625, "learning_rate": 0.0006932542161149283, "loss": 18.0142, "step": 11100 }, { "epoch": 0.03474456110982717, "grad_norm": 21.25, "learning_rate": 0.0006948157401623986, "loss": 18.1228, "step": 11125 }, { "epoch": 0.03482263877524251, "grad_norm": 22.625, "learning_rate": 0.0006963772642098688, "loss": 18.2188, "step": 11150 }, { "epoch": 0.03490071644065785, "grad_norm": 24.625, "learning_rate": 0.0006979387882573391, "loss": 18.5369, "step": 11175 }, { "epoch": 0.03497879410607319, "grad_norm": 23.625, "learning_rate": 0.0006995003123048094, "loss": 18.6513, "step": 11200 }, { "epoch": 0.035056871771488536, "grad_norm": 23.125, "learning_rate": 0.0007010618363522799, "loss": 18.6154, "step": 11225 }, { "epoch": 0.03513494943690388, "grad_norm": 23.75, "learning_rate": 0.0007026233603997502, "loss": 18.5765, "step": 11250 }, { "epoch": 0.035213027102319217, "grad_norm": 24.25, "learning_rate": 0.0007041848844472205, "loss": 18.6452, "step": 11275 }, { "epoch": 0.03529110476773456, "grad_norm": 23.5, "learning_rate": 0.0007057464084946908, "loss": 18.5797, "step": 11300 }, { "epoch": 0.035369182433149904, "grad_norm": 25.0, "learning_rate": 0.0007073079325421612, "loss": 18.5652, "step": 11325 }, { "epoch": 0.03544726009856525, "grad_norm": 29.625, "learning_rate": 0.0007088694565896315, "loss": 18.623, "step": 11350 }, { "epoch": 0.035525337763980584, "grad_norm": 25.875, "learning_rate": 0.0007104309806371019, "loss": 18.6827, "step": 11375 }, { "epoch": 0.03560341542939593, "grad_norm": 25.0, "learning_rate": 0.0007119925046845721, "loss": 18.8077, "step": 11400 }, { "epoch": 0.03568149309481127, "grad_norm": 26.75, "learning_rate": 0.0007135540287320425, "loss": 18.7854, "step": 11425 }, { "epoch": 0.035759570760226615, "grad_norm": 26.125, "learning_rate": 0.0007151155527795128, "loss": 18.8622, "step": 11450 }, { "epoch": 0.03583764842564195, "grad_norm": 24.125, "learning_rate": 0.0007166770768269831, "loss": 18.8723, "step": 11475 }, { "epoch": 0.035915726091057296, "grad_norm": 23.75, "learning_rate": 0.0007182386008744534, "loss": 18.8899, "step": 11500 }, { "epoch": 0.03599380375647264, "grad_norm": 24.375, "learning_rate": 0.0007198001249219239, "loss": 18.9028, "step": 11525 }, { "epoch": 0.03607188142188798, "grad_norm": 24.5, "learning_rate": 0.0007213616489693942, "loss": 18.91, "step": 11550 }, { "epoch": 0.03614995908730332, "grad_norm": 24.625, "learning_rate": 0.0007229231730168645, "loss": 18.8747, "step": 11575 }, { "epoch": 0.03622803675271866, "grad_norm": 27.375, "learning_rate": 0.0007244846970643348, "loss": 18.7471, "step": 11600 }, { "epoch": 0.03630611441813401, "grad_norm": 21.375, "learning_rate": 0.0007260462211118051, "loss": 18.8455, "step": 11625 }, { "epoch": 0.03638419208354935, "grad_norm": 23.375, "learning_rate": 0.0007276077451592754, "loss": 18.8257, "step": 11650 }, { "epoch": 0.03646226974896469, "grad_norm": 19.875, "learning_rate": 0.0007291692692067458, "loss": 18.8365, "step": 11675 }, { "epoch": 0.03654034741438003, "grad_norm": 24.375, "learning_rate": 0.0007307307932542161, "loss": 18.847, "step": 11700 }, { "epoch": 0.036618425079795375, "grad_norm": 23.5, "learning_rate": 0.0007322923173016864, "loss": 18.9123, "step": 11725 }, { "epoch": 0.03669650274521072, "grad_norm": 22.0, "learning_rate": 0.0007338538413491568, "loss": 19.0867, "step": 11750 }, { "epoch": 0.036774580410626055, "grad_norm": 21.75, "learning_rate": 0.0007354153653966271, "loss": 19.0067, "step": 11775 }, { "epoch": 0.0368526580760414, "grad_norm": 22.5, "learning_rate": 0.0007369768894440974, "loss": 19.1682, "step": 11800 }, { "epoch": 0.03693073574145674, "grad_norm": 23.625, "learning_rate": 0.0007385384134915678, "loss": 19.3881, "step": 11825 }, { "epoch": 0.037008813406872086, "grad_norm": 22.5, "learning_rate": 0.0007400999375390382, "loss": 19.3211, "step": 11850 }, { "epoch": 0.03708689107228742, "grad_norm": 24.875, "learning_rate": 0.0007416614615865085, "loss": 19.4715, "step": 11875 }, { "epoch": 0.03716496873770277, "grad_norm": 23.125, "learning_rate": 0.0007432229856339788, "loss": 19.4858, "step": 11900 }, { "epoch": 0.03724304640311811, "grad_norm": 22.75, "learning_rate": 0.000744784509681449, "loss": 19.7193, "step": 11925 }, { "epoch": 0.037321124068533454, "grad_norm": 21.375, "learning_rate": 0.0007463460337289195, "loss": 19.6023, "step": 11950 }, { "epoch": 0.03739920173394879, "grad_norm": 27.375, "learning_rate": 0.0007479075577763898, "loss": 19.6003, "step": 11975 }, { "epoch": 0.037477279399364134, "grad_norm": 23.75, "learning_rate": 0.0007494690818238601, "loss": 19.7391, "step": 12000 }, { "epoch": 0.037477279399364134, "eval_loss": 19.823862075805664, "eval_runtime": 102.3056, "eval_samples_per_second": 50.857, "eval_steps_per_second": 3.187, "step": 12000 }, { "epoch": 0.03755535706477948, "grad_norm": 23.0, "learning_rate": 0.0007510306058713304, "loss": 19.8624, "step": 12025 }, { "epoch": 0.03763343473019482, "grad_norm": 22.5, "learning_rate": 0.0007525921299188008, "loss": 19.7457, "step": 12050 }, { "epoch": 0.03771151239561016, "grad_norm": 23.375, "learning_rate": 0.0007541536539662711, "loss": 19.83, "step": 12075 }, { "epoch": 0.0377895900610255, "grad_norm": 25.625, "learning_rate": 0.0007557151780137415, "loss": 19.8478, "step": 12100 }, { "epoch": 0.037867667726440846, "grad_norm": 23.5, "learning_rate": 0.0007572767020612118, "loss": 19.8336, "step": 12125 }, { "epoch": 0.03794574539185619, "grad_norm": 24.75, "learning_rate": 0.0007588382261086821, "loss": 19.9314, "step": 12150 }, { "epoch": 0.038023823057271526, "grad_norm": 22.375, "learning_rate": 0.0007603997501561524, "loss": 19.9643, "step": 12175 }, { "epoch": 0.03810190072268687, "grad_norm": 25.0, "learning_rate": 0.0007619612742036227, "loss": 19.9156, "step": 12200 }, { "epoch": 0.03817997838810221, "grad_norm": 22.125, "learning_rate": 0.000763522798251093, "loss": 19.9121, "step": 12225 }, { "epoch": 0.03825805605351756, "grad_norm": 23.0, "learning_rate": 0.0007650843222985633, "loss": 19.7906, "step": 12250 }, { "epoch": 0.038336133718932894, "grad_norm": 23.625, "learning_rate": 0.0007666458463460338, "loss": 19.911, "step": 12275 }, { "epoch": 0.03841421138434824, "grad_norm": 25.125, "learning_rate": 0.0007682073703935041, "loss": 19.7616, "step": 12300 }, { "epoch": 0.03849228904976358, "grad_norm": 26.375, "learning_rate": 0.0007697688944409744, "loss": 20.0897, "step": 12325 }, { "epoch": 0.038570366715178925, "grad_norm": 23.0, "learning_rate": 0.0007713304184884447, "loss": 19.9698, "step": 12350 }, { "epoch": 0.03864844438059426, "grad_norm": 26.75, "learning_rate": 0.0007728919425359151, "loss": 20.0703, "step": 12375 }, { "epoch": 0.038726522046009605, "grad_norm": 25.25, "learning_rate": 0.0007744534665833855, "loss": 20.0981, "step": 12400 }, { "epoch": 0.03880459971142495, "grad_norm": 30.125, "learning_rate": 0.0007760149906308557, "loss": 20.1253, "step": 12425 }, { "epoch": 0.03888267737684029, "grad_norm": 27.125, "learning_rate": 0.000777576514678326, "loss": 20.3167, "step": 12450 }, { "epoch": 0.03896075504225563, "grad_norm": 26.5, "learning_rate": 0.0007791380387257964, "loss": 20.4952, "step": 12475 }, { "epoch": 0.03903883270767097, "grad_norm": 32.75, "learning_rate": 0.0007806995627732667, "loss": 20.614, "step": 12500 }, { "epoch": 0.03911691037308632, "grad_norm": 28.0, "learning_rate": 0.000782261086820737, "loss": 20.8281, "step": 12525 }, { "epoch": 0.03919498803850166, "grad_norm": 27.625, "learning_rate": 0.0007838226108682074, "loss": 20.8424, "step": 12550 }, { "epoch": 0.039273065703917, "grad_norm": 37.25, "learning_rate": 0.0007853841349156778, "loss": 20.9891, "step": 12575 }, { "epoch": 0.03935114336933234, "grad_norm": 28.75, "learning_rate": 0.0007869456589631481, "loss": 20.8581, "step": 12600 }, { "epoch": 0.039429221034747684, "grad_norm": 27.375, "learning_rate": 0.0007885071830106184, "loss": 20.8655, "step": 12625 }, { "epoch": 0.03950729870016303, "grad_norm": 29.5, "learning_rate": 0.0007900687070580887, "loss": 21.0043, "step": 12650 }, { "epoch": 0.039585376365578365, "grad_norm": 33.5, "learning_rate": 0.000791630231105559, "loss": 21.1795, "step": 12675 }, { "epoch": 0.03966345403099371, "grad_norm": 28.625, "learning_rate": 0.0007931917551530294, "loss": 21.1842, "step": 12700 }, { "epoch": 0.03974153169640905, "grad_norm": 25.125, "learning_rate": 0.0007947532792004997, "loss": 21.1583, "step": 12725 }, { "epoch": 0.039819609361824396, "grad_norm": 28.875, "learning_rate": 0.00079631480324797, "loss": 21.2881, "step": 12750 }, { "epoch": 0.03989768702723974, "grad_norm": 30.875, "learning_rate": 0.0007978763272954403, "loss": 21.265, "step": 12775 }, { "epoch": 0.039975764692655076, "grad_norm": 30.25, "learning_rate": 0.0007994378513429107, "loss": 21.3535, "step": 12800 }, { "epoch": 0.04005384235807042, "grad_norm": 29.75, "learning_rate": 0.000800999375390381, "loss": 21.4447, "step": 12825 }, { "epoch": 0.04013192002348576, "grad_norm": 30.75, "learning_rate": 0.0008025608994378514, "loss": 21.5344, "step": 12850 }, { "epoch": 0.04020999768890111, "grad_norm": 33.75, "learning_rate": 0.0008041224234853217, "loss": 21.3934, "step": 12875 }, { "epoch": 0.040288075354316444, "grad_norm": 45.0, "learning_rate": 0.0008056839475327921, "loss": 21.7891, "step": 12900 }, { "epoch": 0.04036615301973179, "grad_norm": 28.125, "learning_rate": 0.0008072454715802624, "loss": 22.0609, "step": 12925 }, { "epoch": 0.04044423068514713, "grad_norm": 32.0, "learning_rate": 0.0008088069956277326, "loss": 21.915, "step": 12950 }, { "epoch": 0.040522308350562475, "grad_norm": 27.875, "learning_rate": 0.0008103685196752029, "loss": 21.9726, "step": 12975 }, { "epoch": 0.04060038601597781, "grad_norm": 27.625, "learning_rate": 0.0008119300437226734, "loss": 21.9425, "step": 13000 }, { "epoch": 0.04060038601597781, "eval_loss": 22.01194190979004, "eval_runtime": 102.3317, "eval_samples_per_second": 50.844, "eval_steps_per_second": 3.186, "step": 13000 }, { "epoch": 0.040678463681393155, "grad_norm": 35.75, "learning_rate": 0.0008134915677701437, "loss": 21.8983, "step": 13025 }, { "epoch": 0.0407565413468085, "grad_norm": 31.375, "learning_rate": 0.000815053091817614, "loss": 22.2354, "step": 13050 }, { "epoch": 0.04083461901222384, "grad_norm": 28.375, "learning_rate": 0.0008166146158650843, "loss": 22.3099, "step": 13075 }, { "epoch": 0.04091269667763918, "grad_norm": 34.5, "learning_rate": 0.0008181761399125547, "loss": 22.3739, "step": 13100 }, { "epoch": 0.04099077434305452, "grad_norm": 29.5, "learning_rate": 0.000819737663960025, "loss": 22.4598, "step": 13125 }, { "epoch": 0.04106885200846987, "grad_norm": 32.25, "learning_rate": 0.0008212991880074954, "loss": 22.7993, "step": 13150 }, { "epoch": 0.04114692967388521, "grad_norm": 31.375, "learning_rate": 0.0008228607120549657, "loss": 22.7376, "step": 13175 }, { "epoch": 0.04122500733930055, "grad_norm": 31.5, "learning_rate": 0.0008244222361024359, "loss": 22.6221, "step": 13200 }, { "epoch": 0.04130308500471589, "grad_norm": 29.375, "learning_rate": 0.0008259837601499063, "loss": 22.6237, "step": 13225 }, { "epoch": 0.041381162670131234, "grad_norm": 28.0, "learning_rate": 0.0008275452841973766, "loss": 22.4565, "step": 13250 }, { "epoch": 0.04145924033554658, "grad_norm": 28.875, "learning_rate": 0.0008291068082448469, "loss": 22.4236, "step": 13275 }, { "epoch": 0.041537318000961915, "grad_norm": 26.625, "learning_rate": 0.0008306683322923173, "loss": 22.4627, "step": 13300 }, { "epoch": 0.04161539566637726, "grad_norm": 30.75, "learning_rate": 0.0008322298563397877, "loss": 22.5395, "step": 13325 }, { "epoch": 0.0416934733317926, "grad_norm": 29.375, "learning_rate": 0.000833791380387258, "loss": 22.4437, "step": 13350 }, { "epoch": 0.041771550997207946, "grad_norm": 26.375, "learning_rate": 0.0008353529044347283, "loss": 22.5234, "step": 13375 }, { "epoch": 0.04184962866262328, "grad_norm": 28.0, "learning_rate": 0.0008369144284821986, "loss": 22.9237, "step": 13400 }, { "epoch": 0.041927706328038626, "grad_norm": 32.0, "learning_rate": 0.0008384759525296691, "loss": 22.9402, "step": 13425 }, { "epoch": 0.04200578399345397, "grad_norm": 30.375, "learning_rate": 0.0008400374765771394, "loss": 23.1061, "step": 13450 }, { "epoch": 0.04208386165886931, "grad_norm": 32.0, "learning_rate": 0.0008415990006246096, "loss": 22.9162, "step": 13475 }, { "epoch": 0.04216193932428465, "grad_norm": 30.25, "learning_rate": 0.0008431605246720799, "loss": 23.2072, "step": 13500 }, { "epoch": 0.042240016989699994, "grad_norm": 31.375, "learning_rate": 0.0008447220487195503, "loss": 23.2287, "step": 13525 }, { "epoch": 0.04231809465511534, "grad_norm": 29.5, "learning_rate": 0.0008462835727670206, "loss": 23.1901, "step": 13550 }, { "epoch": 0.04239617232053068, "grad_norm": 28.25, "learning_rate": 0.000847845096814491, "loss": 23.3087, "step": 13575 }, { "epoch": 0.04247424998594602, "grad_norm": 33.75, "learning_rate": 0.0008494066208619613, "loss": 23.5178, "step": 13600 }, { "epoch": 0.04255232765136136, "grad_norm": 27.875, "learning_rate": 0.0008509681449094317, "loss": 23.4003, "step": 13625 }, { "epoch": 0.042630405316776705, "grad_norm": 26.25, "learning_rate": 0.000852529668956902, "loss": 23.4554, "step": 13650 }, { "epoch": 0.04270848298219205, "grad_norm": 24.875, "learning_rate": 0.0008540911930043723, "loss": 23.3269, "step": 13675 }, { "epoch": 0.042786560647607386, "grad_norm": 24.875, "learning_rate": 0.0008556527170518426, "loss": 23.2309, "step": 13700 }, { "epoch": 0.04286463831302273, "grad_norm": 29.375, "learning_rate": 0.0008572142410993128, "loss": 23.0514, "step": 13725 }, { "epoch": 0.04294271597843807, "grad_norm": 27.125, "learning_rate": 0.0008587757651467833, "loss": 22.984, "step": 13750 }, { "epoch": 0.04302079364385342, "grad_norm": 30.25, "learning_rate": 0.0008603372891942536, "loss": 22.9465, "step": 13775 }, { "epoch": 0.04309887130926875, "grad_norm": 30.0, "learning_rate": 0.0008618988132417239, "loss": 23.0119, "step": 13800 }, { "epoch": 0.0431769489746841, "grad_norm": 30.25, "learning_rate": 0.0008634603372891942, "loss": 22.9153, "step": 13825 }, { "epoch": 0.04325502664009944, "grad_norm": 25.25, "learning_rate": 0.0008650218613366646, "loss": 23.1027, "step": 13850 }, { "epoch": 0.043333104305514784, "grad_norm": 33.75, "learning_rate": 0.000866583385384135, "loss": 23.0265, "step": 13875 }, { "epoch": 0.04341118197093012, "grad_norm": 29.625, "learning_rate": 0.0008681449094316053, "loss": 23.1209, "step": 13900 }, { "epoch": 0.043489259636345465, "grad_norm": 30.0, "learning_rate": 0.0008697064334790756, "loss": 23.2931, "step": 13925 }, { "epoch": 0.04356733730176081, "grad_norm": 31.25, "learning_rate": 0.000871267957526546, "loss": 23.6223, "step": 13950 }, { "epoch": 0.04364541496717615, "grad_norm": 29.125, "learning_rate": 0.0008728294815740162, "loss": 23.4989, "step": 13975 }, { "epoch": 0.04372349263259149, "grad_norm": 30.125, "learning_rate": 0.0008743910056214865, "loss": 23.923, "step": 14000 }, { "epoch": 0.04372349263259149, "eval_loss": 23.799776077270508, "eval_runtime": 102.2075, "eval_samples_per_second": 50.906, "eval_steps_per_second": 3.19, "step": 14000 }, { "epoch": 0.04380157029800683, "grad_norm": 32.0, "learning_rate": 0.0008759525296689569, "loss": 23.9569, "step": 14025 }, { "epoch": 0.043879647963422176, "grad_norm": 30.75, "learning_rate": 0.0008775140537164273, "loss": 23.764, "step": 14050 }, { "epoch": 0.04395772562883752, "grad_norm": 29.75, "learning_rate": 0.0008790755777638976, "loss": 23.4492, "step": 14075 }, { "epoch": 0.04403580329425286, "grad_norm": 28.125, "learning_rate": 0.0008806371018113679, "loss": 23.5056, "step": 14100 }, { "epoch": 0.0441138809596682, "grad_norm": 31.25, "learning_rate": 0.0008821986258588382, "loss": 23.7418, "step": 14125 }, { "epoch": 0.044191958625083544, "grad_norm": 31.0, "learning_rate": 0.0008837601499063086, "loss": 23.7158, "step": 14150 }, { "epoch": 0.04427003629049889, "grad_norm": 35.25, "learning_rate": 0.000885321673953779, "loss": 24.0083, "step": 14175 }, { "epoch": 0.044348113955914224, "grad_norm": 34.75, "learning_rate": 0.0008868831980012493, "loss": 23.95, "step": 14200 }, { "epoch": 0.04442619162132957, "grad_norm": 34.5, "learning_rate": 0.0008884447220487196, "loss": 24.0242, "step": 14225 }, { "epoch": 0.04450426928674491, "grad_norm": 34.0, "learning_rate": 0.0008900062460961898, "loss": 24.2818, "step": 14250 }, { "epoch": 0.044582346952160255, "grad_norm": 32.25, "learning_rate": 0.0008915677701436602, "loss": 24.4323, "step": 14275 }, { "epoch": 0.04466042461757559, "grad_norm": 31.125, "learning_rate": 0.0008931292941911305, "loss": 24.8361, "step": 14300 }, { "epoch": 0.044738502282990936, "grad_norm": 32.5, "learning_rate": 0.0008946908182386009, "loss": 24.8959, "step": 14325 }, { "epoch": 0.04481657994840628, "grad_norm": 30.75, "learning_rate": 0.0008962523422860712, "loss": 24.7795, "step": 14350 }, { "epoch": 0.04489465761382162, "grad_norm": 45.75, "learning_rate": 0.0008978138663335416, "loss": 24.9758, "step": 14375 }, { "epoch": 0.04497273527923696, "grad_norm": 31.625, "learning_rate": 0.0008993753903810119, "loss": 25.0523, "step": 14400 }, { "epoch": 0.0450508129446523, "grad_norm": 36.25, "learning_rate": 0.0009009369144284822, "loss": 24.8775, "step": 14425 }, { "epoch": 0.04512889061006765, "grad_norm": 34.25, "learning_rate": 0.0009024984384759525, "loss": 24.9395, "step": 14450 }, { "epoch": 0.04520696827548299, "grad_norm": 36.5, "learning_rate": 0.000904059962523423, "loss": 25.0047, "step": 14475 }, { "epoch": 0.04528504594089833, "grad_norm": 39.75, "learning_rate": 0.0009056214865708932, "loss": 24.9923, "step": 14500 }, { "epoch": 0.04536312360631367, "grad_norm": 32.75, "learning_rate": 0.0009071830106183635, "loss": 25.1583, "step": 14525 }, { "epoch": 0.045441201271729015, "grad_norm": 31.25, "learning_rate": 0.0009087445346658338, "loss": 25.1936, "step": 14550 }, { "epoch": 0.04551927893714436, "grad_norm": 31.75, "learning_rate": 0.0009103060587133042, "loss": 24.9059, "step": 14575 }, { "epoch": 0.045597356602559695, "grad_norm": 34.0, "learning_rate": 0.0009118675827607745, "loss": 25.1417, "step": 14600 }, { "epoch": 0.04567543426797504, "grad_norm": 32.25, "learning_rate": 0.0009134291068082449, "loss": 25.2183, "step": 14625 }, { "epoch": 0.04575351193339038, "grad_norm": 35.25, "learning_rate": 0.0009149906308557152, "loss": 25.3087, "step": 14650 }, { "epoch": 0.045831589598805726, "grad_norm": 31.5, "learning_rate": 0.0009165521549031856, "loss": 25.6569, "step": 14675 }, { "epoch": 0.04590966726422106, "grad_norm": 40.5, "learning_rate": 0.0009181136789506559, "loss": 25.9421, "step": 14700 }, { "epoch": 0.04598774492963641, "grad_norm": 39.75, "learning_rate": 0.0009196752029981262, "loss": 26.0395, "step": 14725 }, { "epoch": 0.04606582259505175, "grad_norm": 41.25, "learning_rate": 0.0009212367270455964, "loss": 26.0641, "step": 14750 }, { "epoch": 0.046143900260467094, "grad_norm": 39.5, "learning_rate": 0.0009227982510930668, "loss": 26.1332, "step": 14775 }, { "epoch": 0.04622197792588243, "grad_norm": 36.5, "learning_rate": 0.0009243597751405372, "loss": 26.102, "step": 14800 }, { "epoch": 0.046300055591297774, "grad_norm": 33.75, "learning_rate": 0.0009259212991880075, "loss": 26.1986, "step": 14825 }, { "epoch": 0.04637813325671312, "grad_norm": 36.5, "learning_rate": 0.0009274828232354778, "loss": 26.0567, "step": 14850 }, { "epoch": 0.04645621092212846, "grad_norm": 38.0, "learning_rate": 0.0009290443472829481, "loss": 26.2836, "step": 14875 }, { "epoch": 0.0465342885875438, "grad_norm": 36.0, "learning_rate": 0.0009306058713304186, "loss": 26.6167, "step": 14900 }, { "epoch": 0.04661236625295914, "grad_norm": 44.25, "learning_rate": 0.0009321673953778889, "loss": 26.4313, "step": 14925 }, { "epoch": 0.046690443918374486, "grad_norm": 36.25, "learning_rate": 0.0009337289194253592, "loss": 26.1888, "step": 14950 }, { "epoch": 0.04676852158378983, "grad_norm": 37.5, "learning_rate": 0.0009352904434728295, "loss": 26.2063, "step": 14975 }, { "epoch": 0.046846599249205166, "grad_norm": 36.25, "learning_rate": 0.0009368519675202999, "loss": 26.3716, "step": 15000 }, { "epoch": 0.046846599249205166, "eval_loss": 26.39820098876953, "eval_runtime": 102.1335, "eval_samples_per_second": 50.943, "eval_steps_per_second": 3.192, "step": 15000 }, { "epoch": 0.04692467691462051, "grad_norm": 35.5, "learning_rate": 0.0009384134915677701, "loss": 26.4646, "step": 15025 }, { "epoch": 0.04700275458003585, "grad_norm": 37.25, "learning_rate": 0.0009399750156152404, "loss": 26.457, "step": 15050 }, { "epoch": 0.0470808322454512, "grad_norm": 43.25, "learning_rate": 0.0009415365396627108, "loss": 26.4532, "step": 15075 }, { "epoch": 0.047158909910866534, "grad_norm": 34.75, "learning_rate": 0.0009430980637101812, "loss": 26.32, "step": 15100 }, { "epoch": 0.04723698757628188, "grad_norm": 58.5, "learning_rate": 0.0009446595877576515, "loss": 26.367, "step": 15125 }, { "epoch": 0.04731506524169722, "grad_norm": 44.25, "learning_rate": 0.0009462211118051218, "loss": 26.4783, "step": 15150 }, { "epoch": 0.047393142907112565, "grad_norm": 35.25, "learning_rate": 0.0009477826358525921, "loss": 26.3163, "step": 15175 }, { "epoch": 0.0474712205725279, "grad_norm": 36.0, "learning_rate": 0.0009493441599000626, "loss": 26.6294, "step": 15200 }, { "epoch": 0.047549298237943245, "grad_norm": 38.25, "learning_rate": 0.0009509056839475329, "loss": 26.6693, "step": 15225 }, { "epoch": 0.04762737590335859, "grad_norm": 42.25, "learning_rate": 0.0009524672079950032, "loss": 26.9737, "step": 15250 }, { "epoch": 0.04770545356877393, "grad_norm": 33.75, "learning_rate": 0.0009540287320424734, "loss": 26.9355, "step": 15275 }, { "epoch": 0.04778353123418927, "grad_norm": 37.75, "learning_rate": 0.0009555902560899437, "loss": 27.0918, "step": 15300 }, { "epoch": 0.04786160889960461, "grad_norm": 37.5, "learning_rate": 0.0009571517801374141, "loss": 27.2465, "step": 15325 }, { "epoch": 0.04793968656501996, "grad_norm": 35.25, "learning_rate": 0.0009587133041848845, "loss": 27.1683, "step": 15350 }, { "epoch": 0.0480177642304353, "grad_norm": 35.75, "learning_rate": 0.0009602748282323548, "loss": 27.0435, "step": 15375 }, { "epoch": 0.048095841895850644, "grad_norm": 39.0, "learning_rate": 0.0009618363522798251, "loss": 27.2943, "step": 15400 }, { "epoch": 0.04817391956126598, "grad_norm": 37.5, "learning_rate": 0.0009633978763272955, "loss": 27.1815, "step": 15425 }, { "epoch": 0.048251997226681324, "grad_norm": 38.75, "learning_rate": 0.0009649594003747658, "loss": 27.2386, "step": 15450 }, { "epoch": 0.04833007489209667, "grad_norm": 43.0, "learning_rate": 0.0009665209244222361, "loss": 27.5126, "step": 15475 }, { "epoch": 0.04840815255751201, "grad_norm": 44.75, "learning_rate": 0.0009680824484697065, "loss": 27.6576, "step": 15500 }, { "epoch": 0.04848623022292735, "grad_norm": 39.75, "learning_rate": 0.0009696439725171768, "loss": 27.6394, "step": 15525 }, { "epoch": 0.04856430788834269, "grad_norm": 46.5, "learning_rate": 0.0009712054965646471, "loss": 27.9862, "step": 15550 }, { "epoch": 0.048642385553758036, "grad_norm": 36.5, "learning_rate": 0.0009727670206121174, "loss": 27.6303, "step": 15575 }, { "epoch": 0.04872046321917338, "grad_norm": 36.25, "learning_rate": 0.0009743285446595877, "loss": 27.6376, "step": 15600 }, { "epoch": 0.048798540884588716, "grad_norm": 37.25, "learning_rate": 0.0009758900687070581, "loss": 27.792, "step": 15625 }, { "epoch": 0.04887661855000406, "grad_norm": 37.0, "learning_rate": 0.0009774515927545285, "loss": 27.8976, "step": 15650 }, { "epoch": 0.048954696215419403, "grad_norm": 44.75, "learning_rate": 0.0009790131168019988, "loss": 28.1314, "step": 15675 }, { "epoch": 0.04903277388083475, "grad_norm": 41.5, "learning_rate": 0.000980574640849469, "loss": 28.1346, "step": 15700 }, { "epoch": 0.049110851546250084, "grad_norm": 42.0, "learning_rate": 0.0009821361648969394, "loss": 28.3701, "step": 15725 }, { "epoch": 0.04918892921166543, "grad_norm": 38.5, "learning_rate": 0.0009836976889444097, "loss": 28.2846, "step": 15750 }, { "epoch": 0.04926700687708077, "grad_norm": 39.75, "learning_rate": 0.0009852592129918803, "loss": 28.4163, "step": 15775 }, { "epoch": 0.049345084542496115, "grad_norm": 37.25, "learning_rate": 0.0009868207370393504, "loss": 28.4691, "step": 15800 }, { "epoch": 0.04942316220791145, "grad_norm": 40.75, "learning_rate": 0.0009883822610868207, "loss": 28.3626, "step": 15825 }, { "epoch": 0.049501239873326795, "grad_norm": 38.75, "learning_rate": 0.000989943785134291, "loss": 28.2031, "step": 15850 }, { "epoch": 0.04957931753874214, "grad_norm": 54.25, "learning_rate": 0.0009915053091817613, "loss": 28.2261, "step": 15875 }, { "epoch": 0.04965739520415748, "grad_norm": 37.0, "learning_rate": 0.0009930668332292318, "loss": 27.9449, "step": 15900 }, { "epoch": 0.04973547286957282, "grad_norm": 34.25, "learning_rate": 0.0009946283572767022, "loss": 27.9998, "step": 15925 }, { "epoch": 0.04981355053498816, "grad_norm": 38.5, "learning_rate": 0.0009961898813241725, "loss": 27.9713, "step": 15950 }, { "epoch": 0.04989162820040351, "grad_norm": 34.5, "learning_rate": 0.0009977514053716428, "loss": 28.3091, "step": 15975 }, { "epoch": 0.04996970586581885, "grad_norm": 39.75, "learning_rate": 0.000999312929419113, "loss": 28.207, "step": 16000 }, { "epoch": 0.04996970586581885, "eval_loss": 28.333789825439453, "eval_runtime": 102.3237, "eval_samples_per_second": 50.848, "eval_steps_per_second": 3.186, "step": 16000 }, { "epoch": 0.05004778353123419, "grad_norm": 37.0, "learning_rate": 0.000999999994773354, "loss": 28.2375, "step": 16025 }, { "epoch": 0.05012586119664953, "grad_norm": 45.5, "learning_rate": 0.0009999999594401602, "loss": 28.1327, "step": 16050 }, { "epoch": 0.050203938862064874, "grad_norm": 42.75, "learning_rate": 0.0009999998907737678, "loss": 28.3186, "step": 16075 }, { "epoch": 0.05028201652748022, "grad_norm": 35.0, "learning_rate": 0.0009999997887741804, "loss": 27.9558, "step": 16100 }, { "epoch": 0.050360094192895555, "grad_norm": 47.75, "learning_rate": 0.0009999996534414057, "loss": 28.2493, "step": 16125 }, { "epoch": 0.0504381718583109, "grad_norm": 40.25, "learning_rate": 0.000999999484775452, "loss": 28.1951, "step": 16150 }, { "epoch": 0.05051624952372624, "grad_norm": 36.25, "learning_rate": 0.000999999282776331, "loss": 28.3094, "step": 16175 }, { "epoch": 0.050594327189141586, "grad_norm": 36.25, "learning_rate": 0.000999999047444056, "loss": 28.579, "step": 16200 }, { "epoch": 0.05067240485455692, "grad_norm": 38.5, "learning_rate": 0.0009999987787786427, "loss": 28.4296, "step": 16225 }, { "epoch": 0.050750482519972266, "grad_norm": 43.0, "learning_rate": 0.0009999984767801089, "loss": 28.686, "step": 16250 }, { "epoch": 0.05082856018538761, "grad_norm": 46.25, "learning_rate": 0.0009999981414484749, "loss": 28.5111, "step": 16275 }, { "epoch": 0.050906637850802954, "grad_norm": 44.75, "learning_rate": 0.000999997772783763, "loss": 28.7081, "step": 16300 }, { "epoch": 0.05098471551621829, "grad_norm": 43.0, "learning_rate": 0.0009999973707859977, "loss": 28.9352, "step": 16325 }, { "epoch": 0.051062793181633634, "grad_norm": 42.0, "learning_rate": 0.000999996935455206, "loss": 28.8936, "step": 16350 }, { "epoch": 0.05114087084704898, "grad_norm": 37.25, "learning_rate": 0.0009999964667914167, "loss": 28.9924, "step": 16375 }, { "epoch": 0.05121894851246432, "grad_norm": 37.75, "learning_rate": 0.0009999959647946613, "loss": 28.6103, "step": 16400 }, { "epoch": 0.05129702617787966, "grad_norm": 36.25, "learning_rate": 0.0009999954294649732, "loss": 28.7174, "step": 16425 }, { "epoch": 0.051375103843295, "grad_norm": 48.25, "learning_rate": 0.0009999948608023876, "loss": 28.5916, "step": 16450 }, { "epoch": 0.051453181508710345, "grad_norm": 37.0, "learning_rate": 0.0009999942588069433, "loss": 28.8703, "step": 16475 }, { "epoch": 0.05153125917412569, "grad_norm": 37.75, "learning_rate": 0.0009999936234786795, "loss": 29.1448, "step": 16500 }, { "epoch": 0.051609336839541026, "grad_norm": 36.25, "learning_rate": 0.0009999929548176391, "loss": 28.8964, "step": 16525 }, { "epoch": 0.05168741450495637, "grad_norm": 34.75, "learning_rate": 0.0009999922528238668, "loss": 28.6221, "step": 16550 }, { "epoch": 0.05176549217037171, "grad_norm": 39.5, "learning_rate": 0.000999991517497409, "loss": 28.9006, "step": 16575 }, { "epoch": 0.05184356983578706, "grad_norm": 35.25, "learning_rate": 0.0009999907488383148, "loss": 28.6834, "step": 16600 }, { "epoch": 0.05192164750120239, "grad_norm": 36.25, "learning_rate": 0.0009999899468466358, "loss": 28.4863, "step": 16625 }, { "epoch": 0.05199972516661774, "grad_norm": 34.75, "learning_rate": 0.0009999891115224251, "loss": 28.381, "step": 16650 }, { "epoch": 0.05207780283203308, "grad_norm": 42.25, "learning_rate": 0.0009999882428657384, "loss": 28.4007, "step": 16675 }, { "epoch": 0.052155880497448424, "grad_norm": 49.25, "learning_rate": 0.0009999873408766337, "loss": 28.3731, "step": 16700 }, { "epoch": 0.05223395816286376, "grad_norm": 45.0, "learning_rate": 0.0009999864055551713, "loss": 28.1782, "step": 16725 }, { "epoch": 0.052312035828279105, "grad_norm": 38.5, "learning_rate": 0.0009999854369014132, "loss": 28.2612, "step": 16750 }, { "epoch": 0.05239011349369445, "grad_norm": 41.5, "learning_rate": 0.0009999844349154244, "loss": 28.0716, "step": 16775 }, { "epoch": 0.05246819115910979, "grad_norm": 34.0, "learning_rate": 0.0009999833995972711, "loss": 27.842, "step": 16800 }, { "epoch": 0.05254626882452513, "grad_norm": 32.5, "learning_rate": 0.000999982330947023, "loss": 28.2459, "step": 16825 }, { "epoch": 0.05262434648994047, "grad_norm": 36.75, "learning_rate": 0.000999981228964751, "loss": 28.2205, "step": 16850 }, { "epoch": 0.052702424155355816, "grad_norm": 41.25, "learning_rate": 0.0009999800936505287, "loss": 28.2134, "step": 16875 }, { "epoch": 0.05278050182077116, "grad_norm": 41.0, "learning_rate": 0.0009999789250044312, "loss": 28.0064, "step": 16900 }, { "epoch": 0.0528585794861865, "grad_norm": 42.0, "learning_rate": 0.0009999777230265375, "loss": 28.2604, "step": 16925 }, { "epoch": 0.05293665715160184, "grad_norm": 40.75, "learning_rate": 0.0009999764877169268, "loss": 28.5458, "step": 16950 }, { "epoch": 0.053014734817017184, "grad_norm": 38.0, "learning_rate": 0.0009999752190756818, "loss": 28.4853, "step": 16975 }, { "epoch": 0.05309281248243253, "grad_norm": 38.75, "learning_rate": 0.000999973917102887, "loss": 28.8174, "step": 17000 }, { "epoch": 0.05309281248243253, "eval_loss": 28.75542449951172, "eval_runtime": 102.5252, "eval_samples_per_second": 50.749, "eval_steps_per_second": 3.18, "step": 17000 }, { "epoch": 0.053170890147847864, "grad_norm": 43.0, "learning_rate": 0.0009999725817986295, "loss": 28.8356, "step": 17025 }, { "epoch": 0.05324896781326321, "grad_norm": 38.0, "learning_rate": 0.0009999712131629978, "loss": 28.9959, "step": 17050 }, { "epoch": 0.05332704547867855, "grad_norm": 37.25, "learning_rate": 0.0009999698111960835, "loss": 28.713, "step": 17075 }, { "epoch": 0.053405123144093895, "grad_norm": 43.5, "learning_rate": 0.00099996837589798, "loss": 28.8244, "step": 17100 }, { "epoch": 0.05348320080950923, "grad_norm": 36.0, "learning_rate": 0.000999966907268783, "loss": 28.8987, "step": 17125 }, { "epoch": 0.053561278474924576, "grad_norm": 40.0, "learning_rate": 0.0009999654053085903, "loss": 28.6699, "step": 17150 }, { "epoch": 0.05363935614033992, "grad_norm": 36.25, "learning_rate": 0.000999963870017502, "loss": 28.8461, "step": 17175 }, { "epoch": 0.05371743380575526, "grad_norm": 34.0, "learning_rate": 0.0009999623013956208, "loss": 28.6992, "step": 17200 }, { "epoch": 0.0537955114711706, "grad_norm": 36.0, "learning_rate": 0.0009999606994430508, "loss": 28.6228, "step": 17225 }, { "epoch": 0.05387358913658594, "grad_norm": 31.0, "learning_rate": 0.000999959064159899, "loss": 28.801, "step": 17250 }, { "epoch": 0.05395166680200129, "grad_norm": 40.0, "learning_rate": 0.0009999573955462747, "loss": 28.9502, "step": 17275 }, { "epoch": 0.05402974446741663, "grad_norm": 45.25, "learning_rate": 0.0009999556936022887, "loss": 29.1268, "step": 17300 }, { "epoch": 0.05410782213283197, "grad_norm": 40.25, "learning_rate": 0.0009999539583280548, "loss": 29.3132, "step": 17325 }, { "epoch": 0.05418589979824731, "grad_norm": 48.75, "learning_rate": 0.0009999521897236885, "loss": 29.2909, "step": 17350 }, { "epoch": 0.054263977463662655, "grad_norm": 41.75, "learning_rate": 0.0009999503877893075, "loss": 29.5531, "step": 17375 }, { "epoch": 0.054342055129078, "grad_norm": 39.75, "learning_rate": 0.0009999485525250323, "loss": 29.7544, "step": 17400 }, { "epoch": 0.054420132794493335, "grad_norm": 35.25, "learning_rate": 0.0009999466839309852, "loss": 29.7906, "step": 17425 }, { "epoch": 0.05449821045990868, "grad_norm": 39.5, "learning_rate": 0.0009999447820072907, "loss": 30.0083, "step": 17450 }, { "epoch": 0.05457628812532402, "grad_norm": 44.0, "learning_rate": 0.0009999428467540755, "loss": 29.8602, "step": 17475 }, { "epoch": 0.054654365790739366, "grad_norm": 41.25, "learning_rate": 0.0009999408781714686, "loss": 30.1297, "step": 17500 }, { "epoch": 0.0547324434561547, "grad_norm": 36.75, "learning_rate": 0.0009999388762596015, "loss": 30.0162, "step": 17525 }, { "epoch": 0.05481052112157005, "grad_norm": 39.5, "learning_rate": 0.0009999368410186075, "loss": 30.2735, "step": 17550 }, { "epoch": 0.05488859878698539, "grad_norm": 38.25, "learning_rate": 0.0009999347724486223, "loss": 30.0508, "step": 17575 }, { "epoch": 0.054966676452400734, "grad_norm": 39.0, "learning_rate": 0.0009999326705497837, "loss": 30.0643, "step": 17600 }, { "epoch": 0.05504475411781607, "grad_norm": 45.25, "learning_rate": 0.0009999305353222319, "loss": 30.1616, "step": 17625 }, { "epoch": 0.055122831783231414, "grad_norm": 37.5, "learning_rate": 0.0009999283667661094, "loss": 29.9471, "step": 17650 }, { "epoch": 0.05520090944864676, "grad_norm": 39.75, "learning_rate": 0.0009999261648815604, "loss": 30.057, "step": 17675 }, { "epoch": 0.0552789871140621, "grad_norm": 44.0, "learning_rate": 0.0009999239296687322, "loss": 30.1771, "step": 17700 }, { "epoch": 0.05535706477947744, "grad_norm": 37.75, "learning_rate": 0.0009999216611277734, "loss": 30.6398, "step": 17725 }, { "epoch": 0.05543514244489278, "grad_norm": 49.75, "learning_rate": 0.000999919359258835, "loss": 30.2826, "step": 17750 }, { "epoch": 0.055513220110308126, "grad_norm": 38.75, "learning_rate": 0.0009999170240620715, "loss": 30.4671, "step": 17775 }, { "epoch": 0.05559129777572347, "grad_norm": 36.5, "learning_rate": 0.0009999146555376376, "loss": 30.3188, "step": 17800 }, { "epoch": 0.055669375441138806, "grad_norm": 41.25, "learning_rate": 0.0009999122536856913, "loss": 30.5831, "step": 17825 }, { "epoch": 0.05574745310655415, "grad_norm": 37.25, "learning_rate": 0.000999909818506393, "loss": 30.3014, "step": 17850 }, { "epoch": 0.055825530771969493, "grad_norm": 43.5, "learning_rate": 0.0009999073499999051, "loss": 30.3619, "step": 17875 }, { "epoch": 0.05590360843738484, "grad_norm": 41.25, "learning_rate": 0.0009999048481663922, "loss": 30.207, "step": 17900 }, { "epoch": 0.05598168610280018, "grad_norm": 55.75, "learning_rate": 0.0009999023130060208, "loss": 30.5041, "step": 17925 }, { "epoch": 0.05605976376821552, "grad_norm": 52.0, "learning_rate": 0.00099989974451896, "loss": 30.6651, "step": 17950 }, { "epoch": 0.05613784143363086, "grad_norm": 51.25, "learning_rate": 0.000999897142705381, "loss": 30.8339, "step": 17975 }, { "epoch": 0.056215919099046205, "grad_norm": 48.75, "learning_rate": 0.0009998945075654572, "loss": 30.9781, "step": 18000 }, { "epoch": 0.056215919099046205, "eval_loss": 31.040813446044922, "eval_runtime": 102.3512, "eval_samples_per_second": 50.835, "eval_steps_per_second": 3.185, "step": 18000 }, { "epoch": 0.05629399676446155, "grad_norm": 43.5, "learning_rate": 0.0009998918390993648, "loss": 30.9913, "step": 18025 }, { "epoch": 0.056372074429876885, "grad_norm": 42.75, "learning_rate": 0.000999889137307281, "loss": 31.086, "step": 18050 }, { "epoch": 0.05645015209529223, "grad_norm": 41.0, "learning_rate": 0.0009998864021893864, "loss": 31.0512, "step": 18075 }, { "epoch": 0.05652822976070757, "grad_norm": 42.75, "learning_rate": 0.0009998836337458629, "loss": 31.2091, "step": 18100 }, { "epoch": 0.056606307426122916, "grad_norm": 44.25, "learning_rate": 0.0009998808319768954, "loss": 31.1535, "step": 18125 }, { "epoch": 0.05668438509153825, "grad_norm": 43.5, "learning_rate": 0.0009998779968826707, "loss": 31.3788, "step": 18150 }, { "epoch": 0.0567624627569536, "grad_norm": 43.75, "learning_rate": 0.0009998751284633779, "loss": 31.3632, "step": 18175 }, { "epoch": 0.05684054042236894, "grad_norm": 39.0, "learning_rate": 0.0009998722267192076, "loss": 31.101, "step": 18200 }, { "epoch": 0.056918618087784284, "grad_norm": 38.25, "learning_rate": 0.000999869291650354, "loss": 30.8788, "step": 18225 }, { "epoch": 0.05699669575319962, "grad_norm": 36.75, "learning_rate": 0.0009998663232570122, "loss": 31.0841, "step": 18250 }, { "epoch": 0.057074773418614964, "grad_norm": 39.75, "learning_rate": 0.0009998633215393805, "loss": 31.4425, "step": 18275 }, { "epoch": 0.05715285108403031, "grad_norm": 37.5, "learning_rate": 0.000999860286497659, "loss": 31.6592, "step": 18300 }, { "epoch": 0.05723092874944565, "grad_norm": 40.0, "learning_rate": 0.0009998572181320496, "loss": 31.3277, "step": 18325 }, { "epoch": 0.05730900641486099, "grad_norm": 39.75, "learning_rate": 0.0009998541164427575, "loss": 31.3697, "step": 18350 }, { "epoch": 0.05738708408027633, "grad_norm": 35.0, "learning_rate": 0.0009998509814299888, "loss": 31.2663, "step": 18375 }, { "epoch": 0.057465161745691676, "grad_norm": 37.25, "learning_rate": 0.000999847813093953, "loss": 31.6682, "step": 18400 }, { "epoch": 0.05754323941110702, "grad_norm": 38.75, "learning_rate": 0.0009998446114348612, "loss": 31.7364, "step": 18425 }, { "epoch": 0.057621317076522356, "grad_norm": 48.5, "learning_rate": 0.0009998413764529266, "loss": 31.8273, "step": 18450 }, { "epoch": 0.0576993947419377, "grad_norm": 39.5, "learning_rate": 0.0009998381081483651, "loss": 32.178, "step": 18475 }, { "epoch": 0.057777472407353044, "grad_norm": 38.75, "learning_rate": 0.0009998348065213946, "loss": 32.3324, "step": 18500 }, { "epoch": 0.05785555007276839, "grad_norm": 41.75, "learning_rate": 0.000999831471572235, "loss": 32.6464, "step": 18525 }, { "epoch": 0.057933627738183724, "grad_norm": 42.0, "learning_rate": 0.0009998281033011091, "loss": 32.1848, "step": 18550 }, { "epoch": 0.05801170540359907, "grad_norm": 39.75, "learning_rate": 0.000999824701708241, "loss": 32.543, "step": 18575 }, { "epoch": 0.05808978306901441, "grad_norm": 48.5, "learning_rate": 0.0009998212667938578, "loss": 32.4726, "step": 18600 }, { "epoch": 0.058167860734429755, "grad_norm": 45.0, "learning_rate": 0.000999817798558188, "loss": 32.2877, "step": 18625 }, { "epoch": 0.05824593839984509, "grad_norm": 38.25, "learning_rate": 0.0009998142970014633, "loss": 32.4187, "step": 18650 }, { "epoch": 0.058324016065260435, "grad_norm": 51.5, "learning_rate": 0.0009998107621239168, "loss": 32.6334, "step": 18675 }, { "epoch": 0.05840209373067578, "grad_norm": 48.5, "learning_rate": 0.0009998071939257842, "loss": 33.0217, "step": 18700 }, { "epoch": 0.05848017139609112, "grad_norm": 50.0, "learning_rate": 0.0009998035924073036, "loss": 32.839, "step": 18725 }, { "epoch": 0.05855824906150646, "grad_norm": 41.75, "learning_rate": 0.000999799957568715, "loss": 32.84, "step": 18750 }, { "epoch": 0.0586363267269218, "grad_norm": 55.5, "learning_rate": 0.0009997962894102608, "loss": 33.0097, "step": 18775 }, { "epoch": 0.05871440439233715, "grad_norm": 52.5, "learning_rate": 0.0009997925879321854, "loss": 33.0055, "step": 18800 }, { "epoch": 0.05879248205775249, "grad_norm": 47.25, "learning_rate": 0.0009997888531347358, "loss": 33.3652, "step": 18825 }, { "epoch": 0.05887055972316783, "grad_norm": 41.25, "learning_rate": 0.0009997850850181605, "loss": 33.1608, "step": 18850 }, { "epoch": 0.05894863738858317, "grad_norm": 42.75, "learning_rate": 0.000999781283582711, "loss": 33.2872, "step": 18875 }, { "epoch": 0.059026715053998514, "grad_norm": 43.25, "learning_rate": 0.0009997774488286408, "loss": 33.0581, "step": 18900 }, { "epoch": 0.05910479271941386, "grad_norm": 48.0, "learning_rate": 0.0009997735807562055, "loss": 33.0212, "step": 18925 }, { "epoch": 0.059182870384829195, "grad_norm": 39.0, "learning_rate": 0.000999769679365663, "loss": 32.7047, "step": 18950 }, { "epoch": 0.05926094805024454, "grad_norm": 41.25, "learning_rate": 0.0009997657446572735, "loss": 32.7831, "step": 18975 }, { "epoch": 0.05933902571565988, "grad_norm": 42.75, "learning_rate": 0.0009997617766312988, "loss": 32.8744, "step": 19000 }, { "epoch": 0.05933902571565988, "eval_loss": 32.887264251708984, "eval_runtime": 102.2215, "eval_samples_per_second": 50.899, "eval_steps_per_second": 3.189, "step": 19000 }, { "epoch": 0.059417103381075226, "grad_norm": 41.5, "learning_rate": 0.0009997577752880041, "loss": 32.8132, "step": 19025 }, { "epoch": 0.05949518104649056, "grad_norm": 43.75, "learning_rate": 0.0009997537406276557, "loss": 32.9501, "step": 19050 }, { "epoch": 0.059573258711905906, "grad_norm": 45.25, "learning_rate": 0.0009997496726505228, "loss": 32.7061, "step": 19075 }, { "epoch": 0.05965133637732125, "grad_norm": 37.5, "learning_rate": 0.0009997455713568763, "loss": 32.7181, "step": 19100 }, { "epoch": 0.059729414042736594, "grad_norm": 41.0, "learning_rate": 0.00099974143674699, "loss": 32.554, "step": 19125 }, { "epoch": 0.05980749170815193, "grad_norm": 41.5, "learning_rate": 0.0009997372688211395, "loss": 32.7137, "step": 19150 }, { "epoch": 0.059885569373567274, "grad_norm": 45.0, "learning_rate": 0.0009997330675796023, "loss": 33.0025, "step": 19175 }, { "epoch": 0.05996364703898262, "grad_norm": 42.0, "learning_rate": 0.000999728833022659, "loss": 32.9643, "step": 19200 }, { "epoch": 0.06004172470439796, "grad_norm": 52.5, "learning_rate": 0.0009997245651505915, "loss": 32.8268, "step": 19225 }, { "epoch": 0.0601198023698133, "grad_norm": 43.0, "learning_rate": 0.0009997202639636844, "loss": 32.8, "step": 19250 }, { "epoch": 0.06019788003522864, "grad_norm": 56.5, "learning_rate": 0.0009997159294622246, "loss": 32.9133, "step": 19275 }, { "epoch": 0.060275957700643985, "grad_norm": 44.25, "learning_rate": 0.000999711561646501, "loss": 32.8573, "step": 19300 }, { "epoch": 0.06035403536605933, "grad_norm": 44.0, "learning_rate": 0.0009997071605168043, "loss": 32.7512, "step": 19325 }, { "epoch": 0.060432113031474666, "grad_norm": 36.5, "learning_rate": 0.000999702726073429, "loss": 32.9202, "step": 19350 }, { "epoch": 0.06051019069689001, "grad_norm": 40.0, "learning_rate": 0.0009996982583166695, "loss": 32.942, "step": 19375 }, { "epoch": 0.06058826836230535, "grad_norm": 39.0, "learning_rate": 0.0009996937572468246, "loss": 32.8775, "step": 19400 }, { "epoch": 0.0606663460277207, "grad_norm": 37.0, "learning_rate": 0.000999689222864194, "loss": 32.8532, "step": 19425 }, { "epoch": 0.06074442369313603, "grad_norm": 47.25, "learning_rate": 0.0009996846551690798, "loss": 32.9941, "step": 19450 }, { "epoch": 0.06082250135855138, "grad_norm": 38.0, "learning_rate": 0.0009996800541617868, "loss": 32.8616, "step": 19475 }, { "epoch": 0.06090057902396672, "grad_norm": 39.5, "learning_rate": 0.0009996754198426216, "loss": 32.9031, "step": 19500 }, { "epoch": 0.060978656689382064, "grad_norm": 44.5, "learning_rate": 0.0009996707522118933, "loss": 33.0028, "step": 19525 }, { "epoch": 0.0610567343547974, "grad_norm": 39.75, "learning_rate": 0.0009996660512699128, "loss": 32.8195, "step": 19550 }, { "epoch": 0.061134812020212745, "grad_norm": 40.75, "learning_rate": 0.0009996613170169936, "loss": 32.571, "step": 19575 }, { "epoch": 0.06121288968562809, "grad_norm": 36.75, "learning_rate": 0.0009996565494534517, "loss": 32.5517, "step": 19600 }, { "epoch": 0.06129096735104343, "grad_norm": 38.0, "learning_rate": 0.0009996517485796044, "loss": 32.5484, "step": 19625 }, { "epoch": 0.06136904501645877, "grad_norm": 41.75, "learning_rate": 0.000999646914395772, "loss": 32.5895, "step": 19650 }, { "epoch": 0.06144712268187411, "grad_norm": 42.0, "learning_rate": 0.0009996420469022766, "loss": 32.8765, "step": 19675 }, { "epoch": 0.061525200347289456, "grad_norm": 38.5, "learning_rate": 0.0009996371460994431, "loss": 32.8793, "step": 19700 }, { "epoch": 0.0616032780127048, "grad_norm": 40.25, "learning_rate": 0.0009996322119875977, "loss": 33.0708, "step": 19725 }, { "epoch": 0.06168135567812014, "grad_norm": 38.0, "learning_rate": 0.00099962724456707, "loss": 33.188, "step": 19750 }, { "epoch": 0.06175943334353548, "grad_norm": 49.0, "learning_rate": 0.0009996222438381904, "loss": 33.2918, "step": 19775 }, { "epoch": 0.061837511008950824, "grad_norm": 44.75, "learning_rate": 0.0009996172098012928, "loss": 33.4949, "step": 19800 }, { "epoch": 0.06191558867436617, "grad_norm": 43.25, "learning_rate": 0.0009996121424567126, "loss": 33.8741, "step": 19825 }, { "epoch": 0.061993666339781504, "grad_norm": 41.75, "learning_rate": 0.0009996070418047877, "loss": 33.6041, "step": 19850 }, { "epoch": 0.06207174400519685, "grad_norm": 40.25, "learning_rate": 0.000999601907845858, "loss": 33.6722, "step": 19875 }, { "epoch": 0.06214982167061219, "grad_norm": 40.5, "learning_rate": 0.000999596740580266, "loss": 33.484, "step": 19900 }, { "epoch": 0.062227899336027535, "grad_norm": 46.25, "learning_rate": 0.000999591540008356, "loss": 33.7352, "step": 19925 }, { "epoch": 0.06230597700144287, "grad_norm": 48.5, "learning_rate": 0.0009995863061304747, "loss": 33.9541, "step": 19950 }, { "epoch": 0.062384054666858216, "grad_norm": 44.0, "learning_rate": 0.0009995810389469711, "loss": 34.2383, "step": 19975 }, { "epoch": 0.06246213233227356, "grad_norm": 40.75, "learning_rate": 0.0009995757384581964, "loss": 33.8251, "step": 20000 }, { "epoch": 0.06246213233227356, "eval_loss": 34.19303512573242, "eval_runtime": 102.3811, "eval_samples_per_second": 50.82, "eval_steps_per_second": 3.184, "step": 20000 }, { "epoch": 0.0625402099976889, "grad_norm": 50.0, "learning_rate": 0.000999570404664504, "loss": 34.3706, "step": 20025 }, { "epoch": 0.06261828766310425, "grad_norm": 45.75, "learning_rate": 0.0009995650375662492, "loss": 34.1775, "step": 20050 }, { "epoch": 0.06269636532851959, "grad_norm": 43.5, "learning_rate": 0.0009995596371637897, "loss": 34.3327, "step": 20075 }, { "epoch": 0.06277444299393492, "grad_norm": 43.25, "learning_rate": 0.0009995542034574863, "loss": 34.3871, "step": 20100 }, { "epoch": 0.06285252065935026, "grad_norm": 42.75, "learning_rate": 0.0009995487364477004, "loss": 33.8116, "step": 20125 }, { "epoch": 0.06293059832476561, "grad_norm": 37.5, "learning_rate": 0.0009995432361347971, "loss": 33.9015, "step": 20150 }, { "epoch": 0.06300867599018095, "grad_norm": 38.5, "learning_rate": 0.0009995377025191427, "loss": 33.8639, "step": 20175 }, { "epoch": 0.0630867536555963, "grad_norm": 37.25, "learning_rate": 0.0009995321356011063, "loss": 33.6663, "step": 20200 }, { "epoch": 0.06316483132101164, "grad_norm": 40.5, "learning_rate": 0.0009995265353810589, "loss": 33.8264, "step": 20225 }, { "epoch": 0.06324290898642698, "grad_norm": 45.25, "learning_rate": 0.0009995209018593737, "loss": 33.6851, "step": 20250 }, { "epoch": 0.06332098665184233, "grad_norm": 42.0, "learning_rate": 0.0009995152350364266, "loss": 33.5799, "step": 20275 }, { "epoch": 0.06339906431725766, "grad_norm": 43.25, "learning_rate": 0.000999509534912595, "loss": 33.6905, "step": 20300 }, { "epoch": 0.063477141982673, "grad_norm": 37.25, "learning_rate": 0.0009995038014882593, "loss": 33.4839, "step": 20325 }, { "epoch": 0.06355521964808834, "grad_norm": 35.75, "learning_rate": 0.0009994980347638016, "loss": 33.6105, "step": 20350 }, { "epoch": 0.06363329731350369, "grad_norm": 38.0, "learning_rate": 0.0009994922347396063, "loss": 33.9047, "step": 20375 }, { "epoch": 0.06371137497891903, "grad_norm": 40.25, "learning_rate": 0.00099948640141606, "loss": 34.1876, "step": 20400 }, { "epoch": 0.06378945264433437, "grad_norm": 45.75, "learning_rate": 0.0009994805347935517, "loss": 33.9303, "step": 20425 }, { "epoch": 0.06386753030974972, "grad_norm": 42.75, "learning_rate": 0.0009994746348724727, "loss": 33.951, "step": 20450 }, { "epoch": 0.06394560797516506, "grad_norm": 50.0, "learning_rate": 0.000999468701653216, "loss": 34.056, "step": 20475 }, { "epoch": 0.0640236856405804, "grad_norm": 50.5, "learning_rate": 0.0009994627351361772, "loss": 33.9114, "step": 20500 }, { "epoch": 0.06410176330599573, "grad_norm": 42.25, "learning_rate": 0.0009994567353217541, "loss": 34.2422, "step": 20525 }, { "epoch": 0.06417984097141108, "grad_norm": 44.25, "learning_rate": 0.0009994507022103465, "loss": 34.0631, "step": 20550 }, { "epoch": 0.06425791863682642, "grad_norm": 39.75, "learning_rate": 0.000999444635802357, "loss": 33.8447, "step": 20575 }, { "epoch": 0.06433599630224177, "grad_norm": 44.75, "learning_rate": 0.00099943853609819, "loss": 33.8587, "step": 20600 }, { "epoch": 0.06441407396765711, "grad_norm": 39.25, "learning_rate": 0.0009994324030982518, "loss": 33.943, "step": 20625 }, { "epoch": 0.06449215163307245, "grad_norm": 41.75, "learning_rate": 0.0009994262368029515, "loss": 33.9425, "step": 20650 }, { "epoch": 0.0645702292984878, "grad_norm": 44.5, "learning_rate": 0.0009994200372127, "loss": 34.0832, "step": 20675 }, { "epoch": 0.06464830696390314, "grad_norm": 39.25, "learning_rate": 0.000999413804327911, "loss": 33.9888, "step": 20700 }, { "epoch": 0.06472638462931847, "grad_norm": 43.75, "learning_rate": 0.0009994075381489994, "loss": 34.1022, "step": 20725 }, { "epoch": 0.06480446229473381, "grad_norm": 44.25, "learning_rate": 0.0009994012386763836, "loss": 33.9719, "step": 20750 }, { "epoch": 0.06488253996014916, "grad_norm": 42.0, "learning_rate": 0.000999394905910483, "loss": 33.7568, "step": 20775 }, { "epoch": 0.0649606176255645, "grad_norm": 43.75, "learning_rate": 0.0009993885398517201, "loss": 33.7079, "step": 20800 }, { "epoch": 0.06503869529097984, "grad_norm": 40.0, "learning_rate": 0.0009993821405005195, "loss": 33.8396, "step": 20825 }, { "epoch": 0.06511677295639519, "grad_norm": 42.5, "learning_rate": 0.0009993757078573073, "loss": 33.6027, "step": 20850 }, { "epoch": 0.06519485062181053, "grad_norm": 42.5, "learning_rate": 0.0009993692419225126, "loss": 33.5388, "step": 20875 }, { "epoch": 0.06527292828722588, "grad_norm": 55.0, "learning_rate": 0.0009993627426965667, "loss": 33.775, "step": 20900 }, { "epoch": 0.0653510059526412, "grad_norm": 39.0, "learning_rate": 0.0009993562101799024, "loss": 33.8984, "step": 20925 }, { "epoch": 0.06542908361805655, "grad_norm": 41.5, "learning_rate": 0.0009993496443729557, "loss": 33.8582, "step": 20950 }, { "epoch": 0.06550716128347189, "grad_norm": 37.25, "learning_rate": 0.0009993430452761639, "loss": 33.8915, "step": 20975 }, { "epoch": 0.06558523894888724, "grad_norm": 35.0, "learning_rate": 0.0009993364128899672, "loss": 33.5705, "step": 21000 }, { "epoch": 0.06558523894888724, "eval_loss": 33.73247146606445, "eval_runtime": 102.3252, "eval_samples_per_second": 50.848, "eval_steps_per_second": 3.186, "step": 21000 }, { "epoch": 0.06566331661430258, "grad_norm": 37.0, "learning_rate": 0.0009993297472148076, "loss": 33.5467, "step": 21025 }, { "epoch": 0.06574139427971792, "grad_norm": 38.5, "learning_rate": 0.0009993230482511295, "loss": 33.6705, "step": 21050 }, { "epoch": 0.06581947194513327, "grad_norm": 39.0, "learning_rate": 0.0009993163159993798, "loss": 33.7872, "step": 21075 }, { "epoch": 0.06589754961054861, "grad_norm": 45.5, "learning_rate": 0.0009993095504600067, "loss": 33.6316, "step": 21100 }, { "epoch": 0.06597562727596394, "grad_norm": 38.0, "learning_rate": 0.0009993027516334617, "loss": 33.8796, "step": 21125 }, { "epoch": 0.06605370494137928, "grad_norm": 43.75, "learning_rate": 0.000999295919520198, "loss": 34.0526, "step": 21150 }, { "epoch": 0.06613178260679463, "grad_norm": 36.0, "learning_rate": 0.000999289054120671, "loss": 34.1438, "step": 21175 }, { "epoch": 0.06620986027220997, "grad_norm": 38.0, "learning_rate": 0.0009992821554353382, "loss": 33.7974, "step": 21200 }, { "epoch": 0.06628793793762532, "grad_norm": 46.0, "learning_rate": 0.00099927522346466, "loss": 33.8107, "step": 21225 }, { "epoch": 0.06636601560304066, "grad_norm": 45.75, "learning_rate": 0.0009992682582090982, "loss": 33.8952, "step": 21250 }, { "epoch": 0.066444093268456, "grad_norm": 39.5, "learning_rate": 0.0009992612596691171, "loss": 34.201, "step": 21275 }, { "epoch": 0.06652217093387135, "grad_norm": 49.25, "learning_rate": 0.0009992542278451832, "loss": 34.2007, "step": 21300 }, { "epoch": 0.06660024859928668, "grad_norm": 42.0, "learning_rate": 0.0009992471627377657, "loss": 34.3501, "step": 21325 }, { "epoch": 0.06667832626470202, "grad_norm": 48.75, "learning_rate": 0.0009992400643473354, "loss": 34.4321, "step": 21350 }, { "epoch": 0.06675640393011736, "grad_norm": 43.25, "learning_rate": 0.0009992329326743653, "loss": 34.638, "step": 21375 }, { "epoch": 0.06683448159553271, "grad_norm": 41.75, "learning_rate": 0.000999225767719331, "loss": 34.588, "step": 21400 }, { "epoch": 0.06691255926094805, "grad_norm": 44.5, "learning_rate": 0.0009992185694827102, "loss": 34.7111, "step": 21425 }, { "epoch": 0.0669906369263634, "grad_norm": 50.5, "learning_rate": 0.0009992113379649829, "loss": 34.7677, "step": 21450 }, { "epoch": 0.06706871459177874, "grad_norm": 62.0, "learning_rate": 0.000999204073166631, "loss": 35.0234, "step": 21475 }, { "epoch": 0.06714679225719408, "grad_norm": 48.0, "learning_rate": 0.0009991967750881388, "loss": 35.0909, "step": 21500 }, { "epoch": 0.06722486992260941, "grad_norm": 49.5, "learning_rate": 0.000999189443729993, "loss": 35.4811, "step": 21525 }, { "epoch": 0.06730294758802476, "grad_norm": 58.0, "learning_rate": 0.0009991820790926824, "loss": 35.2726, "step": 21550 }, { "epoch": 0.0673810252534401, "grad_norm": 55.5, "learning_rate": 0.0009991746811766975, "loss": 35.629, "step": 21575 }, { "epoch": 0.06745910291885544, "grad_norm": 44.0, "learning_rate": 0.000999167249982532, "loss": 35.4736, "step": 21600 }, { "epoch": 0.06753718058427079, "grad_norm": 45.75, "learning_rate": 0.0009991597855106814, "loss": 35.2275, "step": 21625 }, { "epoch": 0.06761525824968613, "grad_norm": 41.5, "learning_rate": 0.0009991522877616428, "loss": 35.2907, "step": 21650 }, { "epoch": 0.06769333591510147, "grad_norm": 56.5, "learning_rate": 0.000999144756735916, "loss": 35.2988, "step": 21675 }, { "epoch": 0.06777141358051682, "grad_norm": 56.0, "learning_rate": 0.000999137192434004, "loss": 35.2948, "step": 21700 }, { "epoch": 0.06784949124593215, "grad_norm": 42.0, "learning_rate": 0.0009991295948564103, "loss": 35.1186, "step": 21725 }, { "epoch": 0.06792756891134749, "grad_norm": 43.25, "learning_rate": 0.0009991219640036416, "loss": 35.115, "step": 21750 }, { "epoch": 0.06800564657676283, "grad_norm": 43.75, "learning_rate": 0.0009991142998762065, "loss": 35.347, "step": 21775 }, { "epoch": 0.06808372424217818, "grad_norm": 45.0, "learning_rate": 0.000999106602474616, "loss": 35.3008, "step": 21800 }, { "epoch": 0.06816180190759352, "grad_norm": 66.0, "learning_rate": 0.0009990988717993832, "loss": 35.321, "step": 21825 }, { "epoch": 0.06823987957300887, "grad_norm": 56.0, "learning_rate": 0.0009990911078510238, "loss": 35.373, "step": 21850 }, { "epoch": 0.06831795723842421, "grad_norm": 49.25, "learning_rate": 0.000999083310630055, "loss": 35.2404, "step": 21875 }, { "epoch": 0.06839603490383955, "grad_norm": 46.0, "learning_rate": 0.000999075480136997, "loss": 35.2177, "step": 21900 }, { "epoch": 0.06847411256925488, "grad_norm": 43.5, "learning_rate": 0.0009990676163723715, "loss": 35.1759, "step": 21925 }, { "epoch": 0.06855219023467023, "grad_norm": 54.5, "learning_rate": 0.000999059719336703, "loss": 34.7193, "step": 21950 }, { "epoch": 0.06863026790008557, "grad_norm": 48.25, "learning_rate": 0.0009990517890305175, "loss": 34.6676, "step": 21975 }, { "epoch": 0.06870834556550091, "grad_norm": 44.75, "learning_rate": 0.0009990438254543442, "loss": 34.4965, "step": 22000 }, { "epoch": 0.06870834556550091, "eval_loss": 34.531646728515625, "eval_runtime": 102.6371, "eval_samples_per_second": 50.693, "eval_steps_per_second": 3.176, "step": 22000 } ], "logging_steps": 25, "max_steps": 320194, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.7899608404454277e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }