pretrain / last-checkpoint /trainer_state.json
minpeter's picture
Training in progress, step 22000, checkpoint
77a5e93 verified
{
"best_global_step": 2000,
"best_metric": 9.218317031860352,
"best_model_checkpoint": "./artifacts/models/base-250725-test/checkpoint-2000",
"epoch": 0.06870834556550091,
"eval_steps": 1000,
"global_step": 22000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 7.807766541534195e-05,
"grad_norm": 1.9921875,
"learning_rate": 1.499063085571518e-06,
"loss": 10.8863,
"step": 25
},
{
"epoch": 0.0001561553308306839,
"grad_norm": 2.40625,
"learning_rate": 3.0605871330418487e-06,
"loss": 10.8814,
"step": 50
},
{
"epoch": 0.00023423299624602585,
"grad_norm": 2.328125,
"learning_rate": 4.62211118051218e-06,
"loss": 10.883,
"step": 75
},
{
"epoch": 0.0003123106616613678,
"grad_norm": 2.03125,
"learning_rate": 6.183635227982511e-06,
"loss": 10.8828,
"step": 100
},
{
"epoch": 0.00039038832707670977,
"grad_norm": 1.9765625,
"learning_rate": 7.745159275452842e-06,
"loss": 10.8834,
"step": 125
},
{
"epoch": 0.0004684659924920517,
"grad_norm": 2.125,
"learning_rate": 9.306683322923173e-06,
"loss": 10.8796,
"step": 150
},
{
"epoch": 0.0005465436579073936,
"grad_norm": 1.96875,
"learning_rate": 1.0868207370393504e-05,
"loss": 10.8798,
"step": 175
},
{
"epoch": 0.0006246213233227356,
"grad_norm": 1.9765625,
"learning_rate": 1.2429731417863835e-05,
"loss": 10.8764,
"step": 200
},
{
"epoch": 0.0007026989887380775,
"grad_norm": 2.203125,
"learning_rate": 1.3991255465334166e-05,
"loss": 10.8779,
"step": 225
},
{
"epoch": 0.0007807766541534195,
"grad_norm": 2.296875,
"learning_rate": 1.5552779512804497e-05,
"loss": 10.8715,
"step": 250
},
{
"epoch": 0.0008588543195687615,
"grad_norm": 2.234375,
"learning_rate": 1.7114303560274827e-05,
"loss": 10.8676,
"step": 275
},
{
"epoch": 0.0009369319849841034,
"grad_norm": 2.3125,
"learning_rate": 1.867582760774516e-05,
"loss": 10.8621,
"step": 300
},
{
"epoch": 0.0010150096503994453,
"grad_norm": 2.359375,
"learning_rate": 2.0237351655215492e-05,
"loss": 10.8566,
"step": 325
},
{
"epoch": 0.0010930873158147873,
"grad_norm": 2.015625,
"learning_rate": 2.179887570268582e-05,
"loss": 10.8515,
"step": 350
},
{
"epoch": 0.0011711649812301292,
"grad_norm": 2.21875,
"learning_rate": 2.3360399750156154e-05,
"loss": 10.8445,
"step": 375
},
{
"epoch": 0.0012492426466454711,
"grad_norm": 2.09375,
"learning_rate": 2.4921923797626483e-05,
"loss": 10.8383,
"step": 400
},
{
"epoch": 0.001327320312060813,
"grad_norm": 2.0625,
"learning_rate": 2.6483447845096816e-05,
"loss": 10.8244,
"step": 425
},
{
"epoch": 0.001405397977476155,
"grad_norm": 1.890625,
"learning_rate": 2.804497189256715e-05,
"loss": 10.8193,
"step": 450
},
{
"epoch": 0.001483475642891497,
"grad_norm": 2.21875,
"learning_rate": 2.9606495940037475e-05,
"loss": 10.7992,
"step": 475
},
{
"epoch": 0.001561553308306839,
"grad_norm": 2.09375,
"learning_rate": 3.116801998750781e-05,
"loss": 10.7987,
"step": 500
},
{
"epoch": 0.001639630973722181,
"grad_norm": 2.0625,
"learning_rate": 3.272954403497814e-05,
"loss": 10.7803,
"step": 525
},
{
"epoch": 0.001717708639137523,
"grad_norm": 2.140625,
"learning_rate": 3.429106808244847e-05,
"loss": 10.7653,
"step": 550
},
{
"epoch": 0.0017957863045528649,
"grad_norm": 2.578125,
"learning_rate": 3.58525921299188e-05,
"loss": 10.745,
"step": 575
},
{
"epoch": 0.0018738639699682068,
"grad_norm": 1.8515625,
"learning_rate": 3.741411617738913e-05,
"loss": 10.7327,
"step": 600
},
{
"epoch": 0.0019519416353835487,
"grad_norm": 2.15625,
"learning_rate": 3.897564022485946e-05,
"loss": 10.7159,
"step": 625
},
{
"epoch": 0.0020300193007988907,
"grad_norm": 2.53125,
"learning_rate": 4.053716427232979e-05,
"loss": 10.6931,
"step": 650
},
{
"epoch": 0.0021080969662142326,
"grad_norm": 2.078125,
"learning_rate": 4.2098688319800126e-05,
"loss": 10.6688,
"step": 675
},
{
"epoch": 0.0021861746316295745,
"grad_norm": 2.6875,
"learning_rate": 4.3660212367270456e-05,
"loss": 10.6408,
"step": 700
},
{
"epoch": 0.0022642522970449165,
"grad_norm": 2.6875,
"learning_rate": 4.522173641474079e-05,
"loss": 10.63,
"step": 725
},
{
"epoch": 0.0023423299624602584,
"grad_norm": 2.8125,
"learning_rate": 4.678326046221112e-05,
"loss": 10.6057,
"step": 750
},
{
"epoch": 0.0024204076278756003,
"grad_norm": 2.796875,
"learning_rate": 4.834478450968145e-05,
"loss": 10.5781,
"step": 775
},
{
"epoch": 0.0024984852932909423,
"grad_norm": 2.484375,
"learning_rate": 4.990630855715178e-05,
"loss": 10.5501,
"step": 800
},
{
"epoch": 0.002576562958706284,
"grad_norm": 3.171875,
"learning_rate": 5.1467832604622116e-05,
"loss": 10.5076,
"step": 825
},
{
"epoch": 0.002654640624121626,
"grad_norm": 3.078125,
"learning_rate": 5.3029356652092445e-05,
"loss": 10.477,
"step": 850
},
{
"epoch": 0.002732718289536968,
"grad_norm": 2.53125,
"learning_rate": 5.4590880699562774e-05,
"loss": 10.4528,
"step": 875
},
{
"epoch": 0.00281079595495231,
"grad_norm": 2.90625,
"learning_rate": 5.6152404747033104e-05,
"loss": 10.4192,
"step": 900
},
{
"epoch": 0.002888873620367652,
"grad_norm": 2.859375,
"learning_rate": 5.771392879450343e-05,
"loss": 10.3672,
"step": 925
},
{
"epoch": 0.002966951285782994,
"grad_norm": 3.078125,
"learning_rate": 5.927545284197377e-05,
"loss": 10.3219,
"step": 950
},
{
"epoch": 0.003045028951198336,
"grad_norm": 2.890625,
"learning_rate": 6.08369768894441e-05,
"loss": 10.3154,
"step": 975
},
{
"epoch": 0.003123106616613678,
"grad_norm": 3.125,
"learning_rate": 6.239850093691443e-05,
"loss": 10.2594,
"step": 1000
},
{
"epoch": 0.003123106616613678,
"eval_loss": 10.227066993713379,
"eval_runtime": 102.2402,
"eval_samples_per_second": 50.89,
"eval_steps_per_second": 3.189,
"step": 1000
},
{
"epoch": 0.00320118428202902,
"grad_norm": 2.859375,
"learning_rate": 6.396002498438476e-05,
"loss": 10.2088,
"step": 1025
},
{
"epoch": 0.003279261947444362,
"grad_norm": 2.921875,
"learning_rate": 6.552154903185509e-05,
"loss": 10.1806,
"step": 1050
},
{
"epoch": 0.003357339612859704,
"grad_norm": 3.015625,
"learning_rate": 6.708307307932544e-05,
"loss": 10.1161,
"step": 1075
},
{
"epoch": 0.003435417278275046,
"grad_norm": 2.640625,
"learning_rate": 6.864459712679575e-05,
"loss": 10.0794,
"step": 1100
},
{
"epoch": 0.003513494943690388,
"grad_norm": 2.53125,
"learning_rate": 7.020612117426608e-05,
"loss": 10.0233,
"step": 1125
},
{
"epoch": 0.0035915726091057297,
"grad_norm": 2.640625,
"learning_rate": 7.176764522173641e-05,
"loss": 9.9989,
"step": 1150
},
{
"epoch": 0.0036696502745210717,
"grad_norm": 2.328125,
"learning_rate": 7.332916926920674e-05,
"loss": 9.9246,
"step": 1175
},
{
"epoch": 0.0037477279399364136,
"grad_norm": 2.0,
"learning_rate": 7.489069331667708e-05,
"loss": 9.8812,
"step": 1200
},
{
"epoch": 0.0038258056053517555,
"grad_norm": 1.875,
"learning_rate": 7.645221736414741e-05,
"loss": 9.8312,
"step": 1225
},
{
"epoch": 0.0039038832707670975,
"grad_norm": 1.953125,
"learning_rate": 7.801374141161774e-05,
"loss": 9.8123,
"step": 1250
},
{
"epoch": 0.003981960936182439,
"grad_norm": 1.9296875,
"learning_rate": 7.957526545908807e-05,
"loss": 9.7681,
"step": 1275
},
{
"epoch": 0.004060038601597781,
"grad_norm": 1.703125,
"learning_rate": 8.113678950655841e-05,
"loss": 9.7618,
"step": 1300
},
{
"epoch": 0.004138116267013123,
"grad_norm": 1.9453125,
"learning_rate": 8.269831355402874e-05,
"loss": 9.6876,
"step": 1325
},
{
"epoch": 0.004216193932428465,
"grad_norm": 1.796875,
"learning_rate": 8.425983760149906e-05,
"loss": 9.689,
"step": 1350
},
{
"epoch": 0.004294271597843807,
"grad_norm": 1.6640625,
"learning_rate": 8.582136164896939e-05,
"loss": 9.6471,
"step": 1375
},
{
"epoch": 0.004372349263259149,
"grad_norm": 1.65625,
"learning_rate": 8.738288569643972e-05,
"loss": 9.6163,
"step": 1400
},
{
"epoch": 0.004450426928674491,
"grad_norm": 1.3515625,
"learning_rate": 8.894440974391006e-05,
"loss": 9.5749,
"step": 1425
},
{
"epoch": 0.004528504594089833,
"grad_norm": 1.6171875,
"learning_rate": 9.050593379138039e-05,
"loss": 9.5571,
"step": 1450
},
{
"epoch": 0.004606582259505175,
"grad_norm": 1.6015625,
"learning_rate": 9.206745783885072e-05,
"loss": 9.539,
"step": 1475
},
{
"epoch": 0.004684659924920517,
"grad_norm": 1.6171875,
"learning_rate": 9.362898188632105e-05,
"loss": 9.5223,
"step": 1500
},
{
"epoch": 0.004762737590335859,
"grad_norm": 1.4140625,
"learning_rate": 9.519050593379139e-05,
"loss": 9.4832,
"step": 1525
},
{
"epoch": 0.004840815255751201,
"grad_norm": 1.640625,
"learning_rate": 9.675202998126172e-05,
"loss": 9.4502,
"step": 1550
},
{
"epoch": 0.004918892921166543,
"grad_norm": 1.5546875,
"learning_rate": 9.831355402873205e-05,
"loss": 9.4381,
"step": 1575
},
{
"epoch": 0.0049969705865818845,
"grad_norm": 1.5625,
"learning_rate": 9.987507807620237e-05,
"loss": 9.4403,
"step": 1600
},
{
"epoch": 0.0050750482519972264,
"grad_norm": 1.5703125,
"learning_rate": 0.0001014366021236727,
"loss": 9.4185,
"step": 1625
},
{
"epoch": 0.005153125917412568,
"grad_norm": 1.5078125,
"learning_rate": 0.00010299812617114304,
"loss": 9.4044,
"step": 1650
},
{
"epoch": 0.00523120358282791,
"grad_norm": 1.484375,
"learning_rate": 0.00010455965021861337,
"loss": 9.3678,
"step": 1675
},
{
"epoch": 0.005309281248243252,
"grad_norm": 1.4609375,
"learning_rate": 0.0001061211742660837,
"loss": 9.3262,
"step": 1700
},
{
"epoch": 0.005387358913658594,
"grad_norm": 1.4296875,
"learning_rate": 0.00010768269831355403,
"loss": 9.365,
"step": 1725
},
{
"epoch": 0.005465436579073936,
"grad_norm": 1.734375,
"learning_rate": 0.00010924422236102437,
"loss": 9.319,
"step": 1750
},
{
"epoch": 0.005543514244489278,
"grad_norm": 1.4296875,
"learning_rate": 0.0001108057464084947,
"loss": 9.3204,
"step": 1775
},
{
"epoch": 0.00562159190990462,
"grad_norm": 1.5078125,
"learning_rate": 0.00011236727045596503,
"loss": 9.2825,
"step": 1800
},
{
"epoch": 0.005699669575319962,
"grad_norm": 1.390625,
"learning_rate": 0.00011392879450343536,
"loss": 9.2735,
"step": 1825
},
{
"epoch": 0.005777747240735304,
"grad_norm": 1.359375,
"learning_rate": 0.0001154903185509057,
"loss": 9.264,
"step": 1850
},
{
"epoch": 0.005855824906150646,
"grad_norm": 1.484375,
"learning_rate": 0.00011705184259837602,
"loss": 9.263,
"step": 1875
},
{
"epoch": 0.005933902571565988,
"grad_norm": 1.3671875,
"learning_rate": 0.00011861336664584634,
"loss": 9.2572,
"step": 1900
},
{
"epoch": 0.0060119802369813305,
"grad_norm": 1.5078125,
"learning_rate": 0.00012017489069331667,
"loss": 9.2461,
"step": 1925
},
{
"epoch": 0.006090057902396672,
"grad_norm": 1.53125,
"learning_rate": 0.000121736414740787,
"loss": 9.2345,
"step": 1950
},
{
"epoch": 0.006168135567812014,
"grad_norm": 1.671875,
"learning_rate": 0.00012329793878825736,
"loss": 9.1947,
"step": 1975
},
{
"epoch": 0.006246213233227356,
"grad_norm": 1.671875,
"learning_rate": 0.00012485946283572768,
"loss": 9.2535,
"step": 2000
},
{
"epoch": 0.006246213233227356,
"eval_loss": 9.218317031860352,
"eval_runtime": 102.1917,
"eval_samples_per_second": 50.914,
"eval_steps_per_second": 3.19,
"step": 2000
},
{
"epoch": 0.006324290898642698,
"grad_norm": 1.625,
"learning_rate": 0.000126420986883198,
"loss": 9.2323,
"step": 2025
},
{
"epoch": 0.00640236856405804,
"grad_norm": 1.8203125,
"learning_rate": 0.00012798251093066833,
"loss": 9.2067,
"step": 2050
},
{
"epoch": 0.006480446229473382,
"grad_norm": 1.734375,
"learning_rate": 0.00012954403497813865,
"loss": 9.2166,
"step": 2075
},
{
"epoch": 0.006558523894888724,
"grad_norm": 2.109375,
"learning_rate": 0.000131105559025609,
"loss": 9.2106,
"step": 2100
},
{
"epoch": 0.006636601560304066,
"grad_norm": 1.9375,
"learning_rate": 0.0001326670830730793,
"loss": 9.1922,
"step": 2125
},
{
"epoch": 0.006714679225719408,
"grad_norm": 1.8046875,
"learning_rate": 0.00013422860712054965,
"loss": 9.1984,
"step": 2150
},
{
"epoch": 0.00679275689113475,
"grad_norm": 1.8515625,
"learning_rate": 0.00013579013116802,
"loss": 9.1911,
"step": 2175
},
{
"epoch": 0.006870834556550092,
"grad_norm": 2.53125,
"learning_rate": 0.0001373516552154903,
"loss": 9.1912,
"step": 2200
},
{
"epoch": 0.006948912221965434,
"grad_norm": 1.9375,
"learning_rate": 0.00013891317926296065,
"loss": 9.1964,
"step": 2225
},
{
"epoch": 0.007026989887380776,
"grad_norm": 2.421875,
"learning_rate": 0.000140474703310431,
"loss": 9.2259,
"step": 2250
},
{
"epoch": 0.0071050675527961175,
"grad_norm": 2.390625,
"learning_rate": 0.0001420362273579013,
"loss": 9.1996,
"step": 2275
},
{
"epoch": 0.0071831452182114595,
"grad_norm": 2.234375,
"learning_rate": 0.00014359775140537165,
"loss": 9.2401,
"step": 2300
},
{
"epoch": 0.007261222883626801,
"grad_norm": 2.140625,
"learning_rate": 0.00014515927545284197,
"loss": 9.2228,
"step": 2325
},
{
"epoch": 0.007339300549042143,
"grad_norm": 2.171875,
"learning_rate": 0.0001467207995003123,
"loss": 9.1847,
"step": 2350
},
{
"epoch": 0.007417378214457485,
"grad_norm": 1.984375,
"learning_rate": 0.00014828232354778266,
"loss": 9.1981,
"step": 2375
},
{
"epoch": 0.007495455879872827,
"grad_norm": 2.21875,
"learning_rate": 0.00014984384759525297,
"loss": 9.1894,
"step": 2400
},
{
"epoch": 0.007573533545288169,
"grad_norm": 2.1875,
"learning_rate": 0.00015140537164272331,
"loss": 9.2327,
"step": 2425
},
{
"epoch": 0.007651611210703511,
"grad_norm": 2.265625,
"learning_rate": 0.00015296689569019363,
"loss": 9.2134,
"step": 2450
},
{
"epoch": 0.007729688876118853,
"grad_norm": 2.375,
"learning_rate": 0.00015452841973766397,
"loss": 9.2026,
"step": 2475
},
{
"epoch": 0.007807766541534195,
"grad_norm": 2.515625,
"learning_rate": 0.00015608994378513432,
"loss": 9.194,
"step": 2500
},
{
"epoch": 0.007885844206949537,
"grad_norm": 2.328125,
"learning_rate": 0.0001576514678326046,
"loss": 9.2258,
"step": 2525
},
{
"epoch": 0.007963921872364879,
"grad_norm": 2.390625,
"learning_rate": 0.00015921299188007495,
"loss": 9.2412,
"step": 2550
},
{
"epoch": 0.00804199953778022,
"grad_norm": 2.578125,
"learning_rate": 0.0001607745159275453,
"loss": 9.2441,
"step": 2575
},
{
"epoch": 0.008120077203195563,
"grad_norm": 2.78125,
"learning_rate": 0.0001623360399750156,
"loss": 9.25,
"step": 2600
},
{
"epoch": 0.008198154868610905,
"grad_norm": 2.328125,
"learning_rate": 0.00016389756402248595,
"loss": 9.2576,
"step": 2625
},
{
"epoch": 0.008276232534026247,
"grad_norm": 2.6875,
"learning_rate": 0.00016545908806995626,
"loss": 9.2549,
"step": 2650
},
{
"epoch": 0.008354310199441588,
"grad_norm": 2.828125,
"learning_rate": 0.0001670206121174266,
"loss": 9.2837,
"step": 2675
},
{
"epoch": 0.00843238786485693,
"grad_norm": 2.96875,
"learning_rate": 0.00016858213616489695,
"loss": 9.2405,
"step": 2700
},
{
"epoch": 0.008510465530272272,
"grad_norm": 2.890625,
"learning_rate": 0.00017014366021236727,
"loss": 9.2927,
"step": 2725
},
{
"epoch": 0.008588543195687614,
"grad_norm": 2.953125,
"learning_rate": 0.0001717051842598376,
"loss": 9.2427,
"step": 2750
},
{
"epoch": 0.008666620861102956,
"grad_norm": 3.0,
"learning_rate": 0.00017326670830730792,
"loss": 9.3099,
"step": 2775
},
{
"epoch": 0.008744698526518298,
"grad_norm": 2.890625,
"learning_rate": 0.00017482823235477827,
"loss": 9.2786,
"step": 2800
},
{
"epoch": 0.00882277619193364,
"grad_norm": 3.875,
"learning_rate": 0.0001763897564022486,
"loss": 9.2615,
"step": 2825
},
{
"epoch": 0.008900853857348982,
"grad_norm": 2.84375,
"learning_rate": 0.00017795128044971893,
"loss": 9.3233,
"step": 2850
},
{
"epoch": 0.008978931522764324,
"grad_norm": 3.359375,
"learning_rate": 0.00017951280449718927,
"loss": 9.2634,
"step": 2875
},
{
"epoch": 0.009057009188179666,
"grad_norm": 3.171875,
"learning_rate": 0.00018107432854465959,
"loss": 9.3164,
"step": 2900
},
{
"epoch": 0.009135086853595008,
"grad_norm": 3.15625,
"learning_rate": 0.00018263585259212993,
"loss": 9.3274,
"step": 2925
},
{
"epoch": 0.00921316451901035,
"grad_norm": 4.0,
"learning_rate": 0.00018419737663960027,
"loss": 9.3091,
"step": 2950
},
{
"epoch": 0.009291242184425692,
"grad_norm": 3.03125,
"learning_rate": 0.0001857589006870706,
"loss": 9.317,
"step": 2975
},
{
"epoch": 0.009369319849841034,
"grad_norm": 3.109375,
"learning_rate": 0.00018732042473454093,
"loss": 9.3963,
"step": 3000
},
{
"epoch": 0.009369319849841034,
"eval_loss": 9.343441009521484,
"eval_runtime": 102.2757,
"eval_samples_per_second": 50.872,
"eval_steps_per_second": 3.187,
"step": 3000
},
{
"epoch": 0.009447397515256375,
"grad_norm": 3.671875,
"learning_rate": 0.00018888194878201127,
"loss": 9.3633,
"step": 3025
},
{
"epoch": 0.009525475180671717,
"grad_norm": 3.125,
"learning_rate": 0.00019044347282948156,
"loss": 9.335,
"step": 3050
},
{
"epoch": 0.00960355284608706,
"grad_norm": 3.140625,
"learning_rate": 0.0001920049968769519,
"loss": 9.3406,
"step": 3075
},
{
"epoch": 0.009681630511502401,
"grad_norm": 3.109375,
"learning_rate": 0.00019356652092442222,
"loss": 9.3829,
"step": 3100
},
{
"epoch": 0.009759708176917743,
"grad_norm": 3.90625,
"learning_rate": 0.00019512804497189256,
"loss": 9.3734,
"step": 3125
},
{
"epoch": 0.009837785842333085,
"grad_norm": 3.671875,
"learning_rate": 0.0001966895690193629,
"loss": 9.3608,
"step": 3150
},
{
"epoch": 0.009915863507748427,
"grad_norm": 3.359375,
"learning_rate": 0.00019825109306683322,
"loss": 9.3771,
"step": 3175
},
{
"epoch": 0.009993941173163769,
"grad_norm": 3.984375,
"learning_rate": 0.00019981261711430356,
"loss": 9.4023,
"step": 3200
},
{
"epoch": 0.010072018838579111,
"grad_norm": 3.875,
"learning_rate": 0.00020137414116177388,
"loss": 9.4179,
"step": 3225
},
{
"epoch": 0.010150096503994453,
"grad_norm": 3.765625,
"learning_rate": 0.00020293566520924422,
"loss": 9.407,
"step": 3250
},
{
"epoch": 0.010228174169409795,
"grad_norm": 5.0,
"learning_rate": 0.00020449718925671457,
"loss": 9.4284,
"step": 3275
},
{
"epoch": 0.010306251834825137,
"grad_norm": 3.84375,
"learning_rate": 0.00020605871330418488,
"loss": 9.4289,
"step": 3300
},
{
"epoch": 0.010384329500240479,
"grad_norm": 3.90625,
"learning_rate": 0.00020762023735165522,
"loss": 9.409,
"step": 3325
},
{
"epoch": 0.01046240716565582,
"grad_norm": 3.609375,
"learning_rate": 0.00020918176139912557,
"loss": 9.4343,
"step": 3350
},
{
"epoch": 0.010540484831071163,
"grad_norm": 4.15625,
"learning_rate": 0.00021074328544659588,
"loss": 9.4513,
"step": 3375
},
{
"epoch": 0.010618562496486504,
"grad_norm": 3.609375,
"learning_rate": 0.00021230480949406623,
"loss": 9.458,
"step": 3400
},
{
"epoch": 0.010696640161901846,
"grad_norm": 4.4375,
"learning_rate": 0.00021386633354153654,
"loss": 9.5056,
"step": 3425
},
{
"epoch": 0.010774717827317188,
"grad_norm": 3.859375,
"learning_rate": 0.00021542785758900688,
"loss": 9.4958,
"step": 3450
},
{
"epoch": 0.01085279549273253,
"grad_norm": 3.96875,
"learning_rate": 0.00021698938163647723,
"loss": 9.5275,
"step": 3475
},
{
"epoch": 0.010930873158147872,
"grad_norm": 4.46875,
"learning_rate": 0.00021855090568394754,
"loss": 9.4947,
"step": 3500
},
{
"epoch": 0.011008950823563214,
"grad_norm": 4.8125,
"learning_rate": 0.00022011242973141789,
"loss": 9.5044,
"step": 3525
},
{
"epoch": 0.011087028488978556,
"grad_norm": 4.5,
"learning_rate": 0.00022167395377888817,
"loss": 9.5233,
"step": 3550
},
{
"epoch": 0.011165106154393898,
"grad_norm": 4.1875,
"learning_rate": 0.00022323547782635852,
"loss": 9.5455,
"step": 3575
},
{
"epoch": 0.01124318381980924,
"grad_norm": 4.46875,
"learning_rate": 0.00022479700187382886,
"loss": 9.5962,
"step": 3600
},
{
"epoch": 0.011321261485224582,
"grad_norm": 4.46875,
"learning_rate": 0.00022635852592129918,
"loss": 9.5476,
"step": 3625
},
{
"epoch": 0.011399339150639924,
"grad_norm": 4.9375,
"learning_rate": 0.00022792004996876952,
"loss": 9.5762,
"step": 3650
},
{
"epoch": 0.011477416816055266,
"grad_norm": 4.84375,
"learning_rate": 0.00022948157401623983,
"loss": 9.6066,
"step": 3675
},
{
"epoch": 0.011555494481470608,
"grad_norm": 4.8125,
"learning_rate": 0.00023104309806371018,
"loss": 9.6445,
"step": 3700
},
{
"epoch": 0.01163357214688595,
"grad_norm": 4.6875,
"learning_rate": 0.00023260462211118052,
"loss": 9.6033,
"step": 3725
},
{
"epoch": 0.011711649812301292,
"grad_norm": 5.0,
"learning_rate": 0.00023416614615865084,
"loss": 9.6635,
"step": 3750
},
{
"epoch": 0.011789727477716633,
"grad_norm": 4.46875,
"learning_rate": 0.00023572767020612118,
"loss": 9.6236,
"step": 3775
},
{
"epoch": 0.011867805143131975,
"grad_norm": 5.28125,
"learning_rate": 0.00023728919425359152,
"loss": 9.6867,
"step": 3800
},
{
"epoch": 0.011945882808547317,
"grad_norm": 4.5,
"learning_rate": 0.00023885071830106184,
"loss": 9.6757,
"step": 3825
},
{
"epoch": 0.012023960473962661,
"grad_norm": 5.53125,
"learning_rate": 0.00024041224234853218,
"loss": 9.7058,
"step": 3850
},
{
"epoch": 0.012102038139378003,
"grad_norm": 5.71875,
"learning_rate": 0.0002419737663960025,
"loss": 9.7057,
"step": 3875
},
{
"epoch": 0.012180115804793345,
"grad_norm": 6.0,
"learning_rate": 0.00024353529044347284,
"loss": 9.7199,
"step": 3900
},
{
"epoch": 0.012258193470208687,
"grad_norm": 5.78125,
"learning_rate": 0.00024509681449094316,
"loss": 9.7453,
"step": 3925
},
{
"epoch": 0.012336271135624029,
"grad_norm": 4.96875,
"learning_rate": 0.00024665833853841347,
"loss": 9.7496,
"step": 3950
},
{
"epoch": 0.01241434880103937,
"grad_norm": 4.34375,
"learning_rate": 0.00024821986258588384,
"loss": 9.7802,
"step": 3975
},
{
"epoch": 0.012492426466454713,
"grad_norm": 5.5,
"learning_rate": 0.00024978138663335416,
"loss": 9.814,
"step": 4000
},
{
"epoch": 0.012492426466454713,
"eval_loss": 9.791647911071777,
"eval_runtime": 102.2247,
"eval_samples_per_second": 50.898,
"eval_steps_per_second": 3.189,
"step": 4000
},
{
"epoch": 0.012570504131870055,
"grad_norm": 6.0625,
"learning_rate": 0.00025134291068082447,
"loss": 9.785,
"step": 4025
},
{
"epoch": 0.012648581797285396,
"grad_norm": 4.875,
"learning_rate": 0.00025290443472829484,
"loss": 9.8043,
"step": 4050
},
{
"epoch": 0.012726659462700738,
"grad_norm": 6.03125,
"learning_rate": 0.00025446595877576516,
"loss": 9.8206,
"step": 4075
},
{
"epoch": 0.01280473712811608,
"grad_norm": 5.15625,
"learning_rate": 0.0002560274828232355,
"loss": 9.8267,
"step": 4100
},
{
"epoch": 0.012882814793531422,
"grad_norm": 5.5625,
"learning_rate": 0.0002575890068707058,
"loss": 9.8286,
"step": 4125
},
{
"epoch": 0.012960892458946764,
"grad_norm": 5.09375,
"learning_rate": 0.00025915053091817616,
"loss": 9.8346,
"step": 4150
},
{
"epoch": 0.013038970124362106,
"grad_norm": 6.53125,
"learning_rate": 0.0002607120549656465,
"loss": 9.8788,
"step": 4175
},
{
"epoch": 0.013117047789777448,
"grad_norm": 5.84375,
"learning_rate": 0.00026227357901311685,
"loss": 9.8549,
"step": 4200
},
{
"epoch": 0.01319512545519279,
"grad_norm": 5.3125,
"learning_rate": 0.0002638351030605871,
"loss": 9.8965,
"step": 4225
},
{
"epoch": 0.013273203120608132,
"grad_norm": 6.625,
"learning_rate": 0.0002653966271080575,
"loss": 9.936,
"step": 4250
},
{
"epoch": 0.013351280786023474,
"grad_norm": 5.6875,
"learning_rate": 0.0002669581511555278,
"loss": 9.9048,
"step": 4275
},
{
"epoch": 0.013429358451438816,
"grad_norm": 5.78125,
"learning_rate": 0.00026851967520299816,
"loss": 9.9304,
"step": 4300
},
{
"epoch": 0.013507436116854158,
"grad_norm": 5.90625,
"learning_rate": 0.0002700811992504685,
"loss": 9.9668,
"step": 4325
},
{
"epoch": 0.0135855137822695,
"grad_norm": 5.90625,
"learning_rate": 0.0002716427232979388,
"loss": 9.957,
"step": 4350
},
{
"epoch": 0.013663591447684842,
"grad_norm": 6.4375,
"learning_rate": 0.0002732042473454091,
"loss": 9.9809,
"step": 4375
},
{
"epoch": 0.013741669113100183,
"grad_norm": 6.96875,
"learning_rate": 0.0002747657713928794,
"loss": 9.9868,
"step": 4400
},
{
"epoch": 0.013819746778515525,
"grad_norm": 5.71875,
"learning_rate": 0.0002763272954403498,
"loss": 10.0066,
"step": 4425
},
{
"epoch": 0.013897824443930867,
"grad_norm": 5.46875,
"learning_rate": 0.0002778888194878201,
"loss": 9.9905,
"step": 4450
},
{
"epoch": 0.01397590210934621,
"grad_norm": 5.59375,
"learning_rate": 0.0002794503435352905,
"loss": 10.0002,
"step": 4475
},
{
"epoch": 0.014053979774761551,
"grad_norm": 7.5625,
"learning_rate": 0.00028101186758276074,
"loss": 10.0869,
"step": 4500
},
{
"epoch": 0.014132057440176893,
"grad_norm": 5.75,
"learning_rate": 0.0002825733916302311,
"loss": 10.0828,
"step": 4525
},
{
"epoch": 0.014210135105592235,
"grad_norm": 6.4375,
"learning_rate": 0.00028413491567770143,
"loss": 10.1158,
"step": 4550
},
{
"epoch": 0.014288212771007577,
"grad_norm": 6.59375,
"learning_rate": 0.0002856964397251718,
"loss": 10.1618,
"step": 4575
},
{
"epoch": 0.014366290436422919,
"grad_norm": 6.59375,
"learning_rate": 0.0002872579637726421,
"loss": 10.1651,
"step": 4600
},
{
"epoch": 0.014444368101838261,
"grad_norm": 6.9375,
"learning_rate": 0.00028881948782011243,
"loss": 10.1786,
"step": 4625
},
{
"epoch": 0.014522445767253603,
"grad_norm": 8.375,
"learning_rate": 0.00029038101186758275,
"loss": 10.1674,
"step": 4650
},
{
"epoch": 0.014600523432668945,
"grad_norm": 7.625,
"learning_rate": 0.0002919425359150531,
"loss": 10.1869,
"step": 4675
},
{
"epoch": 0.014678601098084287,
"grad_norm": 6.21875,
"learning_rate": 0.00029350405996252343,
"loss": 10.2085,
"step": 4700
},
{
"epoch": 0.014756678763499629,
"grad_norm": 5.5625,
"learning_rate": 0.0002950655840099938,
"loss": 10.2231,
"step": 4725
},
{
"epoch": 0.01483475642891497,
"grad_norm": 7.03125,
"learning_rate": 0.00029662710805746406,
"loss": 10.2671,
"step": 4750
},
{
"epoch": 0.014912834094330312,
"grad_norm": 7.5625,
"learning_rate": 0.00029818863210493443,
"loss": 10.3177,
"step": 4775
},
{
"epoch": 0.014990911759745654,
"grad_norm": 7.0,
"learning_rate": 0.00029975015615240475,
"loss": 10.3046,
"step": 4800
},
{
"epoch": 0.015068989425160996,
"grad_norm": 8.125,
"learning_rate": 0.0003013116801998751,
"loss": 10.3212,
"step": 4825
},
{
"epoch": 0.015147067090576338,
"grad_norm": 6.75,
"learning_rate": 0.00030287320424734543,
"loss": 10.2822,
"step": 4850
},
{
"epoch": 0.01522514475599168,
"grad_norm": 7.03125,
"learning_rate": 0.0003044347282948157,
"loss": 10.3131,
"step": 4875
},
{
"epoch": 0.015303222421407022,
"grad_norm": 7.1875,
"learning_rate": 0.00030599625234228607,
"loss": 10.3112,
"step": 4900
},
{
"epoch": 0.015381300086822364,
"grad_norm": 8.3125,
"learning_rate": 0.0003075577763897564,
"loss": 10.3796,
"step": 4925
},
{
"epoch": 0.015459377752237706,
"grad_norm": 6.8125,
"learning_rate": 0.00030911930043722675,
"loss": 10.4225,
"step": 4950
},
{
"epoch": 0.015537455417653048,
"grad_norm": 6.28125,
"learning_rate": 0.00031068082448469707,
"loss": 10.435,
"step": 4975
},
{
"epoch": 0.01561553308306839,
"grad_norm": 7.0625,
"learning_rate": 0.0003122423485321674,
"loss": 10.4029,
"step": 5000
},
{
"epoch": 0.01561553308306839,
"eval_loss": 10.434911727905273,
"eval_runtime": 102.2426,
"eval_samples_per_second": 50.889,
"eval_steps_per_second": 3.188,
"step": 5000
},
{
"epoch": 0.01569361074848373,
"grad_norm": 7.21875,
"learning_rate": 0.0003138038725796377,
"loss": 10.4435,
"step": 5025
},
{
"epoch": 0.015771688413899074,
"grad_norm": 6.53125,
"learning_rate": 0.00031536539662710807,
"loss": 10.4452,
"step": 5050
},
{
"epoch": 0.015849766079314414,
"grad_norm": 6.40625,
"learning_rate": 0.0003169269206745784,
"loss": 10.4211,
"step": 5075
},
{
"epoch": 0.015927843744729758,
"grad_norm": 8.25,
"learning_rate": 0.00031848844472204876,
"loss": 10.5069,
"step": 5100
},
{
"epoch": 0.0160059214101451,
"grad_norm": 8.625,
"learning_rate": 0.00032004996876951907,
"loss": 10.5109,
"step": 5125
},
{
"epoch": 0.01608399907556044,
"grad_norm": 7.6875,
"learning_rate": 0.0003216114928169894,
"loss": 10.5435,
"step": 5150
},
{
"epoch": 0.016162076740975785,
"grad_norm": 7.46875,
"learning_rate": 0.0003231730168644597,
"loss": 10.5252,
"step": 5175
},
{
"epoch": 0.016240154406391125,
"grad_norm": 9.0625,
"learning_rate": 0.00032473454091193007,
"loss": 10.5791,
"step": 5200
},
{
"epoch": 0.01631823207180647,
"grad_norm": 8.5625,
"learning_rate": 0.0003262960649594004,
"loss": 10.5973,
"step": 5225
},
{
"epoch": 0.01639630973722181,
"grad_norm": 8.375,
"learning_rate": 0.00032785758900687076,
"loss": 10.6523,
"step": 5250
},
{
"epoch": 0.016474387402637153,
"grad_norm": 7.375,
"learning_rate": 0.000329419113054341,
"loss": 10.5952,
"step": 5275
},
{
"epoch": 0.016552465068052493,
"grad_norm": 6.40625,
"learning_rate": 0.0003309806371018114,
"loss": 10.6144,
"step": 5300
},
{
"epoch": 0.016630542733467837,
"grad_norm": 7.34375,
"learning_rate": 0.0003325421611492817,
"loss": 10.6408,
"step": 5325
},
{
"epoch": 0.016708620398883177,
"grad_norm": 9.125,
"learning_rate": 0.0003341036851967521,
"loss": 10.687,
"step": 5350
},
{
"epoch": 0.01678669806429852,
"grad_norm": 8.375,
"learning_rate": 0.0003356652092442224,
"loss": 10.6634,
"step": 5375
},
{
"epoch": 0.01686477572971386,
"grad_norm": 7.65625,
"learning_rate": 0.00033722673329169265,
"loss": 10.7616,
"step": 5400
},
{
"epoch": 0.016942853395129204,
"grad_norm": 9.0,
"learning_rate": 0.000338788257339163,
"loss": 10.7149,
"step": 5425
},
{
"epoch": 0.017020931060544545,
"grad_norm": 9.5,
"learning_rate": 0.00034034978138663334,
"loss": 10.7277,
"step": 5450
},
{
"epoch": 0.01709900872595989,
"grad_norm": 8.625,
"learning_rate": 0.0003419113054341037,
"loss": 10.7305,
"step": 5475
},
{
"epoch": 0.01717708639137523,
"grad_norm": 8.25,
"learning_rate": 0.000343472829481574,
"loss": 10.7721,
"step": 5500
},
{
"epoch": 0.017255164056790572,
"grad_norm": 10.0625,
"learning_rate": 0.00034503435352904434,
"loss": 10.832,
"step": 5525
},
{
"epoch": 0.017333241722205912,
"grad_norm": 10.0,
"learning_rate": 0.00034659587757651466,
"loss": 10.8325,
"step": 5550
},
{
"epoch": 0.017411319387621256,
"grad_norm": 9.125,
"learning_rate": 0.000348157401623985,
"loss": 10.8516,
"step": 5575
},
{
"epoch": 0.017489397053036596,
"grad_norm": 9.125,
"learning_rate": 0.00034971892567145534,
"loss": 10.9156,
"step": 5600
},
{
"epoch": 0.01756747471845194,
"grad_norm": 9.0625,
"learning_rate": 0.0003512804497189257,
"loss": 10.9185,
"step": 5625
},
{
"epoch": 0.01764555238386728,
"grad_norm": 10.4375,
"learning_rate": 0.000352841973766396,
"loss": 10.9967,
"step": 5650
},
{
"epoch": 0.017723630049282624,
"grad_norm": 12.9375,
"learning_rate": 0.00035440349781386634,
"loss": 11.0233,
"step": 5675
},
{
"epoch": 0.017801707714697964,
"grad_norm": 10.0,
"learning_rate": 0.00035596502186133666,
"loss": 11.0479,
"step": 5700
},
{
"epoch": 0.017879785380113308,
"grad_norm": 9.6875,
"learning_rate": 0.00035752654590880703,
"loss": 10.9976,
"step": 5725
},
{
"epoch": 0.017957863045528648,
"grad_norm": 8.5625,
"learning_rate": 0.00035908806995627734,
"loss": 11.0774,
"step": 5750
},
{
"epoch": 0.01803594071094399,
"grad_norm": 10.125,
"learning_rate": 0.00036064959400374766,
"loss": 11.1011,
"step": 5775
},
{
"epoch": 0.01811401837635933,
"grad_norm": 8.3125,
"learning_rate": 0.000362211118051218,
"loss": 11.087,
"step": 5800
},
{
"epoch": 0.018192096041774675,
"grad_norm": 9.875,
"learning_rate": 0.00036377264209868835,
"loss": 11.1436,
"step": 5825
},
{
"epoch": 0.018270173707190016,
"grad_norm": 8.75,
"learning_rate": 0.00036533416614615866,
"loss": 11.1463,
"step": 5850
},
{
"epoch": 0.01834825137260536,
"grad_norm": 9.75,
"learning_rate": 0.00036689569019362903,
"loss": 11.1615,
"step": 5875
},
{
"epoch": 0.0184263290380207,
"grad_norm": 10.4375,
"learning_rate": 0.00036845721424109935,
"loss": 11.1622,
"step": 5900
},
{
"epoch": 0.018504406703436043,
"grad_norm": 9.1875,
"learning_rate": 0.0003700187382885696,
"loss": 11.2235,
"step": 5925
},
{
"epoch": 0.018582484368851383,
"grad_norm": 8.9375,
"learning_rate": 0.00037158026233604,
"loss": 11.2721,
"step": 5950
},
{
"epoch": 0.018660562034266727,
"grad_norm": 8.625,
"learning_rate": 0.0003731417863835103,
"loss": 11.2218,
"step": 5975
},
{
"epoch": 0.018738639699682067,
"grad_norm": 8.5,
"learning_rate": 0.00037470331043098067,
"loss": 11.2897,
"step": 6000
},
{
"epoch": 0.018738639699682067,
"eval_loss": 11.259696960449219,
"eval_runtime": 102.0975,
"eval_samples_per_second": 50.961,
"eval_steps_per_second": 3.193,
"step": 6000
},
{
"epoch": 0.01881671736509741,
"grad_norm": 10.1875,
"learning_rate": 0.000376264834478451,
"loss": 11.2667,
"step": 6025
},
{
"epoch": 0.01889479503051275,
"grad_norm": 9.4375,
"learning_rate": 0.0003778263585259213,
"loss": 11.2334,
"step": 6050
},
{
"epoch": 0.018972872695928095,
"grad_norm": 8.6875,
"learning_rate": 0.0003793878825733916,
"loss": 11.2445,
"step": 6075
},
{
"epoch": 0.019050950361343435,
"grad_norm": 9.0,
"learning_rate": 0.000380949406620862,
"loss": 11.2638,
"step": 6100
},
{
"epoch": 0.01912902802675878,
"grad_norm": 9.3125,
"learning_rate": 0.0003825109306683323,
"loss": 11.2733,
"step": 6125
},
{
"epoch": 0.01920710569217412,
"grad_norm": 8.875,
"learning_rate": 0.00038407245471580267,
"loss": 11.327,
"step": 6150
},
{
"epoch": 0.019285183357589462,
"grad_norm": 9.625,
"learning_rate": 0.00038563397876327293,
"loss": 11.3521,
"step": 6175
},
{
"epoch": 0.019363261023004803,
"grad_norm": 9.1875,
"learning_rate": 0.0003871955028107433,
"loss": 11.3203,
"step": 6200
},
{
"epoch": 0.019441338688420146,
"grad_norm": 10.25,
"learning_rate": 0.0003887570268582136,
"loss": 11.4162,
"step": 6225
},
{
"epoch": 0.019519416353835486,
"grad_norm": 11.0,
"learning_rate": 0.000390318550905684,
"loss": 11.4063,
"step": 6250
},
{
"epoch": 0.01959749401925083,
"grad_norm": 9.9375,
"learning_rate": 0.0003918800749531543,
"loss": 11.5209,
"step": 6275
},
{
"epoch": 0.01967557168466617,
"grad_norm": 9.25,
"learning_rate": 0.0003934415990006246,
"loss": 11.5018,
"step": 6300
},
{
"epoch": 0.019753649350081514,
"grad_norm": 12.25,
"learning_rate": 0.00039500312304809493,
"loss": 11.5067,
"step": 6325
},
{
"epoch": 0.019831727015496854,
"grad_norm": 10.8125,
"learning_rate": 0.0003965646470955653,
"loss": 11.5646,
"step": 6350
},
{
"epoch": 0.019909804680912198,
"grad_norm": 10.1875,
"learning_rate": 0.0003981261711430356,
"loss": 11.5575,
"step": 6375
},
{
"epoch": 0.019987882346327538,
"grad_norm": 10.5,
"learning_rate": 0.000399687695190506,
"loss": 11.634,
"step": 6400
},
{
"epoch": 0.02006596001174288,
"grad_norm": 10.5625,
"learning_rate": 0.00040124921923797625,
"loss": 11.7034,
"step": 6425
},
{
"epoch": 0.020144037677158222,
"grad_norm": 10.1875,
"learning_rate": 0.00040281074328544657,
"loss": 11.6978,
"step": 6450
},
{
"epoch": 0.020222115342573566,
"grad_norm": 12.125,
"learning_rate": 0.00040437226733291694,
"loss": 11.7471,
"step": 6475
},
{
"epoch": 0.020300193007988906,
"grad_norm": 10.8125,
"learning_rate": 0.00040593379138038725,
"loss": 11.861,
"step": 6500
},
{
"epoch": 0.02037827067340425,
"grad_norm": 9.8125,
"learning_rate": 0.0004074953154278576,
"loss": 11.7507,
"step": 6525
},
{
"epoch": 0.02045634833881959,
"grad_norm": 12.0,
"learning_rate": 0.0004090568394753279,
"loss": 11.8528,
"step": 6550
},
{
"epoch": 0.020534426004234933,
"grad_norm": 11.4375,
"learning_rate": 0.00041061836352279825,
"loss": 11.9091,
"step": 6575
},
{
"epoch": 0.020612503669650274,
"grad_norm": 11.1875,
"learning_rate": 0.00041217988757026857,
"loss": 11.8911,
"step": 6600
},
{
"epoch": 0.020690581335065617,
"grad_norm": 10.75,
"learning_rate": 0.00041374141161773894,
"loss": 11.9488,
"step": 6625
},
{
"epoch": 0.020768659000480957,
"grad_norm": 10.0,
"learning_rate": 0.00041530293566520925,
"loss": 12.016,
"step": 6650
},
{
"epoch": 0.0208467366658963,
"grad_norm": 9.75,
"learning_rate": 0.0004168644597126796,
"loss": 11.9987,
"step": 6675
},
{
"epoch": 0.02092481433131164,
"grad_norm": 9.875,
"learning_rate": 0.0004184259837601499,
"loss": 12.0069,
"step": 6700
},
{
"epoch": 0.021002891996726985,
"grad_norm": 10.9375,
"learning_rate": 0.00041998750780762026,
"loss": 11.9872,
"step": 6725
},
{
"epoch": 0.021080969662142325,
"grad_norm": 10.9375,
"learning_rate": 0.00042154903185509057,
"loss": 12.0362,
"step": 6750
},
{
"epoch": 0.02115904732755767,
"grad_norm": 10.25,
"learning_rate": 0.00042311055590256094,
"loss": 12.1232,
"step": 6775
},
{
"epoch": 0.02123712499297301,
"grad_norm": 10.375,
"learning_rate": 0.00042467207995003126,
"loss": 12.1279,
"step": 6800
},
{
"epoch": 0.021315202658388353,
"grad_norm": 12.5,
"learning_rate": 0.0004262336039975016,
"loss": 12.1096,
"step": 6825
},
{
"epoch": 0.021393280323803693,
"grad_norm": 12.125,
"learning_rate": 0.0004277951280449719,
"loss": 12.1568,
"step": 6850
},
{
"epoch": 0.021471357989219036,
"grad_norm": 10.375,
"learning_rate": 0.00042935665209244226,
"loss": 12.191,
"step": 6875
},
{
"epoch": 0.021549435654634377,
"grad_norm": 12.125,
"learning_rate": 0.0004309181761399126,
"loss": 12.3206,
"step": 6900
},
{
"epoch": 0.02162751332004972,
"grad_norm": 10.9375,
"learning_rate": 0.00043247970018738294,
"loss": 12.2622,
"step": 6925
},
{
"epoch": 0.02170559098546506,
"grad_norm": 11.8125,
"learning_rate": 0.0004340412242348532,
"loss": 12.2397,
"step": 6950
},
{
"epoch": 0.021783668650880404,
"grad_norm": 10.5625,
"learning_rate": 0.0004356027482823235,
"loss": 12.3172,
"step": 6975
},
{
"epoch": 0.021861746316295744,
"grad_norm": 11.5625,
"learning_rate": 0.0004371642723297939,
"loss": 12.3313,
"step": 7000
},
{
"epoch": 0.021861746316295744,
"eval_loss": 12.409297943115234,
"eval_runtime": 102.1563,
"eval_samples_per_second": 50.932,
"eval_steps_per_second": 3.191,
"step": 7000
},
{
"epoch": 0.021939823981711088,
"grad_norm": 12.0625,
"learning_rate": 0.0004387257963772642,
"loss": 12.4476,
"step": 7025
},
{
"epoch": 0.02201790164712643,
"grad_norm": 12.4375,
"learning_rate": 0.0004402873204247346,
"loss": 12.4722,
"step": 7050
},
{
"epoch": 0.022095979312541772,
"grad_norm": 13.5625,
"learning_rate": 0.00044184884447220484,
"loss": 12.4586,
"step": 7075
},
{
"epoch": 0.022174056977957112,
"grad_norm": 11.75,
"learning_rate": 0.0004434103685196752,
"loss": 12.4961,
"step": 7100
},
{
"epoch": 0.022252134643372456,
"grad_norm": 10.375,
"learning_rate": 0.0004449718925671455,
"loss": 12.5456,
"step": 7125
},
{
"epoch": 0.022330212308787796,
"grad_norm": 16.5,
"learning_rate": 0.0004465334166146159,
"loss": 12.5605,
"step": 7150
},
{
"epoch": 0.02240828997420314,
"grad_norm": 13.9375,
"learning_rate": 0.0004480949406620862,
"loss": 12.5607,
"step": 7175
},
{
"epoch": 0.02248636763961848,
"grad_norm": 12.25,
"learning_rate": 0.0004496564647095565,
"loss": 12.6064,
"step": 7200
},
{
"epoch": 0.022564445305033824,
"grad_norm": 13.875,
"learning_rate": 0.00045121798875702684,
"loss": 12.6238,
"step": 7225
},
{
"epoch": 0.022642522970449164,
"grad_norm": 12.1875,
"learning_rate": 0.0004527795128044972,
"loss": 12.6783,
"step": 7250
},
{
"epoch": 0.022720600635864507,
"grad_norm": 12.0,
"learning_rate": 0.00045434103685196753,
"loss": 12.6747,
"step": 7275
},
{
"epoch": 0.022798678301279848,
"grad_norm": 12.9375,
"learning_rate": 0.0004559025608994379,
"loss": 12.7325,
"step": 7300
},
{
"epoch": 0.02287675596669519,
"grad_norm": 10.75,
"learning_rate": 0.00045746408494690816,
"loss": 12.8587,
"step": 7325
},
{
"epoch": 0.02295483363211053,
"grad_norm": 14.5,
"learning_rate": 0.00045902560899437853,
"loss": 12.8184,
"step": 7350
},
{
"epoch": 0.023032911297525875,
"grad_norm": 12.3125,
"learning_rate": 0.00046058713304184885,
"loss": 12.8454,
"step": 7375
},
{
"epoch": 0.023110988962941215,
"grad_norm": 10.75,
"learning_rate": 0.0004621486570893192,
"loss": 12.8707,
"step": 7400
},
{
"epoch": 0.02318906662835656,
"grad_norm": 13.5625,
"learning_rate": 0.00046371018113678953,
"loss": 12.9231,
"step": 7425
},
{
"epoch": 0.0232671442937719,
"grad_norm": 12.125,
"learning_rate": 0.0004652717051842599,
"loss": 12.9452,
"step": 7450
},
{
"epoch": 0.023345221959187243,
"grad_norm": 17.0,
"learning_rate": 0.00046683322923173016,
"loss": 13.0146,
"step": 7475
},
{
"epoch": 0.023423299624602583,
"grad_norm": 13.4375,
"learning_rate": 0.0004683947532792005,
"loss": 12.9707,
"step": 7500
},
{
"epoch": 0.023501377290017927,
"grad_norm": 13.6875,
"learning_rate": 0.00046995627732667085,
"loss": 12.9913,
"step": 7525
},
{
"epoch": 0.023579454955433267,
"grad_norm": 21.125,
"learning_rate": 0.00047151780137414116,
"loss": 13.0038,
"step": 7550
},
{
"epoch": 0.02365753262084861,
"grad_norm": 14.25,
"learning_rate": 0.00047307932542161153,
"loss": 13.0291,
"step": 7575
},
{
"epoch": 0.02373561028626395,
"grad_norm": 13.5,
"learning_rate": 0.0004746408494690818,
"loss": 12.9994,
"step": 7600
},
{
"epoch": 0.023813687951679294,
"grad_norm": 12.3125,
"learning_rate": 0.00047620237351655217,
"loss": 13.0487,
"step": 7625
},
{
"epoch": 0.023891765617094635,
"grad_norm": 12.875,
"learning_rate": 0.0004777638975640225,
"loss": 13.0586,
"step": 7650
},
{
"epoch": 0.02396984328250998,
"grad_norm": 12.25,
"learning_rate": 0.00047932542161149285,
"loss": 13.0199,
"step": 7675
},
{
"epoch": 0.024047920947925322,
"grad_norm": 12.5625,
"learning_rate": 0.00048088694565896317,
"loss": 13.0435,
"step": 7700
},
{
"epoch": 0.024125998613340662,
"grad_norm": 14.0,
"learning_rate": 0.0004824484697064335,
"loss": 13.1292,
"step": 7725
},
{
"epoch": 0.024204076278756006,
"grad_norm": 17.625,
"learning_rate": 0.0004840099937539038,
"loss": 13.1447,
"step": 7750
},
{
"epoch": 0.024282153944171346,
"grad_norm": 14.1875,
"learning_rate": 0.00048557151780137417,
"loss": 13.1884,
"step": 7775
},
{
"epoch": 0.02436023160958669,
"grad_norm": 14.1875,
"learning_rate": 0.0004871330418488445,
"loss": 13.2918,
"step": 7800
},
{
"epoch": 0.02443830927500203,
"grad_norm": 12.75,
"learning_rate": 0.0004886945658963149,
"loss": 13.2436,
"step": 7825
},
{
"epoch": 0.024516386940417374,
"grad_norm": 12.875,
"learning_rate": 0.0004902560899437852,
"loss": 13.2654,
"step": 7850
},
{
"epoch": 0.024594464605832714,
"grad_norm": 13.8125,
"learning_rate": 0.0004918176139912555,
"loss": 13.3083,
"step": 7875
},
{
"epoch": 0.024672542271248057,
"grad_norm": 15.5,
"learning_rate": 0.0004933791380387258,
"loss": 13.3748,
"step": 7900
},
{
"epoch": 0.024750619936663398,
"grad_norm": 13.0625,
"learning_rate": 0.0004949406620861961,
"loss": 13.3709,
"step": 7925
},
{
"epoch": 0.02482869760207874,
"grad_norm": 14.375,
"learning_rate": 0.0004965021861336665,
"loss": 13.4115,
"step": 7950
},
{
"epoch": 0.02490677526749408,
"grad_norm": 14.5625,
"learning_rate": 0.0004980637101811367,
"loss": 13.5171,
"step": 7975
},
{
"epoch": 0.024984852932909425,
"grad_norm": 15.25,
"learning_rate": 0.0004996252342286071,
"loss": 13.4743,
"step": 8000
},
{
"epoch": 0.024984852932909425,
"eval_loss": 13.542752265930176,
"eval_runtime": 102.3748,
"eval_samples_per_second": 50.823,
"eval_steps_per_second": 3.184,
"step": 8000
},
{
"epoch": 0.025062930598324765,
"grad_norm": 14.25,
"learning_rate": 0.0005011867582760775,
"loss": 13.5245,
"step": 8025
},
{
"epoch": 0.02514100826374011,
"grad_norm": 15.375,
"learning_rate": 0.0005027482823235477,
"loss": 13.5847,
"step": 8050
},
{
"epoch": 0.02521908592915545,
"grad_norm": 15.5,
"learning_rate": 0.0005043098063710181,
"loss": 13.5861,
"step": 8075
},
{
"epoch": 0.025297163594570793,
"grad_norm": 14.0,
"learning_rate": 0.0005058713304184884,
"loss": 13.6261,
"step": 8100
},
{
"epoch": 0.025375241259986133,
"grad_norm": 12.875,
"learning_rate": 0.0005074328544659588,
"loss": 13.6362,
"step": 8125
},
{
"epoch": 0.025453318925401477,
"grad_norm": 15.4375,
"learning_rate": 0.0005089943785134291,
"loss": 13.7079,
"step": 8150
},
{
"epoch": 0.025531396590816817,
"grad_norm": 16.375,
"learning_rate": 0.0005105559025608995,
"loss": 13.7344,
"step": 8175
},
{
"epoch": 0.02560947425623216,
"grad_norm": 13.6875,
"learning_rate": 0.0005121174266083698,
"loss": 13.8572,
"step": 8200
},
{
"epoch": 0.0256875519216475,
"grad_norm": 15.4375,
"learning_rate": 0.0005136789506558401,
"loss": 13.9229,
"step": 8225
},
{
"epoch": 0.025765629587062844,
"grad_norm": 14.125,
"learning_rate": 0.0005152404747033104,
"loss": 13.976,
"step": 8250
},
{
"epoch": 0.025843707252478185,
"grad_norm": 13.6875,
"learning_rate": 0.0005168019987507809,
"loss": 13.9796,
"step": 8275
},
{
"epoch": 0.02592178491789353,
"grad_norm": 14.5625,
"learning_rate": 0.0005183635227982511,
"loss": 14.0409,
"step": 8300
},
{
"epoch": 0.02599986258330887,
"grad_norm": 14.0,
"learning_rate": 0.0005199250468457214,
"loss": 13.9807,
"step": 8325
},
{
"epoch": 0.026077940248724212,
"grad_norm": 15.9375,
"learning_rate": 0.0005214865708931917,
"loss": 14.1036,
"step": 8350
},
{
"epoch": 0.026156017914139552,
"grad_norm": 13.9375,
"learning_rate": 0.0005230480949406621,
"loss": 14.1808,
"step": 8375
},
{
"epoch": 0.026234095579554896,
"grad_norm": 14.75,
"learning_rate": 0.0005246096189881324,
"loss": 14.0815,
"step": 8400
},
{
"epoch": 0.026312173244970236,
"grad_norm": 17.375,
"learning_rate": 0.0005261711430356028,
"loss": 14.2371,
"step": 8425
},
{
"epoch": 0.02639025091038558,
"grad_norm": 19.0,
"learning_rate": 0.0005277326670830731,
"loss": 14.3598,
"step": 8450
},
{
"epoch": 0.02646832857580092,
"grad_norm": 15.5,
"learning_rate": 0.0005292941911305435,
"loss": 14.395,
"step": 8475
},
{
"epoch": 0.026546406241216264,
"grad_norm": 16.0,
"learning_rate": 0.0005308557151780138,
"loss": 14.4232,
"step": 8500
},
{
"epoch": 0.026624483906631604,
"grad_norm": 16.5,
"learning_rate": 0.0005324172392254841,
"loss": 14.498,
"step": 8525
},
{
"epoch": 0.026702561572046948,
"grad_norm": 15.875,
"learning_rate": 0.0005339787632729543,
"loss": 14.5161,
"step": 8550
},
{
"epoch": 0.026780639237462288,
"grad_norm": 16.625,
"learning_rate": 0.0005355402873204247,
"loss": 14.5657,
"step": 8575
},
{
"epoch": 0.02685871690287763,
"grad_norm": 16.25,
"learning_rate": 0.0005371018113678951,
"loss": 14.5677,
"step": 8600
},
{
"epoch": 0.02693679456829297,
"grad_norm": 16.25,
"learning_rate": 0.0005386633354153654,
"loss": 14.7238,
"step": 8625
},
{
"epoch": 0.027014872233708315,
"grad_norm": 19.75,
"learning_rate": 0.0005402248594628357,
"loss": 14.7521,
"step": 8650
},
{
"epoch": 0.027092949899123656,
"grad_norm": 17.5,
"learning_rate": 0.000541786383510306,
"loss": 14.81,
"step": 8675
},
{
"epoch": 0.027171027564539,
"grad_norm": 17.25,
"learning_rate": 0.0005433479075577764,
"loss": 14.8193,
"step": 8700
},
{
"epoch": 0.02724910522995434,
"grad_norm": 19.875,
"learning_rate": 0.0005449094316052468,
"loss": 14.7857,
"step": 8725
},
{
"epoch": 0.027327182895369683,
"grad_norm": 16.625,
"learning_rate": 0.0005464709556527171,
"loss": 14.8485,
"step": 8750
},
{
"epoch": 0.027405260560785023,
"grad_norm": 17.75,
"learning_rate": 0.0005480324797001874,
"loss": 14.8765,
"step": 8775
},
{
"epoch": 0.027483338226200367,
"grad_norm": 16.625,
"learning_rate": 0.0005495940037476578,
"loss": 14.8669,
"step": 8800
},
{
"epoch": 0.027561415891615707,
"grad_norm": 18.375,
"learning_rate": 0.000551155527795128,
"loss": 14.9237,
"step": 8825
},
{
"epoch": 0.02763949355703105,
"grad_norm": 18.75,
"learning_rate": 0.0005527170518425983,
"loss": 15.0786,
"step": 8850
},
{
"epoch": 0.02771757122244639,
"grad_norm": 18.625,
"learning_rate": 0.0005542785758900687,
"loss": 14.9841,
"step": 8875
},
{
"epoch": 0.027795648887861735,
"grad_norm": 16.375,
"learning_rate": 0.0005558400999375391,
"loss": 15.1071,
"step": 8900
},
{
"epoch": 0.027873726553277075,
"grad_norm": 18.0,
"learning_rate": 0.0005574016239850094,
"loss": 15.0875,
"step": 8925
},
{
"epoch": 0.02795180421869242,
"grad_norm": 16.75,
"learning_rate": 0.0005589631480324797,
"loss": 15.1422,
"step": 8950
},
{
"epoch": 0.02802988188410776,
"grad_norm": 16.125,
"learning_rate": 0.00056052467207995,
"loss": 15.1989,
"step": 8975
},
{
"epoch": 0.028107959549523102,
"grad_norm": 17.625,
"learning_rate": 0.0005620861961274205,
"loss": 15.1135,
"step": 9000
},
{
"epoch": 0.028107959549523102,
"eval_loss": 15.24026107788086,
"eval_runtime": 102.3475,
"eval_samples_per_second": 50.837,
"eval_steps_per_second": 3.185,
"step": 9000
},
{
"epoch": 0.028186037214938443,
"grad_norm": 15.4375,
"learning_rate": 0.0005636477201748908,
"loss": 15.2318,
"step": 9025
},
{
"epoch": 0.028264114880353786,
"grad_norm": 16.125,
"learning_rate": 0.0005652092442223611,
"loss": 15.1658,
"step": 9050
},
{
"epoch": 0.028342192545769126,
"grad_norm": 16.5,
"learning_rate": 0.0005667707682698313,
"loss": 15.2762,
"step": 9075
},
{
"epoch": 0.02842027021118447,
"grad_norm": 15.375,
"learning_rate": 0.0005683322923173016,
"loss": 15.1555,
"step": 9100
},
{
"epoch": 0.02849834787659981,
"grad_norm": 16.75,
"learning_rate": 0.000569893816364772,
"loss": 15.1879,
"step": 9125
},
{
"epoch": 0.028576425542015154,
"grad_norm": 15.75,
"learning_rate": 0.0005714553404122423,
"loss": 15.1291,
"step": 9150
},
{
"epoch": 0.028654503207430494,
"grad_norm": 16.375,
"learning_rate": 0.0005730168644597127,
"loss": 15.2669,
"step": 9175
},
{
"epoch": 0.028732580872845838,
"grad_norm": 15.625,
"learning_rate": 0.000574578388507183,
"loss": 15.2461,
"step": 9200
},
{
"epoch": 0.028810658538261178,
"grad_norm": 16.25,
"learning_rate": 0.0005761399125546534,
"loss": 15.1608,
"step": 9225
},
{
"epoch": 0.028888736203676522,
"grad_norm": 16.625,
"learning_rate": 0.0005777014366021237,
"loss": 15.2685,
"step": 9250
},
{
"epoch": 0.028966813869091862,
"grad_norm": 17.375,
"learning_rate": 0.000579262960649594,
"loss": 15.3043,
"step": 9275
},
{
"epoch": 0.029044891534507206,
"grad_norm": 16.625,
"learning_rate": 0.0005808244846970644,
"loss": 15.251,
"step": 9300
},
{
"epoch": 0.029122969199922546,
"grad_norm": 17.875,
"learning_rate": 0.0005823860087445347,
"loss": 15.3825,
"step": 9325
},
{
"epoch": 0.02920104686533789,
"grad_norm": 17.625,
"learning_rate": 0.000583947532792005,
"loss": 15.2931,
"step": 9350
},
{
"epoch": 0.02927912453075323,
"grad_norm": 17.375,
"learning_rate": 0.0005855090568394753,
"loss": 15.3777,
"step": 9375
},
{
"epoch": 0.029357202196168573,
"grad_norm": 17.5,
"learning_rate": 0.0005870705808869456,
"loss": 15.369,
"step": 9400
},
{
"epoch": 0.029435279861583914,
"grad_norm": 17.75,
"learning_rate": 0.000588632104934416,
"loss": 15.2992,
"step": 9425
},
{
"epoch": 0.029513357526999257,
"grad_norm": 15.625,
"learning_rate": 0.0005901936289818864,
"loss": 15.3261,
"step": 9450
},
{
"epoch": 0.029591435192414597,
"grad_norm": 19.375,
"learning_rate": 0.0005917551530293567,
"loss": 15.2748,
"step": 9475
},
{
"epoch": 0.02966951285782994,
"grad_norm": 17.875,
"learning_rate": 0.000593316677076827,
"loss": 15.2702,
"step": 9500
},
{
"epoch": 0.02974759052324528,
"grad_norm": 20.625,
"learning_rate": 0.0005948782011242974,
"loss": 15.3905,
"step": 9525
},
{
"epoch": 0.029825668188660625,
"grad_norm": 16.5,
"learning_rate": 0.0005964397251717677,
"loss": 15.3166,
"step": 9550
},
{
"epoch": 0.029903745854075965,
"grad_norm": 17.125,
"learning_rate": 0.000598001249219238,
"loss": 15.3973,
"step": 9575
},
{
"epoch": 0.02998182351949131,
"grad_norm": 15.4375,
"learning_rate": 0.0005995627732667083,
"loss": 15.3621,
"step": 9600
},
{
"epoch": 0.03005990118490665,
"grad_norm": 17.375,
"learning_rate": 0.0006011242973141786,
"loss": 15.532,
"step": 9625
},
{
"epoch": 0.030137978850321993,
"grad_norm": 21.125,
"learning_rate": 0.000602685821361649,
"loss": 15.6577,
"step": 9650
},
{
"epoch": 0.030216056515737333,
"grad_norm": 16.5,
"learning_rate": 0.0006042473454091193,
"loss": 15.7684,
"step": 9675
},
{
"epoch": 0.030294134181152677,
"grad_norm": 17.125,
"learning_rate": 0.0006058088694565896,
"loss": 15.8286,
"step": 9700
},
{
"epoch": 0.030372211846568017,
"grad_norm": 17.625,
"learning_rate": 0.0006073703935040599,
"loss": 15.9935,
"step": 9725
},
{
"epoch": 0.03045028951198336,
"grad_norm": 20.25,
"learning_rate": 0.0006089319175515304,
"loss": 16.0198,
"step": 9750
},
{
"epoch": 0.0305283671773987,
"grad_norm": 17.875,
"learning_rate": 0.0006104934415990007,
"loss": 16.0932,
"step": 9775
},
{
"epoch": 0.030606444842814044,
"grad_norm": 21.75,
"learning_rate": 0.000612054965646471,
"loss": 16.0459,
"step": 9800
},
{
"epoch": 0.030684522508229384,
"grad_norm": 20.25,
"learning_rate": 0.0006136164896939413,
"loss": 16.0705,
"step": 9825
},
{
"epoch": 0.030762600173644728,
"grad_norm": 18.625,
"learning_rate": 0.0006151780137414116,
"loss": 16.0394,
"step": 9850
},
{
"epoch": 0.03084067783906007,
"grad_norm": 16.75,
"learning_rate": 0.0006167395377888819,
"loss": 16.0319,
"step": 9875
},
{
"epoch": 0.030918755504475412,
"grad_norm": 16.375,
"learning_rate": 0.0006183010618363523,
"loss": 16.0178,
"step": 9900
},
{
"epoch": 0.030996833169890752,
"grad_norm": 26.375,
"learning_rate": 0.0006198625858838226,
"loss": 16.1248,
"step": 9925
},
{
"epoch": 0.031074910835306096,
"grad_norm": 18.375,
"learning_rate": 0.000621424109931293,
"loss": 16.1081,
"step": 9950
},
{
"epoch": 0.031152988500721436,
"grad_norm": 18.25,
"learning_rate": 0.0006229856339787633,
"loss": 16.195,
"step": 9975
},
{
"epoch": 0.03123106616613678,
"grad_norm": 19.375,
"learning_rate": 0.0006245471580262336,
"loss": 16.2205,
"step": 10000
},
{
"epoch": 0.03123106616613678,
"eval_loss": 16.29703140258789,
"eval_runtime": 102.3113,
"eval_samples_per_second": 50.855,
"eval_steps_per_second": 3.186,
"step": 10000
},
{
"epoch": 0.03130914383155212,
"grad_norm": 17.375,
"learning_rate": 0.0006261086820737039,
"loss": 16.2662,
"step": 10025
},
{
"epoch": 0.03138722149696746,
"grad_norm": 25.625,
"learning_rate": 0.0006276702061211744,
"loss": 16.2528,
"step": 10050
},
{
"epoch": 0.031465299162382804,
"grad_norm": 19.5,
"learning_rate": 0.0006292317301686447,
"loss": 16.3618,
"step": 10075
},
{
"epoch": 0.03154337682779815,
"grad_norm": 19.25,
"learning_rate": 0.0006307932542161149,
"loss": 16.4928,
"step": 10100
},
{
"epoch": 0.03162145449321349,
"grad_norm": 20.125,
"learning_rate": 0.0006323547782635852,
"loss": 16.4938,
"step": 10125
},
{
"epoch": 0.03169953215862883,
"grad_norm": 22.5,
"learning_rate": 0.0006339163023110555,
"loss": 16.5037,
"step": 10150
},
{
"epoch": 0.03177760982404417,
"grad_norm": 22.0,
"learning_rate": 0.000635477826358526,
"loss": 16.5408,
"step": 10175
},
{
"epoch": 0.031855687489459515,
"grad_norm": 22.875,
"learning_rate": 0.0006370393504059963,
"loss": 16.6573,
"step": 10200
},
{
"epoch": 0.03193376515487486,
"grad_norm": 20.625,
"learning_rate": 0.0006386008744534666,
"loss": 16.608,
"step": 10225
},
{
"epoch": 0.0320118428202902,
"grad_norm": 18.375,
"learning_rate": 0.0006401623985009369,
"loss": 16.6253,
"step": 10250
},
{
"epoch": 0.03208992048570554,
"grad_norm": 22.75,
"learning_rate": 0.0006417239225484073,
"loss": 16.7264,
"step": 10275
},
{
"epoch": 0.03216799815112088,
"grad_norm": 22.75,
"learning_rate": 0.0006432854465958776,
"loss": 16.7937,
"step": 10300
},
{
"epoch": 0.03224607581653623,
"grad_norm": 18.375,
"learning_rate": 0.000644846970643348,
"loss": 16.8422,
"step": 10325
},
{
"epoch": 0.03232415348195157,
"grad_norm": 19.625,
"learning_rate": 0.0006464084946908183,
"loss": 16.8992,
"step": 10350
},
{
"epoch": 0.03240223114736691,
"grad_norm": 18.375,
"learning_rate": 0.0006479700187382886,
"loss": 16.7641,
"step": 10375
},
{
"epoch": 0.03248030881278225,
"grad_norm": 19.25,
"learning_rate": 0.0006495315427857589,
"loss": 16.9004,
"step": 10400
},
{
"epoch": 0.032558386478197594,
"grad_norm": 20.0,
"learning_rate": 0.0006510930668332292,
"loss": 16.8661,
"step": 10425
},
{
"epoch": 0.03263646414361294,
"grad_norm": 18.5,
"learning_rate": 0.0006526545908806995,
"loss": 16.717,
"step": 10450
},
{
"epoch": 0.032714541809028275,
"grad_norm": 21.25,
"learning_rate": 0.00065421611492817,
"loss": 16.7414,
"step": 10475
},
{
"epoch": 0.03279261947444362,
"grad_norm": 19.0,
"learning_rate": 0.0006557776389756403,
"loss": 16.751,
"step": 10500
},
{
"epoch": 0.03287069713985896,
"grad_norm": 19.625,
"learning_rate": 0.0006573391630231106,
"loss": 16.761,
"step": 10525
},
{
"epoch": 0.032948774805274306,
"grad_norm": 18.625,
"learning_rate": 0.0006589006870705809,
"loss": 16.7134,
"step": 10550
},
{
"epoch": 0.03302685247068964,
"grad_norm": 19.5,
"learning_rate": 0.0006604622111180513,
"loss": 16.8445,
"step": 10575
},
{
"epoch": 0.033104930136104986,
"grad_norm": 18.125,
"learning_rate": 0.0006620237351655216,
"loss": 16.923,
"step": 10600
},
{
"epoch": 0.03318300780152033,
"grad_norm": 21.75,
"learning_rate": 0.0006635852592129918,
"loss": 17.0445,
"step": 10625
},
{
"epoch": 0.03326108546693567,
"grad_norm": 20.5,
"learning_rate": 0.0006651467832604622,
"loss": 17.1396,
"step": 10650
},
{
"epoch": 0.03333916313235101,
"grad_norm": 20.5,
"learning_rate": 0.0006667083073079325,
"loss": 17.0307,
"step": 10675
},
{
"epoch": 0.033417240797766354,
"grad_norm": 18.875,
"learning_rate": 0.0006682698313554029,
"loss": 17.0284,
"step": 10700
},
{
"epoch": 0.0334953184631817,
"grad_norm": 18.75,
"learning_rate": 0.0006698313554028732,
"loss": 17.1508,
"step": 10725
},
{
"epoch": 0.03357339612859704,
"grad_norm": 24.625,
"learning_rate": 0.0006713928794503435,
"loss": 17.1699,
"step": 10750
},
{
"epoch": 0.03365147379401238,
"grad_norm": 29.0,
"learning_rate": 0.0006729544034978139,
"loss": 17.2827,
"step": 10775
},
{
"epoch": 0.03372955145942772,
"grad_norm": 22.25,
"learning_rate": 0.0006745159275452843,
"loss": 17.4336,
"step": 10800
},
{
"epoch": 0.033807629124843065,
"grad_norm": 19.75,
"learning_rate": 0.0006760774515927546,
"loss": 17.2731,
"step": 10825
},
{
"epoch": 0.03388570679025841,
"grad_norm": 20.875,
"learning_rate": 0.0006776389756402249,
"loss": 17.3691,
"step": 10850
},
{
"epoch": 0.033963784455673746,
"grad_norm": 20.875,
"learning_rate": 0.0006792004996876951,
"loss": 17.5287,
"step": 10875
},
{
"epoch": 0.03404186212108909,
"grad_norm": 19.25,
"learning_rate": 0.0006807620237351655,
"loss": 17.5784,
"step": 10900
},
{
"epoch": 0.03411993978650443,
"grad_norm": 26.625,
"learning_rate": 0.0006823235477826359,
"loss": 17.5465,
"step": 10925
},
{
"epoch": 0.03419801745191978,
"grad_norm": 21.75,
"learning_rate": 0.0006838850718301062,
"loss": 17.5995,
"step": 10950
},
{
"epoch": 0.03427609511733511,
"grad_norm": 20.625,
"learning_rate": 0.0006854465958775765,
"loss": 17.8194,
"step": 10975
},
{
"epoch": 0.03435417278275046,
"grad_norm": 21.125,
"learning_rate": 0.0006870081199250469,
"loss": 17.8228,
"step": 11000
},
{
"epoch": 0.03435417278275046,
"eval_loss": 17.8679141998291,
"eval_runtime": 102.2521,
"eval_samples_per_second": 50.884,
"eval_steps_per_second": 3.188,
"step": 11000
},
{
"epoch": 0.0344322504481658,
"grad_norm": 20.25,
"learning_rate": 0.0006885696439725172,
"loss": 17.8434,
"step": 11025
},
{
"epoch": 0.034510328113581144,
"grad_norm": 21.25,
"learning_rate": 0.0006901311680199875,
"loss": 17.8783,
"step": 11050
},
{
"epoch": 0.03458840577899648,
"grad_norm": 24.25,
"learning_rate": 0.0006916926920674579,
"loss": 18.0649,
"step": 11075
},
{
"epoch": 0.034666483444411825,
"grad_norm": 20.625,
"learning_rate": 0.0006932542161149283,
"loss": 18.0142,
"step": 11100
},
{
"epoch": 0.03474456110982717,
"grad_norm": 21.25,
"learning_rate": 0.0006948157401623986,
"loss": 18.1228,
"step": 11125
},
{
"epoch": 0.03482263877524251,
"grad_norm": 22.625,
"learning_rate": 0.0006963772642098688,
"loss": 18.2188,
"step": 11150
},
{
"epoch": 0.03490071644065785,
"grad_norm": 24.625,
"learning_rate": 0.0006979387882573391,
"loss": 18.5369,
"step": 11175
},
{
"epoch": 0.03497879410607319,
"grad_norm": 23.625,
"learning_rate": 0.0006995003123048094,
"loss": 18.6513,
"step": 11200
},
{
"epoch": 0.035056871771488536,
"grad_norm": 23.125,
"learning_rate": 0.0007010618363522799,
"loss": 18.6154,
"step": 11225
},
{
"epoch": 0.03513494943690388,
"grad_norm": 23.75,
"learning_rate": 0.0007026233603997502,
"loss": 18.5765,
"step": 11250
},
{
"epoch": 0.035213027102319217,
"grad_norm": 24.25,
"learning_rate": 0.0007041848844472205,
"loss": 18.6452,
"step": 11275
},
{
"epoch": 0.03529110476773456,
"grad_norm": 23.5,
"learning_rate": 0.0007057464084946908,
"loss": 18.5797,
"step": 11300
},
{
"epoch": 0.035369182433149904,
"grad_norm": 25.0,
"learning_rate": 0.0007073079325421612,
"loss": 18.5652,
"step": 11325
},
{
"epoch": 0.03544726009856525,
"grad_norm": 29.625,
"learning_rate": 0.0007088694565896315,
"loss": 18.623,
"step": 11350
},
{
"epoch": 0.035525337763980584,
"grad_norm": 25.875,
"learning_rate": 0.0007104309806371019,
"loss": 18.6827,
"step": 11375
},
{
"epoch": 0.03560341542939593,
"grad_norm": 25.0,
"learning_rate": 0.0007119925046845721,
"loss": 18.8077,
"step": 11400
},
{
"epoch": 0.03568149309481127,
"grad_norm": 26.75,
"learning_rate": 0.0007135540287320425,
"loss": 18.7854,
"step": 11425
},
{
"epoch": 0.035759570760226615,
"grad_norm": 26.125,
"learning_rate": 0.0007151155527795128,
"loss": 18.8622,
"step": 11450
},
{
"epoch": 0.03583764842564195,
"grad_norm": 24.125,
"learning_rate": 0.0007166770768269831,
"loss": 18.8723,
"step": 11475
},
{
"epoch": 0.035915726091057296,
"grad_norm": 23.75,
"learning_rate": 0.0007182386008744534,
"loss": 18.8899,
"step": 11500
},
{
"epoch": 0.03599380375647264,
"grad_norm": 24.375,
"learning_rate": 0.0007198001249219239,
"loss": 18.9028,
"step": 11525
},
{
"epoch": 0.03607188142188798,
"grad_norm": 24.5,
"learning_rate": 0.0007213616489693942,
"loss": 18.91,
"step": 11550
},
{
"epoch": 0.03614995908730332,
"grad_norm": 24.625,
"learning_rate": 0.0007229231730168645,
"loss": 18.8747,
"step": 11575
},
{
"epoch": 0.03622803675271866,
"grad_norm": 27.375,
"learning_rate": 0.0007244846970643348,
"loss": 18.7471,
"step": 11600
},
{
"epoch": 0.03630611441813401,
"grad_norm": 21.375,
"learning_rate": 0.0007260462211118051,
"loss": 18.8455,
"step": 11625
},
{
"epoch": 0.03638419208354935,
"grad_norm": 23.375,
"learning_rate": 0.0007276077451592754,
"loss": 18.8257,
"step": 11650
},
{
"epoch": 0.03646226974896469,
"grad_norm": 19.875,
"learning_rate": 0.0007291692692067458,
"loss": 18.8365,
"step": 11675
},
{
"epoch": 0.03654034741438003,
"grad_norm": 24.375,
"learning_rate": 0.0007307307932542161,
"loss": 18.847,
"step": 11700
},
{
"epoch": 0.036618425079795375,
"grad_norm": 23.5,
"learning_rate": 0.0007322923173016864,
"loss": 18.9123,
"step": 11725
},
{
"epoch": 0.03669650274521072,
"grad_norm": 22.0,
"learning_rate": 0.0007338538413491568,
"loss": 19.0867,
"step": 11750
},
{
"epoch": 0.036774580410626055,
"grad_norm": 21.75,
"learning_rate": 0.0007354153653966271,
"loss": 19.0067,
"step": 11775
},
{
"epoch": 0.0368526580760414,
"grad_norm": 22.5,
"learning_rate": 0.0007369768894440974,
"loss": 19.1682,
"step": 11800
},
{
"epoch": 0.03693073574145674,
"grad_norm": 23.625,
"learning_rate": 0.0007385384134915678,
"loss": 19.3881,
"step": 11825
},
{
"epoch": 0.037008813406872086,
"grad_norm": 22.5,
"learning_rate": 0.0007400999375390382,
"loss": 19.3211,
"step": 11850
},
{
"epoch": 0.03708689107228742,
"grad_norm": 24.875,
"learning_rate": 0.0007416614615865085,
"loss": 19.4715,
"step": 11875
},
{
"epoch": 0.03716496873770277,
"grad_norm": 23.125,
"learning_rate": 0.0007432229856339788,
"loss": 19.4858,
"step": 11900
},
{
"epoch": 0.03724304640311811,
"grad_norm": 22.75,
"learning_rate": 0.000744784509681449,
"loss": 19.7193,
"step": 11925
},
{
"epoch": 0.037321124068533454,
"grad_norm": 21.375,
"learning_rate": 0.0007463460337289195,
"loss": 19.6023,
"step": 11950
},
{
"epoch": 0.03739920173394879,
"grad_norm": 27.375,
"learning_rate": 0.0007479075577763898,
"loss": 19.6003,
"step": 11975
},
{
"epoch": 0.037477279399364134,
"grad_norm": 23.75,
"learning_rate": 0.0007494690818238601,
"loss": 19.7391,
"step": 12000
},
{
"epoch": 0.037477279399364134,
"eval_loss": 19.823862075805664,
"eval_runtime": 102.3056,
"eval_samples_per_second": 50.857,
"eval_steps_per_second": 3.187,
"step": 12000
},
{
"epoch": 0.03755535706477948,
"grad_norm": 23.0,
"learning_rate": 0.0007510306058713304,
"loss": 19.8624,
"step": 12025
},
{
"epoch": 0.03763343473019482,
"grad_norm": 22.5,
"learning_rate": 0.0007525921299188008,
"loss": 19.7457,
"step": 12050
},
{
"epoch": 0.03771151239561016,
"grad_norm": 23.375,
"learning_rate": 0.0007541536539662711,
"loss": 19.83,
"step": 12075
},
{
"epoch": 0.0377895900610255,
"grad_norm": 25.625,
"learning_rate": 0.0007557151780137415,
"loss": 19.8478,
"step": 12100
},
{
"epoch": 0.037867667726440846,
"grad_norm": 23.5,
"learning_rate": 0.0007572767020612118,
"loss": 19.8336,
"step": 12125
},
{
"epoch": 0.03794574539185619,
"grad_norm": 24.75,
"learning_rate": 0.0007588382261086821,
"loss": 19.9314,
"step": 12150
},
{
"epoch": 0.038023823057271526,
"grad_norm": 22.375,
"learning_rate": 0.0007603997501561524,
"loss": 19.9643,
"step": 12175
},
{
"epoch": 0.03810190072268687,
"grad_norm": 25.0,
"learning_rate": 0.0007619612742036227,
"loss": 19.9156,
"step": 12200
},
{
"epoch": 0.03817997838810221,
"grad_norm": 22.125,
"learning_rate": 0.000763522798251093,
"loss": 19.9121,
"step": 12225
},
{
"epoch": 0.03825805605351756,
"grad_norm": 23.0,
"learning_rate": 0.0007650843222985633,
"loss": 19.7906,
"step": 12250
},
{
"epoch": 0.038336133718932894,
"grad_norm": 23.625,
"learning_rate": 0.0007666458463460338,
"loss": 19.911,
"step": 12275
},
{
"epoch": 0.03841421138434824,
"grad_norm": 25.125,
"learning_rate": 0.0007682073703935041,
"loss": 19.7616,
"step": 12300
},
{
"epoch": 0.03849228904976358,
"grad_norm": 26.375,
"learning_rate": 0.0007697688944409744,
"loss": 20.0897,
"step": 12325
},
{
"epoch": 0.038570366715178925,
"grad_norm": 23.0,
"learning_rate": 0.0007713304184884447,
"loss": 19.9698,
"step": 12350
},
{
"epoch": 0.03864844438059426,
"grad_norm": 26.75,
"learning_rate": 0.0007728919425359151,
"loss": 20.0703,
"step": 12375
},
{
"epoch": 0.038726522046009605,
"grad_norm": 25.25,
"learning_rate": 0.0007744534665833855,
"loss": 20.0981,
"step": 12400
},
{
"epoch": 0.03880459971142495,
"grad_norm": 30.125,
"learning_rate": 0.0007760149906308557,
"loss": 20.1253,
"step": 12425
},
{
"epoch": 0.03888267737684029,
"grad_norm": 27.125,
"learning_rate": 0.000777576514678326,
"loss": 20.3167,
"step": 12450
},
{
"epoch": 0.03896075504225563,
"grad_norm": 26.5,
"learning_rate": 0.0007791380387257964,
"loss": 20.4952,
"step": 12475
},
{
"epoch": 0.03903883270767097,
"grad_norm": 32.75,
"learning_rate": 0.0007806995627732667,
"loss": 20.614,
"step": 12500
},
{
"epoch": 0.03911691037308632,
"grad_norm": 28.0,
"learning_rate": 0.000782261086820737,
"loss": 20.8281,
"step": 12525
},
{
"epoch": 0.03919498803850166,
"grad_norm": 27.625,
"learning_rate": 0.0007838226108682074,
"loss": 20.8424,
"step": 12550
},
{
"epoch": 0.039273065703917,
"grad_norm": 37.25,
"learning_rate": 0.0007853841349156778,
"loss": 20.9891,
"step": 12575
},
{
"epoch": 0.03935114336933234,
"grad_norm": 28.75,
"learning_rate": 0.0007869456589631481,
"loss": 20.8581,
"step": 12600
},
{
"epoch": 0.039429221034747684,
"grad_norm": 27.375,
"learning_rate": 0.0007885071830106184,
"loss": 20.8655,
"step": 12625
},
{
"epoch": 0.03950729870016303,
"grad_norm": 29.5,
"learning_rate": 0.0007900687070580887,
"loss": 21.0043,
"step": 12650
},
{
"epoch": 0.039585376365578365,
"grad_norm": 33.5,
"learning_rate": 0.000791630231105559,
"loss": 21.1795,
"step": 12675
},
{
"epoch": 0.03966345403099371,
"grad_norm": 28.625,
"learning_rate": 0.0007931917551530294,
"loss": 21.1842,
"step": 12700
},
{
"epoch": 0.03974153169640905,
"grad_norm": 25.125,
"learning_rate": 0.0007947532792004997,
"loss": 21.1583,
"step": 12725
},
{
"epoch": 0.039819609361824396,
"grad_norm": 28.875,
"learning_rate": 0.00079631480324797,
"loss": 21.2881,
"step": 12750
},
{
"epoch": 0.03989768702723974,
"grad_norm": 30.875,
"learning_rate": 0.0007978763272954403,
"loss": 21.265,
"step": 12775
},
{
"epoch": 0.039975764692655076,
"grad_norm": 30.25,
"learning_rate": 0.0007994378513429107,
"loss": 21.3535,
"step": 12800
},
{
"epoch": 0.04005384235807042,
"grad_norm": 29.75,
"learning_rate": 0.000800999375390381,
"loss": 21.4447,
"step": 12825
},
{
"epoch": 0.04013192002348576,
"grad_norm": 30.75,
"learning_rate": 0.0008025608994378514,
"loss": 21.5344,
"step": 12850
},
{
"epoch": 0.04020999768890111,
"grad_norm": 33.75,
"learning_rate": 0.0008041224234853217,
"loss": 21.3934,
"step": 12875
},
{
"epoch": 0.040288075354316444,
"grad_norm": 45.0,
"learning_rate": 0.0008056839475327921,
"loss": 21.7891,
"step": 12900
},
{
"epoch": 0.04036615301973179,
"grad_norm": 28.125,
"learning_rate": 0.0008072454715802624,
"loss": 22.0609,
"step": 12925
},
{
"epoch": 0.04044423068514713,
"grad_norm": 32.0,
"learning_rate": 0.0008088069956277326,
"loss": 21.915,
"step": 12950
},
{
"epoch": 0.040522308350562475,
"grad_norm": 27.875,
"learning_rate": 0.0008103685196752029,
"loss": 21.9726,
"step": 12975
},
{
"epoch": 0.04060038601597781,
"grad_norm": 27.625,
"learning_rate": 0.0008119300437226734,
"loss": 21.9425,
"step": 13000
},
{
"epoch": 0.04060038601597781,
"eval_loss": 22.01194190979004,
"eval_runtime": 102.3317,
"eval_samples_per_second": 50.844,
"eval_steps_per_second": 3.186,
"step": 13000
},
{
"epoch": 0.040678463681393155,
"grad_norm": 35.75,
"learning_rate": 0.0008134915677701437,
"loss": 21.8983,
"step": 13025
},
{
"epoch": 0.0407565413468085,
"grad_norm": 31.375,
"learning_rate": 0.000815053091817614,
"loss": 22.2354,
"step": 13050
},
{
"epoch": 0.04083461901222384,
"grad_norm": 28.375,
"learning_rate": 0.0008166146158650843,
"loss": 22.3099,
"step": 13075
},
{
"epoch": 0.04091269667763918,
"grad_norm": 34.5,
"learning_rate": 0.0008181761399125547,
"loss": 22.3739,
"step": 13100
},
{
"epoch": 0.04099077434305452,
"grad_norm": 29.5,
"learning_rate": 0.000819737663960025,
"loss": 22.4598,
"step": 13125
},
{
"epoch": 0.04106885200846987,
"grad_norm": 32.25,
"learning_rate": 0.0008212991880074954,
"loss": 22.7993,
"step": 13150
},
{
"epoch": 0.04114692967388521,
"grad_norm": 31.375,
"learning_rate": 0.0008228607120549657,
"loss": 22.7376,
"step": 13175
},
{
"epoch": 0.04122500733930055,
"grad_norm": 31.5,
"learning_rate": 0.0008244222361024359,
"loss": 22.6221,
"step": 13200
},
{
"epoch": 0.04130308500471589,
"grad_norm": 29.375,
"learning_rate": 0.0008259837601499063,
"loss": 22.6237,
"step": 13225
},
{
"epoch": 0.041381162670131234,
"grad_norm": 28.0,
"learning_rate": 0.0008275452841973766,
"loss": 22.4565,
"step": 13250
},
{
"epoch": 0.04145924033554658,
"grad_norm": 28.875,
"learning_rate": 0.0008291068082448469,
"loss": 22.4236,
"step": 13275
},
{
"epoch": 0.041537318000961915,
"grad_norm": 26.625,
"learning_rate": 0.0008306683322923173,
"loss": 22.4627,
"step": 13300
},
{
"epoch": 0.04161539566637726,
"grad_norm": 30.75,
"learning_rate": 0.0008322298563397877,
"loss": 22.5395,
"step": 13325
},
{
"epoch": 0.0416934733317926,
"grad_norm": 29.375,
"learning_rate": 0.000833791380387258,
"loss": 22.4437,
"step": 13350
},
{
"epoch": 0.041771550997207946,
"grad_norm": 26.375,
"learning_rate": 0.0008353529044347283,
"loss": 22.5234,
"step": 13375
},
{
"epoch": 0.04184962866262328,
"grad_norm": 28.0,
"learning_rate": 0.0008369144284821986,
"loss": 22.9237,
"step": 13400
},
{
"epoch": 0.041927706328038626,
"grad_norm": 32.0,
"learning_rate": 0.0008384759525296691,
"loss": 22.9402,
"step": 13425
},
{
"epoch": 0.04200578399345397,
"grad_norm": 30.375,
"learning_rate": 0.0008400374765771394,
"loss": 23.1061,
"step": 13450
},
{
"epoch": 0.04208386165886931,
"grad_norm": 32.0,
"learning_rate": 0.0008415990006246096,
"loss": 22.9162,
"step": 13475
},
{
"epoch": 0.04216193932428465,
"grad_norm": 30.25,
"learning_rate": 0.0008431605246720799,
"loss": 23.2072,
"step": 13500
},
{
"epoch": 0.042240016989699994,
"grad_norm": 31.375,
"learning_rate": 0.0008447220487195503,
"loss": 23.2287,
"step": 13525
},
{
"epoch": 0.04231809465511534,
"grad_norm": 29.5,
"learning_rate": 0.0008462835727670206,
"loss": 23.1901,
"step": 13550
},
{
"epoch": 0.04239617232053068,
"grad_norm": 28.25,
"learning_rate": 0.000847845096814491,
"loss": 23.3087,
"step": 13575
},
{
"epoch": 0.04247424998594602,
"grad_norm": 33.75,
"learning_rate": 0.0008494066208619613,
"loss": 23.5178,
"step": 13600
},
{
"epoch": 0.04255232765136136,
"grad_norm": 27.875,
"learning_rate": 0.0008509681449094317,
"loss": 23.4003,
"step": 13625
},
{
"epoch": 0.042630405316776705,
"grad_norm": 26.25,
"learning_rate": 0.000852529668956902,
"loss": 23.4554,
"step": 13650
},
{
"epoch": 0.04270848298219205,
"grad_norm": 24.875,
"learning_rate": 0.0008540911930043723,
"loss": 23.3269,
"step": 13675
},
{
"epoch": 0.042786560647607386,
"grad_norm": 24.875,
"learning_rate": 0.0008556527170518426,
"loss": 23.2309,
"step": 13700
},
{
"epoch": 0.04286463831302273,
"grad_norm": 29.375,
"learning_rate": 0.0008572142410993128,
"loss": 23.0514,
"step": 13725
},
{
"epoch": 0.04294271597843807,
"grad_norm": 27.125,
"learning_rate": 0.0008587757651467833,
"loss": 22.984,
"step": 13750
},
{
"epoch": 0.04302079364385342,
"grad_norm": 30.25,
"learning_rate": 0.0008603372891942536,
"loss": 22.9465,
"step": 13775
},
{
"epoch": 0.04309887130926875,
"grad_norm": 30.0,
"learning_rate": 0.0008618988132417239,
"loss": 23.0119,
"step": 13800
},
{
"epoch": 0.0431769489746841,
"grad_norm": 30.25,
"learning_rate": 0.0008634603372891942,
"loss": 22.9153,
"step": 13825
},
{
"epoch": 0.04325502664009944,
"grad_norm": 25.25,
"learning_rate": 0.0008650218613366646,
"loss": 23.1027,
"step": 13850
},
{
"epoch": 0.043333104305514784,
"grad_norm": 33.75,
"learning_rate": 0.000866583385384135,
"loss": 23.0265,
"step": 13875
},
{
"epoch": 0.04341118197093012,
"grad_norm": 29.625,
"learning_rate": 0.0008681449094316053,
"loss": 23.1209,
"step": 13900
},
{
"epoch": 0.043489259636345465,
"grad_norm": 30.0,
"learning_rate": 0.0008697064334790756,
"loss": 23.2931,
"step": 13925
},
{
"epoch": 0.04356733730176081,
"grad_norm": 31.25,
"learning_rate": 0.000871267957526546,
"loss": 23.6223,
"step": 13950
},
{
"epoch": 0.04364541496717615,
"grad_norm": 29.125,
"learning_rate": 0.0008728294815740162,
"loss": 23.4989,
"step": 13975
},
{
"epoch": 0.04372349263259149,
"grad_norm": 30.125,
"learning_rate": 0.0008743910056214865,
"loss": 23.923,
"step": 14000
},
{
"epoch": 0.04372349263259149,
"eval_loss": 23.799776077270508,
"eval_runtime": 102.2075,
"eval_samples_per_second": 50.906,
"eval_steps_per_second": 3.19,
"step": 14000
},
{
"epoch": 0.04380157029800683,
"grad_norm": 32.0,
"learning_rate": 0.0008759525296689569,
"loss": 23.9569,
"step": 14025
},
{
"epoch": 0.043879647963422176,
"grad_norm": 30.75,
"learning_rate": 0.0008775140537164273,
"loss": 23.764,
"step": 14050
},
{
"epoch": 0.04395772562883752,
"grad_norm": 29.75,
"learning_rate": 0.0008790755777638976,
"loss": 23.4492,
"step": 14075
},
{
"epoch": 0.04403580329425286,
"grad_norm": 28.125,
"learning_rate": 0.0008806371018113679,
"loss": 23.5056,
"step": 14100
},
{
"epoch": 0.0441138809596682,
"grad_norm": 31.25,
"learning_rate": 0.0008821986258588382,
"loss": 23.7418,
"step": 14125
},
{
"epoch": 0.044191958625083544,
"grad_norm": 31.0,
"learning_rate": 0.0008837601499063086,
"loss": 23.7158,
"step": 14150
},
{
"epoch": 0.04427003629049889,
"grad_norm": 35.25,
"learning_rate": 0.000885321673953779,
"loss": 24.0083,
"step": 14175
},
{
"epoch": 0.044348113955914224,
"grad_norm": 34.75,
"learning_rate": 0.0008868831980012493,
"loss": 23.95,
"step": 14200
},
{
"epoch": 0.04442619162132957,
"grad_norm": 34.5,
"learning_rate": 0.0008884447220487196,
"loss": 24.0242,
"step": 14225
},
{
"epoch": 0.04450426928674491,
"grad_norm": 34.0,
"learning_rate": 0.0008900062460961898,
"loss": 24.2818,
"step": 14250
},
{
"epoch": 0.044582346952160255,
"grad_norm": 32.25,
"learning_rate": 0.0008915677701436602,
"loss": 24.4323,
"step": 14275
},
{
"epoch": 0.04466042461757559,
"grad_norm": 31.125,
"learning_rate": 0.0008931292941911305,
"loss": 24.8361,
"step": 14300
},
{
"epoch": 0.044738502282990936,
"grad_norm": 32.5,
"learning_rate": 0.0008946908182386009,
"loss": 24.8959,
"step": 14325
},
{
"epoch": 0.04481657994840628,
"grad_norm": 30.75,
"learning_rate": 0.0008962523422860712,
"loss": 24.7795,
"step": 14350
},
{
"epoch": 0.04489465761382162,
"grad_norm": 45.75,
"learning_rate": 0.0008978138663335416,
"loss": 24.9758,
"step": 14375
},
{
"epoch": 0.04497273527923696,
"grad_norm": 31.625,
"learning_rate": 0.0008993753903810119,
"loss": 25.0523,
"step": 14400
},
{
"epoch": 0.0450508129446523,
"grad_norm": 36.25,
"learning_rate": 0.0009009369144284822,
"loss": 24.8775,
"step": 14425
},
{
"epoch": 0.04512889061006765,
"grad_norm": 34.25,
"learning_rate": 0.0009024984384759525,
"loss": 24.9395,
"step": 14450
},
{
"epoch": 0.04520696827548299,
"grad_norm": 36.5,
"learning_rate": 0.000904059962523423,
"loss": 25.0047,
"step": 14475
},
{
"epoch": 0.04528504594089833,
"grad_norm": 39.75,
"learning_rate": 0.0009056214865708932,
"loss": 24.9923,
"step": 14500
},
{
"epoch": 0.04536312360631367,
"grad_norm": 32.75,
"learning_rate": 0.0009071830106183635,
"loss": 25.1583,
"step": 14525
},
{
"epoch": 0.045441201271729015,
"grad_norm": 31.25,
"learning_rate": 0.0009087445346658338,
"loss": 25.1936,
"step": 14550
},
{
"epoch": 0.04551927893714436,
"grad_norm": 31.75,
"learning_rate": 0.0009103060587133042,
"loss": 24.9059,
"step": 14575
},
{
"epoch": 0.045597356602559695,
"grad_norm": 34.0,
"learning_rate": 0.0009118675827607745,
"loss": 25.1417,
"step": 14600
},
{
"epoch": 0.04567543426797504,
"grad_norm": 32.25,
"learning_rate": 0.0009134291068082449,
"loss": 25.2183,
"step": 14625
},
{
"epoch": 0.04575351193339038,
"grad_norm": 35.25,
"learning_rate": 0.0009149906308557152,
"loss": 25.3087,
"step": 14650
},
{
"epoch": 0.045831589598805726,
"grad_norm": 31.5,
"learning_rate": 0.0009165521549031856,
"loss": 25.6569,
"step": 14675
},
{
"epoch": 0.04590966726422106,
"grad_norm": 40.5,
"learning_rate": 0.0009181136789506559,
"loss": 25.9421,
"step": 14700
},
{
"epoch": 0.04598774492963641,
"grad_norm": 39.75,
"learning_rate": 0.0009196752029981262,
"loss": 26.0395,
"step": 14725
},
{
"epoch": 0.04606582259505175,
"grad_norm": 41.25,
"learning_rate": 0.0009212367270455964,
"loss": 26.0641,
"step": 14750
},
{
"epoch": 0.046143900260467094,
"grad_norm": 39.5,
"learning_rate": 0.0009227982510930668,
"loss": 26.1332,
"step": 14775
},
{
"epoch": 0.04622197792588243,
"grad_norm": 36.5,
"learning_rate": 0.0009243597751405372,
"loss": 26.102,
"step": 14800
},
{
"epoch": 0.046300055591297774,
"grad_norm": 33.75,
"learning_rate": 0.0009259212991880075,
"loss": 26.1986,
"step": 14825
},
{
"epoch": 0.04637813325671312,
"grad_norm": 36.5,
"learning_rate": 0.0009274828232354778,
"loss": 26.0567,
"step": 14850
},
{
"epoch": 0.04645621092212846,
"grad_norm": 38.0,
"learning_rate": 0.0009290443472829481,
"loss": 26.2836,
"step": 14875
},
{
"epoch": 0.0465342885875438,
"grad_norm": 36.0,
"learning_rate": 0.0009306058713304186,
"loss": 26.6167,
"step": 14900
},
{
"epoch": 0.04661236625295914,
"grad_norm": 44.25,
"learning_rate": 0.0009321673953778889,
"loss": 26.4313,
"step": 14925
},
{
"epoch": 0.046690443918374486,
"grad_norm": 36.25,
"learning_rate": 0.0009337289194253592,
"loss": 26.1888,
"step": 14950
},
{
"epoch": 0.04676852158378983,
"grad_norm": 37.5,
"learning_rate": 0.0009352904434728295,
"loss": 26.2063,
"step": 14975
},
{
"epoch": 0.046846599249205166,
"grad_norm": 36.25,
"learning_rate": 0.0009368519675202999,
"loss": 26.3716,
"step": 15000
},
{
"epoch": 0.046846599249205166,
"eval_loss": 26.39820098876953,
"eval_runtime": 102.1335,
"eval_samples_per_second": 50.943,
"eval_steps_per_second": 3.192,
"step": 15000
},
{
"epoch": 0.04692467691462051,
"grad_norm": 35.5,
"learning_rate": 0.0009384134915677701,
"loss": 26.4646,
"step": 15025
},
{
"epoch": 0.04700275458003585,
"grad_norm": 37.25,
"learning_rate": 0.0009399750156152404,
"loss": 26.457,
"step": 15050
},
{
"epoch": 0.0470808322454512,
"grad_norm": 43.25,
"learning_rate": 0.0009415365396627108,
"loss": 26.4532,
"step": 15075
},
{
"epoch": 0.047158909910866534,
"grad_norm": 34.75,
"learning_rate": 0.0009430980637101812,
"loss": 26.32,
"step": 15100
},
{
"epoch": 0.04723698757628188,
"grad_norm": 58.5,
"learning_rate": 0.0009446595877576515,
"loss": 26.367,
"step": 15125
},
{
"epoch": 0.04731506524169722,
"grad_norm": 44.25,
"learning_rate": 0.0009462211118051218,
"loss": 26.4783,
"step": 15150
},
{
"epoch": 0.047393142907112565,
"grad_norm": 35.25,
"learning_rate": 0.0009477826358525921,
"loss": 26.3163,
"step": 15175
},
{
"epoch": 0.0474712205725279,
"grad_norm": 36.0,
"learning_rate": 0.0009493441599000626,
"loss": 26.6294,
"step": 15200
},
{
"epoch": 0.047549298237943245,
"grad_norm": 38.25,
"learning_rate": 0.0009509056839475329,
"loss": 26.6693,
"step": 15225
},
{
"epoch": 0.04762737590335859,
"grad_norm": 42.25,
"learning_rate": 0.0009524672079950032,
"loss": 26.9737,
"step": 15250
},
{
"epoch": 0.04770545356877393,
"grad_norm": 33.75,
"learning_rate": 0.0009540287320424734,
"loss": 26.9355,
"step": 15275
},
{
"epoch": 0.04778353123418927,
"grad_norm": 37.75,
"learning_rate": 0.0009555902560899437,
"loss": 27.0918,
"step": 15300
},
{
"epoch": 0.04786160889960461,
"grad_norm": 37.5,
"learning_rate": 0.0009571517801374141,
"loss": 27.2465,
"step": 15325
},
{
"epoch": 0.04793968656501996,
"grad_norm": 35.25,
"learning_rate": 0.0009587133041848845,
"loss": 27.1683,
"step": 15350
},
{
"epoch": 0.0480177642304353,
"grad_norm": 35.75,
"learning_rate": 0.0009602748282323548,
"loss": 27.0435,
"step": 15375
},
{
"epoch": 0.048095841895850644,
"grad_norm": 39.0,
"learning_rate": 0.0009618363522798251,
"loss": 27.2943,
"step": 15400
},
{
"epoch": 0.04817391956126598,
"grad_norm": 37.5,
"learning_rate": 0.0009633978763272955,
"loss": 27.1815,
"step": 15425
},
{
"epoch": 0.048251997226681324,
"grad_norm": 38.75,
"learning_rate": 0.0009649594003747658,
"loss": 27.2386,
"step": 15450
},
{
"epoch": 0.04833007489209667,
"grad_norm": 43.0,
"learning_rate": 0.0009665209244222361,
"loss": 27.5126,
"step": 15475
},
{
"epoch": 0.04840815255751201,
"grad_norm": 44.75,
"learning_rate": 0.0009680824484697065,
"loss": 27.6576,
"step": 15500
},
{
"epoch": 0.04848623022292735,
"grad_norm": 39.75,
"learning_rate": 0.0009696439725171768,
"loss": 27.6394,
"step": 15525
},
{
"epoch": 0.04856430788834269,
"grad_norm": 46.5,
"learning_rate": 0.0009712054965646471,
"loss": 27.9862,
"step": 15550
},
{
"epoch": 0.048642385553758036,
"grad_norm": 36.5,
"learning_rate": 0.0009727670206121174,
"loss": 27.6303,
"step": 15575
},
{
"epoch": 0.04872046321917338,
"grad_norm": 36.25,
"learning_rate": 0.0009743285446595877,
"loss": 27.6376,
"step": 15600
},
{
"epoch": 0.048798540884588716,
"grad_norm": 37.25,
"learning_rate": 0.0009758900687070581,
"loss": 27.792,
"step": 15625
},
{
"epoch": 0.04887661855000406,
"grad_norm": 37.0,
"learning_rate": 0.0009774515927545285,
"loss": 27.8976,
"step": 15650
},
{
"epoch": 0.048954696215419403,
"grad_norm": 44.75,
"learning_rate": 0.0009790131168019988,
"loss": 28.1314,
"step": 15675
},
{
"epoch": 0.04903277388083475,
"grad_norm": 41.5,
"learning_rate": 0.000980574640849469,
"loss": 28.1346,
"step": 15700
},
{
"epoch": 0.049110851546250084,
"grad_norm": 42.0,
"learning_rate": 0.0009821361648969394,
"loss": 28.3701,
"step": 15725
},
{
"epoch": 0.04918892921166543,
"grad_norm": 38.5,
"learning_rate": 0.0009836976889444097,
"loss": 28.2846,
"step": 15750
},
{
"epoch": 0.04926700687708077,
"grad_norm": 39.75,
"learning_rate": 0.0009852592129918803,
"loss": 28.4163,
"step": 15775
},
{
"epoch": 0.049345084542496115,
"grad_norm": 37.25,
"learning_rate": 0.0009868207370393504,
"loss": 28.4691,
"step": 15800
},
{
"epoch": 0.04942316220791145,
"grad_norm": 40.75,
"learning_rate": 0.0009883822610868207,
"loss": 28.3626,
"step": 15825
},
{
"epoch": 0.049501239873326795,
"grad_norm": 38.75,
"learning_rate": 0.000989943785134291,
"loss": 28.2031,
"step": 15850
},
{
"epoch": 0.04957931753874214,
"grad_norm": 54.25,
"learning_rate": 0.0009915053091817613,
"loss": 28.2261,
"step": 15875
},
{
"epoch": 0.04965739520415748,
"grad_norm": 37.0,
"learning_rate": 0.0009930668332292318,
"loss": 27.9449,
"step": 15900
},
{
"epoch": 0.04973547286957282,
"grad_norm": 34.25,
"learning_rate": 0.0009946283572767022,
"loss": 27.9998,
"step": 15925
},
{
"epoch": 0.04981355053498816,
"grad_norm": 38.5,
"learning_rate": 0.0009961898813241725,
"loss": 27.9713,
"step": 15950
},
{
"epoch": 0.04989162820040351,
"grad_norm": 34.5,
"learning_rate": 0.0009977514053716428,
"loss": 28.3091,
"step": 15975
},
{
"epoch": 0.04996970586581885,
"grad_norm": 39.75,
"learning_rate": 0.000999312929419113,
"loss": 28.207,
"step": 16000
},
{
"epoch": 0.04996970586581885,
"eval_loss": 28.333789825439453,
"eval_runtime": 102.3237,
"eval_samples_per_second": 50.848,
"eval_steps_per_second": 3.186,
"step": 16000
},
{
"epoch": 0.05004778353123419,
"grad_norm": 37.0,
"learning_rate": 0.000999999994773354,
"loss": 28.2375,
"step": 16025
},
{
"epoch": 0.05012586119664953,
"grad_norm": 45.5,
"learning_rate": 0.0009999999594401602,
"loss": 28.1327,
"step": 16050
},
{
"epoch": 0.050203938862064874,
"grad_norm": 42.75,
"learning_rate": 0.0009999998907737678,
"loss": 28.3186,
"step": 16075
},
{
"epoch": 0.05028201652748022,
"grad_norm": 35.0,
"learning_rate": 0.0009999997887741804,
"loss": 27.9558,
"step": 16100
},
{
"epoch": 0.050360094192895555,
"grad_norm": 47.75,
"learning_rate": 0.0009999996534414057,
"loss": 28.2493,
"step": 16125
},
{
"epoch": 0.0504381718583109,
"grad_norm": 40.25,
"learning_rate": 0.000999999484775452,
"loss": 28.1951,
"step": 16150
},
{
"epoch": 0.05051624952372624,
"grad_norm": 36.25,
"learning_rate": 0.000999999282776331,
"loss": 28.3094,
"step": 16175
},
{
"epoch": 0.050594327189141586,
"grad_norm": 36.25,
"learning_rate": 0.000999999047444056,
"loss": 28.579,
"step": 16200
},
{
"epoch": 0.05067240485455692,
"grad_norm": 38.5,
"learning_rate": 0.0009999987787786427,
"loss": 28.4296,
"step": 16225
},
{
"epoch": 0.050750482519972266,
"grad_norm": 43.0,
"learning_rate": 0.0009999984767801089,
"loss": 28.686,
"step": 16250
},
{
"epoch": 0.05082856018538761,
"grad_norm": 46.25,
"learning_rate": 0.0009999981414484749,
"loss": 28.5111,
"step": 16275
},
{
"epoch": 0.050906637850802954,
"grad_norm": 44.75,
"learning_rate": 0.000999997772783763,
"loss": 28.7081,
"step": 16300
},
{
"epoch": 0.05098471551621829,
"grad_norm": 43.0,
"learning_rate": 0.0009999973707859977,
"loss": 28.9352,
"step": 16325
},
{
"epoch": 0.051062793181633634,
"grad_norm": 42.0,
"learning_rate": 0.000999996935455206,
"loss": 28.8936,
"step": 16350
},
{
"epoch": 0.05114087084704898,
"grad_norm": 37.25,
"learning_rate": 0.0009999964667914167,
"loss": 28.9924,
"step": 16375
},
{
"epoch": 0.05121894851246432,
"grad_norm": 37.75,
"learning_rate": 0.0009999959647946613,
"loss": 28.6103,
"step": 16400
},
{
"epoch": 0.05129702617787966,
"grad_norm": 36.25,
"learning_rate": 0.0009999954294649732,
"loss": 28.7174,
"step": 16425
},
{
"epoch": 0.051375103843295,
"grad_norm": 48.25,
"learning_rate": 0.0009999948608023876,
"loss": 28.5916,
"step": 16450
},
{
"epoch": 0.051453181508710345,
"grad_norm": 37.0,
"learning_rate": 0.0009999942588069433,
"loss": 28.8703,
"step": 16475
},
{
"epoch": 0.05153125917412569,
"grad_norm": 37.75,
"learning_rate": 0.0009999936234786795,
"loss": 29.1448,
"step": 16500
},
{
"epoch": 0.051609336839541026,
"grad_norm": 36.25,
"learning_rate": 0.0009999929548176391,
"loss": 28.8964,
"step": 16525
},
{
"epoch": 0.05168741450495637,
"grad_norm": 34.75,
"learning_rate": 0.0009999922528238668,
"loss": 28.6221,
"step": 16550
},
{
"epoch": 0.05176549217037171,
"grad_norm": 39.5,
"learning_rate": 0.000999991517497409,
"loss": 28.9006,
"step": 16575
},
{
"epoch": 0.05184356983578706,
"grad_norm": 35.25,
"learning_rate": 0.0009999907488383148,
"loss": 28.6834,
"step": 16600
},
{
"epoch": 0.05192164750120239,
"grad_norm": 36.25,
"learning_rate": 0.0009999899468466358,
"loss": 28.4863,
"step": 16625
},
{
"epoch": 0.05199972516661774,
"grad_norm": 34.75,
"learning_rate": 0.0009999891115224251,
"loss": 28.381,
"step": 16650
},
{
"epoch": 0.05207780283203308,
"grad_norm": 42.25,
"learning_rate": 0.0009999882428657384,
"loss": 28.4007,
"step": 16675
},
{
"epoch": 0.052155880497448424,
"grad_norm": 49.25,
"learning_rate": 0.0009999873408766337,
"loss": 28.3731,
"step": 16700
},
{
"epoch": 0.05223395816286376,
"grad_norm": 45.0,
"learning_rate": 0.0009999864055551713,
"loss": 28.1782,
"step": 16725
},
{
"epoch": 0.052312035828279105,
"grad_norm": 38.5,
"learning_rate": 0.0009999854369014132,
"loss": 28.2612,
"step": 16750
},
{
"epoch": 0.05239011349369445,
"grad_norm": 41.5,
"learning_rate": 0.0009999844349154244,
"loss": 28.0716,
"step": 16775
},
{
"epoch": 0.05246819115910979,
"grad_norm": 34.0,
"learning_rate": 0.0009999833995972711,
"loss": 27.842,
"step": 16800
},
{
"epoch": 0.05254626882452513,
"grad_norm": 32.5,
"learning_rate": 0.000999982330947023,
"loss": 28.2459,
"step": 16825
},
{
"epoch": 0.05262434648994047,
"grad_norm": 36.75,
"learning_rate": 0.000999981228964751,
"loss": 28.2205,
"step": 16850
},
{
"epoch": 0.052702424155355816,
"grad_norm": 41.25,
"learning_rate": 0.0009999800936505287,
"loss": 28.2134,
"step": 16875
},
{
"epoch": 0.05278050182077116,
"grad_norm": 41.0,
"learning_rate": 0.0009999789250044312,
"loss": 28.0064,
"step": 16900
},
{
"epoch": 0.0528585794861865,
"grad_norm": 42.0,
"learning_rate": 0.0009999777230265375,
"loss": 28.2604,
"step": 16925
},
{
"epoch": 0.05293665715160184,
"grad_norm": 40.75,
"learning_rate": 0.0009999764877169268,
"loss": 28.5458,
"step": 16950
},
{
"epoch": 0.053014734817017184,
"grad_norm": 38.0,
"learning_rate": 0.0009999752190756818,
"loss": 28.4853,
"step": 16975
},
{
"epoch": 0.05309281248243253,
"grad_norm": 38.75,
"learning_rate": 0.000999973917102887,
"loss": 28.8174,
"step": 17000
},
{
"epoch": 0.05309281248243253,
"eval_loss": 28.75542449951172,
"eval_runtime": 102.5252,
"eval_samples_per_second": 50.749,
"eval_steps_per_second": 3.18,
"step": 17000
},
{
"epoch": 0.053170890147847864,
"grad_norm": 43.0,
"learning_rate": 0.0009999725817986295,
"loss": 28.8356,
"step": 17025
},
{
"epoch": 0.05324896781326321,
"grad_norm": 38.0,
"learning_rate": 0.0009999712131629978,
"loss": 28.9959,
"step": 17050
},
{
"epoch": 0.05332704547867855,
"grad_norm": 37.25,
"learning_rate": 0.0009999698111960835,
"loss": 28.713,
"step": 17075
},
{
"epoch": 0.053405123144093895,
"grad_norm": 43.5,
"learning_rate": 0.00099996837589798,
"loss": 28.8244,
"step": 17100
},
{
"epoch": 0.05348320080950923,
"grad_norm": 36.0,
"learning_rate": 0.000999966907268783,
"loss": 28.8987,
"step": 17125
},
{
"epoch": 0.053561278474924576,
"grad_norm": 40.0,
"learning_rate": 0.0009999654053085903,
"loss": 28.6699,
"step": 17150
},
{
"epoch": 0.05363935614033992,
"grad_norm": 36.25,
"learning_rate": 0.000999963870017502,
"loss": 28.8461,
"step": 17175
},
{
"epoch": 0.05371743380575526,
"grad_norm": 34.0,
"learning_rate": 0.0009999623013956208,
"loss": 28.6992,
"step": 17200
},
{
"epoch": 0.0537955114711706,
"grad_norm": 36.0,
"learning_rate": 0.0009999606994430508,
"loss": 28.6228,
"step": 17225
},
{
"epoch": 0.05387358913658594,
"grad_norm": 31.0,
"learning_rate": 0.000999959064159899,
"loss": 28.801,
"step": 17250
},
{
"epoch": 0.05395166680200129,
"grad_norm": 40.0,
"learning_rate": 0.0009999573955462747,
"loss": 28.9502,
"step": 17275
},
{
"epoch": 0.05402974446741663,
"grad_norm": 45.25,
"learning_rate": 0.0009999556936022887,
"loss": 29.1268,
"step": 17300
},
{
"epoch": 0.05410782213283197,
"grad_norm": 40.25,
"learning_rate": 0.0009999539583280548,
"loss": 29.3132,
"step": 17325
},
{
"epoch": 0.05418589979824731,
"grad_norm": 48.75,
"learning_rate": 0.0009999521897236885,
"loss": 29.2909,
"step": 17350
},
{
"epoch": 0.054263977463662655,
"grad_norm": 41.75,
"learning_rate": 0.0009999503877893075,
"loss": 29.5531,
"step": 17375
},
{
"epoch": 0.054342055129078,
"grad_norm": 39.75,
"learning_rate": 0.0009999485525250323,
"loss": 29.7544,
"step": 17400
},
{
"epoch": 0.054420132794493335,
"grad_norm": 35.25,
"learning_rate": 0.0009999466839309852,
"loss": 29.7906,
"step": 17425
},
{
"epoch": 0.05449821045990868,
"grad_norm": 39.5,
"learning_rate": 0.0009999447820072907,
"loss": 30.0083,
"step": 17450
},
{
"epoch": 0.05457628812532402,
"grad_norm": 44.0,
"learning_rate": 0.0009999428467540755,
"loss": 29.8602,
"step": 17475
},
{
"epoch": 0.054654365790739366,
"grad_norm": 41.25,
"learning_rate": 0.0009999408781714686,
"loss": 30.1297,
"step": 17500
},
{
"epoch": 0.0547324434561547,
"grad_norm": 36.75,
"learning_rate": 0.0009999388762596015,
"loss": 30.0162,
"step": 17525
},
{
"epoch": 0.05481052112157005,
"grad_norm": 39.5,
"learning_rate": 0.0009999368410186075,
"loss": 30.2735,
"step": 17550
},
{
"epoch": 0.05488859878698539,
"grad_norm": 38.25,
"learning_rate": 0.0009999347724486223,
"loss": 30.0508,
"step": 17575
},
{
"epoch": 0.054966676452400734,
"grad_norm": 39.0,
"learning_rate": 0.0009999326705497837,
"loss": 30.0643,
"step": 17600
},
{
"epoch": 0.05504475411781607,
"grad_norm": 45.25,
"learning_rate": 0.0009999305353222319,
"loss": 30.1616,
"step": 17625
},
{
"epoch": 0.055122831783231414,
"grad_norm": 37.5,
"learning_rate": 0.0009999283667661094,
"loss": 29.9471,
"step": 17650
},
{
"epoch": 0.05520090944864676,
"grad_norm": 39.75,
"learning_rate": 0.0009999261648815604,
"loss": 30.057,
"step": 17675
},
{
"epoch": 0.0552789871140621,
"grad_norm": 44.0,
"learning_rate": 0.0009999239296687322,
"loss": 30.1771,
"step": 17700
},
{
"epoch": 0.05535706477947744,
"grad_norm": 37.75,
"learning_rate": 0.0009999216611277734,
"loss": 30.6398,
"step": 17725
},
{
"epoch": 0.05543514244489278,
"grad_norm": 49.75,
"learning_rate": 0.000999919359258835,
"loss": 30.2826,
"step": 17750
},
{
"epoch": 0.055513220110308126,
"grad_norm": 38.75,
"learning_rate": 0.0009999170240620715,
"loss": 30.4671,
"step": 17775
},
{
"epoch": 0.05559129777572347,
"grad_norm": 36.5,
"learning_rate": 0.0009999146555376376,
"loss": 30.3188,
"step": 17800
},
{
"epoch": 0.055669375441138806,
"grad_norm": 41.25,
"learning_rate": 0.0009999122536856913,
"loss": 30.5831,
"step": 17825
},
{
"epoch": 0.05574745310655415,
"grad_norm": 37.25,
"learning_rate": 0.000999909818506393,
"loss": 30.3014,
"step": 17850
},
{
"epoch": 0.055825530771969493,
"grad_norm": 43.5,
"learning_rate": 0.0009999073499999051,
"loss": 30.3619,
"step": 17875
},
{
"epoch": 0.05590360843738484,
"grad_norm": 41.25,
"learning_rate": 0.0009999048481663922,
"loss": 30.207,
"step": 17900
},
{
"epoch": 0.05598168610280018,
"grad_norm": 55.75,
"learning_rate": 0.0009999023130060208,
"loss": 30.5041,
"step": 17925
},
{
"epoch": 0.05605976376821552,
"grad_norm": 52.0,
"learning_rate": 0.00099989974451896,
"loss": 30.6651,
"step": 17950
},
{
"epoch": 0.05613784143363086,
"grad_norm": 51.25,
"learning_rate": 0.000999897142705381,
"loss": 30.8339,
"step": 17975
},
{
"epoch": 0.056215919099046205,
"grad_norm": 48.75,
"learning_rate": 0.0009998945075654572,
"loss": 30.9781,
"step": 18000
},
{
"epoch": 0.056215919099046205,
"eval_loss": 31.040813446044922,
"eval_runtime": 102.3512,
"eval_samples_per_second": 50.835,
"eval_steps_per_second": 3.185,
"step": 18000
},
{
"epoch": 0.05629399676446155,
"grad_norm": 43.5,
"learning_rate": 0.0009998918390993648,
"loss": 30.9913,
"step": 18025
},
{
"epoch": 0.056372074429876885,
"grad_norm": 42.75,
"learning_rate": 0.000999889137307281,
"loss": 31.086,
"step": 18050
},
{
"epoch": 0.05645015209529223,
"grad_norm": 41.0,
"learning_rate": 0.0009998864021893864,
"loss": 31.0512,
"step": 18075
},
{
"epoch": 0.05652822976070757,
"grad_norm": 42.75,
"learning_rate": 0.0009998836337458629,
"loss": 31.2091,
"step": 18100
},
{
"epoch": 0.056606307426122916,
"grad_norm": 44.25,
"learning_rate": 0.0009998808319768954,
"loss": 31.1535,
"step": 18125
},
{
"epoch": 0.05668438509153825,
"grad_norm": 43.5,
"learning_rate": 0.0009998779968826707,
"loss": 31.3788,
"step": 18150
},
{
"epoch": 0.0567624627569536,
"grad_norm": 43.75,
"learning_rate": 0.0009998751284633779,
"loss": 31.3632,
"step": 18175
},
{
"epoch": 0.05684054042236894,
"grad_norm": 39.0,
"learning_rate": 0.0009998722267192076,
"loss": 31.101,
"step": 18200
},
{
"epoch": 0.056918618087784284,
"grad_norm": 38.25,
"learning_rate": 0.000999869291650354,
"loss": 30.8788,
"step": 18225
},
{
"epoch": 0.05699669575319962,
"grad_norm": 36.75,
"learning_rate": 0.0009998663232570122,
"loss": 31.0841,
"step": 18250
},
{
"epoch": 0.057074773418614964,
"grad_norm": 39.75,
"learning_rate": 0.0009998633215393805,
"loss": 31.4425,
"step": 18275
},
{
"epoch": 0.05715285108403031,
"grad_norm": 37.5,
"learning_rate": 0.000999860286497659,
"loss": 31.6592,
"step": 18300
},
{
"epoch": 0.05723092874944565,
"grad_norm": 40.0,
"learning_rate": 0.0009998572181320496,
"loss": 31.3277,
"step": 18325
},
{
"epoch": 0.05730900641486099,
"grad_norm": 39.75,
"learning_rate": 0.0009998541164427575,
"loss": 31.3697,
"step": 18350
},
{
"epoch": 0.05738708408027633,
"grad_norm": 35.0,
"learning_rate": 0.0009998509814299888,
"loss": 31.2663,
"step": 18375
},
{
"epoch": 0.057465161745691676,
"grad_norm": 37.25,
"learning_rate": 0.000999847813093953,
"loss": 31.6682,
"step": 18400
},
{
"epoch": 0.05754323941110702,
"grad_norm": 38.75,
"learning_rate": 0.0009998446114348612,
"loss": 31.7364,
"step": 18425
},
{
"epoch": 0.057621317076522356,
"grad_norm": 48.5,
"learning_rate": 0.0009998413764529266,
"loss": 31.8273,
"step": 18450
},
{
"epoch": 0.0576993947419377,
"grad_norm": 39.5,
"learning_rate": 0.0009998381081483651,
"loss": 32.178,
"step": 18475
},
{
"epoch": 0.057777472407353044,
"grad_norm": 38.75,
"learning_rate": 0.0009998348065213946,
"loss": 32.3324,
"step": 18500
},
{
"epoch": 0.05785555007276839,
"grad_norm": 41.75,
"learning_rate": 0.000999831471572235,
"loss": 32.6464,
"step": 18525
},
{
"epoch": 0.057933627738183724,
"grad_norm": 42.0,
"learning_rate": 0.0009998281033011091,
"loss": 32.1848,
"step": 18550
},
{
"epoch": 0.05801170540359907,
"grad_norm": 39.75,
"learning_rate": 0.000999824701708241,
"loss": 32.543,
"step": 18575
},
{
"epoch": 0.05808978306901441,
"grad_norm": 48.5,
"learning_rate": 0.0009998212667938578,
"loss": 32.4726,
"step": 18600
},
{
"epoch": 0.058167860734429755,
"grad_norm": 45.0,
"learning_rate": 0.000999817798558188,
"loss": 32.2877,
"step": 18625
},
{
"epoch": 0.05824593839984509,
"grad_norm": 38.25,
"learning_rate": 0.0009998142970014633,
"loss": 32.4187,
"step": 18650
},
{
"epoch": 0.058324016065260435,
"grad_norm": 51.5,
"learning_rate": 0.0009998107621239168,
"loss": 32.6334,
"step": 18675
},
{
"epoch": 0.05840209373067578,
"grad_norm": 48.5,
"learning_rate": 0.0009998071939257842,
"loss": 33.0217,
"step": 18700
},
{
"epoch": 0.05848017139609112,
"grad_norm": 50.0,
"learning_rate": 0.0009998035924073036,
"loss": 32.839,
"step": 18725
},
{
"epoch": 0.05855824906150646,
"grad_norm": 41.75,
"learning_rate": 0.000999799957568715,
"loss": 32.84,
"step": 18750
},
{
"epoch": 0.0586363267269218,
"grad_norm": 55.5,
"learning_rate": 0.0009997962894102608,
"loss": 33.0097,
"step": 18775
},
{
"epoch": 0.05871440439233715,
"grad_norm": 52.5,
"learning_rate": 0.0009997925879321854,
"loss": 33.0055,
"step": 18800
},
{
"epoch": 0.05879248205775249,
"grad_norm": 47.25,
"learning_rate": 0.0009997888531347358,
"loss": 33.3652,
"step": 18825
},
{
"epoch": 0.05887055972316783,
"grad_norm": 41.25,
"learning_rate": 0.0009997850850181605,
"loss": 33.1608,
"step": 18850
},
{
"epoch": 0.05894863738858317,
"grad_norm": 42.75,
"learning_rate": 0.000999781283582711,
"loss": 33.2872,
"step": 18875
},
{
"epoch": 0.059026715053998514,
"grad_norm": 43.25,
"learning_rate": 0.0009997774488286408,
"loss": 33.0581,
"step": 18900
},
{
"epoch": 0.05910479271941386,
"grad_norm": 48.0,
"learning_rate": 0.0009997735807562055,
"loss": 33.0212,
"step": 18925
},
{
"epoch": 0.059182870384829195,
"grad_norm": 39.0,
"learning_rate": 0.000999769679365663,
"loss": 32.7047,
"step": 18950
},
{
"epoch": 0.05926094805024454,
"grad_norm": 41.25,
"learning_rate": 0.0009997657446572735,
"loss": 32.7831,
"step": 18975
},
{
"epoch": 0.05933902571565988,
"grad_norm": 42.75,
"learning_rate": 0.0009997617766312988,
"loss": 32.8744,
"step": 19000
},
{
"epoch": 0.05933902571565988,
"eval_loss": 32.887264251708984,
"eval_runtime": 102.2215,
"eval_samples_per_second": 50.899,
"eval_steps_per_second": 3.189,
"step": 19000
},
{
"epoch": 0.059417103381075226,
"grad_norm": 41.5,
"learning_rate": 0.0009997577752880041,
"loss": 32.8132,
"step": 19025
},
{
"epoch": 0.05949518104649056,
"grad_norm": 43.75,
"learning_rate": 0.0009997537406276557,
"loss": 32.9501,
"step": 19050
},
{
"epoch": 0.059573258711905906,
"grad_norm": 45.25,
"learning_rate": 0.0009997496726505228,
"loss": 32.7061,
"step": 19075
},
{
"epoch": 0.05965133637732125,
"grad_norm": 37.5,
"learning_rate": 0.0009997455713568763,
"loss": 32.7181,
"step": 19100
},
{
"epoch": 0.059729414042736594,
"grad_norm": 41.0,
"learning_rate": 0.00099974143674699,
"loss": 32.554,
"step": 19125
},
{
"epoch": 0.05980749170815193,
"grad_norm": 41.5,
"learning_rate": 0.0009997372688211395,
"loss": 32.7137,
"step": 19150
},
{
"epoch": 0.059885569373567274,
"grad_norm": 45.0,
"learning_rate": 0.0009997330675796023,
"loss": 33.0025,
"step": 19175
},
{
"epoch": 0.05996364703898262,
"grad_norm": 42.0,
"learning_rate": 0.000999728833022659,
"loss": 32.9643,
"step": 19200
},
{
"epoch": 0.06004172470439796,
"grad_norm": 52.5,
"learning_rate": 0.0009997245651505915,
"loss": 32.8268,
"step": 19225
},
{
"epoch": 0.0601198023698133,
"grad_norm": 43.0,
"learning_rate": 0.0009997202639636844,
"loss": 32.8,
"step": 19250
},
{
"epoch": 0.06019788003522864,
"grad_norm": 56.5,
"learning_rate": 0.0009997159294622246,
"loss": 32.9133,
"step": 19275
},
{
"epoch": 0.060275957700643985,
"grad_norm": 44.25,
"learning_rate": 0.000999711561646501,
"loss": 32.8573,
"step": 19300
},
{
"epoch": 0.06035403536605933,
"grad_norm": 44.0,
"learning_rate": 0.0009997071605168043,
"loss": 32.7512,
"step": 19325
},
{
"epoch": 0.060432113031474666,
"grad_norm": 36.5,
"learning_rate": 0.000999702726073429,
"loss": 32.9202,
"step": 19350
},
{
"epoch": 0.06051019069689001,
"grad_norm": 40.0,
"learning_rate": 0.0009996982583166695,
"loss": 32.942,
"step": 19375
},
{
"epoch": 0.06058826836230535,
"grad_norm": 39.0,
"learning_rate": 0.0009996937572468246,
"loss": 32.8775,
"step": 19400
},
{
"epoch": 0.0606663460277207,
"grad_norm": 37.0,
"learning_rate": 0.000999689222864194,
"loss": 32.8532,
"step": 19425
},
{
"epoch": 0.06074442369313603,
"grad_norm": 47.25,
"learning_rate": 0.0009996846551690798,
"loss": 32.9941,
"step": 19450
},
{
"epoch": 0.06082250135855138,
"grad_norm": 38.0,
"learning_rate": 0.0009996800541617868,
"loss": 32.8616,
"step": 19475
},
{
"epoch": 0.06090057902396672,
"grad_norm": 39.5,
"learning_rate": 0.0009996754198426216,
"loss": 32.9031,
"step": 19500
},
{
"epoch": 0.060978656689382064,
"grad_norm": 44.5,
"learning_rate": 0.0009996707522118933,
"loss": 33.0028,
"step": 19525
},
{
"epoch": 0.0610567343547974,
"grad_norm": 39.75,
"learning_rate": 0.0009996660512699128,
"loss": 32.8195,
"step": 19550
},
{
"epoch": 0.061134812020212745,
"grad_norm": 40.75,
"learning_rate": 0.0009996613170169936,
"loss": 32.571,
"step": 19575
},
{
"epoch": 0.06121288968562809,
"grad_norm": 36.75,
"learning_rate": 0.0009996565494534517,
"loss": 32.5517,
"step": 19600
},
{
"epoch": 0.06129096735104343,
"grad_norm": 38.0,
"learning_rate": 0.0009996517485796044,
"loss": 32.5484,
"step": 19625
},
{
"epoch": 0.06136904501645877,
"grad_norm": 41.75,
"learning_rate": 0.000999646914395772,
"loss": 32.5895,
"step": 19650
},
{
"epoch": 0.06144712268187411,
"grad_norm": 42.0,
"learning_rate": 0.0009996420469022766,
"loss": 32.8765,
"step": 19675
},
{
"epoch": 0.061525200347289456,
"grad_norm": 38.5,
"learning_rate": 0.0009996371460994431,
"loss": 32.8793,
"step": 19700
},
{
"epoch": 0.0616032780127048,
"grad_norm": 40.25,
"learning_rate": 0.0009996322119875977,
"loss": 33.0708,
"step": 19725
},
{
"epoch": 0.06168135567812014,
"grad_norm": 38.0,
"learning_rate": 0.00099962724456707,
"loss": 33.188,
"step": 19750
},
{
"epoch": 0.06175943334353548,
"grad_norm": 49.0,
"learning_rate": 0.0009996222438381904,
"loss": 33.2918,
"step": 19775
},
{
"epoch": 0.061837511008950824,
"grad_norm": 44.75,
"learning_rate": 0.0009996172098012928,
"loss": 33.4949,
"step": 19800
},
{
"epoch": 0.06191558867436617,
"grad_norm": 43.25,
"learning_rate": 0.0009996121424567126,
"loss": 33.8741,
"step": 19825
},
{
"epoch": 0.061993666339781504,
"grad_norm": 41.75,
"learning_rate": 0.0009996070418047877,
"loss": 33.6041,
"step": 19850
},
{
"epoch": 0.06207174400519685,
"grad_norm": 40.25,
"learning_rate": 0.000999601907845858,
"loss": 33.6722,
"step": 19875
},
{
"epoch": 0.06214982167061219,
"grad_norm": 40.5,
"learning_rate": 0.000999596740580266,
"loss": 33.484,
"step": 19900
},
{
"epoch": 0.062227899336027535,
"grad_norm": 46.25,
"learning_rate": 0.000999591540008356,
"loss": 33.7352,
"step": 19925
},
{
"epoch": 0.06230597700144287,
"grad_norm": 48.5,
"learning_rate": 0.0009995863061304747,
"loss": 33.9541,
"step": 19950
},
{
"epoch": 0.062384054666858216,
"grad_norm": 44.0,
"learning_rate": 0.0009995810389469711,
"loss": 34.2383,
"step": 19975
},
{
"epoch": 0.06246213233227356,
"grad_norm": 40.75,
"learning_rate": 0.0009995757384581964,
"loss": 33.8251,
"step": 20000
},
{
"epoch": 0.06246213233227356,
"eval_loss": 34.19303512573242,
"eval_runtime": 102.3811,
"eval_samples_per_second": 50.82,
"eval_steps_per_second": 3.184,
"step": 20000
},
{
"epoch": 0.0625402099976889,
"grad_norm": 50.0,
"learning_rate": 0.000999570404664504,
"loss": 34.3706,
"step": 20025
},
{
"epoch": 0.06261828766310425,
"grad_norm": 45.75,
"learning_rate": 0.0009995650375662492,
"loss": 34.1775,
"step": 20050
},
{
"epoch": 0.06269636532851959,
"grad_norm": 43.5,
"learning_rate": 0.0009995596371637897,
"loss": 34.3327,
"step": 20075
},
{
"epoch": 0.06277444299393492,
"grad_norm": 43.25,
"learning_rate": 0.0009995542034574863,
"loss": 34.3871,
"step": 20100
},
{
"epoch": 0.06285252065935026,
"grad_norm": 42.75,
"learning_rate": 0.0009995487364477004,
"loss": 33.8116,
"step": 20125
},
{
"epoch": 0.06293059832476561,
"grad_norm": 37.5,
"learning_rate": 0.0009995432361347971,
"loss": 33.9015,
"step": 20150
},
{
"epoch": 0.06300867599018095,
"grad_norm": 38.5,
"learning_rate": 0.0009995377025191427,
"loss": 33.8639,
"step": 20175
},
{
"epoch": 0.0630867536555963,
"grad_norm": 37.25,
"learning_rate": 0.0009995321356011063,
"loss": 33.6663,
"step": 20200
},
{
"epoch": 0.06316483132101164,
"grad_norm": 40.5,
"learning_rate": 0.0009995265353810589,
"loss": 33.8264,
"step": 20225
},
{
"epoch": 0.06324290898642698,
"grad_norm": 45.25,
"learning_rate": 0.0009995209018593737,
"loss": 33.6851,
"step": 20250
},
{
"epoch": 0.06332098665184233,
"grad_norm": 42.0,
"learning_rate": 0.0009995152350364266,
"loss": 33.5799,
"step": 20275
},
{
"epoch": 0.06339906431725766,
"grad_norm": 43.25,
"learning_rate": 0.000999509534912595,
"loss": 33.6905,
"step": 20300
},
{
"epoch": 0.063477141982673,
"grad_norm": 37.25,
"learning_rate": 0.0009995038014882593,
"loss": 33.4839,
"step": 20325
},
{
"epoch": 0.06355521964808834,
"grad_norm": 35.75,
"learning_rate": 0.0009994980347638016,
"loss": 33.6105,
"step": 20350
},
{
"epoch": 0.06363329731350369,
"grad_norm": 38.0,
"learning_rate": 0.0009994922347396063,
"loss": 33.9047,
"step": 20375
},
{
"epoch": 0.06371137497891903,
"grad_norm": 40.25,
"learning_rate": 0.00099948640141606,
"loss": 34.1876,
"step": 20400
},
{
"epoch": 0.06378945264433437,
"grad_norm": 45.75,
"learning_rate": 0.0009994805347935517,
"loss": 33.9303,
"step": 20425
},
{
"epoch": 0.06386753030974972,
"grad_norm": 42.75,
"learning_rate": 0.0009994746348724727,
"loss": 33.951,
"step": 20450
},
{
"epoch": 0.06394560797516506,
"grad_norm": 50.0,
"learning_rate": 0.000999468701653216,
"loss": 34.056,
"step": 20475
},
{
"epoch": 0.0640236856405804,
"grad_norm": 50.5,
"learning_rate": 0.0009994627351361772,
"loss": 33.9114,
"step": 20500
},
{
"epoch": 0.06410176330599573,
"grad_norm": 42.25,
"learning_rate": 0.0009994567353217541,
"loss": 34.2422,
"step": 20525
},
{
"epoch": 0.06417984097141108,
"grad_norm": 44.25,
"learning_rate": 0.0009994507022103465,
"loss": 34.0631,
"step": 20550
},
{
"epoch": 0.06425791863682642,
"grad_norm": 39.75,
"learning_rate": 0.000999444635802357,
"loss": 33.8447,
"step": 20575
},
{
"epoch": 0.06433599630224177,
"grad_norm": 44.75,
"learning_rate": 0.00099943853609819,
"loss": 33.8587,
"step": 20600
},
{
"epoch": 0.06441407396765711,
"grad_norm": 39.25,
"learning_rate": 0.0009994324030982518,
"loss": 33.943,
"step": 20625
},
{
"epoch": 0.06449215163307245,
"grad_norm": 41.75,
"learning_rate": 0.0009994262368029515,
"loss": 33.9425,
"step": 20650
},
{
"epoch": 0.0645702292984878,
"grad_norm": 44.5,
"learning_rate": 0.0009994200372127,
"loss": 34.0832,
"step": 20675
},
{
"epoch": 0.06464830696390314,
"grad_norm": 39.25,
"learning_rate": 0.000999413804327911,
"loss": 33.9888,
"step": 20700
},
{
"epoch": 0.06472638462931847,
"grad_norm": 43.75,
"learning_rate": 0.0009994075381489994,
"loss": 34.1022,
"step": 20725
},
{
"epoch": 0.06480446229473381,
"grad_norm": 44.25,
"learning_rate": 0.0009994012386763836,
"loss": 33.9719,
"step": 20750
},
{
"epoch": 0.06488253996014916,
"grad_norm": 42.0,
"learning_rate": 0.000999394905910483,
"loss": 33.7568,
"step": 20775
},
{
"epoch": 0.0649606176255645,
"grad_norm": 43.75,
"learning_rate": 0.0009993885398517201,
"loss": 33.7079,
"step": 20800
},
{
"epoch": 0.06503869529097984,
"grad_norm": 40.0,
"learning_rate": 0.0009993821405005195,
"loss": 33.8396,
"step": 20825
},
{
"epoch": 0.06511677295639519,
"grad_norm": 42.5,
"learning_rate": 0.0009993757078573073,
"loss": 33.6027,
"step": 20850
},
{
"epoch": 0.06519485062181053,
"grad_norm": 42.5,
"learning_rate": 0.0009993692419225126,
"loss": 33.5388,
"step": 20875
},
{
"epoch": 0.06527292828722588,
"grad_norm": 55.0,
"learning_rate": 0.0009993627426965667,
"loss": 33.775,
"step": 20900
},
{
"epoch": 0.0653510059526412,
"grad_norm": 39.0,
"learning_rate": 0.0009993562101799024,
"loss": 33.8984,
"step": 20925
},
{
"epoch": 0.06542908361805655,
"grad_norm": 41.5,
"learning_rate": 0.0009993496443729557,
"loss": 33.8582,
"step": 20950
},
{
"epoch": 0.06550716128347189,
"grad_norm": 37.25,
"learning_rate": 0.0009993430452761639,
"loss": 33.8915,
"step": 20975
},
{
"epoch": 0.06558523894888724,
"grad_norm": 35.0,
"learning_rate": 0.0009993364128899672,
"loss": 33.5705,
"step": 21000
},
{
"epoch": 0.06558523894888724,
"eval_loss": 33.73247146606445,
"eval_runtime": 102.3252,
"eval_samples_per_second": 50.848,
"eval_steps_per_second": 3.186,
"step": 21000
},
{
"epoch": 0.06566331661430258,
"grad_norm": 37.0,
"learning_rate": 0.0009993297472148076,
"loss": 33.5467,
"step": 21025
},
{
"epoch": 0.06574139427971792,
"grad_norm": 38.5,
"learning_rate": 0.0009993230482511295,
"loss": 33.6705,
"step": 21050
},
{
"epoch": 0.06581947194513327,
"grad_norm": 39.0,
"learning_rate": 0.0009993163159993798,
"loss": 33.7872,
"step": 21075
},
{
"epoch": 0.06589754961054861,
"grad_norm": 45.5,
"learning_rate": 0.0009993095504600067,
"loss": 33.6316,
"step": 21100
},
{
"epoch": 0.06597562727596394,
"grad_norm": 38.0,
"learning_rate": 0.0009993027516334617,
"loss": 33.8796,
"step": 21125
},
{
"epoch": 0.06605370494137928,
"grad_norm": 43.75,
"learning_rate": 0.000999295919520198,
"loss": 34.0526,
"step": 21150
},
{
"epoch": 0.06613178260679463,
"grad_norm": 36.0,
"learning_rate": 0.000999289054120671,
"loss": 34.1438,
"step": 21175
},
{
"epoch": 0.06620986027220997,
"grad_norm": 38.0,
"learning_rate": 0.0009992821554353382,
"loss": 33.7974,
"step": 21200
},
{
"epoch": 0.06628793793762532,
"grad_norm": 46.0,
"learning_rate": 0.00099927522346466,
"loss": 33.8107,
"step": 21225
},
{
"epoch": 0.06636601560304066,
"grad_norm": 45.75,
"learning_rate": 0.0009992682582090982,
"loss": 33.8952,
"step": 21250
},
{
"epoch": 0.066444093268456,
"grad_norm": 39.5,
"learning_rate": 0.0009992612596691171,
"loss": 34.201,
"step": 21275
},
{
"epoch": 0.06652217093387135,
"grad_norm": 49.25,
"learning_rate": 0.0009992542278451832,
"loss": 34.2007,
"step": 21300
},
{
"epoch": 0.06660024859928668,
"grad_norm": 42.0,
"learning_rate": 0.0009992471627377657,
"loss": 34.3501,
"step": 21325
},
{
"epoch": 0.06667832626470202,
"grad_norm": 48.75,
"learning_rate": 0.0009992400643473354,
"loss": 34.4321,
"step": 21350
},
{
"epoch": 0.06675640393011736,
"grad_norm": 43.25,
"learning_rate": 0.0009992329326743653,
"loss": 34.638,
"step": 21375
},
{
"epoch": 0.06683448159553271,
"grad_norm": 41.75,
"learning_rate": 0.000999225767719331,
"loss": 34.588,
"step": 21400
},
{
"epoch": 0.06691255926094805,
"grad_norm": 44.5,
"learning_rate": 0.0009992185694827102,
"loss": 34.7111,
"step": 21425
},
{
"epoch": 0.0669906369263634,
"grad_norm": 50.5,
"learning_rate": 0.0009992113379649829,
"loss": 34.7677,
"step": 21450
},
{
"epoch": 0.06706871459177874,
"grad_norm": 62.0,
"learning_rate": 0.000999204073166631,
"loss": 35.0234,
"step": 21475
},
{
"epoch": 0.06714679225719408,
"grad_norm": 48.0,
"learning_rate": 0.0009991967750881388,
"loss": 35.0909,
"step": 21500
},
{
"epoch": 0.06722486992260941,
"grad_norm": 49.5,
"learning_rate": 0.000999189443729993,
"loss": 35.4811,
"step": 21525
},
{
"epoch": 0.06730294758802476,
"grad_norm": 58.0,
"learning_rate": 0.0009991820790926824,
"loss": 35.2726,
"step": 21550
},
{
"epoch": 0.0673810252534401,
"grad_norm": 55.5,
"learning_rate": 0.0009991746811766975,
"loss": 35.629,
"step": 21575
},
{
"epoch": 0.06745910291885544,
"grad_norm": 44.0,
"learning_rate": 0.000999167249982532,
"loss": 35.4736,
"step": 21600
},
{
"epoch": 0.06753718058427079,
"grad_norm": 45.75,
"learning_rate": 0.0009991597855106814,
"loss": 35.2275,
"step": 21625
},
{
"epoch": 0.06761525824968613,
"grad_norm": 41.5,
"learning_rate": 0.0009991522877616428,
"loss": 35.2907,
"step": 21650
},
{
"epoch": 0.06769333591510147,
"grad_norm": 56.5,
"learning_rate": 0.000999144756735916,
"loss": 35.2988,
"step": 21675
},
{
"epoch": 0.06777141358051682,
"grad_norm": 56.0,
"learning_rate": 0.000999137192434004,
"loss": 35.2948,
"step": 21700
},
{
"epoch": 0.06784949124593215,
"grad_norm": 42.0,
"learning_rate": 0.0009991295948564103,
"loss": 35.1186,
"step": 21725
},
{
"epoch": 0.06792756891134749,
"grad_norm": 43.25,
"learning_rate": 0.0009991219640036416,
"loss": 35.115,
"step": 21750
},
{
"epoch": 0.06800564657676283,
"grad_norm": 43.75,
"learning_rate": 0.0009991142998762065,
"loss": 35.347,
"step": 21775
},
{
"epoch": 0.06808372424217818,
"grad_norm": 45.0,
"learning_rate": 0.000999106602474616,
"loss": 35.3008,
"step": 21800
},
{
"epoch": 0.06816180190759352,
"grad_norm": 66.0,
"learning_rate": 0.0009990988717993832,
"loss": 35.321,
"step": 21825
},
{
"epoch": 0.06823987957300887,
"grad_norm": 56.0,
"learning_rate": 0.0009990911078510238,
"loss": 35.373,
"step": 21850
},
{
"epoch": 0.06831795723842421,
"grad_norm": 49.25,
"learning_rate": 0.000999083310630055,
"loss": 35.2404,
"step": 21875
},
{
"epoch": 0.06839603490383955,
"grad_norm": 46.0,
"learning_rate": 0.000999075480136997,
"loss": 35.2177,
"step": 21900
},
{
"epoch": 0.06847411256925488,
"grad_norm": 43.5,
"learning_rate": 0.0009990676163723715,
"loss": 35.1759,
"step": 21925
},
{
"epoch": 0.06855219023467023,
"grad_norm": 54.5,
"learning_rate": 0.000999059719336703,
"loss": 34.7193,
"step": 21950
},
{
"epoch": 0.06863026790008557,
"grad_norm": 48.25,
"learning_rate": 0.0009990517890305175,
"loss": 34.6676,
"step": 21975
},
{
"epoch": 0.06870834556550091,
"grad_norm": 44.75,
"learning_rate": 0.0009990438254543442,
"loss": 34.4965,
"step": 22000
},
{
"epoch": 0.06870834556550091,
"eval_loss": 34.531646728515625,
"eval_runtime": 102.6371,
"eval_samples_per_second": 50.693,
"eval_steps_per_second": 3.176,
"step": 22000
}
],
"logging_steps": 25,
"max_steps": 320194,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.7899608404454277e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}