run_21 / trainer_state.json
irodkin's picture
Training checkpoint at step 28000
cb684df verified
{
"best_global_step": 28000,
"best_metric": 2.380680799484253,
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-28000",
"epoch": 0.56,
"eval_steps": 100,
"global_step": 28000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005,
"grad_norm": 27.027176292677446,
"learning_rate": 4.8e-08,
"loss": 3.52,
"step": 25
},
{
"epoch": 0.001,
"grad_norm": 22.883614597253285,
"learning_rate": 9.8e-08,
"loss": 3.4361,
"step": 50
},
{
"epoch": 0.0015,
"grad_norm": 14.88008652186332,
"learning_rate": 1.4800000000000003e-07,
"loss": 3.2752,
"step": 75
},
{
"epoch": 0.002,
"grad_norm": 8.829920836438578,
"learning_rate": 1.9800000000000003e-07,
"loss": 3.073,
"step": 100
},
{
"epoch": 0.002,
"eval_loss": 2.8928089141845703,
"eval_runtime": 31.5789,
"eval_samples_per_second": 3.23,
"eval_steps_per_second": 1.615,
"step": 100
},
{
"epoch": 0.0025,
"grad_norm": 6.672581323543055,
"learning_rate": 2.48e-07,
"loss": 2.8787,
"step": 125
},
{
"epoch": 0.003,
"grad_norm": 3.485187933164644,
"learning_rate": 2.9800000000000005e-07,
"loss": 2.7569,
"step": 150
},
{
"epoch": 0.0035,
"grad_norm": 1.6514027733962566,
"learning_rate": 3.48e-07,
"loss": 2.683,
"step": 175
},
{
"epoch": 0.004,
"grad_norm": 1.714322054077562,
"learning_rate": 3.9800000000000004e-07,
"loss": 2.6417,
"step": 200
},
{
"epoch": 0.004,
"eval_loss": 2.608551263809204,
"eval_runtime": 31.7434,
"eval_samples_per_second": 3.213,
"eval_steps_per_second": 1.607,
"step": 200
},
{
"epoch": 0.0045,
"grad_norm": 1.1166252829937406,
"learning_rate": 4.4800000000000004e-07,
"loss": 2.6075,
"step": 225
},
{
"epoch": 0.005,
"grad_norm": 1.2360541139925998,
"learning_rate": 4.98e-07,
"loss": 2.5833,
"step": 250
},
{
"epoch": 0.0055,
"grad_norm": 1.1186934925325145,
"learning_rate": 5.480000000000001e-07,
"loss": 2.568,
"step": 275
},
{
"epoch": 0.006,
"grad_norm": 2.2165517261683907,
"learning_rate": 5.98e-07,
"loss": 2.5488,
"step": 300
},
{
"epoch": 0.006,
"eval_loss": 2.532663345336914,
"eval_runtime": 31.7717,
"eval_samples_per_second": 3.21,
"eval_steps_per_second": 1.605,
"step": 300
},
{
"epoch": 0.0065,
"grad_norm": 1.9955553189401838,
"learning_rate": 6.48e-07,
"loss": 2.5186,
"step": 325
},
{
"epoch": 0.007,
"grad_norm": 1.7134269827298882,
"learning_rate": 6.98e-07,
"loss": 2.5133,
"step": 350
},
{
"epoch": 0.0075,
"grad_norm": 4.086994695184575,
"learning_rate": 7.480000000000001e-07,
"loss": 2.4979,
"step": 375
},
{
"epoch": 0.008,
"grad_norm": 2.2539165526987732,
"learning_rate": 7.98e-07,
"loss": 2.49,
"step": 400
},
{
"epoch": 0.008,
"eval_loss": 2.4952430725097656,
"eval_runtime": 31.9652,
"eval_samples_per_second": 3.191,
"eval_steps_per_second": 1.595,
"step": 400
},
{
"epoch": 0.0085,
"grad_norm": 1.138897058010547,
"learning_rate": 8.480000000000001e-07,
"loss": 2.4748,
"step": 425
},
{
"epoch": 0.009,
"grad_norm": 1.0112216946364496,
"learning_rate": 8.980000000000001e-07,
"loss": 2.4801,
"step": 450
},
{
"epoch": 0.0095,
"grad_norm": 1.3243191157122005,
"learning_rate": 9.480000000000001e-07,
"loss": 2.4699,
"step": 475
},
{
"epoch": 0.01,
"grad_norm": 1.2276747327077127,
"learning_rate": 9.98e-07,
"loss": 2.468,
"step": 500
},
{
"epoch": 0.01,
"eval_loss": 2.4748668670654297,
"eval_runtime": 31.7813,
"eval_samples_per_second": 3.209,
"eval_steps_per_second": 1.605,
"step": 500
},
{
"epoch": 0.0105,
"grad_norm": 0.845856364918703,
"learning_rate": 1.0480000000000002e-06,
"loss": 2.4738,
"step": 525
},
{
"epoch": 0.011,
"grad_norm": 1.3677643157822397,
"learning_rate": 1.0980000000000001e-06,
"loss": 2.4535,
"step": 550
},
{
"epoch": 0.0115,
"grad_norm": 2.919464896391848,
"learning_rate": 1.148e-06,
"loss": 2.4558,
"step": 575
},
{
"epoch": 0.012,
"grad_norm": 0.9435018771336037,
"learning_rate": 1.1980000000000002e-06,
"loss": 2.4568,
"step": 600
},
{
"epoch": 0.012,
"eval_loss": 2.4655494689941406,
"eval_runtime": 31.7457,
"eval_samples_per_second": 3.213,
"eval_steps_per_second": 1.607,
"step": 600
},
{
"epoch": 0.0125,
"grad_norm": 0.844314601352543,
"learning_rate": 1.248e-06,
"loss": 2.4493,
"step": 625
},
{
"epoch": 0.013,
"grad_norm": 0.8266693044311944,
"learning_rate": 1.2980000000000001e-06,
"loss": 2.4491,
"step": 650
},
{
"epoch": 0.0135,
"grad_norm": 0.9456226537014805,
"learning_rate": 1.348e-06,
"loss": 2.4538,
"step": 675
},
{
"epoch": 0.014,
"grad_norm": 1.241067240172021,
"learning_rate": 1.3980000000000002e-06,
"loss": 2.441,
"step": 700
},
{
"epoch": 0.014,
"eval_loss": 2.459726572036743,
"eval_runtime": 31.7996,
"eval_samples_per_second": 3.208,
"eval_steps_per_second": 1.604,
"step": 700
},
{
"epoch": 0.0145,
"grad_norm": 0.8214981637560076,
"learning_rate": 1.4480000000000002e-06,
"loss": 2.4375,
"step": 725
},
{
"epoch": 0.015,
"grad_norm": 0.8463041725741063,
"learning_rate": 1.498e-06,
"loss": 2.4476,
"step": 750
},
{
"epoch": 0.0155,
"grad_norm": 1.0459233803315569,
"learning_rate": 1.548e-06,
"loss": 2.4388,
"step": 775
},
{
"epoch": 0.016,
"grad_norm": 0.7899668512736558,
"learning_rate": 1.5980000000000002e-06,
"loss": 2.4376,
"step": 800
},
{
"epoch": 0.016,
"eval_loss": 2.4541866779327393,
"eval_runtime": 31.8537,
"eval_samples_per_second": 3.202,
"eval_steps_per_second": 1.601,
"step": 800
},
{
"epoch": 0.0165,
"grad_norm": 0.8397014905084252,
"learning_rate": 1.6480000000000001e-06,
"loss": 2.436,
"step": 825
},
{
"epoch": 0.017,
"grad_norm": 0.7623848831497283,
"learning_rate": 1.6980000000000003e-06,
"loss": 2.4384,
"step": 850
},
{
"epoch": 0.0175,
"grad_norm": 0.7990535915346776,
"learning_rate": 1.7480000000000002e-06,
"loss": 2.4388,
"step": 875
},
{
"epoch": 0.018,
"grad_norm": 1.1027343926443682,
"learning_rate": 1.798e-06,
"loss": 2.4195,
"step": 900
},
{
"epoch": 0.018,
"eval_loss": 2.4497900009155273,
"eval_runtime": 32.04,
"eval_samples_per_second": 3.184,
"eval_steps_per_second": 1.592,
"step": 900
},
{
"epoch": 0.0185,
"grad_norm": 1.0518607606934676,
"learning_rate": 1.8480000000000001e-06,
"loss": 2.441,
"step": 925
},
{
"epoch": 0.019,
"grad_norm": 0.7969899064558551,
"learning_rate": 1.898e-06,
"loss": 2.4416,
"step": 950
},
{
"epoch": 0.0195,
"grad_norm": 0.6779464500616844,
"learning_rate": 1.9480000000000002e-06,
"loss": 2.4397,
"step": 975
},
{
"epoch": 0.02,
"grad_norm": 0.8662904314628106,
"learning_rate": 1.998e-06,
"loss": 2.4316,
"step": 1000
},
{
"epoch": 0.02,
"eval_loss": 2.4468765258789062,
"eval_runtime": 31.891,
"eval_samples_per_second": 3.198,
"eval_steps_per_second": 1.599,
"step": 1000
},
{
"epoch": 0.0205,
"grad_norm": 0.6931713924838875,
"learning_rate": 2.048e-06,
"loss": 2.4456,
"step": 1025
},
{
"epoch": 0.021,
"grad_norm": 0.6887441871643851,
"learning_rate": 2.098e-06,
"loss": 2.4253,
"step": 1050
},
{
"epoch": 0.0215,
"grad_norm": 0.7500338911423412,
"learning_rate": 2.148e-06,
"loss": 2.431,
"step": 1075
},
{
"epoch": 0.022,
"grad_norm": 0.7458051760406093,
"learning_rate": 2.198e-06,
"loss": 2.4164,
"step": 1100
},
{
"epoch": 0.022,
"eval_loss": 2.4442293643951416,
"eval_runtime": 31.8697,
"eval_samples_per_second": 3.201,
"eval_steps_per_second": 1.6,
"step": 1100
},
{
"epoch": 0.0225,
"grad_norm": 0.8345425864188605,
"learning_rate": 2.2480000000000003e-06,
"loss": 2.4241,
"step": 1125
},
{
"epoch": 0.023,
"grad_norm": 0.6997049438769294,
"learning_rate": 2.2980000000000003e-06,
"loss": 2.43,
"step": 1150
},
{
"epoch": 0.0235,
"grad_norm": 0.7476759709197881,
"learning_rate": 2.3480000000000002e-06,
"loss": 2.4342,
"step": 1175
},
{
"epoch": 0.024,
"grad_norm": 0.6735584083816767,
"learning_rate": 2.398e-06,
"loss": 2.4274,
"step": 1200
},
{
"epoch": 0.024,
"eval_loss": 2.4423961639404297,
"eval_runtime": 31.6272,
"eval_samples_per_second": 3.225,
"eval_steps_per_second": 1.613,
"step": 1200
},
{
"epoch": 0.0245,
"grad_norm": 0.7414830106555006,
"learning_rate": 2.448e-06,
"loss": 2.4363,
"step": 1225
},
{
"epoch": 0.025,
"grad_norm": 0.7852755880662065,
"learning_rate": 2.498e-06,
"loss": 2.4356,
"step": 1250
},
{
"epoch": 0.0255,
"grad_norm": 0.6550676975591231,
"learning_rate": 2.5480000000000004e-06,
"loss": 2.4219,
"step": 1275
},
{
"epoch": 0.026,
"grad_norm": 0.6708503716821785,
"learning_rate": 2.598e-06,
"loss": 2.4442,
"step": 1300
},
{
"epoch": 0.026,
"eval_loss": 2.440678358078003,
"eval_runtime": 31.7661,
"eval_samples_per_second": 3.211,
"eval_steps_per_second": 1.605,
"step": 1300
},
{
"epoch": 0.0265,
"grad_norm": 0.6923805904104993,
"learning_rate": 2.648e-06,
"loss": 2.4317,
"step": 1325
},
{
"epoch": 0.027,
"grad_norm": 0.6600109660858106,
"learning_rate": 2.6980000000000003e-06,
"loss": 2.432,
"step": 1350
},
{
"epoch": 0.0275,
"grad_norm": 0.841715383150229,
"learning_rate": 2.748e-06,
"loss": 2.4196,
"step": 1375
},
{
"epoch": 0.028,
"grad_norm": 0.6392005959511108,
"learning_rate": 2.798e-06,
"loss": 2.4274,
"step": 1400
},
{
"epoch": 0.028,
"eval_loss": 2.439229726791382,
"eval_runtime": 32.0465,
"eval_samples_per_second": 3.183,
"eval_steps_per_second": 1.591,
"step": 1400
},
{
"epoch": 0.0285,
"grad_norm": 0.6653339947473879,
"learning_rate": 2.848e-06,
"loss": 2.4209,
"step": 1425
},
{
"epoch": 0.029,
"grad_norm": 0.6607591145573396,
"learning_rate": 2.8980000000000005e-06,
"loss": 2.4111,
"step": 1450
},
{
"epoch": 0.0295,
"grad_norm": 0.6492342012137399,
"learning_rate": 2.9480000000000004e-06,
"loss": 2.4319,
"step": 1475
},
{
"epoch": 0.03,
"grad_norm": 0.6418256237639189,
"learning_rate": 2.9980000000000003e-06,
"loss": 2.4257,
"step": 1500
},
{
"epoch": 0.03,
"eval_loss": 2.4380884170532227,
"eval_runtime": 32.1017,
"eval_samples_per_second": 3.177,
"eval_steps_per_second": 1.589,
"step": 1500
},
{
"epoch": 0.0305,
"grad_norm": 0.719808061901716,
"learning_rate": 3.0480000000000003e-06,
"loss": 2.4305,
"step": 1525
},
{
"epoch": 0.031,
"grad_norm": 0.6138892760464039,
"learning_rate": 3.0980000000000007e-06,
"loss": 2.4253,
"step": 1550
},
{
"epoch": 0.0315,
"grad_norm": 0.7179717159222389,
"learning_rate": 3.1480000000000006e-06,
"loss": 2.4286,
"step": 1575
},
{
"epoch": 0.032,
"grad_norm": 0.6337699388954209,
"learning_rate": 3.198e-06,
"loss": 2.4281,
"step": 1600
},
{
"epoch": 0.032,
"eval_loss": 2.4367759227752686,
"eval_runtime": 32.1865,
"eval_samples_per_second": 3.169,
"eval_steps_per_second": 1.585,
"step": 1600
},
{
"epoch": 0.0325,
"grad_norm": 0.6399383081078225,
"learning_rate": 3.248e-06,
"loss": 2.4127,
"step": 1625
},
{
"epoch": 0.033,
"grad_norm": 0.6239480160142674,
"learning_rate": 3.298e-06,
"loss": 2.4271,
"step": 1650
},
{
"epoch": 0.0335,
"grad_norm": 0.6964721038747086,
"learning_rate": 3.348e-06,
"loss": 2.4168,
"step": 1675
},
{
"epoch": 0.034,
"grad_norm": 0.6246300346633158,
"learning_rate": 3.3980000000000003e-06,
"loss": 2.4312,
"step": 1700
},
{
"epoch": 0.034,
"eval_loss": 2.43576717376709,
"eval_runtime": 32.35,
"eval_samples_per_second": 3.153,
"eval_steps_per_second": 1.577,
"step": 1700
},
{
"epoch": 0.0345,
"grad_norm": 0.6609046760569887,
"learning_rate": 3.4480000000000003e-06,
"loss": 2.4201,
"step": 1725
},
{
"epoch": 0.035,
"grad_norm": 0.611833218468793,
"learning_rate": 3.4980000000000002e-06,
"loss": 2.4248,
"step": 1750
},
{
"epoch": 0.0355,
"grad_norm": 0.6374610168215615,
"learning_rate": 3.548e-06,
"loss": 2.4195,
"step": 1775
},
{
"epoch": 0.036,
"grad_norm": 0.608911757784224,
"learning_rate": 3.5980000000000005e-06,
"loss": 2.4207,
"step": 1800
},
{
"epoch": 0.036,
"eval_loss": 2.4352190494537354,
"eval_runtime": 32.4107,
"eval_samples_per_second": 3.147,
"eval_steps_per_second": 1.574,
"step": 1800
},
{
"epoch": 0.0365,
"grad_norm": 0.7277576842118675,
"learning_rate": 3.6480000000000005e-06,
"loss": 2.429,
"step": 1825
},
{
"epoch": 0.037,
"grad_norm": 0.6177267450079238,
"learning_rate": 3.6980000000000004e-06,
"loss": 2.4216,
"step": 1850
},
{
"epoch": 0.0375,
"grad_norm": 0.6909621222715888,
"learning_rate": 3.7480000000000004e-06,
"loss": 2.4141,
"step": 1875
},
{
"epoch": 0.038,
"grad_norm": 0.6271064789808471,
"learning_rate": 3.7980000000000007e-06,
"loss": 2.4204,
"step": 1900
},
{
"epoch": 0.038,
"eval_loss": 2.434185743331909,
"eval_runtime": 32.1923,
"eval_samples_per_second": 3.168,
"eval_steps_per_second": 1.584,
"step": 1900
},
{
"epoch": 0.0385,
"grad_norm": 4.465543129416645,
"learning_rate": 3.848e-06,
"loss": 2.4278,
"step": 1925
},
{
"epoch": 0.039,
"grad_norm": 0.59428248175071,
"learning_rate": 3.898e-06,
"loss": 2.4231,
"step": 1950
},
{
"epoch": 0.0395,
"grad_norm": 0.6300066797920092,
"learning_rate": 3.948e-06,
"loss": 2.4163,
"step": 1975
},
{
"epoch": 0.04,
"grad_norm": 0.5995770487766363,
"learning_rate": 3.9980000000000005e-06,
"loss": 2.4236,
"step": 2000
},
{
"epoch": 0.04,
"eval_loss": 2.433772563934326,
"eval_runtime": 32.062,
"eval_samples_per_second": 3.181,
"eval_steps_per_second": 1.591,
"step": 2000
},
{
"epoch": 0.0405,
"grad_norm": 0.595289417756029,
"learning_rate": 4.048e-06,
"loss": 2.424,
"step": 2025
},
{
"epoch": 0.041,
"grad_norm": 0.6134282240517589,
"learning_rate": 4.098e-06,
"loss": 2.4255,
"step": 2050
},
{
"epoch": 0.0415,
"grad_norm": 0.6629564791128602,
"learning_rate": 4.148000000000001e-06,
"loss": 2.4097,
"step": 2075
},
{
"epoch": 0.042,
"grad_norm": 0.621927005205136,
"learning_rate": 4.198e-06,
"loss": 2.4268,
"step": 2100
},
{
"epoch": 0.042,
"eval_loss": 2.433004379272461,
"eval_runtime": 32.0064,
"eval_samples_per_second": 3.187,
"eval_steps_per_second": 1.593,
"step": 2100
},
{
"epoch": 0.0425,
"grad_norm": 0.5955395744872489,
"learning_rate": 4.248000000000001e-06,
"loss": 2.4134,
"step": 2125
},
{
"epoch": 0.043,
"grad_norm": 0.630503522814338,
"learning_rate": 4.298e-06,
"loss": 2.4195,
"step": 2150
},
{
"epoch": 0.0435,
"grad_norm": 0.6187515125513555,
"learning_rate": 4.3480000000000006e-06,
"loss": 2.4258,
"step": 2175
},
{
"epoch": 0.044,
"grad_norm": 0.7454395191545767,
"learning_rate": 4.398000000000001e-06,
"loss": 2.4226,
"step": 2200
},
{
"epoch": 0.044,
"eval_loss": 2.4322543144226074,
"eval_runtime": 31.9813,
"eval_samples_per_second": 3.189,
"eval_steps_per_second": 1.595,
"step": 2200
},
{
"epoch": 0.0445,
"grad_norm": 0.6347211303495337,
"learning_rate": 4.4480000000000004e-06,
"loss": 2.4191,
"step": 2225
},
{
"epoch": 0.045,
"grad_norm": 0.6135245446733344,
"learning_rate": 4.498e-06,
"loss": 2.4229,
"step": 2250
},
{
"epoch": 0.0455,
"grad_norm": 0.6009500019971098,
"learning_rate": 4.548e-06,
"loss": 2.42,
"step": 2275
},
{
"epoch": 0.046,
"grad_norm": 0.661258489557284,
"learning_rate": 4.598e-06,
"loss": 2.4129,
"step": 2300
},
{
"epoch": 0.046,
"eval_loss": 2.432189464569092,
"eval_runtime": 31.9429,
"eval_samples_per_second": 3.193,
"eval_steps_per_second": 1.597,
"step": 2300
},
{
"epoch": 0.0465,
"grad_norm": 0.6139592783182132,
"learning_rate": 4.648e-06,
"loss": 2.4104,
"step": 2325
},
{
"epoch": 0.047,
"grad_norm": 0.583220993400474,
"learning_rate": 4.698000000000001e-06,
"loss": 2.4244,
"step": 2350
},
{
"epoch": 0.0475,
"grad_norm": 0.6293186545915876,
"learning_rate": 4.748e-06,
"loss": 2.4225,
"step": 2375
},
{
"epoch": 0.048,
"grad_norm": 0.5798657043139257,
"learning_rate": 4.7980000000000005e-06,
"loss": 2.4283,
"step": 2400
},
{
"epoch": 0.048,
"eval_loss": 2.4312729835510254,
"eval_runtime": 31.7379,
"eval_samples_per_second": 3.214,
"eval_steps_per_second": 1.607,
"step": 2400
},
{
"epoch": 0.0485,
"grad_norm": 0.6301056488676946,
"learning_rate": 4.848000000000001e-06,
"loss": 2.4238,
"step": 2425
},
{
"epoch": 0.049,
"grad_norm": 0.6050753634716797,
"learning_rate": 4.898e-06,
"loss": 2.4209,
"step": 2450
},
{
"epoch": 0.0495,
"grad_norm": 0.5954330421177886,
"learning_rate": 4.948000000000001e-06,
"loss": 2.4208,
"step": 2475
},
{
"epoch": 0.05,
"grad_norm": 0.6115913011006808,
"learning_rate": 4.998e-06,
"loss": 2.4199,
"step": 2500
},
{
"epoch": 0.05,
"eval_loss": 2.430593490600586,
"eval_runtime": 31.7859,
"eval_samples_per_second": 3.209,
"eval_steps_per_second": 1.604,
"step": 2500
},
{
"epoch": 0.0505,
"grad_norm": 0.6088167798442012,
"learning_rate": 5.048000000000001e-06,
"loss": 2.4204,
"step": 2525
},
{
"epoch": 0.051,
"grad_norm": 0.5886456022713933,
"learning_rate": 5.098000000000001e-06,
"loss": 2.4233,
"step": 2550
},
{
"epoch": 0.0515,
"grad_norm": 0.5755814876588983,
"learning_rate": 5.1480000000000005e-06,
"loss": 2.414,
"step": 2575
},
{
"epoch": 0.052,
"grad_norm": 0.6101796511458513,
"learning_rate": 5.198000000000001e-06,
"loss": 2.4134,
"step": 2600
},
{
"epoch": 0.052,
"eval_loss": 2.430147886276245,
"eval_runtime": 31.667,
"eval_samples_per_second": 3.221,
"eval_steps_per_second": 1.611,
"step": 2600
},
{
"epoch": 0.0525,
"grad_norm": 0.5829483894700689,
"learning_rate": 5.248000000000001e-06,
"loss": 2.4176,
"step": 2625
},
{
"epoch": 0.053,
"grad_norm": 0.5756679405925968,
"learning_rate": 5.298000000000001e-06,
"loss": 2.4196,
"step": 2650
},
{
"epoch": 0.0535,
"grad_norm": 0.6203149656143291,
"learning_rate": 5.348000000000001e-06,
"loss": 2.4128,
"step": 2675
},
{
"epoch": 0.054,
"grad_norm": 0.6107431848759605,
"learning_rate": 5.398e-06,
"loss": 2.4066,
"step": 2700
},
{
"epoch": 0.054,
"eval_loss": 2.4298744201660156,
"eval_runtime": 31.8888,
"eval_samples_per_second": 3.199,
"eval_steps_per_second": 1.599,
"step": 2700
},
{
"epoch": 0.0545,
"grad_norm": 0.6313360362618398,
"learning_rate": 5.448e-06,
"loss": 2.4116,
"step": 2725
},
{
"epoch": 0.055,
"grad_norm": 0.7567581764202255,
"learning_rate": 5.498e-06,
"loss": 2.4137,
"step": 2750
},
{
"epoch": 0.0555,
"grad_norm": 0.5808819096916863,
"learning_rate": 5.548e-06,
"loss": 2.4261,
"step": 2775
},
{
"epoch": 0.056,
"grad_norm": 0.7401050453151701,
"learning_rate": 5.5980000000000004e-06,
"loss": 2.4102,
"step": 2800
},
{
"epoch": 0.056,
"eval_loss": 2.429075002670288,
"eval_runtime": 31.9187,
"eval_samples_per_second": 3.196,
"eval_steps_per_second": 1.598,
"step": 2800
},
{
"epoch": 0.0565,
"grad_norm": 0.6100412128745759,
"learning_rate": 5.648e-06,
"loss": 2.4205,
"step": 2825
},
{
"epoch": 0.057,
"grad_norm": 0.6038298357908357,
"learning_rate": 5.698e-06,
"loss": 2.4104,
"step": 2850
},
{
"epoch": 0.0575,
"grad_norm": 0.6294303689076208,
"learning_rate": 5.748e-06,
"loss": 2.4101,
"step": 2875
},
{
"epoch": 0.058,
"grad_norm": 0.6000316496044382,
"learning_rate": 5.798e-06,
"loss": 2.4116,
"step": 2900
},
{
"epoch": 0.058,
"eval_loss": 2.428636074066162,
"eval_runtime": 31.9776,
"eval_samples_per_second": 3.19,
"eval_steps_per_second": 1.595,
"step": 2900
},
{
"epoch": 0.0585,
"grad_norm": 0.6662370599985865,
"learning_rate": 5.848000000000001e-06,
"loss": 2.4271,
"step": 2925
},
{
"epoch": 0.059,
"grad_norm": 0.6065686333783092,
"learning_rate": 5.898e-06,
"loss": 2.4141,
"step": 2950
},
{
"epoch": 0.0595,
"grad_norm": 0.5896191268179571,
"learning_rate": 5.9480000000000005e-06,
"loss": 2.4194,
"step": 2975
},
{
"epoch": 0.06,
"grad_norm": 0.5984986372167933,
"learning_rate": 5.998000000000001e-06,
"loss": 2.4107,
"step": 3000
},
{
"epoch": 0.06,
"eval_loss": 2.428344488143921,
"eval_runtime": 31.827,
"eval_samples_per_second": 3.205,
"eval_steps_per_second": 1.602,
"step": 3000
},
{
"epoch": 0.0605,
"grad_norm": 0.6057904687423932,
"learning_rate": 6.048e-06,
"loss": 2.4231,
"step": 3025
},
{
"epoch": 0.061,
"grad_norm": 0.5775023699888965,
"learning_rate": 6.098000000000001e-06,
"loss": 2.4193,
"step": 3050
},
{
"epoch": 0.0615,
"grad_norm": 0.5945486563983137,
"learning_rate": 6.148e-06,
"loss": 2.4101,
"step": 3075
},
{
"epoch": 0.062,
"grad_norm": 0.5893073406656858,
"learning_rate": 6.198000000000001e-06,
"loss": 2.41,
"step": 3100
},
{
"epoch": 0.062,
"eval_loss": 2.4278364181518555,
"eval_runtime": 31.4582,
"eval_samples_per_second": 3.242,
"eval_steps_per_second": 1.621,
"step": 3100
},
{
"epoch": 0.0625,
"grad_norm": 0.6413551002827471,
"learning_rate": 6.248000000000001e-06,
"loss": 2.4155,
"step": 3125
},
{
"epoch": 0.063,
"grad_norm": 0.5799664342522566,
"learning_rate": 6.2980000000000005e-06,
"loss": 2.409,
"step": 3150
},
{
"epoch": 0.0635,
"grad_norm": 0.5811811320062699,
"learning_rate": 6.348000000000001e-06,
"loss": 2.4103,
"step": 3175
},
{
"epoch": 0.064,
"grad_norm": 1.7009375984265656,
"learning_rate": 6.398000000000001e-06,
"loss": 2.4063,
"step": 3200
},
{
"epoch": 0.064,
"eval_loss": 2.4270801544189453,
"eval_runtime": 31.5638,
"eval_samples_per_second": 3.232,
"eval_steps_per_second": 1.616,
"step": 3200
},
{
"epoch": 0.0645,
"grad_norm": 0.5922661228031734,
"learning_rate": 6.448000000000001e-06,
"loss": 2.4146,
"step": 3225
},
{
"epoch": 0.065,
"grad_norm": 0.6108654698148237,
"learning_rate": 6.498000000000001e-06,
"loss": 2.4202,
"step": 3250
},
{
"epoch": 0.0655,
"grad_norm": 0.5882408729466215,
"learning_rate": 6.548000000000001e-06,
"loss": 2.4226,
"step": 3275
},
{
"epoch": 0.066,
"grad_norm": 0.6095634937429834,
"learning_rate": 6.598000000000001e-06,
"loss": 2.4175,
"step": 3300
},
{
"epoch": 0.066,
"eval_loss": 2.4271743297576904,
"eval_runtime": 31.605,
"eval_samples_per_second": 3.227,
"eval_steps_per_second": 1.614,
"step": 3300
},
{
"epoch": 0.0665,
"grad_norm": 0.584006486469731,
"learning_rate": 6.648e-06,
"loss": 2.4183,
"step": 3325
},
{
"epoch": 0.067,
"grad_norm": 0.6183114977641251,
"learning_rate": 6.698e-06,
"loss": 2.4074,
"step": 3350
},
{
"epoch": 0.0675,
"grad_norm": 0.6102359150325862,
"learning_rate": 6.7480000000000004e-06,
"loss": 2.4168,
"step": 3375
},
{
"epoch": 0.068,
"grad_norm": 0.6988080460632056,
"learning_rate": 6.798e-06,
"loss": 2.433,
"step": 3400
},
{
"epoch": 0.068,
"eval_loss": 2.4267990589141846,
"eval_runtime": 31.5337,
"eval_samples_per_second": 3.235,
"eval_steps_per_second": 1.617,
"step": 3400
},
{
"epoch": 0.0685,
"grad_norm": 0.5923385092093629,
"learning_rate": 6.848e-06,
"loss": 2.4137,
"step": 3425
},
{
"epoch": 0.069,
"grad_norm": 0.5873912274008383,
"learning_rate": 6.898e-06,
"loss": 2.4183,
"step": 3450
},
{
"epoch": 0.0695,
"grad_norm": 0.5885684717655756,
"learning_rate": 6.948e-06,
"loss": 2.4282,
"step": 3475
},
{
"epoch": 0.07,
"grad_norm": 0.6026217656458652,
"learning_rate": 6.998000000000001e-06,
"loss": 2.4234,
"step": 3500
},
{
"epoch": 0.07,
"eval_loss": 2.4262564182281494,
"eval_runtime": 31.7503,
"eval_samples_per_second": 3.213,
"eval_steps_per_second": 1.606,
"step": 3500
},
{
"epoch": 0.0705,
"grad_norm": 0.5820881270462898,
"learning_rate": 7.048e-06,
"loss": 2.413,
"step": 3525
},
{
"epoch": 0.071,
"grad_norm": 0.6178510668793894,
"learning_rate": 7.0980000000000005e-06,
"loss": 2.3954,
"step": 3550
},
{
"epoch": 0.0715,
"grad_norm": 0.6186160369787075,
"learning_rate": 7.148000000000001e-06,
"loss": 2.4153,
"step": 3575
},
{
"epoch": 0.072,
"grad_norm": 0.6053079331192983,
"learning_rate": 7.198e-06,
"loss": 2.4061,
"step": 3600
},
{
"epoch": 0.072,
"eval_loss": 2.4260003566741943,
"eval_runtime": 31.4103,
"eval_samples_per_second": 3.247,
"eval_steps_per_second": 1.624,
"step": 3600
},
{
"epoch": 0.0725,
"grad_norm": 0.6002224672812325,
"learning_rate": 7.248000000000001e-06,
"loss": 2.4062,
"step": 3625
},
{
"epoch": 0.073,
"grad_norm": 0.616881726200715,
"learning_rate": 7.298e-06,
"loss": 2.4167,
"step": 3650
},
{
"epoch": 0.0735,
"grad_norm": 0.6148731575970318,
"learning_rate": 7.348000000000001e-06,
"loss": 2.4123,
"step": 3675
},
{
"epoch": 0.074,
"grad_norm": 0.6221338587681139,
"learning_rate": 7.398000000000001e-06,
"loss": 2.4199,
"step": 3700
},
{
"epoch": 0.074,
"eval_loss": 2.4258594512939453,
"eval_runtime": 31.717,
"eval_samples_per_second": 3.216,
"eval_steps_per_second": 1.608,
"step": 3700
},
{
"epoch": 0.0745,
"grad_norm": 0.6024880998969679,
"learning_rate": 7.4480000000000005e-06,
"loss": 2.4187,
"step": 3725
},
{
"epoch": 0.075,
"grad_norm": 0.5998431875234804,
"learning_rate": 7.498000000000001e-06,
"loss": 2.4045,
"step": 3750
},
{
"epoch": 0.0755,
"grad_norm": 0.5963168253580089,
"learning_rate": 7.548000000000001e-06,
"loss": 2.4161,
"step": 3775
},
{
"epoch": 0.076,
"grad_norm": 0.5891194096424622,
"learning_rate": 7.598000000000001e-06,
"loss": 2.4217,
"step": 3800
},
{
"epoch": 0.076,
"eval_loss": 2.425435781478882,
"eval_runtime": 32.0333,
"eval_samples_per_second": 3.184,
"eval_steps_per_second": 1.592,
"step": 3800
},
{
"epoch": 0.0765,
"grad_norm": 0.6220515512248757,
"learning_rate": 7.648e-06,
"loss": 2.4115,
"step": 3825
},
{
"epoch": 0.077,
"grad_norm": 0.592208980582776,
"learning_rate": 7.698000000000002e-06,
"loss": 2.4123,
"step": 3850
},
{
"epoch": 0.0775,
"grad_norm": 0.6050688229723428,
"learning_rate": 7.748000000000001e-06,
"loss": 2.4124,
"step": 3875
},
{
"epoch": 0.078,
"grad_norm": 0.6128946719272819,
"learning_rate": 7.798e-06,
"loss": 2.4167,
"step": 3900
},
{
"epoch": 0.078,
"eval_loss": 2.4252073764801025,
"eval_runtime": 31.7629,
"eval_samples_per_second": 3.211,
"eval_steps_per_second": 1.606,
"step": 3900
},
{
"epoch": 0.0785,
"grad_norm": 0.6300203936594084,
"learning_rate": 7.848000000000002e-06,
"loss": 2.4253,
"step": 3925
},
{
"epoch": 0.079,
"grad_norm": 0.622492494084331,
"learning_rate": 7.898e-06,
"loss": 2.4126,
"step": 3950
},
{
"epoch": 0.0795,
"grad_norm": 0.6054040520886763,
"learning_rate": 7.948e-06,
"loss": 2.4082,
"step": 3975
},
{
"epoch": 0.08,
"grad_norm": 0.5997365393444213,
"learning_rate": 7.998e-06,
"loss": 2.4187,
"step": 4000
},
{
"epoch": 0.08,
"eval_loss": 2.4248712062835693,
"eval_runtime": 31.7678,
"eval_samples_per_second": 3.211,
"eval_steps_per_second": 1.605,
"step": 4000
},
{
"epoch": 0.0805,
"grad_norm": 0.5914805613039377,
"learning_rate": 8.048e-06,
"loss": 2.4136,
"step": 4025
},
{
"epoch": 0.081,
"grad_norm": 0.6868999656119101,
"learning_rate": 8.098000000000001e-06,
"loss": 2.4071,
"step": 4050
},
{
"epoch": 0.0815,
"grad_norm": 0.6116238023737347,
"learning_rate": 8.148e-06,
"loss": 2.399,
"step": 4075
},
{
"epoch": 0.082,
"grad_norm": 0.6278682082032867,
"learning_rate": 8.198e-06,
"loss": 2.4147,
"step": 4100
},
{
"epoch": 0.082,
"eval_loss": 2.424673318862915,
"eval_runtime": 31.702,
"eval_samples_per_second": 3.217,
"eval_steps_per_second": 1.609,
"step": 4100
},
{
"epoch": 0.0825,
"grad_norm": 0.652529340562497,
"learning_rate": 8.248e-06,
"loss": 2.4122,
"step": 4125
},
{
"epoch": 0.083,
"grad_norm": 0.6241764244719189,
"learning_rate": 8.298000000000001e-06,
"loss": 2.4034,
"step": 4150
},
{
"epoch": 0.0835,
"grad_norm": 0.6093599459247064,
"learning_rate": 8.348e-06,
"loss": 2.4184,
"step": 4175
},
{
"epoch": 0.084,
"grad_norm": 0.6145457262520279,
"learning_rate": 8.398e-06,
"loss": 2.4099,
"step": 4200
},
{
"epoch": 0.084,
"eval_loss": 2.424262046813965,
"eval_runtime": 31.7126,
"eval_samples_per_second": 3.216,
"eval_steps_per_second": 1.608,
"step": 4200
},
{
"epoch": 0.0845,
"grad_norm": 0.6094287468338311,
"learning_rate": 8.448000000000001e-06,
"loss": 2.413,
"step": 4225
},
{
"epoch": 0.085,
"grad_norm": 0.6138052906293812,
"learning_rate": 8.498e-06,
"loss": 2.3935,
"step": 4250
},
{
"epoch": 0.0855,
"grad_norm": 0.6122465571930669,
"learning_rate": 8.548e-06,
"loss": 2.4061,
"step": 4275
},
{
"epoch": 0.086,
"grad_norm": 0.612830490698143,
"learning_rate": 8.598000000000001e-06,
"loss": 2.4112,
"step": 4300
},
{
"epoch": 0.086,
"eval_loss": 2.4238767623901367,
"eval_runtime": 31.7292,
"eval_samples_per_second": 3.215,
"eval_steps_per_second": 1.607,
"step": 4300
},
{
"epoch": 0.0865,
"grad_norm": 0.628133619898939,
"learning_rate": 8.648000000000001e-06,
"loss": 2.4046,
"step": 4325
},
{
"epoch": 0.087,
"grad_norm": 0.6496528950628708,
"learning_rate": 8.698e-06,
"loss": 2.4068,
"step": 4350
},
{
"epoch": 0.0875,
"grad_norm": 0.5799286999894695,
"learning_rate": 8.748000000000002e-06,
"loss": 2.4072,
"step": 4375
},
{
"epoch": 0.088,
"grad_norm": 0.5910425054287555,
"learning_rate": 8.798000000000001e-06,
"loss": 2.3926,
"step": 4400
},
{
"epoch": 0.088,
"eval_loss": 2.4238674640655518,
"eval_runtime": 31.7606,
"eval_samples_per_second": 3.212,
"eval_steps_per_second": 1.606,
"step": 4400
},
{
"epoch": 0.0885,
"grad_norm": 0.6159620367072861,
"learning_rate": 8.848e-06,
"loss": 2.4115,
"step": 4425
},
{
"epoch": 0.089,
"grad_norm": 0.6972746637095123,
"learning_rate": 8.898000000000002e-06,
"loss": 2.4105,
"step": 4450
},
{
"epoch": 0.0895,
"grad_norm": 0.585353172093314,
"learning_rate": 8.948000000000001e-06,
"loss": 2.4198,
"step": 4475
},
{
"epoch": 0.09,
"grad_norm": 0.6059468344943013,
"learning_rate": 8.998000000000001e-06,
"loss": 2.4069,
"step": 4500
},
{
"epoch": 0.09,
"eval_loss": 2.42350435256958,
"eval_runtime": 31.6869,
"eval_samples_per_second": 3.219,
"eval_steps_per_second": 1.609,
"step": 4500
},
{
"epoch": 0.0905,
"grad_norm": 0.6015924987371338,
"learning_rate": 9.048e-06,
"loss": 2.4081,
"step": 4525
},
{
"epoch": 0.091,
"grad_norm": 0.6006000726208087,
"learning_rate": 9.098000000000002e-06,
"loss": 2.4079,
"step": 4550
},
{
"epoch": 0.0915,
"grad_norm": 0.6334216081429662,
"learning_rate": 9.148e-06,
"loss": 2.4021,
"step": 4575
},
{
"epoch": 0.092,
"grad_norm": 0.618758486975248,
"learning_rate": 9.198e-06,
"loss": 2.4191,
"step": 4600
},
{
"epoch": 0.092,
"eval_loss": 2.42366361618042,
"eval_runtime": 31.7351,
"eval_samples_per_second": 3.214,
"eval_steps_per_second": 1.607,
"step": 4600
},
{
"epoch": 0.0925,
"grad_norm": 0.5982185393268022,
"learning_rate": 9.248e-06,
"loss": 2.4131,
"step": 4625
},
{
"epoch": 0.093,
"grad_norm": 0.5778256378393931,
"learning_rate": 9.298e-06,
"loss": 2.4105,
"step": 4650
},
{
"epoch": 0.0935,
"grad_norm": 0.5892823966497687,
"learning_rate": 9.348000000000001e-06,
"loss": 2.4146,
"step": 4675
},
{
"epoch": 0.094,
"grad_norm": 0.6000897787974973,
"learning_rate": 9.398e-06,
"loss": 2.4141,
"step": 4700
},
{
"epoch": 0.094,
"eval_loss": 2.4225125312805176,
"eval_runtime": 31.7008,
"eval_samples_per_second": 3.218,
"eval_steps_per_second": 1.609,
"step": 4700
},
{
"epoch": 0.0945,
"grad_norm": 0.6317324097500899,
"learning_rate": 9.448e-06,
"loss": 2.4157,
"step": 4725
},
{
"epoch": 0.095,
"grad_norm": 0.6157270042215848,
"learning_rate": 9.498000000000001e-06,
"loss": 2.4091,
"step": 4750
},
{
"epoch": 0.0955,
"grad_norm": 0.5753740107095965,
"learning_rate": 9.548e-06,
"loss": 2.4142,
"step": 4775
},
{
"epoch": 0.096,
"grad_norm": 0.6173977503240126,
"learning_rate": 9.598e-06,
"loss": 2.4083,
"step": 4800
},
{
"epoch": 0.096,
"eval_loss": 2.422691583633423,
"eval_runtime": 31.4709,
"eval_samples_per_second": 3.241,
"eval_steps_per_second": 1.621,
"step": 4800
},
{
"epoch": 0.0965,
"grad_norm": 0.5942953368600239,
"learning_rate": 9.648000000000001e-06,
"loss": 2.4087,
"step": 4825
},
{
"epoch": 0.097,
"grad_norm": 0.6555799317672051,
"learning_rate": 9.698000000000001e-06,
"loss": 2.4014,
"step": 4850
},
{
"epoch": 0.0975,
"grad_norm": 0.5757950367748221,
"learning_rate": 9.748e-06,
"loss": 2.4068,
"step": 4875
},
{
"epoch": 0.098,
"grad_norm": 0.632774385045014,
"learning_rate": 9.798e-06,
"loss": 2.4087,
"step": 4900
},
{
"epoch": 0.098,
"eval_loss": 2.4220755100250244,
"eval_runtime": 31.4352,
"eval_samples_per_second": 3.245,
"eval_steps_per_second": 1.622,
"step": 4900
},
{
"epoch": 0.0985,
"grad_norm": 0.5781361622989438,
"learning_rate": 9.848000000000001e-06,
"loss": 2.4143,
"step": 4925
},
{
"epoch": 0.099,
"grad_norm": 0.6262568188074606,
"learning_rate": 9.898e-06,
"loss": 2.4142,
"step": 4950
},
{
"epoch": 0.0995,
"grad_norm": 0.6349024994263993,
"learning_rate": 9.948e-06,
"loss": 2.4086,
"step": 4975
},
{
"epoch": 0.1,
"grad_norm": 0.5902257687086163,
"learning_rate": 9.998000000000002e-06,
"loss": 2.4075,
"step": 5000
},
{
"epoch": 0.1,
"eval_loss": 2.4221627712249756,
"eval_runtime": 31.4547,
"eval_samples_per_second": 3.243,
"eval_steps_per_second": 1.621,
"step": 5000
},
{
"epoch": 0.1005,
"grad_norm": 0.6096554216132576,
"learning_rate": 9.994666666666668e-06,
"loss": 2.4056,
"step": 5025
},
{
"epoch": 0.101,
"grad_norm": 0.6157713116203616,
"learning_rate": 9.989111111111111e-06,
"loss": 2.4104,
"step": 5050
},
{
"epoch": 0.1015,
"grad_norm": 0.6100961136574927,
"learning_rate": 9.983555555555556e-06,
"loss": 2.4041,
"step": 5075
},
{
"epoch": 0.102,
"grad_norm": 0.5965243725355741,
"learning_rate": 9.978000000000002e-06,
"loss": 2.406,
"step": 5100
},
{
"epoch": 0.102,
"eval_loss": 2.4214208126068115,
"eval_runtime": 31.4633,
"eval_samples_per_second": 3.242,
"eval_steps_per_second": 1.621,
"step": 5100
},
{
"epoch": 0.1025,
"grad_norm": 0.7288147495415569,
"learning_rate": 9.972444444444445e-06,
"loss": 2.419,
"step": 5125
},
{
"epoch": 0.103,
"grad_norm": 0.6027052437896476,
"learning_rate": 9.966888888888889e-06,
"loss": 2.4149,
"step": 5150
},
{
"epoch": 0.1035,
"grad_norm": 0.6351514057651396,
"learning_rate": 9.961333333333334e-06,
"loss": 2.4053,
"step": 5175
},
{
"epoch": 0.104,
"grad_norm": 0.5912339833990681,
"learning_rate": 9.95577777777778e-06,
"loss": 2.4099,
"step": 5200
},
{
"epoch": 0.104,
"eval_loss": 2.4213571548461914,
"eval_runtime": 31.7689,
"eval_samples_per_second": 3.211,
"eval_steps_per_second": 1.605,
"step": 5200
},
{
"epoch": 0.1045,
"grad_norm": 0.6252419519280321,
"learning_rate": 9.950222222222223e-06,
"loss": 2.4044,
"step": 5225
},
{
"epoch": 0.105,
"grad_norm": 0.5932871252062307,
"learning_rate": 9.944666666666668e-06,
"loss": 2.4041,
"step": 5250
},
{
"epoch": 0.1055,
"grad_norm": 0.6265014889786313,
"learning_rate": 9.939111111111112e-06,
"loss": 2.4121,
"step": 5275
},
{
"epoch": 0.106,
"grad_norm": 0.5586876350334784,
"learning_rate": 9.933555555555557e-06,
"loss": 2.4005,
"step": 5300
},
{
"epoch": 0.106,
"eval_loss": 2.4209611415863037,
"eval_runtime": 31.4697,
"eval_samples_per_second": 3.241,
"eval_steps_per_second": 1.621,
"step": 5300
},
{
"epoch": 0.1065,
"grad_norm": 0.6208578145519013,
"learning_rate": 9.928e-06,
"loss": 2.4095,
"step": 5325
},
{
"epoch": 0.107,
"grad_norm": 0.5761711209442947,
"learning_rate": 9.922444444444446e-06,
"loss": 2.411,
"step": 5350
},
{
"epoch": 0.1075,
"grad_norm": 0.6259961321288001,
"learning_rate": 9.91688888888889e-06,
"loss": 2.4062,
"step": 5375
},
{
"epoch": 0.108,
"grad_norm": 0.6636296843455429,
"learning_rate": 9.911333333333335e-06,
"loss": 2.411,
"step": 5400
},
{
"epoch": 0.108,
"eval_loss": 2.420535087585449,
"eval_runtime": 31.4447,
"eval_samples_per_second": 3.244,
"eval_steps_per_second": 1.622,
"step": 5400
},
{
"epoch": 0.1085,
"grad_norm": 0.5977322049971575,
"learning_rate": 9.905777777777778e-06,
"loss": 2.4073,
"step": 5425
},
{
"epoch": 0.109,
"grad_norm": 0.605286836273461,
"learning_rate": 9.900222222222223e-06,
"loss": 2.4023,
"step": 5450
},
{
"epoch": 0.1095,
"grad_norm": 0.6244785501127309,
"learning_rate": 9.894666666666669e-06,
"loss": 2.4084,
"step": 5475
},
{
"epoch": 0.11,
"grad_norm": 0.6135442380195029,
"learning_rate": 9.889111111111112e-06,
"loss": 2.4068,
"step": 5500
},
{
"epoch": 0.11,
"eval_loss": 2.4201102256774902,
"eval_runtime": 31.806,
"eval_samples_per_second": 3.207,
"eval_steps_per_second": 1.603,
"step": 5500
},
{
"epoch": 0.1105,
"grad_norm": 0.6260082123047037,
"learning_rate": 9.883555555555556e-06,
"loss": 2.4053,
"step": 5525
},
{
"epoch": 0.111,
"grad_norm": 0.5956336151974914,
"learning_rate": 9.878000000000001e-06,
"loss": 2.4152,
"step": 5550
},
{
"epoch": 0.1115,
"grad_norm": 0.6149620176113736,
"learning_rate": 9.872444444444446e-06,
"loss": 2.4055,
"step": 5575
},
{
"epoch": 0.112,
"grad_norm": 0.6326092489345128,
"learning_rate": 9.86688888888889e-06,
"loss": 2.3968,
"step": 5600
},
{
"epoch": 0.112,
"eval_loss": 2.420125722885132,
"eval_runtime": 31.8082,
"eval_samples_per_second": 3.207,
"eval_steps_per_second": 1.603,
"step": 5600
},
{
"epoch": 0.1125,
"grad_norm": 0.6390446494212693,
"learning_rate": 9.861333333333333e-06,
"loss": 2.4045,
"step": 5625
},
{
"epoch": 0.113,
"grad_norm": 0.6670896967232433,
"learning_rate": 9.855777777777779e-06,
"loss": 2.4013,
"step": 5650
},
{
"epoch": 0.1135,
"grad_norm": 0.6185087617978746,
"learning_rate": 9.850222222222224e-06,
"loss": 2.4015,
"step": 5675
},
{
"epoch": 0.114,
"grad_norm": 0.6040525454825223,
"learning_rate": 9.844666666666667e-06,
"loss": 2.4109,
"step": 5700
},
{
"epoch": 0.114,
"eval_loss": 2.419764518737793,
"eval_runtime": 31.7256,
"eval_samples_per_second": 3.215,
"eval_steps_per_second": 1.608,
"step": 5700
},
{
"epoch": 0.1145,
"grad_norm": 0.6010942125132981,
"learning_rate": 9.839111111111111e-06,
"loss": 2.4092,
"step": 5725
},
{
"epoch": 0.115,
"grad_norm": 0.602852118998295,
"learning_rate": 9.833555555555556e-06,
"loss": 2.414,
"step": 5750
},
{
"epoch": 0.1155,
"grad_norm": 0.6189454944937772,
"learning_rate": 9.828000000000001e-06,
"loss": 2.4112,
"step": 5775
},
{
"epoch": 0.116,
"grad_norm": 0.5871735622958322,
"learning_rate": 9.822444444444445e-06,
"loss": 2.3993,
"step": 5800
},
{
"epoch": 0.116,
"eval_loss": 2.419255495071411,
"eval_runtime": 31.7146,
"eval_samples_per_second": 3.216,
"eval_steps_per_second": 1.608,
"step": 5800
},
{
"epoch": 0.1165,
"grad_norm": 0.5719116548117884,
"learning_rate": 9.81688888888889e-06,
"loss": 2.4128,
"step": 5825
},
{
"epoch": 0.117,
"grad_norm": 0.5855276996729913,
"learning_rate": 9.811333333333334e-06,
"loss": 2.4127,
"step": 5850
},
{
"epoch": 0.1175,
"grad_norm": 0.5948413134062237,
"learning_rate": 9.805777777777779e-06,
"loss": 2.4028,
"step": 5875
},
{
"epoch": 0.118,
"grad_norm": 0.6114053718118341,
"learning_rate": 9.800222222222223e-06,
"loss": 2.4085,
"step": 5900
},
{
"epoch": 0.118,
"eval_loss": 2.4192631244659424,
"eval_runtime": 31.8221,
"eval_samples_per_second": 3.205,
"eval_steps_per_second": 1.603,
"step": 5900
},
{
"epoch": 0.1185,
"grad_norm": 0.6171839632107143,
"learning_rate": 9.794666666666668e-06,
"loss": 2.4063,
"step": 5925
},
{
"epoch": 0.119,
"grad_norm": 0.5985426708940325,
"learning_rate": 9.789111111111111e-06,
"loss": 2.401,
"step": 5950
},
{
"epoch": 0.1195,
"grad_norm": 0.6242757087701617,
"learning_rate": 9.783555555555557e-06,
"loss": 2.3977,
"step": 5975
},
{
"epoch": 0.12,
"grad_norm": 0.6472329844420622,
"learning_rate": 9.778e-06,
"loss": 2.4066,
"step": 6000
},
{
"epoch": 0.12,
"eval_loss": 2.4190170764923096,
"eval_runtime": 31.973,
"eval_samples_per_second": 3.19,
"eval_steps_per_second": 1.595,
"step": 6000
},
{
"epoch": 0.1205,
"grad_norm": 0.5979904516506753,
"learning_rate": 9.772444444444445e-06,
"loss": 2.4044,
"step": 6025
},
{
"epoch": 0.121,
"grad_norm": 0.5980588594331456,
"learning_rate": 9.76688888888889e-06,
"loss": 2.41,
"step": 6050
},
{
"epoch": 0.1215,
"grad_norm": 0.6344150039672136,
"learning_rate": 9.761333333333334e-06,
"loss": 2.4,
"step": 6075
},
{
"epoch": 0.122,
"grad_norm": 0.6035110768502723,
"learning_rate": 9.755777777777778e-06,
"loss": 2.4148,
"step": 6100
},
{
"epoch": 0.122,
"eval_loss": 2.418259382247925,
"eval_runtime": 31.784,
"eval_samples_per_second": 3.209,
"eval_steps_per_second": 1.605,
"step": 6100
},
{
"epoch": 0.1225,
"grad_norm": 0.5792932239951794,
"learning_rate": 9.750222222222223e-06,
"loss": 2.4061,
"step": 6125
},
{
"epoch": 0.123,
"grad_norm": 0.6529554995007899,
"learning_rate": 9.744666666666668e-06,
"loss": 2.4036,
"step": 6150
},
{
"epoch": 0.1235,
"grad_norm": 0.5946064726146467,
"learning_rate": 9.739111111111112e-06,
"loss": 2.4014,
"step": 6175
},
{
"epoch": 0.124,
"grad_norm": 0.5739473618849045,
"learning_rate": 9.733555555555555e-06,
"loss": 2.4057,
"step": 6200
},
{
"epoch": 0.124,
"eval_loss": 2.4179208278656006,
"eval_runtime": 31.6981,
"eval_samples_per_second": 3.218,
"eval_steps_per_second": 1.609,
"step": 6200
},
{
"epoch": 0.1245,
"grad_norm": 0.6907211114020956,
"learning_rate": 9.728e-06,
"loss": 2.393,
"step": 6225
},
{
"epoch": 0.125,
"grad_norm": 0.6225931887903327,
"learning_rate": 9.722444444444446e-06,
"loss": 2.4147,
"step": 6250
},
{
"epoch": 0.1255,
"grad_norm": 0.568397246680531,
"learning_rate": 9.71688888888889e-06,
"loss": 2.4024,
"step": 6275
},
{
"epoch": 0.126,
"grad_norm": 0.5842879344272728,
"learning_rate": 9.711333333333333e-06,
"loss": 2.404,
"step": 6300
},
{
"epoch": 0.126,
"eval_loss": 2.4178576469421387,
"eval_runtime": 31.7994,
"eval_samples_per_second": 3.208,
"eval_steps_per_second": 1.604,
"step": 6300
},
{
"epoch": 0.1265,
"grad_norm": 0.5805192382099048,
"learning_rate": 9.705777777777778e-06,
"loss": 2.4063,
"step": 6325
},
{
"epoch": 0.127,
"grad_norm": 0.6600294122711824,
"learning_rate": 9.700222222222224e-06,
"loss": 2.4078,
"step": 6350
},
{
"epoch": 0.1275,
"grad_norm": 0.6263098682936462,
"learning_rate": 9.694666666666667e-06,
"loss": 2.3961,
"step": 6375
},
{
"epoch": 0.128,
"grad_norm": 0.6961912679129473,
"learning_rate": 9.68911111111111e-06,
"loss": 2.4127,
"step": 6400
},
{
"epoch": 0.128,
"eval_loss": 2.417247772216797,
"eval_runtime": 31.7325,
"eval_samples_per_second": 3.214,
"eval_steps_per_second": 1.607,
"step": 6400
},
{
"epoch": 0.1285,
"grad_norm": 0.6396950069271417,
"learning_rate": 9.683555555555556e-06,
"loss": 2.4041,
"step": 6425
},
{
"epoch": 0.129,
"grad_norm": 0.6164180606933177,
"learning_rate": 9.678000000000001e-06,
"loss": 2.4,
"step": 6450
},
{
"epoch": 0.1295,
"grad_norm": 0.6120640198257105,
"learning_rate": 9.672444444444445e-06,
"loss": 2.3966,
"step": 6475
},
{
"epoch": 0.13,
"grad_norm": 0.6013045247718226,
"learning_rate": 9.66688888888889e-06,
"loss": 2.3991,
"step": 6500
},
{
"epoch": 0.13,
"eval_loss": 2.417280673980713,
"eval_runtime": 31.8112,
"eval_samples_per_second": 3.206,
"eval_steps_per_second": 1.603,
"step": 6500
},
{
"epoch": 0.1305,
"grad_norm": 0.6061836537875764,
"learning_rate": 9.661333333333334e-06,
"loss": 2.4161,
"step": 6525
},
{
"epoch": 0.131,
"grad_norm": 0.6100864625060891,
"learning_rate": 9.655777777777779e-06,
"loss": 2.4052,
"step": 6550
},
{
"epoch": 0.1315,
"grad_norm": 0.6932893052541476,
"learning_rate": 9.650222222222222e-06,
"loss": 2.4036,
"step": 6575
},
{
"epoch": 0.132,
"grad_norm": 0.5859072202807338,
"learning_rate": 9.644666666666668e-06,
"loss": 2.4045,
"step": 6600
},
{
"epoch": 0.132,
"eval_loss": 2.416877031326294,
"eval_runtime": 31.5203,
"eval_samples_per_second": 3.236,
"eval_steps_per_second": 1.618,
"step": 6600
},
{
"epoch": 0.1325,
"grad_norm": 0.579002436095642,
"learning_rate": 9.639111111111113e-06,
"loss": 2.4015,
"step": 6625
},
{
"epoch": 0.133,
"grad_norm": 0.5968858601649685,
"learning_rate": 9.633555555555556e-06,
"loss": 2.3986,
"step": 6650
},
{
"epoch": 0.1335,
"grad_norm": 0.5964714549861985,
"learning_rate": 9.628e-06,
"loss": 2.4062,
"step": 6675
},
{
"epoch": 0.134,
"grad_norm": 0.6126102944808797,
"learning_rate": 9.622444444444445e-06,
"loss": 2.4033,
"step": 6700
},
{
"epoch": 0.134,
"eval_loss": 2.4164350032806396,
"eval_runtime": 31.4543,
"eval_samples_per_second": 3.243,
"eval_steps_per_second": 1.621,
"step": 6700
},
{
"epoch": 0.1345,
"grad_norm": 0.5774452345333466,
"learning_rate": 9.61688888888889e-06,
"loss": 2.3997,
"step": 6725
},
{
"epoch": 0.135,
"grad_norm": 0.6227260743975279,
"learning_rate": 9.611333333333334e-06,
"loss": 2.4018,
"step": 6750
},
{
"epoch": 0.1355,
"grad_norm": 0.5846707991616706,
"learning_rate": 9.605777777777778e-06,
"loss": 2.3985,
"step": 6775
},
{
"epoch": 0.136,
"grad_norm": 0.6172483484063671,
"learning_rate": 9.600222222222223e-06,
"loss": 2.4213,
"step": 6800
},
{
"epoch": 0.136,
"eval_loss": 2.41625714302063,
"eval_runtime": 31.5517,
"eval_samples_per_second": 3.233,
"eval_steps_per_second": 1.616,
"step": 6800
},
{
"epoch": 0.1365,
"grad_norm": 0.5965299711032601,
"learning_rate": 9.594666666666668e-06,
"loss": 2.3976,
"step": 6825
},
{
"epoch": 0.137,
"grad_norm": 0.5884739304234496,
"learning_rate": 9.589111111111112e-06,
"loss": 2.3947,
"step": 6850
},
{
"epoch": 0.1375,
"grad_norm": 0.5737065693146471,
"learning_rate": 9.583555555555555e-06,
"loss": 2.3983,
"step": 6875
},
{
"epoch": 0.138,
"grad_norm": 0.6249698819825935,
"learning_rate": 9.578e-06,
"loss": 2.4008,
"step": 6900
},
{
"epoch": 0.138,
"eval_loss": 2.4156551361083984,
"eval_runtime": 31.5071,
"eval_samples_per_second": 3.237,
"eval_steps_per_second": 1.619,
"step": 6900
},
{
"epoch": 0.1385,
"grad_norm": 0.5930008566650997,
"learning_rate": 9.572444444444446e-06,
"loss": 2.3951,
"step": 6925
},
{
"epoch": 0.139,
"grad_norm": 0.6564746022716046,
"learning_rate": 9.56688888888889e-06,
"loss": 2.4083,
"step": 6950
},
{
"epoch": 0.1395,
"grad_norm": 0.611311960098376,
"learning_rate": 9.561333333333333e-06,
"loss": 2.4032,
"step": 6975
},
{
"epoch": 0.14,
"grad_norm": 0.594692534551516,
"learning_rate": 9.555777777777778e-06,
"loss": 2.41,
"step": 7000
},
{
"epoch": 0.14,
"eval_loss": 2.415269374847412,
"eval_runtime": 31.7535,
"eval_samples_per_second": 3.212,
"eval_steps_per_second": 1.606,
"step": 7000
},
{
"epoch": 0.1405,
"grad_norm": 0.5975652527083385,
"learning_rate": 9.550222222222223e-06,
"loss": 2.398,
"step": 7025
},
{
"epoch": 0.141,
"grad_norm": 0.5642285559875744,
"learning_rate": 9.544666666666667e-06,
"loss": 2.3907,
"step": 7050
},
{
"epoch": 0.1415,
"grad_norm": 0.5977243463765347,
"learning_rate": 9.539111111111112e-06,
"loss": 2.4063,
"step": 7075
},
{
"epoch": 0.142,
"grad_norm": 0.5938091922766982,
"learning_rate": 9.533555555555556e-06,
"loss": 2.4064,
"step": 7100
},
{
"epoch": 0.142,
"eval_loss": 2.4153244495391846,
"eval_runtime": 31.6856,
"eval_samples_per_second": 3.219,
"eval_steps_per_second": 1.61,
"step": 7100
},
{
"epoch": 0.1425,
"grad_norm": 0.6203811817044198,
"learning_rate": 9.528000000000001e-06,
"loss": 2.3995,
"step": 7125
},
{
"epoch": 0.143,
"grad_norm": 0.5748373728564159,
"learning_rate": 9.522444444444444e-06,
"loss": 2.4052,
"step": 7150
},
{
"epoch": 0.1435,
"grad_norm": 0.6318360721408016,
"learning_rate": 9.51688888888889e-06,
"loss": 2.396,
"step": 7175
},
{
"epoch": 0.144,
"grad_norm": 0.5777480191110791,
"learning_rate": 9.511333333333335e-06,
"loss": 2.3966,
"step": 7200
},
{
"epoch": 0.144,
"eval_loss": 2.414691209793091,
"eval_runtime": 31.5495,
"eval_samples_per_second": 3.233,
"eval_steps_per_second": 1.617,
"step": 7200
},
{
"epoch": 0.1445,
"grad_norm": 0.5896122820881663,
"learning_rate": 9.505777777777779e-06,
"loss": 2.4018,
"step": 7225
},
{
"epoch": 0.145,
"grad_norm": 0.6081675838061575,
"learning_rate": 9.500222222222222e-06,
"loss": 2.4036,
"step": 7250
},
{
"epoch": 0.1455,
"grad_norm": 0.6032973832585987,
"learning_rate": 9.494666666666667e-06,
"loss": 2.4025,
"step": 7275
},
{
"epoch": 0.146,
"grad_norm": 0.6283775464354142,
"learning_rate": 9.489111111111113e-06,
"loss": 2.4078,
"step": 7300
},
{
"epoch": 0.146,
"eval_loss": 2.4143505096435547,
"eval_runtime": 31.4643,
"eval_samples_per_second": 3.242,
"eval_steps_per_second": 1.621,
"step": 7300
},
{
"epoch": 0.1465,
"grad_norm": 0.5969038728051346,
"learning_rate": 9.483555555555556e-06,
"loss": 2.4066,
"step": 7325
},
{
"epoch": 0.147,
"grad_norm": 0.6048317665387537,
"learning_rate": 9.478e-06,
"loss": 2.4007,
"step": 7350
},
{
"epoch": 0.1475,
"grad_norm": 0.5721050600021237,
"learning_rate": 9.472444444444445e-06,
"loss": 2.4146,
"step": 7375
},
{
"epoch": 0.148,
"grad_norm": 0.6019256818391423,
"learning_rate": 9.46688888888889e-06,
"loss": 2.399,
"step": 7400
},
{
"epoch": 0.148,
"eval_loss": 2.414281129837036,
"eval_runtime": 31.7034,
"eval_samples_per_second": 3.217,
"eval_steps_per_second": 1.609,
"step": 7400
},
{
"epoch": 0.1485,
"grad_norm": 0.6386043502919573,
"learning_rate": 9.461333333333334e-06,
"loss": 2.3957,
"step": 7425
},
{
"epoch": 0.149,
"grad_norm": 0.5819226766027404,
"learning_rate": 9.455777777777777e-06,
"loss": 2.4001,
"step": 7450
},
{
"epoch": 0.1495,
"grad_norm": 0.6372396676223023,
"learning_rate": 9.450222222222223e-06,
"loss": 2.3976,
"step": 7475
},
{
"epoch": 0.15,
"grad_norm": 0.5888017578283452,
"learning_rate": 9.444666666666668e-06,
"loss": 2.4008,
"step": 7500
},
{
"epoch": 0.15,
"eval_loss": 2.414154291152954,
"eval_runtime": 31.8152,
"eval_samples_per_second": 3.206,
"eval_steps_per_second": 1.603,
"step": 7500
},
{
"epoch": 0.1505,
"grad_norm": 0.6132781564549638,
"learning_rate": 9.439111111111111e-06,
"loss": 2.4077,
"step": 7525
},
{
"epoch": 0.151,
"grad_norm": 0.6063002641957036,
"learning_rate": 9.433555555555557e-06,
"loss": 2.3889,
"step": 7550
},
{
"epoch": 0.1515,
"grad_norm": 0.614169638364484,
"learning_rate": 9.428e-06,
"loss": 2.4121,
"step": 7575
},
{
"epoch": 0.152,
"grad_norm": 0.5826866596297434,
"learning_rate": 9.422444444444445e-06,
"loss": 2.4075,
"step": 7600
},
{
"epoch": 0.152,
"eval_loss": 2.414039134979248,
"eval_runtime": 31.7985,
"eval_samples_per_second": 3.208,
"eval_steps_per_second": 1.604,
"step": 7600
},
{
"epoch": 0.1525,
"grad_norm": 0.5964985955677213,
"learning_rate": 9.41688888888889e-06,
"loss": 2.3976,
"step": 7625
},
{
"epoch": 0.153,
"grad_norm": 0.5946671745059025,
"learning_rate": 9.411333333333334e-06,
"loss": 2.3947,
"step": 7650
},
{
"epoch": 0.1535,
"grad_norm": 0.5894909865358033,
"learning_rate": 9.405777777777778e-06,
"loss": 2.4079,
"step": 7675
},
{
"epoch": 0.154,
"grad_norm": 0.6048420481174572,
"learning_rate": 9.400222222222223e-06,
"loss": 2.4015,
"step": 7700
},
{
"epoch": 0.154,
"eval_loss": 2.413475275039673,
"eval_runtime": 31.9136,
"eval_samples_per_second": 3.196,
"eval_steps_per_second": 1.598,
"step": 7700
},
{
"epoch": 0.1545,
"grad_norm": 0.617559481688582,
"learning_rate": 9.394666666666668e-06,
"loss": 2.4036,
"step": 7725
},
{
"epoch": 0.155,
"grad_norm": 0.6350332331451685,
"learning_rate": 9.389111111111112e-06,
"loss": 2.3989,
"step": 7750
},
{
"epoch": 0.1555,
"grad_norm": 0.6034892604414784,
"learning_rate": 9.383555555555557e-06,
"loss": 2.398,
"step": 7775
},
{
"epoch": 0.156,
"grad_norm": 0.5879016941841427,
"learning_rate": 9.378e-06,
"loss": 2.3989,
"step": 7800
},
{
"epoch": 0.156,
"eval_loss": 2.4134128093719482,
"eval_runtime": 31.7809,
"eval_samples_per_second": 3.209,
"eval_steps_per_second": 1.605,
"step": 7800
},
{
"epoch": 0.1565,
"grad_norm": 0.5957060592966067,
"learning_rate": 9.372444444444446e-06,
"loss": 2.3951,
"step": 7825
},
{
"epoch": 0.157,
"grad_norm": 0.6127788552445546,
"learning_rate": 9.36688888888889e-06,
"loss": 2.3966,
"step": 7850
},
{
"epoch": 0.1575,
"grad_norm": 0.6103495429829666,
"learning_rate": 9.361333333333335e-06,
"loss": 2.3974,
"step": 7875
},
{
"epoch": 0.158,
"grad_norm": 0.5940303847498369,
"learning_rate": 9.355777777777778e-06,
"loss": 2.3982,
"step": 7900
},
{
"epoch": 0.158,
"eval_loss": 2.4130520820617676,
"eval_runtime": 31.8718,
"eval_samples_per_second": 3.2,
"eval_steps_per_second": 1.6,
"step": 7900
},
{
"epoch": 0.1585,
"grad_norm": 0.5967208318826438,
"learning_rate": 9.350222222222224e-06,
"loss": 2.3963,
"step": 7925
},
{
"epoch": 0.159,
"grad_norm": 0.6074697420049116,
"learning_rate": 9.344666666666667e-06,
"loss": 2.4004,
"step": 7950
},
{
"epoch": 0.1595,
"grad_norm": 0.6007548308453654,
"learning_rate": 9.339111111111112e-06,
"loss": 2.3972,
"step": 7975
},
{
"epoch": 0.16,
"grad_norm": 0.6058573477149505,
"learning_rate": 9.333555555555558e-06,
"loss": 2.4,
"step": 8000
},
{
"epoch": 0.16,
"eval_loss": 2.4125914573669434,
"eval_runtime": 31.8819,
"eval_samples_per_second": 3.199,
"eval_steps_per_second": 1.6,
"step": 8000
},
{
"epoch": 0.1605,
"grad_norm": 0.5861319558312379,
"learning_rate": 9.328000000000001e-06,
"loss": 2.3883,
"step": 8025
},
{
"epoch": 0.161,
"grad_norm": 0.5836976562991806,
"learning_rate": 9.322444444444445e-06,
"loss": 2.3858,
"step": 8050
},
{
"epoch": 0.1615,
"grad_norm": 0.5844356099514875,
"learning_rate": 9.31688888888889e-06,
"loss": 2.408,
"step": 8075
},
{
"epoch": 0.162,
"grad_norm": 0.5898038882596441,
"learning_rate": 9.311333333333335e-06,
"loss": 2.3979,
"step": 8100
},
{
"epoch": 0.162,
"eval_loss": 2.4123263359069824,
"eval_runtime": 31.7798,
"eval_samples_per_second": 3.21,
"eval_steps_per_second": 1.605,
"step": 8100
},
{
"epoch": 0.1625,
"grad_norm": 0.6072648398087778,
"learning_rate": 9.305777777777779e-06,
"loss": 2.3904,
"step": 8125
},
{
"epoch": 0.163,
"grad_norm": 0.5947190221089934,
"learning_rate": 9.300222222222222e-06,
"loss": 2.3908,
"step": 8150
},
{
"epoch": 0.1635,
"grad_norm": 0.5923294532719955,
"learning_rate": 9.294666666666668e-06,
"loss": 2.3994,
"step": 8175
},
{
"epoch": 0.164,
"grad_norm": 0.6238957997579533,
"learning_rate": 9.289111111111113e-06,
"loss": 2.3935,
"step": 8200
},
{
"epoch": 0.164,
"eval_loss": 2.4118340015411377,
"eval_runtime": 31.8145,
"eval_samples_per_second": 3.206,
"eval_steps_per_second": 1.603,
"step": 8200
},
{
"epoch": 0.1645,
"grad_norm": 0.576622489198895,
"learning_rate": 9.283555555555556e-06,
"loss": 2.396,
"step": 8225
},
{
"epoch": 0.165,
"grad_norm": 0.6185118704471244,
"learning_rate": 9.278e-06,
"loss": 2.4035,
"step": 8250
},
{
"epoch": 0.1655,
"grad_norm": 0.5796535805449304,
"learning_rate": 9.272444444444445e-06,
"loss": 2.3943,
"step": 8275
},
{
"epoch": 0.166,
"grad_norm": 0.6173375014397958,
"learning_rate": 9.26688888888889e-06,
"loss": 2.3935,
"step": 8300
},
{
"epoch": 0.166,
"eval_loss": 2.4114973545074463,
"eval_runtime": 31.7754,
"eval_samples_per_second": 3.21,
"eval_steps_per_second": 1.605,
"step": 8300
},
{
"epoch": 0.1665,
"grad_norm": 0.5618534321843206,
"learning_rate": 9.261333333333334e-06,
"loss": 2.3974,
"step": 8325
},
{
"epoch": 0.167,
"grad_norm": 0.6009214777241336,
"learning_rate": 9.25577777777778e-06,
"loss": 2.4,
"step": 8350
},
{
"epoch": 0.1675,
"grad_norm": 0.5772198441104387,
"learning_rate": 9.250222222222223e-06,
"loss": 2.3991,
"step": 8375
},
{
"epoch": 0.168,
"grad_norm": 0.5740163940994337,
"learning_rate": 9.244666666666668e-06,
"loss": 2.3947,
"step": 8400
},
{
"epoch": 0.168,
"eval_loss": 2.411425828933716,
"eval_runtime": 31.5099,
"eval_samples_per_second": 3.237,
"eval_steps_per_second": 1.619,
"step": 8400
},
{
"epoch": 0.1685,
"grad_norm": 0.5687873679002051,
"learning_rate": 9.239111111111112e-06,
"loss": 2.3966,
"step": 8425
},
{
"epoch": 0.169,
"grad_norm": 0.5610136891748577,
"learning_rate": 9.233555555555557e-06,
"loss": 2.3998,
"step": 8450
},
{
"epoch": 0.1695,
"grad_norm": 0.6032713755890403,
"learning_rate": 9.228e-06,
"loss": 2.3943,
"step": 8475
},
{
"epoch": 0.17,
"grad_norm": 0.5964144518891603,
"learning_rate": 9.222444444444446e-06,
"loss": 2.3883,
"step": 8500
},
{
"epoch": 0.17,
"eval_loss": 2.411017656326294,
"eval_runtime": 31.5307,
"eval_samples_per_second": 3.235,
"eval_steps_per_second": 1.617,
"step": 8500
},
{
"epoch": 0.1705,
"grad_norm": 0.6150332993234658,
"learning_rate": 9.21688888888889e-06,
"loss": 2.3947,
"step": 8525
},
{
"epoch": 0.171,
"grad_norm": 0.5996705331900282,
"learning_rate": 9.211333333333334e-06,
"loss": 2.3767,
"step": 8550
},
{
"epoch": 0.1715,
"grad_norm": 0.5824632831455251,
"learning_rate": 9.20577777777778e-06,
"loss": 2.3872,
"step": 8575
},
{
"epoch": 0.172,
"grad_norm": 0.606207861483595,
"learning_rate": 9.200222222222223e-06,
"loss": 2.4039,
"step": 8600
},
{
"epoch": 0.172,
"eval_loss": 2.4107751846313477,
"eval_runtime": 31.4387,
"eval_samples_per_second": 3.244,
"eval_steps_per_second": 1.622,
"step": 8600
},
{
"epoch": 0.1725,
"grad_norm": 0.576823131255562,
"learning_rate": 9.194666666666667e-06,
"loss": 2.3954,
"step": 8625
},
{
"epoch": 0.173,
"grad_norm": 0.56597712239854,
"learning_rate": 9.189111111111112e-06,
"loss": 2.4072,
"step": 8650
},
{
"epoch": 0.1735,
"grad_norm": 0.5825959007699376,
"learning_rate": 9.183555555555557e-06,
"loss": 2.4081,
"step": 8675
},
{
"epoch": 0.174,
"grad_norm": 0.5776918671405765,
"learning_rate": 9.178000000000001e-06,
"loss": 2.4091,
"step": 8700
},
{
"epoch": 0.174,
"eval_loss": 2.410761594772339,
"eval_runtime": 31.7246,
"eval_samples_per_second": 3.215,
"eval_steps_per_second": 1.608,
"step": 8700
},
{
"epoch": 0.1745,
"grad_norm": 0.6256369047041809,
"learning_rate": 9.172444444444444e-06,
"loss": 2.3953,
"step": 8725
},
{
"epoch": 0.175,
"grad_norm": 0.5964709475887552,
"learning_rate": 9.16688888888889e-06,
"loss": 2.39,
"step": 8750
},
{
"epoch": 0.1755,
"grad_norm": 0.5775755843795828,
"learning_rate": 9.161333333333335e-06,
"loss": 2.391,
"step": 8775
},
{
"epoch": 0.176,
"grad_norm": 0.6655706627980364,
"learning_rate": 9.155777777777779e-06,
"loss": 2.4048,
"step": 8800
},
{
"epoch": 0.176,
"eval_loss": 2.4105958938598633,
"eval_runtime": 31.4248,
"eval_samples_per_second": 3.246,
"eval_steps_per_second": 1.623,
"step": 8800
},
{
"epoch": 0.1765,
"grad_norm": 0.5865172878151053,
"learning_rate": 9.150222222222222e-06,
"loss": 2.3878,
"step": 8825
},
{
"epoch": 0.177,
"grad_norm": 0.584391124965856,
"learning_rate": 9.144666666666667e-06,
"loss": 2.401,
"step": 8850
},
{
"epoch": 0.1775,
"grad_norm": 0.5726598382185046,
"learning_rate": 9.139111111111113e-06,
"loss": 2.4018,
"step": 8875
},
{
"epoch": 0.178,
"grad_norm": 0.5690725395770588,
"learning_rate": 9.133555555555556e-06,
"loss": 2.4034,
"step": 8900
},
{
"epoch": 0.178,
"eval_loss": 2.4101033210754395,
"eval_runtime": 31.4686,
"eval_samples_per_second": 3.241,
"eval_steps_per_second": 1.621,
"step": 8900
},
{
"epoch": 0.1785,
"grad_norm": 0.5978143013011991,
"learning_rate": 9.128e-06,
"loss": 2.4014,
"step": 8925
},
{
"epoch": 0.179,
"grad_norm": 0.6085180927490662,
"learning_rate": 9.122444444444445e-06,
"loss": 2.3924,
"step": 8950
},
{
"epoch": 0.1795,
"grad_norm": 0.5720265034599029,
"learning_rate": 9.11688888888889e-06,
"loss": 2.3977,
"step": 8975
},
{
"epoch": 0.18,
"grad_norm": 0.5739306861609581,
"learning_rate": 9.111333333333334e-06,
"loss": 2.3992,
"step": 9000
},
{
"epoch": 0.18,
"eval_loss": 2.410008430480957,
"eval_runtime": 32.192,
"eval_samples_per_second": 3.168,
"eval_steps_per_second": 1.584,
"step": 9000
},
{
"epoch": 0.1805,
"grad_norm": 0.5908878679870805,
"learning_rate": 9.105777777777779e-06,
"loss": 2.3938,
"step": 9025
},
{
"epoch": 0.181,
"grad_norm": 0.5496267273049,
"learning_rate": 9.100222222222223e-06,
"loss": 2.3961,
"step": 9050
},
{
"epoch": 0.1815,
"grad_norm": 0.5979695738071065,
"learning_rate": 9.094666666666668e-06,
"loss": 2.3858,
"step": 9075
},
{
"epoch": 0.182,
"grad_norm": 0.5938166893318079,
"learning_rate": 9.089111111111111e-06,
"loss": 2.3862,
"step": 9100
},
{
"epoch": 0.182,
"eval_loss": 2.410053253173828,
"eval_runtime": 32.2577,
"eval_samples_per_second": 3.162,
"eval_steps_per_second": 1.581,
"step": 9100
},
{
"epoch": 0.1825,
"grad_norm": 0.5958942390294701,
"learning_rate": 9.083555555555557e-06,
"loss": 2.3928,
"step": 9125
},
{
"epoch": 0.183,
"grad_norm": 0.5859164810125311,
"learning_rate": 9.078000000000002e-06,
"loss": 2.4022,
"step": 9150
},
{
"epoch": 0.1835,
"grad_norm": 0.5798241289951321,
"learning_rate": 9.072444444444445e-06,
"loss": 2.3928,
"step": 9175
},
{
"epoch": 0.184,
"grad_norm": 0.5882407091400851,
"learning_rate": 9.066888888888889e-06,
"loss": 2.3973,
"step": 9200
},
{
"epoch": 0.184,
"eval_loss": 2.409634590148926,
"eval_runtime": 32.249,
"eval_samples_per_second": 3.163,
"eval_steps_per_second": 1.581,
"step": 9200
},
{
"epoch": 0.1845,
"grad_norm": 0.5903772748051019,
"learning_rate": 9.061333333333334e-06,
"loss": 2.3831,
"step": 9225
},
{
"epoch": 0.185,
"grad_norm": 0.6211646089814673,
"learning_rate": 9.05577777777778e-06,
"loss": 2.3983,
"step": 9250
},
{
"epoch": 0.1855,
"grad_norm": 0.6172378815389531,
"learning_rate": 9.050222222222223e-06,
"loss": 2.3961,
"step": 9275
},
{
"epoch": 0.186,
"grad_norm": 0.6117693503941964,
"learning_rate": 9.044666666666667e-06,
"loss": 2.3991,
"step": 9300
},
{
"epoch": 0.186,
"eval_loss": 2.4100780487060547,
"eval_runtime": 31.6698,
"eval_samples_per_second": 3.221,
"eval_steps_per_second": 1.61,
"step": 9300
},
{
"epoch": 0.1865,
"grad_norm": 0.5955035334939845,
"learning_rate": 9.039111111111112e-06,
"loss": 2.4013,
"step": 9325
},
{
"epoch": 0.187,
"grad_norm": 0.6304889803867978,
"learning_rate": 9.033555555555557e-06,
"loss": 2.4045,
"step": 9350
},
{
"epoch": 0.1875,
"grad_norm": 0.5650857479280212,
"learning_rate": 9.028e-06,
"loss": 2.3993,
"step": 9375
},
{
"epoch": 0.188,
"grad_norm": 0.6102368092141387,
"learning_rate": 9.022444444444444e-06,
"loss": 2.3969,
"step": 9400
},
{
"epoch": 0.188,
"eval_loss": 2.4091312885284424,
"eval_runtime": 31.7427,
"eval_samples_per_second": 3.213,
"eval_steps_per_second": 1.607,
"step": 9400
},
{
"epoch": 0.1885,
"grad_norm": 0.5647006274355659,
"learning_rate": 9.01688888888889e-06,
"loss": 2.3962,
"step": 9425
},
{
"epoch": 0.189,
"grad_norm": 0.639478683787589,
"learning_rate": 9.011333333333335e-06,
"loss": 2.3957,
"step": 9450
},
{
"epoch": 0.1895,
"grad_norm": 0.5788568545073746,
"learning_rate": 9.005777777777778e-06,
"loss": 2.3914,
"step": 9475
},
{
"epoch": 0.19,
"grad_norm": 0.7290164754099147,
"learning_rate": 9.000222222222222e-06,
"loss": 2.386,
"step": 9500
},
{
"epoch": 0.19,
"eval_loss": 2.4086694717407227,
"eval_runtime": 31.8061,
"eval_samples_per_second": 3.207,
"eval_steps_per_second": 1.603,
"step": 9500
},
{
"epoch": 0.1905,
"grad_norm": 0.5817637514180484,
"learning_rate": 8.994666666666667e-06,
"loss": 2.4006,
"step": 9525
},
{
"epoch": 0.191,
"grad_norm": 0.5697879107784812,
"learning_rate": 8.989111111111112e-06,
"loss": 2.3899,
"step": 9550
},
{
"epoch": 0.1915,
"grad_norm": 0.584610269954786,
"learning_rate": 8.983555555555556e-06,
"loss": 2.3944,
"step": 9575
},
{
"epoch": 0.192,
"grad_norm": 0.608795413325502,
"learning_rate": 8.978000000000001e-06,
"loss": 2.398,
"step": 9600
},
{
"epoch": 0.192,
"eval_loss": 2.408263683319092,
"eval_runtime": 31.6859,
"eval_samples_per_second": 3.219,
"eval_steps_per_second": 1.61,
"step": 9600
},
{
"epoch": 0.1925,
"grad_norm": 0.5915130204472873,
"learning_rate": 8.972444444444445e-06,
"loss": 2.407,
"step": 9625
},
{
"epoch": 0.193,
"grad_norm": 0.59521034646126,
"learning_rate": 8.96688888888889e-06,
"loss": 2.3924,
"step": 9650
},
{
"epoch": 0.1935,
"grad_norm": 0.6050238690396914,
"learning_rate": 8.961333333333333e-06,
"loss": 2.3869,
"step": 9675
},
{
"epoch": 0.194,
"grad_norm": 0.5691067223521449,
"learning_rate": 8.955777777777779e-06,
"loss": 2.3874,
"step": 9700
},
{
"epoch": 0.194,
"eval_loss": 2.408264398574829,
"eval_runtime": 31.8579,
"eval_samples_per_second": 3.202,
"eval_steps_per_second": 1.601,
"step": 9700
},
{
"epoch": 0.1945,
"grad_norm": 0.5753054034666798,
"learning_rate": 8.950222222222224e-06,
"loss": 2.4027,
"step": 9725
},
{
"epoch": 0.195,
"grad_norm": 0.5864767839913545,
"learning_rate": 8.944666666666668e-06,
"loss": 2.3924,
"step": 9750
},
{
"epoch": 0.1955,
"grad_norm": 0.6642807256080032,
"learning_rate": 8.939111111111111e-06,
"loss": 2.3709,
"step": 9775
},
{
"epoch": 0.196,
"grad_norm": 0.6084139101409156,
"learning_rate": 8.933555555555556e-06,
"loss": 2.3958,
"step": 9800
},
{
"epoch": 0.196,
"eval_loss": 2.4076178073883057,
"eval_runtime": 31.7733,
"eval_samples_per_second": 3.21,
"eval_steps_per_second": 1.605,
"step": 9800
},
{
"epoch": 0.1965,
"grad_norm": 0.592206064244208,
"learning_rate": 8.928000000000002e-06,
"loss": 2.3922,
"step": 9825
},
{
"epoch": 0.197,
"grad_norm": 0.5685236067589632,
"learning_rate": 8.922444444444445e-06,
"loss": 2.3908,
"step": 9850
},
{
"epoch": 0.1975,
"grad_norm": 0.6034821273699428,
"learning_rate": 8.916888888888889e-06,
"loss": 2.3903,
"step": 9875
},
{
"epoch": 0.198,
"grad_norm": 0.5910198540350765,
"learning_rate": 8.911333333333334e-06,
"loss": 2.3767,
"step": 9900
},
{
"epoch": 0.198,
"eval_loss": 2.407928705215454,
"eval_runtime": 31.7033,
"eval_samples_per_second": 3.217,
"eval_steps_per_second": 1.609,
"step": 9900
},
{
"epoch": 0.1985,
"grad_norm": 0.5849079897115376,
"learning_rate": 8.90577777777778e-06,
"loss": 2.3956,
"step": 9925
},
{
"epoch": 0.199,
"grad_norm": 0.5683901924605945,
"learning_rate": 8.900222222222223e-06,
"loss": 2.3884,
"step": 9950
},
{
"epoch": 0.1995,
"grad_norm": 0.6037241225699064,
"learning_rate": 8.894666666666666e-06,
"loss": 2.3934,
"step": 9975
},
{
"epoch": 0.2,
"grad_norm": 0.5807810374364664,
"learning_rate": 8.889111111111112e-06,
"loss": 2.3999,
"step": 10000
},
{
"epoch": 0.2,
"eval_loss": 2.40779447555542,
"eval_runtime": 31.7288,
"eval_samples_per_second": 3.215,
"eval_steps_per_second": 1.607,
"step": 10000
},
{
"epoch": 0.2005,
"grad_norm": 0.5959223333719136,
"learning_rate": 8.883555555555557e-06,
"loss": 2.387,
"step": 10025
},
{
"epoch": 0.201,
"grad_norm": 0.604008744038432,
"learning_rate": 8.878e-06,
"loss": 2.4016,
"step": 10050
},
{
"epoch": 0.2015,
"grad_norm": 0.5721419521050413,
"learning_rate": 8.872444444444444e-06,
"loss": 2.3884,
"step": 10075
},
{
"epoch": 0.202,
"grad_norm": 0.5986167284289824,
"learning_rate": 8.86688888888889e-06,
"loss": 2.3945,
"step": 10100
},
{
"epoch": 0.202,
"eval_loss": 2.4074654579162598,
"eval_runtime": 31.8658,
"eval_samples_per_second": 3.201,
"eval_steps_per_second": 1.6,
"step": 10100
},
{
"epoch": 0.2025,
"grad_norm": 0.6046479507995179,
"learning_rate": 8.861333333333334e-06,
"loss": 2.3858,
"step": 10125
},
{
"epoch": 0.203,
"grad_norm": 0.5633013817443194,
"learning_rate": 8.855777777777778e-06,
"loss": 2.3879,
"step": 10150
},
{
"epoch": 0.2035,
"grad_norm": 0.5953174401982892,
"learning_rate": 8.850222222222223e-06,
"loss": 2.3967,
"step": 10175
},
{
"epoch": 0.204,
"grad_norm": 0.6306212647705982,
"learning_rate": 8.844666666666667e-06,
"loss": 2.3927,
"step": 10200
},
{
"epoch": 0.204,
"eval_loss": 2.407031297683716,
"eval_runtime": 31.7801,
"eval_samples_per_second": 3.21,
"eval_steps_per_second": 1.605,
"step": 10200
},
{
"epoch": 0.2045,
"grad_norm": 0.5605617492602121,
"learning_rate": 8.839111111111112e-06,
"loss": 2.4081,
"step": 10225
},
{
"epoch": 0.205,
"grad_norm": 0.5739246143474902,
"learning_rate": 8.833555555555556e-06,
"loss": 2.3841,
"step": 10250
},
{
"epoch": 0.2055,
"grad_norm": 0.5938549959471341,
"learning_rate": 8.828000000000001e-06,
"loss": 2.3902,
"step": 10275
},
{
"epoch": 0.206,
"grad_norm": 0.5902936931354175,
"learning_rate": 8.822444444444446e-06,
"loss": 2.3905,
"step": 10300
},
{
"epoch": 0.206,
"eval_loss": 2.4066004753112793,
"eval_runtime": 31.7707,
"eval_samples_per_second": 3.211,
"eval_steps_per_second": 1.605,
"step": 10300
},
{
"epoch": 0.2065,
"grad_norm": 0.5697435057211838,
"learning_rate": 8.81688888888889e-06,
"loss": 2.3854,
"step": 10325
},
{
"epoch": 0.207,
"grad_norm": 0.5879126074250441,
"learning_rate": 8.811333333333333e-06,
"loss": 2.3917,
"step": 10350
},
{
"epoch": 0.2075,
"grad_norm": 0.5800642153182343,
"learning_rate": 8.805777777777778e-06,
"loss": 2.3929,
"step": 10375
},
{
"epoch": 0.208,
"grad_norm": 0.5794546973922929,
"learning_rate": 8.800222222222224e-06,
"loss": 2.3912,
"step": 10400
},
{
"epoch": 0.208,
"eval_loss": 2.4065024852752686,
"eval_runtime": 31.7191,
"eval_samples_per_second": 3.216,
"eval_steps_per_second": 1.608,
"step": 10400
},
{
"epoch": 0.2085,
"grad_norm": 0.5776454190712899,
"learning_rate": 8.794666666666667e-06,
"loss": 2.386,
"step": 10425
},
{
"epoch": 0.209,
"grad_norm": 0.5578455228918948,
"learning_rate": 8.78911111111111e-06,
"loss": 2.3869,
"step": 10450
},
{
"epoch": 0.2095,
"grad_norm": 0.5721674793656858,
"learning_rate": 8.783555555555556e-06,
"loss": 2.3779,
"step": 10475
},
{
"epoch": 0.21,
"grad_norm": 0.5950633442730316,
"learning_rate": 8.778000000000001e-06,
"loss": 2.3845,
"step": 10500
},
{
"epoch": 0.21,
"eval_loss": 2.4065566062927246,
"eval_runtime": 31.8091,
"eval_samples_per_second": 3.207,
"eval_steps_per_second": 1.603,
"step": 10500
},
{
"epoch": 0.2105,
"grad_norm": 0.605078293663896,
"learning_rate": 8.772444444444445e-06,
"loss": 2.3913,
"step": 10525
},
{
"epoch": 0.211,
"grad_norm": 0.567849892850204,
"learning_rate": 8.766888888888888e-06,
"loss": 2.3966,
"step": 10550
},
{
"epoch": 0.2115,
"grad_norm": 0.6876645024191659,
"learning_rate": 8.761333333333334e-06,
"loss": 2.3993,
"step": 10575
},
{
"epoch": 0.212,
"grad_norm": 0.5841938304908528,
"learning_rate": 8.755777777777779e-06,
"loss": 2.3916,
"step": 10600
},
{
"epoch": 0.212,
"eval_loss": 2.4061877727508545,
"eval_runtime": 31.8484,
"eval_samples_per_second": 3.203,
"eval_steps_per_second": 1.601,
"step": 10600
},
{
"epoch": 0.2125,
"grad_norm": 0.5649004204666818,
"learning_rate": 8.750222222222223e-06,
"loss": 2.381,
"step": 10625
},
{
"epoch": 0.213,
"grad_norm": 0.5678489376050115,
"learning_rate": 8.744666666666666e-06,
"loss": 2.3995,
"step": 10650
},
{
"epoch": 0.2135,
"grad_norm": 0.5712733595317334,
"learning_rate": 8.739111111111111e-06,
"loss": 2.3954,
"step": 10675
},
{
"epoch": 0.214,
"grad_norm": 0.573353636066434,
"learning_rate": 8.733555555555557e-06,
"loss": 2.379,
"step": 10700
},
{
"epoch": 0.214,
"eval_loss": 2.4055771827697754,
"eval_runtime": 31.8192,
"eval_samples_per_second": 3.206,
"eval_steps_per_second": 1.603,
"step": 10700
},
{
"epoch": 0.2145,
"grad_norm": 0.6133309651928519,
"learning_rate": 8.728e-06,
"loss": 2.3946,
"step": 10725
},
{
"epoch": 0.215,
"grad_norm": 0.6033931866035528,
"learning_rate": 8.722444444444445e-06,
"loss": 2.3935,
"step": 10750
},
{
"epoch": 0.2155,
"grad_norm": 0.6008672136487845,
"learning_rate": 8.716888888888889e-06,
"loss": 2.3872,
"step": 10775
},
{
"epoch": 0.216,
"grad_norm": 0.5728704483928734,
"learning_rate": 8.711333333333334e-06,
"loss": 2.3917,
"step": 10800
},
{
"epoch": 0.216,
"eval_loss": 2.4059016704559326,
"eval_runtime": 31.7995,
"eval_samples_per_second": 3.208,
"eval_steps_per_second": 1.604,
"step": 10800
},
{
"epoch": 0.2165,
"grad_norm": 0.5888944153423502,
"learning_rate": 8.705777777777778e-06,
"loss": 2.3946,
"step": 10825
},
{
"epoch": 0.217,
"grad_norm": 0.5947880979306366,
"learning_rate": 8.700222222222223e-06,
"loss": 2.3736,
"step": 10850
},
{
"epoch": 0.2175,
"grad_norm": 0.6163696606959644,
"learning_rate": 8.694666666666668e-06,
"loss": 2.3838,
"step": 10875
},
{
"epoch": 0.218,
"grad_norm": 0.6004092938812543,
"learning_rate": 8.689111111111112e-06,
"loss": 2.3942,
"step": 10900
},
{
"epoch": 0.218,
"eval_loss": 2.4055566787719727,
"eval_runtime": 31.7386,
"eval_samples_per_second": 3.214,
"eval_steps_per_second": 1.607,
"step": 10900
},
{
"epoch": 0.2185,
"grad_norm": 0.5786273641598245,
"learning_rate": 8.683555555555555e-06,
"loss": 2.3938,
"step": 10925
},
{
"epoch": 0.219,
"grad_norm": 0.5764162885826465,
"learning_rate": 8.678e-06,
"loss": 2.3939,
"step": 10950
},
{
"epoch": 0.2195,
"grad_norm": 0.5923291223123188,
"learning_rate": 8.672444444444446e-06,
"loss": 2.3847,
"step": 10975
},
{
"epoch": 0.22,
"grad_norm": 0.6102815146568634,
"learning_rate": 8.66688888888889e-06,
"loss": 2.3901,
"step": 11000
},
{
"epoch": 0.22,
"eval_loss": 2.405616044998169,
"eval_runtime": 31.7048,
"eval_samples_per_second": 3.217,
"eval_steps_per_second": 1.609,
"step": 11000
},
{
"epoch": 0.2205,
"grad_norm": 0.619520274382602,
"learning_rate": 8.661333333333335e-06,
"loss": 2.3868,
"step": 11025
},
{
"epoch": 0.221,
"grad_norm": 0.5973378822756289,
"learning_rate": 8.655777777777778e-06,
"loss": 2.398,
"step": 11050
},
{
"epoch": 0.2215,
"grad_norm": 0.6143187669490118,
"learning_rate": 8.650222222222223e-06,
"loss": 2.387,
"step": 11075
},
{
"epoch": 0.222,
"grad_norm": 0.5804040103557917,
"learning_rate": 8.644666666666669e-06,
"loss": 2.3951,
"step": 11100
},
{
"epoch": 0.222,
"eval_loss": 2.4050545692443848,
"eval_runtime": 31.7713,
"eval_samples_per_second": 3.21,
"eval_steps_per_second": 1.605,
"step": 11100
},
{
"epoch": 0.2225,
"grad_norm": 0.5833158956225722,
"learning_rate": 8.639111111111112e-06,
"loss": 2.3854,
"step": 11125
},
{
"epoch": 0.223,
"grad_norm": 0.5741811771851818,
"learning_rate": 8.633555555555556e-06,
"loss": 2.3866,
"step": 11150
},
{
"epoch": 0.2235,
"grad_norm": 0.5856955103294486,
"learning_rate": 8.628000000000001e-06,
"loss": 2.4058,
"step": 11175
},
{
"epoch": 0.224,
"grad_norm": 0.5685596699989746,
"learning_rate": 8.622444444444446e-06,
"loss": 2.3953,
"step": 11200
},
{
"epoch": 0.224,
"eval_loss": 2.4051928520202637,
"eval_runtime": 35.481,
"eval_samples_per_second": 2.875,
"eval_steps_per_second": 1.437,
"step": 11200
},
{
"epoch": 0.2245,
"grad_norm": 0.5854297741723825,
"learning_rate": 8.61688888888889e-06,
"loss": 2.3977,
"step": 11225
},
{
"epoch": 0.225,
"grad_norm": 0.582929503102295,
"learning_rate": 8.611333333333333e-06,
"loss": 2.3948,
"step": 11250
},
{
"epoch": 0.2255,
"grad_norm": 0.5839207937169353,
"learning_rate": 8.605777777777779e-06,
"loss": 2.4104,
"step": 11275
},
{
"epoch": 0.226,
"grad_norm": 0.5568849917729087,
"learning_rate": 8.600222222222224e-06,
"loss": 2.4011,
"step": 11300
},
{
"epoch": 0.226,
"eval_loss": 2.404717445373535,
"eval_runtime": 31.9835,
"eval_samples_per_second": 3.189,
"eval_steps_per_second": 1.595,
"step": 11300
},
{
"epoch": 0.2265,
"grad_norm": 0.5549969270675909,
"learning_rate": 8.594666666666668e-06,
"loss": 2.3965,
"step": 11325
},
{
"epoch": 0.227,
"grad_norm": 0.5606539732290856,
"learning_rate": 8.589111111111111e-06,
"loss": 2.3921,
"step": 11350
},
{
"epoch": 0.2275,
"grad_norm": 0.5626929771754517,
"learning_rate": 8.583555555555556e-06,
"loss": 2.3912,
"step": 11375
},
{
"epoch": 0.228,
"grad_norm": 0.5731631708828652,
"learning_rate": 8.578000000000002e-06,
"loss": 2.3926,
"step": 11400
},
{
"epoch": 0.228,
"eval_loss": 2.4047322273254395,
"eval_runtime": 31.8245,
"eval_samples_per_second": 3.205,
"eval_steps_per_second": 1.603,
"step": 11400
},
{
"epoch": 0.2285,
"grad_norm": 0.5661654100374769,
"learning_rate": 8.572444444444445e-06,
"loss": 2.3951,
"step": 11425
},
{
"epoch": 0.229,
"grad_norm": 0.5602181256620924,
"learning_rate": 8.56688888888889e-06,
"loss": 2.3812,
"step": 11450
},
{
"epoch": 0.2295,
"grad_norm": 0.5950733473289397,
"learning_rate": 8.561333333333334e-06,
"loss": 2.3963,
"step": 11475
},
{
"epoch": 0.23,
"grad_norm": 0.5733938863696743,
"learning_rate": 8.55577777777778e-06,
"loss": 2.3932,
"step": 11500
},
{
"epoch": 0.23,
"eval_loss": 2.403830051422119,
"eval_runtime": 31.7862,
"eval_samples_per_second": 3.209,
"eval_steps_per_second": 1.604,
"step": 11500
},
{
"epoch": 0.2305,
"grad_norm": 0.5702512759518216,
"learning_rate": 8.550222222222223e-06,
"loss": 2.3824,
"step": 11525
},
{
"epoch": 0.231,
"grad_norm": 0.5749933738625221,
"learning_rate": 8.544666666666668e-06,
"loss": 2.3674,
"step": 11550
},
{
"epoch": 0.2315,
"grad_norm": 0.563814842108926,
"learning_rate": 8.539111111111112e-06,
"loss": 2.3866,
"step": 11575
},
{
"epoch": 0.232,
"grad_norm": 0.601764608458657,
"learning_rate": 8.533555555555557e-06,
"loss": 2.3949,
"step": 11600
},
{
"epoch": 0.232,
"eval_loss": 2.4035561084747314,
"eval_runtime": 31.7077,
"eval_samples_per_second": 3.217,
"eval_steps_per_second": 1.608,
"step": 11600
},
{
"epoch": 0.2325,
"grad_norm": 0.5674229084100237,
"learning_rate": 8.528e-06,
"loss": 2.3782,
"step": 11625
},
{
"epoch": 0.233,
"grad_norm": 0.5660025767055805,
"learning_rate": 8.522444444444446e-06,
"loss": 2.3811,
"step": 11650
},
{
"epoch": 0.2335,
"grad_norm": 0.5776196117388842,
"learning_rate": 8.51688888888889e-06,
"loss": 2.3964,
"step": 11675
},
{
"epoch": 0.234,
"grad_norm": 0.5815076886720436,
"learning_rate": 8.511333333333334e-06,
"loss": 2.3907,
"step": 11700
},
{
"epoch": 0.234,
"eval_loss": 2.4035725593566895,
"eval_runtime": 31.7541,
"eval_samples_per_second": 3.212,
"eval_steps_per_second": 1.606,
"step": 11700
},
{
"epoch": 0.2345,
"grad_norm": 0.5810635532925048,
"learning_rate": 8.505777777777778e-06,
"loss": 2.3921,
"step": 11725
},
{
"epoch": 0.235,
"grad_norm": 0.5635380257098753,
"learning_rate": 8.500222222222223e-06,
"loss": 2.4062,
"step": 11750
},
{
"epoch": 0.2355,
"grad_norm": 0.5985004911332629,
"learning_rate": 8.494666666666668e-06,
"loss": 2.3853,
"step": 11775
},
{
"epoch": 0.236,
"grad_norm": 0.580078413647693,
"learning_rate": 8.489111111111112e-06,
"loss": 2.3826,
"step": 11800
},
{
"epoch": 0.236,
"eval_loss": 2.403505325317383,
"eval_runtime": 31.7265,
"eval_samples_per_second": 3.215,
"eval_steps_per_second": 1.607,
"step": 11800
},
{
"epoch": 0.2365,
"grad_norm": 0.5560334145179444,
"learning_rate": 8.483555555555556e-06,
"loss": 2.3829,
"step": 11825
},
{
"epoch": 0.237,
"grad_norm": 0.5870934042209253,
"learning_rate": 8.478e-06,
"loss": 2.374,
"step": 11850
},
{
"epoch": 0.2375,
"grad_norm": 0.5745342448568999,
"learning_rate": 8.472444444444446e-06,
"loss": 2.3797,
"step": 11875
},
{
"epoch": 0.238,
"grad_norm": 0.5676573173578097,
"learning_rate": 8.46688888888889e-06,
"loss": 2.3867,
"step": 11900
},
{
"epoch": 0.238,
"eval_loss": 2.403400421142578,
"eval_runtime": 31.8105,
"eval_samples_per_second": 3.206,
"eval_steps_per_second": 1.603,
"step": 11900
},
{
"epoch": 0.2385,
"grad_norm": 0.5701256243606029,
"learning_rate": 8.461333333333333e-06,
"loss": 2.3832,
"step": 11925
},
{
"epoch": 0.239,
"grad_norm": 0.5839965205220576,
"learning_rate": 8.455777777777778e-06,
"loss": 2.3939,
"step": 11950
},
{
"epoch": 0.2395,
"grad_norm": 0.581600775004578,
"learning_rate": 8.450222222222224e-06,
"loss": 2.382,
"step": 11975
},
{
"epoch": 0.24,
"grad_norm": 0.5945113931788275,
"learning_rate": 8.444666666666667e-06,
"loss": 2.3947,
"step": 12000
},
{
"epoch": 0.24,
"eval_loss": 2.4031572341918945,
"eval_runtime": 31.7154,
"eval_samples_per_second": 3.216,
"eval_steps_per_second": 1.608,
"step": 12000
},
{
"epoch": 0.2405,
"grad_norm": 0.5687487747515707,
"learning_rate": 8.43911111111111e-06,
"loss": 2.3859,
"step": 12025
},
{
"epoch": 0.241,
"grad_norm": 0.6156971193882954,
"learning_rate": 8.433555555555556e-06,
"loss": 2.3936,
"step": 12050
},
{
"epoch": 0.2415,
"grad_norm": 0.5735725917481376,
"learning_rate": 8.428000000000001e-06,
"loss": 2.3867,
"step": 12075
},
{
"epoch": 0.242,
"grad_norm": 0.5900311312717111,
"learning_rate": 8.422444444444445e-06,
"loss": 2.381,
"step": 12100
},
{
"epoch": 0.242,
"eval_loss": 2.402616262435913,
"eval_runtime": 31.728,
"eval_samples_per_second": 3.215,
"eval_steps_per_second": 1.607,
"step": 12100
},
{
"epoch": 0.2425,
"grad_norm": 0.6210456413331185,
"learning_rate": 8.41688888888889e-06,
"loss": 2.3897,
"step": 12125
},
{
"epoch": 0.243,
"grad_norm": 0.564076844370536,
"learning_rate": 8.411333333333334e-06,
"loss": 2.3789,
"step": 12150
},
{
"epoch": 0.2435,
"grad_norm": 0.5787670607206897,
"learning_rate": 8.405777777777779e-06,
"loss": 2.3927,
"step": 12175
},
{
"epoch": 0.244,
"grad_norm": 0.557686861390105,
"learning_rate": 8.400222222222222e-06,
"loss": 2.3761,
"step": 12200
},
{
"epoch": 0.244,
"eval_loss": 2.4025542736053467,
"eval_runtime": 31.8116,
"eval_samples_per_second": 3.206,
"eval_steps_per_second": 1.603,
"step": 12200
},
{
"epoch": 0.2445,
"grad_norm": 0.5642621664909974,
"learning_rate": 8.394666666666668e-06,
"loss": 2.3787,
"step": 12225
},
{
"epoch": 0.245,
"grad_norm": 0.5812642245692796,
"learning_rate": 8.389111111111113e-06,
"loss": 2.3888,
"step": 12250
},
{
"epoch": 0.2455,
"grad_norm": 0.5903665572148793,
"learning_rate": 8.383555555555557e-06,
"loss": 2.3874,
"step": 12275
},
{
"epoch": 0.246,
"grad_norm": 0.5752826274496151,
"learning_rate": 8.378e-06,
"loss": 2.3851,
"step": 12300
},
{
"epoch": 0.246,
"eval_loss": 2.4024178981781006,
"eval_runtime": 31.9538,
"eval_samples_per_second": 3.192,
"eval_steps_per_second": 1.596,
"step": 12300
},
{
"epoch": 0.2465,
"grad_norm": 0.5625780105871633,
"learning_rate": 8.372444444444445e-06,
"loss": 2.3857,
"step": 12325
},
{
"epoch": 0.247,
"grad_norm": 0.5516059110433715,
"learning_rate": 8.36688888888889e-06,
"loss": 2.387,
"step": 12350
},
{
"epoch": 0.2475,
"grad_norm": 0.5743651124710031,
"learning_rate": 8.361333333333334e-06,
"loss": 2.3899,
"step": 12375
},
{
"epoch": 0.248,
"grad_norm": 0.6065509345211424,
"learning_rate": 8.355777777777778e-06,
"loss": 2.3811,
"step": 12400
},
{
"epoch": 0.248,
"eval_loss": 2.402189254760742,
"eval_runtime": 31.7357,
"eval_samples_per_second": 3.214,
"eval_steps_per_second": 1.607,
"step": 12400
},
{
"epoch": 0.2485,
"grad_norm": 0.569411806780091,
"learning_rate": 8.350222222222223e-06,
"loss": 2.3891,
"step": 12425
},
{
"epoch": 0.249,
"grad_norm": 0.5781227404353481,
"learning_rate": 8.344666666666668e-06,
"loss": 2.3799,
"step": 12450
},
{
"epoch": 0.2495,
"grad_norm": 0.5882770416548074,
"learning_rate": 8.339111111111112e-06,
"loss": 2.3921,
"step": 12475
},
{
"epoch": 0.25,
"grad_norm": 0.6053137792053689,
"learning_rate": 8.333555555555555e-06,
"loss": 2.3923,
"step": 12500
},
{
"epoch": 0.25,
"eval_loss": 2.401906967163086,
"eval_runtime": 31.7052,
"eval_samples_per_second": 3.217,
"eval_steps_per_second": 1.609,
"step": 12500
},
{
"epoch": 0.2505,
"grad_norm": 0.5493940361276148,
"learning_rate": 8.328e-06,
"loss": 2.3872,
"step": 12525
},
{
"epoch": 0.251,
"grad_norm": 0.5844453837465953,
"learning_rate": 8.322444444444446e-06,
"loss": 2.3859,
"step": 12550
},
{
"epoch": 0.2515,
"grad_norm": 0.589694030674745,
"learning_rate": 8.31688888888889e-06,
"loss": 2.3852,
"step": 12575
},
{
"epoch": 0.252,
"grad_norm": 0.5985872367130171,
"learning_rate": 8.311333333333333e-06,
"loss": 2.378,
"step": 12600
},
{
"epoch": 0.252,
"eval_loss": 2.4017632007598877,
"eval_runtime": 31.8059,
"eval_samples_per_second": 3.207,
"eval_steps_per_second": 1.603,
"step": 12600
},
{
"epoch": 0.2525,
"grad_norm": 0.6246560097732429,
"learning_rate": 8.305777777777778e-06,
"loss": 2.3891,
"step": 12625
},
{
"epoch": 0.253,
"grad_norm": 0.5977851115835912,
"learning_rate": 8.300222222222223e-06,
"loss": 2.3884,
"step": 12650
},
{
"epoch": 0.2535,
"grad_norm": 0.5535634109353079,
"learning_rate": 8.294666666666667e-06,
"loss": 2.3894,
"step": 12675
},
{
"epoch": 0.254,
"grad_norm": 0.5647542662126371,
"learning_rate": 8.289111111111112e-06,
"loss": 2.3889,
"step": 12700
},
{
"epoch": 0.254,
"eval_loss": 2.4015073776245117,
"eval_runtime": 31.6682,
"eval_samples_per_second": 3.221,
"eval_steps_per_second": 1.61,
"step": 12700
},
{
"epoch": 0.2545,
"grad_norm": 0.5689860381748764,
"learning_rate": 8.283555555555556e-06,
"loss": 2.391,
"step": 12725
},
{
"epoch": 0.255,
"grad_norm": 0.5788815220722723,
"learning_rate": 8.278000000000001e-06,
"loss": 2.3746,
"step": 12750
},
{
"epoch": 0.2555,
"grad_norm": 0.5746385277305921,
"learning_rate": 8.272444444444445e-06,
"loss": 2.3884,
"step": 12775
},
{
"epoch": 0.256,
"grad_norm": 0.5952261074381101,
"learning_rate": 8.26688888888889e-06,
"loss": 2.387,
"step": 12800
},
{
"epoch": 0.256,
"eval_loss": 2.401090383529663,
"eval_runtime": 31.7518,
"eval_samples_per_second": 3.212,
"eval_steps_per_second": 1.606,
"step": 12800
},
{
"epoch": 0.2565,
"grad_norm": 0.581914246490724,
"learning_rate": 8.261333333333335e-06,
"loss": 2.3879,
"step": 12825
},
{
"epoch": 0.257,
"grad_norm": 0.5582195018164189,
"learning_rate": 8.255777777777779e-06,
"loss": 2.3783,
"step": 12850
},
{
"epoch": 0.2575,
"grad_norm": 0.5633036552978725,
"learning_rate": 8.250222222222222e-06,
"loss": 2.3845,
"step": 12875
},
{
"epoch": 0.258,
"grad_norm": 0.5613155523789654,
"learning_rate": 8.244666666666667e-06,
"loss": 2.3942,
"step": 12900
},
{
"epoch": 0.258,
"eval_loss": 2.4014108180999756,
"eval_runtime": 31.8052,
"eval_samples_per_second": 3.207,
"eval_steps_per_second": 1.604,
"step": 12900
},
{
"epoch": 0.2585,
"grad_norm": 0.5906307979751212,
"learning_rate": 8.239111111111113e-06,
"loss": 2.3807,
"step": 12925
},
{
"epoch": 0.259,
"grad_norm": 0.5786593603781868,
"learning_rate": 8.233555555555556e-06,
"loss": 2.3848,
"step": 12950
},
{
"epoch": 0.2595,
"grad_norm": 0.5739057988147651,
"learning_rate": 8.228e-06,
"loss": 2.3841,
"step": 12975
},
{
"epoch": 0.26,
"grad_norm": 0.5727067411665359,
"learning_rate": 8.222444444444445e-06,
"loss": 2.3771,
"step": 13000
},
{
"epoch": 0.26,
"eval_loss": 2.4009385108947754,
"eval_runtime": 31.8075,
"eval_samples_per_second": 3.207,
"eval_steps_per_second": 1.603,
"step": 13000
},
{
"epoch": 0.2605,
"grad_norm": 0.5758550911461594,
"learning_rate": 8.21688888888889e-06,
"loss": 2.39,
"step": 13025
},
{
"epoch": 0.261,
"grad_norm": 0.5506335078390368,
"learning_rate": 8.211333333333334e-06,
"loss": 2.3879,
"step": 13050
},
{
"epoch": 0.2615,
"grad_norm": 0.578047700560021,
"learning_rate": 8.205777777777777e-06,
"loss": 2.3772,
"step": 13075
},
{
"epoch": 0.262,
"grad_norm": 0.5517825098879646,
"learning_rate": 8.200222222222223e-06,
"loss": 2.3751,
"step": 13100
},
{
"epoch": 0.262,
"eval_loss": 2.4008378982543945,
"eval_runtime": 31.8219,
"eval_samples_per_second": 3.205,
"eval_steps_per_second": 1.603,
"step": 13100
},
{
"epoch": 0.2625,
"grad_norm": 0.6060142395322289,
"learning_rate": 8.194666666666668e-06,
"loss": 2.3859,
"step": 13125
},
{
"epoch": 0.263,
"grad_norm": 0.6151379264003006,
"learning_rate": 8.189111111111111e-06,
"loss": 2.3906,
"step": 13150
},
{
"epoch": 0.2635,
"grad_norm": 0.5889091981712471,
"learning_rate": 8.183555555555555e-06,
"loss": 2.3813,
"step": 13175
},
{
"epoch": 0.264,
"grad_norm": 0.7021686085407579,
"learning_rate": 8.178e-06,
"loss": 2.3844,
"step": 13200
},
{
"epoch": 0.264,
"eval_loss": 2.400826930999756,
"eval_runtime": 31.7255,
"eval_samples_per_second": 3.215,
"eval_steps_per_second": 1.608,
"step": 13200
},
{
"epoch": 0.2645,
"grad_norm": 0.5738899506070113,
"learning_rate": 8.172444444444446e-06,
"loss": 2.3974,
"step": 13225
},
{
"epoch": 0.265,
"grad_norm": 0.618543215020873,
"learning_rate": 8.166888888888889e-06,
"loss": 2.3846,
"step": 13250
},
{
"epoch": 0.2655,
"grad_norm": 0.5529480549821216,
"learning_rate": 8.161333333333334e-06,
"loss": 2.3816,
"step": 13275
},
{
"epoch": 0.266,
"grad_norm": 0.569904631452621,
"learning_rate": 8.155777777777778e-06,
"loss": 2.3809,
"step": 13300
},
{
"epoch": 0.266,
"eval_loss": 2.4002933502197266,
"eval_runtime": 31.6983,
"eval_samples_per_second": 3.218,
"eval_steps_per_second": 1.609,
"step": 13300
},
{
"epoch": 0.2665,
"grad_norm": 0.5743878084278218,
"learning_rate": 8.150222222222223e-06,
"loss": 2.3941,
"step": 13325
},
{
"epoch": 0.267,
"grad_norm": 0.5594243149898632,
"learning_rate": 8.144666666666667e-06,
"loss": 2.3878,
"step": 13350
},
{
"epoch": 0.2675,
"grad_norm": 0.5810666087448406,
"learning_rate": 8.139111111111112e-06,
"loss": 2.381,
"step": 13375
},
{
"epoch": 0.268,
"grad_norm": 0.5595852108101106,
"learning_rate": 8.133555555555557e-06,
"loss": 2.3792,
"step": 13400
},
{
"epoch": 0.268,
"eval_loss": 2.400261878967285,
"eval_runtime": 31.6975,
"eval_samples_per_second": 3.218,
"eval_steps_per_second": 1.609,
"step": 13400
},
{
"epoch": 0.2685,
"grad_norm": 0.5789530002361615,
"learning_rate": 8.128e-06,
"loss": 2.3759,
"step": 13425
},
{
"epoch": 0.269,
"grad_norm": 0.5662301407639397,
"learning_rate": 8.122444444444444e-06,
"loss": 2.3791,
"step": 13450
},
{
"epoch": 0.2695,
"grad_norm": 0.6131145841315326,
"learning_rate": 8.11688888888889e-06,
"loss": 2.3833,
"step": 13475
},
{
"epoch": 0.27,
"grad_norm": 0.5607318024001929,
"learning_rate": 8.111333333333335e-06,
"loss": 2.3724,
"step": 13500
},
{
"epoch": 0.27,
"eval_loss": 2.4000020027160645,
"eval_runtime": 31.71,
"eval_samples_per_second": 3.217,
"eval_steps_per_second": 1.608,
"step": 13500
},
{
"epoch": 0.2705,
"grad_norm": 0.5692755244185855,
"learning_rate": 8.105777777777778e-06,
"loss": 2.3788,
"step": 13525
},
{
"epoch": 0.271,
"grad_norm": 0.5647342769538716,
"learning_rate": 8.100222222222222e-06,
"loss": 2.3799,
"step": 13550
},
{
"epoch": 0.2715,
"grad_norm": 0.5976773519089553,
"learning_rate": 8.094666666666667e-06,
"loss": 2.3828,
"step": 13575
},
{
"epoch": 0.272,
"grad_norm": 0.5642506953063758,
"learning_rate": 8.089111111111112e-06,
"loss": 2.3835,
"step": 13600
},
{
"epoch": 0.272,
"eval_loss": 2.400066614151001,
"eval_runtime": 31.8128,
"eval_samples_per_second": 3.206,
"eval_steps_per_second": 1.603,
"step": 13600
},
{
"epoch": 0.2725,
"grad_norm": 0.5616659241704035,
"learning_rate": 8.083555555555556e-06,
"loss": 2.3801,
"step": 13625
},
{
"epoch": 0.273,
"grad_norm": 0.5878315825498157,
"learning_rate": 8.078e-06,
"loss": 2.3781,
"step": 13650
},
{
"epoch": 0.2735,
"grad_norm": 0.5716337786191225,
"learning_rate": 8.072444444444445e-06,
"loss": 2.3932,
"step": 13675
},
{
"epoch": 0.274,
"grad_norm": 0.5636757577555458,
"learning_rate": 8.06688888888889e-06,
"loss": 2.4041,
"step": 13700
},
{
"epoch": 0.274,
"eval_loss": 2.3997650146484375,
"eval_runtime": 31.4871,
"eval_samples_per_second": 3.239,
"eval_steps_per_second": 1.62,
"step": 13700
},
{
"epoch": 0.2745,
"grad_norm": 0.5564992808480433,
"learning_rate": 8.061333333333334e-06,
"loss": 2.3971,
"step": 13725
},
{
"epoch": 0.275,
"grad_norm": 0.5736246457745038,
"learning_rate": 8.055777777777777e-06,
"loss": 2.3847,
"step": 13750
},
{
"epoch": 0.2755,
"grad_norm": 0.5423430973262378,
"learning_rate": 8.050222222222222e-06,
"loss": 2.3786,
"step": 13775
},
{
"epoch": 0.276,
"grad_norm": 0.5672815850751382,
"learning_rate": 8.044666666666668e-06,
"loss": 2.3945,
"step": 13800
},
{
"epoch": 0.276,
"eval_loss": 2.399338483810425,
"eval_runtime": 31.3741,
"eval_samples_per_second": 3.251,
"eval_steps_per_second": 1.626,
"step": 13800
},
{
"epoch": 0.2765,
"grad_norm": 0.5919813611615313,
"learning_rate": 8.039111111111111e-06,
"loss": 2.3738,
"step": 13825
},
{
"epoch": 0.277,
"grad_norm": 0.5679311638374708,
"learning_rate": 8.033555555555556e-06,
"loss": 2.3771,
"step": 13850
},
{
"epoch": 0.2775,
"grad_norm": 0.5533203763453908,
"learning_rate": 8.028e-06,
"loss": 2.3831,
"step": 13875
},
{
"epoch": 0.278,
"grad_norm": 0.5674818164725537,
"learning_rate": 8.022444444444445e-06,
"loss": 2.3811,
"step": 13900
},
{
"epoch": 0.278,
"eval_loss": 2.3990118503570557,
"eval_runtime": 31.47,
"eval_samples_per_second": 3.241,
"eval_steps_per_second": 1.621,
"step": 13900
},
{
"epoch": 0.2785,
"grad_norm": 0.5664699981127816,
"learning_rate": 8.016888888888889e-06,
"loss": 2.3848,
"step": 13925
},
{
"epoch": 0.279,
"grad_norm": 0.6085875103795902,
"learning_rate": 8.011333333333334e-06,
"loss": 2.3822,
"step": 13950
},
{
"epoch": 0.2795,
"grad_norm": 0.561160479481643,
"learning_rate": 8.00577777777778e-06,
"loss": 2.3722,
"step": 13975
},
{
"epoch": 0.28,
"grad_norm": 0.566395855978902,
"learning_rate": 8.000222222222223e-06,
"loss": 2.3922,
"step": 14000
},
{
"epoch": 0.28,
"eval_loss": 2.3991119861602783,
"eval_runtime": 31.6591,
"eval_samples_per_second": 3.222,
"eval_steps_per_second": 1.611,
"step": 14000
},
{
"epoch": 0.2805,
"grad_norm": 0.5680524398621669,
"learning_rate": 7.994666666666666e-06,
"loss": 2.382,
"step": 14025
},
{
"epoch": 0.281,
"grad_norm": 0.5577808062612865,
"learning_rate": 7.989111111111112e-06,
"loss": 2.3817,
"step": 14050
},
{
"epoch": 0.2815,
"grad_norm": 0.5609272583996402,
"learning_rate": 7.983555555555557e-06,
"loss": 2.3807,
"step": 14075
},
{
"epoch": 0.282,
"grad_norm": 0.5572862450140419,
"learning_rate": 7.978e-06,
"loss": 2.3883,
"step": 14100
},
{
"epoch": 0.282,
"eval_loss": 2.399045467376709,
"eval_runtime": 31.4262,
"eval_samples_per_second": 3.246,
"eval_steps_per_second": 1.623,
"step": 14100
},
{
"epoch": 0.2825,
"grad_norm": 0.5548825232758766,
"learning_rate": 7.972444444444444e-06,
"loss": 2.3906,
"step": 14125
},
{
"epoch": 0.283,
"grad_norm": 0.5699464235282781,
"learning_rate": 7.96688888888889e-06,
"loss": 2.3985,
"step": 14150
},
{
"epoch": 0.2835,
"grad_norm": 0.5949860745449153,
"learning_rate": 7.961333333333335e-06,
"loss": 2.384,
"step": 14175
},
{
"epoch": 0.284,
"grad_norm": 1.207767068552352,
"learning_rate": 7.955777777777778e-06,
"loss": 2.3897,
"step": 14200
},
{
"epoch": 0.284,
"eval_loss": 2.3988163471221924,
"eval_runtime": 31.5331,
"eval_samples_per_second": 3.235,
"eval_steps_per_second": 1.617,
"step": 14200
},
{
"epoch": 0.2845,
"grad_norm": 0.5734778733619218,
"learning_rate": 7.950222222222222e-06,
"loss": 2.3995,
"step": 14225
},
{
"epoch": 0.285,
"grad_norm": 0.5809053174835214,
"learning_rate": 7.944666666666667e-06,
"loss": 2.3935,
"step": 14250
},
{
"epoch": 0.2855,
"grad_norm": 0.5721177604701749,
"learning_rate": 7.939111111111112e-06,
"loss": 2.3831,
"step": 14275
},
{
"epoch": 0.286,
"grad_norm": 0.5870187369085319,
"learning_rate": 7.933555555555556e-06,
"loss": 2.3876,
"step": 14300
},
{
"epoch": 0.286,
"eval_loss": 2.3985910415649414,
"eval_runtime": 31.8276,
"eval_samples_per_second": 3.205,
"eval_steps_per_second": 1.602,
"step": 14300
},
{
"epoch": 0.2865,
"grad_norm": 0.5540420732959112,
"learning_rate": 7.928e-06,
"loss": 2.3894,
"step": 14325
},
{
"epoch": 0.287,
"grad_norm": 0.5771375830109964,
"learning_rate": 7.922444444444445e-06,
"loss": 2.3919,
"step": 14350
},
{
"epoch": 0.2875,
"grad_norm": 0.558274829145414,
"learning_rate": 7.91688888888889e-06,
"loss": 2.3792,
"step": 14375
},
{
"epoch": 0.288,
"grad_norm": 0.5489382411994304,
"learning_rate": 7.911333333333333e-06,
"loss": 2.382,
"step": 14400
},
{
"epoch": 0.288,
"eval_loss": 2.398547887802124,
"eval_runtime": 31.7859,
"eval_samples_per_second": 3.209,
"eval_steps_per_second": 1.604,
"step": 14400
},
{
"epoch": 0.2885,
"grad_norm": 0.5437020470565486,
"learning_rate": 7.905777777777779e-06,
"loss": 2.391,
"step": 14425
},
{
"epoch": 0.289,
"grad_norm": 0.5822012645571201,
"learning_rate": 7.900222222222222e-06,
"loss": 2.3774,
"step": 14450
},
{
"epoch": 0.2895,
"grad_norm": 0.5662409547337693,
"learning_rate": 7.894666666666667e-06,
"loss": 2.3754,
"step": 14475
},
{
"epoch": 0.29,
"grad_norm": 0.574336415517884,
"learning_rate": 7.889111111111113e-06,
"loss": 2.3696,
"step": 14500
},
{
"epoch": 0.29,
"eval_loss": 2.3984858989715576,
"eval_runtime": 31.7473,
"eval_samples_per_second": 3.213,
"eval_steps_per_second": 1.606,
"step": 14500
},
{
"epoch": 0.2905,
"grad_norm": 0.5564392509678192,
"learning_rate": 7.883555555555556e-06,
"loss": 2.3856,
"step": 14525
},
{
"epoch": 0.291,
"grad_norm": 0.5518394045498354,
"learning_rate": 7.878e-06,
"loss": 2.3972,
"step": 14550
},
{
"epoch": 0.2915,
"grad_norm": 0.5795808696759357,
"learning_rate": 7.872444444444445e-06,
"loss": 2.3831,
"step": 14575
},
{
"epoch": 0.292,
"grad_norm": 0.5601055983017486,
"learning_rate": 7.86688888888889e-06,
"loss": 2.3844,
"step": 14600
},
{
"epoch": 0.292,
"eval_loss": 2.3982439041137695,
"eval_runtime": 31.6763,
"eval_samples_per_second": 3.22,
"eval_steps_per_second": 1.61,
"step": 14600
},
{
"epoch": 0.2925,
"grad_norm": 0.5964235234322374,
"learning_rate": 7.861333333333334e-06,
"loss": 2.3899,
"step": 14625
},
{
"epoch": 0.293,
"grad_norm": 0.5610795516162878,
"learning_rate": 7.855777777777779e-06,
"loss": 2.3838,
"step": 14650
},
{
"epoch": 0.2935,
"grad_norm": 0.5670881867616083,
"learning_rate": 7.850222222222223e-06,
"loss": 2.3825,
"step": 14675
},
{
"epoch": 0.294,
"grad_norm": 0.5643624181789829,
"learning_rate": 7.844666666666668e-06,
"loss": 2.3882,
"step": 14700
},
{
"epoch": 0.294,
"eval_loss": 2.398089647293091,
"eval_runtime": 31.7677,
"eval_samples_per_second": 3.211,
"eval_steps_per_second": 1.605,
"step": 14700
},
{
"epoch": 0.2945,
"grad_norm": 0.5686315690402087,
"learning_rate": 7.839111111111111e-06,
"loss": 2.3745,
"step": 14725
},
{
"epoch": 0.295,
"grad_norm": 0.5893983725540548,
"learning_rate": 7.833555555555557e-06,
"loss": 2.378,
"step": 14750
},
{
"epoch": 0.2955,
"grad_norm": 0.5972901998200331,
"learning_rate": 7.828000000000002e-06,
"loss": 2.377,
"step": 14775
},
{
"epoch": 0.296,
"grad_norm": 0.5804879541179684,
"learning_rate": 7.822444444444446e-06,
"loss": 2.3911,
"step": 14800
},
{
"epoch": 0.296,
"eval_loss": 2.397839069366455,
"eval_runtime": 31.7602,
"eval_samples_per_second": 3.212,
"eval_steps_per_second": 1.606,
"step": 14800
},
{
"epoch": 0.2965,
"grad_norm": 0.577463980570899,
"learning_rate": 7.816888888888889e-06,
"loss": 2.3896,
"step": 14825
},
{
"epoch": 0.297,
"grad_norm": 0.5800702741538564,
"learning_rate": 7.811333333333334e-06,
"loss": 2.3838,
"step": 14850
},
{
"epoch": 0.2975,
"grad_norm": 0.6037725626202978,
"learning_rate": 7.80577777777778e-06,
"loss": 2.3827,
"step": 14875
},
{
"epoch": 0.298,
"grad_norm": 0.5862145198472817,
"learning_rate": 7.800222222222223e-06,
"loss": 2.3801,
"step": 14900
},
{
"epoch": 0.298,
"eval_loss": 2.3976035118103027,
"eval_runtime": 31.751,
"eval_samples_per_second": 3.212,
"eval_steps_per_second": 1.606,
"step": 14900
},
{
"epoch": 0.2985,
"grad_norm": 0.5670781074548332,
"learning_rate": 7.794666666666667e-06,
"loss": 2.3819,
"step": 14925
},
{
"epoch": 0.299,
"grad_norm": 0.5571823653622203,
"learning_rate": 7.789111111111112e-06,
"loss": 2.3835,
"step": 14950
},
{
"epoch": 0.2995,
"grad_norm": 0.5733242457342494,
"learning_rate": 7.783555555555557e-06,
"loss": 2.3728,
"step": 14975
},
{
"epoch": 0.3,
"grad_norm": 0.5619677124489769,
"learning_rate": 7.778e-06,
"loss": 2.3794,
"step": 15000
},
{
"epoch": 0.3,
"eval_loss": 2.397136688232422,
"eval_runtime": 31.7183,
"eval_samples_per_second": 3.216,
"eval_steps_per_second": 1.608,
"step": 15000
},
{
"epoch": 0.3005,
"grad_norm": 0.5657448746286691,
"learning_rate": 7.772444444444444e-06,
"loss": 2.3897,
"step": 15025
},
{
"epoch": 0.301,
"grad_norm": 0.5523525627604269,
"learning_rate": 7.76688888888889e-06,
"loss": 2.3795,
"step": 15050
},
{
"epoch": 0.3015,
"grad_norm": 0.5950789860717867,
"learning_rate": 7.761333333333335e-06,
"loss": 2.3914,
"step": 15075
},
{
"epoch": 0.302,
"grad_norm": 0.5999400034143391,
"learning_rate": 7.755777777777778e-06,
"loss": 2.3769,
"step": 15100
},
{
"epoch": 0.302,
"eval_loss": 2.396873950958252,
"eval_runtime": 31.7696,
"eval_samples_per_second": 3.211,
"eval_steps_per_second": 1.605,
"step": 15100
},
{
"epoch": 0.3025,
"grad_norm": 0.558834977842146,
"learning_rate": 7.750222222222222e-06,
"loss": 2.3854,
"step": 15125
},
{
"epoch": 0.303,
"grad_norm": 0.5582295283472423,
"learning_rate": 7.744666666666667e-06,
"loss": 2.3821,
"step": 15150
},
{
"epoch": 0.3035,
"grad_norm": 0.5632905015995245,
"learning_rate": 7.739111111111112e-06,
"loss": 2.3798,
"step": 15175
},
{
"epoch": 0.304,
"grad_norm": 0.5514118333084079,
"learning_rate": 7.733555555555556e-06,
"loss": 2.3788,
"step": 15200
},
{
"epoch": 0.304,
"eval_loss": 2.3965888023376465,
"eval_runtime": 31.7152,
"eval_samples_per_second": 3.216,
"eval_steps_per_second": 1.608,
"step": 15200
},
{
"epoch": 0.3045,
"grad_norm": 0.5649018768322466,
"learning_rate": 7.728000000000001e-06,
"loss": 2.3912,
"step": 15225
},
{
"epoch": 0.305,
"grad_norm": 0.581561230195339,
"learning_rate": 7.722444444444445e-06,
"loss": 2.3766,
"step": 15250
},
{
"epoch": 0.3055,
"grad_norm": 0.5604985750115082,
"learning_rate": 7.71688888888889e-06,
"loss": 2.3852,
"step": 15275
},
{
"epoch": 0.306,
"grad_norm": 0.5602736035393524,
"learning_rate": 7.711333333333334e-06,
"loss": 2.3867,
"step": 15300
},
{
"epoch": 0.306,
"eval_loss": 2.3968026638031006,
"eval_runtime": 31.8105,
"eval_samples_per_second": 3.206,
"eval_steps_per_second": 1.603,
"step": 15300
},
{
"epoch": 0.3065,
"grad_norm": 0.5404472339052024,
"learning_rate": 7.705777777777779e-06,
"loss": 2.3835,
"step": 15325
},
{
"epoch": 0.307,
"grad_norm": 0.5732167481475767,
"learning_rate": 7.700222222222224e-06,
"loss": 2.386,
"step": 15350
},
{
"epoch": 0.3075,
"grad_norm": 0.5668975128857069,
"learning_rate": 7.694666666666668e-06,
"loss": 2.3838,
"step": 15375
},
{
"epoch": 0.308,
"grad_norm": 0.5478312505357384,
"learning_rate": 7.689111111111111e-06,
"loss": 2.4068,
"step": 15400
},
{
"epoch": 0.308,
"eval_loss": 2.39662766456604,
"eval_runtime": 31.4625,
"eval_samples_per_second": 3.242,
"eval_steps_per_second": 1.621,
"step": 15400
},
{
"epoch": 0.3085,
"grad_norm": 0.5853236703412803,
"learning_rate": 7.683555555555556e-06,
"loss": 2.3781,
"step": 15425
},
{
"epoch": 0.309,
"grad_norm": 0.566498029803985,
"learning_rate": 7.678000000000002e-06,
"loss": 2.3825,
"step": 15450
},
{
"epoch": 0.3095,
"grad_norm": 0.5876295223419085,
"learning_rate": 7.672444444444445e-06,
"loss": 2.3821,
"step": 15475
},
{
"epoch": 0.31,
"grad_norm": 0.5308633915785282,
"learning_rate": 7.666888888888889e-06,
"loss": 2.3762,
"step": 15500
},
{
"epoch": 0.31,
"eval_loss": 2.39650559425354,
"eval_runtime": 31.6255,
"eval_samples_per_second": 3.225,
"eval_steps_per_second": 1.613,
"step": 15500
},
{
"epoch": 0.3105,
"grad_norm": 1.090575647217174,
"learning_rate": 7.661333333333334e-06,
"loss": 2.3854,
"step": 15525
},
{
"epoch": 0.311,
"grad_norm": 0.5608565584872227,
"learning_rate": 7.65577777777778e-06,
"loss": 2.3909,
"step": 15550
},
{
"epoch": 0.3115,
"grad_norm": 0.5664910219445479,
"learning_rate": 7.650222222222223e-06,
"loss": 2.3876,
"step": 15575
},
{
"epoch": 0.312,
"grad_norm": 0.5743138998726522,
"learning_rate": 7.644666666666666e-06,
"loss": 2.3891,
"step": 15600
},
{
"epoch": 0.312,
"eval_loss": 2.395846128463745,
"eval_runtime": 31.422,
"eval_samples_per_second": 3.246,
"eval_steps_per_second": 1.623,
"step": 15600
},
{
"epoch": 0.3125,
"grad_norm": 0.5838966503811626,
"learning_rate": 7.639111111111112e-06,
"loss": 2.3744,
"step": 15625
},
{
"epoch": 0.313,
"grad_norm": 0.5861982665217826,
"learning_rate": 7.633555555555557e-06,
"loss": 2.386,
"step": 15650
},
{
"epoch": 0.3135,
"grad_norm": 0.5623110973377239,
"learning_rate": 7.628000000000001e-06,
"loss": 2.3729,
"step": 15675
},
{
"epoch": 0.314,
"grad_norm": 0.5546807091447383,
"learning_rate": 7.622444444444445e-06,
"loss": 2.3758,
"step": 15700
},
{
"epoch": 0.314,
"eval_loss": 2.396050453186035,
"eval_runtime": 31.4839,
"eval_samples_per_second": 3.24,
"eval_steps_per_second": 1.62,
"step": 15700
},
{
"epoch": 0.3145,
"grad_norm": 0.566357543453858,
"learning_rate": 7.616888888888889e-06,
"loss": 2.3814,
"step": 15725
},
{
"epoch": 0.315,
"grad_norm": 0.5863021742964364,
"learning_rate": 7.611333333333334e-06,
"loss": 2.3912,
"step": 15750
},
{
"epoch": 0.3155,
"grad_norm": 0.5448091994015362,
"learning_rate": 7.605777777777779e-06,
"loss": 2.3949,
"step": 15775
},
{
"epoch": 0.316,
"grad_norm": 0.5571622234957405,
"learning_rate": 7.600222222222223e-06,
"loss": 2.3893,
"step": 15800
},
{
"epoch": 0.316,
"eval_loss": 2.3957884311676025,
"eval_runtime": 31.4676,
"eval_samples_per_second": 3.241,
"eval_steps_per_second": 1.621,
"step": 15800
},
{
"epoch": 0.3165,
"grad_norm": 0.6175149611764096,
"learning_rate": 7.594666666666667e-06,
"loss": 2.3858,
"step": 15825
},
{
"epoch": 0.317,
"grad_norm": 0.5811416818392343,
"learning_rate": 7.589111111111111e-06,
"loss": 2.3893,
"step": 15850
},
{
"epoch": 0.3175,
"grad_norm": 0.5685262674194088,
"learning_rate": 7.5835555555555566e-06,
"loss": 2.3895,
"step": 15875
},
{
"epoch": 0.318,
"grad_norm": 0.5726231388910242,
"learning_rate": 7.578000000000001e-06,
"loss": 2.3924,
"step": 15900
},
{
"epoch": 0.318,
"eval_loss": 2.3957200050354004,
"eval_runtime": 31.6833,
"eval_samples_per_second": 3.219,
"eval_steps_per_second": 1.61,
"step": 15900
},
{
"epoch": 0.3185,
"grad_norm": 0.5881014617899262,
"learning_rate": 7.572444444444445e-06,
"loss": 2.3719,
"step": 15925
},
{
"epoch": 0.319,
"grad_norm": 0.5635459036409981,
"learning_rate": 7.566888888888889e-06,
"loss": 2.378,
"step": 15950
},
{
"epoch": 0.3195,
"grad_norm": 0.5604907919572244,
"learning_rate": 7.561333333333334e-06,
"loss": 2.3744,
"step": 15975
},
{
"epoch": 0.32,
"grad_norm": 0.5743956921241223,
"learning_rate": 7.555777777777779e-06,
"loss": 2.3872,
"step": 16000
},
{
"epoch": 0.32,
"eval_loss": 2.3958442211151123,
"eval_runtime": 31.9703,
"eval_samples_per_second": 3.19,
"eval_steps_per_second": 1.595,
"step": 16000
},
{
"epoch": 0.3205,
"grad_norm": 0.5490540509150809,
"learning_rate": 7.550222222222223e-06,
"loss": 2.3908,
"step": 16025
},
{
"epoch": 0.321,
"grad_norm": 0.5604566538327537,
"learning_rate": 7.5446666666666665e-06,
"loss": 2.3816,
"step": 16050
},
{
"epoch": 0.3215,
"grad_norm": 0.5482351645184266,
"learning_rate": 7.539111111111112e-06,
"loss": 2.3783,
"step": 16075
},
{
"epoch": 0.322,
"grad_norm": 0.5738611670880387,
"learning_rate": 7.533555555555556e-06,
"loss": 2.3807,
"step": 16100
},
{
"epoch": 0.322,
"eval_loss": 2.3955187797546387,
"eval_runtime": 31.7782,
"eval_samples_per_second": 3.21,
"eval_steps_per_second": 1.605,
"step": 16100
},
{
"epoch": 0.3225,
"grad_norm": 0.6007459037823811,
"learning_rate": 7.528000000000001e-06,
"loss": 2.3908,
"step": 16125
},
{
"epoch": 0.323,
"grad_norm": 0.5719140015142068,
"learning_rate": 7.522444444444446e-06,
"loss": 2.379,
"step": 16150
},
{
"epoch": 0.3235,
"grad_norm": 0.5722843141001409,
"learning_rate": 7.516888888888889e-06,
"loss": 2.3831,
"step": 16175
},
{
"epoch": 0.324,
"grad_norm": 0.5500359198684006,
"learning_rate": 7.511333333333334e-06,
"loss": 2.3899,
"step": 16200
},
{
"epoch": 0.324,
"eval_loss": 2.3954145908355713,
"eval_runtime": 31.9265,
"eval_samples_per_second": 3.195,
"eval_steps_per_second": 1.597,
"step": 16200
},
{
"epoch": 0.3245,
"grad_norm": 0.5988197648020003,
"learning_rate": 7.505777777777778e-06,
"loss": 2.3768,
"step": 16225
},
{
"epoch": 0.325,
"grad_norm": 0.566314534087209,
"learning_rate": 7.5002222222222235e-06,
"loss": 2.3731,
"step": 16250
},
{
"epoch": 0.3255,
"grad_norm": 0.5462158611596983,
"learning_rate": 7.494666666666667e-06,
"loss": 2.3821,
"step": 16275
},
{
"epoch": 0.326,
"grad_norm": 0.5546038414202229,
"learning_rate": 7.4891111111111114e-06,
"loss": 2.3725,
"step": 16300
},
{
"epoch": 0.326,
"eval_loss": 2.395524501800537,
"eval_runtime": 31.8126,
"eval_samples_per_second": 3.206,
"eval_steps_per_second": 1.603,
"step": 16300
},
{
"epoch": 0.3265,
"grad_norm": 0.5596467845027929,
"learning_rate": 7.483555555555556e-06,
"loss": 2.3843,
"step": 16325
},
{
"epoch": 0.327,
"grad_norm": 0.5815120805791782,
"learning_rate": 7.478000000000001e-06,
"loss": 2.3815,
"step": 16350
},
{
"epoch": 0.3275,
"grad_norm": 0.5597449596999192,
"learning_rate": 7.4724444444444455e-06,
"loss": 2.3732,
"step": 16375
},
{
"epoch": 0.328,
"grad_norm": 0.5818958282150155,
"learning_rate": 7.466888888888889e-06,
"loss": 2.3793,
"step": 16400
},
{
"epoch": 0.328,
"eval_loss": 2.3949294090270996,
"eval_runtime": 31.7738,
"eval_samples_per_second": 3.21,
"eval_steps_per_second": 1.605,
"step": 16400
},
{
"epoch": 0.3285,
"grad_norm": 0.5662000485734395,
"learning_rate": 7.4613333333333334e-06,
"loss": 2.3812,
"step": 16425
},
{
"epoch": 0.329,
"grad_norm": 0.5563577533028059,
"learning_rate": 7.455777777777779e-06,
"loss": 2.3761,
"step": 16450
},
{
"epoch": 0.3295,
"grad_norm": 0.5687992956190129,
"learning_rate": 7.450222222222223e-06,
"loss": 2.381,
"step": 16475
},
{
"epoch": 0.33,
"grad_norm": 0.5487444076942639,
"learning_rate": 7.4446666666666675e-06,
"loss": 2.3883,
"step": 16500
},
{
"epoch": 0.33,
"eval_loss": 2.395174026489258,
"eval_runtime": 31.7762,
"eval_samples_per_second": 3.21,
"eval_steps_per_second": 1.605,
"step": 16500
},
{
"epoch": 0.3305,
"grad_norm": 0.5469101598299175,
"learning_rate": 7.439111111111111e-06,
"loss": 2.3766,
"step": 16525
},
{
"epoch": 0.331,
"grad_norm": 0.5567200858341991,
"learning_rate": 7.433555555555556e-06,
"loss": 2.3939,
"step": 16550
},
{
"epoch": 0.3315,
"grad_norm": 0.600536691861987,
"learning_rate": 7.428000000000001e-06,
"loss": 2.3822,
"step": 16575
},
{
"epoch": 0.332,
"grad_norm": 0.5505048207350117,
"learning_rate": 7.422444444444445e-06,
"loss": 2.378,
"step": 16600
},
{
"epoch": 0.332,
"eval_loss": 2.39481520652771,
"eval_runtime": 31.8394,
"eval_samples_per_second": 3.204,
"eval_steps_per_second": 1.602,
"step": 16600
},
{
"epoch": 0.3325,
"grad_norm": 0.5492676702406505,
"learning_rate": 7.416888888888889e-06,
"loss": 2.3769,
"step": 16625
},
{
"epoch": 0.333,
"grad_norm": 0.5492443037384863,
"learning_rate": 7.411333333333334e-06,
"loss": 2.3701,
"step": 16650
},
{
"epoch": 0.3335,
"grad_norm": 0.5857568383624908,
"learning_rate": 7.405777777777778e-06,
"loss": 2.381,
"step": 16675
},
{
"epoch": 0.334,
"grad_norm": 0.5647204860919086,
"learning_rate": 7.400222222222223e-06,
"loss": 2.3819,
"step": 16700
},
{
"epoch": 0.334,
"eval_loss": 2.394426107406616,
"eval_runtime": 31.892,
"eval_samples_per_second": 3.198,
"eval_steps_per_second": 1.599,
"step": 16700
},
{
"epoch": 0.3345,
"grad_norm": 0.5730702201176824,
"learning_rate": 7.394666666666668e-06,
"loss": 2.3857,
"step": 16725
},
{
"epoch": 0.335,
"grad_norm": 0.5521969424083262,
"learning_rate": 7.3891111111111115e-06,
"loss": 2.363,
"step": 16750
},
{
"epoch": 0.3355,
"grad_norm": 0.6057695700506919,
"learning_rate": 7.383555555555556e-06,
"loss": 2.3848,
"step": 16775
},
{
"epoch": 0.336,
"grad_norm": 0.5749986280132275,
"learning_rate": 7.378e-06,
"loss": 2.389,
"step": 16800
},
{
"epoch": 0.336,
"eval_loss": 2.3945508003234863,
"eval_runtime": 31.7463,
"eval_samples_per_second": 3.213,
"eval_steps_per_second": 1.606,
"step": 16800
},
{
"epoch": 0.3365,
"grad_norm": 0.5947076066210849,
"learning_rate": 7.372444444444446e-06,
"loss": 2.3865,
"step": 16825
},
{
"epoch": 0.337,
"grad_norm": 0.564221658006085,
"learning_rate": 7.366888888888889e-06,
"loss": 2.3696,
"step": 16850
},
{
"epoch": 0.3375,
"grad_norm": 0.5702041520098122,
"learning_rate": 7.3613333333333336e-06,
"loss": 2.3872,
"step": 16875
},
{
"epoch": 0.338,
"grad_norm": 0.5538661614565709,
"learning_rate": 7.355777777777778e-06,
"loss": 2.3828,
"step": 16900
},
{
"epoch": 0.338,
"eval_loss": 2.3942644596099854,
"eval_runtime": 31.8144,
"eval_samples_per_second": 3.206,
"eval_steps_per_second": 1.603,
"step": 16900
},
{
"epoch": 0.3385,
"grad_norm": 0.5614412730199092,
"learning_rate": 7.350222222222223e-06,
"loss": 2.3898,
"step": 16925
},
{
"epoch": 0.339,
"grad_norm": 0.5656638849693418,
"learning_rate": 7.344666666666668e-06,
"loss": 2.3639,
"step": 16950
},
{
"epoch": 0.3395,
"grad_norm": 0.5587793192894792,
"learning_rate": 7.339111111111111e-06,
"loss": 2.3761,
"step": 16975
},
{
"epoch": 0.34,
"grad_norm": 0.5537041511919,
"learning_rate": 7.3335555555555556e-06,
"loss": 2.3785,
"step": 17000
},
{
"epoch": 0.34,
"eval_loss": 2.394216775894165,
"eval_runtime": 31.7287,
"eval_samples_per_second": 3.215,
"eval_steps_per_second": 1.607,
"step": 17000
},
{
"epoch": 0.3405,
"grad_norm": 0.5625979440161315,
"learning_rate": 7.328000000000001e-06,
"loss": 2.3706,
"step": 17025
},
{
"epoch": 0.341,
"grad_norm": 0.5578934058534382,
"learning_rate": 7.322444444444445e-06,
"loss": 2.3717,
"step": 17050
},
{
"epoch": 0.3415,
"grad_norm": 0.5600783145650656,
"learning_rate": 7.31688888888889e-06,
"loss": 2.3549,
"step": 17075
},
{
"epoch": 0.342,
"grad_norm": 0.5443562716925451,
"learning_rate": 7.311333333333334e-06,
"loss": 2.3818,
"step": 17100
},
{
"epoch": 0.342,
"eval_loss": 2.3939199447631836,
"eval_runtime": 31.7183,
"eval_samples_per_second": 3.216,
"eval_steps_per_second": 1.608,
"step": 17100
},
{
"epoch": 0.3425,
"grad_norm": 0.6040551095214175,
"learning_rate": 7.3057777777777784e-06,
"loss": 2.3856,
"step": 17125
},
{
"epoch": 0.343,
"grad_norm": 0.5800600768624563,
"learning_rate": 7.300222222222223e-06,
"loss": 2.3812,
"step": 17150
},
{
"epoch": 0.3435,
"grad_norm": 0.606456873691792,
"learning_rate": 7.294666666666668e-06,
"loss": 2.3823,
"step": 17175
},
{
"epoch": 0.344,
"grad_norm": 0.5820033666001653,
"learning_rate": 7.289111111111112e-06,
"loss": 2.3772,
"step": 17200
},
{
"epoch": 0.344,
"eval_loss": 2.39414644241333,
"eval_runtime": 31.4591,
"eval_samples_per_second": 3.242,
"eval_steps_per_second": 1.621,
"step": 17200
},
{
"epoch": 0.3445,
"grad_norm": 0.592691728166079,
"learning_rate": 7.283555555555556e-06,
"loss": 2.3757,
"step": 17225
},
{
"epoch": 0.345,
"grad_norm": 0.5475066044517582,
"learning_rate": 7.2780000000000005e-06,
"loss": 2.393,
"step": 17250
},
{
"epoch": 0.3455,
"grad_norm": 0.5412153350606916,
"learning_rate": 7.272444444444446e-06,
"loss": 2.3775,
"step": 17275
},
{
"epoch": 0.346,
"grad_norm": 0.5703055910606494,
"learning_rate": 7.26688888888889e-06,
"loss": 2.3919,
"step": 17300
},
{
"epoch": 0.346,
"eval_loss": 2.393954277038574,
"eval_runtime": 31.4832,
"eval_samples_per_second": 3.24,
"eval_steps_per_second": 1.62,
"step": 17300
},
{
"epoch": 0.3465,
"grad_norm": 0.5720004911842855,
"learning_rate": 7.261333333333334e-06,
"loss": 2.3744,
"step": 17325
},
{
"epoch": 0.347,
"grad_norm": 0.5651936652229611,
"learning_rate": 7.255777777777778e-06,
"loss": 2.3766,
"step": 17350
},
{
"epoch": 0.3475,
"grad_norm": 0.552954097582646,
"learning_rate": 7.250222222222223e-06,
"loss": 2.38,
"step": 17375
},
{
"epoch": 0.348,
"grad_norm": 0.5753937605402671,
"learning_rate": 7.244666666666668e-06,
"loss": 2.3825,
"step": 17400
},
{
"epoch": 0.348,
"eval_loss": 2.3936057090759277,
"eval_runtime": 31.5155,
"eval_samples_per_second": 3.237,
"eval_steps_per_second": 1.618,
"step": 17400
},
{
"epoch": 0.3485,
"grad_norm": 0.5982429265702776,
"learning_rate": 7.239111111111111e-06,
"loss": 2.3748,
"step": 17425
},
{
"epoch": 0.349,
"grad_norm": 0.5707105076014326,
"learning_rate": 7.233555555555556e-06,
"loss": 2.3871,
"step": 17450
},
{
"epoch": 0.3495,
"grad_norm": 0.5749982454192974,
"learning_rate": 7.228000000000001e-06,
"loss": 2.3722,
"step": 17475
},
{
"epoch": 0.35,
"grad_norm": 0.5667678087541999,
"learning_rate": 7.222444444444445e-06,
"loss": 2.3897,
"step": 17500
},
{
"epoch": 0.35,
"eval_loss": 2.3934316635131836,
"eval_runtime": 31.5133,
"eval_samples_per_second": 3.237,
"eval_steps_per_second": 1.618,
"step": 17500
},
{
"epoch": 0.3505,
"grad_norm": 0.551269238238286,
"learning_rate": 7.21688888888889e-06,
"loss": 2.3759,
"step": 17525
},
{
"epoch": 0.351,
"grad_norm": 0.5683477126287287,
"learning_rate": 7.211333333333333e-06,
"loss": 2.3751,
"step": 17550
},
{
"epoch": 0.3515,
"grad_norm": 0.5534527601932518,
"learning_rate": 7.2057777777777785e-06,
"loss": 2.3749,
"step": 17575
},
{
"epoch": 0.352,
"grad_norm": 0.5444580304379504,
"learning_rate": 7.200222222222223e-06,
"loss": 2.3839,
"step": 17600
},
{
"epoch": 0.352,
"eval_loss": 2.3928964138031006,
"eval_runtime": 31.79,
"eval_samples_per_second": 3.209,
"eval_steps_per_second": 1.604,
"step": 17600
},
{
"epoch": 0.3525,
"grad_norm": 0.5683011717419817,
"learning_rate": 7.194666666666667e-06,
"loss": 2.3697,
"step": 17625
},
{
"epoch": 0.353,
"grad_norm": 0.5597200154635523,
"learning_rate": 7.189111111111111e-06,
"loss": 2.3758,
"step": 17650
},
{
"epoch": 0.3535,
"grad_norm": 0.5389975543023572,
"learning_rate": 7.183555555555556e-06,
"loss": 2.3748,
"step": 17675
},
{
"epoch": 0.354,
"grad_norm": 0.5766556300730846,
"learning_rate": 7.1780000000000006e-06,
"loss": 2.3863,
"step": 17700
},
{
"epoch": 0.354,
"eval_loss": 2.3929381370544434,
"eval_runtime": 31.4662,
"eval_samples_per_second": 3.242,
"eval_steps_per_second": 1.621,
"step": 17700
},
{
"epoch": 0.3545,
"grad_norm": 0.5422601731930108,
"learning_rate": 7.172444444444445e-06,
"loss": 2.3795,
"step": 17725
},
{
"epoch": 0.355,
"grad_norm": 0.587749563771833,
"learning_rate": 7.16688888888889e-06,
"loss": 2.3741,
"step": 17750
},
{
"epoch": 0.3555,
"grad_norm": 0.5448174780243932,
"learning_rate": 7.161333333333334e-06,
"loss": 2.374,
"step": 17775
},
{
"epoch": 0.356,
"grad_norm": 0.5487711297157323,
"learning_rate": 7.155777777777778e-06,
"loss": 2.3872,
"step": 17800
},
{
"epoch": 0.356,
"eval_loss": 2.3928709030151367,
"eval_runtime": 31.7364,
"eval_samples_per_second": 3.214,
"eval_steps_per_second": 1.607,
"step": 17800
},
{
"epoch": 0.3565,
"grad_norm": 0.5749112760792647,
"learning_rate": 7.150222222222223e-06,
"loss": 2.375,
"step": 17825
},
{
"epoch": 0.357,
"grad_norm": 0.5657127084376901,
"learning_rate": 7.144666666666668e-06,
"loss": 2.3635,
"step": 17850
},
{
"epoch": 0.3575,
"grad_norm": 0.5552559911086609,
"learning_rate": 7.139111111111112e-06,
"loss": 2.3791,
"step": 17875
},
{
"epoch": 0.358,
"grad_norm": 0.5587079571658956,
"learning_rate": 7.133555555555556e-06,
"loss": 2.3792,
"step": 17900
},
{
"epoch": 0.358,
"eval_loss": 2.39250111579895,
"eval_runtime": 31.8377,
"eval_samples_per_second": 3.204,
"eval_steps_per_second": 1.602,
"step": 17900
},
{
"epoch": 0.3585,
"grad_norm": 0.5476769108414363,
"learning_rate": 7.128e-06,
"loss": 2.3796,
"step": 17925
},
{
"epoch": 0.359,
"grad_norm": 0.5519286017800472,
"learning_rate": 7.1224444444444454e-06,
"loss": 2.3689,
"step": 17950
},
{
"epoch": 0.3595,
"grad_norm": 0.5690523665272621,
"learning_rate": 7.11688888888889e-06,
"loss": 2.3758,
"step": 17975
},
{
"epoch": 0.36,
"grad_norm": 0.575484852893059,
"learning_rate": 7.111333333333333e-06,
"loss": 2.3723,
"step": 18000
},
{
"epoch": 0.36,
"eval_loss": 2.3920133113861084,
"eval_runtime": 31.9286,
"eval_samples_per_second": 3.195,
"eval_steps_per_second": 1.597,
"step": 18000
},
{
"epoch": 0.3605,
"grad_norm": 0.5779120077378331,
"learning_rate": 7.105777777777778e-06,
"loss": 2.3798,
"step": 18025
},
{
"epoch": 0.361,
"grad_norm": 0.575309417070187,
"learning_rate": 7.100222222222223e-06,
"loss": 2.3875,
"step": 18050
},
{
"epoch": 0.3615,
"grad_norm": 0.6000430306182747,
"learning_rate": 7.0946666666666675e-06,
"loss": 2.3727,
"step": 18075
},
{
"epoch": 0.362,
"grad_norm": 0.5701734522791184,
"learning_rate": 7.089111111111112e-06,
"loss": 2.3793,
"step": 18100
},
{
"epoch": 0.362,
"eval_loss": 2.392152786254883,
"eval_runtime": 31.8363,
"eval_samples_per_second": 3.204,
"eval_steps_per_second": 1.602,
"step": 18100
},
{
"epoch": 0.3625,
"grad_norm": 0.5731611332750656,
"learning_rate": 7.083555555555555e-06,
"loss": 2.3715,
"step": 18125
},
{
"epoch": 0.363,
"grad_norm": 0.6114229583074544,
"learning_rate": 7.078000000000001e-06,
"loss": 2.383,
"step": 18150
},
{
"epoch": 0.3635,
"grad_norm": 0.541007634609165,
"learning_rate": 7.072444444444445e-06,
"loss": 2.3686,
"step": 18175
},
{
"epoch": 0.364,
"grad_norm": 0.5725748950012406,
"learning_rate": 7.0668888888888895e-06,
"loss": 2.3873,
"step": 18200
},
{
"epoch": 0.364,
"eval_loss": 2.392261505126953,
"eval_runtime": 31.7706,
"eval_samples_per_second": 3.211,
"eval_steps_per_second": 1.605,
"step": 18200
},
{
"epoch": 0.3645,
"grad_norm": 0.5593670656564304,
"learning_rate": 7.061333333333333e-06,
"loss": 2.3804,
"step": 18225
},
{
"epoch": 0.365,
"grad_norm": 0.6009795583649221,
"learning_rate": 7.055777777777778e-06,
"loss": 2.3795,
"step": 18250
},
{
"epoch": 0.3655,
"grad_norm": 0.5664495345544722,
"learning_rate": 7.050222222222223e-06,
"loss": 2.3631,
"step": 18275
},
{
"epoch": 0.366,
"grad_norm": 0.6104006309418994,
"learning_rate": 7.044666666666667e-06,
"loss": 2.3748,
"step": 18300
},
{
"epoch": 0.366,
"eval_loss": 2.392148971557617,
"eval_runtime": 31.734,
"eval_samples_per_second": 3.214,
"eval_steps_per_second": 1.607,
"step": 18300
},
{
"epoch": 0.3665,
"grad_norm": 0.5506059883330837,
"learning_rate": 7.039111111111112e-06,
"loss": 2.3714,
"step": 18325
},
{
"epoch": 0.367,
"grad_norm": 0.5621509156408089,
"learning_rate": 7.033555555555556e-06,
"loss": 2.368,
"step": 18350
},
{
"epoch": 0.3675,
"grad_norm": 0.5587181787810226,
"learning_rate": 7.028e-06,
"loss": 2.3791,
"step": 18375
},
{
"epoch": 0.368,
"grad_norm": 0.5677798724220077,
"learning_rate": 7.022444444444445e-06,
"loss": 2.384,
"step": 18400
},
{
"epoch": 0.368,
"eval_loss": 2.391704559326172,
"eval_runtime": 31.7798,
"eval_samples_per_second": 3.21,
"eval_steps_per_second": 1.605,
"step": 18400
},
{
"epoch": 0.3685,
"grad_norm": 0.5905061339542746,
"learning_rate": 7.01688888888889e-06,
"loss": 2.3881,
"step": 18425
},
{
"epoch": 0.369,
"grad_norm": 0.554978244766298,
"learning_rate": 7.011333333333334e-06,
"loss": 2.3683,
"step": 18450
},
{
"epoch": 0.3695,
"grad_norm": 0.5517801842410981,
"learning_rate": 7.005777777777778e-06,
"loss": 2.3835,
"step": 18475
},
{
"epoch": 0.37,
"grad_norm": 0.5501181046318251,
"learning_rate": 7.000222222222222e-06,
"loss": 2.374,
"step": 18500
},
{
"epoch": 0.37,
"eval_loss": 2.3915836811065674,
"eval_runtime": 31.7662,
"eval_samples_per_second": 3.211,
"eval_steps_per_second": 1.605,
"step": 18500
},
{
"epoch": 0.3705,
"grad_norm": 0.576826996404141,
"learning_rate": 6.9946666666666676e-06,
"loss": 2.3819,
"step": 18525
},
{
"epoch": 0.371,
"grad_norm": 0.5739797151959755,
"learning_rate": 6.989111111111112e-06,
"loss": 2.3794,
"step": 18550
},
{
"epoch": 0.3715,
"grad_norm": 0.5511012262440002,
"learning_rate": 6.9835555555555555e-06,
"loss": 2.3894,
"step": 18575
},
{
"epoch": 0.372,
"grad_norm": 0.5958849979817049,
"learning_rate": 6.978e-06,
"loss": 2.3674,
"step": 18600
},
{
"epoch": 0.372,
"eval_loss": 2.391352415084839,
"eval_runtime": 31.7756,
"eval_samples_per_second": 3.21,
"eval_steps_per_second": 1.605,
"step": 18600
},
{
"epoch": 0.3725,
"grad_norm": 0.5595892595435197,
"learning_rate": 6.972444444444445e-06,
"loss": 2.3835,
"step": 18625
},
{
"epoch": 0.373,
"grad_norm": 0.5946746403488841,
"learning_rate": 6.96688888888889e-06,
"loss": 2.3716,
"step": 18650
},
{
"epoch": 0.3735,
"grad_norm": 0.5613740876716816,
"learning_rate": 6.961333333333334e-06,
"loss": 2.3843,
"step": 18675
},
{
"epoch": 0.374,
"grad_norm": 0.58419422677193,
"learning_rate": 6.9557777777777776e-06,
"loss": 2.3883,
"step": 18700
},
{
"epoch": 0.374,
"eval_loss": 2.391383409500122,
"eval_runtime": 31.7182,
"eval_samples_per_second": 3.216,
"eval_steps_per_second": 1.608,
"step": 18700
},
{
"epoch": 0.3745,
"grad_norm": 0.5508427755524951,
"learning_rate": 6.950222222222223e-06,
"loss": 2.3749,
"step": 18725
},
{
"epoch": 0.375,
"grad_norm": 0.5686856026931271,
"learning_rate": 6.944666666666667e-06,
"loss": 2.38,
"step": 18750
},
{
"epoch": 0.3755,
"grad_norm": 0.5531747783480245,
"learning_rate": 6.939111111111112e-06,
"loss": 2.3718,
"step": 18775
},
{
"epoch": 0.376,
"grad_norm": 0.5800045444885175,
"learning_rate": 6.933555555555556e-06,
"loss": 2.3703,
"step": 18800
},
{
"epoch": 0.376,
"eval_loss": 2.391113042831421,
"eval_runtime": 31.7446,
"eval_samples_per_second": 3.213,
"eval_steps_per_second": 1.607,
"step": 18800
},
{
"epoch": 0.3765,
"grad_norm": 0.5451395919825731,
"learning_rate": 6.928e-06,
"loss": 2.3746,
"step": 18825
},
{
"epoch": 0.377,
"grad_norm": 0.5619738492106079,
"learning_rate": 6.922444444444445e-06,
"loss": 2.3815,
"step": 18850
},
{
"epoch": 0.3775,
"grad_norm": 0.5811440137998495,
"learning_rate": 6.91688888888889e-06,
"loss": 2.3655,
"step": 18875
},
{
"epoch": 0.378,
"grad_norm": 0.5528301840539304,
"learning_rate": 6.9113333333333345e-06,
"loss": 2.3721,
"step": 18900
},
{
"epoch": 0.378,
"eval_loss": 2.3908257484436035,
"eval_runtime": 31.6268,
"eval_samples_per_second": 3.225,
"eval_steps_per_second": 1.613,
"step": 18900
},
{
"epoch": 0.3785,
"grad_norm": 0.5791069800351532,
"learning_rate": 6.905777777777778e-06,
"loss": 2.3798,
"step": 18925
},
{
"epoch": 0.379,
"grad_norm": 0.5692008495737035,
"learning_rate": 6.9002222222222224e-06,
"loss": 2.3723,
"step": 18950
},
{
"epoch": 0.3795,
"grad_norm": 0.5614405054433378,
"learning_rate": 6.894666666666668e-06,
"loss": 2.3739,
"step": 18975
},
{
"epoch": 0.38,
"grad_norm": 0.5641420025760586,
"learning_rate": 6.889111111111112e-06,
"loss": 2.3728,
"step": 19000
},
{
"epoch": 0.38,
"eval_loss": 2.390749454498291,
"eval_runtime": 31.8098,
"eval_samples_per_second": 3.207,
"eval_steps_per_second": 1.603,
"step": 19000
},
{
"epoch": 0.3805,
"grad_norm": 0.5526396554433541,
"learning_rate": 6.8835555555555565e-06,
"loss": 2.3779,
"step": 19025
},
{
"epoch": 0.381,
"grad_norm": 0.574490460414078,
"learning_rate": 6.878e-06,
"loss": 2.3727,
"step": 19050
},
{
"epoch": 0.3815,
"grad_norm": 0.5611671894801677,
"learning_rate": 6.872444444444445e-06,
"loss": 2.379,
"step": 19075
},
{
"epoch": 0.382,
"grad_norm": 0.5434475778092571,
"learning_rate": 6.86688888888889e-06,
"loss": 2.3788,
"step": 19100
},
{
"epoch": 0.382,
"eval_loss": 2.390854597091675,
"eval_runtime": 31.4727,
"eval_samples_per_second": 3.241,
"eval_steps_per_second": 1.62,
"step": 19100
},
{
"epoch": 0.3825,
"grad_norm": 0.5438441040943751,
"learning_rate": 6.861333333333334e-06,
"loss": 2.3849,
"step": 19125
},
{
"epoch": 0.383,
"grad_norm": 0.5617582167520553,
"learning_rate": 6.855777777777778e-06,
"loss": 2.3778,
"step": 19150
},
{
"epoch": 0.3835,
"grad_norm": 0.5734148354957039,
"learning_rate": 6.850222222222223e-06,
"loss": 2.3749,
"step": 19175
},
{
"epoch": 0.384,
"grad_norm": 0.5567016447555824,
"learning_rate": 6.844666666666667e-06,
"loss": 2.3786,
"step": 19200
},
{
"epoch": 0.384,
"eval_loss": 2.390947103500366,
"eval_runtime": 31.472,
"eval_samples_per_second": 3.241,
"eval_steps_per_second": 1.62,
"step": 19200
},
{
"epoch": 0.3845,
"grad_norm": 0.5630941651558155,
"learning_rate": 6.839111111111112e-06,
"loss": 2.371,
"step": 19225
},
{
"epoch": 0.385,
"grad_norm": 0.5472891744821744,
"learning_rate": 6.833555555555557e-06,
"loss": 2.371,
"step": 19250
},
{
"epoch": 0.3855,
"grad_norm": 0.563854124925733,
"learning_rate": 6.8280000000000005e-06,
"loss": 2.3802,
"step": 19275
},
{
"epoch": 0.386,
"grad_norm": 0.5535188682099162,
"learning_rate": 6.822444444444445e-06,
"loss": 2.3668,
"step": 19300
},
{
"epoch": 0.386,
"eval_loss": 2.3904383182525635,
"eval_runtime": 31.5109,
"eval_samples_per_second": 3.237,
"eval_steps_per_second": 1.618,
"step": 19300
},
{
"epoch": 0.3865,
"grad_norm": 0.5847689751509554,
"learning_rate": 6.816888888888889e-06,
"loss": 2.3723,
"step": 19325
},
{
"epoch": 0.387,
"grad_norm": 0.5477508463021717,
"learning_rate": 6.811333333333335e-06,
"loss": 2.3748,
"step": 19350
},
{
"epoch": 0.3875,
"grad_norm": 0.5530662776524751,
"learning_rate": 6.805777777777778e-06,
"loss": 2.372,
"step": 19375
},
{
"epoch": 0.388,
"grad_norm": 0.5627088332087185,
"learning_rate": 6.8002222222222225e-06,
"loss": 2.3649,
"step": 19400
},
{
"epoch": 0.388,
"eval_loss": 2.3902432918548584,
"eval_runtime": 31.5016,
"eval_samples_per_second": 3.238,
"eval_steps_per_second": 1.619,
"step": 19400
},
{
"epoch": 0.3885,
"grad_norm": 0.5917805991329846,
"learning_rate": 6.794666666666667e-06,
"loss": 2.389,
"step": 19425
},
{
"epoch": 0.389,
"grad_norm": 0.5637153841856668,
"learning_rate": 6.789111111111112e-06,
"loss": 2.381,
"step": 19450
},
{
"epoch": 0.3895,
"grad_norm": 0.5638546592221216,
"learning_rate": 6.783555555555557e-06,
"loss": 2.3674,
"step": 19475
},
{
"epoch": 0.39,
"grad_norm": 0.5442599823902955,
"learning_rate": 6.778e-06,
"loss": 2.3684,
"step": 19500
},
{
"epoch": 0.39,
"eval_loss": 2.3898606300354004,
"eval_runtime": 31.4637,
"eval_samples_per_second": 3.242,
"eval_steps_per_second": 1.621,
"step": 19500
},
{
"epoch": 0.3905,
"grad_norm": 0.582280869057288,
"learning_rate": 6.7724444444444446e-06,
"loss": 2.3691,
"step": 19525
},
{
"epoch": 0.391,
"grad_norm": 0.5427829071455205,
"learning_rate": 6.76688888888889e-06,
"loss": 2.372,
"step": 19550
},
{
"epoch": 0.3915,
"grad_norm": 0.5690660297920415,
"learning_rate": 6.761333333333334e-06,
"loss": 2.3696,
"step": 19575
},
{
"epoch": 0.392,
"grad_norm": 0.5887280660795969,
"learning_rate": 6.755777777777779e-06,
"loss": 2.3647,
"step": 19600
},
{
"epoch": 0.392,
"eval_loss": 2.389928102493286,
"eval_runtime": 31.425,
"eval_samples_per_second": 3.246,
"eval_steps_per_second": 1.623,
"step": 19600
},
{
"epoch": 0.3925,
"grad_norm": 0.5706193677763675,
"learning_rate": 6.750222222222222e-06,
"loss": 2.3693,
"step": 19625
},
{
"epoch": 0.393,
"grad_norm": 0.5446782496969111,
"learning_rate": 6.7446666666666674e-06,
"loss": 2.3808,
"step": 19650
},
{
"epoch": 0.3935,
"grad_norm": 0.5571942248079983,
"learning_rate": 6.739111111111112e-06,
"loss": 2.3825,
"step": 19675
},
{
"epoch": 0.394,
"grad_norm": 0.5452923856402259,
"learning_rate": 6.733555555555556e-06,
"loss": 2.3689,
"step": 19700
},
{
"epoch": 0.394,
"eval_loss": 2.3896048069000244,
"eval_runtime": 31.5836,
"eval_samples_per_second": 3.23,
"eval_steps_per_second": 1.615,
"step": 19700
},
{
"epoch": 0.3945,
"grad_norm": 0.5828792681612529,
"learning_rate": 6.728e-06,
"loss": 2.3733,
"step": 19725
},
{
"epoch": 0.395,
"grad_norm": 0.5615201455315739,
"learning_rate": 6.722444444444445e-06,
"loss": 2.3689,
"step": 19750
},
{
"epoch": 0.3955,
"grad_norm": 0.5585669738111114,
"learning_rate": 6.7168888888888894e-06,
"loss": 2.3873,
"step": 19775
},
{
"epoch": 0.396,
"grad_norm": 0.5412795214285975,
"learning_rate": 6.711333333333334e-06,
"loss": 2.3786,
"step": 19800
},
{
"epoch": 0.396,
"eval_loss": 2.3894851207733154,
"eval_runtime": 31.4877,
"eval_samples_per_second": 3.239,
"eval_steps_per_second": 1.62,
"step": 19800
},
{
"epoch": 0.3965,
"grad_norm": 0.5778930227780084,
"learning_rate": 6.705777777777779e-06,
"loss": 2.3766,
"step": 19825
},
{
"epoch": 0.397,
"grad_norm": 0.5682987690385847,
"learning_rate": 6.700222222222223e-06,
"loss": 2.3783,
"step": 19850
},
{
"epoch": 0.3975,
"grad_norm": 0.5763865594632764,
"learning_rate": 6.694666666666667e-06,
"loss": 2.3738,
"step": 19875
},
{
"epoch": 0.398,
"grad_norm": 0.5514756259491804,
"learning_rate": 6.6891111111111115e-06,
"loss": 2.3764,
"step": 19900
},
{
"epoch": 0.398,
"eval_loss": 2.388927698135376,
"eval_runtime": 31.7775,
"eval_samples_per_second": 3.21,
"eval_steps_per_second": 1.605,
"step": 19900
},
{
"epoch": 0.3985,
"grad_norm": 0.5577240438533453,
"learning_rate": 6.683555555555557e-06,
"loss": 2.374,
"step": 19925
},
{
"epoch": 0.399,
"grad_norm": 0.553314104963858,
"learning_rate": 6.678e-06,
"loss": 2.3726,
"step": 19950
},
{
"epoch": 0.3995,
"grad_norm": 0.5615070159418603,
"learning_rate": 6.672444444444445e-06,
"loss": 2.3683,
"step": 19975
},
{
"epoch": 0.4,
"grad_norm": 0.5595654854755111,
"learning_rate": 6.666888888888889e-06,
"loss": 2.3632,
"step": 20000
},
{
"epoch": 0.4,
"eval_loss": 2.389249801635742,
"eval_runtime": 31.7934,
"eval_samples_per_second": 3.208,
"eval_steps_per_second": 1.604,
"step": 20000
},
{
"epoch": 0.4005,
"grad_norm": 0.5697829378233469,
"learning_rate": 6.661333333333334e-06,
"loss": 2.3675,
"step": 20025
},
{
"epoch": 0.401,
"grad_norm": 0.5582897347067457,
"learning_rate": 6.655777777777779e-06,
"loss": 2.3672,
"step": 20050
},
{
"epoch": 0.4015,
"grad_norm": 0.5926925535950422,
"learning_rate": 6.650222222222222e-06,
"loss": 2.3733,
"step": 20075
},
{
"epoch": 0.402,
"grad_norm": 0.544270592824537,
"learning_rate": 6.644666666666667e-06,
"loss": 2.3803,
"step": 20100
},
{
"epoch": 0.402,
"eval_loss": 2.389204502105713,
"eval_runtime": 31.8367,
"eval_samples_per_second": 3.204,
"eval_steps_per_second": 1.602,
"step": 20100
},
{
"epoch": 0.4025,
"grad_norm": 0.5530370407597024,
"learning_rate": 6.639111111111112e-06,
"loss": 2.3633,
"step": 20125
},
{
"epoch": 0.403,
"grad_norm": 0.5731039592674091,
"learning_rate": 6.633555555555556e-06,
"loss": 2.3642,
"step": 20150
},
{
"epoch": 0.4035,
"grad_norm": 0.5599029138977244,
"learning_rate": 6.628e-06,
"loss": 2.378,
"step": 20175
},
{
"epoch": 0.404,
"grad_norm": 0.5833746985921849,
"learning_rate": 6.622444444444444e-06,
"loss": 2.3797,
"step": 20200
},
{
"epoch": 0.404,
"eval_loss": 2.388874053955078,
"eval_runtime": 31.8821,
"eval_samples_per_second": 3.199,
"eval_steps_per_second": 1.6,
"step": 20200
},
{
"epoch": 0.4045,
"grad_norm": 0.5758811776953918,
"learning_rate": 6.6168888888888896e-06,
"loss": 2.3759,
"step": 20225
},
{
"epoch": 0.405,
"grad_norm": 0.559073322750905,
"learning_rate": 6.611333333333334e-06,
"loss": 2.3743,
"step": 20250
},
{
"epoch": 0.4055,
"grad_norm": 0.5638862668814341,
"learning_rate": 6.605777777777778e-06,
"loss": 2.3726,
"step": 20275
},
{
"epoch": 0.406,
"grad_norm": 0.5611977328077278,
"learning_rate": 6.600222222222222e-06,
"loss": 2.3704,
"step": 20300
},
{
"epoch": 0.406,
"eval_loss": 2.3888099193573,
"eval_runtime": 31.7076,
"eval_samples_per_second": 3.217,
"eval_steps_per_second": 1.608,
"step": 20300
},
{
"epoch": 0.4065,
"grad_norm": 0.5664333139784736,
"learning_rate": 6.594666666666667e-06,
"loss": 2.3644,
"step": 20325
},
{
"epoch": 0.407,
"grad_norm": 0.5549238936705829,
"learning_rate": 6.5891111111111116e-06,
"loss": 2.3594,
"step": 20350
},
{
"epoch": 0.4075,
"grad_norm": 0.56940110218198,
"learning_rate": 6.583555555555556e-06,
"loss": 2.3743,
"step": 20375
},
{
"epoch": 0.408,
"grad_norm": 0.5757908141952881,
"learning_rate": 6.578000000000001e-06,
"loss": 2.3774,
"step": 20400
},
{
"epoch": 0.408,
"eval_loss": 2.3890221118927,
"eval_runtime": 31.8193,
"eval_samples_per_second": 3.206,
"eval_steps_per_second": 1.603,
"step": 20400
},
{
"epoch": 0.4085,
"grad_norm": 0.6023338293027314,
"learning_rate": 6.572444444444445e-06,
"loss": 2.3774,
"step": 20425
},
{
"epoch": 0.409,
"grad_norm": 0.5398042018053211,
"learning_rate": 6.566888888888889e-06,
"loss": 2.3785,
"step": 20450
},
{
"epoch": 0.4095,
"grad_norm": 0.5961544515028506,
"learning_rate": 6.561333333333334e-06,
"loss": 2.3867,
"step": 20475
},
{
"epoch": 0.41,
"grad_norm": 0.5517605161130648,
"learning_rate": 6.555777777777779e-06,
"loss": 2.3713,
"step": 20500
},
{
"epoch": 0.41,
"eval_loss": 2.38859224319458,
"eval_runtime": 31.8577,
"eval_samples_per_second": 3.202,
"eval_steps_per_second": 1.601,
"step": 20500
},
{
"epoch": 0.4105,
"grad_norm": 0.5753260144360031,
"learning_rate": 6.550222222222222e-06,
"loss": 2.3653,
"step": 20525
},
{
"epoch": 0.411,
"grad_norm": 0.6404542212883029,
"learning_rate": 6.544666666666667e-06,
"loss": 2.3869,
"step": 20550
},
{
"epoch": 0.4115,
"grad_norm": 0.5777253920326619,
"learning_rate": 6.539111111111112e-06,
"loss": 2.3813,
"step": 20575
},
{
"epoch": 0.412,
"grad_norm": 0.5698546516216307,
"learning_rate": 6.5335555555555565e-06,
"loss": 2.3775,
"step": 20600
},
{
"epoch": 0.412,
"eval_loss": 2.388434648513794,
"eval_runtime": 31.8295,
"eval_samples_per_second": 3.205,
"eval_steps_per_second": 1.602,
"step": 20600
},
{
"epoch": 0.4125,
"grad_norm": 0.5842535685269022,
"learning_rate": 6.528000000000001e-06,
"loss": 2.3896,
"step": 20625
},
{
"epoch": 0.413,
"grad_norm": 0.5595088265556925,
"learning_rate": 6.522444444444444e-06,
"loss": 2.3878,
"step": 20650
},
{
"epoch": 0.4135,
"grad_norm": 0.5751254243123975,
"learning_rate": 6.51688888888889e-06,
"loss": 2.367,
"step": 20675
},
{
"epoch": 0.414,
"grad_norm": 0.5394876201865446,
"learning_rate": 6.511333333333334e-06,
"loss": 2.3776,
"step": 20700
},
{
"epoch": 0.414,
"eval_loss": 2.3883957862854004,
"eval_runtime": 31.8095,
"eval_samples_per_second": 3.207,
"eval_steps_per_second": 1.603,
"step": 20700
},
{
"epoch": 0.4145,
"grad_norm": 0.5601399673585632,
"learning_rate": 6.5057777777777785e-06,
"loss": 2.3679,
"step": 20725
},
{
"epoch": 0.415,
"grad_norm": 0.5715098373270459,
"learning_rate": 6.500222222222222e-06,
"loss": 2.3811,
"step": 20750
},
{
"epoch": 0.4155,
"grad_norm": 0.5517830411358287,
"learning_rate": 6.494666666666667e-06,
"loss": 2.3723,
"step": 20775
},
{
"epoch": 0.416,
"grad_norm": 0.5736440167807991,
"learning_rate": 6.489111111111112e-06,
"loss": 2.3804,
"step": 20800
},
{
"epoch": 0.416,
"eval_loss": 2.388143539428711,
"eval_runtime": 31.9362,
"eval_samples_per_second": 3.194,
"eval_steps_per_second": 1.597,
"step": 20800
},
{
"epoch": 0.4165,
"grad_norm": 0.5772877970336647,
"learning_rate": 6.483555555555556e-06,
"loss": 2.3721,
"step": 20825
},
{
"epoch": 0.417,
"grad_norm": 0.5746556720939705,
"learning_rate": 6.478000000000001e-06,
"loss": 2.3662,
"step": 20850
},
{
"epoch": 0.4175,
"grad_norm": 0.5605696940354651,
"learning_rate": 6.472444444444445e-06,
"loss": 2.3783,
"step": 20875
},
{
"epoch": 0.418,
"grad_norm": 0.5474840165552274,
"learning_rate": 6.466888888888889e-06,
"loss": 2.3799,
"step": 20900
},
{
"epoch": 0.418,
"eval_loss": 2.388044595718384,
"eval_runtime": 31.8313,
"eval_samples_per_second": 3.204,
"eval_steps_per_second": 1.602,
"step": 20900
},
{
"epoch": 0.4185,
"grad_norm": 0.5663680125421368,
"learning_rate": 6.461333333333334e-06,
"loss": 2.3843,
"step": 20925
},
{
"epoch": 0.419,
"grad_norm": 0.5531423851896319,
"learning_rate": 6.455777777777779e-06,
"loss": 2.3661,
"step": 20950
},
{
"epoch": 0.4195,
"grad_norm": 0.5644562314935403,
"learning_rate": 6.450222222222223e-06,
"loss": 2.3762,
"step": 20975
},
{
"epoch": 0.42,
"grad_norm": 0.5653831391780122,
"learning_rate": 6.444666666666667e-06,
"loss": 2.3588,
"step": 21000
},
{
"epoch": 0.42,
"eval_loss": 2.388213872909546,
"eval_runtime": 31.7864,
"eval_samples_per_second": 3.209,
"eval_steps_per_second": 1.604,
"step": 21000
},
{
"epoch": 0.4205,
"grad_norm": 0.5447308357523696,
"learning_rate": 6.439111111111111e-06,
"loss": 2.3803,
"step": 21025
},
{
"epoch": 0.421,
"grad_norm": 0.5426314550064573,
"learning_rate": 6.4335555555555566e-06,
"loss": 2.3798,
"step": 21050
},
{
"epoch": 0.4215,
"grad_norm": 0.5623213994558643,
"learning_rate": 6.428000000000001e-06,
"loss": 2.3855,
"step": 21075
},
{
"epoch": 0.422,
"grad_norm": 0.551782200199429,
"learning_rate": 6.4224444444444445e-06,
"loss": 2.3744,
"step": 21100
},
{
"epoch": 0.422,
"eval_loss": 2.3879234790802,
"eval_runtime": 31.7247,
"eval_samples_per_second": 3.215,
"eval_steps_per_second": 1.608,
"step": 21100
},
{
"epoch": 0.4225,
"grad_norm": 0.527718965025146,
"learning_rate": 6.416888888888889e-06,
"loss": 2.3629,
"step": 21125
},
{
"epoch": 0.423,
"grad_norm": 0.5608708238117702,
"learning_rate": 6.411333333333334e-06,
"loss": 2.3775,
"step": 21150
},
{
"epoch": 0.4235,
"grad_norm": 0.5448339479028284,
"learning_rate": 6.405777777777779e-06,
"loss": 2.379,
"step": 21175
},
{
"epoch": 0.424,
"grad_norm": 0.5418336159854089,
"learning_rate": 6.400222222222223e-06,
"loss": 2.3771,
"step": 21200
},
{
"epoch": 0.424,
"eval_loss": 2.3878672122955322,
"eval_runtime": 31.8891,
"eval_samples_per_second": 3.199,
"eval_steps_per_second": 1.599,
"step": 21200
},
{
"epoch": 0.4245,
"grad_norm": 0.5765916975285049,
"learning_rate": 6.3946666666666665e-06,
"loss": 2.3838,
"step": 21225
},
{
"epoch": 0.425,
"grad_norm": 0.5482787584221817,
"learning_rate": 6.389111111111112e-06,
"loss": 2.3751,
"step": 21250
},
{
"epoch": 0.4255,
"grad_norm": 0.5592623692636863,
"learning_rate": 6.383555555555556e-06,
"loss": 2.3714,
"step": 21275
},
{
"epoch": 0.426,
"grad_norm": 0.5502456266750644,
"learning_rate": 6.378000000000001e-06,
"loss": 2.3687,
"step": 21300
},
{
"epoch": 0.426,
"eval_loss": 2.387702226638794,
"eval_runtime": 31.8474,
"eval_samples_per_second": 3.203,
"eval_steps_per_second": 1.601,
"step": 21300
},
{
"epoch": 0.4265,
"grad_norm": 0.5508844144432443,
"learning_rate": 6.372444444444444e-06,
"loss": 2.3705,
"step": 21325
},
{
"epoch": 0.427,
"grad_norm": 0.5551955771008479,
"learning_rate": 6.366888888888889e-06,
"loss": 2.3616,
"step": 21350
},
{
"epoch": 0.4275,
"grad_norm": 0.5482174863813819,
"learning_rate": 6.361333333333334e-06,
"loss": 2.3679,
"step": 21375
},
{
"epoch": 0.428,
"grad_norm": 0.540793837360148,
"learning_rate": 6.355777777777778e-06,
"loss": 2.3724,
"step": 21400
},
{
"epoch": 0.428,
"eval_loss": 2.3876450061798096,
"eval_runtime": 32.2051,
"eval_samples_per_second": 3.167,
"eval_steps_per_second": 1.584,
"step": 21400
},
{
"epoch": 0.4285,
"grad_norm": 0.5478812262209652,
"learning_rate": 6.3502222222222235e-06,
"loss": 2.3639,
"step": 21425
},
{
"epoch": 0.429,
"grad_norm": 0.5598419449976438,
"learning_rate": 6.344666666666667e-06,
"loss": 2.3686,
"step": 21450
},
{
"epoch": 0.4295,
"grad_norm": 0.5650989625187698,
"learning_rate": 6.339111111111111e-06,
"loss": 2.3755,
"step": 21475
},
{
"epoch": 0.43,
"grad_norm": 0.5521104434834965,
"learning_rate": 6.333555555555556e-06,
"loss": 2.3819,
"step": 21500
},
{
"epoch": 0.43,
"eval_loss": 2.386732578277588,
"eval_runtime": 32.423,
"eval_samples_per_second": 3.146,
"eval_steps_per_second": 1.573,
"step": 21500
},
{
"epoch": 0.4305,
"grad_norm": 0.5718504697288973,
"learning_rate": 6.328000000000001e-06,
"loss": 2.3768,
"step": 21525
},
{
"epoch": 0.431,
"grad_norm": 0.5647383482527034,
"learning_rate": 6.3224444444444455e-06,
"loss": 2.3634,
"step": 21550
},
{
"epoch": 0.4315,
"grad_norm": 0.5740444089490578,
"learning_rate": 6.316888888888889e-06,
"loss": 2.3683,
"step": 21575
},
{
"epoch": 0.432,
"grad_norm": 0.5468815860778439,
"learning_rate": 6.3113333333333334e-06,
"loss": 2.3775,
"step": 21600
},
{
"epoch": 0.432,
"eval_loss": 2.386624813079834,
"eval_runtime": 32.2361,
"eval_samples_per_second": 3.164,
"eval_steps_per_second": 1.582,
"step": 21600
},
{
"epoch": 0.4325,
"grad_norm": 0.5491782166979611,
"learning_rate": 6.305777777777779e-06,
"loss": 2.3678,
"step": 21625
},
{
"epoch": 0.433,
"grad_norm": 0.5493956319744467,
"learning_rate": 6.300222222222223e-06,
"loss": 2.3632,
"step": 21650
},
{
"epoch": 0.4335,
"grad_norm": 0.5517199994093782,
"learning_rate": 6.294666666666667e-06,
"loss": 2.3719,
"step": 21675
},
{
"epoch": 0.434,
"grad_norm": 0.5480082798934808,
"learning_rate": 6.289111111111111e-06,
"loss": 2.3705,
"step": 21700
},
{
"epoch": 0.434,
"eval_loss": 2.386605978012085,
"eval_runtime": 31.811,
"eval_samples_per_second": 3.206,
"eval_steps_per_second": 1.603,
"step": 21700
},
{
"epoch": 0.4345,
"grad_norm": 0.5988374708555845,
"learning_rate": 6.283555555555556e-06,
"loss": 2.3736,
"step": 21725
},
{
"epoch": 0.435,
"grad_norm": 0.5394989364015422,
"learning_rate": 6.278000000000001e-06,
"loss": 2.38,
"step": 21750
},
{
"epoch": 0.4355,
"grad_norm": 0.5660475248416822,
"learning_rate": 6.272444444444445e-06,
"loss": 2.3712,
"step": 21775
},
{
"epoch": 0.436,
"grad_norm": 0.5824076374736812,
"learning_rate": 6.266888888888889e-06,
"loss": 2.3781,
"step": 21800
},
{
"epoch": 0.436,
"eval_loss": 2.3868014812469482,
"eval_runtime": 32.0011,
"eval_samples_per_second": 3.187,
"eval_steps_per_second": 1.594,
"step": 21800
},
{
"epoch": 0.4365,
"grad_norm": 0.5604649354431509,
"learning_rate": 6.261333333333334e-06,
"loss": 2.3673,
"step": 21825
},
{
"epoch": 0.437,
"grad_norm": 0.5581917280058185,
"learning_rate": 6.255777777777778e-06,
"loss": 2.3575,
"step": 21850
},
{
"epoch": 0.4375,
"grad_norm": 0.5682187519985219,
"learning_rate": 6.250222222222223e-06,
"loss": 2.3752,
"step": 21875
},
{
"epoch": 0.438,
"grad_norm": 0.5343819916754123,
"learning_rate": 6.244666666666666e-06,
"loss": 2.3688,
"step": 21900
},
{
"epoch": 0.438,
"eval_loss": 2.3865694999694824,
"eval_runtime": 31.8681,
"eval_samples_per_second": 3.201,
"eval_steps_per_second": 1.6,
"step": 21900
},
{
"epoch": 0.4385,
"grad_norm": 0.6084740129821103,
"learning_rate": 6.2391111111111115e-06,
"loss": 2.3611,
"step": 21925
},
{
"epoch": 0.439,
"grad_norm": 0.5550908983577711,
"learning_rate": 6.233555555555556e-06,
"loss": 2.364,
"step": 21950
},
{
"epoch": 0.4395,
"grad_norm": 0.5605896822575689,
"learning_rate": 6.228e-06,
"loss": 2.3875,
"step": 21975
},
{
"epoch": 0.44,
"grad_norm": 0.5679795530728957,
"learning_rate": 6.222444444444446e-06,
"loss": 2.3637,
"step": 22000
},
{
"epoch": 0.44,
"eval_loss": 2.3865110874176025,
"eval_runtime": 31.8116,
"eval_samples_per_second": 3.206,
"eval_steps_per_second": 1.603,
"step": 22000
},
{
"epoch": 0.4405,
"grad_norm": 0.5533397760322247,
"learning_rate": 6.216888888888889e-06,
"loss": 2.371,
"step": 22025
},
{
"epoch": 0.441,
"grad_norm": 0.5551275205002794,
"learning_rate": 6.2113333333333336e-06,
"loss": 2.3684,
"step": 22050
},
{
"epoch": 0.4415,
"grad_norm": 0.5520948023453888,
"learning_rate": 6.205777777777778e-06,
"loss": 2.3602,
"step": 22075
},
{
"epoch": 0.442,
"grad_norm": 0.5679529169964138,
"learning_rate": 6.200222222222223e-06,
"loss": 2.3867,
"step": 22100
},
{
"epoch": 0.442,
"eval_loss": 2.3863022327423096,
"eval_runtime": 32.0036,
"eval_samples_per_second": 3.187,
"eval_steps_per_second": 1.594,
"step": 22100
},
{
"epoch": 0.4425,
"grad_norm": 0.5619895216629556,
"learning_rate": 6.194666666666668e-06,
"loss": 2.3701,
"step": 22125
},
{
"epoch": 0.443,
"grad_norm": 0.5515875809771505,
"learning_rate": 6.189111111111111e-06,
"loss": 2.3734,
"step": 22150
},
{
"epoch": 0.4435,
"grad_norm": 0.5686425996531567,
"learning_rate": 6.1835555555555556e-06,
"loss": 2.3698,
"step": 22175
},
{
"epoch": 0.444,
"grad_norm": 0.5580871882801617,
"learning_rate": 6.178000000000001e-06,
"loss": 2.3676,
"step": 22200
},
{
"epoch": 0.444,
"eval_loss": 2.3865246772766113,
"eval_runtime": 31.7174,
"eval_samples_per_second": 3.216,
"eval_steps_per_second": 1.608,
"step": 22200
},
{
"epoch": 0.4445,
"grad_norm": 0.5784261034385078,
"learning_rate": 6.172444444444445e-06,
"loss": 2.3723,
"step": 22225
},
{
"epoch": 0.445,
"grad_norm": 0.5570688655308026,
"learning_rate": 6.166888888888889e-06,
"loss": 2.3709,
"step": 22250
},
{
"epoch": 0.4455,
"grad_norm": 0.5716930839552549,
"learning_rate": 6.161333333333334e-06,
"loss": 2.3734,
"step": 22275
},
{
"epoch": 0.446,
"grad_norm": 0.5550340902020618,
"learning_rate": 6.1557777777777784e-06,
"loss": 2.3648,
"step": 22300
},
{
"epoch": 0.446,
"eval_loss": 2.38633131980896,
"eval_runtime": 31.7943,
"eval_samples_per_second": 3.208,
"eval_steps_per_second": 1.604,
"step": 22300
},
{
"epoch": 0.4465,
"grad_norm": 0.5719936248106342,
"learning_rate": 6.150222222222223e-06,
"loss": 2.3751,
"step": 22325
},
{
"epoch": 0.447,
"grad_norm": 0.5616671760742846,
"learning_rate": 6.144666666666668e-06,
"loss": 2.3748,
"step": 22350
},
{
"epoch": 0.4475,
"grad_norm": 0.5785985644213604,
"learning_rate": 6.139111111111112e-06,
"loss": 2.3837,
"step": 22375
},
{
"epoch": 0.448,
"grad_norm": 0.5645620599147937,
"learning_rate": 6.133555555555556e-06,
"loss": 2.3745,
"step": 22400
},
{
"epoch": 0.448,
"eval_loss": 2.3862569332122803,
"eval_runtime": 31.9593,
"eval_samples_per_second": 3.192,
"eval_steps_per_second": 1.596,
"step": 22400
},
{
"epoch": 0.4485,
"grad_norm": 0.5469950240628229,
"learning_rate": 6.1280000000000005e-06,
"loss": 2.3642,
"step": 22425
},
{
"epoch": 0.449,
"grad_norm": 0.5324393599981698,
"learning_rate": 6.122444444444446e-06,
"loss": 2.379,
"step": 22450
},
{
"epoch": 0.4495,
"grad_norm": 0.5519962387254249,
"learning_rate": 6.116888888888889e-06,
"loss": 2.3635,
"step": 22475
},
{
"epoch": 0.45,
"grad_norm": 0.5588336399127953,
"learning_rate": 6.111333333333334e-06,
"loss": 2.3718,
"step": 22500
},
{
"epoch": 0.45,
"eval_loss": 2.385950803756714,
"eval_runtime": 31.7208,
"eval_samples_per_second": 3.216,
"eval_steps_per_second": 1.608,
"step": 22500
},
{
"epoch": 0.4505,
"grad_norm": 0.5923640418917652,
"learning_rate": 6.105777777777778e-06,
"loss": 2.3719,
"step": 22525
},
{
"epoch": 0.451,
"grad_norm": 0.5653562982992056,
"learning_rate": 6.100222222222223e-06,
"loss": 2.3808,
"step": 22550
},
{
"epoch": 0.4515,
"grad_norm": 0.5636846873459127,
"learning_rate": 6.094666666666668e-06,
"loss": 2.3641,
"step": 22575
},
{
"epoch": 0.452,
"grad_norm": 0.5850003926588586,
"learning_rate": 6.089111111111111e-06,
"loss": 2.3572,
"step": 22600
},
{
"epoch": 0.452,
"eval_loss": 2.386296033859253,
"eval_runtime": 31.8709,
"eval_samples_per_second": 3.2,
"eval_steps_per_second": 1.6,
"step": 22600
},
{
"epoch": 0.4525,
"grad_norm": 0.5334735362781007,
"learning_rate": 6.083555555555556e-06,
"loss": 2.3732,
"step": 22625
},
{
"epoch": 0.453,
"grad_norm": 0.5809776122118506,
"learning_rate": 6.078000000000001e-06,
"loss": 2.3842,
"step": 22650
},
{
"epoch": 0.4535,
"grad_norm": 0.5438625993671827,
"learning_rate": 6.072444444444445e-06,
"loss": 2.3802,
"step": 22675
},
{
"epoch": 0.454,
"grad_norm": 0.5581266930595516,
"learning_rate": 6.06688888888889e-06,
"loss": 2.3757,
"step": 22700
},
{
"epoch": 0.454,
"eval_loss": 2.3853445053100586,
"eval_runtime": 31.9465,
"eval_samples_per_second": 3.193,
"eval_steps_per_second": 1.596,
"step": 22700
},
{
"epoch": 0.4545,
"grad_norm": 0.5665471911134969,
"learning_rate": 6.061333333333333e-06,
"loss": 2.3632,
"step": 22725
},
{
"epoch": 0.455,
"grad_norm": 0.5602817372745607,
"learning_rate": 6.0557777777777785e-06,
"loss": 2.3759,
"step": 22750
},
{
"epoch": 0.4555,
"grad_norm": 0.5546395592927382,
"learning_rate": 6.050222222222223e-06,
"loss": 2.3654,
"step": 22775
},
{
"epoch": 0.456,
"grad_norm": 0.5466059675730089,
"learning_rate": 6.044666666666667e-06,
"loss": 2.3747,
"step": 22800
},
{
"epoch": 0.456,
"eval_loss": 2.3854382038116455,
"eval_runtime": 31.8135,
"eval_samples_per_second": 3.206,
"eval_steps_per_second": 1.603,
"step": 22800
},
{
"epoch": 0.4565,
"grad_norm": 0.556576922176953,
"learning_rate": 6.039111111111111e-06,
"loss": 2.3752,
"step": 22825
},
{
"epoch": 0.457,
"grad_norm": 0.5587160453347744,
"learning_rate": 6.033555555555556e-06,
"loss": 2.3753,
"step": 22850
},
{
"epoch": 0.4575,
"grad_norm": 0.5581750567947692,
"learning_rate": 6.0280000000000006e-06,
"loss": 2.3744,
"step": 22875
},
{
"epoch": 0.458,
"grad_norm": 0.5665211201226871,
"learning_rate": 6.022444444444445e-06,
"loss": 2.3707,
"step": 22900
},
{
"epoch": 0.458,
"eval_loss": 2.3854050636291504,
"eval_runtime": 31.8453,
"eval_samples_per_second": 3.203,
"eval_steps_per_second": 1.601,
"step": 22900
},
{
"epoch": 0.4585,
"grad_norm": 0.559138638343371,
"learning_rate": 6.01688888888889e-06,
"loss": 2.3771,
"step": 22925
},
{
"epoch": 0.459,
"grad_norm": 0.5765629867304476,
"learning_rate": 6.011333333333334e-06,
"loss": 2.3751,
"step": 22950
},
{
"epoch": 0.4595,
"grad_norm": 0.5697804508664757,
"learning_rate": 6.005777777777778e-06,
"loss": 2.3837,
"step": 22975
},
{
"epoch": 0.46,
"grad_norm": 0.5813773268685459,
"learning_rate": 6.000222222222223e-06,
"loss": 2.37,
"step": 23000
},
{
"epoch": 0.46,
"eval_loss": 2.385390520095825,
"eval_runtime": 31.767,
"eval_samples_per_second": 3.211,
"eval_steps_per_second": 1.605,
"step": 23000
},
{
"epoch": 0.4605,
"grad_norm": 0.5644614073323889,
"learning_rate": 5.994666666666668e-06,
"loss": 2.3627,
"step": 23025
},
{
"epoch": 0.461,
"grad_norm": 0.561196100799294,
"learning_rate": 5.989111111111111e-06,
"loss": 2.373,
"step": 23050
},
{
"epoch": 0.4615,
"grad_norm": 0.5988172465498709,
"learning_rate": 5.983555555555556e-06,
"loss": 2.3625,
"step": 23075
},
{
"epoch": 0.462,
"grad_norm": 0.5561927981892911,
"learning_rate": 5.978e-06,
"loss": 2.366,
"step": 23100
},
{
"epoch": 0.462,
"eval_loss": 2.3851592540740967,
"eval_runtime": 31.9972,
"eval_samples_per_second": 3.188,
"eval_steps_per_second": 1.594,
"step": 23100
},
{
"epoch": 0.4625,
"grad_norm": 0.5473375939412587,
"learning_rate": 5.9724444444444454e-06,
"loss": 2.3577,
"step": 23125
},
{
"epoch": 0.463,
"grad_norm": 0.5422432723666715,
"learning_rate": 5.96688888888889e-06,
"loss": 2.3724,
"step": 23150
},
{
"epoch": 0.4635,
"grad_norm": 0.5459369802725026,
"learning_rate": 5.961333333333333e-06,
"loss": 2.3693,
"step": 23175
},
{
"epoch": 0.464,
"grad_norm": 0.5602391995824985,
"learning_rate": 5.955777777777778e-06,
"loss": 2.3662,
"step": 23200
},
{
"epoch": 0.464,
"eval_loss": 2.384812593460083,
"eval_runtime": 31.7736,
"eval_samples_per_second": 3.21,
"eval_steps_per_second": 1.605,
"step": 23200
},
{
"epoch": 0.4645,
"grad_norm": 0.5382771454200044,
"learning_rate": 5.950222222222223e-06,
"loss": 2.373,
"step": 23225
},
{
"epoch": 0.465,
"grad_norm": 0.5616408548500356,
"learning_rate": 5.9446666666666675e-06,
"loss": 2.3744,
"step": 23250
},
{
"epoch": 0.4655,
"grad_norm": 0.5626270768454595,
"learning_rate": 5.939111111111111e-06,
"loss": 2.3745,
"step": 23275
},
{
"epoch": 0.466,
"grad_norm": 0.5771198592247021,
"learning_rate": 5.933555555555555e-06,
"loss": 2.3712,
"step": 23300
},
{
"epoch": 0.466,
"eval_loss": 2.385037660598755,
"eval_runtime": 31.6688,
"eval_samples_per_second": 3.221,
"eval_steps_per_second": 1.61,
"step": 23300
},
{
"epoch": 0.4665,
"grad_norm": 0.553677767303205,
"learning_rate": 5.928000000000001e-06,
"loss": 2.3688,
"step": 23325
},
{
"epoch": 0.467,
"grad_norm": 0.5761122434148291,
"learning_rate": 5.922444444444445e-06,
"loss": 2.3697,
"step": 23350
},
{
"epoch": 0.4675,
"grad_norm": 0.5776134096430138,
"learning_rate": 5.9168888888888895e-06,
"loss": 2.3696,
"step": 23375
},
{
"epoch": 0.468,
"grad_norm": 0.5410943763458229,
"learning_rate": 5.911333333333333e-06,
"loss": 2.3748,
"step": 23400
},
{
"epoch": 0.468,
"eval_loss": 2.3850579261779785,
"eval_runtime": 31.7506,
"eval_samples_per_second": 3.213,
"eval_steps_per_second": 1.606,
"step": 23400
},
{
"epoch": 0.4685,
"grad_norm": 0.5496846088073756,
"learning_rate": 5.905777777777778e-06,
"loss": 2.3631,
"step": 23425
},
{
"epoch": 0.469,
"grad_norm": 0.5489837887647091,
"learning_rate": 5.900222222222223e-06,
"loss": 2.3752,
"step": 23450
},
{
"epoch": 0.4695,
"grad_norm": 0.5595321821458019,
"learning_rate": 5.894666666666667e-06,
"loss": 2.3681,
"step": 23475
},
{
"epoch": 0.47,
"grad_norm": 0.5441176871533538,
"learning_rate": 5.889111111111112e-06,
"loss": 2.3689,
"step": 23500
},
{
"epoch": 0.47,
"eval_loss": 2.3847615718841553,
"eval_runtime": 31.7515,
"eval_samples_per_second": 3.212,
"eval_steps_per_second": 1.606,
"step": 23500
},
{
"epoch": 0.4705,
"grad_norm": 0.5591005943894303,
"learning_rate": 5.883555555555556e-06,
"loss": 2.3687,
"step": 23525
},
{
"epoch": 0.471,
"grad_norm": 0.5569068986313633,
"learning_rate": 5.878e-06,
"loss": 2.3579,
"step": 23550
},
{
"epoch": 0.4715,
"grad_norm": 0.5544550604142251,
"learning_rate": 5.872444444444445e-06,
"loss": 2.3654,
"step": 23575
},
{
"epoch": 0.472,
"grad_norm": 0.5682698532685105,
"learning_rate": 5.86688888888889e-06,
"loss": 2.3686,
"step": 23600
},
{
"epoch": 0.472,
"eval_loss": 2.384906053543091,
"eval_runtime": 31.7623,
"eval_samples_per_second": 3.211,
"eval_steps_per_second": 1.606,
"step": 23600
},
{
"epoch": 0.4725,
"grad_norm": 0.5754081011772445,
"learning_rate": 5.8613333333333335e-06,
"loss": 2.3629,
"step": 23625
},
{
"epoch": 0.473,
"grad_norm": 0.605492062724259,
"learning_rate": 5.855777777777778e-06,
"loss": 2.3702,
"step": 23650
},
{
"epoch": 0.4735,
"grad_norm": 0.5407520724247802,
"learning_rate": 5.850222222222222e-06,
"loss": 2.3652,
"step": 23675
},
{
"epoch": 0.474,
"grad_norm": 0.5531865604429913,
"learning_rate": 5.8446666666666676e-06,
"loss": 2.3724,
"step": 23700
},
{
"epoch": 0.474,
"eval_loss": 2.3844547271728516,
"eval_runtime": 31.833,
"eval_samples_per_second": 3.204,
"eval_steps_per_second": 1.602,
"step": 23700
},
{
"epoch": 0.4745,
"grad_norm": 0.573840223481603,
"learning_rate": 5.839111111111112e-06,
"loss": 2.365,
"step": 23725
},
{
"epoch": 0.475,
"grad_norm": 0.545580569851831,
"learning_rate": 5.8335555555555555e-06,
"loss": 2.3813,
"step": 23750
},
{
"epoch": 0.4755,
"grad_norm": 0.551471960312376,
"learning_rate": 5.828e-06,
"loss": 2.3617,
"step": 23775
},
{
"epoch": 0.476,
"grad_norm": 0.5953130526303944,
"learning_rate": 5.822444444444445e-06,
"loss": 2.3781,
"step": 23800
},
{
"epoch": 0.476,
"eval_loss": 2.38433575630188,
"eval_runtime": 31.8506,
"eval_samples_per_second": 3.202,
"eval_steps_per_second": 1.601,
"step": 23800
},
{
"epoch": 0.4765,
"grad_norm": 0.5604797565202618,
"learning_rate": 5.81688888888889e-06,
"loss": 2.3716,
"step": 23825
},
{
"epoch": 0.477,
"grad_norm": 0.554661200228578,
"learning_rate": 5.811333333333333e-06,
"loss": 2.3724,
"step": 23850
},
{
"epoch": 0.4775,
"grad_norm": 0.5534736868914567,
"learning_rate": 5.8057777777777775e-06,
"loss": 2.3754,
"step": 23875
},
{
"epoch": 0.478,
"grad_norm": 0.541434243018937,
"learning_rate": 5.800222222222223e-06,
"loss": 2.3612,
"step": 23900
},
{
"epoch": 0.478,
"eval_loss": 2.3843014240264893,
"eval_runtime": 31.7803,
"eval_samples_per_second": 3.21,
"eval_steps_per_second": 1.605,
"step": 23900
},
{
"epoch": 0.4785,
"grad_norm": 0.5557683143124796,
"learning_rate": 5.794666666666667e-06,
"loss": 2.3639,
"step": 23925
},
{
"epoch": 0.479,
"grad_norm": 0.5799527873689908,
"learning_rate": 5.789111111111112e-06,
"loss": 2.373,
"step": 23950
},
{
"epoch": 0.4795,
"grad_norm": 0.590904770982699,
"learning_rate": 5.783555555555556e-06,
"loss": 2.3778,
"step": 23975
},
{
"epoch": 0.48,
"grad_norm": 0.5561040991296016,
"learning_rate": 5.778e-06,
"loss": 2.3552,
"step": 24000
},
{
"epoch": 0.48,
"eval_loss": 2.3842599391937256,
"eval_runtime": 31.7209,
"eval_samples_per_second": 3.216,
"eval_steps_per_second": 1.608,
"step": 24000
},
{
"epoch": 0.4805,
"grad_norm": 0.5640470742370431,
"learning_rate": 5.772444444444445e-06,
"loss": 2.3622,
"step": 24025
},
{
"epoch": 0.481,
"grad_norm": 0.5463055265939479,
"learning_rate": 5.76688888888889e-06,
"loss": 2.3609,
"step": 24050
},
{
"epoch": 0.4815,
"grad_norm": 0.566766243472923,
"learning_rate": 5.7613333333333345e-06,
"loss": 2.3824,
"step": 24075
},
{
"epoch": 0.482,
"grad_norm": 0.5584478304684121,
"learning_rate": 5.755777777777778e-06,
"loss": 2.3744,
"step": 24100
},
{
"epoch": 0.482,
"eval_loss": 2.384092330932617,
"eval_runtime": 31.7835,
"eval_samples_per_second": 3.209,
"eval_steps_per_second": 1.605,
"step": 24100
},
{
"epoch": 0.4825,
"grad_norm": 0.5731740442874064,
"learning_rate": 5.7502222222222224e-06,
"loss": 2.3733,
"step": 24125
},
{
"epoch": 0.483,
"grad_norm": 0.5552901331066319,
"learning_rate": 5.744666666666668e-06,
"loss": 2.3755,
"step": 24150
},
{
"epoch": 0.4835,
"grad_norm": 0.5535450397337369,
"learning_rate": 5.739111111111112e-06,
"loss": 2.3777,
"step": 24175
},
{
"epoch": 0.484,
"grad_norm": 0.5622658531288893,
"learning_rate": 5.733555555555556e-06,
"loss": 2.3671,
"step": 24200
},
{
"epoch": 0.484,
"eval_loss": 2.3840036392211914,
"eval_runtime": 31.7615,
"eval_samples_per_second": 3.211,
"eval_steps_per_second": 1.606,
"step": 24200
},
{
"epoch": 0.4845,
"grad_norm": 0.5526779804173192,
"learning_rate": 5.728e-06,
"loss": 2.374,
"step": 24225
},
{
"epoch": 0.485,
"grad_norm": 0.5383978006357063,
"learning_rate": 5.722444444444445e-06,
"loss": 2.3664,
"step": 24250
},
{
"epoch": 0.4855,
"grad_norm": 0.5542389650019858,
"learning_rate": 5.71688888888889e-06,
"loss": 2.3692,
"step": 24275
},
{
"epoch": 0.486,
"grad_norm": 0.5542459781042757,
"learning_rate": 5.711333333333334e-06,
"loss": 2.379,
"step": 24300
},
{
"epoch": 0.486,
"eval_loss": 2.3838605880737305,
"eval_runtime": 31.8313,
"eval_samples_per_second": 3.204,
"eval_steps_per_second": 1.602,
"step": 24300
},
{
"epoch": 0.4865,
"grad_norm": 0.5371257785961498,
"learning_rate": 5.705777777777778e-06,
"loss": 2.3759,
"step": 24325
},
{
"epoch": 0.487,
"grad_norm": 0.5334074315105899,
"learning_rate": 5.700222222222223e-06,
"loss": 2.3842,
"step": 24350
},
{
"epoch": 0.4875,
"grad_norm": 0.5712028005119992,
"learning_rate": 5.694666666666667e-06,
"loss": 2.373,
"step": 24375
},
{
"epoch": 0.488,
"grad_norm": 0.5527635817323101,
"learning_rate": 5.689111111111112e-06,
"loss": 2.3632,
"step": 24400
},
{
"epoch": 0.488,
"eval_loss": 2.383908987045288,
"eval_runtime": 31.8006,
"eval_samples_per_second": 3.207,
"eval_steps_per_second": 1.604,
"step": 24400
},
{
"epoch": 0.4885,
"grad_norm": 0.5497988709199122,
"learning_rate": 5.683555555555555e-06,
"loss": 2.3674,
"step": 24425
},
{
"epoch": 0.489,
"grad_norm": 0.5478963614360626,
"learning_rate": 5.6780000000000005e-06,
"loss": 2.3795,
"step": 24450
},
{
"epoch": 0.4895,
"grad_norm": 0.5418443665589167,
"learning_rate": 5.672444444444445e-06,
"loss": 2.3769,
"step": 24475
},
{
"epoch": 0.49,
"grad_norm": 0.5637739038034214,
"learning_rate": 5.666888888888889e-06,
"loss": 2.3754,
"step": 24500
},
{
"epoch": 0.49,
"eval_loss": 2.3835647106170654,
"eval_runtime": 31.695,
"eval_samples_per_second": 3.218,
"eval_steps_per_second": 1.609,
"step": 24500
},
{
"epoch": 0.4905,
"grad_norm": 0.5352738455560374,
"learning_rate": 5.661333333333335e-06,
"loss": 2.3665,
"step": 24525
},
{
"epoch": 0.491,
"grad_norm": 0.5593898219847685,
"learning_rate": 5.655777777777778e-06,
"loss": 2.3621,
"step": 24550
},
{
"epoch": 0.4915,
"grad_norm": 0.5340153226573613,
"learning_rate": 5.6502222222222225e-06,
"loss": 2.3704,
"step": 24575
},
{
"epoch": 0.492,
"grad_norm": 0.5434269177198789,
"learning_rate": 5.644666666666667e-06,
"loss": 2.3707,
"step": 24600
},
{
"epoch": 0.492,
"eval_loss": 2.38376522064209,
"eval_runtime": 31.8117,
"eval_samples_per_second": 3.206,
"eval_steps_per_second": 1.603,
"step": 24600
},
{
"epoch": 0.4925,
"grad_norm": 0.5555073289213541,
"learning_rate": 5.639111111111112e-06,
"loss": 2.3702,
"step": 24625
},
{
"epoch": 0.493,
"grad_norm": 0.5608796205061338,
"learning_rate": 5.633555555555557e-06,
"loss": 2.373,
"step": 24650
},
{
"epoch": 0.4935,
"grad_norm": 0.5639681025688454,
"learning_rate": 5.628e-06,
"loss": 2.3641,
"step": 24675
},
{
"epoch": 0.494,
"grad_norm": 0.5610119210421548,
"learning_rate": 5.6224444444444446e-06,
"loss": 2.372,
"step": 24700
},
{
"epoch": 0.494,
"eval_loss": 2.383573293685913,
"eval_runtime": 31.6948,
"eval_samples_per_second": 3.218,
"eval_steps_per_second": 1.609,
"step": 24700
},
{
"epoch": 0.4945,
"grad_norm": 0.5442392815853518,
"learning_rate": 5.61688888888889e-06,
"loss": 2.3651,
"step": 24725
},
{
"epoch": 0.495,
"grad_norm": 0.5562532962787945,
"learning_rate": 5.611333333333334e-06,
"loss": 2.3705,
"step": 24750
},
{
"epoch": 0.4955,
"grad_norm": 0.5488206873990799,
"learning_rate": 5.605777777777778e-06,
"loss": 2.3623,
"step": 24775
},
{
"epoch": 0.496,
"grad_norm": 0.5653453728755813,
"learning_rate": 5.600222222222222e-06,
"loss": 2.3746,
"step": 24800
},
{
"epoch": 0.496,
"eval_loss": 2.383600950241089,
"eval_runtime": 31.8215,
"eval_samples_per_second": 3.205,
"eval_steps_per_second": 1.603,
"step": 24800
},
{
"epoch": 0.4965,
"grad_norm": 0.5714575887868236,
"learning_rate": 5.5946666666666674e-06,
"loss": 2.3698,
"step": 24825
},
{
"epoch": 0.497,
"grad_norm": 0.5479503311373944,
"learning_rate": 5.589111111111112e-06,
"loss": 2.3753,
"step": 24850
},
{
"epoch": 0.4975,
"grad_norm": 0.5465196721627547,
"learning_rate": 5.583555555555556e-06,
"loss": 2.3627,
"step": 24875
},
{
"epoch": 0.498,
"grad_norm": 0.5545182382115218,
"learning_rate": 5.578e-06,
"loss": 2.3623,
"step": 24900
},
{
"epoch": 0.498,
"eval_loss": 2.383317470550537,
"eval_runtime": 31.8409,
"eval_samples_per_second": 3.203,
"eval_steps_per_second": 1.602,
"step": 24900
},
{
"epoch": 0.4985,
"grad_norm": 0.5624766646317664,
"learning_rate": 5.572444444444445e-06,
"loss": 2.3659,
"step": 24925
},
{
"epoch": 0.499,
"grad_norm": 0.5642199082921324,
"learning_rate": 5.5668888888888894e-06,
"loss": 2.3684,
"step": 24950
},
{
"epoch": 0.4995,
"grad_norm": 0.5917431910025611,
"learning_rate": 5.561333333333334e-06,
"loss": 2.3723,
"step": 24975
},
{
"epoch": 0.5,
"grad_norm": 0.5530201275821488,
"learning_rate": 5.555777777777777e-06,
"loss": 2.3685,
"step": 25000
},
{
"epoch": 0.5,
"eval_loss": 2.3832170963287354,
"eval_runtime": 31.7959,
"eval_samples_per_second": 3.208,
"eval_steps_per_second": 1.604,
"step": 25000
},
{
"epoch": 0.5005,
"grad_norm": 0.5509816083841773,
"learning_rate": 5.550222222222223e-06,
"loss": 2.3559,
"step": 25025
},
{
"epoch": 0.501,
"grad_norm": 0.5547472529206742,
"learning_rate": 5.544666666666667e-06,
"loss": 2.3648,
"step": 25050
},
{
"epoch": 0.5015,
"grad_norm": 0.546260980184131,
"learning_rate": 5.5391111111111115e-06,
"loss": 2.3701,
"step": 25075
},
{
"epoch": 0.502,
"grad_norm": 0.5481216862316385,
"learning_rate": 5.533555555555557e-06,
"loss": 2.3798,
"step": 25100
},
{
"epoch": 0.502,
"eval_loss": 2.38305926322937,
"eval_runtime": 32.0473,
"eval_samples_per_second": 3.183,
"eval_steps_per_second": 1.591,
"step": 25100
},
{
"epoch": 0.5025,
"grad_norm": 0.5670640165543723,
"learning_rate": 5.528e-06,
"loss": 2.3622,
"step": 25125
},
{
"epoch": 0.503,
"grad_norm": 0.5463137917421312,
"learning_rate": 5.522444444444445e-06,
"loss": 2.3719,
"step": 25150
},
{
"epoch": 0.5035,
"grad_norm": 0.5400999701410277,
"learning_rate": 5.516888888888889e-06,
"loss": 2.3616,
"step": 25175
},
{
"epoch": 0.504,
"grad_norm": 0.5802126499364532,
"learning_rate": 5.511333333333334e-06,
"loss": 2.3721,
"step": 25200
},
{
"epoch": 0.504,
"eval_loss": 2.3829147815704346,
"eval_runtime": 31.7438,
"eval_samples_per_second": 3.213,
"eval_steps_per_second": 1.607,
"step": 25200
},
{
"epoch": 0.5045,
"grad_norm": 0.5435607747773122,
"learning_rate": 5.505777777777779e-06,
"loss": 2.3603,
"step": 25225
},
{
"epoch": 0.505,
"grad_norm": 0.5453890322127348,
"learning_rate": 5.500222222222222e-06,
"loss": 2.3636,
"step": 25250
},
{
"epoch": 0.5055,
"grad_norm": 0.5477131217196112,
"learning_rate": 5.494666666666667e-06,
"loss": 2.3697,
"step": 25275
},
{
"epoch": 0.506,
"grad_norm": 0.5621665226631756,
"learning_rate": 5.489111111111112e-06,
"loss": 2.3687,
"step": 25300
},
{
"epoch": 0.506,
"eval_loss": 2.3831355571746826,
"eval_runtime": 31.7979,
"eval_samples_per_second": 3.208,
"eval_steps_per_second": 1.604,
"step": 25300
},
{
"epoch": 0.5065,
"grad_norm": 0.5622191727496813,
"learning_rate": 5.483555555555556e-06,
"loss": 2.368,
"step": 25325
},
{
"epoch": 0.507,
"grad_norm": 0.5375310388584507,
"learning_rate": 5.478e-06,
"loss": 2.3617,
"step": 25350
},
{
"epoch": 0.5075,
"grad_norm": 0.5421092937376346,
"learning_rate": 5.472444444444444e-06,
"loss": 2.3759,
"step": 25375
},
{
"epoch": 0.508,
"grad_norm": 0.5726686989658507,
"learning_rate": 5.4668888888888896e-06,
"loss": 2.37,
"step": 25400
},
{
"epoch": 0.508,
"eval_loss": 2.383046865463257,
"eval_runtime": 31.8165,
"eval_samples_per_second": 3.206,
"eval_steps_per_second": 1.603,
"step": 25400
},
{
"epoch": 0.5085,
"grad_norm": 0.536904504012326,
"learning_rate": 5.461333333333334e-06,
"loss": 2.3683,
"step": 25425
},
{
"epoch": 0.509,
"grad_norm": 0.5792290465322086,
"learning_rate": 5.455777777777778e-06,
"loss": 2.3641,
"step": 25450
},
{
"epoch": 0.5095,
"grad_norm": 0.5667490944788528,
"learning_rate": 5.450222222222222e-06,
"loss": 2.3673,
"step": 25475
},
{
"epoch": 0.51,
"grad_norm": 0.5581091402617585,
"learning_rate": 5.444666666666667e-06,
"loss": 2.374,
"step": 25500
},
{
"epoch": 0.51,
"eval_loss": 2.3831074237823486,
"eval_runtime": 31.8462,
"eval_samples_per_second": 3.203,
"eval_steps_per_second": 1.601,
"step": 25500
},
{
"epoch": 0.5105,
"grad_norm": 0.5629059983127724,
"learning_rate": 5.4391111111111116e-06,
"loss": 2.376,
"step": 25525
},
{
"epoch": 0.511,
"grad_norm": 0.5600711744363054,
"learning_rate": 5.433555555555556e-06,
"loss": 2.3702,
"step": 25550
},
{
"epoch": 0.5115,
"grad_norm": 0.5500784026204207,
"learning_rate": 5.4279999999999995e-06,
"loss": 2.3704,
"step": 25575
},
{
"epoch": 0.512,
"grad_norm": 0.553377338742942,
"learning_rate": 5.422444444444445e-06,
"loss": 2.3644,
"step": 25600
},
{
"epoch": 0.512,
"eval_loss": 2.3826544284820557,
"eval_runtime": 31.7739,
"eval_samples_per_second": 3.21,
"eval_steps_per_second": 1.605,
"step": 25600
},
{
"epoch": 0.5125,
"grad_norm": 0.5861763037221558,
"learning_rate": 5.416888888888889e-06,
"loss": 2.3658,
"step": 25625
},
{
"epoch": 0.513,
"grad_norm": 0.5538084648071333,
"learning_rate": 5.411333333333334e-06,
"loss": 2.3693,
"step": 25650
},
{
"epoch": 0.5135,
"grad_norm": 0.5699472071254841,
"learning_rate": 5.405777777777779e-06,
"loss": 2.3707,
"step": 25675
},
{
"epoch": 0.514,
"grad_norm": 0.5440880568370218,
"learning_rate": 5.400222222222222e-06,
"loss": 2.3664,
"step": 25700
},
{
"epoch": 0.514,
"eval_loss": 2.382906675338745,
"eval_runtime": 31.7874,
"eval_samples_per_second": 3.209,
"eval_steps_per_second": 1.604,
"step": 25700
},
{
"epoch": 0.5145,
"grad_norm": 0.551256815387497,
"learning_rate": 5.394666666666667e-06,
"loss": 2.3608,
"step": 25725
},
{
"epoch": 0.515,
"grad_norm": 0.552653919875225,
"learning_rate": 5.389111111111112e-06,
"loss": 2.3648,
"step": 25750
},
{
"epoch": 0.5155,
"grad_norm": 0.5489775829628063,
"learning_rate": 5.3835555555555565e-06,
"loss": 2.368,
"step": 25775
},
{
"epoch": 0.516,
"grad_norm": 0.545224524462321,
"learning_rate": 5.378e-06,
"loss": 2.37,
"step": 25800
},
{
"epoch": 0.516,
"eval_loss": 2.382946491241455,
"eval_runtime": 31.8142,
"eval_samples_per_second": 3.206,
"eval_steps_per_second": 1.603,
"step": 25800
},
{
"epoch": 0.5165,
"grad_norm": 0.6177434912819645,
"learning_rate": 5.372444444444444e-06,
"loss": 2.3576,
"step": 25825
},
{
"epoch": 0.517,
"grad_norm": 0.5731672053410489,
"learning_rate": 5.36688888888889e-06,
"loss": 2.3641,
"step": 25850
},
{
"epoch": 0.5175,
"grad_norm": 0.547417736306074,
"learning_rate": 5.361333333333334e-06,
"loss": 2.3669,
"step": 25875
},
{
"epoch": 0.518,
"grad_norm": 0.5666721324439973,
"learning_rate": 5.3557777777777785e-06,
"loss": 2.3633,
"step": 25900
},
{
"epoch": 0.518,
"eval_loss": 2.3824901580810547,
"eval_runtime": 31.8236,
"eval_samples_per_second": 3.205,
"eval_steps_per_second": 1.603,
"step": 25900
},
{
"epoch": 0.5185,
"grad_norm": 0.5493694553264233,
"learning_rate": 5.350222222222222e-06,
"loss": 2.3676,
"step": 25925
},
{
"epoch": 0.519,
"grad_norm": 0.5581911332398992,
"learning_rate": 5.344666666666667e-06,
"loss": 2.3665,
"step": 25950
},
{
"epoch": 0.5195,
"grad_norm": 0.5523156791576098,
"learning_rate": 5.339111111111112e-06,
"loss": 2.3634,
"step": 25975
},
{
"epoch": 0.52,
"grad_norm": 0.5394984851015033,
"learning_rate": 5.333555555555556e-06,
"loss": 2.3693,
"step": 26000
},
{
"epoch": 0.52,
"eval_loss": 2.3825063705444336,
"eval_runtime": 31.7579,
"eval_samples_per_second": 3.212,
"eval_steps_per_second": 1.606,
"step": 26000
},
{
"epoch": 0.5205,
"grad_norm": 0.5425846904290578,
"learning_rate": 5.328000000000001e-06,
"loss": 2.3675,
"step": 26025
},
{
"epoch": 0.521,
"grad_norm": 0.5621800567569987,
"learning_rate": 5.322444444444445e-06,
"loss": 2.3759,
"step": 26050
},
{
"epoch": 0.5215,
"grad_norm": 0.5544103291449336,
"learning_rate": 5.316888888888889e-06,
"loss": 2.3576,
"step": 26075
},
{
"epoch": 0.522,
"grad_norm": 0.550125457461572,
"learning_rate": 5.311333333333334e-06,
"loss": 2.3567,
"step": 26100
},
{
"epoch": 0.522,
"eval_loss": 2.382749319076538,
"eval_runtime": 31.8184,
"eval_samples_per_second": 3.206,
"eval_steps_per_second": 1.603,
"step": 26100
},
{
"epoch": 0.5225,
"grad_norm": 0.5441956885780074,
"learning_rate": 5.305777777777779e-06,
"loss": 2.3562,
"step": 26125
},
{
"epoch": 0.523,
"grad_norm": 0.5677266247403775,
"learning_rate": 5.3002222222222225e-06,
"loss": 2.3666,
"step": 26150
},
{
"epoch": 0.5235,
"grad_norm": 0.5396975563673215,
"learning_rate": 5.294666666666667e-06,
"loss": 2.351,
"step": 26175
},
{
"epoch": 0.524,
"grad_norm": 0.5374437057610971,
"learning_rate": 5.289111111111111e-06,
"loss": 2.3625,
"step": 26200
},
{
"epoch": 0.524,
"eval_loss": 2.3822991847991943,
"eval_runtime": 31.8822,
"eval_samples_per_second": 3.199,
"eval_steps_per_second": 1.6,
"step": 26200
},
{
"epoch": 0.5245,
"grad_norm": 0.5627076715491244,
"learning_rate": 5.2835555555555566e-06,
"loss": 2.3699,
"step": 26225
},
{
"epoch": 0.525,
"grad_norm": 0.5430691314388109,
"learning_rate": 5.278000000000001e-06,
"loss": 2.3648,
"step": 26250
},
{
"epoch": 0.5255,
"grad_norm": 0.5319128139639624,
"learning_rate": 5.2724444444444445e-06,
"loss": 2.3722,
"step": 26275
},
{
"epoch": 0.526,
"grad_norm": 0.5560009569047116,
"learning_rate": 5.266888888888889e-06,
"loss": 2.3763,
"step": 26300
},
{
"epoch": 0.526,
"eval_loss": 2.3822247982025146,
"eval_runtime": 31.7558,
"eval_samples_per_second": 3.212,
"eval_steps_per_second": 1.606,
"step": 26300
},
{
"epoch": 0.5265,
"grad_norm": 0.5586923319248112,
"learning_rate": 5.261333333333334e-06,
"loss": 2.366,
"step": 26325
},
{
"epoch": 0.527,
"grad_norm": 0.5621950392943218,
"learning_rate": 5.255777777777779e-06,
"loss": 2.3713,
"step": 26350
},
{
"epoch": 0.5275,
"grad_norm": 0.5630783729958978,
"learning_rate": 5.250222222222222e-06,
"loss": 2.3508,
"step": 26375
},
{
"epoch": 0.528,
"grad_norm": 0.5543463911581646,
"learning_rate": 5.2446666666666665e-06,
"loss": 2.3588,
"step": 26400
},
{
"epoch": 0.528,
"eval_loss": 2.3820412158966064,
"eval_runtime": 31.7735,
"eval_samples_per_second": 3.21,
"eval_steps_per_second": 1.605,
"step": 26400
},
{
"epoch": 0.5285,
"grad_norm": 0.5521701819516005,
"learning_rate": 5.239111111111112e-06,
"loss": 2.3798,
"step": 26425
},
{
"epoch": 0.529,
"grad_norm": 0.5697290541696707,
"learning_rate": 5.233555555555556e-06,
"loss": 2.3705,
"step": 26450
},
{
"epoch": 0.5295,
"grad_norm": 0.5456656767494042,
"learning_rate": 5.228000000000001e-06,
"loss": 2.3603,
"step": 26475
},
{
"epoch": 0.53,
"grad_norm": 0.544157308823069,
"learning_rate": 5.222444444444444e-06,
"loss": 2.3598,
"step": 26500
},
{
"epoch": 0.53,
"eval_loss": 2.3819408416748047,
"eval_runtime": 31.804,
"eval_samples_per_second": 3.207,
"eval_steps_per_second": 1.604,
"step": 26500
},
{
"epoch": 0.5305,
"grad_norm": 0.5399718074412095,
"learning_rate": 5.216888888888889e-06,
"loss": 2.3765,
"step": 26525
},
{
"epoch": 0.531,
"grad_norm": 0.542440216852853,
"learning_rate": 5.211333333333334e-06,
"loss": 2.3758,
"step": 26550
},
{
"epoch": 0.5315,
"grad_norm": 0.5648571300651365,
"learning_rate": 5.205777777777778e-06,
"loss": 2.3685,
"step": 26575
},
{
"epoch": 0.532,
"grad_norm": 0.573442767423831,
"learning_rate": 5.2002222222222235e-06,
"loss": 2.3556,
"step": 26600
},
{
"epoch": 0.532,
"eval_loss": 2.382056951522827,
"eval_runtime": 31.8038,
"eval_samples_per_second": 3.207,
"eval_steps_per_second": 1.604,
"step": 26600
},
{
"epoch": 0.5325,
"grad_norm": 0.6056414806190663,
"learning_rate": 5.194666666666667e-06,
"loss": 2.3595,
"step": 26625
},
{
"epoch": 0.533,
"grad_norm": 0.5481757619700885,
"learning_rate": 5.189111111111111e-06,
"loss": 2.3727,
"step": 26650
},
{
"epoch": 0.5335,
"grad_norm": 0.5610562792027696,
"learning_rate": 5.183555555555556e-06,
"loss": 2.3673,
"step": 26675
},
{
"epoch": 0.534,
"grad_norm": 0.5702347426339772,
"learning_rate": 5.178000000000001e-06,
"loss": 2.3622,
"step": 26700
},
{
"epoch": 0.534,
"eval_loss": 2.381828546524048,
"eval_runtime": 31.992,
"eval_samples_per_second": 3.188,
"eval_steps_per_second": 1.594,
"step": 26700
},
{
"epoch": 0.5345,
"grad_norm": 0.5565593579595437,
"learning_rate": 5.172444444444445e-06,
"loss": 2.3651,
"step": 26725
},
{
"epoch": 0.535,
"grad_norm": 0.5398272748687973,
"learning_rate": 5.166888888888889e-06,
"loss": 2.3703,
"step": 26750
},
{
"epoch": 0.5355,
"grad_norm": 0.5611538131409728,
"learning_rate": 5.1613333333333334e-06,
"loss": 2.3778,
"step": 26775
},
{
"epoch": 0.536,
"grad_norm": 0.5436520053621182,
"learning_rate": 5.155777777777779e-06,
"loss": 2.3561,
"step": 26800
},
{
"epoch": 0.536,
"eval_loss": 2.381396532058716,
"eval_runtime": 31.772,
"eval_samples_per_second": 3.21,
"eval_steps_per_second": 1.605,
"step": 26800
},
{
"epoch": 0.5365,
"grad_norm": 0.5574841239488896,
"learning_rate": 5.150222222222223e-06,
"loss": 2.3607,
"step": 26825
},
{
"epoch": 0.537,
"grad_norm": 0.5459267231396281,
"learning_rate": 5.144666666666667e-06,
"loss": 2.3652,
"step": 26850
},
{
"epoch": 0.5375,
"grad_norm": 0.5764624554311072,
"learning_rate": 5.139111111111111e-06,
"loss": 2.3748,
"step": 26875
},
{
"epoch": 0.538,
"grad_norm": 0.5452582655691465,
"learning_rate": 5.133555555555556e-06,
"loss": 2.3751,
"step": 26900
},
{
"epoch": 0.538,
"eval_loss": 2.3815813064575195,
"eval_runtime": 31.833,
"eval_samples_per_second": 3.204,
"eval_steps_per_second": 1.602,
"step": 26900
},
{
"epoch": 0.5385,
"grad_norm": 0.5591974032204698,
"learning_rate": 5.128000000000001e-06,
"loss": 2.3595,
"step": 26925
},
{
"epoch": 0.539,
"grad_norm": 0.5910956937930101,
"learning_rate": 5.122444444444444e-06,
"loss": 2.3712,
"step": 26950
},
{
"epoch": 0.5395,
"grad_norm": 0.5532516136915937,
"learning_rate": 5.116888888888889e-06,
"loss": 2.3673,
"step": 26975
},
{
"epoch": 0.54,
"grad_norm": 0.5654498740726267,
"learning_rate": 5.111333333333334e-06,
"loss": 2.3667,
"step": 27000
},
{
"epoch": 0.54,
"eval_loss": 2.3814122676849365,
"eval_runtime": 31.7588,
"eval_samples_per_second": 3.212,
"eval_steps_per_second": 1.606,
"step": 27000
},
{
"epoch": 0.5405,
"grad_norm": 0.5639894142193489,
"learning_rate": 5.105777777777778e-06,
"loss": 2.3604,
"step": 27025
},
{
"epoch": 0.541,
"grad_norm": 0.5650474829629732,
"learning_rate": 5.100222222222223e-06,
"loss": 2.3615,
"step": 27050
},
{
"epoch": 0.5415,
"grad_norm": 0.5549449402784257,
"learning_rate": 5.094666666666666e-06,
"loss": 2.3679,
"step": 27075
},
{
"epoch": 0.542,
"grad_norm": 0.5615002192664388,
"learning_rate": 5.0891111111111115e-06,
"loss": 2.3634,
"step": 27100
},
{
"epoch": 0.542,
"eval_loss": 2.381121873855591,
"eval_runtime": 31.7586,
"eval_samples_per_second": 3.212,
"eval_steps_per_second": 1.606,
"step": 27100
},
{
"epoch": 0.5425,
"grad_norm": 0.5403095468370492,
"learning_rate": 5.083555555555556e-06,
"loss": 2.3665,
"step": 27125
},
{
"epoch": 0.543,
"grad_norm": 0.5421716749680758,
"learning_rate": 5.078e-06,
"loss": 2.369,
"step": 27150
},
{
"epoch": 0.5435,
"grad_norm": 0.5590064616229682,
"learning_rate": 5.072444444444446e-06,
"loss": 2.3594,
"step": 27175
},
{
"epoch": 0.544,
"grad_norm": 0.5444799207706167,
"learning_rate": 5.066888888888889e-06,
"loss": 2.3582,
"step": 27200
},
{
"epoch": 0.544,
"eval_loss": 2.3811404705047607,
"eval_runtime": 31.8368,
"eval_samples_per_second": 3.204,
"eval_steps_per_second": 1.602,
"step": 27200
},
{
"epoch": 0.5445,
"grad_norm": 0.5694522608963828,
"learning_rate": 5.0613333333333336e-06,
"loss": 2.3651,
"step": 27225
},
{
"epoch": 0.545,
"grad_norm": 0.5357232316900923,
"learning_rate": 5.055777777777778e-06,
"loss": 2.3595,
"step": 27250
},
{
"epoch": 0.5455,
"grad_norm": 0.5449200504756736,
"learning_rate": 5.050222222222223e-06,
"loss": 2.3563,
"step": 27275
},
{
"epoch": 0.546,
"grad_norm": 0.5669179572699722,
"learning_rate": 5.044666666666667e-06,
"loss": 2.3705,
"step": 27300
},
{
"epoch": 0.546,
"eval_loss": 2.3810057640075684,
"eval_runtime": 31.7869,
"eval_samples_per_second": 3.209,
"eval_steps_per_second": 1.604,
"step": 27300
},
{
"epoch": 0.5465,
"grad_norm": 0.5536644347581473,
"learning_rate": 5.039111111111111e-06,
"loss": 2.3658,
"step": 27325
},
{
"epoch": 0.547,
"grad_norm": 0.5774297317851765,
"learning_rate": 5.0335555555555556e-06,
"loss": 2.3553,
"step": 27350
},
{
"epoch": 0.5475,
"grad_norm": 0.567395549600367,
"learning_rate": 5.028000000000001e-06,
"loss": 2.3694,
"step": 27375
},
{
"epoch": 0.548,
"grad_norm": 0.5501789999743681,
"learning_rate": 5.022444444444445e-06,
"loss": 2.3643,
"step": 27400
},
{
"epoch": 0.548,
"eval_loss": 2.3811025619506836,
"eval_runtime": 31.9197,
"eval_samples_per_second": 3.196,
"eval_steps_per_second": 1.598,
"step": 27400
},
{
"epoch": 0.5485,
"grad_norm": 0.5719215133111718,
"learning_rate": 5.016888888888889e-06,
"loss": 2.365,
"step": 27425
},
{
"epoch": 0.549,
"grad_norm": 0.5899241097551456,
"learning_rate": 5.011333333333333e-06,
"loss": 2.3774,
"step": 27450
},
{
"epoch": 0.5495,
"grad_norm": 0.5731413292155066,
"learning_rate": 5.0057777777777784e-06,
"loss": 2.3706,
"step": 27475
},
{
"epoch": 0.55,
"grad_norm": 0.5425656065958468,
"learning_rate": 5.000222222222223e-06,
"loss": 2.3566,
"step": 27500
},
{
"epoch": 0.55,
"eval_loss": 2.380763292312622,
"eval_runtime": 31.8162,
"eval_samples_per_second": 3.206,
"eval_steps_per_second": 1.603,
"step": 27500
},
{
"epoch": 0.5505,
"grad_norm": 0.5601626399029922,
"learning_rate": 4.994666666666667e-06,
"loss": 2.3762,
"step": 27525
},
{
"epoch": 0.551,
"grad_norm": 0.5715204135637444,
"learning_rate": 4.989111111111112e-06,
"loss": 2.363,
"step": 27550
},
{
"epoch": 0.5515,
"grad_norm": 0.547533853702179,
"learning_rate": 4.983555555555556e-06,
"loss": 2.3659,
"step": 27575
},
{
"epoch": 0.552,
"grad_norm": 0.5817399132816639,
"learning_rate": 4.9780000000000005e-06,
"loss": 2.3693,
"step": 27600
},
{
"epoch": 0.552,
"eval_loss": 2.3807787895202637,
"eval_runtime": 31.8396,
"eval_samples_per_second": 3.204,
"eval_steps_per_second": 1.602,
"step": 27600
},
{
"epoch": 0.5525,
"grad_norm": 0.544660595894246,
"learning_rate": 4.972444444444445e-06,
"loss": 2.3661,
"step": 27625
},
{
"epoch": 0.553,
"grad_norm": 0.5813863819688693,
"learning_rate": 4.966888888888889e-06,
"loss": 2.365,
"step": 27650
},
{
"epoch": 0.5535,
"grad_norm": 0.555794514365692,
"learning_rate": 4.961333333333334e-06,
"loss": 2.3724,
"step": 27675
},
{
"epoch": 0.554,
"grad_norm": 0.5549771654031,
"learning_rate": 4.955777777777778e-06,
"loss": 2.3712,
"step": 27700
},
{
"epoch": 0.554,
"eval_loss": 2.380859613418579,
"eval_runtime": 32.035,
"eval_samples_per_second": 3.184,
"eval_steps_per_second": 1.592,
"step": 27700
},
{
"epoch": 0.5545,
"grad_norm": 0.5660580874490311,
"learning_rate": 4.9502222222222225e-06,
"loss": 2.3626,
"step": 27725
},
{
"epoch": 0.555,
"grad_norm": 0.5408935222204184,
"learning_rate": 4.944666666666667e-06,
"loss": 2.3546,
"step": 27750
},
{
"epoch": 0.5555,
"grad_norm": 0.5574539497290301,
"learning_rate": 4.939111111111112e-06,
"loss": 2.3503,
"step": 27775
},
{
"epoch": 0.556,
"grad_norm": 0.5733587459238179,
"learning_rate": 4.933555555555556e-06,
"loss": 2.3787,
"step": 27800
},
{
"epoch": 0.556,
"eval_loss": 2.380819082260132,
"eval_runtime": 31.8731,
"eval_samples_per_second": 3.2,
"eval_steps_per_second": 1.6,
"step": 27800
},
{
"epoch": 0.5565,
"grad_norm": 0.5469010479471977,
"learning_rate": 4.928000000000001e-06,
"loss": 2.3728,
"step": 27825
},
{
"epoch": 0.557,
"grad_norm": 0.5575923461377743,
"learning_rate": 4.9224444444444445e-06,
"loss": 2.3587,
"step": 27850
},
{
"epoch": 0.5575,
"grad_norm": 0.5484615569385746,
"learning_rate": 4.91688888888889e-06,
"loss": 2.3554,
"step": 27875
},
{
"epoch": 0.558,
"grad_norm": 0.5700580906470195,
"learning_rate": 4.911333333333333e-06,
"loss": 2.3591,
"step": 27900
},
{
"epoch": 0.558,
"eval_loss": 2.380748748779297,
"eval_runtime": 31.8799,
"eval_samples_per_second": 3.2,
"eval_steps_per_second": 1.6,
"step": 27900
},
{
"epoch": 0.5585,
"grad_norm": 0.5644741625244013,
"learning_rate": 4.9057777777777785e-06,
"loss": 2.3573,
"step": 27925
},
{
"epoch": 0.559,
"grad_norm": 0.5518750142742082,
"learning_rate": 4.900222222222223e-06,
"loss": 2.3722,
"step": 27950
},
{
"epoch": 0.5595,
"grad_norm": 0.5570570164343176,
"learning_rate": 4.894666666666667e-06,
"loss": 2.3644,
"step": 27975
},
{
"epoch": 0.56,
"grad_norm": 0.5454507656456767,
"learning_rate": 4.889111111111112e-06,
"loss": 2.3545,
"step": 28000
},
{
"epoch": 0.56,
"eval_loss": 2.380680799484253,
"eval_runtime": 31.8506,
"eval_samples_per_second": 3.202,
"eval_steps_per_second": 1.601,
"step": 28000
}
],
"logging_steps": 25,
"max_steps": 50000,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.91296643531617e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}