run_20 / trainer_state.json
irodkin's picture
Training checkpoint at step 9000
ecf2aaf verified
{
"best_global_step": 4300,
"best_metric": 2.432278633117676,
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_20/checkpoint-4000",
"epoch": 0.18,
"eval_steps": 100,
"global_step": 9000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005,
"grad_norm": 39.75564521032967,
"learning_rate": 4.8e-08,
"loss": 3.6517,
"step": 25
},
{
"epoch": 0.001,
"grad_norm": 28.937531835097435,
"learning_rate": 9.8e-08,
"loss": 3.5931,
"step": 50
},
{
"epoch": 0.0015,
"grad_norm": 21.922720332659644,
"learning_rate": 1.4800000000000003e-07,
"loss": 3.3397,
"step": 75
},
{
"epoch": 0.002,
"grad_norm": 8.739610199908325,
"learning_rate": 1.9800000000000003e-07,
"loss": 3.1289,
"step": 100
},
{
"epoch": 0.002,
"eval_loss": 2.9243295192718506,
"eval_runtime": 264.3302,
"eval_samples_per_second": 3.11,
"eval_steps_per_second": 1.555,
"step": 100
},
{
"epoch": 0.0025,
"grad_norm": 4.433912600039677,
"learning_rate": 2.48e-07,
"loss": 2.8957,
"step": 125
},
{
"epoch": 0.003,
"grad_norm": 3.2874790066620303,
"learning_rate": 2.9800000000000005e-07,
"loss": 2.763,
"step": 150
},
{
"epoch": 0.0035,
"grad_norm": 1.5203472215469231,
"learning_rate": 3.48e-07,
"loss": 2.676,
"step": 175
},
{
"epoch": 0.004,
"grad_norm": 1.1945541683905954,
"learning_rate": 3.9800000000000004e-07,
"loss": 2.635,
"step": 200
},
{
"epoch": 0.004,
"eval_loss": 2.6094932556152344,
"eval_runtime": 265.7702,
"eval_samples_per_second": 3.093,
"eval_steps_per_second": 1.546,
"step": 200
},
{
"epoch": 0.0045,
"grad_norm": 1.0852713304633745,
"learning_rate": 4.4800000000000004e-07,
"loss": 2.6016,
"step": 225
},
{
"epoch": 0.005,
"grad_norm": 1.0733940346699529,
"learning_rate": 4.98e-07,
"loss": 2.5797,
"step": 250
},
{
"epoch": 0.0055,
"grad_norm": 0.9273949035031271,
"learning_rate": 5.480000000000001e-07,
"loss": 2.5607,
"step": 275
},
{
"epoch": 0.006,
"grad_norm": 0.9289300678591714,
"learning_rate": 5.98e-07,
"loss": 2.552,
"step": 300
},
{
"epoch": 0.006,
"eval_loss": 2.541522264480591,
"eval_runtime": 266.7478,
"eval_samples_per_second": 3.082,
"eval_steps_per_second": 1.541,
"step": 300
},
{
"epoch": 0.0065,
"grad_norm": 1.1328584507449984,
"learning_rate": 6.48e-07,
"loss": 2.5402,
"step": 325
},
{
"epoch": 0.007,
"grad_norm": 0.8593307029257858,
"learning_rate": 6.98e-07,
"loss": 2.5286,
"step": 350
},
{
"epoch": 0.0075,
"grad_norm": 0.895615604067586,
"learning_rate": 7.480000000000001e-07,
"loss": 2.5311,
"step": 375
},
{
"epoch": 0.008,
"grad_norm": 0.912306580242149,
"learning_rate": 7.98e-07,
"loss": 2.5037,
"step": 400
},
{
"epoch": 0.008,
"eval_loss": 2.514389991760254,
"eval_runtime": 266.4899,
"eval_samples_per_second": 3.085,
"eval_steps_per_second": 1.542,
"step": 400
},
{
"epoch": 0.0085,
"grad_norm": 1.1866535514670034,
"learning_rate": 8.480000000000001e-07,
"loss": 2.5011,
"step": 425
},
{
"epoch": 0.009,
"grad_norm": 1.211342504193914,
"learning_rate": 8.980000000000001e-07,
"loss": 2.503,
"step": 450
},
{
"epoch": 0.0095,
"grad_norm": 1.113763817383069,
"learning_rate": 9.480000000000001e-07,
"loss": 2.4999,
"step": 475
},
{
"epoch": 0.01,
"grad_norm": 1.2585585589647226,
"learning_rate": 9.98e-07,
"loss": 2.4872,
"step": 500
},
{
"epoch": 0.01,
"eval_loss": 2.497868061065674,
"eval_runtime": 265.7962,
"eval_samples_per_second": 3.093,
"eval_steps_per_second": 1.546,
"step": 500
},
{
"epoch": 0.0105,
"grad_norm": 1.2585825718084245,
"learning_rate": 1.0480000000000002e-06,
"loss": 2.4852,
"step": 525
},
{
"epoch": 0.011,
"grad_norm": 1.4101257437846046,
"learning_rate": 1.0980000000000001e-06,
"loss": 2.4892,
"step": 550
},
{
"epoch": 0.0115,
"grad_norm": 1.1975234150707363,
"learning_rate": 1.148e-06,
"loss": 2.4861,
"step": 575
},
{
"epoch": 0.012,
"grad_norm": 1.3662769225582332,
"learning_rate": 1.1980000000000002e-06,
"loss": 2.4882,
"step": 600
},
{
"epoch": 0.012,
"eval_loss": 2.4879231452941895,
"eval_runtime": 267.0005,
"eval_samples_per_second": 3.079,
"eval_steps_per_second": 1.539,
"step": 600
},
{
"epoch": 0.0125,
"grad_norm": 1.3086724275194024,
"learning_rate": 1.248e-06,
"loss": 2.4745,
"step": 625
},
{
"epoch": 0.013,
"grad_norm": 1.317023206802888,
"learning_rate": 1.2980000000000001e-06,
"loss": 2.4727,
"step": 650
},
{
"epoch": 0.0135,
"grad_norm": 1.5284967544483212,
"learning_rate": 1.348e-06,
"loss": 2.469,
"step": 675
},
{
"epoch": 0.014,
"grad_norm": 1.1047595217316941,
"learning_rate": 1.3980000000000002e-06,
"loss": 2.4695,
"step": 700
},
{
"epoch": 0.014,
"eval_loss": 2.480103015899658,
"eval_runtime": 263.5022,
"eval_samples_per_second": 3.12,
"eval_steps_per_second": 1.56,
"step": 700
},
{
"epoch": 0.0145,
"grad_norm": 1.2077328209863791,
"learning_rate": 1.4480000000000002e-06,
"loss": 2.4654,
"step": 725
},
{
"epoch": 0.015,
"grad_norm": 1.209220841771836,
"learning_rate": 1.498e-06,
"loss": 2.4663,
"step": 750
},
{
"epoch": 0.0155,
"grad_norm": 1.3063169829879686,
"learning_rate": 1.548e-06,
"loss": 2.4704,
"step": 775
},
{
"epoch": 0.016,
"grad_norm": 1.3180183352683195,
"learning_rate": 1.5980000000000002e-06,
"loss": 2.4583,
"step": 800
},
{
"epoch": 0.016,
"eval_loss": 2.473590850830078,
"eval_runtime": 305.9875,
"eval_samples_per_second": 2.686,
"eval_steps_per_second": 1.343,
"step": 800
},
{
"epoch": 0.0165,
"grad_norm": 1.1674852380778837,
"learning_rate": 1.6480000000000001e-06,
"loss": 2.467,
"step": 825
},
{
"epoch": 0.017,
"grad_norm": 1.2497656349941002,
"learning_rate": 1.6980000000000003e-06,
"loss": 2.4612,
"step": 850
},
{
"epoch": 0.0175,
"grad_norm": 1.3358614980967494,
"learning_rate": 1.7480000000000002e-06,
"loss": 2.4636,
"step": 875
},
{
"epoch": 0.018,
"grad_norm": 1.252489857653356,
"learning_rate": 1.798e-06,
"loss": 2.454,
"step": 900
},
{
"epoch": 0.018,
"eval_loss": 2.4681763648986816,
"eval_runtime": 264.702,
"eval_samples_per_second": 3.105,
"eval_steps_per_second": 1.553,
"step": 900
},
{
"epoch": 0.0185,
"grad_norm": 1.2815437998994337,
"learning_rate": 1.8480000000000001e-06,
"loss": 2.4571,
"step": 925
},
{
"epoch": 0.019,
"grad_norm": 1.0902475329451575,
"learning_rate": 1.898e-06,
"loss": 2.451,
"step": 950
},
{
"epoch": 0.0195,
"grad_norm": 1.1502696024965324,
"learning_rate": 1.9480000000000002e-06,
"loss": 2.4527,
"step": 975
},
{
"epoch": 0.02,
"grad_norm": 1.2336661855806117,
"learning_rate": 1.998e-06,
"loss": 2.4496,
"step": 1000
},
{
"epoch": 0.02,
"eval_loss": 2.463880777359009,
"eval_runtime": 275.7426,
"eval_samples_per_second": 2.981,
"eval_steps_per_second": 1.491,
"step": 1000
},
{
"epoch": 0.0205,
"grad_norm": 1.2680742209094296,
"learning_rate": 2.048e-06,
"loss": 2.4494,
"step": 1025
},
{
"epoch": 0.021,
"grad_norm": 1.0341778808278126,
"learning_rate": 2.098e-06,
"loss": 2.4467,
"step": 1050
},
{
"epoch": 0.0215,
"grad_norm": 0.9860490736001175,
"learning_rate": 2.148e-06,
"loss": 2.4473,
"step": 1075
},
{
"epoch": 0.022,
"grad_norm": 0.9419267295275278,
"learning_rate": 2.198e-06,
"loss": 2.443,
"step": 1100
},
{
"epoch": 0.022,
"eval_loss": 2.4598941802978516,
"eval_runtime": 265.0502,
"eval_samples_per_second": 3.101,
"eval_steps_per_second": 1.551,
"step": 1100
},
{
"epoch": 0.0225,
"grad_norm": 1.3280720471027394,
"learning_rate": 2.2480000000000003e-06,
"loss": 2.4515,
"step": 1125
},
{
"epoch": 0.023,
"grad_norm": 1.053570785582915,
"learning_rate": 2.2980000000000003e-06,
"loss": 2.4396,
"step": 1150
},
{
"epoch": 0.0235,
"grad_norm": 0.9108119839585552,
"learning_rate": 2.3480000000000002e-06,
"loss": 2.4442,
"step": 1175
},
{
"epoch": 0.024,
"grad_norm": 1.0062346367900277,
"learning_rate": 2.398e-06,
"loss": 2.4443,
"step": 1200
},
{
"epoch": 0.024,
"eval_loss": 2.456455945968628,
"eval_runtime": 264.5888,
"eval_samples_per_second": 3.107,
"eval_steps_per_second": 1.553,
"step": 1200
},
{
"epoch": 0.0245,
"grad_norm": 1.0264127705426926,
"learning_rate": 2.448e-06,
"loss": 2.4351,
"step": 1225
},
{
"epoch": 0.025,
"grad_norm": 0.8015249588347212,
"learning_rate": 2.498e-06,
"loss": 2.4406,
"step": 1250
},
{
"epoch": 0.0255,
"grad_norm": 1.1105649485540114,
"learning_rate": 2.5480000000000004e-06,
"loss": 2.4377,
"step": 1275
},
{
"epoch": 0.026,
"grad_norm": 0.9701758426012801,
"learning_rate": 2.598e-06,
"loss": 2.4341,
"step": 1300
},
{
"epoch": 0.026,
"eval_loss": 2.453026056289673,
"eval_runtime": 264.7653,
"eval_samples_per_second": 3.105,
"eval_steps_per_second": 1.552,
"step": 1300
},
{
"epoch": 0.0265,
"grad_norm": 0.9587254891845429,
"learning_rate": 2.648e-06,
"loss": 2.4303,
"step": 1325
},
{
"epoch": 0.027,
"grad_norm": 0.8135883960763247,
"learning_rate": 2.6980000000000003e-06,
"loss": 2.4363,
"step": 1350
},
{
"epoch": 0.0275,
"grad_norm": 0.9192860127847176,
"learning_rate": 2.748e-06,
"loss": 2.4257,
"step": 1375
},
{
"epoch": 0.028,
"grad_norm": 0.947465928893444,
"learning_rate": 2.798e-06,
"loss": 2.4353,
"step": 1400
},
{
"epoch": 0.028,
"eval_loss": 2.450345993041992,
"eval_runtime": 265.6266,
"eval_samples_per_second": 3.095,
"eval_steps_per_second": 1.547,
"step": 1400
},
{
"epoch": 0.0285,
"grad_norm": 0.9270137901066681,
"learning_rate": 2.848e-06,
"loss": 2.4347,
"step": 1425
},
{
"epoch": 0.029,
"grad_norm": 0.8839980710491563,
"learning_rate": 2.8980000000000005e-06,
"loss": 2.4213,
"step": 1450
},
{
"epoch": 0.0295,
"grad_norm": 0.913196005454606,
"learning_rate": 2.9480000000000004e-06,
"loss": 2.4232,
"step": 1475
},
{
"epoch": 0.03,
"grad_norm": 0.8139623858623861,
"learning_rate": 2.9980000000000003e-06,
"loss": 2.4254,
"step": 1500
},
{
"epoch": 0.03,
"eval_loss": 2.447662830352783,
"eval_runtime": 263.4353,
"eval_samples_per_second": 3.12,
"eval_steps_per_second": 1.56,
"step": 1500
},
{
"epoch": 0.0305,
"grad_norm": 0.8422198221554755,
"learning_rate": 3.0480000000000003e-06,
"loss": 2.4196,
"step": 1525
},
{
"epoch": 0.031,
"grad_norm": 0.8542957579365906,
"learning_rate": 3.0980000000000007e-06,
"loss": 2.4294,
"step": 1550
},
{
"epoch": 0.0315,
"grad_norm": 1.149263137594797,
"learning_rate": 3.1480000000000006e-06,
"loss": 2.4265,
"step": 1575
},
{
"epoch": 0.032,
"grad_norm": 0.811470126240392,
"learning_rate": 3.198e-06,
"loss": 2.4105,
"step": 1600
},
{
"epoch": 0.032,
"eval_loss": 2.4456679821014404,
"eval_runtime": 264.056,
"eval_samples_per_second": 3.113,
"eval_steps_per_second": 1.556,
"step": 1600
},
{
"epoch": 0.0325,
"grad_norm": 2.3928975221881434,
"learning_rate": 3.248e-06,
"loss": 2.4208,
"step": 1625
},
{
"epoch": 0.033,
"grad_norm": 0.8031315125360012,
"learning_rate": 3.298e-06,
"loss": 2.4224,
"step": 1650
},
{
"epoch": 0.0335,
"grad_norm": 0.835567276692195,
"learning_rate": 3.348e-06,
"loss": 2.4188,
"step": 1675
},
{
"epoch": 0.034,
"grad_norm": 0.8894325175719718,
"learning_rate": 3.3980000000000003e-06,
"loss": 2.4206,
"step": 1700
},
{
"epoch": 0.034,
"eval_loss": 2.4437851905822754,
"eval_runtime": 264.6455,
"eval_samples_per_second": 3.106,
"eval_steps_per_second": 1.553,
"step": 1700
},
{
"epoch": 0.0345,
"grad_norm": 0.802724390649243,
"learning_rate": 3.4480000000000003e-06,
"loss": 2.4241,
"step": 1725
},
{
"epoch": 0.035,
"grad_norm": 0.8206312612014312,
"learning_rate": 3.4980000000000002e-06,
"loss": 2.4157,
"step": 1750
},
{
"epoch": 0.0355,
"grad_norm": 0.8653789917535344,
"learning_rate": 3.548e-06,
"loss": 2.412,
"step": 1775
},
{
"epoch": 0.036,
"grad_norm": 0.7816319078215015,
"learning_rate": 3.5980000000000005e-06,
"loss": 2.4179,
"step": 1800
},
{
"epoch": 0.036,
"eval_loss": 2.4423036575317383,
"eval_runtime": 264.5578,
"eval_samples_per_second": 3.107,
"eval_steps_per_second": 1.554,
"step": 1800
},
{
"epoch": 0.0365,
"grad_norm": 0.707594544466941,
"learning_rate": 3.6480000000000005e-06,
"loss": 2.416,
"step": 1825
},
{
"epoch": 0.037,
"grad_norm": 0.7481066913011816,
"learning_rate": 3.6980000000000004e-06,
"loss": 2.4242,
"step": 1850
},
{
"epoch": 0.0375,
"grad_norm": 0.7612014979445353,
"learning_rate": 3.7480000000000004e-06,
"loss": 2.4173,
"step": 1875
},
{
"epoch": 0.038,
"grad_norm": 0.772750918048857,
"learning_rate": 3.7980000000000007e-06,
"loss": 2.4134,
"step": 1900
},
{
"epoch": 0.038,
"eval_loss": 2.440969228744507,
"eval_runtime": 274.3624,
"eval_samples_per_second": 2.996,
"eval_steps_per_second": 1.498,
"step": 1900
},
{
"epoch": 0.0385,
"grad_norm": 0.7927966042188935,
"learning_rate": 3.848e-06,
"loss": 2.4131,
"step": 1925
},
{
"epoch": 0.039,
"grad_norm": 0.7664274167276341,
"learning_rate": 3.898e-06,
"loss": 2.4133,
"step": 1950
},
{
"epoch": 0.0395,
"grad_norm": 0.7038638213491795,
"learning_rate": 3.948e-06,
"loss": 2.4135,
"step": 1975
},
{
"epoch": 0.04,
"grad_norm": 0.7231696877425319,
"learning_rate": 3.9980000000000005e-06,
"loss": 2.4169,
"step": 2000
},
{
"epoch": 0.04,
"eval_loss": 2.439641237258911,
"eval_runtime": 282.4449,
"eval_samples_per_second": 2.91,
"eval_steps_per_second": 1.455,
"step": 2000
},
{
"epoch": 0.0405,
"grad_norm": 0.7184393791203537,
"learning_rate": 4.048e-06,
"loss": 2.4071,
"step": 2025
},
{
"epoch": 0.041,
"grad_norm": 0.7366813467336683,
"learning_rate": 4.098e-06,
"loss": 2.4113,
"step": 2050
},
{
"epoch": 0.0415,
"grad_norm": 0.7081408763220511,
"learning_rate": 4.148000000000001e-06,
"loss": 2.4168,
"step": 2075
},
{
"epoch": 0.042,
"grad_norm": 0.6912835983850483,
"learning_rate": 4.198e-06,
"loss": 2.4105,
"step": 2100
},
{
"epoch": 0.042,
"eval_loss": 2.438904047012329,
"eval_runtime": 277.7481,
"eval_samples_per_second": 2.96,
"eval_steps_per_second": 1.48,
"step": 2100
},
{
"epoch": 0.0425,
"grad_norm": 0.7745538733736145,
"learning_rate": 4.248000000000001e-06,
"loss": 2.4131,
"step": 2125
},
{
"epoch": 0.043,
"grad_norm": 0.6897576190091962,
"learning_rate": 4.298e-06,
"loss": 2.4084,
"step": 2150
},
{
"epoch": 0.0435,
"grad_norm": 0.7020994032566351,
"learning_rate": 4.3480000000000006e-06,
"loss": 2.4125,
"step": 2175
},
{
"epoch": 0.044,
"grad_norm": 0.6668651869738377,
"learning_rate": 4.398000000000001e-06,
"loss": 2.4034,
"step": 2200
},
{
"epoch": 0.044,
"eval_loss": 2.4380908012390137,
"eval_runtime": 268.2252,
"eval_samples_per_second": 3.065,
"eval_steps_per_second": 1.532,
"step": 2200
},
{
"epoch": 0.0445,
"grad_norm": 0.6547759047620061,
"learning_rate": 4.4480000000000004e-06,
"loss": 2.4099,
"step": 2225
},
{
"epoch": 0.045,
"grad_norm": 0.6865815945777785,
"learning_rate": 4.498e-06,
"loss": 2.412,
"step": 2250
},
{
"epoch": 0.0455,
"grad_norm": 0.6878267781655092,
"learning_rate": 4.548e-06,
"loss": 2.4137,
"step": 2275
},
{
"epoch": 0.046,
"grad_norm": 0.8314813616644483,
"learning_rate": 4.598e-06,
"loss": 2.4097,
"step": 2300
},
{
"epoch": 0.046,
"eval_loss": 2.4374496936798096,
"eval_runtime": 263.1701,
"eval_samples_per_second": 3.123,
"eval_steps_per_second": 1.562,
"step": 2300
},
{
"epoch": 0.0465,
"grad_norm": 0.6723966792931375,
"learning_rate": 4.648e-06,
"loss": 2.4051,
"step": 2325
},
{
"epoch": 0.047,
"grad_norm": 0.7003756914046538,
"learning_rate": 4.698000000000001e-06,
"loss": 2.4032,
"step": 2350
},
{
"epoch": 0.0475,
"grad_norm": 0.6747085415631567,
"learning_rate": 4.748e-06,
"loss": 2.4096,
"step": 2375
},
{
"epoch": 0.048,
"grad_norm": 0.6571218540079207,
"learning_rate": 4.7980000000000005e-06,
"loss": 2.4165,
"step": 2400
},
{
"epoch": 0.048,
"eval_loss": 2.4365923404693604,
"eval_runtime": 264.2268,
"eval_samples_per_second": 3.111,
"eval_steps_per_second": 1.555,
"step": 2400
},
{
"epoch": 0.0485,
"grad_norm": 0.7464314980483315,
"learning_rate": 4.848000000000001e-06,
"loss": 2.4098,
"step": 2425
},
{
"epoch": 0.049,
"grad_norm": 0.6267266619200393,
"learning_rate": 4.898e-06,
"loss": 2.4019,
"step": 2450
},
{
"epoch": 0.0495,
"grad_norm": 0.6650772680412506,
"learning_rate": 4.948000000000001e-06,
"loss": 2.405,
"step": 2475
},
{
"epoch": 0.05,
"grad_norm": 0.7197173899674899,
"learning_rate": 4.998e-06,
"loss": 2.4095,
"step": 2500
},
{
"epoch": 0.05,
"eval_loss": 2.4358348846435547,
"eval_runtime": 266.7682,
"eval_samples_per_second": 3.081,
"eval_steps_per_second": 1.541,
"step": 2500
},
{
"epoch": 0.0505,
"grad_norm": 0.6249572472256157,
"learning_rate": 5.048000000000001e-06,
"loss": 2.4058,
"step": 2525
},
{
"epoch": 0.051,
"grad_norm": 0.7429228032719255,
"learning_rate": 5.098000000000001e-06,
"loss": 2.4084,
"step": 2550
},
{
"epoch": 0.0515,
"grad_norm": 0.6320325962693778,
"learning_rate": 5.1480000000000005e-06,
"loss": 2.4015,
"step": 2575
},
{
"epoch": 0.052,
"grad_norm": 0.672581755106835,
"learning_rate": 5.198000000000001e-06,
"loss": 2.4051,
"step": 2600
},
{
"epoch": 0.052,
"eval_loss": 2.4351842403411865,
"eval_runtime": 264.9149,
"eval_samples_per_second": 3.103,
"eval_steps_per_second": 1.551,
"step": 2600
},
{
"epoch": 0.0525,
"grad_norm": 0.7086480776921088,
"learning_rate": 5.248000000000001e-06,
"loss": 2.3988,
"step": 2625
},
{
"epoch": 0.053,
"grad_norm": 0.6774201154936552,
"learning_rate": 5.298000000000001e-06,
"loss": 2.394,
"step": 2650
},
{
"epoch": 0.0535,
"grad_norm": 0.6661104910300973,
"learning_rate": 5.348000000000001e-06,
"loss": 2.4034,
"step": 2675
},
{
"epoch": 0.054,
"grad_norm": 0.6224421593448741,
"learning_rate": 5.398e-06,
"loss": 2.3939,
"step": 2700
},
{
"epoch": 0.054,
"eval_loss": 2.434826374053955,
"eval_runtime": 264.1641,
"eval_samples_per_second": 3.112,
"eval_steps_per_second": 1.556,
"step": 2700
},
{
"epoch": 0.0545,
"grad_norm": 0.6944661408419767,
"learning_rate": 5.448e-06,
"loss": 2.4064,
"step": 2725
},
{
"epoch": 0.055,
"grad_norm": 0.6597297955298902,
"learning_rate": 5.498e-06,
"loss": 2.4051,
"step": 2750
},
{
"epoch": 0.0555,
"grad_norm": 0.6526109506522182,
"learning_rate": 5.548e-06,
"loss": 2.4124,
"step": 2775
},
{
"epoch": 0.056,
"grad_norm": 0.6528041780055424,
"learning_rate": 5.5980000000000004e-06,
"loss": 2.3979,
"step": 2800
},
{
"epoch": 0.056,
"eval_loss": 2.4344167709350586,
"eval_runtime": 264.2924,
"eval_samples_per_second": 3.11,
"eval_steps_per_second": 1.555,
"step": 2800
},
{
"epoch": 0.0565,
"grad_norm": 0.7067565611523313,
"learning_rate": 5.648e-06,
"loss": 2.398,
"step": 2825
},
{
"epoch": 0.057,
"grad_norm": 0.6416666495903947,
"learning_rate": 5.698e-06,
"loss": 2.3991,
"step": 2850
},
{
"epoch": 0.0575,
"grad_norm": 0.6605105424774851,
"learning_rate": 5.748e-06,
"loss": 2.3962,
"step": 2875
},
{
"epoch": 0.058,
"grad_norm": 0.6308761264530915,
"learning_rate": 5.798e-06,
"loss": 2.4058,
"step": 2900
},
{
"epoch": 0.058,
"eval_loss": 2.434436082839966,
"eval_runtime": 265.0112,
"eval_samples_per_second": 3.102,
"eval_steps_per_second": 1.551,
"step": 2900
},
{
"epoch": 0.0585,
"grad_norm": 0.6363649329289001,
"learning_rate": 5.848000000000001e-06,
"loss": 2.3943,
"step": 2925
},
{
"epoch": 0.059,
"grad_norm": 0.6147983139117156,
"learning_rate": 5.898e-06,
"loss": 2.3982,
"step": 2950
},
{
"epoch": 0.0595,
"grad_norm": 0.611354772141602,
"learning_rate": 5.9480000000000005e-06,
"loss": 2.3921,
"step": 2975
},
{
"epoch": 0.06,
"grad_norm": 0.6269054680170398,
"learning_rate": 5.998000000000001e-06,
"loss": 2.392,
"step": 3000
},
{
"epoch": 0.06,
"eval_loss": 2.433990955352783,
"eval_runtime": 264.2169,
"eval_samples_per_second": 3.111,
"eval_steps_per_second": 1.556,
"step": 3000
},
{
"epoch": 0.0605,
"grad_norm": 0.6248207448228328,
"learning_rate": 6.048e-06,
"loss": 2.3858,
"step": 3025
},
{
"epoch": 0.061,
"grad_norm": 0.6275258656299642,
"learning_rate": 6.098000000000001e-06,
"loss": 2.4015,
"step": 3050
},
{
"epoch": 0.0615,
"grad_norm": 1.0457401571274152,
"learning_rate": 6.148e-06,
"loss": 2.3909,
"step": 3075
},
{
"epoch": 0.062,
"grad_norm": 0.6551230863319748,
"learning_rate": 6.198000000000001e-06,
"loss": 2.3983,
"step": 3100
},
{
"epoch": 0.062,
"eval_loss": 2.433279275894165,
"eval_runtime": 264.1521,
"eval_samples_per_second": 3.112,
"eval_steps_per_second": 1.556,
"step": 3100
},
{
"epoch": 0.0625,
"grad_norm": 0.6306746226297937,
"learning_rate": 6.248000000000001e-06,
"loss": 2.397,
"step": 3125
},
{
"epoch": 0.063,
"grad_norm": 0.6299802316587856,
"learning_rate": 6.2980000000000005e-06,
"loss": 2.4018,
"step": 3150
},
{
"epoch": 0.0635,
"grad_norm": 0.6265424590222634,
"learning_rate": 6.348000000000001e-06,
"loss": 2.4065,
"step": 3175
},
{
"epoch": 0.064,
"grad_norm": 0.6717273211615455,
"learning_rate": 6.398000000000001e-06,
"loss": 2.3906,
"step": 3200
},
{
"epoch": 0.064,
"eval_loss": 2.4333276748657227,
"eval_runtime": 263.9592,
"eval_samples_per_second": 3.114,
"eval_steps_per_second": 1.557,
"step": 3200
},
{
"epoch": 0.0645,
"grad_norm": 0.6159924635031793,
"learning_rate": 6.448000000000001e-06,
"loss": 2.3947,
"step": 3225
},
{
"epoch": 0.065,
"grad_norm": 0.6124462043712093,
"learning_rate": 6.498000000000001e-06,
"loss": 2.3963,
"step": 3250
},
{
"epoch": 0.0655,
"grad_norm": 0.6144378183602921,
"learning_rate": 6.548000000000001e-06,
"loss": 2.402,
"step": 3275
},
{
"epoch": 0.066,
"grad_norm": 0.6295732934678283,
"learning_rate": 6.598000000000001e-06,
"loss": 2.3877,
"step": 3300
},
{
"epoch": 0.066,
"eval_loss": 2.4331116676330566,
"eval_runtime": 263.4524,
"eval_samples_per_second": 3.12,
"eval_steps_per_second": 1.56,
"step": 3300
},
{
"epoch": 0.0665,
"grad_norm": 0.5938287129149346,
"learning_rate": 6.648e-06,
"loss": 2.389,
"step": 3325
},
{
"epoch": 0.067,
"grad_norm": 0.6194783667871923,
"learning_rate": 6.698e-06,
"loss": 2.39,
"step": 3350
},
{
"epoch": 0.0675,
"grad_norm": 0.60927231594853,
"learning_rate": 6.7480000000000004e-06,
"loss": 2.3968,
"step": 3375
},
{
"epoch": 0.068,
"grad_norm": 0.6386175333576501,
"learning_rate": 6.798e-06,
"loss": 2.3861,
"step": 3400
},
{
"epoch": 0.068,
"eval_loss": 2.4328911304473877,
"eval_runtime": 264.2923,
"eval_samples_per_second": 3.11,
"eval_steps_per_second": 1.555,
"step": 3400
},
{
"epoch": 0.0685,
"grad_norm": 0.6092295027577579,
"learning_rate": 6.848e-06,
"loss": 2.3827,
"step": 3425
},
{
"epoch": 0.069,
"grad_norm": 0.5914846449422462,
"learning_rate": 6.898e-06,
"loss": 2.3894,
"step": 3450
},
{
"epoch": 0.0695,
"grad_norm": 0.5927461214526666,
"learning_rate": 6.948e-06,
"loss": 2.3858,
"step": 3475
},
{
"epoch": 0.07,
"grad_norm": 0.5992194088197265,
"learning_rate": 6.998000000000001e-06,
"loss": 2.3941,
"step": 3500
},
{
"epoch": 0.07,
"eval_loss": 2.432774543762207,
"eval_runtime": 263.8546,
"eval_samples_per_second": 3.115,
"eval_steps_per_second": 1.558,
"step": 3500
},
{
"epoch": 0.0705,
"grad_norm": 0.6119297158568089,
"learning_rate": 7.048e-06,
"loss": 2.3897,
"step": 3525
},
{
"epoch": 0.071,
"grad_norm": 0.6040666217758901,
"learning_rate": 7.0980000000000005e-06,
"loss": 2.3966,
"step": 3550
},
{
"epoch": 0.0715,
"grad_norm": 0.6142925813030266,
"learning_rate": 7.148000000000001e-06,
"loss": 2.3953,
"step": 3575
},
{
"epoch": 0.072,
"grad_norm": 0.5857079248330344,
"learning_rate": 7.198e-06,
"loss": 2.3854,
"step": 3600
},
{
"epoch": 0.072,
"eval_loss": 2.432868719100952,
"eval_runtime": 264.1849,
"eval_samples_per_second": 3.111,
"eval_steps_per_second": 1.556,
"step": 3600
},
{
"epoch": 0.0725,
"grad_norm": 0.6075613052530382,
"learning_rate": 7.248000000000001e-06,
"loss": 2.3798,
"step": 3625
},
{
"epoch": 0.073,
"grad_norm": 0.6146043204282547,
"learning_rate": 7.298e-06,
"loss": 2.3894,
"step": 3650
},
{
"epoch": 0.0735,
"grad_norm": 0.613284002341936,
"learning_rate": 7.348000000000001e-06,
"loss": 2.3897,
"step": 3675
},
{
"epoch": 0.074,
"grad_norm": 0.6694404263159593,
"learning_rate": 7.398000000000001e-06,
"loss": 2.3925,
"step": 3700
},
{
"epoch": 0.074,
"eval_loss": 2.4324021339416504,
"eval_runtime": 263.3107,
"eval_samples_per_second": 3.122,
"eval_steps_per_second": 1.561,
"step": 3700
},
{
"epoch": 0.0745,
"grad_norm": 0.5756401973694445,
"learning_rate": 7.4480000000000005e-06,
"loss": 2.3894,
"step": 3725
},
{
"epoch": 0.075,
"grad_norm": 0.5945783703417461,
"learning_rate": 7.498000000000001e-06,
"loss": 2.3928,
"step": 3750
},
{
"epoch": 0.0755,
"grad_norm": 0.5935750222986942,
"learning_rate": 7.548000000000001e-06,
"loss": 2.3774,
"step": 3775
},
{
"epoch": 0.076,
"grad_norm": 0.5938734543073783,
"learning_rate": 7.598000000000001e-06,
"loss": 2.3776,
"step": 3800
},
{
"epoch": 0.076,
"eval_loss": 2.432751178741455,
"eval_runtime": 263.8929,
"eval_samples_per_second": 3.115,
"eval_steps_per_second": 1.557,
"step": 3800
},
{
"epoch": 0.0765,
"grad_norm": 0.595820899700728,
"learning_rate": 7.648e-06,
"loss": 2.3804,
"step": 3825
},
{
"epoch": 0.077,
"grad_norm": 0.6079304106413467,
"learning_rate": 7.698000000000002e-06,
"loss": 2.3917,
"step": 3850
},
{
"epoch": 0.0775,
"grad_norm": 0.6083448146618482,
"learning_rate": 7.748000000000001e-06,
"loss": 2.3842,
"step": 3875
},
{
"epoch": 0.078,
"grad_norm": 0.6128893415605828,
"learning_rate": 7.798e-06,
"loss": 2.3806,
"step": 3900
},
{
"epoch": 0.078,
"eval_loss": 2.4325239658355713,
"eval_runtime": 263.6693,
"eval_samples_per_second": 3.118,
"eval_steps_per_second": 1.559,
"step": 3900
},
{
"epoch": 0.0785,
"grad_norm": 0.6079041195191952,
"learning_rate": 7.848000000000002e-06,
"loss": 2.3801,
"step": 3925
},
{
"epoch": 0.079,
"grad_norm": 0.6075689821557235,
"learning_rate": 7.898e-06,
"loss": 2.3797,
"step": 3950
},
{
"epoch": 0.0795,
"grad_norm": 0.5882326737716994,
"learning_rate": 7.948e-06,
"loss": 2.3905,
"step": 3975
},
{
"epoch": 0.08,
"grad_norm": 0.5828476462223788,
"learning_rate": 7.998e-06,
"loss": 2.3806,
"step": 4000
},
{
"epoch": 0.08,
"eval_loss": 2.4323527812957764,
"eval_runtime": 263.9786,
"eval_samples_per_second": 3.114,
"eval_steps_per_second": 1.557,
"step": 4000
},
{
"epoch": 0.0805,
"grad_norm": 0.5907927035367586,
"learning_rate": 8.048e-06,
"loss": 2.3739,
"step": 4025
},
{
"epoch": 0.081,
"grad_norm": 0.608189189988593,
"learning_rate": 8.098000000000001e-06,
"loss": 2.3837,
"step": 4050
},
{
"epoch": 0.0815,
"grad_norm": 0.5933025642280234,
"learning_rate": 8.148e-06,
"loss": 2.3814,
"step": 4075
},
{
"epoch": 0.082,
"grad_norm": 0.5898305070270532,
"learning_rate": 8.198e-06,
"loss": 2.3854,
"step": 4100
},
{
"epoch": 0.082,
"eval_loss": 2.432577610015869,
"eval_runtime": 264.0972,
"eval_samples_per_second": 3.112,
"eval_steps_per_second": 1.556,
"step": 4100
},
{
"epoch": 0.0825,
"grad_norm": 0.5673002921483621,
"learning_rate": 8.248e-06,
"loss": 2.3827,
"step": 4125
},
{
"epoch": 0.083,
"grad_norm": 0.5859186364996516,
"learning_rate": 8.298000000000001e-06,
"loss": 2.3859,
"step": 4150
},
{
"epoch": 0.0835,
"grad_norm": 0.5852893491639726,
"learning_rate": 8.348e-06,
"loss": 2.3711,
"step": 4175
},
{
"epoch": 0.084,
"grad_norm": 0.5704807601233864,
"learning_rate": 8.398e-06,
"loss": 2.3682,
"step": 4200
},
{
"epoch": 0.084,
"eval_loss": 2.4325780868530273,
"eval_runtime": 264.0677,
"eval_samples_per_second": 3.113,
"eval_steps_per_second": 1.556,
"step": 4200
},
{
"epoch": 0.0845,
"grad_norm": 0.565873049775094,
"learning_rate": 8.448000000000001e-06,
"loss": 2.3894,
"step": 4225
},
{
"epoch": 0.085,
"grad_norm": 0.6594348238393681,
"learning_rate": 8.498e-06,
"loss": 2.3736,
"step": 4250
},
{
"epoch": 0.0855,
"grad_norm": 0.6114416993962639,
"learning_rate": 8.548e-06,
"loss": 2.3768,
"step": 4275
},
{
"epoch": 0.086,
"grad_norm": 0.613007148558132,
"learning_rate": 8.598000000000001e-06,
"loss": 2.3841,
"step": 4300
},
{
"epoch": 0.086,
"eval_loss": 2.432278633117676,
"eval_runtime": 264.5455,
"eval_samples_per_second": 3.107,
"eval_steps_per_second": 1.554,
"step": 4300
},
{
"epoch": 0.0865,
"grad_norm": 0.6316113111159283,
"learning_rate": 8.648000000000001e-06,
"loss": 2.3853,
"step": 4325
},
{
"epoch": 0.087,
"grad_norm": 0.578758909498954,
"learning_rate": 8.698e-06,
"loss": 2.3838,
"step": 4350
},
{
"epoch": 0.0875,
"grad_norm": 0.5663796780744771,
"learning_rate": 8.748000000000002e-06,
"loss": 2.3744,
"step": 4375
},
{
"epoch": 0.088,
"grad_norm": 0.5996723194508057,
"learning_rate": 8.798000000000001e-06,
"loss": 2.3741,
"step": 4400
},
{
"epoch": 0.088,
"eval_loss": 2.4327504634857178,
"eval_runtime": 264.3839,
"eval_samples_per_second": 3.109,
"eval_steps_per_second": 1.555,
"step": 4400
},
{
"epoch": 0.0885,
"grad_norm": 0.5903185672805589,
"learning_rate": 8.848e-06,
"loss": 2.3789,
"step": 4425
},
{
"epoch": 0.089,
"grad_norm": 0.5683354037993711,
"learning_rate": 8.898000000000002e-06,
"loss": 2.3739,
"step": 4450
},
{
"epoch": 0.0895,
"grad_norm": 0.5992802333814672,
"learning_rate": 8.948000000000001e-06,
"loss": 2.3805,
"step": 4475
},
{
"epoch": 0.09,
"grad_norm": 0.5951158771681028,
"learning_rate": 8.998000000000001e-06,
"loss": 2.3702,
"step": 4500
},
{
"epoch": 0.09,
"eval_loss": 2.432904005050659,
"eval_runtime": 264.0927,
"eval_samples_per_second": 3.113,
"eval_steps_per_second": 1.556,
"step": 4500
},
{
"epoch": 0.0905,
"grad_norm": 0.628437176595306,
"learning_rate": 9.048e-06,
"loss": 2.3705,
"step": 4525
},
{
"epoch": 0.091,
"grad_norm": 0.5852194468933433,
"learning_rate": 9.098000000000002e-06,
"loss": 2.3726,
"step": 4550
},
{
"epoch": 0.0915,
"grad_norm": 0.5832814461503186,
"learning_rate": 9.148e-06,
"loss": 2.3709,
"step": 4575
},
{
"epoch": 0.092,
"grad_norm": 0.6235298544634128,
"learning_rate": 9.198e-06,
"loss": 2.3823,
"step": 4600
},
{
"epoch": 0.092,
"eval_loss": 2.433288335800171,
"eval_runtime": 264.0394,
"eval_samples_per_second": 3.113,
"eval_steps_per_second": 1.557,
"step": 4600
},
{
"epoch": 0.0925,
"grad_norm": 0.6097464410099737,
"learning_rate": 9.248e-06,
"loss": 2.3715,
"step": 4625
},
{
"epoch": 0.093,
"grad_norm": 0.5830918527201829,
"learning_rate": 9.298e-06,
"loss": 2.3694,
"step": 4650
},
{
"epoch": 0.0935,
"grad_norm": 0.6195865573807103,
"learning_rate": 9.348000000000001e-06,
"loss": 2.3711,
"step": 4675
},
{
"epoch": 0.094,
"grad_norm": 0.5922485886549429,
"learning_rate": 9.398e-06,
"loss": 2.3764,
"step": 4700
},
{
"epoch": 0.094,
"eval_loss": 2.4330477714538574,
"eval_runtime": 263.7501,
"eval_samples_per_second": 3.117,
"eval_steps_per_second": 1.558,
"step": 4700
},
{
"epoch": 0.0945,
"grad_norm": 0.5909566806378528,
"learning_rate": 9.448e-06,
"loss": 2.3799,
"step": 4725
},
{
"epoch": 0.095,
"grad_norm": 0.5872189964007283,
"learning_rate": 9.498000000000001e-06,
"loss": 2.3737,
"step": 4750
},
{
"epoch": 0.0955,
"grad_norm": 0.6071714619656263,
"learning_rate": 9.548e-06,
"loss": 2.3789,
"step": 4775
},
{
"epoch": 0.096,
"grad_norm": 0.5631342344537085,
"learning_rate": 9.598e-06,
"loss": 2.3641,
"step": 4800
},
{
"epoch": 0.096,
"eval_loss": 2.4332797527313232,
"eval_runtime": 264.5164,
"eval_samples_per_second": 3.108,
"eval_steps_per_second": 1.554,
"step": 4800
},
{
"epoch": 0.0965,
"grad_norm": 0.600707218384485,
"learning_rate": 9.648000000000001e-06,
"loss": 2.3715,
"step": 4825
},
{
"epoch": 0.097,
"grad_norm": 0.5705494762785608,
"learning_rate": 9.698000000000001e-06,
"loss": 2.3741,
"step": 4850
},
{
"epoch": 0.0975,
"grad_norm": 0.5891811727113021,
"learning_rate": 9.748e-06,
"loss": 2.3738,
"step": 4875
},
{
"epoch": 0.098,
"grad_norm": 0.5947555260131183,
"learning_rate": 9.798e-06,
"loss": 2.365,
"step": 4900
},
{
"epoch": 0.098,
"eval_loss": 2.433032751083374,
"eval_runtime": 264.6355,
"eval_samples_per_second": 3.106,
"eval_steps_per_second": 1.553,
"step": 4900
},
{
"epoch": 0.0985,
"grad_norm": 0.6055417663185935,
"learning_rate": 9.848000000000001e-06,
"loss": 2.3677,
"step": 4925
},
{
"epoch": 0.099,
"grad_norm": 0.5803464068069174,
"learning_rate": 9.898e-06,
"loss": 2.3699,
"step": 4950
},
{
"epoch": 0.0995,
"grad_norm": 0.5899201870269601,
"learning_rate": 9.948e-06,
"loss": 2.3685,
"step": 4975
},
{
"epoch": 0.1,
"grad_norm": 0.6226759838202708,
"learning_rate": 9.998000000000002e-06,
"loss": 2.3599,
"step": 5000
},
{
"epoch": 0.1,
"eval_loss": 2.433412551879883,
"eval_runtime": 279.6783,
"eval_samples_per_second": 2.939,
"eval_steps_per_second": 1.47,
"step": 5000
},
{
"epoch": 0.1005,
"grad_norm": 0.6129345554278736,
"learning_rate": 9.994666666666668e-06,
"loss": 2.3651,
"step": 5025
},
{
"epoch": 0.101,
"grad_norm": 0.5783687106202524,
"learning_rate": 9.989111111111111e-06,
"loss": 2.3635,
"step": 5050
},
{
"epoch": 0.1015,
"grad_norm": 0.7886759246703615,
"learning_rate": 9.983555555555556e-06,
"loss": 2.3688,
"step": 5075
},
{
"epoch": 0.102,
"grad_norm": 0.5496276670344779,
"learning_rate": 9.978000000000002e-06,
"loss": 2.3718,
"step": 5100
},
{
"epoch": 0.102,
"eval_loss": 2.4336636066436768,
"eval_runtime": 264.0531,
"eval_samples_per_second": 3.113,
"eval_steps_per_second": 1.557,
"step": 5100
},
{
"epoch": 0.1025,
"grad_norm": 0.596488402670124,
"learning_rate": 9.972444444444445e-06,
"loss": 2.3654,
"step": 5125
},
{
"epoch": 0.103,
"grad_norm": 0.5758952191659142,
"learning_rate": 9.966888888888889e-06,
"loss": 2.3662,
"step": 5150
},
{
"epoch": 0.1035,
"grad_norm": 0.5714325894660194,
"learning_rate": 9.961333333333334e-06,
"loss": 2.3671,
"step": 5175
},
{
"epoch": 0.104,
"grad_norm": 0.5826964477363549,
"learning_rate": 9.95577777777778e-06,
"loss": 2.3621,
"step": 5200
},
{
"epoch": 0.104,
"eval_loss": 2.433170795440674,
"eval_runtime": 263.4913,
"eval_samples_per_second": 3.12,
"eval_steps_per_second": 1.56,
"step": 5200
},
{
"epoch": 0.1045,
"grad_norm": 0.5939017286545814,
"learning_rate": 9.950222222222223e-06,
"loss": 2.3704,
"step": 5225
},
{
"epoch": 0.105,
"grad_norm": 0.5916137818576529,
"learning_rate": 9.944666666666668e-06,
"loss": 2.3662,
"step": 5250
},
{
"epoch": 0.1055,
"grad_norm": 0.6105360548349205,
"learning_rate": 9.939111111111112e-06,
"loss": 2.3646,
"step": 5275
},
{
"epoch": 0.106,
"grad_norm": 0.5821955662592928,
"learning_rate": 9.933555555555557e-06,
"loss": 2.365,
"step": 5300
},
{
"epoch": 0.106,
"eval_loss": 2.4327642917633057,
"eval_runtime": 263.745,
"eval_samples_per_second": 3.117,
"eval_steps_per_second": 1.558,
"step": 5300
},
{
"epoch": 0.1065,
"grad_norm": 0.5805717889494187,
"learning_rate": 9.928e-06,
"loss": 2.364,
"step": 5325
},
{
"epoch": 0.107,
"grad_norm": 0.5876895049794754,
"learning_rate": 9.922444444444446e-06,
"loss": 2.362,
"step": 5350
},
{
"epoch": 0.1075,
"grad_norm": 0.6258383766876349,
"learning_rate": 9.91688888888889e-06,
"loss": 2.3654,
"step": 5375
},
{
"epoch": 0.108,
"grad_norm": 0.5963835367877209,
"learning_rate": 9.911333333333335e-06,
"loss": 2.3627,
"step": 5400
},
{
"epoch": 0.108,
"eval_loss": 2.4326930046081543,
"eval_runtime": 263.2366,
"eval_samples_per_second": 3.123,
"eval_steps_per_second": 1.561,
"step": 5400
},
{
"epoch": 0.1085,
"grad_norm": 0.5827253994353866,
"learning_rate": 9.905777777777778e-06,
"loss": 2.3703,
"step": 5425
},
{
"epoch": 0.109,
"grad_norm": 0.571031920084426,
"learning_rate": 9.900222222222223e-06,
"loss": 2.3671,
"step": 5450
},
{
"epoch": 0.1095,
"grad_norm": 0.599548806743577,
"learning_rate": 9.894666666666669e-06,
"loss": 2.362,
"step": 5475
},
{
"epoch": 0.11,
"grad_norm": 0.5736311725646083,
"learning_rate": 9.889111111111112e-06,
"loss": 2.3622,
"step": 5500
},
{
"epoch": 0.11,
"eval_loss": 2.4330084323883057,
"eval_runtime": 264.1044,
"eval_samples_per_second": 3.112,
"eval_steps_per_second": 1.556,
"step": 5500
},
{
"epoch": 0.1105,
"grad_norm": 0.6098672058792028,
"learning_rate": 9.883555555555556e-06,
"loss": 2.3705,
"step": 5525
},
{
"epoch": 0.111,
"grad_norm": 0.5761728375832208,
"learning_rate": 9.878000000000001e-06,
"loss": 2.3608,
"step": 5550
},
{
"epoch": 0.1115,
"grad_norm": 0.5922504560114277,
"learning_rate": 9.872444444444446e-06,
"loss": 2.3542,
"step": 5575
},
{
"epoch": 0.112,
"grad_norm": 0.5668795024079605,
"learning_rate": 9.86688888888889e-06,
"loss": 2.3623,
"step": 5600
},
{
"epoch": 0.112,
"eval_loss": 2.432955503463745,
"eval_runtime": 263.8097,
"eval_samples_per_second": 3.116,
"eval_steps_per_second": 1.558,
"step": 5600
},
{
"epoch": 0.1125,
"grad_norm": 0.5697809034851604,
"learning_rate": 9.861333333333333e-06,
"loss": 2.3541,
"step": 5625
},
{
"epoch": 0.113,
"grad_norm": 0.5740407982821335,
"learning_rate": 9.855777777777779e-06,
"loss": 2.3594,
"step": 5650
},
{
"epoch": 0.1135,
"grad_norm": 0.5697372211616294,
"learning_rate": 9.850222222222224e-06,
"loss": 2.3592,
"step": 5675
},
{
"epoch": 0.114,
"grad_norm": 0.5845230307189324,
"learning_rate": 9.844666666666667e-06,
"loss": 2.3456,
"step": 5700
},
{
"epoch": 0.114,
"eval_loss": 2.432389974594116,
"eval_runtime": 263.8043,
"eval_samples_per_second": 3.116,
"eval_steps_per_second": 1.558,
"step": 5700
},
{
"epoch": 0.1145,
"grad_norm": 0.5677067211464538,
"learning_rate": 9.839111111111111e-06,
"loss": 2.3581,
"step": 5725
},
{
"epoch": 0.115,
"grad_norm": 0.6024564908699644,
"learning_rate": 9.833555555555556e-06,
"loss": 2.359,
"step": 5750
},
{
"epoch": 0.1155,
"grad_norm": 0.5789830837760237,
"learning_rate": 9.828000000000001e-06,
"loss": 2.36,
"step": 5775
},
{
"epoch": 0.116,
"grad_norm": 0.5912805339254935,
"learning_rate": 9.822444444444445e-06,
"loss": 2.3588,
"step": 5800
},
{
"epoch": 0.116,
"eval_loss": 2.432565689086914,
"eval_runtime": 263.3515,
"eval_samples_per_second": 3.121,
"eval_steps_per_second": 1.561,
"step": 5800
},
{
"epoch": 0.1165,
"grad_norm": 0.5647440650976697,
"learning_rate": 9.81688888888889e-06,
"loss": 2.3576,
"step": 5825
},
{
"epoch": 0.117,
"grad_norm": 0.5673458673735715,
"learning_rate": 9.811333333333334e-06,
"loss": 2.3616,
"step": 5850
},
{
"epoch": 0.1175,
"grad_norm": 0.6030082642745155,
"learning_rate": 9.805777777777779e-06,
"loss": 2.3556,
"step": 5875
},
{
"epoch": 0.118,
"grad_norm": 0.5571893163840321,
"learning_rate": 9.800222222222223e-06,
"loss": 2.3557,
"step": 5900
},
{
"epoch": 0.118,
"eval_loss": 2.4327075481414795,
"eval_runtime": 263.2657,
"eval_samples_per_second": 3.122,
"eval_steps_per_second": 1.561,
"step": 5900
},
{
"epoch": 0.1185,
"grad_norm": 0.5716010515949606,
"learning_rate": 9.794666666666668e-06,
"loss": 2.3616,
"step": 5925
},
{
"epoch": 0.119,
"grad_norm": 0.6245053681878497,
"learning_rate": 9.789111111111111e-06,
"loss": 2.358,
"step": 5950
},
{
"epoch": 0.1195,
"grad_norm": 0.5896528100704728,
"learning_rate": 9.783555555555557e-06,
"loss": 2.355,
"step": 5975
},
{
"epoch": 0.12,
"grad_norm": 0.5534590488643797,
"learning_rate": 9.778e-06,
"loss": 2.3567,
"step": 6000
},
{
"epoch": 0.12,
"eval_loss": 2.4327354431152344,
"eval_runtime": 263.9156,
"eval_samples_per_second": 3.115,
"eval_steps_per_second": 1.557,
"step": 6000
},
{
"epoch": 0.1205,
"grad_norm": 0.5779403883996491,
"learning_rate": 9.772444444444445e-06,
"loss": 2.3487,
"step": 6025
},
{
"epoch": 0.121,
"grad_norm": 0.5693494880188505,
"learning_rate": 9.76688888888889e-06,
"loss": 2.3506,
"step": 6050
},
{
"epoch": 0.1215,
"grad_norm": 0.5864069751838692,
"learning_rate": 9.761333333333334e-06,
"loss": 2.3498,
"step": 6075
},
{
"epoch": 0.122,
"grad_norm": 0.5930208676954954,
"learning_rate": 9.755777777777778e-06,
"loss": 2.3508,
"step": 6100
},
{
"epoch": 0.122,
"eval_loss": 2.432914972305298,
"eval_runtime": 263.746,
"eval_samples_per_second": 3.117,
"eval_steps_per_second": 1.558,
"step": 6100
},
{
"epoch": 0.1225,
"grad_norm": 0.5967532601446782,
"learning_rate": 9.750222222222223e-06,
"loss": 2.3584,
"step": 6125
},
{
"epoch": 0.123,
"grad_norm": 0.5670429310236035,
"learning_rate": 9.744666666666668e-06,
"loss": 2.3584,
"step": 6150
},
{
"epoch": 0.1235,
"grad_norm": 0.5744482242457726,
"learning_rate": 9.739111111111112e-06,
"loss": 2.351,
"step": 6175
},
{
"epoch": 0.124,
"grad_norm": 0.6029007635970692,
"learning_rate": 9.733555555555555e-06,
"loss": 2.3494,
"step": 6200
},
{
"epoch": 0.124,
"eval_loss": 2.432878255844116,
"eval_runtime": 263.5842,
"eval_samples_per_second": 3.119,
"eval_steps_per_second": 1.559,
"step": 6200
},
{
"epoch": 0.1245,
"grad_norm": 0.564399310279196,
"learning_rate": 9.728e-06,
"loss": 2.3595,
"step": 6225
},
{
"epoch": 0.125,
"grad_norm": 0.6065670221926927,
"learning_rate": 9.722444444444446e-06,
"loss": 2.3547,
"step": 6250
},
{
"epoch": 0.1255,
"grad_norm": 0.5659801132085207,
"learning_rate": 9.71688888888889e-06,
"loss": 2.3511,
"step": 6275
},
{
"epoch": 0.126,
"grad_norm": 0.5837628069797915,
"learning_rate": 9.711333333333333e-06,
"loss": 2.3575,
"step": 6300
},
{
"epoch": 0.126,
"eval_loss": 2.4329097270965576,
"eval_runtime": 264.6192,
"eval_samples_per_second": 3.106,
"eval_steps_per_second": 1.553,
"step": 6300
},
{
"epoch": 0.1265,
"grad_norm": 0.5760319910919499,
"learning_rate": 9.705777777777778e-06,
"loss": 2.3488,
"step": 6325
},
{
"epoch": 0.127,
"grad_norm": 0.5761318046315628,
"learning_rate": 9.700222222222224e-06,
"loss": 2.3435,
"step": 6350
},
{
"epoch": 0.1275,
"grad_norm": 0.5609369346838009,
"learning_rate": 9.694666666666667e-06,
"loss": 2.347,
"step": 6375
},
{
"epoch": 0.128,
"grad_norm": 0.5954461846572633,
"learning_rate": 9.68911111111111e-06,
"loss": 2.3485,
"step": 6400
},
{
"epoch": 0.128,
"eval_loss": 2.4333934783935547,
"eval_runtime": 263.5903,
"eval_samples_per_second": 3.118,
"eval_steps_per_second": 1.559,
"step": 6400
},
{
"epoch": 0.1285,
"grad_norm": 0.5524126786458765,
"learning_rate": 9.683555555555556e-06,
"loss": 2.3514,
"step": 6425
},
{
"epoch": 0.129,
"grad_norm": 0.5590067107241867,
"learning_rate": 9.678000000000001e-06,
"loss": 2.3477,
"step": 6450
},
{
"epoch": 0.1295,
"grad_norm": 0.5578028236930622,
"learning_rate": 9.672444444444445e-06,
"loss": 2.3434,
"step": 6475
},
{
"epoch": 0.13,
"grad_norm": 0.6002389478119885,
"learning_rate": 9.66688888888889e-06,
"loss": 2.3415,
"step": 6500
},
{
"epoch": 0.13,
"eval_loss": 2.433302164077759,
"eval_runtime": 263.4334,
"eval_samples_per_second": 3.12,
"eval_steps_per_second": 1.56,
"step": 6500
},
{
"epoch": 0.1305,
"grad_norm": 0.5868647352323021,
"learning_rate": 9.661333333333334e-06,
"loss": 2.3532,
"step": 6525
},
{
"epoch": 0.131,
"grad_norm": 0.5525203092071236,
"learning_rate": 9.655777777777779e-06,
"loss": 2.3439,
"step": 6550
},
{
"epoch": 0.1315,
"grad_norm": 0.642282300647443,
"learning_rate": 9.650222222222222e-06,
"loss": 2.333,
"step": 6575
},
{
"epoch": 0.132,
"grad_norm": 0.5954691746571129,
"learning_rate": 9.644666666666668e-06,
"loss": 2.3371,
"step": 6600
},
{
"epoch": 0.132,
"eval_loss": 2.4332070350646973,
"eval_runtime": 263.9928,
"eval_samples_per_second": 3.114,
"eval_steps_per_second": 1.557,
"step": 6600
},
{
"epoch": 0.1325,
"grad_norm": 0.5696322215994257,
"learning_rate": 9.639111111111113e-06,
"loss": 2.3568,
"step": 6625
},
{
"epoch": 0.133,
"grad_norm": 0.569783318316734,
"learning_rate": 9.633555555555556e-06,
"loss": 2.3468,
"step": 6650
},
{
"epoch": 0.1335,
"grad_norm": 0.5974477984803339,
"learning_rate": 9.628e-06,
"loss": 2.3369,
"step": 6675
},
{
"epoch": 0.134,
"grad_norm": 0.5850514409957908,
"learning_rate": 9.622444444444445e-06,
"loss": 2.3328,
"step": 6700
},
{
"epoch": 0.134,
"eval_loss": 2.4336042404174805,
"eval_runtime": 264.1653,
"eval_samples_per_second": 3.112,
"eval_steps_per_second": 1.556,
"step": 6700
},
{
"epoch": 0.1345,
"grad_norm": 0.5598567946533984,
"learning_rate": 9.61688888888889e-06,
"loss": 2.3505,
"step": 6725
},
{
"epoch": 0.135,
"grad_norm": 0.564538169627995,
"learning_rate": 9.611333333333334e-06,
"loss": 2.3512,
"step": 6750
},
{
"epoch": 0.1355,
"grad_norm": 0.555057205811747,
"learning_rate": 9.605777777777778e-06,
"loss": 2.3441,
"step": 6775
},
{
"epoch": 0.136,
"grad_norm": 0.5928392878820046,
"learning_rate": 9.600222222222223e-06,
"loss": 2.342,
"step": 6800
},
{
"epoch": 0.136,
"eval_loss": 2.4332380294799805,
"eval_runtime": 263.6981,
"eval_samples_per_second": 3.117,
"eval_steps_per_second": 1.559,
"step": 6800
},
{
"epoch": 0.1365,
"grad_norm": 0.580747535991996,
"learning_rate": 9.594666666666668e-06,
"loss": 2.3402,
"step": 6825
},
{
"epoch": 0.137,
"grad_norm": 0.5361093856752921,
"learning_rate": 9.589111111111112e-06,
"loss": 2.3345,
"step": 6850
},
{
"epoch": 0.1375,
"grad_norm": 0.5764684974648585,
"learning_rate": 9.583555555555555e-06,
"loss": 2.3434,
"step": 6875
},
{
"epoch": 0.138,
"grad_norm": 0.5695437902803252,
"learning_rate": 9.578e-06,
"loss": 2.3345,
"step": 6900
},
{
"epoch": 0.138,
"eval_loss": 2.4334897994995117,
"eval_runtime": 263.9042,
"eval_samples_per_second": 3.115,
"eval_steps_per_second": 1.557,
"step": 6900
},
{
"epoch": 0.1385,
"grad_norm": 0.5856816810807355,
"learning_rate": 9.572444444444446e-06,
"loss": 2.3344,
"step": 6925
},
{
"epoch": 0.139,
"grad_norm": 0.5692161417871612,
"learning_rate": 9.56688888888889e-06,
"loss": 2.3492,
"step": 6950
},
{
"epoch": 0.1395,
"grad_norm": 0.5782790626699041,
"learning_rate": 9.561333333333333e-06,
"loss": 2.3343,
"step": 6975
},
{
"epoch": 0.14,
"grad_norm": 0.5592348825440727,
"learning_rate": 9.555777777777778e-06,
"loss": 2.3361,
"step": 7000
},
{
"epoch": 0.14,
"eval_loss": 2.4338128566741943,
"eval_runtime": 264.0278,
"eval_samples_per_second": 3.113,
"eval_steps_per_second": 1.557,
"step": 7000
},
{
"epoch": 0.1405,
"grad_norm": 0.5810855929853301,
"learning_rate": 9.550222222222223e-06,
"loss": 2.3397,
"step": 7025
},
{
"epoch": 0.141,
"grad_norm": 0.5672444444354668,
"learning_rate": 9.544666666666667e-06,
"loss": 2.3384,
"step": 7050
},
{
"epoch": 0.1415,
"grad_norm": 0.649461804794621,
"learning_rate": 9.539111111111112e-06,
"loss": 2.3384,
"step": 7075
},
{
"epoch": 0.142,
"grad_norm": 0.5697893925017475,
"learning_rate": 9.533555555555556e-06,
"loss": 2.3415,
"step": 7100
},
{
"epoch": 0.142,
"eval_loss": 2.4329330921173096,
"eval_runtime": 263.8408,
"eval_samples_per_second": 3.116,
"eval_steps_per_second": 1.558,
"step": 7100
},
{
"epoch": 0.1425,
"grad_norm": 0.562192662676289,
"learning_rate": 9.528000000000001e-06,
"loss": 2.3381,
"step": 7125
},
{
"epoch": 0.143,
"grad_norm": 0.5782927675061864,
"learning_rate": 9.522444444444444e-06,
"loss": 2.3316,
"step": 7150
},
{
"epoch": 0.1435,
"grad_norm": 0.5470889439002048,
"learning_rate": 9.51688888888889e-06,
"loss": 2.3336,
"step": 7175
},
{
"epoch": 0.144,
"grad_norm": 0.5732687375919955,
"learning_rate": 9.511333333333335e-06,
"loss": 2.3302,
"step": 7200
},
{
"epoch": 0.144,
"eval_loss": 2.4339091777801514,
"eval_runtime": 265.4685,
"eval_samples_per_second": 3.096,
"eval_steps_per_second": 1.548,
"step": 7200
},
{
"epoch": 0.1445,
"grad_norm": 0.5552677779418167,
"learning_rate": 9.505777777777779e-06,
"loss": 2.3382,
"step": 7225
},
{
"epoch": 0.145,
"grad_norm": 0.5597695533114173,
"learning_rate": 9.500222222222222e-06,
"loss": 2.3281,
"step": 7250
},
{
"epoch": 0.1455,
"grad_norm": 0.586047229250587,
"learning_rate": 9.494666666666667e-06,
"loss": 2.3365,
"step": 7275
},
{
"epoch": 0.146,
"grad_norm": 0.5631697021330876,
"learning_rate": 9.489111111111113e-06,
"loss": 2.3434,
"step": 7300
},
{
"epoch": 0.146,
"eval_loss": 2.4337289333343506,
"eval_runtime": 264.0121,
"eval_samples_per_second": 3.113,
"eval_steps_per_second": 1.557,
"step": 7300
},
{
"epoch": 0.1465,
"grad_norm": 0.5787283610065107,
"learning_rate": 9.483555555555556e-06,
"loss": 2.3385,
"step": 7325
},
{
"epoch": 0.147,
"grad_norm": 0.5894250508009748,
"learning_rate": 9.478e-06,
"loss": 2.3289,
"step": 7350
},
{
"epoch": 0.1475,
"grad_norm": 0.5698558287850775,
"learning_rate": 9.472444444444445e-06,
"loss": 2.3363,
"step": 7375
},
{
"epoch": 0.148,
"grad_norm": 0.5704695535231787,
"learning_rate": 9.46688888888889e-06,
"loss": 2.3245,
"step": 7400
},
{
"epoch": 0.148,
"eval_loss": 2.4338371753692627,
"eval_runtime": 264.1068,
"eval_samples_per_second": 3.112,
"eval_steps_per_second": 1.556,
"step": 7400
},
{
"epoch": 0.1485,
"grad_norm": 0.5452782996001769,
"learning_rate": 9.461333333333334e-06,
"loss": 2.3442,
"step": 7425
},
{
"epoch": 0.149,
"grad_norm": 0.5741037001956839,
"learning_rate": 9.455777777777777e-06,
"loss": 2.3349,
"step": 7450
},
{
"epoch": 0.1495,
"grad_norm": 0.5570524045425876,
"learning_rate": 9.450222222222223e-06,
"loss": 2.3324,
"step": 7475
},
{
"epoch": 0.15,
"grad_norm": 0.5701333037498688,
"learning_rate": 9.444666666666668e-06,
"loss": 2.3268,
"step": 7500
},
{
"epoch": 0.15,
"eval_loss": 2.4347753524780273,
"eval_runtime": 264.1822,
"eval_samples_per_second": 3.111,
"eval_steps_per_second": 1.556,
"step": 7500
},
{
"epoch": 0.1505,
"grad_norm": 0.5636194713998469,
"learning_rate": 9.439111111111111e-06,
"loss": 2.3324,
"step": 7525
},
{
"epoch": 0.151,
"grad_norm": 0.5745462812172999,
"learning_rate": 9.433555555555557e-06,
"loss": 2.3438,
"step": 7550
},
{
"epoch": 0.1515,
"grad_norm": 0.5658180287749817,
"learning_rate": 9.428e-06,
"loss": 2.3272,
"step": 7575
},
{
"epoch": 0.152,
"grad_norm": 0.5590021944536283,
"learning_rate": 9.422444444444445e-06,
"loss": 2.3379,
"step": 7600
},
{
"epoch": 0.152,
"eval_loss": 2.43342924118042,
"eval_runtime": 264.6073,
"eval_samples_per_second": 3.106,
"eval_steps_per_second": 1.553,
"step": 7600
},
{
"epoch": 0.1525,
"grad_norm": 0.5756847823781959,
"learning_rate": 9.41688888888889e-06,
"loss": 2.3291,
"step": 7625
},
{
"epoch": 0.153,
"grad_norm": 0.5614727649452073,
"learning_rate": 9.411333333333334e-06,
"loss": 2.3164,
"step": 7650
},
{
"epoch": 0.1535,
"grad_norm": 0.581410678990456,
"learning_rate": 9.405777777777778e-06,
"loss": 2.3205,
"step": 7675
},
{
"epoch": 0.154,
"grad_norm": 0.6063515370764081,
"learning_rate": 9.400222222222223e-06,
"loss": 2.3331,
"step": 7700
},
{
"epoch": 0.154,
"eval_loss": 2.435711622238159,
"eval_runtime": 283.6724,
"eval_samples_per_second": 2.898,
"eval_steps_per_second": 1.449,
"step": 7700
},
{
"epoch": 0.1545,
"grad_norm": 0.5535459156675728,
"learning_rate": 9.394666666666668e-06,
"loss": 2.3312,
"step": 7725
},
{
"epoch": 0.155,
"grad_norm": 0.5550223235337549,
"learning_rate": 9.389111111111112e-06,
"loss": 2.3222,
"step": 7750
},
{
"epoch": 0.1555,
"grad_norm": 0.5661396564004607,
"learning_rate": 9.383555555555557e-06,
"loss": 2.329,
"step": 7775
},
{
"epoch": 0.156,
"grad_norm": 0.5754229466302317,
"learning_rate": 9.378e-06,
"loss": 2.3375,
"step": 7800
},
{
"epoch": 0.156,
"eval_loss": 2.4339263439178467,
"eval_runtime": 263.7245,
"eval_samples_per_second": 3.117,
"eval_steps_per_second": 1.558,
"step": 7800
},
{
"epoch": 0.1565,
"grad_norm": 0.5922113870936093,
"learning_rate": 9.372444444444446e-06,
"loss": 2.3326,
"step": 7825
},
{
"epoch": 0.157,
"grad_norm": 0.5802231546249389,
"learning_rate": 9.36688888888889e-06,
"loss": 2.3313,
"step": 7850
},
{
"epoch": 0.1575,
"grad_norm": 0.5613750089293277,
"learning_rate": 9.361333333333335e-06,
"loss": 2.3306,
"step": 7875
},
{
"epoch": 0.158,
"grad_norm": 0.5554952690049914,
"learning_rate": 9.355777777777778e-06,
"loss": 2.3307,
"step": 7900
},
{
"epoch": 0.158,
"eval_loss": 2.435500144958496,
"eval_runtime": 268.1064,
"eval_samples_per_second": 3.066,
"eval_steps_per_second": 1.533,
"step": 7900
},
{
"epoch": 0.1585,
"grad_norm": 0.5699743157285643,
"learning_rate": 9.350222222222224e-06,
"loss": 2.3274,
"step": 7925
},
{
"epoch": 0.159,
"grad_norm": 0.580771514541295,
"learning_rate": 9.344666666666667e-06,
"loss": 2.3238,
"step": 7950
},
{
"epoch": 0.1595,
"grad_norm": 0.563419791930312,
"learning_rate": 9.339111111111112e-06,
"loss": 2.3384,
"step": 7975
},
{
"epoch": 0.16,
"grad_norm": 0.5793778749938447,
"learning_rate": 9.333555555555558e-06,
"loss": 2.3291,
"step": 8000
},
{
"epoch": 0.16,
"eval_loss": 2.4343531131744385,
"eval_runtime": 263.9111,
"eval_samples_per_second": 3.115,
"eval_steps_per_second": 1.557,
"step": 8000
},
{
"epoch": 0.1605,
"grad_norm": 0.5748501940226582,
"learning_rate": 9.328000000000001e-06,
"loss": 2.3272,
"step": 8025
},
{
"epoch": 0.161,
"grad_norm": 0.5776520997935511,
"learning_rate": 9.322444444444445e-06,
"loss": 2.3232,
"step": 8050
},
{
"epoch": 0.1615,
"grad_norm": 0.5841162716826148,
"learning_rate": 9.31688888888889e-06,
"loss": 2.3252,
"step": 8075
},
{
"epoch": 0.162,
"grad_norm": 0.5582161918345583,
"learning_rate": 9.311333333333335e-06,
"loss": 2.3254,
"step": 8100
},
{
"epoch": 0.162,
"eval_loss": 2.4345877170562744,
"eval_runtime": 263.9792,
"eval_samples_per_second": 3.114,
"eval_steps_per_second": 1.557,
"step": 8100
},
{
"epoch": 0.1625,
"grad_norm": 0.5744381110572562,
"learning_rate": 9.305777777777779e-06,
"loss": 2.325,
"step": 8125
},
{
"epoch": 0.163,
"grad_norm": 0.5801402993634438,
"learning_rate": 9.300222222222222e-06,
"loss": 2.3203,
"step": 8150
},
{
"epoch": 0.1635,
"grad_norm": 0.5644380448766211,
"learning_rate": 9.294666666666668e-06,
"loss": 2.3179,
"step": 8175
},
{
"epoch": 0.164,
"grad_norm": 0.5747041663572834,
"learning_rate": 9.289111111111113e-06,
"loss": 2.3241,
"step": 8200
},
{
"epoch": 0.164,
"eval_loss": 2.435701847076416,
"eval_runtime": 263.9699,
"eval_samples_per_second": 3.114,
"eval_steps_per_second": 1.557,
"step": 8200
},
{
"epoch": 0.1645,
"grad_norm": 0.5550631701119645,
"learning_rate": 9.283555555555556e-06,
"loss": 2.3176,
"step": 8225
},
{
"epoch": 0.165,
"grad_norm": 0.5828828542252756,
"learning_rate": 9.278e-06,
"loss": 2.3213,
"step": 8250
},
{
"epoch": 0.1655,
"grad_norm": 0.5610132600982978,
"learning_rate": 9.272444444444445e-06,
"loss": 2.3117,
"step": 8275
},
{
"epoch": 0.166,
"grad_norm": 0.5777357931804634,
"learning_rate": 9.26688888888889e-06,
"loss": 2.3189,
"step": 8300
},
{
"epoch": 0.166,
"eval_loss": 2.43573260307312,
"eval_runtime": 264.2018,
"eval_samples_per_second": 3.111,
"eval_steps_per_second": 1.556,
"step": 8300
},
{
"epoch": 0.1665,
"grad_norm": 0.5515402141694353,
"learning_rate": 9.261333333333334e-06,
"loss": 2.3267,
"step": 8325
},
{
"epoch": 0.167,
"grad_norm": 0.588745393922677,
"learning_rate": 9.25577777777778e-06,
"loss": 2.3219,
"step": 8350
},
{
"epoch": 0.1675,
"grad_norm": 0.5391388541771018,
"learning_rate": 9.250222222222223e-06,
"loss": 2.3181,
"step": 8375
},
{
"epoch": 0.168,
"grad_norm": 0.5680296112961243,
"learning_rate": 9.244666666666668e-06,
"loss": 2.3231,
"step": 8400
},
{
"epoch": 0.168,
"eval_loss": 2.435276985168457,
"eval_runtime": 263.8428,
"eval_samples_per_second": 3.115,
"eval_steps_per_second": 1.558,
"step": 8400
},
{
"epoch": 0.1685,
"grad_norm": 0.5655802530008279,
"learning_rate": 9.239111111111112e-06,
"loss": 2.3201,
"step": 8425
},
{
"epoch": 0.169,
"grad_norm": 0.5917481613153034,
"learning_rate": 9.233555555555557e-06,
"loss": 2.3184,
"step": 8450
},
{
"epoch": 0.1695,
"grad_norm": 0.5808853698441179,
"learning_rate": 9.228e-06,
"loss": 2.3151,
"step": 8475
},
{
"epoch": 0.17,
"grad_norm": 0.5868551530423814,
"learning_rate": 9.222444444444446e-06,
"loss": 2.3146,
"step": 8500
},
{
"epoch": 0.17,
"eval_loss": 2.435950994491577,
"eval_runtime": 264.3586,
"eval_samples_per_second": 3.109,
"eval_steps_per_second": 1.555,
"step": 8500
},
{
"epoch": 0.1705,
"grad_norm": 0.5638181149272796,
"learning_rate": 9.21688888888889e-06,
"loss": 2.3155,
"step": 8525
},
{
"epoch": 0.171,
"grad_norm": 0.5740285526813199,
"learning_rate": 9.211333333333334e-06,
"loss": 2.319,
"step": 8550
},
{
"epoch": 0.1715,
"grad_norm": 0.5695622395648989,
"learning_rate": 9.20577777777778e-06,
"loss": 2.3206,
"step": 8575
},
{
"epoch": 0.172,
"grad_norm": 0.5747463636735414,
"learning_rate": 9.200222222222223e-06,
"loss": 2.3111,
"step": 8600
},
{
"epoch": 0.172,
"eval_loss": 2.4367878437042236,
"eval_runtime": 264.2061,
"eval_samples_per_second": 3.111,
"eval_steps_per_second": 1.556,
"step": 8600
},
{
"epoch": 0.1725,
"grad_norm": 0.5777631704492084,
"learning_rate": 9.194666666666667e-06,
"loss": 2.3078,
"step": 8625
},
{
"epoch": 0.173,
"grad_norm": 0.5746886517313039,
"learning_rate": 9.189111111111112e-06,
"loss": 2.3152,
"step": 8650
},
{
"epoch": 0.1735,
"grad_norm": 0.564580351173264,
"learning_rate": 9.183555555555557e-06,
"loss": 2.316,
"step": 8675
},
{
"epoch": 0.174,
"grad_norm": 0.6048784393681501,
"learning_rate": 9.178000000000001e-06,
"loss": 2.3251,
"step": 8700
},
{
"epoch": 0.174,
"eval_loss": 2.435750722885132,
"eval_runtime": 264.296,
"eval_samples_per_second": 3.11,
"eval_steps_per_second": 1.555,
"step": 8700
},
{
"epoch": 0.1745,
"grad_norm": 0.5769443750882641,
"learning_rate": 9.172444444444444e-06,
"loss": 2.3186,
"step": 8725
},
{
"epoch": 0.175,
"grad_norm": 0.5792202067037501,
"learning_rate": 9.16688888888889e-06,
"loss": 2.3106,
"step": 8750
},
{
"epoch": 0.1755,
"grad_norm": 0.5819115394572557,
"learning_rate": 9.161333333333335e-06,
"loss": 2.3118,
"step": 8775
},
{
"epoch": 0.176,
"grad_norm": 0.575657270210696,
"learning_rate": 9.155777777777779e-06,
"loss": 2.3106,
"step": 8800
},
{
"epoch": 0.176,
"eval_loss": 2.436899185180664,
"eval_runtime": 263.9579,
"eval_samples_per_second": 3.114,
"eval_steps_per_second": 1.557,
"step": 8800
},
{
"epoch": 0.1765,
"grad_norm": 0.572118834452971,
"learning_rate": 9.150222222222222e-06,
"loss": 2.3139,
"step": 8825
},
{
"epoch": 0.177,
"grad_norm": 0.5812618278818413,
"learning_rate": 9.144666666666667e-06,
"loss": 2.319,
"step": 8850
},
{
"epoch": 0.1775,
"grad_norm": 0.5527533551295488,
"learning_rate": 9.139111111111113e-06,
"loss": 2.3152,
"step": 8875
},
{
"epoch": 0.178,
"grad_norm": 0.5749551425231054,
"learning_rate": 9.133555555555556e-06,
"loss": 2.3065,
"step": 8900
},
{
"epoch": 0.178,
"eval_loss": 2.4364571571350098,
"eval_runtime": 264.0259,
"eval_samples_per_second": 3.113,
"eval_steps_per_second": 1.557,
"step": 8900
},
{
"epoch": 0.1785,
"grad_norm": 0.5758182476998225,
"learning_rate": 9.128e-06,
"loss": 2.3104,
"step": 8925
},
{
"epoch": 0.179,
"grad_norm": 0.5922756280220078,
"learning_rate": 9.122444444444445e-06,
"loss": 2.3158,
"step": 8950
},
{
"epoch": 0.1795,
"grad_norm": 0.5943790910117238,
"learning_rate": 9.11688888888889e-06,
"loss": 2.3167,
"step": 8975
},
{
"epoch": 0.18,
"grad_norm": 0.580613992072982,
"learning_rate": 9.111333333333334e-06,
"loss": 2.3069,
"step": 9000
},
{
"epoch": 0.18,
"eval_loss": 2.436984062194824,
"eval_runtime": 264.2235,
"eval_samples_per_second": 3.111,
"eval_steps_per_second": 1.556,
"step": 9000
}
],
"logging_steps": 25,
"max_steps": 50000,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.8648820684944835e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}