molcrawl-compounds-bert-small / trainer_state.json
deskull's picture
Upload MolCrawl compounds BERT small model
90dd521 verified
{
"best_metric": 0.229187473654747,
"best_model_checkpoint": "learning_source_20260318/compounds/bert-output/compounds-small/checkpoint-61000",
"epoch": 3.7895791085917425,
"eval_steps": 100,
"global_step": 63000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006015204934272608,
"grad_norm": 4.738959312438965,
"learning_rate": 3e-06,
"loss": 4.8794,
"step": 100
},
{
"epoch": 0.006015204934272608,
"eval_loss": 3.196549892425537,
"eval_runtime": 21.7167,
"eval_samples_per_second": 460.474,
"eval_steps_per_second": 57.559,
"step": 100
},
{
"epoch": 0.012030409868545216,
"grad_norm": 3.5901083946228027,
"learning_rate": 6e-06,
"loss": 2.8953,
"step": 200
},
{
"epoch": 0.012030409868545216,
"eval_loss": 2.467365264892578,
"eval_runtime": 21.7318,
"eval_samples_per_second": 460.156,
"eval_steps_per_second": 57.52,
"step": 200
},
{
"epoch": 0.018045614802817824,
"grad_norm": 1.398197889328003,
"learning_rate": 5.998999666555519e-06,
"loss": 2.4113,
"step": 300
},
{
"epoch": 0.018045614802817824,
"eval_loss": 2.267258644104004,
"eval_runtime": 21.7416,
"eval_samples_per_second": 459.948,
"eval_steps_per_second": 57.493,
"step": 300
},
{
"epoch": 0.02406081973709043,
"grad_norm": 1.1230988502502441,
"learning_rate": 5.997999333111037e-06,
"loss": 2.2393,
"step": 400
},
{
"epoch": 0.02406081973709043,
"eval_loss": 2.1407406330108643,
"eval_runtime": 21.7301,
"eval_samples_per_second": 460.192,
"eval_steps_per_second": 57.524,
"step": 400
},
{
"epoch": 0.030076024671363038,
"grad_norm": 1.1243526935577393,
"learning_rate": 5.9969989996665554e-06,
"loss": 2.129,
"step": 500
},
{
"epoch": 0.030076024671363038,
"eval_loss": 2.0753207206726074,
"eval_runtime": 21.7547,
"eval_samples_per_second": 459.671,
"eval_steps_per_second": 57.459,
"step": 500
},
{
"epoch": 0.03609122960563565,
"grad_norm": 2.403114080429077,
"learning_rate": 5.995998666222074e-06,
"loss": 2.0746,
"step": 600
},
{
"epoch": 0.03609122960563565,
"eval_loss": 2.029200315475464,
"eval_runtime": 21.746,
"eval_samples_per_second": 459.855,
"eval_steps_per_second": 57.482,
"step": 600
},
{
"epoch": 0.042106434539908255,
"grad_norm": 1.4393417835235596,
"learning_rate": 5.994998332777593e-06,
"loss": 2.0389,
"step": 700
},
{
"epoch": 0.042106434539908255,
"eval_loss": 1.995573878288269,
"eval_runtime": 21.7364,
"eval_samples_per_second": 460.057,
"eval_steps_per_second": 57.507,
"step": 700
},
{
"epoch": 0.04812163947418086,
"grad_norm": 1.392498254776001,
"learning_rate": 5.9939979993331115e-06,
"loss": 1.9989,
"step": 800
},
{
"epoch": 0.04812163947418086,
"eval_loss": 1.9491331577301025,
"eval_runtime": 21.7276,
"eval_samples_per_second": 460.245,
"eval_steps_per_second": 57.531,
"step": 800
},
{
"epoch": 0.05413684440845347,
"grad_norm": 1.9008598327636719,
"learning_rate": 5.992997665888629e-06,
"loss": 1.9332,
"step": 900
},
{
"epoch": 0.05413684440845347,
"eval_loss": 1.8610961437225342,
"eval_runtime": 21.7345,
"eval_samples_per_second": 460.098,
"eval_steps_per_second": 57.512,
"step": 900
},
{
"epoch": 0.060152049342726076,
"grad_norm": 2.0319344997406006,
"learning_rate": 5.991997332444148e-06,
"loss": 1.8607,
"step": 1000
},
{
"epoch": 0.060152049342726076,
"eval_loss": 1.7824585437774658,
"eval_runtime": 21.7313,
"eval_samples_per_second": 460.167,
"eval_steps_per_second": 57.521,
"step": 1000
},
{
"epoch": 0.06616725427699868,
"grad_norm": 1.9651939868927002,
"learning_rate": 5.990996998999667e-06,
"loss": 1.8064,
"step": 1100
},
{
"epoch": 0.06616725427699868,
"eval_loss": 1.7280884981155396,
"eval_runtime": 21.7428,
"eval_samples_per_second": 459.923,
"eval_steps_per_second": 57.49,
"step": 1100
},
{
"epoch": 0.0721824592112713,
"grad_norm": 1.2767350673675537,
"learning_rate": 5.989996665555185e-06,
"loss": 1.7432,
"step": 1200
},
{
"epoch": 0.0721824592112713,
"eval_loss": 1.644949197769165,
"eval_runtime": 21.7259,
"eval_samples_per_second": 460.279,
"eval_steps_per_second": 57.535,
"step": 1200
},
{
"epoch": 0.0781976641455439,
"grad_norm": 1.3338353633880615,
"learning_rate": 5.988996332110703e-06,
"loss": 1.6816,
"step": 1300
},
{
"epoch": 0.0781976641455439,
"eval_loss": 1.5758517980575562,
"eval_runtime": 21.7331,
"eval_samples_per_second": 460.127,
"eval_steps_per_second": 57.516,
"step": 1300
},
{
"epoch": 0.08421286907981651,
"grad_norm": 1.5716562271118164,
"learning_rate": 5.987995998666222e-06,
"loss": 1.6209,
"step": 1400
},
{
"epoch": 0.08421286907981651,
"eval_loss": 1.506639003753662,
"eval_runtime": 21.74,
"eval_samples_per_second": 459.982,
"eval_steps_per_second": 57.498,
"step": 1400
},
{
"epoch": 0.09022807401408911,
"grad_norm": 1.4891563653945923,
"learning_rate": 5.986995665221741e-06,
"loss": 1.5562,
"step": 1500
},
{
"epoch": 0.09022807401408911,
"eval_loss": 1.4430310726165771,
"eval_runtime": 21.7254,
"eval_samples_per_second": 460.29,
"eval_steps_per_second": 57.536,
"step": 1500
},
{
"epoch": 0.09624327894836172,
"grad_norm": 1.6210014820098877,
"learning_rate": 5.9859953317772595e-06,
"loss": 1.5081,
"step": 1600
},
{
"epoch": 0.09624327894836172,
"eval_loss": 1.394563913345337,
"eval_runtime": 21.7318,
"eval_samples_per_second": 460.155,
"eval_steps_per_second": 57.519,
"step": 1600
},
{
"epoch": 0.10225848388263432,
"grad_norm": 2.3340542316436768,
"learning_rate": 5.984994998332777e-06,
"loss": 1.4674,
"step": 1700
},
{
"epoch": 0.10225848388263432,
"eval_loss": 1.3446385860443115,
"eval_runtime": 21.735,
"eval_samples_per_second": 460.088,
"eval_steps_per_second": 57.511,
"step": 1700
},
{
"epoch": 0.10827368881690694,
"grad_norm": 1.6647675037384033,
"learning_rate": 5.983994664888296e-06,
"loss": 1.424,
"step": 1800
},
{
"epoch": 0.10827368881690694,
"eval_loss": 1.3100253343582153,
"eval_runtime": 21.725,
"eval_samples_per_second": 460.299,
"eval_steps_per_second": 57.537,
"step": 1800
},
{
"epoch": 0.11428889375117954,
"grad_norm": 1.46592116355896,
"learning_rate": 5.982994331443815e-06,
"loss": 1.3892,
"step": 1900
},
{
"epoch": 0.11428889375117954,
"eval_loss": 1.2686667442321777,
"eval_runtime": 21.7512,
"eval_samples_per_second": 459.744,
"eval_steps_per_second": 57.468,
"step": 1900
},
{
"epoch": 0.12030409868545215,
"grad_norm": 1.8340036869049072,
"learning_rate": 5.981993997999333e-06,
"loss": 1.3564,
"step": 2000
},
{
"epoch": 0.12030409868545215,
"eval_loss": 1.2294234037399292,
"eval_runtime": 21.7966,
"eval_samples_per_second": 458.788,
"eval_steps_per_second": 57.348,
"step": 2000
},
{
"epoch": 0.12631930361972477,
"grad_norm": 1.5960652828216553,
"learning_rate": 5.980993664554851e-06,
"loss": 1.3285,
"step": 2100
},
{
"epoch": 0.12631930361972477,
"eval_loss": 1.2100887298583984,
"eval_runtime": 21.7436,
"eval_samples_per_second": 459.905,
"eval_steps_per_second": 57.488,
"step": 2100
},
{
"epoch": 0.13233450855399737,
"grad_norm": 1.8335785865783691,
"learning_rate": 5.979993331110371e-06,
"loss": 1.3001,
"step": 2200
},
{
"epoch": 0.13233450855399737,
"eval_loss": 1.1752792596817017,
"eval_runtime": 21.7453,
"eval_samples_per_second": 459.87,
"eval_steps_per_second": 57.484,
"step": 2200
},
{
"epoch": 0.13834971348826997,
"grad_norm": 1.612433671951294,
"learning_rate": 5.978992997665889e-06,
"loss": 1.2695,
"step": 2300
},
{
"epoch": 0.13834971348826997,
"eval_loss": 1.147255778312683,
"eval_runtime": 21.6924,
"eval_samples_per_second": 460.992,
"eval_steps_per_second": 57.624,
"step": 2300
},
{
"epoch": 0.1443649184225426,
"grad_norm": 1.5603346824645996,
"learning_rate": 5.9779926642214075e-06,
"loss": 1.2412,
"step": 2400
},
{
"epoch": 0.1443649184225426,
"eval_loss": 1.1130963563919067,
"eval_runtime": 21.7087,
"eval_samples_per_second": 460.645,
"eval_steps_per_second": 57.581,
"step": 2400
},
{
"epoch": 0.1503801233568152,
"grad_norm": 1.6393444538116455,
"learning_rate": 5.976992330776926e-06,
"loss": 1.2159,
"step": 2500
},
{
"epoch": 0.1503801233568152,
"eval_loss": 1.0844037532806396,
"eval_runtime": 21.7041,
"eval_samples_per_second": 460.743,
"eval_steps_per_second": 57.593,
"step": 2500
},
{
"epoch": 0.1563953282910878,
"grad_norm": 1.638340950012207,
"learning_rate": 5.975991997332444e-06,
"loss": 1.1898,
"step": 2600
},
{
"epoch": 0.1563953282910878,
"eval_loss": 1.0646270513534546,
"eval_runtime": 21.7166,
"eval_samples_per_second": 460.476,
"eval_steps_per_second": 57.56,
"step": 2600
},
{
"epoch": 0.1624105332253604,
"grad_norm": 1.745104432106018,
"learning_rate": 5.974991663887963e-06,
"loss": 1.1708,
"step": 2700
},
{
"epoch": 0.1624105332253604,
"eval_loss": 1.0485780239105225,
"eval_runtime": 21.7198,
"eval_samples_per_second": 460.41,
"eval_steps_per_second": 57.551,
"step": 2700
},
{
"epoch": 0.16842573815963302,
"grad_norm": 1.759570837020874,
"learning_rate": 5.973991330443481e-06,
"loss": 1.1522,
"step": 2800
},
{
"epoch": 0.16842573815963302,
"eval_loss": 1.0218431949615479,
"eval_runtime": 21.7241,
"eval_samples_per_second": 460.318,
"eval_steps_per_second": 57.54,
"step": 2800
},
{
"epoch": 0.17444094309390562,
"grad_norm": 1.76418936252594,
"learning_rate": 5.972990996999e-06,
"loss": 1.1218,
"step": 2900
},
{
"epoch": 0.17444094309390562,
"eval_loss": 1.0075299739837646,
"eval_runtime": 21.7441,
"eval_samples_per_second": 459.894,
"eval_steps_per_second": 57.487,
"step": 2900
},
{
"epoch": 0.18045614802817822,
"grad_norm": 1.7186238765716553,
"learning_rate": 5.971990663554519e-06,
"loss": 1.1074,
"step": 3000
},
{
"epoch": 0.18045614802817822,
"eval_loss": 0.9909061789512634,
"eval_runtime": 21.7393,
"eval_samples_per_second": 459.997,
"eval_steps_per_second": 57.5,
"step": 3000
},
{
"epoch": 0.18647135296245085,
"grad_norm": 1.6869324445724487,
"learning_rate": 5.970990330110037e-06,
"loss": 1.0871,
"step": 3100
},
{
"epoch": 0.18647135296245085,
"eval_loss": 0.965461254119873,
"eval_runtime": 21.7525,
"eval_samples_per_second": 459.717,
"eval_steps_per_second": 57.465,
"step": 3100
},
{
"epoch": 0.19248655789672345,
"grad_norm": 1.590827465057373,
"learning_rate": 5.9699899966655554e-06,
"loss": 1.0678,
"step": 3200
},
{
"epoch": 0.19248655789672345,
"eval_loss": 0.9502421617507935,
"eval_runtime": 21.7257,
"eval_samples_per_second": 460.284,
"eval_steps_per_second": 57.536,
"step": 3200
},
{
"epoch": 0.19850176283099605,
"grad_norm": 1.3480803966522217,
"learning_rate": 5.968989663221074e-06,
"loss": 1.05,
"step": 3300
},
{
"epoch": 0.19850176283099605,
"eval_loss": 0.9217738509178162,
"eval_runtime": 21.7126,
"eval_samples_per_second": 460.562,
"eval_steps_per_second": 57.57,
"step": 3300
},
{
"epoch": 0.20451696776526865,
"grad_norm": 1.611717700958252,
"learning_rate": 5.967989329776592e-06,
"loss": 1.0308,
"step": 3400
},
{
"epoch": 0.20451696776526865,
"eval_loss": 0.9114508628845215,
"eval_runtime": 21.727,
"eval_samples_per_second": 460.257,
"eval_steps_per_second": 57.532,
"step": 3400
},
{
"epoch": 0.21053217269954128,
"grad_norm": 1.424517035484314,
"learning_rate": 5.966988996332111e-06,
"loss": 1.0161,
"step": 3500
},
{
"epoch": 0.21053217269954128,
"eval_loss": 0.8955187797546387,
"eval_runtime": 21.7397,
"eval_samples_per_second": 459.988,
"eval_steps_per_second": 57.498,
"step": 3500
},
{
"epoch": 0.21654737763381388,
"grad_norm": 1.8415201902389526,
"learning_rate": 5.965988662887629e-06,
"loss": 0.9983,
"step": 3600
},
{
"epoch": 0.21654737763381388,
"eval_loss": 0.8804967999458313,
"eval_runtime": 21.7608,
"eval_samples_per_second": 459.541,
"eval_steps_per_second": 57.443,
"step": 3600
},
{
"epoch": 0.22256258256808648,
"grad_norm": 1.5056076049804688,
"learning_rate": 5.964988329443148e-06,
"loss": 0.9849,
"step": 3700
},
{
"epoch": 0.22256258256808648,
"eval_loss": 0.8613883852958679,
"eval_runtime": 21.7348,
"eval_samples_per_second": 460.091,
"eval_steps_per_second": 57.511,
"step": 3700
},
{
"epoch": 0.22857778750235908,
"grad_norm": 1.6334686279296875,
"learning_rate": 5.963987995998667e-06,
"loss": 0.9689,
"step": 3800
},
{
"epoch": 0.22857778750235908,
"eval_loss": 0.8555884957313538,
"eval_runtime": 21.7044,
"eval_samples_per_second": 460.736,
"eval_steps_per_second": 57.592,
"step": 3800
},
{
"epoch": 0.2345929924366317,
"grad_norm": 1.7393226623535156,
"learning_rate": 5.962987662554185e-06,
"loss": 0.9564,
"step": 3900
},
{
"epoch": 0.2345929924366317,
"eval_loss": 0.8427873849868774,
"eval_runtime": 21.7376,
"eval_samples_per_second": 460.033,
"eval_steps_per_second": 57.504,
"step": 3900
},
{
"epoch": 0.2406081973709043,
"grad_norm": 1.5030866861343384,
"learning_rate": 5.961987329109703e-06,
"loss": 0.9417,
"step": 4000
},
{
"epoch": 0.2406081973709043,
"eval_loss": 0.8300994038581848,
"eval_runtime": 21.7438,
"eval_samples_per_second": 459.902,
"eval_steps_per_second": 57.488,
"step": 4000
},
{
"epoch": 0.2466234023051769,
"grad_norm": 1.8627735376358032,
"learning_rate": 5.960986995665222e-06,
"loss": 0.9277,
"step": 4100
},
{
"epoch": 0.2466234023051769,
"eval_loss": 0.8120391368865967,
"eval_runtime": 21.755,
"eval_samples_per_second": 459.664,
"eval_steps_per_second": 57.458,
"step": 4100
},
{
"epoch": 0.25263860723944953,
"grad_norm": 1.5174646377563477,
"learning_rate": 5.95998666222074e-06,
"loss": 0.9123,
"step": 4200
},
{
"epoch": 0.25263860723944953,
"eval_loss": 0.7974905371665955,
"eval_runtime": 21.7219,
"eval_samples_per_second": 460.366,
"eval_steps_per_second": 57.546,
"step": 4200
},
{
"epoch": 0.25865381217372213,
"grad_norm": 1.354490041732788,
"learning_rate": 5.958986328776259e-06,
"loss": 0.9028,
"step": 4300
},
{
"epoch": 0.25865381217372213,
"eval_loss": 0.7938092947006226,
"eval_runtime": 21.7182,
"eval_samples_per_second": 460.444,
"eval_steps_per_second": 57.555,
"step": 4300
},
{
"epoch": 0.26466901710799473,
"grad_norm": 1.6153218746185303,
"learning_rate": 5.957985995331777e-06,
"loss": 0.8954,
"step": 4400
},
{
"epoch": 0.26466901710799473,
"eval_loss": 0.7785645723342896,
"eval_runtime": 21.7451,
"eval_samples_per_second": 459.873,
"eval_steps_per_second": 57.484,
"step": 4400
},
{
"epoch": 0.27068422204226733,
"grad_norm": 1.9774231910705566,
"learning_rate": 5.956985661887296e-06,
"loss": 0.8819,
"step": 4500
},
{
"epoch": 0.27068422204226733,
"eval_loss": 0.7742797136306763,
"eval_runtime": 21.7358,
"eval_samples_per_second": 460.07,
"eval_steps_per_second": 57.509,
"step": 4500
},
{
"epoch": 0.27669942697653993,
"grad_norm": 1.6561676263809204,
"learning_rate": 5.955985328442815e-06,
"loss": 0.8729,
"step": 4600
},
{
"epoch": 0.27669942697653993,
"eval_loss": 0.7637073397636414,
"eval_runtime": 21.7296,
"eval_samples_per_second": 460.202,
"eval_steps_per_second": 57.525,
"step": 4600
},
{
"epoch": 0.28271463191081253,
"grad_norm": 1.5622860193252563,
"learning_rate": 5.954984994998333e-06,
"loss": 0.8608,
"step": 4700
},
{
"epoch": 0.28271463191081253,
"eval_loss": 0.7628427743911743,
"eval_runtime": 21.7339,
"eval_samples_per_second": 460.111,
"eval_steps_per_second": 57.514,
"step": 4700
},
{
"epoch": 0.2887298368450852,
"grad_norm": 1.6501961946487427,
"learning_rate": 5.953984661553851e-06,
"loss": 0.8489,
"step": 4800
},
{
"epoch": 0.2887298368450852,
"eval_loss": 0.7505598068237305,
"eval_runtime": 21.7165,
"eval_samples_per_second": 460.479,
"eval_steps_per_second": 57.56,
"step": 4800
},
{
"epoch": 0.2947450417793578,
"grad_norm": 1.7538303136825562,
"learning_rate": 5.95298432810937e-06,
"loss": 0.8401,
"step": 4900
},
{
"epoch": 0.2947450417793578,
"eval_loss": 0.7426216006278992,
"eval_runtime": 21.7216,
"eval_samples_per_second": 460.37,
"eval_steps_per_second": 57.546,
"step": 4900
},
{
"epoch": 0.3007602467136304,
"grad_norm": 1.5520670413970947,
"learning_rate": 5.951983994664888e-06,
"loss": 0.8361,
"step": 5000
},
{
"epoch": 0.3007602467136304,
"eval_loss": 0.7258592247962952,
"eval_runtime": 21.7177,
"eval_samples_per_second": 460.454,
"eval_steps_per_second": 57.557,
"step": 5000
},
{
"epoch": 0.306775451647903,
"grad_norm": 2.0393898487091064,
"learning_rate": 5.950983661220407e-06,
"loss": 0.8273,
"step": 5100
},
{
"epoch": 0.306775451647903,
"eval_loss": 0.7185364365577698,
"eval_runtime": 21.6894,
"eval_samples_per_second": 461.054,
"eval_steps_per_second": 57.632,
"step": 5100
},
{
"epoch": 0.3127906565821756,
"grad_norm": 1.9601730108261108,
"learning_rate": 5.949983327775925e-06,
"loss": 0.8135,
"step": 5200
},
{
"epoch": 0.3127906565821756,
"eval_loss": 0.7162497639656067,
"eval_runtime": 21.7614,
"eval_samples_per_second": 459.53,
"eval_steps_per_second": 57.441,
"step": 5200
},
{
"epoch": 0.3188058615164482,
"grad_norm": 1.4966851472854614,
"learning_rate": 5.948982994331444e-06,
"loss": 0.8037,
"step": 5300
},
{
"epoch": 0.3188058615164482,
"eval_loss": 0.7116673588752747,
"eval_runtime": 21.8409,
"eval_samples_per_second": 457.857,
"eval_steps_per_second": 57.232,
"step": 5300
},
{
"epoch": 0.3248210664507208,
"grad_norm": 1.4574569463729858,
"learning_rate": 5.947982660886963e-06,
"loss": 0.8027,
"step": 5400
},
{
"epoch": 0.3248210664507208,
"eval_loss": 0.6981866359710693,
"eval_runtime": 21.7832,
"eval_samples_per_second": 459.07,
"eval_steps_per_second": 57.384,
"step": 5400
},
{
"epoch": 0.33083627138499344,
"grad_norm": 1.5823230743408203,
"learning_rate": 5.9469823274424815e-06,
"loss": 0.7898,
"step": 5500
},
{
"epoch": 0.33083627138499344,
"eval_loss": 0.6950494050979614,
"eval_runtime": 21.754,
"eval_samples_per_second": 459.685,
"eval_steps_per_second": 57.461,
"step": 5500
},
{
"epoch": 0.33685147631926604,
"grad_norm": 1.5350251197814941,
"learning_rate": 5.945981993997999e-06,
"loss": 0.7829,
"step": 5600
},
{
"epoch": 0.33685147631926604,
"eval_loss": 0.6908562183380127,
"eval_runtime": 21.6939,
"eval_samples_per_second": 460.958,
"eval_steps_per_second": 57.62,
"step": 5600
},
{
"epoch": 0.34286668125353864,
"grad_norm": 1.5343948602676392,
"learning_rate": 5.944981660553518e-06,
"loss": 0.7778,
"step": 5700
},
{
"epoch": 0.34286668125353864,
"eval_loss": 0.6897854208946228,
"eval_runtime": 21.687,
"eval_samples_per_second": 461.107,
"eval_steps_per_second": 57.638,
"step": 5700
},
{
"epoch": 0.34888188618781124,
"grad_norm": 1.6000343561172485,
"learning_rate": 5.943981327109036e-06,
"loss": 0.7672,
"step": 5800
},
{
"epoch": 0.34888188618781124,
"eval_loss": 0.6832409501075745,
"eval_runtime": 21.7047,
"eval_samples_per_second": 460.73,
"eval_steps_per_second": 57.591,
"step": 5800
},
{
"epoch": 0.35489709112208384,
"grad_norm": 1.3873372077941895,
"learning_rate": 5.942980993664555e-06,
"loss": 0.7645,
"step": 5900
},
{
"epoch": 0.35489709112208384,
"eval_loss": 0.6712300777435303,
"eval_runtime": 21.707,
"eval_samples_per_second": 460.681,
"eval_steps_per_second": 57.585,
"step": 5900
},
{
"epoch": 0.36091229605635644,
"grad_norm": 1.5178308486938477,
"learning_rate": 5.941980660220073e-06,
"loss": 0.756,
"step": 6000
},
{
"epoch": 0.36091229605635644,
"eval_loss": 0.6661484241485596,
"eval_runtime": 21.7032,
"eval_samples_per_second": 460.761,
"eval_steps_per_second": 57.595,
"step": 6000
},
{
"epoch": 0.36692750099062904,
"grad_norm": 1.4745811223983765,
"learning_rate": 5.940980326775592e-06,
"loss": 0.753,
"step": 6100
},
{
"epoch": 0.36692750099062904,
"eval_loss": 0.664915144443512,
"eval_runtime": 21.7252,
"eval_samples_per_second": 460.294,
"eval_steps_per_second": 57.537,
"step": 6100
},
{
"epoch": 0.3729427059249017,
"grad_norm": 1.6472891569137573,
"learning_rate": 5.939979993331111e-06,
"loss": 0.743,
"step": 6200
},
{
"epoch": 0.3729427059249017,
"eval_loss": 0.6596666574478149,
"eval_runtime": 21.6717,
"eval_samples_per_second": 461.432,
"eval_steps_per_second": 57.679,
"step": 6200
},
{
"epoch": 0.3789579108591743,
"grad_norm": 1.4315409660339355,
"learning_rate": 5.9389796598866294e-06,
"loss": 0.737,
"step": 6300
},
{
"epoch": 0.3789579108591743,
"eval_loss": 0.6593905091285706,
"eval_runtime": 21.8558,
"eval_samples_per_second": 457.545,
"eval_steps_per_second": 57.193,
"step": 6300
},
{
"epoch": 0.3849731157934469,
"grad_norm": 1.553122639656067,
"learning_rate": 5.937979326442147e-06,
"loss": 0.7284,
"step": 6400
},
{
"epoch": 0.3849731157934469,
"eval_loss": 0.6500257253646851,
"eval_runtime": 21.725,
"eval_samples_per_second": 460.298,
"eval_steps_per_second": 57.537,
"step": 6400
},
{
"epoch": 0.3909883207277195,
"grad_norm": 1.4755713939666748,
"learning_rate": 5.936978992997666e-06,
"loss": 0.7253,
"step": 6500
},
{
"epoch": 0.3909883207277195,
"eval_loss": 0.6457264423370361,
"eval_runtime": 21.6694,
"eval_samples_per_second": 461.48,
"eval_steps_per_second": 57.685,
"step": 6500
},
{
"epoch": 0.3970035256619921,
"grad_norm": 1.3153866529464722,
"learning_rate": 5.935978659553185e-06,
"loss": 0.7227,
"step": 6600
},
{
"epoch": 0.3970035256619921,
"eval_loss": 0.6387376189231873,
"eval_runtime": 21.6735,
"eval_samples_per_second": 461.393,
"eval_steps_per_second": 57.674,
"step": 6600
},
{
"epoch": 0.4030187305962647,
"grad_norm": 1.3349621295928955,
"learning_rate": 5.9349783261087026e-06,
"loss": 0.7161,
"step": 6700
},
{
"epoch": 0.4030187305962647,
"eval_loss": 0.6228384971618652,
"eval_runtime": 21.6839,
"eval_samples_per_second": 461.172,
"eval_steps_per_second": 57.646,
"step": 6700
},
{
"epoch": 0.4090339355305373,
"grad_norm": 1.4209269285202026,
"learning_rate": 5.933977992664221e-06,
"loss": 0.7101,
"step": 6800
},
{
"epoch": 0.4090339355305373,
"eval_loss": 0.6393507719039917,
"eval_runtime": 21.6842,
"eval_samples_per_second": 461.165,
"eval_steps_per_second": 57.646,
"step": 6800
},
{
"epoch": 0.4150491404648099,
"grad_norm": 1.3392629623413086,
"learning_rate": 5.93297765921974e-06,
"loss": 0.7043,
"step": 6900
},
{
"epoch": 0.4150491404648099,
"eval_loss": 0.6370413303375244,
"eval_runtime": 21.6802,
"eval_samples_per_second": 461.251,
"eval_steps_per_second": 57.656,
"step": 6900
},
{
"epoch": 0.42106434539908255,
"grad_norm": 1.420782446861267,
"learning_rate": 5.931977325775259e-06,
"loss": 0.6976,
"step": 7000
},
{
"epoch": 0.42106434539908255,
"eval_loss": 0.6197584867477417,
"eval_runtime": 21.6736,
"eval_samples_per_second": 461.391,
"eval_steps_per_second": 57.674,
"step": 7000
},
{
"epoch": 0.42707955033335515,
"grad_norm": 1.3362140655517578,
"learning_rate": 5.930976992330777e-06,
"loss": 0.6938,
"step": 7100
},
{
"epoch": 0.42707955033335515,
"eval_loss": 0.6171865463256836,
"eval_runtime": 21.6908,
"eval_samples_per_second": 461.024,
"eval_steps_per_second": 57.628,
"step": 7100
},
{
"epoch": 0.43309475526762775,
"grad_norm": 1.2855477333068848,
"learning_rate": 5.929976658886295e-06,
"loss": 0.6897,
"step": 7200
},
{
"epoch": 0.43309475526762775,
"eval_loss": 0.6011925935745239,
"eval_runtime": 21.6697,
"eval_samples_per_second": 461.474,
"eval_steps_per_second": 57.684,
"step": 7200
},
{
"epoch": 0.43910996020190035,
"grad_norm": 1.6744885444641113,
"learning_rate": 5.928976325441814e-06,
"loss": 0.6815,
"step": 7300
},
{
"epoch": 0.43910996020190035,
"eval_loss": 0.606606662273407,
"eval_runtime": 21.7361,
"eval_samples_per_second": 460.064,
"eval_steps_per_second": 57.508,
"step": 7300
},
{
"epoch": 0.44512516513617295,
"grad_norm": 1.4268521070480347,
"learning_rate": 5.927975991997333e-06,
"loss": 0.6785,
"step": 7400
},
{
"epoch": 0.44512516513617295,
"eval_loss": 0.6065685749053955,
"eval_runtime": 21.7924,
"eval_samples_per_second": 458.876,
"eval_steps_per_second": 57.359,
"step": 7400
},
{
"epoch": 0.45114037007044555,
"grad_norm": 1.248145341873169,
"learning_rate": 5.9269756585528505e-06,
"loss": 0.6734,
"step": 7500
},
{
"epoch": 0.45114037007044555,
"eval_loss": 0.5927532911300659,
"eval_runtime": 21.7131,
"eval_samples_per_second": 460.551,
"eval_steps_per_second": 57.569,
"step": 7500
},
{
"epoch": 0.45715557500471815,
"grad_norm": 1.3543365001678467,
"learning_rate": 5.92597532510837e-06,
"loss": 0.6692,
"step": 7600
},
{
"epoch": 0.45715557500471815,
"eval_loss": 0.584913432598114,
"eval_runtime": 21.6765,
"eval_samples_per_second": 461.329,
"eval_steps_per_second": 57.666,
"step": 7600
},
{
"epoch": 0.4631707799389908,
"grad_norm": 1.519895315170288,
"learning_rate": 5.924974991663888e-06,
"loss": 0.6683,
"step": 7700
},
{
"epoch": 0.4631707799389908,
"eval_loss": 0.5899286270141602,
"eval_runtime": 21.7078,
"eval_samples_per_second": 460.664,
"eval_steps_per_second": 57.583,
"step": 7700
},
{
"epoch": 0.4691859848732634,
"grad_norm": 1.3677542209625244,
"learning_rate": 5.923974658219407e-06,
"loss": 0.6612,
"step": 7800
},
{
"epoch": 0.4691859848732634,
"eval_loss": 0.5877178907394409,
"eval_runtime": 21.699,
"eval_samples_per_second": 460.851,
"eval_steps_per_second": 57.606,
"step": 7800
},
{
"epoch": 0.475201189807536,
"grad_norm": 1.3020201921463013,
"learning_rate": 5.922974324774925e-06,
"loss": 0.6593,
"step": 7900
},
{
"epoch": 0.475201189807536,
"eval_loss": 0.5901273488998413,
"eval_runtime": 21.6975,
"eval_samples_per_second": 460.883,
"eval_steps_per_second": 57.61,
"step": 7900
},
{
"epoch": 0.4812163947418086,
"grad_norm": 1.2522666454315186,
"learning_rate": 5.921973991330443e-06,
"loss": 0.6515,
"step": 8000
},
{
"epoch": 0.4812163947418086,
"eval_loss": 0.5791921019554138,
"eval_runtime": 21.6482,
"eval_samples_per_second": 461.932,
"eval_steps_per_second": 57.741,
"step": 8000
},
{
"epoch": 0.4872315996760812,
"grad_norm": 1.7226676940917969,
"learning_rate": 5.920973657885962e-06,
"loss": 0.6497,
"step": 8100
},
{
"epoch": 0.4872315996760812,
"eval_loss": 0.5783876776695251,
"eval_runtime": 21.8009,
"eval_samples_per_second": 458.696,
"eval_steps_per_second": 57.337,
"step": 8100
},
{
"epoch": 0.4932468046103538,
"grad_norm": 1.4653980731964111,
"learning_rate": 5.919973324441481e-06,
"loss": 0.6463,
"step": 8200
},
{
"epoch": 0.4932468046103538,
"eval_loss": 0.5752367973327637,
"eval_runtime": 21.7179,
"eval_samples_per_second": 460.45,
"eval_steps_per_second": 57.556,
"step": 8200
},
{
"epoch": 0.4992620095446264,
"grad_norm": 1.3331021070480347,
"learning_rate": 5.918972990996999e-06,
"loss": 0.6412,
"step": 8300
},
{
"epoch": 0.4992620095446264,
"eval_loss": 0.5725879669189453,
"eval_runtime": 21.7719,
"eval_samples_per_second": 459.308,
"eval_steps_per_second": 57.414,
"step": 8300
},
{
"epoch": 0.5052772144788991,
"grad_norm": 1.245968222618103,
"learning_rate": 5.917972657552518e-06,
"loss": 0.64,
"step": 8400
},
{
"epoch": 0.5052772144788991,
"eval_loss": 0.5639936923980713,
"eval_runtime": 21.7448,
"eval_samples_per_second": 459.88,
"eval_steps_per_second": 57.485,
"step": 8400
},
{
"epoch": 0.5112924194131716,
"grad_norm": 1.269049882888794,
"learning_rate": 5.916972324108037e-06,
"loss": 0.6341,
"step": 8500
},
{
"epoch": 0.5112924194131716,
"eval_loss": 0.5605804324150085,
"eval_runtime": 21.7116,
"eval_samples_per_second": 460.582,
"eval_steps_per_second": 57.573,
"step": 8500
},
{
"epoch": 0.5173076243474443,
"grad_norm": 1.2048168182373047,
"learning_rate": 5.915971990663555e-06,
"loss": 0.6327,
"step": 8600
},
{
"epoch": 0.5173076243474443,
"eval_loss": 0.5681275129318237,
"eval_runtime": 21.7037,
"eval_samples_per_second": 460.751,
"eval_steps_per_second": 57.594,
"step": 8600
},
{
"epoch": 0.5233228292817168,
"grad_norm": 1.269063949584961,
"learning_rate": 5.914971657219073e-06,
"loss": 0.6251,
"step": 8700
},
{
"epoch": 0.5233228292817168,
"eval_loss": 0.5644165277481079,
"eval_runtime": 21.6949,
"eval_samples_per_second": 460.937,
"eval_steps_per_second": 57.617,
"step": 8700
},
{
"epoch": 0.5293380342159895,
"grad_norm": 1.3928773403167725,
"learning_rate": 5.913971323774591e-06,
"loss": 0.6268,
"step": 8800
},
{
"epoch": 0.5293380342159895,
"eval_loss": 0.5452607870101929,
"eval_runtime": 21.7013,
"eval_samples_per_second": 460.803,
"eval_steps_per_second": 57.6,
"step": 8800
},
{
"epoch": 0.5353532391502621,
"grad_norm": 1.6263777017593384,
"learning_rate": 5.91297099033011e-06,
"loss": 0.6198,
"step": 8900
},
{
"epoch": 0.5353532391502621,
"eval_loss": 0.5565773248672485,
"eval_runtime": 21.7101,
"eval_samples_per_second": 460.615,
"eval_steps_per_second": 57.577,
"step": 8900
},
{
"epoch": 0.5413684440845347,
"grad_norm": 1.312068223953247,
"learning_rate": 5.911970656885629e-06,
"loss": 0.6168,
"step": 9000
},
{
"epoch": 0.5413684440845347,
"eval_loss": 0.544517457485199,
"eval_runtime": 21.6689,
"eval_samples_per_second": 461.49,
"eval_steps_per_second": 57.686,
"step": 9000
},
{
"epoch": 0.5473836490188073,
"grad_norm": 1.4878406524658203,
"learning_rate": 5.910970323441147e-06,
"loss": 0.6168,
"step": 9100
},
{
"epoch": 0.5473836490188073,
"eval_loss": 0.5467077493667603,
"eval_runtime": 21.7585,
"eval_samples_per_second": 459.591,
"eval_steps_per_second": 57.449,
"step": 9100
},
{
"epoch": 0.5533988539530799,
"grad_norm": 1.4762675762176514,
"learning_rate": 5.909969989996666e-06,
"loss": 0.6062,
"step": 9200
},
{
"epoch": 0.5533988539530799,
"eval_loss": 0.5416296720504761,
"eval_runtime": 21.7398,
"eval_samples_per_second": 459.985,
"eval_steps_per_second": 57.498,
"step": 9200
},
{
"epoch": 0.5594140588873525,
"grad_norm": 1.3053025007247925,
"learning_rate": 5.908969656552185e-06,
"loss": 0.6106,
"step": 9300
},
{
"epoch": 0.5594140588873525,
"eval_loss": 0.5386621356010437,
"eval_runtime": 21.7444,
"eval_samples_per_second": 459.888,
"eval_steps_per_second": 57.486,
"step": 9300
},
{
"epoch": 0.5654292638216251,
"grad_norm": 1.5423814058303833,
"learning_rate": 5.907969323107703e-06,
"loss": 0.6019,
"step": 9400
},
{
"epoch": 0.5654292638216251,
"eval_loss": 0.5405033230781555,
"eval_runtime": 21.726,
"eval_samples_per_second": 460.277,
"eval_steps_per_second": 57.535,
"step": 9400
},
{
"epoch": 0.5714444687558977,
"grad_norm": 1.4696613550186157,
"learning_rate": 5.906968989663221e-06,
"loss": 0.6011,
"step": 9500
},
{
"epoch": 0.5714444687558977,
"eval_loss": 0.5457667708396912,
"eval_runtime": 21.7773,
"eval_samples_per_second": 459.193,
"eval_steps_per_second": 57.399,
"step": 9500
},
{
"epoch": 0.5774596736901704,
"grad_norm": 1.5349172353744507,
"learning_rate": 5.90596865621874e-06,
"loss": 0.5961,
"step": 9600
},
{
"epoch": 0.5774596736901704,
"eval_loss": 0.533613920211792,
"eval_runtime": 21.9838,
"eval_samples_per_second": 454.88,
"eval_steps_per_second": 56.86,
"step": 9600
},
{
"epoch": 0.5834748786244429,
"grad_norm": 1.2024816274642944,
"learning_rate": 5.904968322774258e-06,
"loss": 0.593,
"step": 9700
},
{
"epoch": 0.5834748786244429,
"eval_loss": 0.5246294140815735,
"eval_runtime": 22.5017,
"eval_samples_per_second": 444.411,
"eval_steps_per_second": 55.551,
"step": 9700
},
{
"epoch": 0.5894900835587156,
"grad_norm": 1.2983571290969849,
"learning_rate": 5.9039679893297766e-06,
"loss": 0.5925,
"step": 9800
},
{
"epoch": 0.5894900835587156,
"eval_loss": 0.5254473686218262,
"eval_runtime": 23.0942,
"eval_samples_per_second": 433.009,
"eval_steps_per_second": 54.126,
"step": 9800
},
{
"epoch": 0.5955052884929881,
"grad_norm": 1.2889515161514282,
"learning_rate": 5.902967655885295e-06,
"loss": 0.5911,
"step": 9900
},
{
"epoch": 0.5955052884929881,
"eval_loss": 0.5365324020385742,
"eval_runtime": 23.3271,
"eval_samples_per_second": 428.686,
"eval_steps_per_second": 53.586,
"step": 9900
},
{
"epoch": 0.6015204934272608,
"grad_norm": 1.3131366968154907,
"learning_rate": 5.901967322440814e-06,
"loss": 0.5843,
"step": 10000
},
{
"epoch": 0.6015204934272608,
"eval_loss": 0.5123865008354187,
"eval_runtime": 23.435,
"eval_samples_per_second": 426.712,
"eval_steps_per_second": 53.339,
"step": 10000
},
{
"epoch": 0.6075356983615333,
"grad_norm": 1.3315032720565796,
"learning_rate": 5.900966988996333e-06,
"loss": 0.5832,
"step": 10100
},
{
"epoch": 0.6075356983615333,
"eval_loss": 0.5256994962692261,
"eval_runtime": 23.4061,
"eval_samples_per_second": 427.24,
"eval_steps_per_second": 53.405,
"step": 10100
},
{
"epoch": 0.613550903295806,
"grad_norm": 1.3008897304534912,
"learning_rate": 5.8999666555518505e-06,
"loss": 0.582,
"step": 10200
},
{
"epoch": 0.613550903295806,
"eval_loss": 0.5148985981941223,
"eval_runtime": 23.4451,
"eval_samples_per_second": 426.528,
"eval_steps_per_second": 53.316,
"step": 10200
},
{
"epoch": 0.6195661082300786,
"grad_norm": 1.272538423538208,
"learning_rate": 5.898966322107369e-06,
"loss": 0.5789,
"step": 10300
},
{
"epoch": 0.6195661082300786,
"eval_loss": 0.5160868763923645,
"eval_runtime": 23.3699,
"eval_samples_per_second": 427.901,
"eval_steps_per_second": 53.488,
"step": 10300
},
{
"epoch": 0.6255813131643512,
"grad_norm": 1.38733971118927,
"learning_rate": 5.897965988662888e-06,
"loss": 0.5768,
"step": 10400
},
{
"epoch": 0.6255813131643512,
"eval_loss": 0.5101234912872314,
"eval_runtime": 23.5052,
"eval_samples_per_second": 425.437,
"eval_steps_per_second": 53.18,
"step": 10400
},
{
"epoch": 0.6315965180986238,
"grad_norm": 1.3414686918258667,
"learning_rate": 5.896965655218406e-06,
"loss": 0.5728,
"step": 10500
},
{
"epoch": 0.6315965180986238,
"eval_loss": 0.5151140689849854,
"eval_runtime": 23.1483,
"eval_samples_per_second": 431.997,
"eval_steps_per_second": 54.0,
"step": 10500
},
{
"epoch": 0.6376117230328964,
"grad_norm": 1.2821862697601318,
"learning_rate": 5.8959653217739245e-06,
"loss": 0.5732,
"step": 10600
},
{
"epoch": 0.6376117230328964,
"eval_loss": 0.5067505240440369,
"eval_runtime": 23.3046,
"eval_samples_per_second": 429.099,
"eval_steps_per_second": 53.637,
"step": 10600
},
{
"epoch": 0.643626927967169,
"grad_norm": 1.4687350988388062,
"learning_rate": 5.894964988329443e-06,
"loss": 0.568,
"step": 10700
},
{
"epoch": 0.643626927967169,
"eval_loss": 0.5038474798202515,
"eval_runtime": 48.8496,
"eval_samples_per_second": 204.71,
"eval_steps_per_second": 25.589,
"step": 10700
},
{
"epoch": 0.6496421329014416,
"grad_norm": 1.1854100227355957,
"learning_rate": 5.893964654884962e-06,
"loss": 0.5665,
"step": 10800
},
{
"epoch": 0.6496421329014416,
"eval_loss": 0.5092170834541321,
"eval_runtime": 51.2918,
"eval_samples_per_second": 194.963,
"eval_steps_per_second": 24.37,
"step": 10800
},
{
"epoch": 0.6556573378357142,
"grad_norm": 1.2117469310760498,
"learning_rate": 5.892964321440481e-06,
"loss": 0.5641,
"step": 10900
},
{
"epoch": 0.6556573378357142,
"eval_loss": 0.4948270618915558,
"eval_runtime": 51.7341,
"eval_samples_per_second": 193.296,
"eval_steps_per_second": 24.162,
"step": 10900
},
{
"epoch": 0.6616725427699869,
"grad_norm": 1.1809200048446655,
"learning_rate": 5.8919639879959985e-06,
"loss": 0.559,
"step": 11000
},
{
"epoch": 0.6616725427699869,
"eval_loss": 0.49759823083877563,
"eval_runtime": 50.8828,
"eval_samples_per_second": 196.53,
"eval_steps_per_second": 24.566,
"step": 11000
},
{
"epoch": 0.6676877477042594,
"grad_norm": 1.4321728944778442,
"learning_rate": 5.890963654551517e-06,
"loss": 0.5597,
"step": 11100
},
{
"epoch": 0.6676877477042594,
"eval_loss": 0.49609047174453735,
"eval_runtime": 51.278,
"eval_samples_per_second": 195.015,
"eval_steps_per_second": 24.377,
"step": 11100
},
{
"epoch": 0.6737029526385321,
"grad_norm": 1.3043360710144043,
"learning_rate": 5.889963321107036e-06,
"loss": 0.5574,
"step": 11200
},
{
"epoch": 0.6737029526385321,
"eval_loss": 0.5004040002822876,
"eval_runtime": 50.7636,
"eval_samples_per_second": 196.992,
"eval_steps_per_second": 24.624,
"step": 11200
},
{
"epoch": 0.6797181575728046,
"grad_norm": 1.2415975332260132,
"learning_rate": 5.888962987662554e-06,
"loss": 0.5555,
"step": 11300
},
{
"epoch": 0.6797181575728046,
"eval_loss": 0.5004035234451294,
"eval_runtime": 51.3686,
"eval_samples_per_second": 194.672,
"eval_steps_per_second": 24.334,
"step": 11300
},
{
"epoch": 0.6857333625070773,
"grad_norm": 1.1731830835342407,
"learning_rate": 5.8879626542180725e-06,
"loss": 0.5541,
"step": 11400
},
{
"epoch": 0.6857333625070773,
"eval_loss": 0.4998365342617035,
"eval_runtime": 50.9083,
"eval_samples_per_second": 196.432,
"eval_steps_per_second": 24.554,
"step": 11400
},
{
"epoch": 0.6917485674413498,
"grad_norm": 1.2296881675720215,
"learning_rate": 5.886962320773592e-06,
"loss": 0.5487,
"step": 11500
},
{
"epoch": 0.6917485674413498,
"eval_loss": 0.4932882785797119,
"eval_runtime": 50.9764,
"eval_samples_per_second": 196.169,
"eval_steps_per_second": 24.521,
"step": 11500
},
{
"epoch": 0.6977637723756225,
"grad_norm": 1.4027659893035889,
"learning_rate": 5.88596198732911e-06,
"loss": 0.5488,
"step": 11600
},
{
"epoch": 0.6977637723756225,
"eval_loss": 0.48723334074020386,
"eval_runtime": 51.3087,
"eval_samples_per_second": 194.899,
"eval_steps_per_second": 24.362,
"step": 11600
},
{
"epoch": 0.7037789773098951,
"grad_norm": 1.345869541168213,
"learning_rate": 5.884961653884629e-06,
"loss": 0.5464,
"step": 11700
},
{
"epoch": 0.7037789773098951,
"eval_loss": 0.48902279138565063,
"eval_runtime": 51.5761,
"eval_samples_per_second": 193.888,
"eval_steps_per_second": 24.236,
"step": 11700
},
{
"epoch": 0.7097941822441677,
"grad_norm": 1.3029801845550537,
"learning_rate": 5.8839613204401465e-06,
"loss": 0.545,
"step": 11800
},
{
"epoch": 0.7097941822441677,
"eval_loss": 0.4815163016319275,
"eval_runtime": 51.0467,
"eval_samples_per_second": 195.899,
"eval_steps_per_second": 24.487,
"step": 11800
},
{
"epoch": 0.7158093871784403,
"grad_norm": 1.3300397396087646,
"learning_rate": 5.882960986995665e-06,
"loss": 0.5406,
"step": 11900
},
{
"epoch": 0.7158093871784403,
"eval_loss": 0.4828699231147766,
"eval_runtime": 50.6859,
"eval_samples_per_second": 197.294,
"eval_steps_per_second": 24.662,
"step": 11900
},
{
"epoch": 0.7218245921127129,
"grad_norm": 1.3354322910308838,
"learning_rate": 5.881960653551184e-06,
"loss": 0.5412,
"step": 12000
},
{
"epoch": 0.7218245921127129,
"eval_loss": 0.4760846197605133,
"eval_runtime": 51.0095,
"eval_samples_per_second": 196.042,
"eval_steps_per_second": 24.505,
"step": 12000
},
{
"epoch": 0.7278397970469855,
"grad_norm": 1.2316620349884033,
"learning_rate": 5.880960320106702e-06,
"loss": 0.5354,
"step": 12100
},
{
"epoch": 0.7278397970469855,
"eval_loss": 0.49535489082336426,
"eval_runtime": 51.064,
"eval_samples_per_second": 195.833,
"eval_steps_per_second": 24.479,
"step": 12100
},
{
"epoch": 0.7338550019812581,
"grad_norm": 1.2033593654632568,
"learning_rate": 5.879959986662221e-06,
"loss": 0.5343,
"step": 12200
},
{
"epoch": 0.7338550019812581,
"eval_loss": 0.4705411195755005,
"eval_runtime": 50.9982,
"eval_samples_per_second": 196.085,
"eval_steps_per_second": 24.511,
"step": 12200
},
{
"epoch": 0.7398702069155307,
"grad_norm": 1.2634704113006592,
"learning_rate": 5.87895965321774e-06,
"loss": 0.5337,
"step": 12300
},
{
"epoch": 0.7398702069155307,
"eval_loss": 0.47791826725006104,
"eval_runtime": 51.1718,
"eval_samples_per_second": 195.42,
"eval_steps_per_second": 24.428,
"step": 12300
},
{
"epoch": 0.7458854118498034,
"grad_norm": 1.2546501159667969,
"learning_rate": 5.877959319773258e-06,
"loss": 0.5324,
"step": 12400
},
{
"epoch": 0.7458854118498034,
"eval_loss": 0.4756995737552643,
"eval_runtime": 51.0651,
"eval_samples_per_second": 195.828,
"eval_steps_per_second": 24.479,
"step": 12400
},
{
"epoch": 0.7519006167840759,
"grad_norm": 1.1833654642105103,
"learning_rate": 5.876958986328777e-06,
"loss": 0.5299,
"step": 12500
},
{
"epoch": 0.7519006167840759,
"eval_loss": 0.47130194306373596,
"eval_runtime": 51.0775,
"eval_samples_per_second": 195.781,
"eval_steps_per_second": 24.473,
"step": 12500
},
{
"epoch": 0.7579158217183486,
"grad_norm": 1.0535800457000732,
"learning_rate": 5.875958652884295e-06,
"loss": 0.5288,
"step": 12600
},
{
"epoch": 0.7579158217183486,
"eval_loss": 0.46586230397224426,
"eval_runtime": 51.3884,
"eval_samples_per_second": 194.596,
"eval_steps_per_second": 24.325,
"step": 12600
},
{
"epoch": 0.7639310266526211,
"grad_norm": 1.2561872005462646,
"learning_rate": 5.874958319439813e-06,
"loss": 0.5297,
"step": 12700
},
{
"epoch": 0.7639310266526211,
"eval_loss": 0.4665389657020569,
"eval_runtime": 51.1355,
"eval_samples_per_second": 195.559,
"eval_steps_per_second": 24.445,
"step": 12700
},
{
"epoch": 0.7699462315868938,
"grad_norm": 1.177007794380188,
"learning_rate": 5.873957985995332e-06,
"loss": 0.5326,
"step": 12800
},
{
"epoch": 0.7699462315868938,
"eval_loss": 0.4671100676059723,
"eval_runtime": 51.3263,
"eval_samples_per_second": 194.832,
"eval_steps_per_second": 24.354,
"step": 12800
},
{
"epoch": 0.7759614365211663,
"grad_norm": 1.181401252746582,
"learning_rate": 5.8729576525508506e-06,
"loss": 0.5222,
"step": 12900
},
{
"epoch": 0.7759614365211663,
"eval_loss": 0.4585270583629608,
"eval_runtime": 51.1292,
"eval_samples_per_second": 195.583,
"eval_steps_per_second": 24.448,
"step": 12900
},
{
"epoch": 0.781976641455439,
"grad_norm": 1.108788013458252,
"learning_rate": 5.871957319106369e-06,
"loss": 0.5202,
"step": 13000
},
{
"epoch": 0.781976641455439,
"eval_loss": 0.46135467290878296,
"eval_runtime": 51.1302,
"eval_samples_per_second": 195.579,
"eval_steps_per_second": 24.447,
"step": 13000
},
{
"epoch": 0.7879918463897116,
"grad_norm": 1.152575969696045,
"learning_rate": 5.870956985661888e-06,
"loss": 0.5157,
"step": 13100
},
{
"epoch": 0.7879918463897116,
"eval_loss": 0.46781352162361145,
"eval_runtime": 51.1065,
"eval_samples_per_second": 195.67,
"eval_steps_per_second": 24.459,
"step": 13100
},
{
"epoch": 0.7940070513239842,
"grad_norm": 1.1765929460525513,
"learning_rate": 5.869956652217406e-06,
"loss": 0.5177,
"step": 13200
},
{
"epoch": 0.7940070513239842,
"eval_loss": 0.4588942527770996,
"eval_runtime": 51.1353,
"eval_samples_per_second": 195.56,
"eval_steps_per_second": 24.445,
"step": 13200
},
{
"epoch": 0.8000222562582568,
"grad_norm": 1.1165159940719604,
"learning_rate": 5.8689563187729245e-06,
"loss": 0.5141,
"step": 13300
},
{
"epoch": 0.8000222562582568,
"eval_loss": 0.4517599046230316,
"eval_runtime": 51.1096,
"eval_samples_per_second": 195.658,
"eval_steps_per_second": 24.457,
"step": 13300
},
{
"epoch": 0.8060374611925294,
"grad_norm": 1.0414021015167236,
"learning_rate": 5.867955985328443e-06,
"loss": 0.5135,
"step": 13400
},
{
"epoch": 0.8060374611925294,
"eval_loss": 0.46558651328086853,
"eval_runtime": 51.1277,
"eval_samples_per_second": 195.589,
"eval_steps_per_second": 24.449,
"step": 13400
},
{
"epoch": 0.812052666126802,
"grad_norm": 1.3002249002456665,
"learning_rate": 5.866955651883961e-06,
"loss": 0.5124,
"step": 13500
},
{
"epoch": 0.812052666126802,
"eval_loss": 0.4563812017440796,
"eval_runtime": 51.132,
"eval_samples_per_second": 195.572,
"eval_steps_per_second": 24.447,
"step": 13500
},
{
"epoch": 0.8180678710610746,
"grad_norm": 1.5342046022415161,
"learning_rate": 5.86595531843948e-06,
"loss": 0.5101,
"step": 13600
},
{
"epoch": 0.8180678710610746,
"eval_loss": 0.44918256998062134,
"eval_runtime": 51.2205,
"eval_samples_per_second": 195.234,
"eval_steps_per_second": 24.404,
"step": 13600
},
{
"epoch": 0.8240830759953472,
"grad_norm": 1.312056064605713,
"learning_rate": 5.8649549849949985e-06,
"loss": 0.5087,
"step": 13700
},
{
"epoch": 0.8240830759953472,
"eval_loss": 0.45463162660598755,
"eval_runtime": 50.988,
"eval_samples_per_second": 196.125,
"eval_steps_per_second": 24.516,
"step": 13700
},
{
"epoch": 0.8300982809296198,
"grad_norm": 1.4413928985595703,
"learning_rate": 5.863954651550517e-06,
"loss": 0.5079,
"step": 13800
},
{
"epoch": 0.8300982809296198,
"eval_loss": 0.4562767446041107,
"eval_runtime": 51.212,
"eval_samples_per_second": 195.267,
"eval_steps_per_second": 24.408,
"step": 13800
},
{
"epoch": 0.8361134858638924,
"grad_norm": 1.3391541242599487,
"learning_rate": 5.862954318106036e-06,
"loss": 0.5077,
"step": 13900
},
{
"epoch": 0.8361134858638924,
"eval_loss": 0.44607582688331604,
"eval_runtime": 51.1173,
"eval_samples_per_second": 195.628,
"eval_steps_per_second": 24.454,
"step": 13900
},
{
"epoch": 0.8421286907981651,
"grad_norm": 1.2158905267715454,
"learning_rate": 5.861953984661554e-06,
"loss": 0.5032,
"step": 14000
},
{
"epoch": 0.8421286907981651,
"eval_loss": 0.4587889611721039,
"eval_runtime": 51.1702,
"eval_samples_per_second": 195.426,
"eval_steps_per_second": 24.428,
"step": 14000
},
{
"epoch": 0.8481438957324376,
"grad_norm": 1.1938725709915161,
"learning_rate": 5.8609536512170725e-06,
"loss": 0.4996,
"step": 14100
},
{
"epoch": 0.8481438957324376,
"eval_loss": 0.4515674412250519,
"eval_runtime": 51.1351,
"eval_samples_per_second": 195.56,
"eval_steps_per_second": 24.445,
"step": 14100
},
{
"epoch": 0.8541591006667103,
"grad_norm": 1.1953227519989014,
"learning_rate": 5.859953317772591e-06,
"loss": 0.5014,
"step": 14200
},
{
"epoch": 0.8541591006667103,
"eval_loss": 0.44719940423965454,
"eval_runtime": 51.0487,
"eval_samples_per_second": 195.891,
"eval_steps_per_second": 24.486,
"step": 14200
},
{
"epoch": 0.8601743056009828,
"grad_norm": 1.2699577808380127,
"learning_rate": 5.858952984328109e-06,
"loss": 0.499,
"step": 14300
},
{
"epoch": 0.8601743056009828,
"eval_loss": 0.4444737732410431,
"eval_runtime": 51.2894,
"eval_samples_per_second": 194.972,
"eval_steps_per_second": 24.372,
"step": 14300
},
{
"epoch": 0.8661895105352555,
"grad_norm": 1.0982294082641602,
"learning_rate": 5.857952650883628e-06,
"loss": 0.5024,
"step": 14400
},
{
"epoch": 0.8661895105352555,
"eval_loss": 0.4426032602787018,
"eval_runtime": 51.0622,
"eval_samples_per_second": 195.84,
"eval_steps_per_second": 24.48,
"step": 14400
},
{
"epoch": 0.872204715469528,
"grad_norm": 1.1881742477416992,
"learning_rate": 5.8569523174391465e-06,
"loss": 0.4971,
"step": 14500
},
{
"epoch": 0.872204715469528,
"eval_loss": 0.4500812590122223,
"eval_runtime": 51.0676,
"eval_samples_per_second": 195.819,
"eval_steps_per_second": 24.477,
"step": 14500
},
{
"epoch": 0.8782199204038007,
"grad_norm": 1.2892823219299316,
"learning_rate": 5.855951983994665e-06,
"loss": 0.4947,
"step": 14600
},
{
"epoch": 0.8782199204038007,
"eval_loss": 0.45143038034439087,
"eval_runtime": 51.2218,
"eval_samples_per_second": 195.229,
"eval_steps_per_second": 24.404,
"step": 14600
},
{
"epoch": 0.8842351253380734,
"grad_norm": 1.1228898763656616,
"learning_rate": 5.854951650550184e-06,
"loss": 0.4912,
"step": 14700
},
{
"epoch": 0.8842351253380734,
"eval_loss": 0.443864107131958,
"eval_runtime": 51.1005,
"eval_samples_per_second": 195.693,
"eval_steps_per_second": 24.462,
"step": 14700
},
{
"epoch": 0.8902503302723459,
"grad_norm": 1.2021640539169312,
"learning_rate": 5.853951317105702e-06,
"loss": 0.4911,
"step": 14800
},
{
"epoch": 0.8902503302723459,
"eval_loss": 0.44539061188697815,
"eval_runtime": 51.3647,
"eval_samples_per_second": 194.686,
"eval_steps_per_second": 24.336,
"step": 14800
},
{
"epoch": 0.8962655352066186,
"grad_norm": 1.226335883140564,
"learning_rate": 5.8529509836612205e-06,
"loss": 0.488,
"step": 14900
},
{
"epoch": 0.8962655352066186,
"eval_loss": 0.43708336353302,
"eval_runtime": 51.0878,
"eval_samples_per_second": 195.741,
"eval_steps_per_second": 24.468,
"step": 14900
},
{
"epoch": 0.9022807401408911,
"grad_norm": 1.1519514322280884,
"learning_rate": 5.851950650216739e-06,
"loss": 0.4879,
"step": 15000
},
{
"epoch": 0.9022807401408911,
"eval_loss": 0.43572157621383667,
"eval_runtime": 51.0673,
"eval_samples_per_second": 195.82,
"eval_steps_per_second": 24.477,
"step": 15000
},
{
"epoch": 0.9082959450751638,
"grad_norm": 1.0578216314315796,
"learning_rate": 5.850950316772257e-06,
"loss": 0.491,
"step": 15100
},
{
"epoch": 0.9082959450751638,
"eval_loss": 0.43306058645248413,
"eval_runtime": 51.2921,
"eval_samples_per_second": 194.962,
"eval_steps_per_second": 24.37,
"step": 15100
},
{
"epoch": 0.9143111500094363,
"grad_norm": 1.292629599571228,
"learning_rate": 5.849949983327776e-06,
"loss": 0.4852,
"step": 15200
},
{
"epoch": 0.9143111500094363,
"eval_loss": 0.43448084592819214,
"eval_runtime": 51.0849,
"eval_samples_per_second": 195.752,
"eval_steps_per_second": 24.469,
"step": 15200
},
{
"epoch": 0.920326354943709,
"grad_norm": 1.2115490436553955,
"learning_rate": 5.8489496498832945e-06,
"loss": 0.4879,
"step": 15300
},
{
"epoch": 0.920326354943709,
"eval_loss": 0.4403839409351349,
"eval_runtime": 51.0866,
"eval_samples_per_second": 195.746,
"eval_steps_per_second": 24.468,
"step": 15300
},
{
"epoch": 0.9263415598779816,
"grad_norm": 1.2206310033798218,
"learning_rate": 5.847949316438813e-06,
"loss": 0.4771,
"step": 15400
},
{
"epoch": 0.9263415598779816,
"eval_loss": 0.43060389161109924,
"eval_runtime": 51.0659,
"eval_samples_per_second": 195.825,
"eval_steps_per_second": 24.478,
"step": 15400
},
{
"epoch": 0.9323567648122542,
"grad_norm": 1.0853536128997803,
"learning_rate": 5.846948982994332e-06,
"loss": 0.4821,
"step": 15500
},
{
"epoch": 0.9323567648122542,
"eval_loss": 0.42842620611190796,
"eval_runtime": 51.036,
"eval_samples_per_second": 195.94,
"eval_steps_per_second": 24.493,
"step": 15500
},
{
"epoch": 0.9383719697465268,
"grad_norm": 1.0656437873840332,
"learning_rate": 5.8459486495498506e-06,
"loss": 0.4796,
"step": 15600
},
{
"epoch": 0.9383719697465268,
"eval_loss": 0.4259638786315918,
"eval_runtime": 51.0811,
"eval_samples_per_second": 195.767,
"eval_steps_per_second": 24.471,
"step": 15600
},
{
"epoch": 0.9443871746807994,
"grad_norm": 1.2496039867401123,
"learning_rate": 5.8449483161053684e-06,
"loss": 0.4783,
"step": 15700
},
{
"epoch": 0.9443871746807994,
"eval_loss": 0.42784813046455383,
"eval_runtime": 51.0862,
"eval_samples_per_second": 195.748,
"eval_steps_per_second": 24.468,
"step": 15700
},
{
"epoch": 0.950402379615072,
"grad_norm": 1.0478885173797607,
"learning_rate": 5.843947982660887e-06,
"loss": 0.4736,
"step": 15800
},
{
"epoch": 0.950402379615072,
"eval_loss": 0.42105141282081604,
"eval_runtime": 51.0949,
"eval_samples_per_second": 195.714,
"eval_steps_per_second": 24.464,
"step": 15800
},
{
"epoch": 0.9564175845493446,
"grad_norm": 1.1973545551300049,
"learning_rate": 5.842947649216405e-06,
"loss": 0.4765,
"step": 15900
},
{
"epoch": 0.9564175845493446,
"eval_loss": 0.41922861337661743,
"eval_runtime": 51.0499,
"eval_samples_per_second": 195.887,
"eval_steps_per_second": 24.486,
"step": 15900
},
{
"epoch": 0.9624327894836172,
"grad_norm": 1.0738471746444702,
"learning_rate": 5.841947315771924e-06,
"loss": 0.4713,
"step": 16000
},
{
"epoch": 0.9624327894836172,
"eval_loss": 0.4311535060405731,
"eval_runtime": 51.0775,
"eval_samples_per_second": 195.781,
"eval_steps_per_second": 24.473,
"step": 16000
},
{
"epoch": 0.9684479944178899,
"grad_norm": 1.14482581615448,
"learning_rate": 5.840946982327443e-06,
"loss": 0.4732,
"step": 16100
},
{
"epoch": 0.9684479944178899,
"eval_loss": 0.41709282994270325,
"eval_runtime": 39.7116,
"eval_samples_per_second": 251.815,
"eval_steps_per_second": 31.477,
"step": 16100
},
{
"epoch": 0.9744631993521624,
"grad_norm": 1.1577385663986206,
"learning_rate": 5.839946648882961e-06,
"loss": 0.4704,
"step": 16200
},
{
"epoch": 0.9744631993521624,
"eval_loss": 0.4273630976676941,
"eval_runtime": 51.0906,
"eval_samples_per_second": 195.731,
"eval_steps_per_second": 24.466,
"step": 16200
},
{
"epoch": 0.9804784042864351,
"grad_norm": 1.125328779220581,
"learning_rate": 5.83894631543848e-06,
"loss": 0.4697,
"step": 16300
},
{
"epoch": 0.9804784042864351,
"eval_loss": 0.42490535974502563,
"eval_runtime": 51.0751,
"eval_samples_per_second": 195.79,
"eval_steps_per_second": 24.474,
"step": 16300
},
{
"epoch": 0.9864936092207076,
"grad_norm": 1.2619575262069702,
"learning_rate": 5.8379459819939985e-06,
"loss": 0.4721,
"step": 16400
},
{
"epoch": 0.9864936092207076,
"eval_loss": 0.42143183946609497,
"eval_runtime": 51.2808,
"eval_samples_per_second": 195.005,
"eval_steps_per_second": 24.376,
"step": 16400
},
{
"epoch": 0.9925088141549803,
"grad_norm": 1.0622971057891846,
"learning_rate": 5.836945648549516e-06,
"loss": 0.4672,
"step": 16500
},
{
"epoch": 0.9925088141549803,
"eval_loss": 0.4140073359012604,
"eval_runtime": 51.137,
"eval_samples_per_second": 195.553,
"eval_steps_per_second": 24.444,
"step": 16500
},
{
"epoch": 0.9985240190892528,
"grad_norm": 1.1675751209259033,
"learning_rate": 5.835945315105035e-06,
"loss": 0.469,
"step": 16600
},
{
"epoch": 0.9985240190892528,
"eval_loss": 0.413769394159317,
"eval_runtime": 51.1298,
"eval_samples_per_second": 195.581,
"eval_steps_per_second": 24.448,
"step": 16600
},
{
"epoch": 1.0045392240235254,
"grad_norm": 1.1390060186386108,
"learning_rate": 5.834944981660553e-06,
"loss": 0.4668,
"step": 16700
},
{
"epoch": 1.0045392240235254,
"eval_loss": 0.41630059480667114,
"eval_runtime": 51.1382,
"eval_samples_per_second": 195.548,
"eval_steps_per_second": 24.444,
"step": 16700
},
{
"epoch": 1.0105544289577981,
"grad_norm": 1.2013533115386963,
"learning_rate": 5.8339446482160725e-06,
"loss": 0.4636,
"step": 16800
},
{
"epoch": 1.0105544289577981,
"eval_loss": 0.4128175675868988,
"eval_runtime": 51.0766,
"eval_samples_per_second": 195.784,
"eval_steps_per_second": 24.473,
"step": 16800
},
{
"epoch": 1.0165696338920707,
"grad_norm": 1.1893339157104492,
"learning_rate": 5.832944314771591e-06,
"loss": 0.4628,
"step": 16900
},
{
"epoch": 1.0165696338920707,
"eval_loss": 0.4195719361305237,
"eval_runtime": 51.0932,
"eval_samples_per_second": 195.721,
"eval_steps_per_second": 24.465,
"step": 16900
},
{
"epoch": 1.0225848388263432,
"grad_norm": 1.1112314462661743,
"learning_rate": 5.831943981327109e-06,
"loss": 0.4631,
"step": 17000
},
{
"epoch": 1.0225848388263432,
"eval_loss": 0.41490069031715393,
"eval_runtime": 51.0962,
"eval_samples_per_second": 195.709,
"eval_steps_per_second": 24.464,
"step": 17000
},
{
"epoch": 1.028600043760616,
"grad_norm": 1.0246236324310303,
"learning_rate": 5.830943647882628e-06,
"loss": 0.4634,
"step": 17100
},
{
"epoch": 1.028600043760616,
"eval_loss": 0.4150553345680237,
"eval_runtime": 51.0756,
"eval_samples_per_second": 195.788,
"eval_steps_per_second": 24.474,
"step": 17100
},
{
"epoch": 1.0346152486948885,
"grad_norm": 1.09652578830719,
"learning_rate": 5.8299433144381465e-06,
"loss": 0.4618,
"step": 17200
},
{
"epoch": 1.0346152486948885,
"eval_loss": 0.41938120126724243,
"eval_runtime": 51.0832,
"eval_samples_per_second": 195.759,
"eval_steps_per_second": 24.47,
"step": 17200
},
{
"epoch": 1.040630453629161,
"grad_norm": 1.123412013053894,
"learning_rate": 5.828942980993664e-06,
"loss": 0.4598,
"step": 17300
},
{
"epoch": 1.040630453629161,
"eval_loss": 0.4131644666194916,
"eval_runtime": 51.0626,
"eval_samples_per_second": 195.838,
"eval_steps_per_second": 24.48,
"step": 17300
},
{
"epoch": 1.0466456585634338,
"grad_norm": 1.195304274559021,
"learning_rate": 5.827942647549183e-06,
"loss": 0.455,
"step": 17400
},
{
"epoch": 1.0466456585634338,
"eval_loss": 0.40582725405693054,
"eval_runtime": 51.2954,
"eval_samples_per_second": 194.949,
"eval_steps_per_second": 24.369,
"step": 17400
},
{
"epoch": 1.0526608634977064,
"grad_norm": 1.149339199066162,
"learning_rate": 5.826942314104702e-06,
"loss": 0.4547,
"step": 17500
},
{
"epoch": 1.0526608634977064,
"eval_loss": 0.4130345582962036,
"eval_runtime": 51.0931,
"eval_samples_per_second": 195.721,
"eval_steps_per_second": 24.465,
"step": 17500
},
{
"epoch": 1.058676068431979,
"grad_norm": 1.1289178133010864,
"learning_rate": 5.8259419806602205e-06,
"loss": 0.4551,
"step": 17600
},
{
"epoch": 1.058676068431979,
"eval_loss": 0.4048755466938019,
"eval_runtime": 51.0261,
"eval_samples_per_second": 195.978,
"eval_steps_per_second": 24.497,
"step": 17600
},
{
"epoch": 1.0646912733662515,
"grad_norm": 1.1146255731582642,
"learning_rate": 5.824941647215739e-06,
"loss": 0.4509,
"step": 17700
},
{
"epoch": 1.0646912733662515,
"eval_loss": 0.401869535446167,
"eval_runtime": 51.168,
"eval_samples_per_second": 195.435,
"eval_steps_per_second": 24.429,
"step": 17700
},
{
"epoch": 1.0707064783005242,
"grad_norm": 1.2300053834915161,
"learning_rate": 5.823941313771257e-06,
"loss": 0.4505,
"step": 17800
},
{
"epoch": 1.0707064783005242,
"eval_loss": 0.4011248052120209,
"eval_runtime": 51.0381,
"eval_samples_per_second": 195.932,
"eval_steps_per_second": 24.491,
"step": 17800
},
{
"epoch": 1.0767216832347968,
"grad_norm": 1.1278949975967407,
"learning_rate": 5.822940980326776e-06,
"loss": 0.4499,
"step": 17900
},
{
"epoch": 1.0767216832347968,
"eval_loss": 0.4098372459411621,
"eval_runtime": 51.1549,
"eval_samples_per_second": 195.485,
"eval_steps_per_second": 24.436,
"step": 17900
},
{
"epoch": 1.0827368881690693,
"grad_norm": 1.1039050817489624,
"learning_rate": 5.8219406468822945e-06,
"loss": 0.4479,
"step": 18000
},
{
"epoch": 1.0827368881690693,
"eval_loss": 0.4014202356338501,
"eval_runtime": 51.282,
"eval_samples_per_second": 195.0,
"eval_steps_per_second": 24.375,
"step": 18000
},
{
"epoch": 1.0887520931033419,
"grad_norm": 1.0981614589691162,
"learning_rate": 5.820940313437812e-06,
"loss": 0.4505,
"step": 18100
},
{
"epoch": 1.0887520931033419,
"eval_loss": 0.40326839685440063,
"eval_runtime": 51.0953,
"eval_samples_per_second": 195.713,
"eval_steps_per_second": 24.464,
"step": 18100
},
{
"epoch": 1.0947672980376146,
"grad_norm": 1.1146022081375122,
"learning_rate": 5.819939979993331e-06,
"loss": 0.4485,
"step": 18200
},
{
"epoch": 1.0947672980376146,
"eval_loss": 0.4028699994087219,
"eval_runtime": 51.095,
"eval_samples_per_second": 195.714,
"eval_steps_per_second": 24.464,
"step": 18200
},
{
"epoch": 1.1007825029718872,
"grad_norm": 1.0906445980072021,
"learning_rate": 5.81893964654885e-06,
"loss": 0.4441,
"step": 18300
},
{
"epoch": 1.1007825029718872,
"eval_loss": 0.39843133091926575,
"eval_runtime": 51.2428,
"eval_samples_per_second": 195.149,
"eval_steps_per_second": 24.394,
"step": 18300
},
{
"epoch": 1.1067977079061597,
"grad_norm": 1.0257636308670044,
"learning_rate": 5.8179393131043684e-06,
"loss": 0.4456,
"step": 18400
},
{
"epoch": 1.1067977079061597,
"eval_loss": 0.3976500630378723,
"eval_runtime": 51.0817,
"eval_samples_per_second": 195.765,
"eval_steps_per_second": 24.471,
"step": 18400
},
{
"epoch": 1.1128129128404325,
"grad_norm": 1.1339443922042847,
"learning_rate": 5.816938979659887e-06,
"loss": 0.4441,
"step": 18500
},
{
"epoch": 1.1128129128404325,
"eval_loss": 0.403137743473053,
"eval_runtime": 51.196,
"eval_samples_per_second": 195.328,
"eval_steps_per_second": 24.416,
"step": 18500
},
{
"epoch": 1.118828117774705,
"grad_norm": 1.146203637123108,
"learning_rate": 5.815938646215406e-06,
"loss": 0.4431,
"step": 18600
},
{
"epoch": 1.118828117774705,
"eval_loss": 0.40482422709465027,
"eval_runtime": 51.0834,
"eval_samples_per_second": 195.758,
"eval_steps_per_second": 24.47,
"step": 18600
},
{
"epoch": 1.1248433227089776,
"grad_norm": 1.1327886581420898,
"learning_rate": 5.814938312770924e-06,
"loss": 0.4446,
"step": 18700
},
{
"epoch": 1.1248433227089776,
"eval_loss": 0.39922335743904114,
"eval_runtime": 51.1856,
"eval_samples_per_second": 195.367,
"eval_steps_per_second": 24.421,
"step": 18700
},
{
"epoch": 1.1308585276432503,
"grad_norm": 1.1702196598052979,
"learning_rate": 5.8139379793264424e-06,
"loss": 0.4412,
"step": 18800
},
{
"epoch": 1.1308585276432503,
"eval_loss": 0.39871400594711304,
"eval_runtime": 51.1987,
"eval_samples_per_second": 195.317,
"eval_steps_per_second": 24.415,
"step": 18800
},
{
"epoch": 1.1368737325775229,
"grad_norm": 1.0438004732131958,
"learning_rate": 5.81293764588196e-06,
"loss": 0.44,
"step": 18900
},
{
"epoch": 1.1368737325775229,
"eval_loss": 0.3967694044113159,
"eval_runtime": 51.0919,
"eval_samples_per_second": 195.726,
"eval_steps_per_second": 24.466,
"step": 18900
},
{
"epoch": 1.1428889375117954,
"grad_norm": 1.0050268173217773,
"learning_rate": 5.811937312437479e-06,
"loss": 0.4395,
"step": 19000
},
{
"epoch": 1.1428889375117954,
"eval_loss": 0.3952539563179016,
"eval_runtime": 51.3885,
"eval_samples_per_second": 194.596,
"eval_steps_per_second": 24.325,
"step": 19000
},
{
"epoch": 1.148904142446068,
"grad_norm": 1.0875275135040283,
"learning_rate": 5.810936978992998e-06,
"loss": 0.4346,
"step": 19100
},
{
"epoch": 1.148904142446068,
"eval_loss": 0.3918244242668152,
"eval_runtime": 51.0342,
"eval_samples_per_second": 195.947,
"eval_steps_per_second": 24.493,
"step": 19100
},
{
"epoch": 1.1549193473803407,
"grad_norm": 1.0449281930923462,
"learning_rate": 5.809936645548516e-06,
"loss": 0.4391,
"step": 19200
},
{
"epoch": 1.1549193473803407,
"eval_loss": 0.3855830729007721,
"eval_runtime": 51.1568,
"eval_samples_per_second": 195.478,
"eval_steps_per_second": 24.435,
"step": 19200
},
{
"epoch": 1.1609345523146133,
"grad_norm": 0.9773437976837158,
"learning_rate": 5.808936312104035e-06,
"loss": 0.4355,
"step": 19300
},
{
"epoch": 1.1609345523146133,
"eval_loss": 0.3886500597000122,
"eval_runtime": 51.1956,
"eval_samples_per_second": 195.329,
"eval_steps_per_second": 24.416,
"step": 19300
},
{
"epoch": 1.1669497572488858,
"grad_norm": 1.091601014137268,
"learning_rate": 5.807935978659554e-06,
"loss": 0.4344,
"step": 19400
},
{
"epoch": 1.1669497572488858,
"eval_loss": 0.3868565857410431,
"eval_runtime": 51.1098,
"eval_samples_per_second": 195.657,
"eval_steps_per_second": 24.457,
"step": 19400
},
{
"epoch": 1.1729649621831584,
"grad_norm": 1.1882948875427246,
"learning_rate": 5.806935645215072e-06,
"loss": 0.434,
"step": 19500
},
{
"epoch": 1.1729649621831584,
"eval_loss": 0.38946595788002014,
"eval_runtime": 51.2843,
"eval_samples_per_second": 194.991,
"eval_steps_per_second": 24.374,
"step": 19500
},
{
"epoch": 1.1789801671174311,
"grad_norm": 1.0534999370574951,
"learning_rate": 5.80593531177059e-06,
"loss": 0.4329,
"step": 19600
},
{
"epoch": 1.1789801671174311,
"eval_loss": 0.3830993175506592,
"eval_runtime": 50.9094,
"eval_samples_per_second": 196.428,
"eval_steps_per_second": 24.553,
"step": 19600
},
{
"epoch": 1.1849953720517037,
"grad_norm": 1.0696886777877808,
"learning_rate": 5.804934978326108e-06,
"loss": 0.4311,
"step": 19700
},
{
"epoch": 1.1849953720517037,
"eval_loss": 0.39124995470046997,
"eval_runtime": 51.1273,
"eval_samples_per_second": 195.59,
"eval_steps_per_second": 24.449,
"step": 19700
},
{
"epoch": 1.1910105769859762,
"grad_norm": 1.0171489715576172,
"learning_rate": 5.803934644881627e-06,
"loss": 0.4332,
"step": 19800
},
{
"epoch": 1.1910105769859762,
"eval_loss": 0.384937584400177,
"eval_runtime": 51.3256,
"eval_samples_per_second": 194.834,
"eval_steps_per_second": 24.354,
"step": 19800
},
{
"epoch": 1.197025781920249,
"grad_norm": 1.1686575412750244,
"learning_rate": 5.802934311437146e-06,
"loss": 0.4289,
"step": 19900
},
{
"epoch": 1.197025781920249,
"eval_loss": 0.38561180233955383,
"eval_runtime": 51.072,
"eval_samples_per_second": 195.802,
"eval_steps_per_second": 24.475,
"step": 19900
},
{
"epoch": 1.2030409868545215,
"grad_norm": 1.0748465061187744,
"learning_rate": 5.801933977992664e-06,
"loss": 0.4334,
"step": 20000
},
{
"epoch": 1.2030409868545215,
"eval_loss": 0.382721871137619,
"eval_runtime": 51.3966,
"eval_samples_per_second": 194.565,
"eval_steps_per_second": 24.321,
"step": 20000
},
{
"epoch": 1.209056191788794,
"grad_norm": 1.100787878036499,
"learning_rate": 5.800933644548183e-06,
"loss": 0.4239,
"step": 20100
},
{
"epoch": 1.209056191788794,
"eval_loss": 0.3841208517551422,
"eval_runtime": 51.057,
"eval_samples_per_second": 195.859,
"eval_steps_per_second": 24.482,
"step": 20100
},
{
"epoch": 1.2150713967230669,
"grad_norm": 1.04718017578125,
"learning_rate": 5.799933311103702e-06,
"loss": 0.4271,
"step": 20200
},
{
"epoch": 1.2150713967230669,
"eval_loss": 0.3771766424179077,
"eval_runtime": 51.2777,
"eval_samples_per_second": 195.017,
"eval_steps_per_second": 24.377,
"step": 20200
},
{
"epoch": 1.2210866016573394,
"grad_norm": 1.1533209085464478,
"learning_rate": 5.79893297765922e-06,
"loss": 0.4254,
"step": 20300
},
{
"epoch": 1.2210866016573394,
"eval_loss": 0.38013017177581787,
"eval_runtime": 51.0118,
"eval_samples_per_second": 196.033,
"eval_steps_per_second": 24.504,
"step": 20300
},
{
"epoch": 1.227101806591612,
"grad_norm": 1.2025070190429688,
"learning_rate": 5.797932644214738e-06,
"loss": 0.4263,
"step": 20400
},
{
"epoch": 1.227101806591612,
"eval_loss": 0.37795642018318176,
"eval_runtime": 51.132,
"eval_samples_per_second": 195.572,
"eval_steps_per_second": 24.447,
"step": 20400
},
{
"epoch": 1.2331170115258845,
"grad_norm": 1.1051814556121826,
"learning_rate": 5.796932310770257e-06,
"loss": 0.4256,
"step": 20500
},
{
"epoch": 1.2331170115258845,
"eval_loss": 0.37627479434013367,
"eval_runtime": 50.9072,
"eval_samples_per_second": 196.436,
"eval_steps_per_second": 24.554,
"step": 20500
},
{
"epoch": 1.2391322164601573,
"grad_norm": 1.0987049341201782,
"learning_rate": 5.795931977325775e-06,
"loss": 0.4239,
"step": 20600
},
{
"epoch": 1.2391322164601573,
"eval_loss": 0.3853623569011688,
"eval_runtime": 51.0608,
"eval_samples_per_second": 195.845,
"eval_steps_per_second": 24.481,
"step": 20600
},
{
"epoch": 1.2451474213944298,
"grad_norm": 1.0989750623703003,
"learning_rate": 5.794931643881294e-06,
"loss": 0.4197,
"step": 20700
},
{
"epoch": 1.2451474213944298,
"eval_loss": 0.3807806670665741,
"eval_runtime": 51.3594,
"eval_samples_per_second": 194.706,
"eval_steps_per_second": 24.338,
"step": 20700
},
{
"epoch": 1.2511626263287023,
"grad_norm": 1.0866729021072388,
"learning_rate": 5.793931310436812e-06,
"loss": 0.4234,
"step": 20800
},
{
"epoch": 1.2511626263287023,
"eval_loss": 0.3777351379394531,
"eval_runtime": 51.0621,
"eval_samples_per_second": 195.84,
"eval_steps_per_second": 24.48,
"step": 20800
},
{
"epoch": 1.2571778312629749,
"grad_norm": 1.1387032270431519,
"learning_rate": 5.792930976992331e-06,
"loss": 0.4197,
"step": 20900
},
{
"epoch": 1.2571778312629749,
"eval_loss": 0.3739318549633026,
"eval_runtime": 51.1648,
"eval_samples_per_second": 195.447,
"eval_steps_per_second": 24.431,
"step": 20900
},
{
"epoch": 1.2631930361972477,
"grad_norm": 0.9848424792289734,
"learning_rate": 5.79193064354785e-06,
"loss": 0.4225,
"step": 21000
},
{
"epoch": 1.2631930361972477,
"eval_loss": 0.3804405629634857,
"eval_runtime": 51.1688,
"eval_samples_per_second": 195.431,
"eval_steps_per_second": 24.429,
"step": 21000
},
{
"epoch": 1.2692082411315202,
"grad_norm": 1.0492684841156006,
"learning_rate": 5.790930310103368e-06,
"loss": 0.4179,
"step": 21100
},
{
"epoch": 1.2692082411315202,
"eval_loss": 0.37157440185546875,
"eval_runtime": 51.0428,
"eval_samples_per_second": 195.914,
"eval_steps_per_second": 24.489,
"step": 21100
},
{
"epoch": 1.2752234460657927,
"grad_norm": 1.2355892658233643,
"learning_rate": 5.789929976658886e-06,
"loss": 0.4177,
"step": 21200
},
{
"epoch": 1.2752234460657927,
"eval_loss": 0.3794465661048889,
"eval_runtime": 51.1116,
"eval_samples_per_second": 195.65,
"eval_steps_per_second": 24.456,
"step": 21200
},
{
"epoch": 1.2812386510000655,
"grad_norm": 1.1180801391601562,
"learning_rate": 5.788929643214405e-06,
"loss": 0.4192,
"step": 21300
},
{
"epoch": 1.2812386510000655,
"eval_loss": 0.3741929829120636,
"eval_runtime": 51.043,
"eval_samples_per_second": 195.913,
"eval_steps_per_second": 24.489,
"step": 21300
},
{
"epoch": 1.287253855934338,
"grad_norm": 1.1260274648666382,
"learning_rate": 5.787929309769923e-06,
"loss": 0.4165,
"step": 21400
},
{
"epoch": 1.287253855934338,
"eval_loss": 0.37511906027793884,
"eval_runtime": 51.1867,
"eval_samples_per_second": 195.363,
"eval_steps_per_second": 24.42,
"step": 21400
},
{
"epoch": 1.2932690608686106,
"grad_norm": 1.0729244947433472,
"learning_rate": 5.7869289763254424e-06,
"loss": 0.4148,
"step": 21500
},
{
"epoch": 1.2932690608686106,
"eval_loss": 0.3755778670310974,
"eval_runtime": 50.9919,
"eval_samples_per_second": 196.11,
"eval_steps_per_second": 24.514,
"step": 21500
},
{
"epoch": 1.2992842658028834,
"grad_norm": 1.5396491289138794,
"learning_rate": 5.785928642880961e-06,
"loss": 0.4128,
"step": 21600
},
{
"epoch": 1.2992842658028834,
"eval_loss": 0.3713712990283966,
"eval_runtime": 51.0389,
"eval_samples_per_second": 195.929,
"eval_steps_per_second": 24.491,
"step": 21600
},
{
"epoch": 1.305299470737156,
"grad_norm": 0.9880481362342834,
"learning_rate": 5.784928309436479e-06,
"loss": 0.4138,
"step": 21700
},
{
"epoch": 1.305299470737156,
"eval_loss": 0.3710058033466339,
"eval_runtime": 51.3224,
"eval_samples_per_second": 194.847,
"eval_steps_per_second": 24.356,
"step": 21700
},
{
"epoch": 1.3113146756714285,
"grad_norm": 0.9788950085639954,
"learning_rate": 5.783927975991998e-06,
"loss": 0.4108,
"step": 21800
},
{
"epoch": 1.3113146756714285,
"eval_loss": 0.3687758147716522,
"eval_runtime": 51.0044,
"eval_samples_per_second": 196.062,
"eval_steps_per_second": 24.508,
"step": 21800
},
{
"epoch": 1.317329880605701,
"grad_norm": 1.0298100709915161,
"learning_rate": 5.782927642547516e-06,
"loss": 0.4129,
"step": 21900
},
{
"epoch": 1.317329880605701,
"eval_loss": 0.365496426820755,
"eval_runtime": 51.065,
"eval_samples_per_second": 195.829,
"eval_steps_per_second": 24.479,
"step": 21900
},
{
"epoch": 1.3233450855399735,
"grad_norm": 1.0753816366195679,
"learning_rate": 5.781927309103034e-06,
"loss": 0.413,
"step": 22000
},
{
"epoch": 1.3233450855399735,
"eval_loss": 0.3655156195163727,
"eval_runtime": 51.117,
"eval_samples_per_second": 195.63,
"eval_steps_per_second": 24.454,
"step": 22000
},
{
"epoch": 1.3293602904742463,
"grad_norm": 1.1379014253616333,
"learning_rate": 5.780926975658553e-06,
"loss": 0.4101,
"step": 22100
},
{
"epoch": 1.3293602904742463,
"eval_loss": 0.37188926339149475,
"eval_runtime": 51.0999,
"eval_samples_per_second": 195.695,
"eval_steps_per_second": 24.462,
"step": 22100
},
{
"epoch": 1.3353754954085189,
"grad_norm": 0.9869519472122192,
"learning_rate": 5.779926642214072e-06,
"loss": 0.4113,
"step": 22200
},
{
"epoch": 1.3353754954085189,
"eval_loss": 0.36685308814048767,
"eval_runtime": 50.9524,
"eval_samples_per_second": 196.262,
"eval_steps_per_second": 24.533,
"step": 22200
},
{
"epoch": 1.3413907003427914,
"grad_norm": 1.1977757215499878,
"learning_rate": 5.77892630876959e-06,
"loss": 0.4106,
"step": 22300
},
{
"epoch": 1.3413907003427914,
"eval_loss": 0.3694215714931488,
"eval_runtime": 50.8823,
"eval_samples_per_second": 196.532,
"eval_steps_per_second": 24.566,
"step": 22300
},
{
"epoch": 1.3474059052770642,
"grad_norm": 1.0620633363723755,
"learning_rate": 5.777925975325109e-06,
"loss": 0.407,
"step": 22400
},
{
"epoch": 1.3474059052770642,
"eval_loss": 0.36941900849342346,
"eval_runtime": 51.0452,
"eval_samples_per_second": 195.905,
"eval_steps_per_second": 24.488,
"step": 22400
},
{
"epoch": 1.3534211102113367,
"grad_norm": 1.0130232572555542,
"learning_rate": 5.776925641880627e-06,
"loss": 0.4076,
"step": 22500
},
{
"epoch": 1.3534211102113367,
"eval_loss": 0.3688518702983856,
"eval_runtime": 51.2935,
"eval_samples_per_second": 194.956,
"eval_steps_per_second": 24.37,
"step": 22500
},
{
"epoch": 1.3594363151456093,
"grad_norm": 1.1370288133621216,
"learning_rate": 5.775925308436146e-06,
"loss": 0.4058,
"step": 22600
},
{
"epoch": 1.3594363151456093,
"eval_loss": 0.35986149311065674,
"eval_runtime": 50.94,
"eval_samples_per_second": 196.309,
"eval_steps_per_second": 24.539,
"step": 22600
},
{
"epoch": 1.365451520079882,
"grad_norm": 1.0753254890441895,
"learning_rate": 5.7749249749916635e-06,
"loss": 0.404,
"step": 22700
},
{
"epoch": 1.365451520079882,
"eval_loss": 0.36281687021255493,
"eval_runtime": 51.0705,
"eval_samples_per_second": 195.808,
"eval_steps_per_second": 24.476,
"step": 22700
},
{
"epoch": 1.3714667250141546,
"grad_norm": 1.0779234170913696,
"learning_rate": 5.773924641547182e-06,
"loss": 0.4055,
"step": 22800
},
{
"epoch": 1.3714667250141546,
"eval_loss": 0.3607022762298584,
"eval_runtime": 51.2843,
"eval_samples_per_second": 194.992,
"eval_steps_per_second": 24.374,
"step": 22800
},
{
"epoch": 1.377481929948427,
"grad_norm": 1.0071178674697876,
"learning_rate": 5.772924308102701e-06,
"loss": 0.4038,
"step": 22900
},
{
"epoch": 1.377481929948427,
"eval_loss": 0.36346524953842163,
"eval_runtime": 50.9712,
"eval_samples_per_second": 196.189,
"eval_steps_per_second": 24.524,
"step": 22900
},
{
"epoch": 1.3834971348826999,
"grad_norm": 1.0683503150939941,
"learning_rate": 5.77192397465822e-06,
"loss": 0.4047,
"step": 23000
},
{
"epoch": 1.3834971348826999,
"eval_loss": 0.36117979884147644,
"eval_runtime": 51.0395,
"eval_samples_per_second": 195.927,
"eval_steps_per_second": 24.491,
"step": 23000
},
{
"epoch": 1.3895123398169724,
"grad_norm": 1.1770708560943604,
"learning_rate": 5.770923641213738e-06,
"loss": 0.4043,
"step": 23100
},
{
"epoch": 1.3895123398169724,
"eval_loss": 0.36106517910957336,
"eval_runtime": 51.0648,
"eval_samples_per_second": 195.83,
"eval_steps_per_second": 24.479,
"step": 23100
},
{
"epoch": 1.395527544751245,
"grad_norm": 0.9239141941070557,
"learning_rate": 5.769923307769257e-06,
"loss": 0.4011,
"step": 23200
},
{
"epoch": 1.395527544751245,
"eval_loss": 0.3578794598579407,
"eval_runtime": 51.0531,
"eval_samples_per_second": 195.875,
"eval_steps_per_second": 24.484,
"step": 23200
},
{
"epoch": 1.4015427496855175,
"grad_norm": 1.2712723016738892,
"learning_rate": 5.768922974324775e-06,
"loss": 0.4008,
"step": 23300
},
{
"epoch": 1.4015427496855175,
"eval_loss": 0.3636392652988434,
"eval_runtime": 51.1514,
"eval_samples_per_second": 195.498,
"eval_steps_per_second": 24.437,
"step": 23300
},
{
"epoch": 1.40755795461979,
"grad_norm": 1.040955901145935,
"learning_rate": 5.767922640880294e-06,
"loss": 0.3974,
"step": 23400
},
{
"epoch": 1.40755795461979,
"eval_loss": 0.3629893660545349,
"eval_runtime": 51.021,
"eval_samples_per_second": 195.998,
"eval_steps_per_second": 24.5,
"step": 23400
},
{
"epoch": 1.4135731595540628,
"grad_norm": 0.9896743893623352,
"learning_rate": 5.766922307435812e-06,
"loss": 0.3991,
"step": 23500
},
{
"epoch": 1.4135731595540628,
"eval_loss": 0.35531342029571533,
"eval_runtime": 51.17,
"eval_samples_per_second": 195.427,
"eval_steps_per_second": 24.428,
"step": 23500
},
{
"epoch": 1.4195883644883354,
"grad_norm": 1.088028073310852,
"learning_rate": 5.76592197399133e-06,
"loss": 0.3972,
"step": 23600
},
{
"epoch": 1.4195883644883354,
"eval_loss": 0.35938191413879395,
"eval_runtime": 51.2648,
"eval_samples_per_second": 195.066,
"eval_steps_per_second": 24.383,
"step": 23600
},
{
"epoch": 1.425603569422608,
"grad_norm": 1.0598886013031006,
"learning_rate": 5.764921640546849e-06,
"loss": 0.4021,
"step": 23700
},
{
"epoch": 1.425603569422608,
"eval_loss": 0.35533782839775085,
"eval_runtime": 51.0234,
"eval_samples_per_second": 195.989,
"eval_steps_per_second": 24.499,
"step": 23700
},
{
"epoch": 1.4316187743568807,
"grad_norm": 1.1906119585037231,
"learning_rate": 5.763921307102368e-06,
"loss": 0.3977,
"step": 23800
},
{
"epoch": 1.4316187743568807,
"eval_loss": 0.3564583361148834,
"eval_runtime": 51.0223,
"eval_samples_per_second": 195.993,
"eval_steps_per_second": 24.499,
"step": 23800
},
{
"epoch": 1.4376339792911532,
"grad_norm": 1.1549937725067139,
"learning_rate": 5.762920973657886e-06,
"loss": 0.3942,
"step": 23900
},
{
"epoch": 1.4376339792911532,
"eval_loss": 0.3534764051437378,
"eval_runtime": 51.1427,
"eval_samples_per_second": 195.531,
"eval_steps_per_second": 24.441,
"step": 23900
},
{
"epoch": 1.4436491842254258,
"grad_norm": 1.0571911334991455,
"learning_rate": 5.761920640213405e-06,
"loss": 0.3953,
"step": 24000
},
{
"epoch": 1.4436491842254258,
"eval_loss": 0.3564269542694092,
"eval_runtime": 51.0367,
"eval_samples_per_second": 195.938,
"eval_steps_per_second": 24.492,
"step": 24000
},
{
"epoch": 1.4496643891596985,
"grad_norm": 1.058688998222351,
"learning_rate": 5.760920306768923e-06,
"loss": 0.3957,
"step": 24100
},
{
"epoch": 1.4496643891596985,
"eval_loss": 0.3465494215488434,
"eval_runtime": 51.0338,
"eval_samples_per_second": 195.949,
"eval_steps_per_second": 24.494,
"step": 24100
},
{
"epoch": 1.455679594093971,
"grad_norm": 1.0260639190673828,
"learning_rate": 5.759919973324442e-06,
"loss": 0.3954,
"step": 24200
},
{
"epoch": 1.455679594093971,
"eval_loss": 0.34943073987960815,
"eval_runtime": 50.8891,
"eval_samples_per_second": 196.506,
"eval_steps_per_second": 24.563,
"step": 24200
},
{
"epoch": 1.4616947990282436,
"grad_norm": 0.9939345717430115,
"learning_rate": 5.75891963987996e-06,
"loss": 0.3944,
"step": 24300
},
{
"epoch": 1.4616947990282436,
"eval_loss": 0.35242801904678345,
"eval_runtime": 51.0489,
"eval_samples_per_second": 195.891,
"eval_steps_per_second": 24.486,
"step": 24300
},
{
"epoch": 1.4677100039625164,
"grad_norm": 1.0830129384994507,
"learning_rate": 5.757919306435478e-06,
"loss": 0.3894,
"step": 24400
},
{
"epoch": 1.4677100039625164,
"eval_loss": 0.34800294041633606,
"eval_runtime": 51.3057,
"eval_samples_per_second": 194.91,
"eval_steps_per_second": 24.364,
"step": 24400
},
{
"epoch": 1.473725208896789,
"grad_norm": 1.0526846647262573,
"learning_rate": 5.756918972990997e-06,
"loss": 0.39,
"step": 24500
},
{
"epoch": 1.473725208896789,
"eval_loss": 0.3510083556175232,
"eval_runtime": 50.9026,
"eval_samples_per_second": 196.454,
"eval_steps_per_second": 24.557,
"step": 24500
},
{
"epoch": 1.4797404138310615,
"grad_norm": 1.1267868280410767,
"learning_rate": 5.755918639546516e-06,
"loss": 0.3902,
"step": 24600
},
{
"epoch": 1.4797404138310615,
"eval_loss": 0.3532961308956146,
"eval_runtime": 51.0797,
"eval_samples_per_second": 195.773,
"eval_steps_per_second": 24.472,
"step": 24600
},
{
"epoch": 1.485755618765334,
"grad_norm": 1.1018403768539429,
"learning_rate": 5.754918306102034e-06,
"loss": 0.3908,
"step": 24700
},
{
"epoch": 1.485755618765334,
"eval_loss": 0.3456381559371948,
"eval_runtime": 51.3247,
"eval_samples_per_second": 194.838,
"eval_steps_per_second": 24.355,
"step": 24700
},
{
"epoch": 1.4917708236996066,
"grad_norm": 1.0022377967834473,
"learning_rate": 5.753917972657553e-06,
"loss": 0.3869,
"step": 24800
},
{
"epoch": 1.4917708236996066,
"eval_loss": 0.3509150445461273,
"eval_runtime": 51.0426,
"eval_samples_per_second": 195.915,
"eval_steps_per_second": 24.489,
"step": 24800
},
{
"epoch": 1.4977860286338793,
"grad_norm": 1.02973210811615,
"learning_rate": 5.752917639213071e-06,
"loss": 0.3885,
"step": 24900
},
{
"epoch": 1.4977860286338793,
"eval_loss": 0.3488512635231018,
"eval_runtime": 50.9719,
"eval_samples_per_second": 196.187,
"eval_steps_per_second": 24.523,
"step": 24900
},
{
"epoch": 1.5038012335681519,
"grad_norm": 1.0170624256134033,
"learning_rate": 5.7519173057685896e-06,
"loss": 0.386,
"step": 25000
},
{
"epoch": 1.5038012335681519,
"eval_loss": 0.344295859336853,
"eval_runtime": 51.2301,
"eval_samples_per_second": 195.198,
"eval_steps_per_second": 24.4,
"step": 25000
},
{
"epoch": 1.5098164385024244,
"grad_norm": 1.0053726434707642,
"learning_rate": 5.750916972324108e-06,
"loss": 0.3885,
"step": 25100
},
{
"epoch": 1.5098164385024244,
"eval_loss": 0.34295952320098877,
"eval_runtime": 51.2643,
"eval_samples_per_second": 195.068,
"eval_steps_per_second": 24.383,
"step": 25100
},
{
"epoch": 1.5158316434366972,
"grad_norm": 0.9546186327934265,
"learning_rate": 5.749916638879626e-06,
"loss": 0.3902,
"step": 25200
},
{
"epoch": 1.5158316434366972,
"eval_loss": 0.3494739234447479,
"eval_runtime": 51.1243,
"eval_samples_per_second": 195.602,
"eval_steps_per_second": 24.45,
"step": 25200
},
{
"epoch": 1.5218468483709697,
"grad_norm": 1.0184184312820435,
"learning_rate": 5.748916305435145e-06,
"loss": 0.3853,
"step": 25300
},
{
"epoch": 1.5218468483709697,
"eval_loss": 0.34722205996513367,
"eval_runtime": 51.0304,
"eval_samples_per_second": 195.961,
"eval_steps_per_second": 24.495,
"step": 25300
},
{
"epoch": 1.5278620533052423,
"grad_norm": 1.0732802152633667,
"learning_rate": 5.747915971990664e-06,
"loss": 0.3868,
"step": 25400
},
{
"epoch": 1.5278620533052423,
"eval_loss": 0.34737443923950195,
"eval_runtime": 51.1073,
"eval_samples_per_second": 195.667,
"eval_steps_per_second": 24.458,
"step": 25400
},
{
"epoch": 1.533877258239515,
"grad_norm": 1.023866891860962,
"learning_rate": 5.746915638546182e-06,
"loss": 0.3846,
"step": 25500
},
{
"epoch": 1.533877258239515,
"eval_loss": 0.34227558970451355,
"eval_runtime": 51.0647,
"eval_samples_per_second": 195.83,
"eval_steps_per_second": 24.479,
"step": 25500
},
{
"epoch": 1.5398924631737876,
"grad_norm": 0.9621095657348633,
"learning_rate": 5.745915305101701e-06,
"loss": 0.3853,
"step": 25600
},
{
"epoch": 1.5398924631737876,
"eval_loss": 0.33890464901924133,
"eval_runtime": 37.4533,
"eval_samples_per_second": 266.999,
"eval_steps_per_second": 33.375,
"step": 25600
},
{
"epoch": 1.5459076681080601,
"grad_norm": 1.0459903478622437,
"learning_rate": 5.744914971657219e-06,
"loss": 0.3867,
"step": 25700
},
{
"epoch": 1.5459076681080601,
"eval_loss": 0.3423731327056885,
"eval_runtime": 51.0943,
"eval_samples_per_second": 195.717,
"eval_steps_per_second": 24.465,
"step": 25700
},
{
"epoch": 1.551922873042333,
"grad_norm": 1.0103187561035156,
"learning_rate": 5.7439146382127375e-06,
"loss": 0.3846,
"step": 25800
},
{
"epoch": 1.551922873042333,
"eval_loss": 0.3495667576789856,
"eval_runtime": 51.0619,
"eval_samples_per_second": 195.841,
"eval_steps_per_second": 24.48,
"step": 25800
},
{
"epoch": 1.5579380779766052,
"grad_norm": 1.1959409713745117,
"learning_rate": 5.742914304768256e-06,
"loss": 0.3836,
"step": 25900
},
{
"epoch": 1.5579380779766052,
"eval_loss": 0.34345749020576477,
"eval_runtime": 50.9931,
"eval_samples_per_second": 196.105,
"eval_steps_per_second": 24.513,
"step": 25900
},
{
"epoch": 1.563953282910878,
"grad_norm": 1.0257697105407715,
"learning_rate": 5.741913971323774e-06,
"loss": 0.3832,
"step": 26000
},
{
"epoch": 1.563953282910878,
"eval_loss": 0.3426493704319,
"eval_runtime": 51.1309,
"eval_samples_per_second": 195.577,
"eval_steps_per_second": 24.447,
"step": 26000
},
{
"epoch": 1.5699684878451505,
"grad_norm": 1.1140973567962646,
"learning_rate": 5.740913637879294e-06,
"loss": 0.3797,
"step": 26100
},
{
"epoch": 1.5699684878451505,
"eval_loss": 0.34580498933792114,
"eval_runtime": 51.1787,
"eval_samples_per_second": 195.394,
"eval_steps_per_second": 24.424,
"step": 26100
},
{
"epoch": 1.575983692779423,
"grad_norm": 1.0050679445266724,
"learning_rate": 5.739913304434812e-06,
"loss": 0.3749,
"step": 26200
},
{
"epoch": 1.575983692779423,
"eval_loss": 0.3454411029815674,
"eval_runtime": 51.1577,
"eval_samples_per_second": 195.474,
"eval_steps_per_second": 24.434,
"step": 26200
},
{
"epoch": 1.5819988977136958,
"grad_norm": 1.0191149711608887,
"learning_rate": 5.73891297099033e-06,
"loss": 0.3772,
"step": 26300
},
{
"epoch": 1.5819988977136958,
"eval_loss": 0.3403486907482147,
"eval_runtime": 51.0929,
"eval_samples_per_second": 195.722,
"eval_steps_per_second": 24.465,
"step": 26300
},
{
"epoch": 1.5880141026479684,
"grad_norm": 1.1277610063552856,
"learning_rate": 5.737912637545849e-06,
"loss": 0.3783,
"step": 26400
},
{
"epoch": 1.5880141026479684,
"eval_loss": 0.3426676392555237,
"eval_runtime": 51.3622,
"eval_samples_per_second": 194.696,
"eval_steps_per_second": 24.337,
"step": 26400
},
{
"epoch": 1.594029307582241,
"grad_norm": 1.12416672706604,
"learning_rate": 5.736912304101368e-06,
"loss": 0.3765,
"step": 26500
},
{
"epoch": 1.594029307582241,
"eval_loss": 0.3407214879989624,
"eval_runtime": 51.185,
"eval_samples_per_second": 195.37,
"eval_steps_per_second": 24.421,
"step": 26500
},
{
"epoch": 1.6000445125165137,
"grad_norm": 0.9676984548568726,
"learning_rate": 5.7359119706568855e-06,
"loss": 0.377,
"step": 26600
},
{
"epoch": 1.6000445125165137,
"eval_loss": 0.3347455859184265,
"eval_runtime": 50.9838,
"eval_samples_per_second": 196.141,
"eval_steps_per_second": 24.518,
"step": 26600
},
{
"epoch": 1.6060597174507862,
"grad_norm": 1.0561347007751465,
"learning_rate": 5.734911637212404e-06,
"loss": 0.3768,
"step": 26700
},
{
"epoch": 1.6060597174507862,
"eval_loss": 0.3399183452129364,
"eval_runtime": 51.075,
"eval_samples_per_second": 195.79,
"eval_steps_per_second": 24.474,
"step": 26700
},
{
"epoch": 1.6120749223850588,
"grad_norm": 1.2122465372085571,
"learning_rate": 5.733911303767923e-06,
"loss": 0.3763,
"step": 26800
},
{
"epoch": 1.6120749223850588,
"eval_loss": 0.33461084961891174,
"eval_runtime": 51.0463,
"eval_samples_per_second": 195.901,
"eval_steps_per_second": 24.488,
"step": 26800
},
{
"epoch": 1.6180901273193316,
"grad_norm": 1.0054854154586792,
"learning_rate": 5.732910970323442e-06,
"loss": 0.3786,
"step": 26900
},
{
"epoch": 1.6180901273193316,
"eval_loss": 0.3318628668785095,
"eval_runtime": 51.0826,
"eval_samples_per_second": 195.761,
"eval_steps_per_second": 24.47,
"step": 26900
},
{
"epoch": 1.624105332253604,
"grad_norm": 1.072472333908081,
"learning_rate": 5.73191063687896e-06,
"loss": 0.3762,
"step": 27000
},
{
"epoch": 1.624105332253604,
"eval_loss": 0.3293687403202057,
"eval_runtime": 51.072,
"eval_samples_per_second": 195.802,
"eval_steps_per_second": 24.475,
"step": 27000
},
{
"epoch": 1.6301205371878766,
"grad_norm": 1.0058602094650269,
"learning_rate": 5.730910303434478e-06,
"loss": 0.3716,
"step": 27100
},
{
"epoch": 1.6301205371878766,
"eval_loss": 0.33610230684280396,
"eval_runtime": 51.0651,
"eval_samples_per_second": 195.828,
"eval_steps_per_second": 24.479,
"step": 27100
},
{
"epoch": 1.6361357421221494,
"grad_norm": 1.0208802223205566,
"learning_rate": 5.729909969989997e-06,
"loss": 0.3724,
"step": 27200
},
{
"epoch": 1.6361357421221494,
"eval_loss": 0.3361985981464386,
"eval_runtime": 51.1569,
"eval_samples_per_second": 195.477,
"eval_steps_per_second": 24.435,
"step": 27200
},
{
"epoch": 1.6421509470564217,
"grad_norm": 1.0464400053024292,
"learning_rate": 5.728909636545516e-06,
"loss": 0.3732,
"step": 27300
},
{
"epoch": 1.6421509470564217,
"eval_loss": 0.3356834053993225,
"eval_runtime": 21.647,
"eval_samples_per_second": 461.957,
"eval_steps_per_second": 57.745,
"step": 27300
},
{
"epoch": 1.6481661519906945,
"grad_norm": 1.1063635349273682,
"learning_rate": 5.7279093031010335e-06,
"loss": 0.3725,
"step": 27400
},
{
"epoch": 1.6481661519906945,
"eval_loss": 0.3378269374370575,
"eval_runtime": 48.6948,
"eval_samples_per_second": 205.361,
"eval_steps_per_second": 25.67,
"step": 27400
},
{
"epoch": 1.654181356924967,
"grad_norm": 0.8910077214241028,
"learning_rate": 5.726908969656552e-06,
"loss": 0.3707,
"step": 27500
},
{
"epoch": 1.654181356924967,
"eval_loss": 0.3300679624080658,
"eval_runtime": 48.819,
"eval_samples_per_second": 204.838,
"eval_steps_per_second": 25.605,
"step": 27500
},
{
"epoch": 1.6601965618592396,
"grad_norm": 0.9904689192771912,
"learning_rate": 5.725908636212071e-06,
"loss": 0.3722,
"step": 27600
},
{
"epoch": 1.6601965618592396,
"eval_loss": 0.33077552914619446,
"eval_runtime": 45.4305,
"eval_samples_per_second": 220.116,
"eval_steps_per_second": 27.515,
"step": 27600
},
{
"epoch": 1.6662117667935123,
"grad_norm": 1.0377715826034546,
"learning_rate": 5.72490830276759e-06,
"loss": 0.3693,
"step": 27700
},
{
"epoch": 1.6662117667935123,
"eval_loss": 0.3365156948566437,
"eval_runtime": 46.8492,
"eval_samples_per_second": 213.451,
"eval_steps_per_second": 26.681,
"step": 27700
},
{
"epoch": 1.672226971727785,
"grad_norm": 0.9838355183601379,
"learning_rate": 5.723907969323108e-06,
"loss": 0.373,
"step": 27800
},
{
"epoch": 1.672226971727785,
"eval_loss": 0.33353880047798157,
"eval_runtime": 47.6968,
"eval_samples_per_second": 209.658,
"eval_steps_per_second": 26.207,
"step": 27800
},
{
"epoch": 1.6782421766620574,
"grad_norm": 1.0050548315048218,
"learning_rate": 5.722907635878626e-06,
"loss": 0.3707,
"step": 27900
},
{
"epoch": 1.6782421766620574,
"eval_loss": 0.3265502154827118,
"eval_runtime": 48.1571,
"eval_samples_per_second": 207.654,
"eval_steps_per_second": 25.957,
"step": 27900
},
{
"epoch": 1.6842573815963302,
"grad_norm": 1.0083630084991455,
"learning_rate": 5.721907302434145e-06,
"loss": 0.3687,
"step": 28000
},
{
"epoch": 1.6842573815963302,
"eval_loss": 0.33139145374298096,
"eval_runtime": 48.694,
"eval_samples_per_second": 205.364,
"eval_steps_per_second": 25.671,
"step": 28000
},
{
"epoch": 1.6902725865306027,
"grad_norm": 0.9649508595466614,
"learning_rate": 5.7209069689896636e-06,
"loss": 0.3661,
"step": 28100
},
{
"epoch": 1.6902725865306027,
"eval_loss": 0.3332207202911377,
"eval_runtime": 40.0334,
"eval_samples_per_second": 249.792,
"eval_steps_per_second": 31.224,
"step": 28100
},
{
"epoch": 1.6962877914648753,
"grad_norm": 1.042528748512268,
"learning_rate": 5.7199066355451814e-06,
"loss": 0.3702,
"step": 28200
},
{
"epoch": 1.6962877914648753,
"eval_loss": 0.32571831345558167,
"eval_runtime": 49.2797,
"eval_samples_per_second": 202.923,
"eval_steps_per_second": 25.365,
"step": 28200
},
{
"epoch": 1.702302996399148,
"grad_norm": 0.9756554365158081,
"learning_rate": 5.7189063021007e-06,
"loss": 0.3647,
"step": 28300
},
{
"epoch": 1.702302996399148,
"eval_loss": 0.3234156668186188,
"eval_runtime": 49.7079,
"eval_samples_per_second": 201.175,
"eval_steps_per_second": 25.147,
"step": 28300
},
{
"epoch": 1.7083182013334206,
"grad_norm": 1.0613596439361572,
"learning_rate": 5.717905968656219e-06,
"loss": 0.3649,
"step": 28400
},
{
"epoch": 1.7083182013334206,
"eval_loss": 0.32939964532852173,
"eval_runtime": 50.06,
"eval_samples_per_second": 199.76,
"eval_steps_per_second": 24.97,
"step": 28400
},
{
"epoch": 1.7143334062676931,
"grad_norm": 1.0461217164993286,
"learning_rate": 5.7169056352117375e-06,
"loss": 0.3677,
"step": 28500
},
{
"epoch": 1.7143334062676931,
"eval_loss": 0.32745957374572754,
"eval_runtime": 50.0541,
"eval_samples_per_second": 199.784,
"eval_steps_per_second": 24.973,
"step": 28500
},
{
"epoch": 1.720348611201966,
"grad_norm": 1.0226540565490723,
"learning_rate": 5.715905301767256e-06,
"loss": 0.3642,
"step": 28600
},
{
"epoch": 1.720348611201966,
"eval_loss": 0.3290911316871643,
"eval_runtime": 50.4387,
"eval_samples_per_second": 198.26,
"eval_steps_per_second": 24.783,
"step": 28600
},
{
"epoch": 1.7263638161362382,
"grad_norm": 1.0498120784759521,
"learning_rate": 5.714904968322774e-06,
"loss": 0.3626,
"step": 28700
},
{
"epoch": 1.7263638161362382,
"eval_loss": 0.33111146092414856,
"eval_runtime": 50.7317,
"eval_samples_per_second": 197.115,
"eval_steps_per_second": 24.639,
"step": 28700
},
{
"epoch": 1.732379021070511,
"grad_norm": 1.0179612636566162,
"learning_rate": 5.713904634878293e-06,
"loss": 0.3611,
"step": 28800
},
{
"epoch": 1.732379021070511,
"eval_loss": 0.31966713070869446,
"eval_runtime": 35.8874,
"eval_samples_per_second": 278.65,
"eval_steps_per_second": 34.831,
"step": 28800
},
{
"epoch": 1.7383942260047835,
"grad_norm": 0.9876866340637207,
"learning_rate": 5.7129043014338115e-06,
"loss": 0.3609,
"step": 28900
},
{
"epoch": 1.7383942260047835,
"eval_loss": 0.3232952356338501,
"eval_runtime": 50.8899,
"eval_samples_per_second": 196.503,
"eval_steps_per_second": 24.563,
"step": 28900
},
{
"epoch": 1.744409430939056,
"grad_norm": 1.08419668674469,
"learning_rate": 5.711903967989329e-06,
"loss": 0.3621,
"step": 29000
},
{
"epoch": 1.744409430939056,
"eval_loss": 0.32880115509033203,
"eval_runtime": 50.9007,
"eval_samples_per_second": 196.461,
"eval_steps_per_second": 24.558,
"step": 29000
},
{
"epoch": 1.7504246358733289,
"grad_norm": 1.0506683588027954,
"learning_rate": 5.710903634544848e-06,
"loss": 0.3612,
"step": 29100
},
{
"epoch": 1.7504246358733289,
"eval_loss": 0.32626426219940186,
"eval_runtime": 51.3181,
"eval_samples_per_second": 194.863,
"eval_steps_per_second": 24.358,
"step": 29100
},
{
"epoch": 1.7564398408076014,
"grad_norm": 1.0610612630844116,
"learning_rate": 5.709903301100367e-06,
"loss": 0.3604,
"step": 29200
},
{
"epoch": 1.7564398408076014,
"eval_loss": 0.32427623867988586,
"eval_runtime": 51.1109,
"eval_samples_per_second": 195.653,
"eval_steps_per_second": 24.457,
"step": 29200
},
{
"epoch": 1.762455045741874,
"grad_norm": 1.0237441062927246,
"learning_rate": 5.7089029676558855e-06,
"loss": 0.3576,
"step": 29300
},
{
"epoch": 1.762455045741874,
"eval_loss": 0.325724720954895,
"eval_runtime": 51.0538,
"eval_samples_per_second": 195.872,
"eval_steps_per_second": 24.484,
"step": 29300
},
{
"epoch": 1.7684702506761467,
"grad_norm": 1.0518171787261963,
"learning_rate": 5.707902634211404e-06,
"loss": 0.3623,
"step": 29400
},
{
"epoch": 1.7684702506761467,
"eval_loss": 0.3236755430698395,
"eval_runtime": 51.279,
"eval_samples_per_second": 195.012,
"eval_steps_per_second": 24.376,
"step": 29400
},
{
"epoch": 1.7744854556104193,
"grad_norm": 1.008692741394043,
"learning_rate": 5.706902300766923e-06,
"loss": 0.3594,
"step": 29500
},
{
"epoch": 1.7744854556104193,
"eval_loss": 0.322955846786499,
"eval_runtime": 50.9674,
"eval_samples_per_second": 196.204,
"eval_steps_per_second": 24.525,
"step": 29500
},
{
"epoch": 1.7805006605446918,
"grad_norm": 1.0272122621536255,
"learning_rate": 5.705901967322441e-06,
"loss": 0.3589,
"step": 29600
},
{
"epoch": 1.7805006605446918,
"eval_loss": 0.32889479398727417,
"eval_runtime": 51.0901,
"eval_samples_per_second": 195.733,
"eval_steps_per_second": 24.467,
"step": 29600
},
{
"epoch": 1.7865158654789646,
"grad_norm": 0.9986202120780945,
"learning_rate": 5.7049016338779595e-06,
"loss": 0.3583,
"step": 29700
},
{
"epoch": 1.7865158654789646,
"eval_loss": 0.32579848170280457,
"eval_runtime": 51.3308,
"eval_samples_per_second": 194.815,
"eval_steps_per_second": 24.352,
"step": 29700
},
{
"epoch": 1.7925310704132371,
"grad_norm": 1.1426304578781128,
"learning_rate": 5.703901300433477e-06,
"loss": 0.3578,
"step": 29800
},
{
"epoch": 1.7925310704132371,
"eval_loss": 0.3219316303730011,
"eval_runtime": 51.0488,
"eval_samples_per_second": 195.891,
"eval_steps_per_second": 24.486,
"step": 29800
},
{
"epoch": 1.7985462753475097,
"grad_norm": 1.0315282344818115,
"learning_rate": 5.702900966988996e-06,
"loss": 0.3554,
"step": 29900
},
{
"epoch": 1.7985462753475097,
"eval_loss": 0.3245343267917633,
"eval_runtime": 51.1337,
"eval_samples_per_second": 195.566,
"eval_steps_per_second": 24.446,
"step": 29900
},
{
"epoch": 1.8045614802817824,
"grad_norm": 0.9708550572395325,
"learning_rate": 5.701900633544515e-06,
"loss": 0.3576,
"step": 30000
},
{
"epoch": 1.8045614802817824,
"eval_loss": 0.3180968761444092,
"eval_runtime": 51.0446,
"eval_samples_per_second": 195.907,
"eval_steps_per_second": 24.488,
"step": 30000
},
{
"epoch": 1.8105766852160547,
"grad_norm": 0.9034538865089417,
"learning_rate": 5.7009003001000335e-06,
"loss": 0.3537,
"step": 30100
},
{
"epoch": 1.8105766852160547,
"eval_loss": 0.3229399621486664,
"eval_runtime": 51.0689,
"eval_samples_per_second": 195.814,
"eval_steps_per_second": 24.477,
"step": 30100
},
{
"epoch": 1.8165918901503275,
"grad_norm": 1.0373872518539429,
"learning_rate": 5.699899966655552e-06,
"loss": 0.356,
"step": 30200
},
{
"epoch": 1.8165918901503275,
"eval_loss": 0.3164275288581848,
"eval_runtime": 51.4888,
"eval_samples_per_second": 194.217,
"eval_steps_per_second": 24.277,
"step": 30200
},
{
"epoch": 1.8226070950846,
"grad_norm": 1.073961615562439,
"learning_rate": 5.698899633211071e-06,
"loss": 0.3574,
"step": 30300
},
{
"epoch": 1.8226070950846,
"eval_loss": 0.3165951669216156,
"eval_runtime": 51.0637,
"eval_samples_per_second": 195.834,
"eval_steps_per_second": 24.479,
"step": 30300
},
{
"epoch": 1.8286223000188726,
"grad_norm": 0.9891506433486938,
"learning_rate": 5.697899299766589e-06,
"loss": 0.3548,
"step": 30400
},
{
"epoch": 1.8286223000188726,
"eval_loss": 0.3134399354457855,
"eval_runtime": 51.2735,
"eval_samples_per_second": 195.032,
"eval_steps_per_second": 24.379,
"step": 30400
},
{
"epoch": 1.8346375049531454,
"grad_norm": 0.9468514919281006,
"learning_rate": 5.6968989663221075e-06,
"loss": 0.3534,
"step": 30500
},
{
"epoch": 1.8346375049531454,
"eval_loss": 0.3175615966320038,
"eval_runtime": 51.0054,
"eval_samples_per_second": 196.058,
"eval_steps_per_second": 24.507,
"step": 30500
},
{
"epoch": 1.840652709887418,
"grad_norm": 1.0942094326019287,
"learning_rate": 5.695898632877625e-06,
"loss": 0.3551,
"step": 30600
},
{
"epoch": 1.840652709887418,
"eval_loss": 0.31934764981269836,
"eval_runtime": 50.744,
"eval_samples_per_second": 197.068,
"eval_steps_per_second": 24.633,
"step": 30600
},
{
"epoch": 1.8466679148216905,
"grad_norm": 1.0087659358978271,
"learning_rate": 5.694898299433144e-06,
"loss": 0.3534,
"step": 30700
},
{
"epoch": 1.8466679148216905,
"eval_loss": 0.3216070532798767,
"eval_runtime": 51.2443,
"eval_samples_per_second": 195.144,
"eval_steps_per_second": 24.393,
"step": 30700
},
{
"epoch": 1.8526831197559632,
"grad_norm": 0.973987340927124,
"learning_rate": 5.693897965988664e-06,
"loss": 0.3551,
"step": 30800
},
{
"epoch": 1.8526831197559632,
"eval_loss": 0.3222227990627289,
"eval_runtime": 51.317,
"eval_samples_per_second": 194.867,
"eval_steps_per_second": 24.358,
"step": 30800
},
{
"epoch": 1.8586983246902358,
"grad_norm": 1.0220999717712402,
"learning_rate": 5.6928976325441814e-06,
"loss": 0.3512,
"step": 30900
},
{
"epoch": 1.8586983246902358,
"eval_loss": 0.3149110972881317,
"eval_runtime": 50.9851,
"eval_samples_per_second": 196.136,
"eval_steps_per_second": 24.517,
"step": 30900
},
{
"epoch": 1.8647135296245083,
"grad_norm": 0.9891929626464844,
"learning_rate": 5.6918972990997e-06,
"loss": 0.3494,
"step": 31000
},
{
"epoch": 1.8647135296245083,
"eval_loss": 0.3158430755138397,
"eval_runtime": 51.0404,
"eval_samples_per_second": 195.923,
"eval_steps_per_second": 24.49,
"step": 31000
},
{
"epoch": 1.870728734558781,
"grad_norm": 1.0088871717453003,
"learning_rate": 5.690896965655219e-06,
"loss": 0.3554,
"step": 31100
},
{
"epoch": 1.870728734558781,
"eval_loss": 0.3154695928096771,
"eval_runtime": 51.3526,
"eval_samples_per_second": 194.732,
"eval_steps_per_second": 24.342,
"step": 31100
},
{
"epoch": 1.8767439394930534,
"grad_norm": 1.050904393196106,
"learning_rate": 5.689896632210737e-06,
"loss": 0.348,
"step": 31200
},
{
"epoch": 1.8767439394930534,
"eval_loss": 0.3176015019416809,
"eval_runtime": 50.968,
"eval_samples_per_second": 196.202,
"eval_steps_per_second": 24.525,
"step": 31200
},
{
"epoch": 1.8827591444273262,
"grad_norm": 0.9467193484306335,
"learning_rate": 5.688896298766255e-06,
"loss": 0.3495,
"step": 31300
},
{
"epoch": 1.8827591444273262,
"eval_loss": 0.31329813599586487,
"eval_runtime": 51.0441,
"eval_samples_per_second": 195.909,
"eval_steps_per_second": 24.489,
"step": 31300
},
{
"epoch": 1.888774349361599,
"grad_norm": 0.9775587916374207,
"learning_rate": 5.687895965321774e-06,
"loss": 0.348,
"step": 31400
},
{
"epoch": 1.888774349361599,
"eval_loss": 0.3119243383407593,
"eval_runtime": 51.4209,
"eval_samples_per_second": 194.474,
"eval_steps_per_second": 24.309,
"step": 31400
},
{
"epoch": 1.8947895542958713,
"grad_norm": 0.9961014986038208,
"learning_rate": 5.686895631877293e-06,
"loss": 0.3481,
"step": 31500
},
{
"epoch": 1.8947895542958713,
"eval_loss": 0.3146650791168213,
"eval_runtime": 51.0401,
"eval_samples_per_second": 195.924,
"eval_steps_per_second": 24.491,
"step": 31500
},
{
"epoch": 1.900804759230144,
"grad_norm": 0.9647944569587708,
"learning_rate": 5.6858952984328115e-06,
"loss": 0.3485,
"step": 31600
},
{
"epoch": 1.900804759230144,
"eval_loss": 0.3082703948020935,
"eval_runtime": 51.0736,
"eval_samples_per_second": 195.796,
"eval_steps_per_second": 24.474,
"step": 31600
},
{
"epoch": 1.9068199641644166,
"grad_norm": 0.977745532989502,
"learning_rate": 5.684894964988329e-06,
"loss": 0.346,
"step": 31700
},
{
"epoch": 1.9068199641644166,
"eval_loss": 0.31021973490715027,
"eval_runtime": 51.3893,
"eval_samples_per_second": 194.593,
"eval_steps_per_second": 24.324,
"step": 31700
},
{
"epoch": 1.9128351690986891,
"grad_norm": 1.007712960243225,
"learning_rate": 5.683894631543848e-06,
"loss": 0.3439,
"step": 31800
},
{
"epoch": 1.9128351690986891,
"eval_loss": 0.3149736225605011,
"eval_runtime": 50.9919,
"eval_samples_per_second": 196.109,
"eval_steps_per_second": 24.514,
"step": 31800
},
{
"epoch": 1.9188503740329619,
"grad_norm": 0.9901500940322876,
"learning_rate": 5.682894298099367e-06,
"loss": 0.3465,
"step": 31900
},
{
"epoch": 1.9188503740329619,
"eval_loss": 0.3099238872528076,
"eval_runtime": 49.9711,
"eval_samples_per_second": 200.116,
"eval_steps_per_second": 25.014,
"step": 31900
},
{
"epoch": 1.9248655789672344,
"grad_norm": 1.0771408081054688,
"learning_rate": 5.681893964654885e-06,
"loss": 0.3469,
"step": 32000
},
{
"epoch": 1.9248655789672344,
"eval_loss": 0.3117373585700989,
"eval_runtime": 51.3413,
"eval_samples_per_second": 194.775,
"eval_steps_per_second": 24.347,
"step": 32000
},
{
"epoch": 1.930880783901507,
"grad_norm": 0.9278393983840942,
"learning_rate": 5.680893631210403e-06,
"loss": 0.3449,
"step": 32100
},
{
"epoch": 1.930880783901507,
"eval_loss": 0.3087506890296936,
"eval_runtime": 50.9985,
"eval_samples_per_second": 196.084,
"eval_steps_per_second": 24.511,
"step": 32100
},
{
"epoch": 1.9368959888357797,
"grad_norm": 0.9451966285705566,
"learning_rate": 5.679893297765922e-06,
"loss": 0.3481,
"step": 32200
},
{
"epoch": 1.9368959888357797,
"eval_loss": 0.30677124857902527,
"eval_runtime": 51.0702,
"eval_samples_per_second": 195.809,
"eval_steps_per_second": 24.476,
"step": 32200
},
{
"epoch": 1.9429111937700523,
"grad_norm": 1.0483254194259644,
"learning_rate": 5.678892964321441e-06,
"loss": 0.3445,
"step": 32300
},
{
"epoch": 1.9429111937700523,
"eval_loss": 0.30840355157852173,
"eval_runtime": 51.0518,
"eval_samples_per_second": 195.879,
"eval_steps_per_second": 24.485,
"step": 32300
},
{
"epoch": 1.9489263987043248,
"grad_norm": 1.0422637462615967,
"learning_rate": 5.6778926308769595e-06,
"loss": 0.3441,
"step": 32400
},
{
"epoch": 1.9489263987043248,
"eval_loss": 0.3115750849246979,
"eval_runtime": 51.1153,
"eval_samples_per_second": 195.636,
"eval_steps_per_second": 24.455,
"step": 32400
},
{
"epoch": 1.9549416036385976,
"grad_norm": 0.9909389019012451,
"learning_rate": 5.676892297432478e-06,
"loss": 0.344,
"step": 32500
},
{
"epoch": 1.9549416036385976,
"eval_loss": 0.30596745014190674,
"eval_runtime": 51.3225,
"eval_samples_per_second": 194.846,
"eval_steps_per_second": 24.356,
"step": 32500
},
{
"epoch": 1.96095680857287,
"grad_norm": 0.9379361271858215,
"learning_rate": 5.675891963987996e-06,
"loss": 0.3451,
"step": 32600
},
{
"epoch": 1.96095680857287,
"eval_loss": 0.3045947253704071,
"eval_runtime": 48.1799,
"eval_samples_per_second": 207.555,
"eval_steps_per_second": 25.944,
"step": 32600
},
{
"epoch": 1.9669720135071427,
"grad_norm": 0.9916946887969971,
"learning_rate": 5.674891630543515e-06,
"loss": 0.3435,
"step": 32700
},
{
"epoch": 1.9669720135071427,
"eval_loss": 0.3098689019680023,
"eval_runtime": 51.0219,
"eval_samples_per_second": 195.994,
"eval_steps_per_second": 24.499,
"step": 32700
},
{
"epoch": 1.9729872184414154,
"grad_norm": 1.0491201877593994,
"learning_rate": 5.673891297099033e-06,
"loss": 0.3451,
"step": 32800
},
{
"epoch": 1.9729872184414154,
"eval_loss": 0.307062566280365,
"eval_runtime": 51.2447,
"eval_samples_per_second": 195.142,
"eval_steps_per_second": 24.393,
"step": 32800
},
{
"epoch": 1.9790024233756878,
"grad_norm": 1.0011417865753174,
"learning_rate": 5.672890963654551e-06,
"loss": 0.3438,
"step": 32900
},
{
"epoch": 1.9790024233756878,
"eval_loss": 0.30759868025779724,
"eval_runtime": 51.2551,
"eval_samples_per_second": 195.103,
"eval_steps_per_second": 24.388,
"step": 32900
},
{
"epoch": 1.9850176283099605,
"grad_norm": 0.997515082359314,
"learning_rate": 5.67189063021007e-06,
"loss": 0.3401,
"step": 33000
},
{
"epoch": 1.9850176283099605,
"eval_loss": 0.30724722146987915,
"eval_runtime": 51.0456,
"eval_samples_per_second": 195.903,
"eval_steps_per_second": 24.488,
"step": 33000
},
{
"epoch": 1.991032833244233,
"grad_norm": 1.00389564037323,
"learning_rate": 5.670890296765589e-06,
"loss": 0.3435,
"step": 33100
},
{
"epoch": 1.991032833244233,
"eval_loss": 0.30223432183265686,
"eval_runtime": 51.0634,
"eval_samples_per_second": 195.835,
"eval_steps_per_second": 24.479,
"step": 33100
},
{
"epoch": 1.9970480381785056,
"grad_norm": 1.0292458534240723,
"learning_rate": 5.6698899633211075e-06,
"loss": 0.342,
"step": 33200
},
{
"epoch": 1.9970480381785056,
"eval_loss": 0.3018937110900879,
"eval_runtime": 51.3884,
"eval_samples_per_second": 194.597,
"eval_steps_per_second": 24.325,
"step": 33200
},
{
"epoch": 2.0030632431127784,
"grad_norm": 0.9542250037193298,
"learning_rate": 5.668889629876626e-06,
"loss": 0.3437,
"step": 33300
},
{
"epoch": 2.0030632431127784,
"eval_loss": 0.3050287961959839,
"eval_runtime": 48.1087,
"eval_samples_per_second": 207.863,
"eval_steps_per_second": 25.983,
"step": 33300
},
{
"epoch": 2.0090784480470507,
"grad_norm": 0.9858297109603882,
"learning_rate": 5.667889296432144e-06,
"loss": 0.3376,
"step": 33400
},
{
"epoch": 2.0090784480470507,
"eval_loss": 0.3004157543182373,
"eval_runtime": 50.8704,
"eval_samples_per_second": 196.578,
"eval_steps_per_second": 24.572,
"step": 33400
},
{
"epoch": 2.0150936529813235,
"grad_norm": 0.9825339317321777,
"learning_rate": 5.666888962987663e-06,
"loss": 0.3387,
"step": 33500
},
{
"epoch": 2.0150936529813235,
"eval_loss": 0.3035270869731903,
"eval_runtime": 51.1972,
"eval_samples_per_second": 195.323,
"eval_steps_per_second": 24.415,
"step": 33500
},
{
"epoch": 2.0211088579155962,
"grad_norm": 0.9198622703552246,
"learning_rate": 5.665888629543181e-06,
"loss": 0.336,
"step": 33600
},
{
"epoch": 2.0211088579155962,
"eval_loss": 0.30675825476646423,
"eval_runtime": 50.9963,
"eval_samples_per_second": 196.093,
"eval_steps_per_second": 24.512,
"step": 33600
},
{
"epoch": 2.0271240628498686,
"grad_norm": 0.9473734498023987,
"learning_rate": 5.664888296098699e-06,
"loss": 0.336,
"step": 33700
},
{
"epoch": 2.0271240628498686,
"eval_loss": 0.3050824701786041,
"eval_runtime": 51.1058,
"eval_samples_per_second": 195.673,
"eval_steps_per_second": 24.459,
"step": 33700
},
{
"epoch": 2.0331392677841413,
"grad_norm": 0.9824632406234741,
"learning_rate": 5.663887962654218e-06,
"loss": 0.3366,
"step": 33800
},
{
"epoch": 2.0331392677841413,
"eval_loss": 0.3059363067150116,
"eval_runtime": 51.3136,
"eval_samples_per_second": 194.88,
"eval_steps_per_second": 24.36,
"step": 33800
},
{
"epoch": 2.039154472718414,
"grad_norm": 0.8891803622245789,
"learning_rate": 5.662887629209737e-06,
"loss": 0.3373,
"step": 33900
},
{
"epoch": 2.039154472718414,
"eval_loss": 0.2996893525123596,
"eval_runtime": 51.0027,
"eval_samples_per_second": 196.068,
"eval_steps_per_second": 24.509,
"step": 33900
},
{
"epoch": 2.0451696776526864,
"grad_norm": 1.0512337684631348,
"learning_rate": 5.6618872957652554e-06,
"loss": 0.3367,
"step": 34000
},
{
"epoch": 2.0451696776526864,
"eval_loss": 0.3059813976287842,
"eval_runtime": 48.247,
"eval_samples_per_second": 207.267,
"eval_steps_per_second": 25.908,
"step": 34000
},
{
"epoch": 2.051184882586959,
"grad_norm": 0.9054902791976929,
"learning_rate": 5.660886962320774e-06,
"loss": 0.3371,
"step": 34100
},
{
"epoch": 2.051184882586959,
"eval_loss": 0.3016323745250702,
"eval_runtime": 51.1014,
"eval_samples_per_second": 195.69,
"eval_steps_per_second": 24.461,
"step": 34100
},
{
"epoch": 2.057200087521232,
"grad_norm": 0.9262953400611877,
"learning_rate": 5.659886628876292e-06,
"loss": 0.3367,
"step": 34200
},
{
"epoch": 2.057200087521232,
"eval_loss": 0.29450055956840515,
"eval_runtime": 51.0335,
"eval_samples_per_second": 195.95,
"eval_steps_per_second": 24.494,
"step": 34200
},
{
"epoch": 2.0632152924555043,
"grad_norm": 0.9734236001968384,
"learning_rate": 5.658886295431811e-06,
"loss": 0.3343,
"step": 34300
},
{
"epoch": 2.0632152924555043,
"eval_loss": 0.3005402684211731,
"eval_runtime": 51.0508,
"eval_samples_per_second": 195.883,
"eval_steps_per_second": 24.485,
"step": 34300
},
{
"epoch": 2.069230497389777,
"grad_norm": 1.0002549886703491,
"learning_rate": 5.657885961987329e-06,
"loss": 0.3322,
"step": 34400
},
{
"epoch": 2.069230497389777,
"eval_loss": 0.2977810204029083,
"eval_runtime": 51.3717,
"eval_samples_per_second": 194.66,
"eval_steps_per_second": 24.332,
"step": 34400
},
{
"epoch": 2.07524570232405,
"grad_norm": 1.0582560300827026,
"learning_rate": 5.656885628542847e-06,
"loss": 0.3335,
"step": 34500
},
{
"epoch": 2.07524570232405,
"eval_loss": 0.30631959438323975,
"eval_runtime": 51.4392,
"eval_samples_per_second": 194.404,
"eval_steps_per_second": 24.301,
"step": 34500
},
{
"epoch": 2.081260907258322,
"grad_norm": 0.9257709383964539,
"learning_rate": 5.655885295098366e-06,
"loss": 0.3348,
"step": 34600
},
{
"epoch": 2.081260907258322,
"eval_loss": 0.296891450881958,
"eval_runtime": 51.1063,
"eval_samples_per_second": 195.671,
"eval_steps_per_second": 24.459,
"step": 34600
},
{
"epoch": 2.087276112192595,
"grad_norm": 0.9784733653068542,
"learning_rate": 5.654884961653885e-06,
"loss": 0.3351,
"step": 34700
},
{
"epoch": 2.087276112192595,
"eval_loss": 0.30041709542274475,
"eval_runtime": 36.3799,
"eval_samples_per_second": 274.877,
"eval_steps_per_second": 34.36,
"step": 34700
},
{
"epoch": 2.0932913171268677,
"grad_norm": 0.9119441509246826,
"learning_rate": 5.653884628209403e-06,
"loss": 0.3331,
"step": 34800
},
{
"epoch": 2.0932913171268677,
"eval_loss": 0.2985159754753113,
"eval_runtime": 51.0698,
"eval_samples_per_second": 195.811,
"eval_steps_per_second": 24.476,
"step": 34800
},
{
"epoch": 2.09930652206114,
"grad_norm": 0.8888152837753296,
"learning_rate": 5.652884294764922e-06,
"loss": 0.3329,
"step": 34900
},
{
"epoch": 2.09930652206114,
"eval_loss": 0.2997465431690216,
"eval_runtime": 51.2789,
"eval_samples_per_second": 195.012,
"eval_steps_per_second": 24.377,
"step": 34900
},
{
"epoch": 2.1053217269954128,
"grad_norm": 0.9288111329078674,
"learning_rate": 5.65188396132044e-06,
"loss": 0.3293,
"step": 35000
},
{
"epoch": 2.1053217269954128,
"eval_loss": 0.30220091342926025,
"eval_runtime": 51.0672,
"eval_samples_per_second": 195.82,
"eval_steps_per_second": 24.478,
"step": 35000
},
{
"epoch": 2.111336931929685,
"grad_norm": 0.9979832172393799,
"learning_rate": 5.650883627875959e-06,
"loss": 0.3335,
"step": 35100
},
{
"epoch": 2.111336931929685,
"eval_loss": 0.2983012795448303,
"eval_runtime": 51.1125,
"eval_samples_per_second": 195.647,
"eval_steps_per_second": 24.456,
"step": 35100
},
{
"epoch": 2.117352136863958,
"grad_norm": 0.9908544421195984,
"learning_rate": 5.649883294431477e-06,
"loss": 0.3308,
"step": 35200
},
{
"epoch": 2.117352136863958,
"eval_loss": 0.294648677110672,
"eval_runtime": 51.0363,
"eval_samples_per_second": 195.939,
"eval_steps_per_second": 24.492,
"step": 35200
},
{
"epoch": 2.1233673417982306,
"grad_norm": 0.9367330074310303,
"learning_rate": 5.648882960986995e-06,
"loss": 0.3308,
"step": 35300
},
{
"epoch": 2.1233673417982306,
"eval_loss": 0.2953595817089081,
"eval_runtime": 51.3129,
"eval_samples_per_second": 194.883,
"eval_steps_per_second": 24.36,
"step": 35300
},
{
"epoch": 2.129382546732503,
"grad_norm": 0.923230767250061,
"learning_rate": 5.647882627542515e-06,
"loss": 0.3305,
"step": 35400
},
{
"epoch": 2.129382546732503,
"eval_loss": 0.2954292893409729,
"eval_runtime": 51.1146,
"eval_samples_per_second": 195.639,
"eval_steps_per_second": 24.455,
"step": 35400
},
{
"epoch": 2.1353977516667757,
"grad_norm": 0.9737799167633057,
"learning_rate": 5.6468822940980335e-06,
"loss": 0.3321,
"step": 35500
},
{
"epoch": 2.1353977516667757,
"eval_loss": 0.2911643981933594,
"eval_runtime": 51.5291,
"eval_samples_per_second": 194.065,
"eval_steps_per_second": 24.258,
"step": 35500
},
{
"epoch": 2.1414129566010485,
"grad_norm": 0.957861602306366,
"learning_rate": 5.645881960653551e-06,
"loss": 0.3304,
"step": 35600
},
{
"epoch": 2.1414129566010485,
"eval_loss": 0.29846978187561035,
"eval_runtime": 50.954,
"eval_samples_per_second": 196.255,
"eval_steps_per_second": 24.532,
"step": 35600
},
{
"epoch": 2.147428161535321,
"grad_norm": 0.9183242321014404,
"learning_rate": 5.64488162720907e-06,
"loss": 0.3271,
"step": 35700
},
{
"epoch": 2.147428161535321,
"eval_loss": 0.2944715619087219,
"eval_runtime": 51.2205,
"eval_samples_per_second": 195.234,
"eval_steps_per_second": 24.404,
"step": 35700
},
{
"epoch": 2.1534433664695936,
"grad_norm": 0.9701703190803528,
"learning_rate": 5.643881293764588e-06,
"loss": 0.3293,
"step": 35800
},
{
"epoch": 2.1534433664695936,
"eval_loss": 0.29417359828948975,
"eval_runtime": 51.0579,
"eval_samples_per_second": 195.856,
"eval_steps_per_second": 24.482,
"step": 35800
},
{
"epoch": 2.1594585714038663,
"grad_norm": 0.992079496383667,
"learning_rate": 5.642880960320107e-06,
"loss": 0.3263,
"step": 35900
},
{
"epoch": 2.1594585714038663,
"eval_loss": 0.29444122314453125,
"eval_runtime": 51.0557,
"eval_samples_per_second": 195.864,
"eval_steps_per_second": 24.483,
"step": 35900
},
{
"epoch": 2.1654737763381386,
"grad_norm": 0.9776268005371094,
"learning_rate": 5.641880626875625e-06,
"loss": 0.3266,
"step": 36000
},
{
"epoch": 2.1654737763381386,
"eval_loss": 0.29786214232444763,
"eval_runtime": 44.4576,
"eval_samples_per_second": 224.934,
"eval_steps_per_second": 28.117,
"step": 36000
},
{
"epoch": 2.1714889812724114,
"grad_norm": 1.0352015495300293,
"learning_rate": 5.640880293431144e-06,
"loss": 0.3279,
"step": 36100
},
{
"epoch": 2.1714889812724114,
"eval_loss": 0.2935112416744232,
"eval_runtime": 51.0332,
"eval_samples_per_second": 195.951,
"eval_steps_per_second": 24.494,
"step": 36100
},
{
"epoch": 2.1775041862066837,
"grad_norm": 0.9267537593841553,
"learning_rate": 5.639879959986663e-06,
"loss": 0.3252,
"step": 36200
},
{
"epoch": 2.1775041862066837,
"eval_loss": 0.2946629822254181,
"eval_runtime": 51.0517,
"eval_samples_per_second": 195.88,
"eval_steps_per_second": 24.485,
"step": 36200
},
{
"epoch": 2.1835193911409565,
"grad_norm": 0.8838132619857788,
"learning_rate": 5.6388796265421815e-06,
"loss": 0.3273,
"step": 36300
},
{
"epoch": 2.1835193911409565,
"eval_loss": 0.28932899236679077,
"eval_runtime": 50.4286,
"eval_samples_per_second": 198.3,
"eval_steps_per_second": 24.788,
"step": 36300
},
{
"epoch": 2.1895345960752293,
"grad_norm": 0.9279465079307556,
"learning_rate": 5.637879293097699e-06,
"loss": 0.3282,
"step": 36400
},
{
"epoch": 2.1895345960752293,
"eval_loss": 0.2960895895957947,
"eval_runtime": 51.1104,
"eval_samples_per_second": 195.655,
"eval_steps_per_second": 24.457,
"step": 36400
},
{
"epoch": 2.1955498010095016,
"grad_norm": 1.0713165998458862,
"learning_rate": 5.636878959653218e-06,
"loss": 0.3269,
"step": 36500
},
{
"epoch": 2.1955498010095016,
"eval_loss": 0.29087430238723755,
"eval_runtime": 51.0616,
"eval_samples_per_second": 195.842,
"eval_steps_per_second": 24.48,
"step": 36500
},
{
"epoch": 2.2015650059437744,
"grad_norm": 0.966033935546875,
"learning_rate": 5.635878626208736e-06,
"loss": 0.3258,
"step": 36600
},
{
"epoch": 2.2015650059437744,
"eval_loss": 0.2945682108402252,
"eval_runtime": 51.2162,
"eval_samples_per_second": 195.251,
"eval_steps_per_second": 24.406,
"step": 36600
},
{
"epoch": 2.207580210878047,
"grad_norm": 1.0510607957839966,
"learning_rate": 5.634878292764255e-06,
"loss": 0.3239,
"step": 36700
},
{
"epoch": 2.207580210878047,
"eval_loss": 0.29083874821662903,
"eval_runtime": 51.0865,
"eval_samples_per_second": 195.746,
"eval_steps_per_second": 24.468,
"step": 36700
},
{
"epoch": 2.2135954158123194,
"grad_norm": 0.9516984224319458,
"learning_rate": 5.633877959319773e-06,
"loss": 0.3242,
"step": 36800
},
{
"epoch": 2.2135954158123194,
"eval_loss": 0.287597119808197,
"eval_runtime": 51.2859,
"eval_samples_per_second": 194.985,
"eval_steps_per_second": 24.373,
"step": 36800
},
{
"epoch": 2.219610620746592,
"grad_norm": 0.9704160094261169,
"learning_rate": 5.632877625875292e-06,
"loss": 0.3229,
"step": 36900
},
{
"epoch": 2.219610620746592,
"eval_loss": 0.28357696533203125,
"eval_runtime": 51.0184,
"eval_samples_per_second": 196.008,
"eval_steps_per_second": 24.501,
"step": 36900
},
{
"epoch": 2.225625825680865,
"grad_norm": 0.9318411350250244,
"learning_rate": 5.631877292430811e-06,
"loss": 0.3244,
"step": 37000
},
{
"epoch": 2.225625825680865,
"eval_loss": 0.2926484942436218,
"eval_runtime": 51.0515,
"eval_samples_per_second": 195.88,
"eval_steps_per_second": 24.485,
"step": 37000
},
{
"epoch": 2.2316410306151373,
"grad_norm": 0.9745403528213501,
"learning_rate": 5.6308769589863294e-06,
"loss": 0.3238,
"step": 37100
},
{
"epoch": 2.2316410306151373,
"eval_loss": 0.29221734404563904,
"eval_runtime": 51.0519,
"eval_samples_per_second": 195.879,
"eval_steps_per_second": 24.485,
"step": 37100
},
{
"epoch": 2.23765623554941,
"grad_norm": 1.0162553787231445,
"learning_rate": 5.629876625541847e-06,
"loss": 0.3209,
"step": 37200
},
{
"epoch": 2.23765623554941,
"eval_loss": 0.2900753319263458,
"eval_runtime": 51.0188,
"eval_samples_per_second": 196.006,
"eval_steps_per_second": 24.501,
"step": 37200
},
{
"epoch": 2.243671440483683,
"grad_norm": 0.9270024299621582,
"learning_rate": 5.628876292097366e-06,
"loss": 0.3218,
"step": 37300
},
{
"epoch": 2.243671440483683,
"eval_loss": 0.29185083508491516,
"eval_runtime": 49.1324,
"eval_samples_per_second": 203.532,
"eval_steps_per_second": 25.441,
"step": 37300
},
{
"epoch": 2.249686645417955,
"grad_norm": 1.0156973600387573,
"learning_rate": 5.627875958652885e-06,
"loss": 0.3221,
"step": 37400
},
{
"epoch": 2.249686645417955,
"eval_loss": 0.2883216440677643,
"eval_runtime": 51.0198,
"eval_samples_per_second": 196.002,
"eval_steps_per_second": 24.5,
"step": 37400
},
{
"epoch": 2.255701850352228,
"grad_norm": 0.884667694568634,
"learning_rate": 5.6268756252084026e-06,
"loss": 0.3231,
"step": 37500
},
{
"epoch": 2.255701850352228,
"eval_loss": 0.2843243181705475,
"eval_runtime": 51.199,
"eval_samples_per_second": 195.316,
"eval_steps_per_second": 24.415,
"step": 37500
},
{
"epoch": 2.2617170552865007,
"grad_norm": 1.0025333166122437,
"learning_rate": 5.625875291763921e-06,
"loss": 0.32,
"step": 37600
},
{
"epoch": 2.2617170552865007,
"eval_loss": 0.28985723853111267,
"eval_runtime": 51.0474,
"eval_samples_per_second": 195.896,
"eval_steps_per_second": 24.487,
"step": 37600
},
{
"epoch": 2.267732260220773,
"grad_norm": 0.9673831462860107,
"learning_rate": 5.62487495831944e-06,
"loss": 0.322,
"step": 37700
},
{
"epoch": 2.267732260220773,
"eval_loss": 0.2844723165035248,
"eval_runtime": 51.066,
"eval_samples_per_second": 195.825,
"eval_steps_per_second": 24.478,
"step": 37700
},
{
"epoch": 2.2737474651550458,
"grad_norm": 0.9513309597969055,
"learning_rate": 5.623874624874959e-06,
"loss": 0.3202,
"step": 37800
},
{
"epoch": 2.2737474651550458,
"eval_loss": 0.28764039278030396,
"eval_runtime": 51.061,
"eval_samples_per_second": 195.844,
"eval_steps_per_second": 24.481,
"step": 37800
},
{
"epoch": 2.279762670089318,
"grad_norm": 0.9131941795349121,
"learning_rate": 5.622874291430477e-06,
"loss": 0.3226,
"step": 37900
},
{
"epoch": 2.279762670089318,
"eval_loss": 0.28673484921455383,
"eval_runtime": 51.0581,
"eval_samples_per_second": 195.855,
"eval_steps_per_second": 24.482,
"step": 37900
},
{
"epoch": 2.285777875023591,
"grad_norm": 0.9458931684494019,
"learning_rate": 5.621873957985995e-06,
"loss": 0.3206,
"step": 38000
},
{
"epoch": 2.285777875023591,
"eval_loss": 0.2862774133682251,
"eval_runtime": 36.7081,
"eval_samples_per_second": 272.419,
"eval_steps_per_second": 34.052,
"step": 38000
},
{
"epoch": 2.2917930799578636,
"grad_norm": 0.997297465801239,
"learning_rate": 5.620873624541514e-06,
"loss": 0.3191,
"step": 38100
},
{
"epoch": 2.2917930799578636,
"eval_loss": 0.2823648750782013,
"eval_runtime": 51.0962,
"eval_samples_per_second": 195.709,
"eval_steps_per_second": 24.464,
"step": 38100
},
{
"epoch": 2.297808284892136,
"grad_norm": 0.9200996160507202,
"learning_rate": 5.619873291097033e-06,
"loss": 0.3187,
"step": 38200
},
{
"epoch": 2.297808284892136,
"eval_loss": 0.2872503995895386,
"eval_runtime": 51.0809,
"eval_samples_per_second": 195.768,
"eval_steps_per_second": 24.471,
"step": 38200
},
{
"epoch": 2.3038234898264087,
"grad_norm": 0.9441711902618408,
"learning_rate": 5.6188729576525505e-06,
"loss": 0.3209,
"step": 38300
},
{
"epoch": 2.3038234898264087,
"eval_loss": 0.28855210542678833,
"eval_runtime": 51.0269,
"eval_samples_per_second": 195.975,
"eval_steps_per_second": 24.497,
"step": 38300
},
{
"epoch": 2.3098386947606815,
"grad_norm": 1.0377998352050781,
"learning_rate": 5.617872624208069e-06,
"loss": 0.3189,
"step": 38400
},
{
"epoch": 2.3098386947606815,
"eval_loss": 0.2817797362804413,
"eval_runtime": 51.0556,
"eval_samples_per_second": 195.865,
"eval_steps_per_second": 24.483,
"step": 38400
},
{
"epoch": 2.315853899694954,
"grad_norm": 0.9088771939277649,
"learning_rate": 5.616872290763588e-06,
"loss": 0.3183,
"step": 38500
},
{
"epoch": 2.315853899694954,
"eval_loss": 0.28079554438591003,
"eval_runtime": 51.0907,
"eval_samples_per_second": 195.73,
"eval_steps_per_second": 24.466,
"step": 38500
},
{
"epoch": 2.3218691046292266,
"grad_norm": 0.8959800004959106,
"learning_rate": 5.615871957319107e-06,
"loss": 0.3174,
"step": 38600
},
{
"epoch": 2.3218691046292266,
"eval_loss": 0.28803524374961853,
"eval_runtime": 50.9133,
"eval_samples_per_second": 196.412,
"eval_steps_per_second": 24.552,
"step": 38600
},
{
"epoch": 2.3278843095634993,
"grad_norm": 0.9056723713874817,
"learning_rate": 5.614871623874625e-06,
"loss": 0.3167,
"step": 38700
},
{
"epoch": 2.3278843095634993,
"eval_loss": 0.2826622426509857,
"eval_runtime": 50.7905,
"eval_samples_per_second": 196.887,
"eval_steps_per_second": 24.611,
"step": 38700
},
{
"epoch": 2.3338995144977717,
"grad_norm": 0.9248780608177185,
"learning_rate": 5.613871290430143e-06,
"loss": 0.3176,
"step": 38800
},
{
"epoch": 2.3338995144977717,
"eval_loss": 0.2767186462879181,
"eval_runtime": 50.6115,
"eval_samples_per_second": 197.583,
"eval_steps_per_second": 24.698,
"step": 38800
},
{
"epoch": 2.3399147194320444,
"grad_norm": 0.9541249871253967,
"learning_rate": 5.612870956985662e-06,
"loss": 0.3187,
"step": 38900
},
{
"epoch": 2.3399147194320444,
"eval_loss": 0.28110334277153015,
"eval_runtime": 49.7615,
"eval_samples_per_second": 200.959,
"eval_steps_per_second": 25.12,
"step": 38900
},
{
"epoch": 2.3459299243663168,
"grad_norm": 0.9116654396057129,
"learning_rate": 5.611870623541181e-06,
"loss": 0.3147,
"step": 39000
},
{
"epoch": 2.3459299243663168,
"eval_loss": 0.2833644449710846,
"eval_runtime": 50.9711,
"eval_samples_per_second": 196.19,
"eval_steps_per_second": 24.524,
"step": 39000
},
{
"epoch": 2.3519451293005895,
"grad_norm": 0.9693782329559326,
"learning_rate": 5.6108702900966985e-06,
"loss": 0.3187,
"step": 39100
},
{
"epoch": 2.3519451293005895,
"eval_loss": 0.2744785249233246,
"eval_runtime": 51.0233,
"eval_samples_per_second": 195.989,
"eval_steps_per_second": 24.499,
"step": 39100
},
{
"epoch": 2.3579603342348623,
"grad_norm": 0.911391019821167,
"learning_rate": 5.609869956652217e-06,
"loss": 0.3144,
"step": 39200
},
{
"epoch": 2.3579603342348623,
"eval_loss": 0.27756959199905396,
"eval_runtime": 50.862,
"eval_samples_per_second": 196.61,
"eval_steps_per_second": 24.576,
"step": 39200
},
{
"epoch": 2.3639755391691346,
"grad_norm": 0.9383348822593689,
"learning_rate": 5.608869623207736e-06,
"loss": 0.3167,
"step": 39300
},
{
"epoch": 2.3639755391691346,
"eval_loss": 0.2751516103744507,
"eval_runtime": 51.0203,
"eval_samples_per_second": 196.001,
"eval_steps_per_second": 24.5,
"step": 39300
},
{
"epoch": 2.3699907441034074,
"grad_norm": 0.8825791478157043,
"learning_rate": 5.607869289763255e-06,
"loss": 0.3133,
"step": 39400
},
{
"epoch": 2.3699907441034074,
"eval_loss": 0.27583110332489014,
"eval_runtime": 51.0047,
"eval_samples_per_second": 196.06,
"eval_steps_per_second": 24.508,
"step": 39400
},
{
"epoch": 2.37600594903768,
"grad_norm": 0.9765325784683228,
"learning_rate": 5.606868956318773e-06,
"loss": 0.314,
"step": 39500
},
{
"epoch": 2.37600594903768,
"eval_loss": 0.2750406563282013,
"eval_runtime": 51.0333,
"eval_samples_per_second": 195.95,
"eval_steps_per_second": 24.494,
"step": 39500
},
{
"epoch": 2.3820211539719525,
"grad_norm": 0.968429684638977,
"learning_rate": 5.605868622874291e-06,
"loss": 0.3162,
"step": 39600
},
{
"epoch": 2.3820211539719525,
"eval_loss": 0.28406116366386414,
"eval_runtime": 50.9992,
"eval_samples_per_second": 196.081,
"eval_steps_per_second": 24.51,
"step": 39600
},
{
"epoch": 2.3880363589062252,
"grad_norm": 0.9351980686187744,
"learning_rate": 5.60486828942981e-06,
"loss": 0.3087,
"step": 39700
},
{
"epoch": 2.3880363589062252,
"eval_loss": 0.2797408103942871,
"eval_runtime": 51.0101,
"eval_samples_per_second": 196.04,
"eval_steps_per_second": 24.505,
"step": 39700
},
{
"epoch": 2.394051563840498,
"grad_norm": 0.9547052383422852,
"learning_rate": 5.603867955985329e-06,
"loss": 0.3139,
"step": 39800
},
{
"epoch": 2.394051563840498,
"eval_loss": 0.2779112458229065,
"eval_runtime": 50.9598,
"eval_samples_per_second": 196.233,
"eval_steps_per_second": 24.529,
"step": 39800
},
{
"epoch": 2.4000667687747703,
"grad_norm": 0.8971194624900818,
"learning_rate": 5.6028676225408465e-06,
"loss": 0.3113,
"step": 39900
},
{
"epoch": 2.4000667687747703,
"eval_loss": 0.28396087884902954,
"eval_runtime": 51.1122,
"eval_samples_per_second": 195.648,
"eval_steps_per_second": 24.456,
"step": 39900
},
{
"epoch": 2.406081973709043,
"grad_norm": 0.9058307409286499,
"learning_rate": 5.601867289096365e-06,
"loss": 0.314,
"step": 40000
},
{
"epoch": 2.406081973709043,
"eval_loss": 0.2806677222251892,
"eval_runtime": 50.9901,
"eval_samples_per_second": 196.117,
"eval_steps_per_second": 24.515,
"step": 40000
},
{
"epoch": 2.4120971786433154,
"grad_norm": 0.9002136588096619,
"learning_rate": 5.600866955651885e-06,
"loss": 0.3107,
"step": 40100
},
{
"epoch": 2.4120971786433154,
"eval_loss": 0.2816166579723358,
"eval_runtime": 50.992,
"eval_samples_per_second": 196.109,
"eval_steps_per_second": 24.514,
"step": 40100
},
{
"epoch": 2.418112383577588,
"grad_norm": 0.9614746570587158,
"learning_rate": 5.599866622207403e-06,
"loss": 0.3107,
"step": 40200
},
{
"epoch": 2.418112383577588,
"eval_loss": 0.2749168276786804,
"eval_runtime": 51.0276,
"eval_samples_per_second": 195.972,
"eval_steps_per_second": 24.497,
"step": 40200
},
{
"epoch": 2.424127588511861,
"grad_norm": 0.8742543458938599,
"learning_rate": 5.598866288762921e-06,
"loss": 0.3149,
"step": 40300
},
{
"epoch": 2.424127588511861,
"eval_loss": 0.2682496905326843,
"eval_runtime": 49.6212,
"eval_samples_per_second": 201.527,
"eval_steps_per_second": 25.191,
"step": 40300
},
{
"epoch": 2.4301427934461337,
"grad_norm": 0.9011858105659485,
"learning_rate": 5.59786595531844e-06,
"loss": 0.3094,
"step": 40400
},
{
"epoch": 2.4301427934461337,
"eval_loss": 0.277034729719162,
"eval_runtime": 51.0504,
"eval_samples_per_second": 195.885,
"eval_steps_per_second": 24.486,
"step": 40400
},
{
"epoch": 2.436157998380406,
"grad_norm": 0.9290640950202942,
"learning_rate": 5.596865621873958e-06,
"loss": 0.3114,
"step": 40500
},
{
"epoch": 2.436157998380406,
"eval_loss": 0.27406954765319824,
"eval_runtime": 51.0172,
"eval_samples_per_second": 196.012,
"eval_steps_per_second": 24.502,
"step": 40500
},
{
"epoch": 2.442173203314679,
"grad_norm": 0.89925616979599,
"learning_rate": 5.5958652884294766e-06,
"loss": 0.3096,
"step": 40600
},
{
"epoch": 2.442173203314679,
"eval_loss": 0.2777319848537445,
"eval_runtime": 51.1656,
"eval_samples_per_second": 195.444,
"eval_steps_per_second": 24.43,
"step": 40600
},
{
"epoch": 2.448188408248951,
"grad_norm": 0.8584897518157959,
"learning_rate": 5.594864954984994e-06,
"loss": 0.3123,
"step": 40700
},
{
"epoch": 2.448188408248951,
"eval_loss": 0.27250877022743225,
"eval_runtime": 51.0648,
"eval_samples_per_second": 195.829,
"eval_steps_per_second": 24.479,
"step": 40700
},
{
"epoch": 2.454203613183224,
"grad_norm": 0.9398366808891296,
"learning_rate": 5.593864621540514e-06,
"loss": 0.3108,
"step": 40800
},
{
"epoch": 2.454203613183224,
"eval_loss": 0.27442407608032227,
"eval_runtime": 51.06,
"eval_samples_per_second": 195.848,
"eval_steps_per_second": 24.481,
"step": 40800
},
{
"epoch": 2.4602188181174967,
"grad_norm": 0.8771011233329773,
"learning_rate": 5.592864288096033e-06,
"loss": 0.3107,
"step": 40900
},
{
"epoch": 2.4602188181174967,
"eval_loss": 0.27768152952194214,
"eval_runtime": 51.1346,
"eval_samples_per_second": 195.562,
"eval_steps_per_second": 24.445,
"step": 40900
},
{
"epoch": 2.466234023051769,
"grad_norm": 0.922232449054718,
"learning_rate": 5.5918639546515505e-06,
"loss": 0.3082,
"step": 41000
},
{
"epoch": 2.466234023051769,
"eval_loss": 0.27813389897346497,
"eval_runtime": 28.068,
"eval_samples_per_second": 356.278,
"eval_steps_per_second": 44.535,
"step": 41000
},
{
"epoch": 2.4722492279860417,
"grad_norm": 0.9415081143379211,
"learning_rate": 5.590863621207069e-06,
"loss": 0.3105,
"step": 41100
},
{
"epoch": 2.4722492279860417,
"eval_loss": 0.27401283383369446,
"eval_runtime": 50.7464,
"eval_samples_per_second": 197.058,
"eval_steps_per_second": 24.632,
"step": 41100
},
{
"epoch": 2.4782644329203145,
"grad_norm": 0.8894750475883484,
"learning_rate": 5.589863287762588e-06,
"loss": 0.31,
"step": 41200
},
{
"epoch": 2.4782644329203145,
"eval_loss": 0.2711414694786072,
"eval_runtime": 50.7544,
"eval_samples_per_second": 197.027,
"eval_steps_per_second": 24.628,
"step": 41200
},
{
"epoch": 2.484279637854587,
"grad_norm": 0.8910822868347168,
"learning_rate": 5.588862954318106e-06,
"loss": 0.3064,
"step": 41300
},
{
"epoch": 2.484279637854587,
"eval_loss": 0.2753881514072418,
"eval_runtime": 48.2521,
"eval_samples_per_second": 207.245,
"eval_steps_per_second": 25.906,
"step": 41300
},
{
"epoch": 2.4902948427888596,
"grad_norm": 0.890864908695221,
"learning_rate": 5.5878626208736245e-06,
"loss": 0.3042,
"step": 41400
},
{
"epoch": 2.4902948427888596,
"eval_loss": 0.27833056449890137,
"eval_runtime": 44.9081,
"eval_samples_per_second": 222.677,
"eval_steps_per_second": 27.835,
"step": 41400
},
{
"epoch": 2.4963100477231324,
"grad_norm": 0.8507567048072815,
"learning_rate": 5.586862287429143e-06,
"loss": 0.308,
"step": 41500
},
{
"epoch": 2.4963100477231324,
"eval_loss": 0.2749514579772949,
"eval_runtime": 45.6991,
"eval_samples_per_second": 218.823,
"eval_steps_per_second": 27.353,
"step": 41500
},
{
"epoch": 2.5023252526574047,
"grad_norm": 1.0246086120605469,
"learning_rate": 5.585861953984662e-06,
"loss": 0.308,
"step": 41600
},
{
"epoch": 2.5023252526574047,
"eval_loss": 0.2693102955818176,
"eval_runtime": 48.6013,
"eval_samples_per_second": 205.756,
"eval_steps_per_second": 25.719,
"step": 41600
},
{
"epoch": 2.5083404575916775,
"grad_norm": 1.015673279762268,
"learning_rate": 5.584861620540181e-06,
"loss": 0.3062,
"step": 41700
},
{
"epoch": 2.5083404575916775,
"eval_loss": 0.2740586996078491,
"eval_runtime": 49.0311,
"eval_samples_per_second": 203.952,
"eval_steps_per_second": 25.494,
"step": 41700
},
{
"epoch": 2.5143556625259498,
"grad_norm": 0.9325861930847168,
"learning_rate": 5.5838612870956985e-06,
"loss": 0.3085,
"step": 41800
},
{
"epoch": 2.5143556625259498,
"eval_loss": 0.2755836844444275,
"eval_runtime": 49.0354,
"eval_samples_per_second": 203.934,
"eval_steps_per_second": 25.492,
"step": 41800
},
{
"epoch": 2.5203708674602225,
"grad_norm": 0.8402740359306335,
"learning_rate": 5.582860953651217e-06,
"loss": 0.3074,
"step": 41900
},
{
"epoch": 2.5203708674602225,
"eval_loss": 0.2750794291496277,
"eval_runtime": 49.6049,
"eval_samples_per_second": 201.593,
"eval_steps_per_second": 25.199,
"step": 41900
},
{
"epoch": 2.5263860723944953,
"grad_norm": 0.8873264193534851,
"learning_rate": 5.581860620206736e-06,
"loss": 0.3073,
"step": 42000
},
{
"epoch": 2.5263860723944953,
"eval_loss": 0.2801840901374817,
"eval_runtime": 49.3914,
"eval_samples_per_second": 202.464,
"eval_steps_per_second": 25.308,
"step": 42000
},
{
"epoch": 2.5324012773287676,
"grad_norm": 0.9626051187515259,
"learning_rate": 5.580860286762254e-06,
"loss": 0.3068,
"step": 42100
},
{
"epoch": 2.5324012773287676,
"eval_loss": 0.2711939811706543,
"eval_runtime": 49.617,
"eval_samples_per_second": 201.544,
"eval_steps_per_second": 25.193,
"step": 42100
},
{
"epoch": 2.5384164822630404,
"grad_norm": 0.9168198108673096,
"learning_rate": 5.5798599533177725e-06,
"loss": 0.3059,
"step": 42200
},
{
"epoch": 2.5384164822630404,
"eval_loss": 0.270614355802536,
"eval_runtime": 50.1412,
"eval_samples_per_second": 199.437,
"eval_steps_per_second": 24.93,
"step": 42200
},
{
"epoch": 2.544431687197313,
"grad_norm": 0.9542158842086792,
"learning_rate": 5.578859619873291e-06,
"loss": 0.3061,
"step": 42300
},
{
"epoch": 2.544431687197313,
"eval_loss": 0.2705308198928833,
"eval_runtime": 50.4655,
"eval_samples_per_second": 198.155,
"eval_steps_per_second": 24.769,
"step": 42300
},
{
"epoch": 2.5504468921315855,
"grad_norm": 0.8468143939971924,
"learning_rate": 5.57785928642881e-06,
"loss": 0.3048,
"step": 42400
},
{
"epoch": 2.5504468921315855,
"eval_loss": 0.27329984307289124,
"eval_runtime": 50.4318,
"eval_samples_per_second": 198.288,
"eval_steps_per_second": 24.786,
"step": 42400
},
{
"epoch": 2.5564620970658583,
"grad_norm": 0.9493191838264465,
"learning_rate": 5.576858952984329e-06,
"loss": 0.3019,
"step": 42500
},
{
"epoch": 2.5564620970658583,
"eval_loss": 0.2731817364692688,
"eval_runtime": 50.5666,
"eval_samples_per_second": 197.759,
"eval_steps_per_second": 24.72,
"step": 42500
},
{
"epoch": 2.562477302000131,
"grad_norm": 0.9617642760276794,
"learning_rate": 5.5758586195398465e-06,
"loss": 0.3012,
"step": 42600
},
{
"epoch": 2.562477302000131,
"eval_loss": 0.26970621943473816,
"eval_runtime": 51.0766,
"eval_samples_per_second": 195.784,
"eval_steps_per_second": 24.473,
"step": 42600
},
{
"epoch": 2.5684925069344033,
"grad_norm": 0.9389893412590027,
"learning_rate": 5.574858286095365e-06,
"loss": 0.3027,
"step": 42700
},
{
"epoch": 2.5684925069344033,
"eval_loss": 0.27145934104919434,
"eval_runtime": 51.074,
"eval_samples_per_second": 195.794,
"eval_steps_per_second": 24.474,
"step": 42700
},
{
"epoch": 2.574507711868676,
"grad_norm": 0.9073367714881897,
"learning_rate": 5.573857952650884e-06,
"loss": 0.3021,
"step": 42800
},
{
"epoch": 2.574507711868676,
"eval_loss": 0.2711017429828644,
"eval_runtime": 51.072,
"eval_samples_per_second": 195.802,
"eval_steps_per_second": 24.475,
"step": 42800
},
{
"epoch": 2.5805229168029484,
"grad_norm": 0.8948126435279846,
"learning_rate": 5.572857619206402e-06,
"loss": 0.302,
"step": 42900
},
{
"epoch": 2.5805229168029484,
"eval_loss": 0.2703753113746643,
"eval_runtime": 51.0323,
"eval_samples_per_second": 195.954,
"eval_steps_per_second": 24.494,
"step": 42900
},
{
"epoch": 2.586538121737221,
"grad_norm": 0.943368136882782,
"learning_rate": 5.5718572857619205e-06,
"loss": 0.3007,
"step": 43000
},
{
"epoch": 2.586538121737221,
"eval_loss": 0.2676005959510803,
"eval_runtime": 51.147,
"eval_samples_per_second": 195.515,
"eval_steps_per_second": 24.439,
"step": 43000
},
{
"epoch": 2.592553326671494,
"grad_norm": 0.9073809385299683,
"learning_rate": 5.570856952317439e-06,
"loss": 0.3004,
"step": 43100
},
{
"epoch": 2.592553326671494,
"eval_loss": 0.26843926310539246,
"eval_runtime": 51.0148,
"eval_samples_per_second": 196.021,
"eval_steps_per_second": 24.503,
"step": 43100
},
{
"epoch": 2.5985685316057667,
"grad_norm": 0.9534226655960083,
"learning_rate": 5.569856618872958e-06,
"loss": 0.3039,
"step": 43200
},
{
"epoch": 2.5985685316057667,
"eval_loss": 0.2675269842147827,
"eval_runtime": 51.1418,
"eval_samples_per_second": 195.535,
"eval_steps_per_second": 24.442,
"step": 43200
},
{
"epoch": 2.604583736540039,
"grad_norm": 0.8546542525291443,
"learning_rate": 5.5688562854284766e-06,
"loss": 0.3008,
"step": 43300
},
{
"epoch": 2.604583736540039,
"eval_loss": 0.2680804133415222,
"eval_runtime": 51.0519,
"eval_samples_per_second": 195.879,
"eval_steps_per_second": 24.485,
"step": 43300
},
{
"epoch": 2.610598941474312,
"grad_norm": 0.9167499542236328,
"learning_rate": 5.567855951983995e-06,
"loss": 0.3001,
"step": 43400
},
{
"epoch": 2.610598941474312,
"eval_loss": 0.26866093277931213,
"eval_runtime": 51.2331,
"eval_samples_per_second": 195.186,
"eval_steps_per_second": 24.398,
"step": 43400
},
{
"epoch": 2.616614146408584,
"grad_norm": 0.9243641495704651,
"learning_rate": 5.566855618539513e-06,
"loss": 0.3007,
"step": 43500
},
{
"epoch": 2.616614146408584,
"eval_loss": 0.27828356623649597,
"eval_runtime": 35.4476,
"eval_samples_per_second": 282.107,
"eval_steps_per_second": 35.263,
"step": 43500
},
{
"epoch": 2.622629351342857,
"grad_norm": 0.9069240689277649,
"learning_rate": 5.565855285095032e-06,
"loss": 0.3039,
"step": 43600
},
{
"epoch": 2.622629351342857,
"eval_loss": 0.27373048663139343,
"eval_runtime": 51.0712,
"eval_samples_per_second": 195.805,
"eval_steps_per_second": 24.476,
"step": 43600
},
{
"epoch": 2.6286445562771297,
"grad_norm": 0.8967992663383484,
"learning_rate": 5.56485495165055e-06,
"loss": 0.3026,
"step": 43700
},
{
"epoch": 2.6286445562771297,
"eval_loss": 0.2672281861305237,
"eval_runtime": 51.0214,
"eval_samples_per_second": 195.996,
"eval_steps_per_second": 24.5,
"step": 43700
},
{
"epoch": 2.634659761211402,
"grad_norm": 0.8463547229766846,
"learning_rate": 5.563854618206068e-06,
"loss": 0.3018,
"step": 43800
},
{
"epoch": 2.634659761211402,
"eval_loss": 0.2690221071243286,
"eval_runtime": 51.0223,
"eval_samples_per_second": 195.993,
"eval_steps_per_second": 24.499,
"step": 43800
},
{
"epoch": 2.6406749661456748,
"grad_norm": 0.8656585812568665,
"learning_rate": 5.562854284761587e-06,
"loss": 0.3019,
"step": 43900
},
{
"epoch": 2.6406749661456748,
"eval_loss": 0.2694147229194641,
"eval_runtime": 51.2059,
"eval_samples_per_second": 195.29,
"eval_steps_per_second": 24.411,
"step": 43900
},
{
"epoch": 2.646690171079947,
"grad_norm": 0.8388367891311646,
"learning_rate": 5.561853951317106e-06,
"loss": 0.299,
"step": 44000
},
{
"epoch": 2.646690171079947,
"eval_loss": 0.27004268765449524,
"eval_runtime": 51.0385,
"eval_samples_per_second": 195.93,
"eval_steps_per_second": 24.491,
"step": 44000
},
{
"epoch": 2.65270537601422,
"grad_norm": 0.8733914494514465,
"learning_rate": 5.5608536178726245e-06,
"loss": 0.2996,
"step": 44100
},
{
"epoch": 2.65270537601422,
"eval_loss": 0.2620984613895416,
"eval_runtime": 51.1206,
"eval_samples_per_second": 195.616,
"eval_steps_per_second": 24.452,
"step": 44100
},
{
"epoch": 2.6587205809484926,
"grad_norm": 0.825485348701477,
"learning_rate": 5.559853284428143e-06,
"loss": 0.2996,
"step": 44200
},
{
"epoch": 2.6587205809484926,
"eval_loss": 0.26619336009025574,
"eval_runtime": 50.9856,
"eval_samples_per_second": 196.134,
"eval_steps_per_second": 24.517,
"step": 44200
},
{
"epoch": 2.6647357858827654,
"grad_norm": 0.9234973192214966,
"learning_rate": 5.558852950983661e-06,
"loss": 0.2994,
"step": 44300
},
{
"epoch": 2.6647357858827654,
"eval_loss": 0.269397497177124,
"eval_runtime": 51.1229,
"eval_samples_per_second": 195.607,
"eval_steps_per_second": 24.451,
"step": 44300
},
{
"epoch": 2.6707509908170377,
"grad_norm": 0.9815935492515564,
"learning_rate": 5.55785261753918e-06,
"loss": 0.2964,
"step": 44400
},
{
"epoch": 2.6707509908170377,
"eval_loss": 0.26540160179138184,
"eval_runtime": 51.0268,
"eval_samples_per_second": 195.975,
"eval_steps_per_second": 24.497,
"step": 44400
},
{
"epoch": 2.6767661957513105,
"grad_norm": 0.8895259499549866,
"learning_rate": 5.5568522840946985e-06,
"loss": 0.2943,
"step": 44500
},
{
"epoch": 2.6767661957513105,
"eval_loss": 0.2682526707649231,
"eval_runtime": 51.188,
"eval_samples_per_second": 195.358,
"eval_steps_per_second": 24.42,
"step": 44500
},
{
"epoch": 2.682781400685583,
"grad_norm": 0.8415577411651611,
"learning_rate": 5.555851950650216e-06,
"loss": 0.2972,
"step": 44600
},
{
"epoch": 2.682781400685583,
"eval_loss": 0.2677549421787262,
"eval_runtime": 51.1092,
"eval_samples_per_second": 195.66,
"eval_steps_per_second": 24.457,
"step": 44600
},
{
"epoch": 2.6887966056198556,
"grad_norm": 0.8922407031059265,
"learning_rate": 5.554851617205736e-06,
"loss": 0.2969,
"step": 44700
},
{
"epoch": 2.6887966056198556,
"eval_loss": 0.2671573758125305,
"eval_runtime": 51.0789,
"eval_samples_per_second": 195.776,
"eval_steps_per_second": 24.472,
"step": 44700
},
{
"epoch": 2.6948118105541283,
"grad_norm": 1.0156275033950806,
"learning_rate": 5.553851283761254e-06,
"loss": 0.2972,
"step": 44800
},
{
"epoch": 2.6948118105541283,
"eval_loss": 0.26524412631988525,
"eval_runtime": 51.0819,
"eval_samples_per_second": 195.764,
"eval_steps_per_second": 24.471,
"step": 44800
},
{
"epoch": 2.7008270154884007,
"grad_norm": 0.9283206462860107,
"learning_rate": 5.5528509503167725e-06,
"loss": 0.2953,
"step": 44900
},
{
"epoch": 2.7008270154884007,
"eval_loss": 0.26051226258277893,
"eval_runtime": 51.0731,
"eval_samples_per_second": 195.798,
"eval_steps_per_second": 24.475,
"step": 44900
},
{
"epoch": 2.7068422204226734,
"grad_norm": 0.9081267714500427,
"learning_rate": 5.551850616872291e-06,
"loss": 0.2956,
"step": 45000
},
{
"epoch": 2.7068422204226734,
"eval_loss": 0.26829174160957336,
"eval_runtime": 51.0764,
"eval_samples_per_second": 195.785,
"eval_steps_per_second": 24.473,
"step": 45000
},
{
"epoch": 2.712857425356946,
"grad_norm": 0.9797186255455017,
"learning_rate": 5.550850283427809e-06,
"loss": 0.2951,
"step": 45100
},
{
"epoch": 2.712857425356946,
"eval_loss": 0.2626285254955292,
"eval_runtime": 51.0441,
"eval_samples_per_second": 195.909,
"eval_steps_per_second": 24.489,
"step": 45100
},
{
"epoch": 2.7188726302912185,
"grad_norm": 0.972873866558075,
"learning_rate": 5.549849949983328e-06,
"loss": 0.2938,
"step": 45200
},
{
"epoch": 2.7188726302912185,
"eval_loss": 0.2651112675666809,
"eval_runtime": 51.1856,
"eval_samples_per_second": 195.368,
"eval_steps_per_second": 24.421,
"step": 45200
},
{
"epoch": 2.7248878352254913,
"grad_norm": 0.8637024164199829,
"learning_rate": 5.5488496165388465e-06,
"loss": 0.2951,
"step": 45300
},
{
"epoch": 2.7248878352254913,
"eval_loss": 0.26248618960380554,
"eval_runtime": 51.1456,
"eval_samples_per_second": 195.52,
"eval_steps_per_second": 24.44,
"step": 45300
},
{
"epoch": 2.730903040159764,
"grad_norm": 0.9163945317268372,
"learning_rate": 5.547849283094365e-06,
"loss": 0.2948,
"step": 45400
},
{
"epoch": 2.730903040159764,
"eval_loss": 0.2693786025047302,
"eval_runtime": 51.0867,
"eval_samples_per_second": 195.746,
"eval_steps_per_second": 24.468,
"step": 45400
},
{
"epoch": 2.7369182450940364,
"grad_norm": 1.0530128479003906,
"learning_rate": 5.546848949649884e-06,
"loss": 0.2944,
"step": 45500
},
{
"epoch": 2.7369182450940364,
"eval_loss": 0.2621295750141144,
"eval_runtime": 51.1036,
"eval_samples_per_second": 195.681,
"eval_steps_per_second": 24.46,
"step": 45500
},
{
"epoch": 2.742933450028309,
"grad_norm": 0.9258381128311157,
"learning_rate": 5.545848616205402e-06,
"loss": 0.2943,
"step": 45600
},
{
"epoch": 2.742933450028309,
"eval_loss": 0.25974345207214355,
"eval_runtime": 51.1397,
"eval_samples_per_second": 195.543,
"eval_steps_per_second": 24.443,
"step": 45600
},
{
"epoch": 2.7489486549625815,
"grad_norm": 0.8768019676208496,
"learning_rate": 5.5448482827609205e-06,
"loss": 0.2934,
"step": 45700
},
{
"epoch": 2.7489486549625815,
"eval_loss": 0.26323673129081726,
"eval_runtime": 51.1134,
"eval_samples_per_second": 195.643,
"eval_steps_per_second": 24.455,
"step": 45700
},
{
"epoch": 2.754963859896854,
"grad_norm": 0.8610267639160156,
"learning_rate": 5.543847949316439e-06,
"loss": 0.2934,
"step": 45800
},
{
"epoch": 2.754963859896854,
"eval_loss": 0.2621345818042755,
"eval_runtime": 51.0875,
"eval_samples_per_second": 195.743,
"eval_steps_per_second": 24.468,
"step": 45800
},
{
"epoch": 2.760979064831127,
"grad_norm": 0.8272863626480103,
"learning_rate": 5.542847615871957e-06,
"loss": 0.2952,
"step": 45900
},
{
"epoch": 2.760979064831127,
"eval_loss": 0.2651170790195465,
"eval_runtime": 51.1189,
"eval_samples_per_second": 195.622,
"eval_steps_per_second": 24.453,
"step": 45900
},
{
"epoch": 2.7669942697653997,
"grad_norm": 0.8691322207450867,
"learning_rate": 5.541847282427476e-06,
"loss": 0.2903,
"step": 46000
},
{
"epoch": 2.7669942697653997,
"eval_loss": 0.2674708664417267,
"eval_runtime": 51.0977,
"eval_samples_per_second": 195.704,
"eval_steps_per_second": 24.463,
"step": 46000
},
{
"epoch": 2.773009474699672,
"grad_norm": 0.9887429475784302,
"learning_rate": 5.5408469489829944e-06,
"loss": 0.2931,
"step": 46100
},
{
"epoch": 2.773009474699672,
"eval_loss": 0.2632472515106201,
"eval_runtime": 51.1106,
"eval_samples_per_second": 195.654,
"eval_steps_per_second": 24.457,
"step": 46100
},
{
"epoch": 2.779024679633945,
"grad_norm": 0.9419971704483032,
"learning_rate": 5.539846615538513e-06,
"loss": 0.2933,
"step": 46200
},
{
"epoch": 2.779024679633945,
"eval_loss": 0.2613042891025543,
"eval_runtime": 51.0338,
"eval_samples_per_second": 195.949,
"eval_steps_per_second": 24.494,
"step": 46200
},
{
"epoch": 2.785039884568217,
"grad_norm": 0.9267482161521912,
"learning_rate": 5.538846282094032e-06,
"loss": 0.2915,
"step": 46300
},
{
"epoch": 2.785039884568217,
"eval_loss": 0.2661626935005188,
"eval_runtime": 51.084,
"eval_samples_per_second": 195.756,
"eval_steps_per_second": 24.469,
"step": 46300
},
{
"epoch": 2.79105508950249,
"grad_norm": 0.9020786285400391,
"learning_rate": 5.5378459486495506e-06,
"loss": 0.2933,
"step": 46400
},
{
"epoch": 2.79105508950249,
"eval_loss": 0.2588748335838318,
"eval_runtime": 51.1198,
"eval_samples_per_second": 195.619,
"eval_steps_per_second": 24.452,
"step": 46400
},
{
"epoch": 2.7970702944367627,
"grad_norm": 0.893649160861969,
"learning_rate": 5.5368456152050684e-06,
"loss": 0.2914,
"step": 46500
},
{
"epoch": 2.7970702944367627,
"eval_loss": 0.2560584545135498,
"eval_runtime": 51.1578,
"eval_samples_per_second": 195.474,
"eval_steps_per_second": 24.434,
"step": 46500
},
{
"epoch": 2.803085499371035,
"grad_norm": 0.8569892644882202,
"learning_rate": 5.535845281760587e-06,
"loss": 0.2921,
"step": 46600
},
{
"epoch": 2.803085499371035,
"eval_loss": 0.26415926218032837,
"eval_runtime": 48.8588,
"eval_samples_per_second": 204.672,
"eval_steps_per_second": 25.584,
"step": 46600
},
{
"epoch": 2.809100704305308,
"grad_norm": 0.967966616153717,
"learning_rate": 5.534844948316105e-06,
"loss": 0.2932,
"step": 46700
},
{
"epoch": 2.809100704305308,
"eval_loss": 0.262004554271698,
"eval_runtime": 51.1167,
"eval_samples_per_second": 195.631,
"eval_steps_per_second": 24.454,
"step": 46700
},
{
"epoch": 2.81511590923958,
"grad_norm": 0.8977293968200684,
"learning_rate": 5.533844614871624e-06,
"loss": 0.291,
"step": 46800
},
{
"epoch": 2.81511590923958,
"eval_loss": 0.26304325461387634,
"eval_runtime": 51.1071,
"eval_samples_per_second": 195.668,
"eval_steps_per_second": 24.458,
"step": 46800
},
{
"epoch": 2.821131114173853,
"grad_norm": 0.8833451271057129,
"learning_rate": 5.532844281427142e-06,
"loss": 0.2879,
"step": 46900
},
{
"epoch": 2.821131114173853,
"eval_loss": 0.2652186155319214,
"eval_runtime": 51.1212,
"eval_samples_per_second": 195.614,
"eval_steps_per_second": 24.452,
"step": 46900
},
{
"epoch": 2.8271463191081256,
"grad_norm": 0.916098415851593,
"learning_rate": 5.531843947982661e-06,
"loss": 0.29,
"step": 47000
},
{
"epoch": 2.8271463191081256,
"eval_loss": 0.2618425190448761,
"eval_runtime": 51.1419,
"eval_samples_per_second": 195.534,
"eval_steps_per_second": 24.442,
"step": 47000
},
{
"epoch": 2.8331615240423984,
"grad_norm": 0.8808870315551758,
"learning_rate": 5.53084361453818e-06,
"loss": 0.2912,
"step": 47100
},
{
"epoch": 2.8331615240423984,
"eval_loss": 0.26288196444511414,
"eval_runtime": 51.1216,
"eval_samples_per_second": 195.612,
"eval_steps_per_second": 24.452,
"step": 47100
},
{
"epoch": 2.8391767289766707,
"grad_norm": 0.8972067832946777,
"learning_rate": 5.5298432810936985e-06,
"loss": 0.2914,
"step": 47200
},
{
"epoch": 2.8391767289766707,
"eval_loss": 0.2557620704174042,
"eval_runtime": 51.1227,
"eval_samples_per_second": 195.608,
"eval_steps_per_second": 24.451,
"step": 47200
},
{
"epoch": 2.8451919339109435,
"grad_norm": 0.8946945667266846,
"learning_rate": 5.528842947649216e-06,
"loss": 0.2894,
"step": 47300
},
{
"epoch": 2.8451919339109435,
"eval_loss": 0.26096677780151367,
"eval_runtime": 48.2836,
"eval_samples_per_second": 207.109,
"eval_steps_per_second": 25.889,
"step": 47300
},
{
"epoch": 2.851207138845216,
"grad_norm": 0.9023754000663757,
"learning_rate": 5.527842614204735e-06,
"loss": 0.2875,
"step": 47400
},
{
"epoch": 2.851207138845216,
"eval_loss": 0.25718143582344055,
"eval_runtime": 51.1174,
"eval_samples_per_second": 195.628,
"eval_steps_per_second": 24.453,
"step": 47400
},
{
"epoch": 2.8572223437794886,
"grad_norm": 0.8229103088378906,
"learning_rate": 5.526842280760254e-06,
"loss": 0.2875,
"step": 47500
},
{
"epoch": 2.8572223437794886,
"eval_loss": 0.26064789295196533,
"eval_runtime": 51.0796,
"eval_samples_per_second": 195.773,
"eval_steps_per_second": 24.472,
"step": 47500
},
{
"epoch": 2.8632375487137613,
"grad_norm": 0.7903328537940979,
"learning_rate": 5.525841947315772e-06,
"loss": 0.2888,
"step": 47600
},
{
"epoch": 2.8632375487137613,
"eval_loss": 0.25777605175971985,
"eval_runtime": 51.0732,
"eval_samples_per_second": 195.797,
"eval_steps_per_second": 24.475,
"step": 47600
},
{
"epoch": 2.8692527536480337,
"grad_norm": 0.9628756046295166,
"learning_rate": 5.52484161387129e-06,
"loss": 0.2909,
"step": 47700
},
{
"epoch": 2.8692527536480337,
"eval_loss": 0.2552904188632965,
"eval_runtime": 51.1083,
"eval_samples_per_second": 195.663,
"eval_steps_per_second": 24.458,
"step": 47700
},
{
"epoch": 2.8752679585823064,
"grad_norm": 0.8853189945220947,
"learning_rate": 5.523841280426809e-06,
"loss": 0.2885,
"step": 47800
},
{
"epoch": 2.8752679585823064,
"eval_loss": 0.2585737407207489,
"eval_runtime": 51.0832,
"eval_samples_per_second": 195.759,
"eval_steps_per_second": 24.47,
"step": 47800
},
{
"epoch": 2.8812831635165788,
"grad_norm": 0.9299560785293579,
"learning_rate": 5.522840946982328e-06,
"loss": 0.2865,
"step": 47900
},
{
"epoch": 2.8812831635165788,
"eval_loss": 0.2563331425189972,
"eval_runtime": 51.0909,
"eval_samples_per_second": 195.729,
"eval_steps_per_second": 24.466,
"step": 47900
},
{
"epoch": 2.8872983684508515,
"grad_norm": 0.9286957383155823,
"learning_rate": 5.5218406135378465e-06,
"loss": 0.2873,
"step": 48000
},
{
"epoch": 2.8872983684508515,
"eval_loss": 0.2592049837112427,
"eval_runtime": 48.2359,
"eval_samples_per_second": 207.315,
"eval_steps_per_second": 25.914,
"step": 48000
},
{
"epoch": 2.8933135733851243,
"grad_norm": 0.8729236125946045,
"learning_rate": 5.520840280093364e-06,
"loss": 0.2861,
"step": 48100
},
{
"epoch": 2.8933135733851243,
"eval_loss": 0.25870123505592346,
"eval_runtime": 51.1066,
"eval_samples_per_second": 195.669,
"eval_steps_per_second": 24.459,
"step": 48100
},
{
"epoch": 2.899328778319397,
"grad_norm": 0.8652471899986267,
"learning_rate": 5.519839946648883e-06,
"loss": 0.2867,
"step": 48200
},
{
"epoch": 2.899328778319397,
"eval_loss": 0.2612285017967224,
"eval_runtime": 51.1028,
"eval_samples_per_second": 195.684,
"eval_steps_per_second": 24.46,
"step": 48200
},
{
"epoch": 2.9053439832536694,
"grad_norm": 0.8425643444061279,
"learning_rate": 5.518839613204402e-06,
"loss": 0.2852,
"step": 48300
},
{
"epoch": 2.9053439832536694,
"eval_loss": 0.2628696858882904,
"eval_runtime": 51.123,
"eval_samples_per_second": 195.607,
"eval_steps_per_second": 24.451,
"step": 48300
},
{
"epoch": 2.911359188187942,
"grad_norm": 0.9844802021980286,
"learning_rate": 5.51783927975992e-06,
"loss": 0.2877,
"step": 48400
},
{
"epoch": 2.911359188187942,
"eval_loss": 0.2612448036670685,
"eval_runtime": 51.0987,
"eval_samples_per_second": 195.7,
"eval_steps_per_second": 24.462,
"step": 48400
},
{
"epoch": 2.9173743931222145,
"grad_norm": 0.878381073474884,
"learning_rate": 5.516838946315438e-06,
"loss": 0.2869,
"step": 48500
},
{
"epoch": 2.9173743931222145,
"eval_loss": 0.25639012455940247,
"eval_runtime": 51.1127,
"eval_samples_per_second": 195.646,
"eval_steps_per_second": 24.456,
"step": 48500
},
{
"epoch": 2.9233895980564872,
"grad_norm": 0.8658349514007568,
"learning_rate": 5.515838612870957e-06,
"loss": 0.2862,
"step": 48600
},
{
"epoch": 2.9233895980564872,
"eval_loss": 0.24971692264080048,
"eval_runtime": 51.1228,
"eval_samples_per_second": 195.607,
"eval_steps_per_second": 24.451,
"step": 48600
},
{
"epoch": 2.92940480299076,
"grad_norm": 0.8590924143791199,
"learning_rate": 5.514838279426476e-06,
"loss": 0.2868,
"step": 48700
},
{
"epoch": 2.92940480299076,
"eval_loss": 0.2601747214794159,
"eval_runtime": 51.129,
"eval_samples_per_second": 195.584,
"eval_steps_per_second": 24.448,
"step": 48700
},
{
"epoch": 2.9354200079250328,
"grad_norm": 0.8948882222175598,
"learning_rate": 5.5138379459819945e-06,
"loss": 0.2876,
"step": 48800
},
{
"epoch": 2.9354200079250328,
"eval_loss": 0.256122350692749,
"eval_runtime": 51.1826,
"eval_samples_per_second": 195.379,
"eval_steps_per_second": 24.422,
"step": 48800
},
{
"epoch": 2.941435212859305,
"grad_norm": 0.8714300990104675,
"learning_rate": 5.512837612537512e-06,
"loss": 0.2854,
"step": 48900
},
{
"epoch": 2.941435212859305,
"eval_loss": 0.2527640163898468,
"eval_runtime": 51.143,
"eval_samples_per_second": 195.53,
"eval_steps_per_second": 24.441,
"step": 48900
},
{
"epoch": 2.947450417793578,
"grad_norm": 0.8347595930099487,
"learning_rate": 5.511837279093031e-06,
"loss": 0.2859,
"step": 49000
},
{
"epoch": 2.947450417793578,
"eval_loss": 0.2613712549209595,
"eval_runtime": 51.1079,
"eval_samples_per_second": 195.664,
"eval_steps_per_second": 24.458,
"step": 49000
},
{
"epoch": 2.95346562272785,
"grad_norm": 0.8538709878921509,
"learning_rate": 5.51083694564855e-06,
"loss": 0.2852,
"step": 49100
},
{
"epoch": 2.95346562272785,
"eval_loss": 0.25488194823265076,
"eval_runtime": 51.1132,
"eval_samples_per_second": 195.644,
"eval_steps_per_second": 24.456,
"step": 49100
},
{
"epoch": 2.959480827662123,
"grad_norm": 0.922144889831543,
"learning_rate": 5.509836612204068e-06,
"loss": 0.2847,
"step": 49200
},
{
"epoch": 2.959480827662123,
"eval_loss": 0.2526051700115204,
"eval_runtime": 51.1124,
"eval_samples_per_second": 195.647,
"eval_steps_per_second": 24.456,
"step": 49200
},
{
"epoch": 2.9654960325963957,
"grad_norm": 0.8684960007667542,
"learning_rate": 5.508836278759587e-06,
"loss": 0.2837,
"step": 49300
},
{
"epoch": 2.9654960325963957,
"eval_loss": 0.25194811820983887,
"eval_runtime": 51.0578,
"eval_samples_per_second": 195.857,
"eval_steps_per_second": 24.482,
"step": 49300
},
{
"epoch": 2.971511237530668,
"grad_norm": 0.9055145978927612,
"learning_rate": 5.507835945315106e-06,
"loss": 0.2817,
"step": 49400
},
{
"epoch": 2.971511237530668,
"eval_loss": 0.25218260288238525,
"eval_runtime": 51.0821,
"eval_samples_per_second": 195.763,
"eval_steps_per_second": 24.47,
"step": 49400
},
{
"epoch": 2.977526442464941,
"grad_norm": 0.8636729121208191,
"learning_rate": 5.506835611870624e-06,
"loss": 0.2855,
"step": 49500
},
{
"epoch": 2.977526442464941,
"eval_loss": 0.25728458166122437,
"eval_runtime": 51.06,
"eval_samples_per_second": 195.848,
"eval_steps_per_second": 24.481,
"step": 49500
},
{
"epoch": 2.983541647399213,
"grad_norm": 0.9919777512550354,
"learning_rate": 5.5058352784261424e-06,
"loss": 0.2816,
"step": 49600
},
{
"epoch": 2.983541647399213,
"eval_loss": 0.2515828311443329,
"eval_runtime": 51.2113,
"eval_samples_per_second": 195.269,
"eval_steps_per_second": 24.409,
"step": 49600
},
{
"epoch": 2.989556852333486,
"grad_norm": 0.9122774600982666,
"learning_rate": 5.50483494498166e-06,
"loss": 0.2832,
"step": 49700
},
{
"epoch": 2.989556852333486,
"eval_loss": 0.25426608324050903,
"eval_runtime": 51.098,
"eval_samples_per_second": 195.702,
"eval_steps_per_second": 24.463,
"step": 49700
},
{
"epoch": 2.9955720572677587,
"grad_norm": 0.8778186440467834,
"learning_rate": 5.503834611537179e-06,
"loss": 0.2821,
"step": 49800
},
{
"epoch": 2.9955720572677587,
"eval_loss": 0.2510456442832947,
"eval_runtime": 51.0495,
"eval_samples_per_second": 195.888,
"eval_steps_per_second": 24.486,
"step": 49800
},
{
"epoch": 3.001587262202031,
"grad_norm": 0.8645954132080078,
"learning_rate": 5.502834278092698e-06,
"loss": 0.283,
"step": 49900
},
{
"epoch": 3.001587262202031,
"eval_loss": 0.2549561858177185,
"eval_runtime": 51.1194,
"eval_samples_per_second": 195.62,
"eval_steps_per_second": 24.453,
"step": 49900
},
{
"epoch": 3.0076024671363037,
"grad_norm": 0.971116304397583,
"learning_rate": 5.501833944648216e-06,
"loss": 0.2833,
"step": 50000
},
{
"epoch": 3.0076024671363037,
"eval_loss": 0.24709643423557281,
"eval_runtime": 50.6183,
"eval_samples_per_second": 197.557,
"eval_steps_per_second": 24.695,
"step": 50000
},
{
"epoch": 3.0136176720705765,
"grad_norm": 0.9352070093154907,
"learning_rate": 5.500833611203735e-06,
"loss": 0.2829,
"step": 50100
},
{
"epoch": 3.0136176720705765,
"eval_loss": 0.2510698139667511,
"eval_runtime": 50.9108,
"eval_samples_per_second": 196.422,
"eval_steps_per_second": 24.553,
"step": 50100
},
{
"epoch": 3.019632877004849,
"grad_norm": 0.8702713847160339,
"learning_rate": 5.499833277759254e-06,
"loss": 0.2806,
"step": 50200
},
{
"epoch": 3.019632877004849,
"eval_loss": 0.25517037510871887,
"eval_runtime": 51.143,
"eval_samples_per_second": 195.53,
"eval_steps_per_second": 24.441,
"step": 50200
},
{
"epoch": 3.0256480819391216,
"grad_norm": 0.8589245676994324,
"learning_rate": 5.498832944314772e-06,
"loss": 0.2828,
"step": 50300
},
{
"epoch": 3.0256480819391216,
"eval_loss": 0.25433140993118286,
"eval_runtime": 48.9769,
"eval_samples_per_second": 204.178,
"eval_steps_per_second": 25.522,
"step": 50300
},
{
"epoch": 3.0316632868733944,
"grad_norm": 0.8240871429443359,
"learning_rate": 5.49783261087029e-06,
"loss": 0.2786,
"step": 50400
},
{
"epoch": 3.0316632868733944,
"eval_loss": 0.2537357807159424,
"eval_runtime": 43.57,
"eval_samples_per_second": 229.516,
"eval_steps_per_second": 28.689,
"step": 50400
},
{
"epoch": 3.0376784918076667,
"grad_norm": 0.8937031030654907,
"learning_rate": 5.496832277425809e-06,
"loss": 0.2818,
"step": 50500
},
{
"epoch": 3.0376784918076667,
"eval_loss": 0.25536617636680603,
"eval_runtime": 43.9342,
"eval_samples_per_second": 227.613,
"eval_steps_per_second": 28.452,
"step": 50500
},
{
"epoch": 3.0436936967419395,
"grad_norm": 0.8851022720336914,
"learning_rate": 5.495831943981327e-06,
"loss": 0.28,
"step": 50600
},
{
"epoch": 3.0436936967419395,
"eval_loss": 0.2511354684829712,
"eval_runtime": 43.4697,
"eval_samples_per_second": 230.045,
"eval_steps_per_second": 28.756,
"step": 50600
},
{
"epoch": 3.0497089016762122,
"grad_norm": 0.9308133125305176,
"learning_rate": 5.494831610536846e-06,
"loss": 0.2822,
"step": 50700
},
{
"epoch": 3.0497089016762122,
"eval_loss": 0.2528564929962158,
"eval_runtime": 38.8722,
"eval_samples_per_second": 257.253,
"eval_steps_per_second": 32.157,
"step": 50700
},
{
"epoch": 3.0557241066104845,
"grad_norm": 1.0158571004867554,
"learning_rate": 5.493831277092364e-06,
"loss": 0.2829,
"step": 50800
},
{
"epoch": 3.0557241066104845,
"eval_loss": 0.24908022582530975,
"eval_runtime": 37.7881,
"eval_samples_per_second": 264.634,
"eval_steps_per_second": 33.079,
"step": 50800
},
{
"epoch": 3.0617393115447573,
"grad_norm": 0.8238421082496643,
"learning_rate": 5.492830943647883e-06,
"loss": 0.2804,
"step": 50900
},
{
"epoch": 3.0617393115447573,
"eval_loss": 0.24608242511749268,
"eval_runtime": 40.8226,
"eval_samples_per_second": 244.962,
"eval_steps_per_second": 30.62,
"step": 50900
},
{
"epoch": 3.06775451647903,
"grad_norm": 0.8686819672584534,
"learning_rate": 5.491830610203402e-06,
"loss": 0.2793,
"step": 51000
},
{
"epoch": 3.06775451647903,
"eval_loss": 0.24653884768486023,
"eval_runtime": 43.055,
"eval_samples_per_second": 232.261,
"eval_steps_per_second": 29.033,
"step": 51000
},
{
"epoch": 3.0737697214133024,
"grad_norm": 0.9399664998054504,
"learning_rate": 5.49083027675892e-06,
"loss": 0.2812,
"step": 51100
},
{
"epoch": 3.0737697214133024,
"eval_loss": 0.25110530853271484,
"eval_runtime": 44.132,
"eval_samples_per_second": 226.593,
"eval_steps_per_second": 28.324,
"step": 51100
},
{
"epoch": 3.079784926347575,
"grad_norm": 0.9775184988975525,
"learning_rate": 5.489829943314438e-06,
"loss": 0.2791,
"step": 51200
},
{
"epoch": 3.079784926347575,
"eval_loss": 0.24785326421260834,
"eval_runtime": 39.588,
"eval_samples_per_second": 252.602,
"eval_steps_per_second": 31.575,
"step": 51200
},
{
"epoch": 3.0858001312818475,
"grad_norm": 0.9678452014923096,
"learning_rate": 5.488829609869957e-06,
"loss": 0.2799,
"step": 51300
},
{
"epoch": 3.0858001312818475,
"eval_loss": 0.25371748208999634,
"eval_runtime": 40.7507,
"eval_samples_per_second": 245.395,
"eval_steps_per_second": 30.674,
"step": 51300
},
{
"epoch": 3.0918153362161203,
"grad_norm": 0.9417468309402466,
"learning_rate": 5.487829276425475e-06,
"loss": 0.2794,
"step": 51400
},
{
"epoch": 3.0918153362161203,
"eval_loss": 0.2551732659339905,
"eval_runtime": 42.2338,
"eval_samples_per_second": 236.777,
"eval_steps_per_second": 29.597,
"step": 51400
},
{
"epoch": 3.097830541150393,
"grad_norm": 0.8855278491973877,
"learning_rate": 5.486828942980994e-06,
"loss": 0.2798,
"step": 51500
},
{
"epoch": 3.097830541150393,
"eval_loss": 0.24791452288627625,
"eval_runtime": 48.1906,
"eval_samples_per_second": 207.509,
"eval_steps_per_second": 25.939,
"step": 51500
},
{
"epoch": 3.1038457460846653,
"grad_norm": 0.8699272274971008,
"learning_rate": 5.485828609536512e-06,
"loss": 0.2777,
"step": 51600
},
{
"epoch": 3.1038457460846653,
"eval_loss": 0.24532942473888397,
"eval_runtime": 45.8295,
"eval_samples_per_second": 218.2,
"eval_steps_per_second": 27.275,
"step": 51600
},
{
"epoch": 3.109860951018938,
"grad_norm": 0.8299559950828552,
"learning_rate": 5.484828276092031e-06,
"loss": 0.277,
"step": 51700
},
{
"epoch": 3.109860951018938,
"eval_loss": 0.24607662856578827,
"eval_runtime": 46.3442,
"eval_samples_per_second": 215.777,
"eval_steps_per_second": 26.972,
"step": 51700
},
{
"epoch": 3.115876155953211,
"grad_norm": 0.8937397003173828,
"learning_rate": 5.48382794264755e-06,
"loss": 0.2823,
"step": 51800
},
{
"epoch": 3.115876155953211,
"eval_loss": 0.2510640621185303,
"eval_runtime": 47.5854,
"eval_samples_per_second": 210.148,
"eval_steps_per_second": 26.269,
"step": 51800
},
{
"epoch": 3.121891360887483,
"grad_norm": 0.7908412218093872,
"learning_rate": 5.482827609203068e-06,
"loss": 0.2764,
"step": 51900
},
{
"epoch": 3.121891360887483,
"eval_loss": 0.24473002552986145,
"eval_runtime": 48.2096,
"eval_samples_per_second": 207.427,
"eval_steps_per_second": 25.928,
"step": 51900
},
{
"epoch": 3.127906565821756,
"grad_norm": 0.8543498516082764,
"learning_rate": 5.481827275758586e-06,
"loss": 0.2782,
"step": 52000
},
{
"epoch": 3.127906565821756,
"eval_loss": 0.24760138988494873,
"eval_runtime": 48.6773,
"eval_samples_per_second": 205.435,
"eval_steps_per_second": 25.679,
"step": 52000
},
{
"epoch": 3.1339217707560287,
"grad_norm": 0.869742751121521,
"learning_rate": 5.480826942314105e-06,
"loss": 0.2778,
"step": 52100
},
{
"epoch": 3.1339217707560287,
"eval_loss": 0.2506987452507019,
"eval_runtime": 49.27,
"eval_samples_per_second": 202.963,
"eval_steps_per_second": 25.37,
"step": 52100
},
{
"epoch": 3.139936975690301,
"grad_norm": 0.97697514295578,
"learning_rate": 5.479826608869623e-06,
"loss": 0.2765,
"step": 52200
},
{
"epoch": 3.139936975690301,
"eval_loss": 0.248337984085083,
"eval_runtime": 50.0788,
"eval_samples_per_second": 199.685,
"eval_steps_per_second": 24.961,
"step": 52200
},
{
"epoch": 3.145952180624574,
"grad_norm": 0.9102049469947815,
"learning_rate": 5.478826275425142e-06,
"loss": 0.2776,
"step": 52300
},
{
"epoch": 3.145952180624574,
"eval_loss": 0.24709181487560272,
"eval_runtime": 50.384,
"eval_samples_per_second": 198.476,
"eval_steps_per_second": 24.809,
"step": 52300
},
{
"epoch": 3.151967385558846,
"grad_norm": 0.9332506656646729,
"learning_rate": 5.47782594198066e-06,
"loss": 0.2777,
"step": 52400
},
{
"epoch": 3.151967385558846,
"eval_loss": 0.2484249472618103,
"eval_runtime": 50.292,
"eval_samples_per_second": 198.839,
"eval_steps_per_second": 24.855,
"step": 52400
},
{
"epoch": 3.157982590493119,
"grad_norm": 0.8517917394638062,
"learning_rate": 5.476825608536179e-06,
"loss": 0.278,
"step": 52500
},
{
"epoch": 3.157982590493119,
"eval_loss": 0.24207893013954163,
"eval_runtime": 48.3341,
"eval_samples_per_second": 206.893,
"eval_steps_per_second": 25.862,
"step": 52500
},
{
"epoch": 3.1639977954273917,
"grad_norm": 0.8629357814788818,
"learning_rate": 5.475825275091698e-06,
"loss": 0.2775,
"step": 52600
},
{
"epoch": 3.1639977954273917,
"eval_loss": 0.24527695775032043,
"eval_runtime": 50.4058,
"eval_samples_per_second": 198.39,
"eval_steps_per_second": 24.799,
"step": 52600
},
{
"epoch": 3.170013000361664,
"grad_norm": 0.9194425940513611,
"learning_rate": 5.4748249416472156e-06,
"loss": 0.2775,
"step": 52700
},
{
"epoch": 3.170013000361664,
"eval_loss": 0.2455427497625351,
"eval_runtime": 48.0608,
"eval_samples_per_second": 208.07,
"eval_steps_per_second": 26.009,
"step": 52700
},
{
"epoch": 3.1760282052959368,
"grad_norm": 0.8746848702430725,
"learning_rate": 5.473824608202734e-06,
"loss": 0.278,
"step": 52800
},
{
"epoch": 3.1760282052959368,
"eval_loss": 0.24813415110111237,
"eval_runtime": 42.6735,
"eval_samples_per_second": 234.338,
"eval_steps_per_second": 29.292,
"step": 52800
},
{
"epoch": 3.1820434102302095,
"grad_norm": 0.9082689881324768,
"learning_rate": 5.472824274758253e-06,
"loss": 0.2732,
"step": 52900
},
{
"epoch": 3.1820434102302095,
"eval_loss": 0.24827983975410461,
"eval_runtime": 44.2364,
"eval_samples_per_second": 226.058,
"eval_steps_per_second": 28.257,
"step": 52900
},
{
"epoch": 3.188058615164482,
"grad_norm": 0.8607956171035767,
"learning_rate": 5.471823941313771e-06,
"loss": 0.2772,
"step": 53000
},
{
"epoch": 3.188058615164482,
"eval_loss": 0.24322330951690674,
"eval_runtime": 44.8161,
"eval_samples_per_second": 223.134,
"eval_steps_per_second": 27.892,
"step": 53000
},
{
"epoch": 3.1940738200987546,
"grad_norm": 0.9439307451248169,
"learning_rate": 5.4708236078692896e-06,
"loss": 0.2734,
"step": 53100
},
{
"epoch": 3.1940738200987546,
"eval_loss": 0.24696892499923706,
"eval_runtime": 47.0223,
"eval_samples_per_second": 212.665,
"eval_steps_per_second": 26.583,
"step": 53100
},
{
"epoch": 3.2000890250330274,
"grad_norm": 1.0130066871643066,
"learning_rate": 5.469823274424808e-06,
"loss": 0.2737,
"step": 53200
},
{
"epoch": 3.2000890250330274,
"eval_loss": 0.2521739602088928,
"eval_runtime": 46.5164,
"eval_samples_per_second": 214.978,
"eval_steps_per_second": 26.872,
"step": 53200
},
{
"epoch": 3.2061042299672997,
"grad_norm": 0.9969391822814941,
"learning_rate": 5.468822940980327e-06,
"loss": 0.2767,
"step": 53300
},
{
"epoch": 3.2061042299672997,
"eval_loss": 0.25239297747612,
"eval_runtime": 46.7418,
"eval_samples_per_second": 213.941,
"eval_steps_per_second": 26.743,
"step": 53300
},
{
"epoch": 3.2121194349015725,
"grad_norm": 0.9380843639373779,
"learning_rate": 5.467822607535846e-06,
"loss": 0.2743,
"step": 53400
},
{
"epoch": 3.2121194349015725,
"eval_loss": 0.2427060306072235,
"eval_runtime": 47.8166,
"eval_samples_per_second": 209.133,
"eval_steps_per_second": 26.142,
"step": 53400
},
{
"epoch": 3.2181346398358452,
"grad_norm": 0.8498116135597229,
"learning_rate": 5.466822274091364e-06,
"loss": 0.2752,
"step": 53500
},
{
"epoch": 3.2181346398358452,
"eval_loss": 0.23972123861312866,
"eval_runtime": 48.9235,
"eval_samples_per_second": 204.401,
"eval_steps_per_second": 25.55,
"step": 53500
},
{
"epoch": 3.2241498447701176,
"grad_norm": 0.8372825980186462,
"learning_rate": 5.465821940646882e-06,
"loss": 0.273,
"step": 53600
},
{
"epoch": 3.2241498447701176,
"eval_loss": 0.2440669685602188,
"eval_runtime": 47.5669,
"eval_samples_per_second": 210.23,
"eval_steps_per_second": 26.279,
"step": 53600
},
{
"epoch": 3.2301650497043903,
"grad_norm": 0.9698020815849304,
"learning_rate": 5.464821607202401e-06,
"loss": 0.2767,
"step": 53700
},
{
"epoch": 3.2301650497043903,
"eval_loss": 0.23816044628620148,
"eval_runtime": 34.9463,
"eval_samples_per_second": 286.153,
"eval_steps_per_second": 35.769,
"step": 53700
},
{
"epoch": 3.236180254638663,
"grad_norm": 0.822875440120697,
"learning_rate": 5.463821273757919e-06,
"loss": 0.2751,
"step": 53800
},
{
"epoch": 3.236180254638663,
"eval_loss": 0.24079230427742004,
"eval_runtime": 35.4307,
"eval_samples_per_second": 282.241,
"eval_steps_per_second": 35.28,
"step": 53800
},
{
"epoch": 3.2421954595729354,
"grad_norm": 0.8933221101760864,
"learning_rate": 5.4628209403134375e-06,
"loss": 0.2753,
"step": 53900
},
{
"epoch": 3.2421954595729354,
"eval_loss": 0.25047245621681213,
"eval_runtime": 36.1364,
"eval_samples_per_second": 276.729,
"eval_steps_per_second": 34.591,
"step": 53900
},
{
"epoch": 3.248210664507208,
"grad_norm": 0.915135383605957,
"learning_rate": 5.461820606868957e-06,
"loss": 0.2736,
"step": 54000
},
{
"epoch": 3.248210664507208,
"eval_loss": 0.24464978277683258,
"eval_runtime": 35.7495,
"eval_samples_per_second": 279.724,
"eval_steps_per_second": 34.966,
"step": 54000
},
{
"epoch": 3.2542258694414805,
"grad_norm": 0.8490029573440552,
"learning_rate": 5.460820273424475e-06,
"loss": 0.274,
"step": 54100
},
{
"epoch": 3.2542258694414805,
"eval_loss": 0.2507534325122833,
"eval_runtime": 38.4129,
"eval_samples_per_second": 260.329,
"eval_steps_per_second": 32.541,
"step": 54100
},
{
"epoch": 3.2602410743757533,
"grad_norm": 0.9220608472824097,
"learning_rate": 5.459819939979994e-06,
"loss": 0.2736,
"step": 54200
},
{
"epoch": 3.2602410743757533,
"eval_loss": 0.24634374678134918,
"eval_runtime": 41.8157,
"eval_samples_per_second": 239.145,
"eval_steps_per_second": 29.893,
"step": 54200
},
{
"epoch": 3.266256279310026,
"grad_norm": 0.8318041563034058,
"learning_rate": 5.458819606535512e-06,
"loss": 0.271,
"step": 54300
},
{
"epoch": 3.266256279310026,
"eval_loss": 0.24672181904315948,
"eval_runtime": 39.1233,
"eval_samples_per_second": 255.602,
"eval_steps_per_second": 31.95,
"step": 54300
},
{
"epoch": 3.2722714842442984,
"grad_norm": 0.8373593091964722,
"learning_rate": 5.45781927309103e-06,
"loss": 0.272,
"step": 54400
},
{
"epoch": 3.2722714842442984,
"eval_loss": 0.24106918275356293,
"eval_runtime": 36.4825,
"eval_samples_per_second": 274.104,
"eval_steps_per_second": 34.263,
"step": 54400
},
{
"epoch": 3.278286689178571,
"grad_norm": 0.8802669644355774,
"learning_rate": 5.456818939646549e-06,
"loss": 0.2683,
"step": 54500
},
{
"epoch": 3.278286689178571,
"eval_loss": 0.24452929198741913,
"eval_runtime": 33.0976,
"eval_samples_per_second": 302.137,
"eval_steps_per_second": 37.767,
"step": 54500
},
{
"epoch": 3.284301894112844,
"grad_norm": 0.8867002129554749,
"learning_rate": 5.455818606202067e-06,
"loss": 0.2697,
"step": 54600
},
{
"epoch": 3.284301894112844,
"eval_loss": 0.23936684429645538,
"eval_runtime": 40.904,
"eval_samples_per_second": 244.475,
"eval_steps_per_second": 30.559,
"step": 54600
},
{
"epoch": 3.2903170990471162,
"grad_norm": 0.91335529088974,
"learning_rate": 5.454818272757586e-06,
"loss": 0.2739,
"step": 54700
},
{
"epoch": 3.2903170990471162,
"eval_loss": 0.24262717366218567,
"eval_runtime": 43.6033,
"eval_samples_per_second": 229.34,
"eval_steps_per_second": 28.668,
"step": 54700
},
{
"epoch": 3.296332303981389,
"grad_norm": 0.8662433624267578,
"learning_rate": 5.453817939313105e-06,
"loss": 0.2715,
"step": 54800
},
{
"epoch": 3.296332303981389,
"eval_loss": 0.24885956943035126,
"eval_runtime": 45.6743,
"eval_samples_per_second": 218.942,
"eval_steps_per_second": 27.368,
"step": 54800
},
{
"epoch": 3.3023475089156618,
"grad_norm": 0.943458616733551,
"learning_rate": 5.452817605868623e-06,
"loss": 0.2709,
"step": 54900
},
{
"epoch": 3.3023475089156618,
"eval_loss": 0.24570631980895996,
"eval_runtime": 46.9183,
"eval_samples_per_second": 213.136,
"eval_steps_per_second": 26.642,
"step": 54900
},
{
"epoch": 3.308362713849934,
"grad_norm": 0.8767443299293518,
"learning_rate": 5.451817272424142e-06,
"loss": 0.2724,
"step": 55000
},
{
"epoch": 3.308362713849934,
"eval_loss": 0.24481208622455597,
"eval_runtime": 47.56,
"eval_samples_per_second": 210.261,
"eval_steps_per_second": 26.283,
"step": 55000
},
{
"epoch": 3.314377918784207,
"grad_norm": 0.9032852053642273,
"learning_rate": 5.45081693897966e-06,
"loss": 0.2733,
"step": 55100
},
{
"epoch": 3.314377918784207,
"eval_loss": 0.24037285149097443,
"eval_runtime": 48.5117,
"eval_samples_per_second": 206.136,
"eval_steps_per_second": 25.767,
"step": 55100
},
{
"epoch": 3.320393123718479,
"grad_norm": 0.8414300084114075,
"learning_rate": 5.449816605535178e-06,
"loss": 0.2709,
"step": 55200
},
{
"epoch": 3.320393123718479,
"eval_loss": 0.24620996415615082,
"eval_runtime": 48.2151,
"eval_samples_per_second": 207.404,
"eval_steps_per_second": 25.925,
"step": 55200
},
{
"epoch": 3.326408328652752,
"grad_norm": 0.9093489646911621,
"learning_rate": 5.448816272090697e-06,
"loss": 0.2683,
"step": 55300
},
{
"epoch": 3.326408328652752,
"eval_loss": 0.24467670917510986,
"eval_runtime": 49.7086,
"eval_samples_per_second": 201.172,
"eval_steps_per_second": 25.147,
"step": 55300
},
{
"epoch": 3.3324235335870247,
"grad_norm": 0.920391857624054,
"learning_rate": 5.447815938646216e-06,
"loss": 0.2703,
"step": 55400
},
{
"epoch": 3.3324235335870247,
"eval_loss": 0.24019140005111694,
"eval_runtime": 50.0394,
"eval_samples_per_second": 199.843,
"eval_steps_per_second": 24.98,
"step": 55400
},
{
"epoch": 3.338438738521297,
"grad_norm": 0.9286474585533142,
"learning_rate": 5.446815605201734e-06,
"loss": 0.2705,
"step": 55500
},
{
"epoch": 3.338438738521297,
"eval_loss": 0.24543143808841705,
"eval_runtime": 50.344,
"eval_samples_per_second": 198.633,
"eval_steps_per_second": 24.829,
"step": 55500
},
{
"epoch": 3.34445394345557,
"grad_norm": 0.9175123572349548,
"learning_rate": 5.445815271757253e-06,
"loss": 0.2713,
"step": 55600
},
{
"epoch": 3.34445394345557,
"eval_loss": 0.23898915946483612,
"eval_runtime": 50.3195,
"eval_samples_per_second": 198.73,
"eval_steps_per_second": 24.841,
"step": 55600
},
{
"epoch": 3.3504691483898426,
"grad_norm": 0.8990902900695801,
"learning_rate": 5.444814938312771e-06,
"loss": 0.2713,
"step": 55700
},
{
"epoch": 3.3504691483898426,
"eval_loss": 0.24149462580680847,
"eval_runtime": 50.7504,
"eval_samples_per_second": 197.043,
"eval_steps_per_second": 24.63,
"step": 55700
},
{
"epoch": 3.356484353324115,
"grad_norm": 0.8217372298240662,
"learning_rate": 5.4438146048682896e-06,
"loss": 0.2694,
"step": 55800
},
{
"epoch": 3.356484353324115,
"eval_loss": 0.24138091504573822,
"eval_runtime": 50.9006,
"eval_samples_per_second": 196.461,
"eval_steps_per_second": 24.558,
"step": 55800
},
{
"epoch": 3.3624995582583876,
"grad_norm": 0.8727395534515381,
"learning_rate": 5.442814271423808e-06,
"loss": 0.2694,
"step": 55900
},
{
"epoch": 3.3624995582583876,
"eval_loss": 0.24046172201633453,
"eval_runtime": 36.3936,
"eval_samples_per_second": 274.773,
"eval_steps_per_second": 34.347,
"step": 55900
},
{
"epoch": 3.3685147631926604,
"grad_norm": 0.8453567028045654,
"learning_rate": 5.441813937979326e-06,
"loss": 0.2683,
"step": 56000
},
{
"epoch": 3.3685147631926604,
"eval_loss": 0.24423474073410034,
"eval_runtime": 50.8544,
"eval_samples_per_second": 196.64,
"eval_steps_per_second": 24.58,
"step": 56000
},
{
"epoch": 3.3745299681269327,
"grad_norm": 0.86241614818573,
"learning_rate": 5.440813604534845e-06,
"loss": 0.2649,
"step": 56100
},
{
"epoch": 3.3745299681269327,
"eval_loss": 0.2407056838274002,
"eval_runtime": 50.778,
"eval_samples_per_second": 196.936,
"eval_steps_per_second": 24.617,
"step": 56100
},
{
"epoch": 3.3805451730612055,
"grad_norm": 0.9142568111419678,
"learning_rate": 5.4398132710903636e-06,
"loss": 0.2696,
"step": 56200
},
{
"epoch": 3.3805451730612055,
"eval_loss": 0.24098168313503265,
"eval_runtime": 51.0703,
"eval_samples_per_second": 195.809,
"eval_steps_per_second": 24.476,
"step": 56200
},
{
"epoch": 3.386560377995478,
"grad_norm": 0.8302989602088928,
"learning_rate": 5.438812937645882e-06,
"loss": 0.2695,
"step": 56300
},
{
"epoch": 3.386560377995478,
"eval_loss": 0.23798757791519165,
"eval_runtime": 50.9646,
"eval_samples_per_second": 196.215,
"eval_steps_per_second": 24.527,
"step": 56300
},
{
"epoch": 3.3925755829297506,
"grad_norm": 0.8420681357383728,
"learning_rate": 5.437812604201401e-06,
"loss": 0.2682,
"step": 56400
},
{
"epoch": 3.3925755829297506,
"eval_loss": 0.24360163509845734,
"eval_runtime": 51.0498,
"eval_samples_per_second": 195.887,
"eval_steps_per_second": 24.486,
"step": 56400
},
{
"epoch": 3.3985907878640234,
"grad_norm": 0.8456258773803711,
"learning_rate": 5.436812270756919e-06,
"loss": 0.2661,
"step": 56500
},
{
"epoch": 3.3985907878640234,
"eval_loss": 0.23989547789096832,
"eval_runtime": 49.5593,
"eval_samples_per_second": 201.778,
"eval_steps_per_second": 25.222,
"step": 56500
},
{
"epoch": 3.404605992798296,
"grad_norm": 0.9097959399223328,
"learning_rate": 5.4358119373124375e-06,
"loss": 0.2684,
"step": 56600
},
{
"epoch": 3.404605992798296,
"eval_loss": 0.2373836487531662,
"eval_runtime": 48.7156,
"eval_samples_per_second": 205.273,
"eval_steps_per_second": 25.659,
"step": 56600
},
{
"epoch": 3.4106211977325684,
"grad_norm": 0.8549370169639587,
"learning_rate": 5.434811603867956e-06,
"loss": 0.266,
"step": 56700
},
{
"epoch": 3.4106211977325684,
"eval_loss": 0.2353491634130478,
"eval_runtime": 48.0299,
"eval_samples_per_second": 208.204,
"eval_steps_per_second": 26.025,
"step": 56700
},
{
"epoch": 3.416636402666841,
"grad_norm": 0.9058821797370911,
"learning_rate": 5.433811270423474e-06,
"loss": 0.2712,
"step": 56800
},
{
"epoch": 3.416636402666841,
"eval_loss": 0.24013860523700714,
"eval_runtime": 48.2564,
"eval_samples_per_second": 207.226,
"eval_steps_per_second": 25.903,
"step": 56800
},
{
"epoch": 3.4226516076011135,
"grad_norm": 0.7843255400657654,
"learning_rate": 5.432810936978993e-06,
"loss": 0.2667,
"step": 56900
},
{
"epoch": 3.4226516076011135,
"eval_loss": 0.2440056949853897,
"eval_runtime": 49.156,
"eval_samples_per_second": 203.434,
"eval_steps_per_second": 25.429,
"step": 56900
},
{
"epoch": 3.4286668125353863,
"grad_norm": 0.8476096987724304,
"learning_rate": 5.4318106035345115e-06,
"loss": 0.2647,
"step": 57000
},
{
"epoch": 3.4286668125353863,
"eval_loss": 0.24185192584991455,
"eval_runtime": 48.8755,
"eval_samples_per_second": 204.602,
"eval_steps_per_second": 25.575,
"step": 57000
},
{
"epoch": 3.434682017469659,
"grad_norm": 0.8693493008613586,
"learning_rate": 5.43081027009003e-06,
"loss": 0.2667,
"step": 57100
},
{
"epoch": 3.434682017469659,
"eval_loss": 0.23922978341579437,
"eval_runtime": 49.1662,
"eval_samples_per_second": 203.392,
"eval_steps_per_second": 25.424,
"step": 57100
},
{
"epoch": 3.4406972224039314,
"grad_norm": 0.7601708769798279,
"learning_rate": 5.429809936645549e-06,
"loss": 0.268,
"step": 57200
},
{
"epoch": 3.4406972224039314,
"eval_loss": 0.2391706109046936,
"eval_runtime": 49.6653,
"eval_samples_per_second": 201.348,
"eval_steps_per_second": 25.168,
"step": 57200
},
{
"epoch": 3.446712427338204,
"grad_norm": 0.8476257920265198,
"learning_rate": 5.428809603201068e-06,
"loss": 0.2668,
"step": 57300
},
{
"epoch": 3.446712427338204,
"eval_loss": 0.23998339474201202,
"eval_runtime": 49.9477,
"eval_samples_per_second": 200.209,
"eval_steps_per_second": 25.026,
"step": 57300
},
{
"epoch": 3.452727632272477,
"grad_norm": 0.9185997843742371,
"learning_rate": 5.4278092697565855e-06,
"loss": 0.2649,
"step": 57400
},
{
"epoch": 3.452727632272477,
"eval_loss": 0.2374006062746048,
"eval_runtime": 49.539,
"eval_samples_per_second": 201.861,
"eval_steps_per_second": 25.233,
"step": 57400
},
{
"epoch": 3.4587428372067492,
"grad_norm": 0.8186565041542053,
"learning_rate": 5.426808936312104e-06,
"loss": 0.2667,
"step": 57500
},
{
"epoch": 3.4587428372067492,
"eval_loss": 0.23729223012924194,
"eval_runtime": 50.958,
"eval_samples_per_second": 196.24,
"eval_steps_per_second": 24.53,
"step": 57500
},
{
"epoch": 3.464758042141022,
"grad_norm": 0.876054048538208,
"learning_rate": 5.425808602867622e-06,
"loss": 0.2644,
"step": 57600
},
{
"epoch": 3.464758042141022,
"eval_loss": 0.2387179434299469,
"eval_runtime": 36.8167,
"eval_samples_per_second": 271.616,
"eval_steps_per_second": 33.952,
"step": 57600
},
{
"epoch": 3.4707732470752948,
"grad_norm": 0.8078221678733826,
"learning_rate": 5.424808269423141e-06,
"loss": 0.2671,
"step": 57700
},
{
"epoch": 3.4707732470752948,
"eval_loss": 0.23494240641593933,
"eval_runtime": 50.9663,
"eval_samples_per_second": 196.208,
"eval_steps_per_second": 24.526,
"step": 57700
},
{
"epoch": 3.476788452009567,
"grad_norm": 0.8425822257995605,
"learning_rate": 5.4238079359786595e-06,
"loss": 0.2662,
"step": 57800
},
{
"epoch": 3.476788452009567,
"eval_loss": 0.23349033296108246,
"eval_runtime": 50.9349,
"eval_samples_per_second": 196.329,
"eval_steps_per_second": 24.541,
"step": 57800
},
{
"epoch": 3.48280365694384,
"grad_norm": 0.8718583583831787,
"learning_rate": 5.422807602534178e-06,
"loss": 0.267,
"step": 57900
},
{
"epoch": 3.48280365694384,
"eval_loss": 0.23534800112247467,
"eval_runtime": 50.6689,
"eval_samples_per_second": 197.36,
"eval_steps_per_second": 24.67,
"step": 57900
},
{
"epoch": 3.488818861878112,
"grad_norm": 0.8161312341690063,
"learning_rate": 5.421807269089697e-06,
"loss": 0.2641,
"step": 58000
},
{
"epoch": 3.488818861878112,
"eval_loss": 0.23691873252391815,
"eval_runtime": 50.9223,
"eval_samples_per_second": 196.377,
"eval_steps_per_second": 24.547,
"step": 58000
},
{
"epoch": 3.494834066812385,
"grad_norm": 0.781482458114624,
"learning_rate": 5.420806935645216e-06,
"loss": 0.2652,
"step": 58100
},
{
"epoch": 3.494834066812385,
"eval_loss": 0.2412412315607071,
"eval_runtime": 51.059,
"eval_samples_per_second": 195.852,
"eval_steps_per_second": 24.481,
"step": 58100
},
{
"epoch": 3.5008492717466577,
"grad_norm": 0.869367778301239,
"learning_rate": 5.4198066022007335e-06,
"loss": 0.2639,
"step": 58200
},
{
"epoch": 3.5008492717466577,
"eval_loss": 0.23919972777366638,
"eval_runtime": 50.9672,
"eval_samples_per_second": 196.205,
"eval_steps_per_second": 24.526,
"step": 58200
},
{
"epoch": 3.5068644766809305,
"grad_norm": 0.8614550828933716,
"learning_rate": 5.418806268756252e-06,
"loss": 0.2637,
"step": 58300
},
{
"epoch": 3.5068644766809305,
"eval_loss": 0.23232702910900116,
"eval_runtime": 50.8155,
"eval_samples_per_second": 196.79,
"eval_steps_per_second": 24.599,
"step": 58300
},
{
"epoch": 3.512879681615203,
"grad_norm": 0.9519971609115601,
"learning_rate": 5.417805935311771e-06,
"loss": 0.2636,
"step": 58400
},
{
"epoch": 3.512879681615203,
"eval_loss": 0.2359647899866104,
"eval_runtime": 51.0167,
"eval_samples_per_second": 196.014,
"eval_steps_per_second": 24.502,
"step": 58400
},
{
"epoch": 3.5188948865494756,
"grad_norm": 0.7815201282501221,
"learning_rate": 5.416805601867289e-06,
"loss": 0.263,
"step": 58500
},
{
"epoch": 3.5188948865494756,
"eval_loss": 0.2390337437391281,
"eval_runtime": 50.9327,
"eval_samples_per_second": 196.337,
"eval_steps_per_second": 24.542,
"step": 58500
},
{
"epoch": 3.524910091483748,
"grad_norm": 0.9015016555786133,
"learning_rate": 5.415805268422808e-06,
"loss": 0.2635,
"step": 58600
},
{
"epoch": 3.524910091483748,
"eval_loss": 0.23515385389328003,
"eval_runtime": 50.6423,
"eval_samples_per_second": 197.463,
"eval_steps_per_second": 24.683,
"step": 58600
},
{
"epoch": 3.5309252964180207,
"grad_norm": 0.9041895866394043,
"learning_rate": 5.414804934978326e-06,
"loss": 0.2633,
"step": 58700
},
{
"epoch": 3.5309252964180207,
"eval_loss": 0.2379036694765091,
"eval_runtime": 50.2383,
"eval_samples_per_second": 199.051,
"eval_steps_per_second": 24.881,
"step": 58700
},
{
"epoch": 3.5369405013522934,
"grad_norm": 0.884931743144989,
"learning_rate": 5.413804601533845e-06,
"loss": 0.2612,
"step": 58800
},
{
"epoch": 3.5369405013522934,
"eval_loss": 0.23683039844036102,
"eval_runtime": 50.2696,
"eval_samples_per_second": 198.928,
"eval_steps_per_second": 24.866,
"step": 58800
},
{
"epoch": 3.5429557062865658,
"grad_norm": 0.862382709980011,
"learning_rate": 5.4128042680893636e-06,
"loss": 0.2623,
"step": 58900
},
{
"epoch": 3.5429557062865658,
"eval_loss": 0.23638789355754852,
"eval_runtime": 50.4759,
"eval_samples_per_second": 198.114,
"eval_steps_per_second": 24.764,
"step": 58900
},
{
"epoch": 3.5489709112208385,
"grad_norm": 0.8239731788635254,
"learning_rate": 5.4118039346448814e-06,
"loss": 0.2652,
"step": 59000
},
{
"epoch": 3.5489709112208385,
"eval_loss": 0.23644813895225525,
"eval_runtime": 49.8805,
"eval_samples_per_second": 200.479,
"eval_steps_per_second": 25.06,
"step": 59000
},
{
"epoch": 3.554986116155111,
"grad_norm": 0.8433008193969727,
"learning_rate": 5.4108036012004e-06,
"loss": 0.2628,
"step": 59100
},
{
"epoch": 3.554986116155111,
"eval_loss": 0.23331347107887268,
"eval_runtime": 50.0038,
"eval_samples_per_second": 199.985,
"eval_steps_per_second": 24.998,
"step": 59100
},
{
"epoch": 3.5610013210893836,
"grad_norm": 0.8740643858909607,
"learning_rate": 5.409803267755919e-06,
"loss": 0.2615,
"step": 59200
},
{
"epoch": 3.5610013210893836,
"eval_loss": 0.23751728236675262,
"eval_runtime": 49.4105,
"eval_samples_per_second": 202.386,
"eval_steps_per_second": 25.298,
"step": 59200
},
{
"epoch": 3.5670165260236564,
"grad_norm": 0.7903056144714355,
"learning_rate": 5.4088029343114375e-06,
"loss": 0.2621,
"step": 59300
},
{
"epoch": 3.5670165260236564,
"eval_loss": 0.23228037357330322,
"eval_runtime": 49.2273,
"eval_samples_per_second": 203.139,
"eval_steps_per_second": 25.392,
"step": 59300
},
{
"epoch": 3.573031730957929,
"grad_norm": 0.8559598326683044,
"learning_rate": 5.407802600866956e-06,
"loss": 0.2621,
"step": 59400
},
{
"epoch": 3.573031730957929,
"eval_loss": 0.23780353367328644,
"eval_runtime": 49.4165,
"eval_samples_per_second": 202.362,
"eval_steps_per_second": 25.295,
"step": 59400
},
{
"epoch": 3.5790469358922015,
"grad_norm": 0.9178751111030579,
"learning_rate": 5.406802267422474e-06,
"loss": 0.2635,
"step": 59500
},
{
"epoch": 3.5790469358922015,
"eval_loss": 0.23736293613910675,
"eval_runtime": 49.1576,
"eval_samples_per_second": 203.427,
"eval_steps_per_second": 25.428,
"step": 59500
},
{
"epoch": 3.5850621408264742,
"grad_norm": 0.8310320377349854,
"learning_rate": 5.405801933977993e-06,
"loss": 0.2626,
"step": 59600
},
{
"epoch": 3.5850621408264742,
"eval_loss": 0.2320030778646469,
"eval_runtime": 49.4934,
"eval_samples_per_second": 202.047,
"eval_steps_per_second": 25.256,
"step": 59600
},
{
"epoch": 3.5910773457607466,
"grad_norm": 0.7860143184661865,
"learning_rate": 5.4048016005335115e-06,
"loss": 0.2632,
"step": 59700
},
{
"epoch": 3.5910773457607466,
"eval_loss": 0.2336650937795639,
"eval_runtime": 49.1673,
"eval_samples_per_second": 203.387,
"eval_steps_per_second": 25.423,
"step": 59700
},
{
"epoch": 3.5970925506950193,
"grad_norm": 0.836063027381897,
"learning_rate": 5.403801267089029e-06,
"loss": 0.2621,
"step": 59800
},
{
"epoch": 3.5970925506950193,
"eval_loss": 0.23437707126140594,
"eval_runtime": 49.5986,
"eval_samples_per_second": 201.619,
"eval_steps_per_second": 25.202,
"step": 59800
},
{
"epoch": 3.603107755629292,
"grad_norm": 0.8768342137336731,
"learning_rate": 5.402800933644548e-06,
"loss": 0.2609,
"step": 59900
},
{
"epoch": 3.603107755629292,
"eval_loss": 0.23560036718845367,
"eval_runtime": 49.2225,
"eval_samples_per_second": 203.159,
"eval_steps_per_second": 25.395,
"step": 59900
},
{
"epoch": 3.6091229605635644,
"grad_norm": 0.8093357682228088,
"learning_rate": 5.401800600200067e-06,
"loss": 0.26,
"step": 60000
},
{
"epoch": 3.6091229605635644,
"eval_loss": 0.2340717762708664,
"eval_runtime": 49.3844,
"eval_samples_per_second": 202.493,
"eval_steps_per_second": 25.312,
"step": 60000
},
{
"epoch": 3.615138165497837,
"grad_norm": 0.8731770515441895,
"learning_rate": 5.4008002667555855e-06,
"loss": 0.2614,
"step": 60100
},
{
"epoch": 3.615138165497837,
"eval_loss": 0.2342948466539383,
"eval_runtime": 48.6563,
"eval_samples_per_second": 205.523,
"eval_steps_per_second": 25.69,
"step": 60100
},
{
"epoch": 3.6211533704321095,
"grad_norm": 0.8906363844871521,
"learning_rate": 5.399799933311104e-06,
"loss": 0.2601,
"step": 60200
},
{
"epoch": 3.6211533704321095,
"eval_loss": 0.2331141084432602,
"eval_runtime": 49.3998,
"eval_samples_per_second": 202.43,
"eval_steps_per_second": 25.304,
"step": 60200
},
{
"epoch": 3.6271685753663823,
"grad_norm": 0.8565790057182312,
"learning_rate": 5.398799599866623e-06,
"loss": 0.2603,
"step": 60300
},
{
"epoch": 3.6271685753663823,
"eval_loss": 0.23420780897140503,
"eval_runtime": 48.2983,
"eval_samples_per_second": 207.046,
"eval_steps_per_second": 25.881,
"step": 60300
},
{
"epoch": 3.633183780300655,
"grad_norm": 0.9718087911605835,
"learning_rate": 5.397799266422141e-06,
"loss": 0.2635,
"step": 60400
},
{
"epoch": 3.633183780300655,
"eval_loss": 0.2375570833683014,
"eval_runtime": 48.9976,
"eval_samples_per_second": 204.091,
"eval_steps_per_second": 25.511,
"step": 60400
},
{
"epoch": 3.639198985234928,
"grad_norm": 0.8572448492050171,
"learning_rate": 5.3967989329776595e-06,
"loss": 0.2626,
"step": 60500
},
{
"epoch": 3.639198985234928,
"eval_loss": 0.23931777477264404,
"eval_runtime": 49.0436,
"eval_samples_per_second": 203.9,
"eval_steps_per_second": 25.488,
"step": 60500
},
{
"epoch": 3.6452141901692,
"grad_norm": 0.8994346857070923,
"learning_rate": 5.395798599533177e-06,
"loss": 0.2595,
"step": 60600
},
{
"epoch": 3.6452141901692,
"eval_loss": 0.2317589819431305,
"eval_runtime": 49.5846,
"eval_samples_per_second": 201.675,
"eval_steps_per_second": 25.209,
"step": 60600
},
{
"epoch": 3.651229395103473,
"grad_norm": 0.8513436913490295,
"learning_rate": 5.394798266088696e-06,
"loss": 0.2614,
"step": 60700
},
{
"epoch": 3.651229395103473,
"eval_loss": 0.23111025989055634,
"eval_runtime": 49.7262,
"eval_samples_per_second": 201.101,
"eval_steps_per_second": 25.138,
"step": 60700
},
{
"epoch": 3.657244600037745,
"grad_norm": 0.9126865267753601,
"learning_rate": 5.393797932644215e-06,
"loss": 0.2583,
"step": 60800
},
{
"epoch": 3.657244600037745,
"eval_loss": 0.23351147770881653,
"eval_runtime": 49.8967,
"eval_samples_per_second": 200.414,
"eval_steps_per_second": 25.052,
"step": 60800
},
{
"epoch": 3.663259804972018,
"grad_norm": 0.8021876811981201,
"learning_rate": 5.3927975991997335e-06,
"loss": 0.2601,
"step": 60900
},
{
"epoch": 3.663259804972018,
"eval_loss": 0.23443163931369781,
"eval_runtime": 49.9056,
"eval_samples_per_second": 200.378,
"eval_steps_per_second": 25.047,
"step": 60900
},
{
"epoch": 3.6692750099062907,
"grad_norm": 0.8586119413375854,
"learning_rate": 5.391797265755252e-06,
"loss": 0.2605,
"step": 61000
},
{
"epoch": 3.6692750099062907,
"eval_loss": 0.229187473654747,
"eval_runtime": 40.9269,
"eval_samples_per_second": 244.338,
"eval_steps_per_second": 30.542,
"step": 61000
},
{
"epoch": 3.6752902148405635,
"grad_norm": 0.9336073398590088,
"learning_rate": 5.390796932310771e-06,
"loss": 0.2612,
"step": 61100
},
{
"epoch": 3.6752902148405635,
"eval_loss": 0.23033183813095093,
"eval_runtime": 49.8614,
"eval_samples_per_second": 200.556,
"eval_steps_per_second": 25.069,
"step": 61100
},
{
"epoch": 3.681305419774836,
"grad_norm": 0.7944173812866211,
"learning_rate": 5.389796598866289e-06,
"loss": 0.2595,
"step": 61200
},
{
"epoch": 3.681305419774836,
"eval_loss": 0.22884014248847961,
"eval_runtime": 50.2375,
"eval_samples_per_second": 199.055,
"eval_steps_per_second": 24.882,
"step": 61200
},
{
"epoch": 3.6873206247091086,
"grad_norm": 0.8038543462753296,
"learning_rate": 5.3887962654218075e-06,
"loss": 0.2588,
"step": 61300
},
{
"epoch": 3.6873206247091086,
"eval_loss": 0.23328329622745514,
"eval_runtime": 51.0649,
"eval_samples_per_second": 195.829,
"eval_steps_per_second": 24.479,
"step": 61300
},
{
"epoch": 3.693335829643381,
"grad_norm": 0.8919224143028259,
"learning_rate": 5.387795931977326e-06,
"loss": 0.2592,
"step": 61400
},
{
"epoch": 3.693335829643381,
"eval_loss": 0.23098503053188324,
"eval_runtime": 51.0915,
"eval_samples_per_second": 195.727,
"eval_steps_per_second": 24.466,
"step": 61400
},
{
"epoch": 3.6993510345776537,
"grad_norm": 0.81063312292099,
"learning_rate": 5.386795598532844e-06,
"loss": 0.2598,
"step": 61500
},
{
"epoch": 3.6993510345776537,
"eval_loss": 0.23130032420158386,
"eval_runtime": 51.1499,
"eval_samples_per_second": 195.504,
"eval_steps_per_second": 24.438,
"step": 61500
},
{
"epoch": 3.7053662395119265,
"grad_norm": 0.8565428853034973,
"learning_rate": 5.385795265088363e-06,
"loss": 0.2569,
"step": 61600
},
{
"epoch": 3.7053662395119265,
"eval_loss": 0.23042194545269012,
"eval_runtime": 51.0719,
"eval_samples_per_second": 195.802,
"eval_steps_per_second": 24.475,
"step": 61600
},
{
"epoch": 3.7113814444461988,
"grad_norm": 0.8808117508888245,
"learning_rate": 5.3847949316438814e-06,
"loss": 0.2579,
"step": 61700
},
{
"epoch": 3.7113814444461988,
"eval_loss": 0.22964029014110565,
"eval_runtime": 51.1788,
"eval_samples_per_second": 195.393,
"eval_steps_per_second": 24.424,
"step": 61700
},
{
"epoch": 3.7173966493804715,
"grad_norm": 0.8812440037727356,
"learning_rate": 5.3837945981994e-06,
"loss": 0.2568,
"step": 61800
},
{
"epoch": 3.7173966493804715,
"eval_loss": 0.23177900910377502,
"eval_runtime": 51.1658,
"eval_samples_per_second": 195.443,
"eval_steps_per_second": 24.43,
"step": 61800
},
{
"epoch": 3.723411854314744,
"grad_norm": 0.8692899346351624,
"learning_rate": 5.382794264754919e-06,
"loss": 0.2567,
"step": 61900
},
{
"epoch": 3.723411854314744,
"eval_loss": 0.23119042813777924,
"eval_runtime": 51.1394,
"eval_samples_per_second": 195.544,
"eval_steps_per_second": 24.443,
"step": 61900
},
{
"epoch": 3.7294270592490166,
"grad_norm": 0.8057258725166321,
"learning_rate": 5.381793931310437e-06,
"loss": 0.2574,
"step": 62000
},
{
"epoch": 3.7294270592490166,
"eval_loss": 0.2311127930879593,
"eval_runtime": 51.1109,
"eval_samples_per_second": 195.653,
"eval_steps_per_second": 24.457,
"step": 62000
},
{
"epoch": 3.7354422641832894,
"grad_norm": 0.7970178127288818,
"learning_rate": 5.380793597865955e-06,
"loss": 0.2589,
"step": 62100
},
{
"epoch": 3.7354422641832894,
"eval_loss": 0.2320980727672577,
"eval_runtime": 51.1619,
"eval_samples_per_second": 195.458,
"eval_steps_per_second": 24.432,
"step": 62100
},
{
"epoch": 3.741457469117562,
"grad_norm": 0.8987645506858826,
"learning_rate": 5.379793264421474e-06,
"loss": 0.2565,
"step": 62200
},
{
"epoch": 3.741457469117562,
"eval_loss": 0.22809037566184998,
"eval_runtime": 51.1437,
"eval_samples_per_second": 195.527,
"eval_steps_per_second": 24.441,
"step": 62200
},
{
"epoch": 3.7474726740518345,
"grad_norm": 0.8491466641426086,
"learning_rate": 5.378792930976992e-06,
"loss": 0.2572,
"step": 62300
},
{
"epoch": 3.7474726740518345,
"eval_loss": 0.23448967933654785,
"eval_runtime": 51.1016,
"eval_samples_per_second": 195.688,
"eval_steps_per_second": 24.461,
"step": 62300
},
{
"epoch": 3.7534878789861073,
"grad_norm": 0.8310768008232117,
"learning_rate": 5.377792597532511e-06,
"loss": 0.2558,
"step": 62400
},
{
"epoch": 3.7534878789861073,
"eval_loss": 0.2314356416463852,
"eval_runtime": 51.1436,
"eval_samples_per_second": 195.528,
"eval_steps_per_second": 24.441,
"step": 62400
},
{
"epoch": 3.7595030839203796,
"grad_norm": 0.8902222514152527,
"learning_rate": 5.376792264088029e-06,
"loss": 0.256,
"step": 62500
},
{
"epoch": 3.7595030839203796,
"eval_loss": 0.23469364643096924,
"eval_runtime": 51.1102,
"eval_samples_per_second": 195.656,
"eval_steps_per_second": 24.457,
"step": 62500
},
{
"epoch": 3.7655182888546523,
"grad_norm": 0.7377832531929016,
"learning_rate": 5.375791930643548e-06,
"loss": 0.2574,
"step": 62600
},
{
"epoch": 3.7655182888546523,
"eval_loss": 0.23291806876659393,
"eval_runtime": 51.1312,
"eval_samples_per_second": 195.575,
"eval_steps_per_second": 24.447,
"step": 62600
},
{
"epoch": 3.771533493788925,
"grad_norm": 0.7997824549674988,
"learning_rate": 5.374791597199067e-06,
"loss": 0.257,
"step": 62700
},
{
"epoch": 3.771533493788925,
"eval_loss": 0.23000933229923248,
"eval_runtime": 48.2655,
"eval_samples_per_second": 207.187,
"eval_steps_per_second": 25.898,
"step": 62700
},
{
"epoch": 3.7775486987231974,
"grad_norm": 0.8683999180793762,
"learning_rate": 5.373791263754585e-06,
"loss": 0.2564,
"step": 62800
},
{
"epoch": 3.7775486987231974,
"eval_loss": 0.23462143540382385,
"eval_runtime": 51.0748,
"eval_samples_per_second": 195.791,
"eval_steps_per_second": 24.474,
"step": 62800
},
{
"epoch": 3.78356390365747,
"grad_norm": 0.8755656480789185,
"learning_rate": 5.372790930310103e-06,
"loss": 0.2558,
"step": 62900
},
{
"epoch": 3.78356390365747,
"eval_loss": 0.23621977865695953,
"eval_runtime": 51.1202,
"eval_samples_per_second": 195.617,
"eval_steps_per_second": 24.452,
"step": 62900
},
{
"epoch": 3.7895791085917425,
"grad_norm": 0.9032362699508667,
"learning_rate": 5.371790596865622e-06,
"loss": 0.2551,
"step": 63000
},
{
"epoch": 3.7895791085917425,
"eval_loss": 0.2294510453939438,
"eval_runtime": 51.1388,
"eval_samples_per_second": 195.546,
"eval_steps_per_second": 24.443,
"step": 63000
}
],
"logging_steps": 100,
"max_steps": 600000,
"num_input_tokens_seen": 0,
"num_train_epochs": 37,
"save_steps": 1000,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 10,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 8
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.304354533994406e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}