{
"best_global_step": 1408,
"best_metric": 0.3940983712673187,
"best_model_checkpoint": "/home/ubuntu/mnt/dattafs/train/llama_openr1_sft/checkpoint-1408",
"epoch": 1.9995726860952057,
"eval_steps": 32,
"global_step": 1462,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.021878471925476456,
"grad_norm": 1.7315069437026978,
"learning_rate": 5.4421768707483e-06,
"loss": 0.7967,
"num_tokens": 11790317.0,
"step": 16
},
{
"epoch": 0.04375694385095291,
"grad_norm": 0.6117880344390869,
"learning_rate": 1.08843537414966e-05,
"loss": 0.6135,
"step": 32
},
{
"epoch": 0.04375694385095291,
"eval_loss": 0.5614376664161682,
"eval_num_tokens": 23652572.0,
"eval_runtime": 18.6204,
"eval_samples_per_second": 6.874,
"eval_steps_per_second": 1.719,
"step": 32
},
{
"epoch": 0.06563541577642937,
"grad_norm": 0.5508595108985901,
"learning_rate": 1.6326530612244897e-05,
"loss": 0.5603,
"num_tokens": 35271932.0,
"step": 48
},
{
"epoch": 0.08751388770190582,
"grad_norm": 0.980832576751709,
"learning_rate": 2.17687074829932e-05,
"loss": 0.5404,
"step": 64
},
{
"epoch": 0.08751388770190582,
"eval_loss": 0.5128401517868042,
"eval_num_tokens": 46694102.0,
"eval_runtime": 43.903,
"eval_samples_per_second": 2.916,
"eval_steps_per_second": 0.729,
"step": 64
},
{
"epoch": 0.10939235962738228,
"grad_norm": 0.918790876865387,
"learning_rate": 2.72108843537415e-05,
"loss": 0.5293,
"num_tokens": 58386969.0,
"step": 80
},
{
"epoch": 0.13127083155285874,
"grad_norm": 1.004357933998108,
"learning_rate": 3.265306122448979e-05,
"loss": 0.5192,
"step": 96
},
{
"epoch": 0.13127083155285874,
"eval_loss": 0.5079865455627441,
"eval_num_tokens": 69750037.0,
"eval_runtime": 18.3625,
"eval_samples_per_second": 6.971,
"eval_steps_per_second": 1.743,
"step": 96
},
{
"epoch": 0.1531493034783352,
"grad_norm": 1.4387233257293701,
"learning_rate": 3.809523809523809e-05,
"loss": 0.5224,
"num_tokens": 81535265.0,
"step": 112
},
{
"epoch": 0.17502777540381165,
"grad_norm": 1.11899733543396,
"learning_rate": 4.35374149659864e-05,
"loss": 0.5234,
"step": 128
},
{
"epoch": 0.17502777540381165,
"eval_loss": 0.5003859400749207,
"eval_num_tokens": 92959530.0,
"eval_runtime": 44.753,
"eval_samples_per_second": 2.86,
"eval_steps_per_second": 0.715,
"step": 128
},
{
"epoch": 0.1969062473292881,
"grad_norm": 1.0003859996795654,
"learning_rate": 4.89795918367347e-05,
"loss": 0.5166,
"num_tokens": 104537897.0,
"step": 144
},
{
"epoch": 0.21878471925476456,
"grad_norm": 0.7199622392654419,
"learning_rate": 4.99879438109886e-05,
"loss": 0.5217,
"step": 160
},
{
"epoch": 0.21878471925476456,
"eval_loss": 0.49373239278793335,
"eval_num_tokens": 116348744.0,
"eval_runtime": 18.4241,
"eval_samples_per_second": 6.947,
"eval_steps_per_second": 1.737,
"step": 160
},
{
"epoch": 0.24066319118024102,
"grad_norm": 0.7497566938400269,
"learning_rate": 4.9940023582279216e-05,
"loss": 0.5074,
"num_tokens": 127932733.0,
"step": 176
},
{
"epoch": 0.26254166310571747,
"grad_norm": 0.5236051082611084,
"learning_rate": 4.985566722849454e-05,
"loss": 0.504,
"step": 192
},
{
"epoch": 0.26254166310571747,
"eval_loss": 0.4837675094604492,
"eval_num_tokens": 139301330.0,
"eval_runtime": 18.3533,
"eval_samples_per_second": 6.974,
"eval_steps_per_second": 1.744,
"step": 192
},
{
"epoch": 0.2844201350311939,
"grad_norm": 0.5282280445098877,
"learning_rate": 4.973499799004161e-05,
"loss": 0.501,
"num_tokens": 151017333.0,
"step": 208
},
{
"epoch": 0.3062986069566704,
"grad_norm": 0.7268691062927246,
"learning_rate": 4.957819215863282e-05,
"loss": 0.4915,
"step": 224
},
{
"epoch": 0.3062986069566704,
"eval_loss": 0.476335346698761,
"eval_num_tokens": 162752028.0,
"eval_runtime": 18.509,
"eval_samples_per_second": 6.916,
"eval_steps_per_second": 1.729,
"step": 224
},
{
"epoch": 0.3281770788821468,
"grad_norm": 0.4880088269710541,
"learning_rate": 4.9385478819732645e-05,
"loss": 0.493,
"num_tokens": 174636003.0,
"step": 240
},
{
"epoch": 0.3500555508076233,
"grad_norm": 0.8152353763580322,
"learning_rate": 4.9157139517875176e-05,
"loss": 0.4993,
"step": 256
},
{
"epoch": 0.3500555508076233,
"eval_loss": 0.47467267513275146,
"eval_num_tokens": 186441099.0,
"eval_runtime": 18.5278,
"eval_samples_per_second": 6.909,
"eval_steps_per_second": 1.727,
"step": 256
},
{
"epoch": 0.3719340227330997,
"grad_norm": 0.5629775524139404,
"learning_rate": 4.889350784534168e-05,
"loss": 0.4943,
"num_tokens": 198380851.0,
"step": 272
},
{
"epoch": 0.3938124946585762,
"grad_norm": 0.5178916454315186,
"learning_rate": 4.859496895479903e-05,
"loss": 0.4932,
"step": 288
},
{
"epoch": 0.3938124946585762,
"eval_loss": 0.4674685001373291,
"eval_num_tokens": 210236651.0,
"eval_runtime": 46.2968,
"eval_samples_per_second": 2.765,
"eval_steps_per_second": 0.691,
"step": 288
},
{
"epoch": 0.41569096658405263,
"grad_norm": 0.41224056482315063,
"learning_rate": 4.8261958996610914e-05,
"loss": 0.4811,
"num_tokens": 221999996.0,
"step": 304
},
{
"epoch": 0.4375694385095291,
"grad_norm": 0.5033276081085205,
"learning_rate": 4.7894964481643984e-05,
"loss": 0.4833,
"step": 320
},
{
"epoch": 0.4375694385095291,
"eval_loss": 0.4603790044784546,
"eval_num_tokens": 233668486.0,
"eval_runtime": 18.4345,
"eval_samples_per_second": 6.943,
"eval_steps_per_second": 1.736,
"step": 320
},
{
"epoch": 0.45944791043500555,
"grad_norm": 0.4538171887397766,
"learning_rate": 4.7494521570499914e-05,
"loss": 0.4769,
"num_tokens": 245229299.0,
"step": 336
},
{
"epoch": 0.48132638236048203,
"grad_norm": 0.44999656081199646,
"learning_rate": 4.706121529021158e-05,
"loss": 0.476,
"step": 352
},
{
"epoch": 0.48132638236048203,
"eval_loss": 0.45518478751182556,
"eval_num_tokens": 256808967.0,
"eval_runtime": 44.9829,
"eval_samples_per_second": 2.846,
"eval_steps_per_second": 0.711,
"step": 352
},
{
"epoch": 0.5032048542859585,
"grad_norm": 0.4638039171695709,
"learning_rate": 4.659567867954784e-05,
"loss": 0.4792,
"num_tokens": 268354735.0,
"step": 368
},
{
"epoch": 0.5250833262114349,
"grad_norm": 0.43113598227500916,
"learning_rate": 4.6098591864175696e-05,
"loss": 0.4666,
"step": 384
},
{
"epoch": 0.5250833262114349,
"eval_loss": 0.45199862122535706,
"eval_num_tokens": 279922749.0,
"eval_runtime": 18.4358,
"eval_samples_per_second": 6.943,
"eval_steps_per_second": 1.736,
"step": 384
},
{
"epoch": 0.5469617981369114,
"grad_norm": 0.44990119338035583,
"learning_rate": 4.557068106303067e-05,
"loss": 0.467,
"num_tokens": 291496113.0,
"step": 400
},
{
"epoch": 0.5688402700623878,
"grad_norm": 0.4064292907714844,
"learning_rate": 4.501271752734737e-05,
"loss": 0.4635,
"step": 416
},
{
"epoch": 0.5688402700623878,
"eval_loss": 0.4471489489078522,
"eval_num_tokens": 303134880.0,
"eval_runtime": 44.7579,
"eval_samples_per_second": 2.86,
"eval_steps_per_second": 0.715,
"step": 416
},
{
"epoch": 0.5907187419878643,
"grad_norm": 0.4212772846221924,
"learning_rate": 4.442551641390008e-05,
"loss": 0.46,
"num_tokens": 314857074.0,
"step": 432
},
{
"epoch": 0.6125972139133408,
"grad_norm": 0.43323245644569397,
"learning_rate": 4.3809935594099515e-05,
"loss": 0.4614,
"step": 448
},
{
"epoch": 0.6125972139133408,
"eval_loss": 0.442826509475708,
"eval_num_tokens": 326457951.0,
"eval_runtime": 18.6512,
"eval_samples_per_second": 6.863,
"eval_steps_per_second": 1.716,
"step": 448
},
{
"epoch": 0.6344756858388172,
"grad_norm": 0.33663076162338257,
"learning_rate": 4.3166874400685694e-05,
"loss": 0.4593,
"num_tokens": 337990615.0,
"step": 464
},
{
"epoch": 0.6563541577642936,
"grad_norm": 0.537545919418335,
"learning_rate": 4.2497272313847825e-05,
"loss": 0.4582,
"step": 480
},
{
"epoch": 0.6563541577642936,
"eval_loss": 0.43967390060424805,
"eval_num_tokens": 349803965.0,
"eval_runtime": 18.4027,
"eval_samples_per_second": 6.956,
"eval_steps_per_second": 1.739,
"step": 480
},
{
"epoch": 0.67823262968977,
"grad_norm": 0.4213317334651947,
"learning_rate": 4.1802107588690856e-05,
"loss": 0.4565,
"num_tokens": 361540653.0,
"step": 496
},
{
"epoch": 0.7001111016152466,
"grad_norm": 0.3804231882095337,
"learning_rate": 4.108239582605374e-05,
"loss": 0.4545,
"step": 512
},
{
"epoch": 0.7001111016152466,
"eval_loss": 0.4352826476097107,
"eval_num_tokens": 373230587.0,
"eval_runtime": 18.5333,
"eval_samples_per_second": 6.906,
"eval_steps_per_second": 1.727,
"step": 512
},
{
"epoch": 0.721989573540723,
"grad_norm": 0.40216949582099915,
"learning_rate": 4.033918848876751e-05,
"loss": 0.4513,
"num_tokens": 384593207.0,
"step": 528
},
{
"epoch": 0.7438680454661994,
"grad_norm": 0.39182427525520325,
"learning_rate": 3.957357136552072e-05,
"loss": 0.4457,
"step": 544
},
{
"epoch": 0.7438680454661994,
"eval_loss": 0.43387627601623535,
"eval_num_tokens": 395921551.0,
"eval_runtime": 18.4858,
"eval_samples_per_second": 6.924,
"eval_steps_per_second": 1.731,
"step": 544
},
{
"epoch": 0.7657465173916759,
"grad_norm": 0.4319497346878052,
"learning_rate": 3.8786662984576605e-05,
"loss": 0.449,
"num_tokens": 407202540.0,
"step": 560
},
{
"epoch": 0.7876249893171524,
"grad_norm": 0.41336458921432495,
"learning_rate": 3.79796129796593e-05,
"loss": 0.4501,
"step": 576
},
{
"epoch": 0.7876249893171524,
"eval_loss": 0.4308420419692993,
"eval_num_tokens": 418778095.0,
"eval_runtime": 45.2458,
"eval_samples_per_second": 2.829,
"eval_steps_per_second": 0.707,
"step": 576
},
{
"epoch": 0.8095034612426288,
"grad_norm": 0.43935731053352356,
"learning_rate": 3.715360041039655e-05,
"loss": 0.4463,
"num_tokens": 430448376.0,
"step": 592
},
{
"epoch": 0.8313819331681053,
"grad_norm": 0.3827608525753021,
"learning_rate": 3.6309832039772707e-05,
"loss": 0.4445,
"step": 608
},
{
"epoch": 0.8313819331681053,
"eval_loss": 0.4279908537864685,
"eval_num_tokens": 441738429.0,
"eval_runtime": 18.3837,
"eval_samples_per_second": 6.963,
"eval_steps_per_second": 1.741,
"step": 608
},
{
"epoch": 0.8532604050935817,
"grad_norm": 0.3697313368320465,
"learning_rate": 3.544954057110839e-05,
"loss": 0.4484,
"num_tokens": 453881118.0,
"step": 624
},
{
"epoch": 0.8751388770190582,
"grad_norm": 0.37059286236763,
"learning_rate": 3.457398284714275e-05,
"loss": 0.4403,
"step": 640
},
{
"epoch": 0.8751388770190582,
"eval_loss": 0.42479920387268066,
"eval_num_tokens": 465534133.0,
"eval_runtime": 45.8322,
"eval_samples_per_second": 2.793,
"eval_steps_per_second": 0.698,
"step": 640
},
{
"epoch": 0.8970173489445347,
"grad_norm": 0.3813212513923645,
"learning_rate": 3.3684438013849154e-05,
"loss": 0.4401,
"num_tokens": 476853966.0,
"step": 656
},
{
"epoch": 0.9188958208700111,
"grad_norm": 0.32739850878715515,
"learning_rate": 3.2782205651667013e-05,
"loss": 0.4392,
"step": 672
},
{
"epoch": 0.9188958208700111,
"eval_loss": 0.4213526248931885,
"eval_num_tokens": 488461810.0,
"eval_runtime": 18.2948,
"eval_samples_per_second": 6.997,
"eval_steps_per_second": 1.749,
"step": 672
},
{
"epoch": 0.9407742927954875,
"grad_norm": 0.34722763299942017,
"learning_rate": 3.186860387687986e-05,
"loss": 0.4433,
"num_tokens": 500387798.0,
"step": 688
},
{
"epoch": 0.9626527647209641,
"grad_norm": 0.3709475100040436,
"learning_rate": 3.094496741591349e-05,
"loss": 0.4418,
"step": 704
},
{
"epoch": 0.9626527647209641,
"eval_loss": 0.41918742656707764,
"eval_num_tokens": 512072235.0,
"eval_runtime": 18.1717,
"eval_samples_per_second": 7.044,
"eval_steps_per_second": 1.761,
"step": 704
},
{
"epoch": 0.9845312366464405,
"grad_norm": 0.3451824188232422,
"learning_rate": 3.00126456553675e-05,
"loss": 0.4338,
"num_tokens": 523848298.0,
"step": 720
},
{
"epoch": 1.0068370224767114,
"grad_norm": 0.517411470413208,
"learning_rate": 2.9073000670629098e-05,
"loss": 0.4445,
"step": 736
},
{
"epoch": 1.0068370224767114,
"eval_loss": 0.4266531467437744,
"eval_num_tokens": 535523389.0,
"eval_runtime": 18.4611,
"eval_samples_per_second": 6.934,
"eval_steps_per_second": 1.733,
"step": 736
},
{
"epoch": 1.028715494402188,
"grad_norm": 0.3714677691459656,
"learning_rate": 2.8127405235949174e-05,
"loss": 0.3777,
"num_tokens": 547246764.0,
"step": 752
},
{
"epoch": 1.0505939663276642,
"grad_norm": 0.3552056550979614,
"learning_rate": 2.7177240818887893e-05,
"loss": 0.3742,
"step": 768
},
{
"epoch": 1.0505939663276642,
"eval_loss": 0.421397864818573,
"eval_num_tokens": 558829084.0,
"eval_runtime": 18.4181,
"eval_samples_per_second": 6.95,
"eval_steps_per_second": 1.737,
"step": 768
},
{
"epoch": 1.0724724382531408,
"grad_norm": 0.3726598620414734,
"learning_rate": 2.6223895562059786e-05,
"loss": 0.3683,
"num_tokens": 570404128.0,
"step": 784
},
{
"epoch": 1.0943509101786173,
"grad_norm": 0.33536332845687866,
"learning_rate": 2.5268762255126948e-05,
"loss": 0.3658,
"step": 800
},
{
"epoch": 1.0943509101786173,
"eval_loss": 0.41810134053230286,
"eval_num_tokens": 582073105.0,
"eval_runtime": 43.7846,
"eval_samples_per_second": 2.923,
"eval_steps_per_second": 0.731,
"step": 800
},
{
"epoch": 1.1162293821040936,
"grad_norm": 0.32326406240463257,
"learning_rate": 2.4313236300003103e-05,
"loss": 0.3683,
"num_tokens": 593969991.0,
"step": 816
},
{
"epoch": 1.1381078540295702,
"grad_norm": 0.3347606956958771,
"learning_rate": 2.33587136722413e-05,
"loss": 0.3697,
"step": 832
},
{
"epoch": 1.1381078540295702,
"eval_loss": 0.41542908549308777,
"eval_num_tokens": 605516295.0,
"eval_runtime": 18.4764,
"eval_samples_per_second": 6.928,
"eval_steps_per_second": 1.732,
"step": 832
},
{
"epoch": 1.1599863259550465,
"grad_norm": 0.34967222809791565,
"learning_rate": 2.2406588881583594e-05,
"loss": 0.3639,
"num_tokens": 617129829.0,
"step": 848
},
{
"epoch": 1.181864797880523,
"grad_norm": 0.3748638927936554,
"learning_rate": 2.1458252934652146e-05,
"loss": 0.366,
"step": 864
},
{
"epoch": 1.181864797880523,
"eval_loss": 0.41449588537216187,
"eval_num_tokens": 628546914.0,
"eval_runtime": 45.7716,
"eval_samples_per_second": 2.796,
"eval_steps_per_second": 0.699,
"step": 864
},
{
"epoch": 1.2037432698059995,
"grad_norm": 0.33047473430633545,
"learning_rate": 2.0515091302758217e-05,
"loss": 0.3634,
"num_tokens": 640328734.0,
"step": 880
},
{
"epoch": 1.2256217417314759,
"grad_norm": 0.34247222542762756,
"learning_rate": 1.9578481897798028e-05,
"loss": 0.3618,
"step": 896
},
{
"epoch": 1.2256217417314759,
"eval_loss": 0.4123848080635071,
"eval_num_tokens": 651969596.0,
"eval_runtime": 18.5044,
"eval_samples_per_second": 6.917,
"eval_steps_per_second": 1.729,
"step": 896
},
{
"epoch": 1.2475002136569524,
"grad_norm": 0.3337569534778595,
"learning_rate": 1.864979305919248e-05,
"loss": 0.3651,
"num_tokens": 663693234.0,
"step": 912
},
{
"epoch": 1.2693786855824287,
"grad_norm": 0.3318181335926056,
"learning_rate": 1.7730381554811815e-05,
"loss": 0.3625,
"step": 928
},
{
"epoch": 1.2693786855824287,
"eval_loss": 0.4105568528175354,
"eval_num_tokens": 675331630.0,
"eval_runtime": 44.7946,
"eval_samples_per_second": 2.857,
"eval_steps_per_second": 0.714,
"step": 928
},
{
"epoch": 1.2912571575079053,
"grad_norm": 0.3317345678806305,
"learning_rate": 1.6821590598805708e-05,
"loss": 0.3615,
"num_tokens": 686943119.0,
"step": 944
},
{
"epoch": 1.3131356294333818,
"grad_norm": 0.3148461580276489,
"learning_rate": 1.5924747889234743e-05,
"loss": 0.3602,
"step": 960
},
{
"epoch": 1.3131356294333818,
"eval_loss": 0.40809541940689087,
"eval_num_tokens": 698738800.0,
"eval_runtime": 18.4793,
"eval_samples_per_second": 6.927,
"eval_steps_per_second": 1.732,
"step": 960
},
{
"epoch": 1.3350141013588583,
"grad_norm": 0.3473955988883972,
"learning_rate": 1.5041163668369939e-05,
"loss": 0.3595,
"num_tokens": 710486757.0,
"step": 976
},
{
"epoch": 1.3568925732843347,
"grad_norm": 0.2996233105659485,
"learning_rate": 1.4172128808494572e-05,
"loss": 0.358,
"step": 992
},
{
"epoch": 1.3568925732843347,
"eval_loss": 0.406377911567688,
"eval_num_tokens": 722294728.0,
"eval_runtime": 18.2468,
"eval_samples_per_second": 7.015,
"eval_steps_per_second": 1.754,
"step": 992
},
{
"epoch": 1.3787710452098112,
"grad_norm": 0.31939494609832764,
"learning_rate": 1.3318912926004351e-05,
"loss": 0.3543,
"num_tokens": 733756855.0,
"step": 1008
},
{
"epoch": 1.4006495171352875,
"grad_norm": 0.2936459481716156,
"learning_rate": 1.2482762526561448e-05,
"loss": 0.3626,
"step": 1024
},
{
"epoch": 1.4006495171352875,
"eval_loss": 0.40473318099975586,
"eval_num_tokens": 745739855.0,
"eval_runtime": 18.318,
"eval_samples_per_second": 6.988,
"eval_steps_per_second": 1.747,
"step": 1024
},
{
"epoch": 1.422527989060764,
"grad_norm": 0.29807737469673157,
"learning_rate": 1.1664899184012229e-05,
"loss": 0.3576,
"num_tokens": 757373615.0,
"step": 1040
},
{
"epoch": 1.4444064609862406,
"grad_norm": 0.31612807512283325,
"learning_rate": 1.0866517755729063e-05,
"loss": 0.3584,
"step": 1056
},
{
"epoch": 1.4444064609862406,
"eval_loss": 0.40248239040374756,
"eval_num_tokens": 769145930.0,
"eval_runtime": 18.5352,
"eval_samples_per_second": 6.906,
"eval_steps_per_second": 1.726,
"step": 1056
},
{
"epoch": 1.466284932911717,
"grad_norm": 0.2999597191810608,
"learning_rate": 1.0088784636983473e-05,
"loss": 0.3546,
"num_tokens": 780592475.0,
"step": 1072
},
{
"epoch": 1.4881634048371934,
"grad_norm": 0.30209967494010925,
"learning_rate": 9.332836056901176e-06,
"loss": 0.3531,
"step": 1088
},
{
"epoch": 1.4881634048371934,
"eval_loss": 0.4016121029853821,
"eval_num_tokens": 792419009.0,
"eval_runtime": 45.144,
"eval_samples_per_second": 2.835,
"eval_steps_per_second": 0.709,
"step": 1088
},
{
"epoch": 1.5100418767626698,
"grad_norm": 0.28912022709846497,
"learning_rate": 8.599776418488159e-06,
"loss": 0.3476,
"num_tokens": 804175102.0,
"step": 1104
},
{
"epoch": 1.5319203486881463,
"grad_norm": 0.28878605365753174,
"learning_rate": 7.890676685153314e-06,
"loss": 0.351,
"step": 1120
},
{
"epoch": 1.5319203486881463,
"eval_loss": 0.3999693691730499,
"eval_num_tokens": 815878094.0,
"eval_runtime": 18.5504,
"eval_samples_per_second": 6.9,
"eval_steps_per_second": 1.725,
"step": 1120
},
{
"epoch": 1.5537988206136228,
"grad_norm": 0.29332441091537476,
"learning_rate": 7.206572816084464e-06,
"loss": 0.3525,
"num_tokens": 827490302.0,
"step": 1136
},
{
"epoch": 1.5756772925390994,
"grad_norm": 0.2913924753665924,
"learning_rate": 6.5484642527639055e-06,
"loss": 0.3529,
"step": 1152
},
{
"epoch": 1.5756772925390994,
"eval_loss": 0.39920222759246826,
"eval_num_tokens": 839135863.0,
"eval_runtime": 45.1069,
"eval_samples_per_second": 2.838,
"eval_steps_per_second": 0.709,
"step": 1152
},
{
"epoch": 1.5975557644645757,
"grad_norm": 0.2814177870750427,
"learning_rate": 5.917312458834495e-06,
"loss": 0.353,
"num_tokens": 850717611.0,
"step": 1168
},
{
"epoch": 1.619434236390052,
"grad_norm": 0.2677833139896393,
"learning_rate": 5.314039515449418e-06,
"loss": 0.3538,
"step": 1184
},
{
"epoch": 1.619434236390052,
"eval_loss": 0.397605836391449,
"eval_num_tokens": 862683824.0,
"eval_runtime": 18.3743,
"eval_samples_per_second": 6.966,
"eval_steps_per_second": 1.742,
"step": 1184
},
{
"epoch": 1.6413127083155286,
"grad_norm": 0.27799126505851746,
"learning_rate": 4.739526774157807e-06,
"loss": 0.3505,
"num_tokens": 874236055.0,
"step": 1200
},
{
"epoch": 1.663191180241005,
"grad_norm": 0.2723589241504669,
"learning_rate": 4.19461356929429e-06,
"loss": 0.3524,
"step": 1216
},
{
"epoch": 1.663191180241005,
"eval_loss": 0.3971025049686432,
"eval_num_tokens": 885655282.0,
"eval_runtime": 18.1636,
"eval_samples_per_second": 7.047,
"eval_steps_per_second": 1.762,
"step": 1216
},
{
"epoch": 1.6850696521664816,
"grad_norm": 0.27828726172447205,
"learning_rate": 3.6800959917535765e-06,
"loss": 0.349,
"num_tokens": 897507679.0,
"step": 1232
},
{
"epoch": 1.706948124091958,
"grad_norm": 0.2791634798049927,
"learning_rate": 3.1967257259415185e-06,
"loss": 0.3458,
"step": 1248
},
{
"epoch": 1.706948124091958,
"eval_loss": 0.3961884379386902,
"eval_num_tokens": 908891789.0,
"eval_runtime": 18.2964,
"eval_samples_per_second": 6.996,
"eval_steps_per_second": 1.749,
"step": 1248
},
{
"epoch": 1.7288265960174343,
"grad_norm": 0.27799129486083984,
"learning_rate": 2.7452089516018935e-06,
"loss": 0.3519,
"num_tokens": 920647977.0,
"step": 1264
},
{
"epoch": 1.7507050679429108,
"grad_norm": 0.2728063762187958,
"learning_rate": 2.326205312123136e-06,
"loss": 0.3437,
"step": 1280
},
{
"epoch": 1.7507050679429108,
"eval_loss": 0.39544403553009033,
"eval_num_tokens": 931912973.0,
"eval_runtime": 18.3293,
"eval_samples_per_second": 6.983,
"eval_steps_per_second": 1.746,
"step": 1280
},
{
"epoch": 1.7725835398683873,
"grad_norm": 0.27384352684020996,
"learning_rate": 1.940326950832391e-06,
"loss": 0.3451,
"num_tokens": 943388074.0,
"step": 1296
},
{
"epoch": 1.7944620117938639,
"grad_norm": 0.25831547379493713,
"learning_rate": 1.5881376166848149e-06,
"loss": 0.3496,
"step": 1312
},
{
"epoch": 1.7944620117938639,
"eval_loss": 0.3953617215156555,
"eval_num_tokens": 955111326.0,
"eval_runtime": 44.1506,
"eval_samples_per_second": 2.899,
"eval_steps_per_second": 0.725,
"step": 1312
},
{
"epoch": 1.8163404837193402,
"grad_norm": 0.2703952193260193,
"learning_rate": 1.2701518406545571e-06,
"loss": 0.3486,
"num_tokens": 966913070.0,
"step": 1328
},
{
"epoch": 1.8382189556448165,
"grad_norm": 0.26095762848854065,
"learning_rate": 9.868341840307993e-07,
"loss": 0.3481,
"step": 1344
},
{
"epoch": 1.8382189556448165,
"eval_loss": 0.3945625424385071,
"eval_num_tokens": 978491221.0,
"eval_runtime": 18.31,
"eval_samples_per_second": 6.991,
"eval_steps_per_second": 1.748,
"step": 1344
},
{
"epoch": 1.860097427570293,
"grad_norm": 0.2432006448507309,
"learning_rate": 7.385985597169798e-07,
"loss": 0.351,
"num_tokens": 990169429.0,
"step": 1360
},
{
"epoch": 1.8819758994957696,
"grad_norm": 0.27600374817848206,
"learning_rate": 5.258076275247825e-07,
"loss": 0.3432,
"step": 1376
},
{
"epoch": 1.8819758994957696,
"eval_loss": 0.3942786455154419,
"eval_num_tokens": 1001694852.0,
"eval_runtime": 45.1966,
"eval_samples_per_second": 2.832,
"eval_steps_per_second": 0.708,
"step": 1376
},
{
"epoch": 1.9038543714212461,
"grad_norm": 0.2700289487838745,
"learning_rate": 3.4877226434630315e-07,
"loss": 0.3444,
"num_tokens": 1013251008.0,
"step": 1392
},
{
"epoch": 1.9257328433467225,
"grad_norm": 0.250434011220932,
"learning_rate": 2.0775110997850733e-07,
"loss": 0.345,
"step": 1408
},
{
"epoch": 1.9257328433467225,
"eval_loss": 0.3940983712673187,
"eval_num_tokens": 1025080577.0,
"eval_runtime": 18.3362,
"eval_samples_per_second": 6.981,
"eval_steps_per_second": 1.745,
"step": 1408
},
{
"epoch": 1.947611315272199,
"grad_norm": 0.2536456882953644,
"learning_rate": 1.0295018926342881e-07,
"loss": 0.3464,
"num_tokens": 1036518998.0,
"step": 1424
},
{
"epoch": 1.9694897871976753,
"grad_norm": 0.2704523205757141,
"learning_rate": 3.4522611096193815e-08,
"loss": 0.3504,
"step": 1440
},
{
"epoch": 1.9694897871976753,
"eval_loss": 0.39410996437072754,
"eval_num_tokens": 1048000170.0,
"eval_runtime": 45.2694,
"eval_samples_per_second": 2.828,
"eval_steps_per_second": 0.707,
"step": 1440
},
{
"epoch": 1.9913682591231519,
"grad_norm": 0.2705751061439514,
"learning_rate": 2.568344740602746e-09,
"loss": 0.3473,
"num_tokens": 1059695234.0,
"step": 1456
},
{
"epoch": 1.9995726860952057,
"num_tokens": 1063961915.0,
"step": 1462,
"total_flos": 1.1979767668153516e+19,
"train_loss": 0.4216477999771995,
"train_runtime": 170373.8054,
"train_samples_per_second": 1.099,
"train_steps_per_second": 0.009
}
],
"logging_steps": 16,
"max_steps": 1462,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 32,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.1979767668153516e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}