{ "best_metric": null, "best_model_checkpoint": null, "epoch": 14.854368932038835, "eval_steps": 500, "global_step": 765, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1941747572815534, "grad_norm": 3.1270775891219507, "learning_rate": 6.493506493506493e-07, "loss": 1.8796, "step": 10 }, { "epoch": 0.3883495145631068, "grad_norm": 4.0478996237342, "learning_rate": 1.2987012987012986e-06, "loss": 1.9555, "step": 20 }, { "epoch": 0.5825242718446602, "grad_norm": 3.706686831129688, "learning_rate": 1.9480519480519483e-06, "loss": 1.8677, "step": 30 }, { "epoch": 0.7766990291262136, "grad_norm": 1.800540642872872, "learning_rate": 2.597402597402597e-06, "loss": 1.6047, "step": 40 }, { "epoch": 0.970873786407767, "grad_norm": 1.5787493617396333, "learning_rate": 3.246753246753247e-06, "loss": 1.3922, "step": 50 }, { "epoch": 1.1650485436893203, "grad_norm": 1.207810791025872, "learning_rate": 3.896103896103897e-06, "loss": 1.2978, "step": 60 }, { "epoch": 1.3592233009708738, "grad_norm": 0.9175578117993997, "learning_rate": 4.5454545454545455e-06, "loss": 1.1015, "step": 70 }, { "epoch": 1.5533980582524272, "grad_norm": 0.9409417065870398, "learning_rate": 4.999765432089186e-06, "loss": 0.9465, "step": 80 }, { "epoch": 1.7475728155339807, "grad_norm": 0.7673543415725506, "learning_rate": 4.995596560308607e-06, "loss": 0.802, "step": 90 }, { "epoch": 1.941747572815534, "grad_norm": 0.6596047063364, "learning_rate": 4.986225072382357e-06, "loss": 0.8013, "step": 100 }, { "epoch": 2.1359223300970873, "grad_norm": 0.6761088913203434, "learning_rate": 4.971670505224043e-06, "loss": 0.8454, "step": 110 }, { "epoch": 2.3300970873786406, "grad_norm": 0.8375974153078517, "learning_rate": 4.9519632010080765e-06, "loss": 0.7326, "step": 120 }, { "epoch": 2.524271844660194, "grad_norm": 0.7531376616419518, "learning_rate": 4.927144243914781e-06, "loss": 0.7275, "step": 130 }, { "epoch": 2.7184466019417477, "grad_norm": 0.6285281076273438, "learning_rate": 4.897265374481447e-06, "loss": 0.7418, "step": 140 }, { "epoch": 2.912621359223301, "grad_norm": 0.602398570146741, "learning_rate": 4.862388881737883e-06, "loss": 0.7094, "step": 150 }, { "epoch": 3.1067961165048543, "grad_norm": 0.6666767672393625, "learning_rate": 4.822587473351317e-06, "loss": 0.6429, "step": 160 }, { "epoch": 3.3009708737864076, "grad_norm": 0.5205954520717115, "learning_rate": 4.777944124051395e-06, "loss": 0.6477, "step": 170 }, { "epoch": 3.4951456310679614, "grad_norm": 0.7568300500075249, "learning_rate": 4.728551902651227e-06, "loss": 0.6659, "step": 180 }, { "epoch": 3.6893203883495147, "grad_norm": 0.7044549203105918, "learning_rate": 4.6745137780251125e-06, "loss": 0.656, "step": 190 }, { "epoch": 3.883495145631068, "grad_norm": 0.7556811208616407, "learning_rate": 4.615942404447439e-06, "loss": 0.6441, "step": 200 }, { "epoch": 4.077669902912621, "grad_norm": 0.5819276993822524, "learning_rate": 4.552959886740232e-06, "loss": 0.6552, "step": 210 }, { "epoch": 4.271844660194175, "grad_norm": 0.7872228459586639, "learning_rate": 4.48569752571899e-06, "loss": 0.5654, "step": 220 }, { "epoch": 4.466019417475728, "grad_norm": 0.6601391677800797, "learning_rate": 4.414295544467447e-06, "loss": 0.5617, "step": 230 }, { "epoch": 4.660194174757281, "grad_norm": 0.760089090328379, "learning_rate": 4.338902796011929e-06, "loss": 0.5924, "step": 240 }, { "epoch": 4.854368932038835, "grad_norm": 0.6323283888532482, "learning_rate": 4.259676453004709e-06, "loss": 0.5691, "step": 250 }, { "epoch": 5.048543689320389, "grad_norm": 0.5670514791770882, "learning_rate": 4.176781680063274e-06, "loss": 0.62, "step": 260 }, { "epoch": 5.242718446601942, "grad_norm": 0.8034918706341889, "learning_rate": 4.090391289448612e-06, "loss": 0.5791, "step": 270 }, { "epoch": 5.436893203883495, "grad_norm": 0.9966745647142712, "learning_rate": 4.000685380800299e-06, "loss": 0.5215, "step": 280 }, { "epoch": 5.631067961165049, "grad_norm": 0.6278542393836921, "learning_rate": 3.907850965679467e-06, "loss": 0.4971, "step": 290 }, { "epoch": 5.825242718446602, "grad_norm": 0.5200880725660694, "learning_rate": 3.812081577702351e-06, "loss": 0.4876, "step": 300 }, { "epoch": 6.019417475728155, "grad_norm": 0.6395614574611697, "learning_rate": 3.7135768690771958e-06, "loss": 0.5461, "step": 310 }, { "epoch": 6.213592233009709, "grad_norm": 0.6094813425270712, "learning_rate": 3.612542194385613e-06, "loss": 0.4818, "step": 320 }, { "epoch": 6.407766990291262, "grad_norm": 0.7863019179749429, "learning_rate": 3.509188182476105e-06, "loss": 0.4383, "step": 330 }, { "epoch": 6.601941747572815, "grad_norm": 0.7166231822803332, "learning_rate": 3.40373029736222e-06, "loss": 0.4665, "step": 340 }, { "epoch": 6.796116504854369, "grad_norm": 0.6263341506429414, "learning_rate": 3.29638838904075e-06, "loss": 0.4589, "step": 350 }, { "epoch": 6.990291262135923, "grad_norm": 0.6199864592752095, "learning_rate": 3.1873862351663966e-06, "loss": 0.4338, "step": 360 }, { "epoch": 7.184466019417476, "grad_norm": 0.5731484414815903, "learning_rate": 3.0769510745383603e-06, "loss": 0.4006, "step": 370 }, { "epoch": 7.378640776699029, "grad_norm": 0.6375447599953608, "learning_rate": 2.9653131333714357e-06, "loss": 0.4033, "step": 380 }, { "epoch": 7.572815533980583, "grad_norm": 0.7108113423728954, "learning_rate": 2.8527051453391763e-06, "loss": 0.3817, "step": 390 }, { "epoch": 7.766990291262136, "grad_norm": 0.5983539359474854, "learning_rate": 2.739361866389711e-06, "loss": 0.3906, "step": 400 }, { "epoch": 7.961165048543689, "grad_norm": 0.6703828114314603, "learning_rate": 2.6255195853456994e-06, "loss": 0.3998, "step": 410 }, { "epoch": 8.155339805825243, "grad_norm": 0.7763432614119512, "learning_rate": 2.511415631308664e-06, "loss": 0.3636, "step": 420 }, { "epoch": 8.349514563106796, "grad_norm": 0.7336274441363543, "learning_rate": 2.397287878894637e-06, "loss": 0.2913, "step": 430 }, { "epoch": 8.54368932038835, "grad_norm": 0.640311797061036, "learning_rate": 2.283374252332568e-06, "loss": 0.3709, "step": 440 }, { "epoch": 8.737864077669903, "grad_norm": 1.0549747152380415, "learning_rate": 2.169912229459296e-06, "loss": 0.3199, "step": 450 }, { "epoch": 8.932038834951456, "grad_norm": 0.7335809666111395, "learning_rate": 2.0571383466451237e-06, "loss": 0.3302, "step": 460 }, { "epoch": 9.12621359223301, "grad_norm": 0.8337042629082712, "learning_rate": 1.9452877056820936e-06, "loss": 0.3043, "step": 470 }, { "epoch": 9.320388349514563, "grad_norm": 0.7768236518159666, "learning_rate": 1.8345934836629424e-06, "loss": 0.2832, "step": 480 }, { "epoch": 9.514563106796116, "grad_norm": 0.8109748052215916, "learning_rate": 1.7252864468725218e-06, "loss": 0.2376, "step": 490 }, { "epoch": 9.70873786407767, "grad_norm": 0.884651680841133, "learning_rate": 1.6175944697050678e-06, "loss": 0.2854, "step": 500 }, { "epoch": 9.902912621359224, "grad_norm": 0.7174797476037638, "learning_rate": 1.511742059610255e-06, "loss": 0.2807, "step": 510 }, { "epoch": 10.097087378640778, "grad_norm": 0.6760588512048147, "learning_rate": 1.4079498890583766e-06, "loss": 0.2656, "step": 520 }, { "epoch": 10.29126213592233, "grad_norm": 0.6390548130600759, "learning_rate": 1.3064343355003775e-06, "loss": 0.2215, "step": 530 }, { "epoch": 10.485436893203884, "grad_norm": 0.7714829355919598, "learning_rate": 1.2074070302817962e-06, "loss": 0.21, "step": 540 }, { "epoch": 10.679611650485437, "grad_norm": 0.7133631416901189, "learning_rate": 1.1110744174509952e-06, "loss": 0.257, "step": 550 }, { "epoch": 10.87378640776699, "grad_norm": 1.0554896728347178, "learning_rate": 1.0176373233814509e-06, "loss": 0.235, "step": 560 }, { "epoch": 11.067961165048544, "grad_norm": 0.7568107855940384, "learning_rate": 9.272905381053132e-07, "loss": 0.2458, "step": 570 }, { "epoch": 11.262135922330097, "grad_norm": 1.1755882782765041, "learning_rate": 8.402224092310299e-07, "loss": 0.2139, "step": 580 }, { "epoch": 11.45631067961165, "grad_norm": 0.7277468096167472, "learning_rate": 7.566144492916191e-07, "loss": 0.1993, "step": 590 }, { "epoch": 11.650485436893204, "grad_norm": 0.7732351944532618, "learning_rate": 6.766409573421467e-07, "loss": 0.2075, "step": 600 }, { "epoch": 11.844660194174757, "grad_norm": 0.7272514601520167, "learning_rate": 6.004686555952743e-07, "loss": 0.2124, "step": 610 }, { "epoch": 12.03883495145631, "grad_norm": 0.5446717472743969, "learning_rate": 5.282563418523859e-07, "loss": 0.2142, "step": 620 }, { "epoch": 12.233009708737864, "grad_norm": 0.601799105236111, "learning_rate": 4.6015455845488805e-07, "loss": 0.1827, "step": 630 }, { "epoch": 12.427184466019417, "grad_norm": 0.8397139799250541, "learning_rate": 3.963052784458146e-07, "loss": 0.2212, "step": 640 }, { "epoch": 12.62135922330097, "grad_norm": 0.6624723438080478, "learning_rate": 3.368416095960092e-07, "loss": 0.1861, "step": 650 }, { "epoch": 12.815533980582524, "grad_norm": 0.8742788293711848, "learning_rate": 2.8188751691189813e-07, "loss": 0.1602, "step": 660 }, { "epoch": 13.009708737864077, "grad_norm": 0.8377328891704509, "learning_rate": 2.3155756420336046e-07, "loss": 0.2049, "step": 670 }, { "epoch": 13.20388349514563, "grad_norm": 0.6166032834335495, "learning_rate": 1.8595667525043965e-07, "loss": 0.1702, "step": 680 }, { "epoch": 13.398058252427184, "grad_norm": 0.5133188237146326, "learning_rate": 1.4517991506680762e-07, "loss": 0.1805, "step": 690 }, { "epoch": 13.592233009708737, "grad_norm": 0.6421575024333711, "learning_rate": 1.0931229171597584e-07, "loss": 0.172, "step": 700 }, { "epoch": 13.78640776699029, "grad_norm": 0.6833017346041286, "learning_rate": 7.842857909342166e-08, "loss": 0.1824, "step": 710 }, { "epoch": 13.980582524271846, "grad_norm": 0.6474937990820275, "learning_rate": 5.259316104406637e-08, "loss": 0.1891, "step": 720 }, { "epoch": 14.174757281553399, "grad_norm": 0.6526458668790724, "learning_rate": 3.185989714009186e-08, "loss": 0.2009, "step": 730 }, { "epoch": 14.368932038834952, "grad_norm": 0.5981773924609198, "learning_rate": 1.627201039889309e-08, "loss": 0.1903, "step": 740 }, { "epoch": 14.563106796116505, "grad_norm": 0.6732767479005811, "learning_rate": 5.861997175260759e-09, "loss": 0.181, "step": 750 }, { "epoch": 14.757281553398059, "grad_norm": 0.5952495060637577, "learning_rate": 6.515594156286664e-10, "loss": 0.1647, "step": 760 } ], "logging_steps": 10, "max_steps": 765, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 255, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 169821882810368.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }