{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 14.854368932038835,
  "eval_steps": 500,
  "global_step": 765,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.1941747572815534,
      "grad_norm": 3.1270775891219507,
      "learning_rate": 6.493506493506493e-07,
      "loss": 1.8796,
      "step": 10
    },
    {
      "epoch": 0.3883495145631068,
      "grad_norm": 4.0478996237342,
      "learning_rate": 1.2987012987012986e-06,
      "loss": 1.9555,
      "step": 20
    },
    {
      "epoch": 0.5825242718446602,
      "grad_norm": 3.706686831129688,
      "learning_rate": 1.9480519480519483e-06,
      "loss": 1.8677,
      "step": 30
    },
    {
      "epoch": 0.7766990291262136,
      "grad_norm": 1.800540642872872,
      "learning_rate": 2.597402597402597e-06,
      "loss": 1.6047,
      "step": 40
    },
    {
      "epoch": 0.970873786407767,
      "grad_norm": 1.5787493617396333,
      "learning_rate": 3.246753246753247e-06,
      "loss": 1.3922,
      "step": 50
    },
    {
      "epoch": 1.1650485436893203,
      "grad_norm": 1.207810791025872,
      "learning_rate": 3.896103896103897e-06,
      "loss": 1.2978,
      "step": 60
    },
    {
      "epoch": 1.3592233009708738,
      "grad_norm": 0.9175578117993997,
      "learning_rate": 4.5454545454545455e-06,
      "loss": 1.1015,
      "step": 70
    },
    {
      "epoch": 1.5533980582524272,
      "grad_norm": 0.9409417065870398,
      "learning_rate": 4.999765432089186e-06,
      "loss": 0.9465,
      "step": 80
    },
    {
      "epoch": 1.7475728155339807,
      "grad_norm": 0.7673543415725506,
      "learning_rate": 4.995596560308607e-06,
      "loss": 0.802,
      "step": 90
    },
    {
      "epoch": 1.941747572815534,
      "grad_norm": 0.6596047063364,
      "learning_rate": 4.986225072382357e-06,
      "loss": 0.8013,
      "step": 100
    },
    {
      "epoch": 2.1359223300970873,
      "grad_norm": 0.6761088913203434,
      "learning_rate": 4.971670505224043e-06,
      "loss": 0.8454,
      "step": 110
    },
    {
      "epoch": 2.3300970873786406,
      "grad_norm": 0.8375974153078517,
      "learning_rate": 4.9519632010080765e-06,
      "loss": 0.7326,
      "step": 120
    },
    {
      "epoch": 2.524271844660194,
      "grad_norm": 0.7531376616419518,
      "learning_rate": 4.927144243914781e-06,
      "loss": 0.7275,
      "step": 130
    },
    {
      "epoch": 2.7184466019417477,
      "grad_norm": 0.6285281076273438,
      "learning_rate": 4.897265374481447e-06,
      "loss": 0.7418,
      "step": 140
    },
    {
      "epoch": 2.912621359223301,
      "grad_norm": 0.602398570146741,
      "learning_rate": 4.862388881737883e-06,
      "loss": 0.7094,
      "step": 150
    },
    {
      "epoch": 3.1067961165048543,
      "grad_norm": 0.6666767672393625,
      "learning_rate": 4.822587473351317e-06,
      "loss": 0.6429,
      "step": 160
    },
    {
      "epoch": 3.3009708737864076,
      "grad_norm": 0.5205954520717115,
      "learning_rate": 4.777944124051395e-06,
      "loss": 0.6477,
      "step": 170
    },
    {
      "epoch": 3.4951456310679614,
      "grad_norm": 0.7568300500075249,
      "learning_rate": 4.728551902651227e-06,
      "loss": 0.6659,
      "step": 180
    },
    {
      "epoch": 3.6893203883495147,
      "grad_norm": 0.7044549203105918,
      "learning_rate": 4.6745137780251125e-06,
      "loss": 0.656,
      "step": 190
    },
    {
      "epoch": 3.883495145631068,
      "grad_norm": 0.7556811208616407,
      "learning_rate": 4.615942404447439e-06,
      "loss": 0.6441,
      "step": 200
    },
    {
      "epoch": 4.077669902912621,
      "grad_norm": 0.5819276993822524,
      "learning_rate": 4.552959886740232e-06,
      "loss": 0.6552,
      "step": 210
    },
    {
      "epoch": 4.271844660194175,
      "grad_norm": 0.7872228459586639,
      "learning_rate": 4.48569752571899e-06,
      "loss": 0.5654,
      "step": 220
    },
    {
      "epoch": 4.466019417475728,
      "grad_norm": 0.6601391677800797,
      "learning_rate": 4.414295544467447e-06,
      "loss": 0.5617,
      "step": 230
    },
    {
      "epoch": 4.660194174757281,
      "grad_norm": 0.760089090328379,
      "learning_rate": 4.338902796011929e-06,
      "loss": 0.5924,
      "step": 240
    },
    {
      "epoch": 4.854368932038835,
      "grad_norm": 0.6323283888532482,
      "learning_rate": 4.259676453004709e-06,
      "loss": 0.5691,
      "step": 250
    },
    {
      "epoch": 5.048543689320389,
      "grad_norm": 0.5670514791770882,
      "learning_rate": 4.176781680063274e-06,
      "loss": 0.62,
      "step": 260
    },
    {
      "epoch": 5.242718446601942,
      "grad_norm": 0.8034918706341889,
      "learning_rate": 4.090391289448612e-06,
      "loss": 0.5791,
      "step": 270
    },
    {
      "epoch": 5.436893203883495,
      "grad_norm": 0.9966745647142712,
      "learning_rate": 4.000685380800299e-06,
      "loss": 0.5215,
      "step": 280
    },
    {
      "epoch": 5.631067961165049,
      "grad_norm": 0.6278542393836921,
      "learning_rate": 3.907850965679467e-06,
      "loss": 0.4971,
      "step": 290
    },
    {
      "epoch": 5.825242718446602,
      "grad_norm": 0.5200880725660694,
      "learning_rate": 3.812081577702351e-06,
      "loss": 0.4876,
      "step": 300
    },
    {
      "epoch": 6.019417475728155,
      "grad_norm": 0.6395614574611697,
      "learning_rate": 3.7135768690771958e-06,
      "loss": 0.5461,
      "step": 310
    },
    {
      "epoch": 6.213592233009709,
      "grad_norm": 0.6094813425270712,
      "learning_rate": 3.612542194385613e-06,
      "loss": 0.4818,
      "step": 320
    },
    {
      "epoch": 6.407766990291262,
      "grad_norm": 0.7863019179749429,
      "learning_rate": 3.509188182476105e-06,
      "loss": 0.4383,
      "step": 330
    },
    {
      "epoch": 6.601941747572815,
      "grad_norm": 0.7166231822803332,
      "learning_rate": 3.40373029736222e-06,
      "loss": 0.4665,
      "step": 340
    },
    {
      "epoch": 6.796116504854369,
      "grad_norm": 0.6263341506429414,
      "learning_rate": 3.29638838904075e-06,
      "loss": 0.4589,
      "step": 350
    },
    {
      "epoch": 6.990291262135923,
      "grad_norm": 0.6199864592752095,
      "learning_rate": 3.1873862351663966e-06,
      "loss": 0.4338,
      "step": 360
    },
    {
      "epoch": 7.184466019417476,
      "grad_norm": 0.5731484414815903,
      "learning_rate": 3.0769510745383603e-06,
      "loss": 0.4006,
      "step": 370
    },
    {
      "epoch": 7.378640776699029,
      "grad_norm": 0.6375447599953608,
      "learning_rate": 2.9653131333714357e-06,
      "loss": 0.4033,
      "step": 380
    },
    {
      "epoch": 7.572815533980583,
      "grad_norm": 0.7108113423728954,
      "learning_rate": 2.8527051453391763e-06,
      "loss": 0.3817,
      "step": 390
    },
    {
      "epoch": 7.766990291262136,
      "grad_norm": 0.5983539359474854,
      "learning_rate": 2.739361866389711e-06,
      "loss": 0.3906,
      "step": 400
    },
    {
      "epoch": 7.961165048543689,
      "grad_norm": 0.6703828114314603,
      "learning_rate": 2.6255195853456994e-06,
      "loss": 0.3998,
      "step": 410
    },
    {
      "epoch": 8.155339805825243,
      "grad_norm": 0.7763432614119512,
      "learning_rate": 2.511415631308664e-06,
      "loss": 0.3636,
      "step": 420
    },
    {
      "epoch": 8.349514563106796,
      "grad_norm": 0.7336274441363543,
      "learning_rate": 2.397287878894637e-06,
      "loss": 0.2913,
      "step": 430
    },
    {
      "epoch": 8.54368932038835,
      "grad_norm": 0.640311797061036,
      "learning_rate": 2.283374252332568e-06,
      "loss": 0.3709,
      "step": 440
    },
    {
      "epoch": 8.737864077669903,
      "grad_norm": 1.0549747152380415,
      "learning_rate": 2.169912229459296e-06,
      "loss": 0.3199,
      "step": 450
    },
    {
      "epoch": 8.932038834951456,
      "grad_norm": 0.7335809666111395,
      "learning_rate": 2.0571383466451237e-06,
      "loss": 0.3302,
      "step": 460
    },
    {
      "epoch": 9.12621359223301,
      "grad_norm": 0.8337042629082712,
      "learning_rate": 1.9452877056820936e-06,
      "loss": 0.3043,
      "step": 470
    },
    {
      "epoch": 9.320388349514563,
      "grad_norm": 0.7768236518159666,
      "learning_rate": 1.8345934836629424e-06,
      "loss": 0.2832,
      "step": 480
    },
    {
      "epoch": 9.514563106796116,
      "grad_norm": 0.8109748052215916,
      "learning_rate": 1.7252864468725218e-06,
      "loss": 0.2376,
      "step": 490
    },
    {
      "epoch": 9.70873786407767,
      "grad_norm": 0.884651680841133,
      "learning_rate": 1.6175944697050678e-06,
      "loss": 0.2854,
      "step": 500
    },
    {
      "epoch": 9.902912621359224,
      "grad_norm": 0.7174797476037638,
      "learning_rate": 1.511742059610255e-06,
      "loss": 0.2807,
      "step": 510
    },
    {
      "epoch": 10.097087378640778,
      "grad_norm": 0.6760588512048147,
      "learning_rate": 1.4079498890583766e-06,
      "loss": 0.2656,
      "step": 520
    },
    {
      "epoch": 10.29126213592233,
      "grad_norm": 0.6390548130600759,
      "learning_rate": 1.3064343355003775e-06,
      "loss": 0.2215,
      "step": 530
    },
    {
      "epoch": 10.485436893203884,
      "grad_norm": 0.7714829355919598,
      "learning_rate": 1.2074070302817962e-06,
      "loss": 0.21,
      "step": 540
    },
    {
      "epoch": 10.679611650485437,
      "grad_norm": 0.7133631416901189,
      "learning_rate": 1.1110744174509952e-06,
      "loss": 0.257,
      "step": 550
    },
    {
      "epoch": 10.87378640776699,
      "grad_norm": 1.0554896728347178,
      "learning_rate": 1.0176373233814509e-06,
      "loss": 0.235,
      "step": 560
    },
    {
      "epoch": 11.067961165048544,
      "grad_norm": 0.7568107855940384,
      "learning_rate": 9.272905381053132e-07,
      "loss": 0.2458,
      "step": 570
    },
    {
      "epoch": 11.262135922330097,
      "grad_norm": 1.1755882782765041,
      "learning_rate": 8.402224092310299e-07,
      "loss": 0.2139,
      "step": 580
    },
    {
      "epoch": 11.45631067961165,
      "grad_norm": 0.7277468096167472,
      "learning_rate": 7.566144492916191e-07,
      "loss": 0.1993,
      "step": 590
    },
    {
      "epoch": 11.650485436893204,
      "grad_norm": 0.7732351944532618,
      "learning_rate": 6.766409573421467e-07,
      "loss": 0.2075,
      "step": 600
    },
    {
      "epoch": 11.844660194174757,
      "grad_norm": 0.7272514601520167,
      "learning_rate": 6.004686555952743e-07,
      "loss": 0.2124,
      "step": 610
    },
    {
      "epoch": 12.03883495145631,
      "grad_norm": 0.5446717472743969,
      "learning_rate": 5.282563418523859e-07,
      "loss": 0.2142,
      "step": 620
    },
    {
      "epoch": 12.233009708737864,
      "grad_norm": 0.601799105236111,
      "learning_rate": 4.6015455845488805e-07,
      "loss": 0.1827,
      "step": 630
    },
    {
      "epoch": 12.427184466019417,
      "grad_norm": 0.8397139799250541,
      "learning_rate": 3.963052784458146e-07,
      "loss": 0.2212,
      "step": 640
    },
    {
      "epoch": 12.62135922330097,
      "grad_norm": 0.6624723438080478,
      "learning_rate": 3.368416095960092e-07,
      "loss": 0.1861,
      "step": 650
    },
    {
      "epoch": 12.815533980582524,
      "grad_norm": 0.8742788293711848,
      "learning_rate": 2.8188751691189813e-07,
      "loss": 0.1602,
      "step": 660
    },
    {
      "epoch": 13.009708737864077,
      "grad_norm": 0.8377328891704509,
      "learning_rate": 2.3155756420336046e-07,
      "loss": 0.2049,
      "step": 670
    },
    {
      "epoch": 13.20388349514563,
      "grad_norm": 0.6166032834335495,
      "learning_rate": 1.8595667525043965e-07,
      "loss": 0.1702,
      "step": 680
    },
    {
      "epoch": 13.398058252427184,
      "grad_norm": 0.5133188237146326,
      "learning_rate": 1.4517991506680762e-07,
      "loss": 0.1805,
      "step": 690
    },
    {
      "epoch": 13.592233009708737,
      "grad_norm": 0.6421575024333711,
      "learning_rate": 1.0931229171597584e-07,
      "loss": 0.172,
      "step": 700
    },
    {
      "epoch": 13.78640776699029,
      "grad_norm": 0.6833017346041286,
      "learning_rate": 7.842857909342166e-08,
      "loss": 0.1824,
      "step": 710
    },
    {
      "epoch": 13.980582524271846,
      "grad_norm": 0.6474937990820275,
      "learning_rate": 5.259316104406637e-08,
      "loss": 0.1891,
      "step": 720
    },
    {
      "epoch": 14.174757281553399,
      "grad_norm": 0.6526458668790724,
      "learning_rate": 3.185989714009186e-08,
      "loss": 0.2009,
      "step": 730
    },
    {
      "epoch": 14.368932038834952,
      "grad_norm": 0.5981773924609198,
      "learning_rate": 1.627201039889309e-08,
      "loss": 0.1903,
      "step": 740
    },
    {
      "epoch": 14.563106796116505,
      "grad_norm": 0.6732767479005811,
      "learning_rate": 5.861997175260759e-09,
      "loss": 0.181,
      "step": 750
    },
    {
      "epoch": 14.757281553398059,
      "grad_norm": 0.5952495060637577,
      "learning_rate": 6.515594156286664e-10,
      "loss": 0.1647,
      "step": 760
    }
  ],
  "logging_steps": 10,
  "max_steps": 765,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 15,
  "save_steps": 255,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 169821882810368.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}