{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.994675460468109, "eval_steps": 500, "global_step": 62000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016086222150727903, "grad_norm": 0.5078127384185791, "learning_rate": 4.959864230101023e-05, "loss": 2.1432, "step": 500 }, { "epoch": 0.032172444301455806, "grad_norm": 0.4508506655693054, "learning_rate": 4.9196480277974395e-05, "loss": 1.9093, "step": 1000 }, { "epoch": 0.048258666452183706, "grad_norm": 0.4430558979511261, "learning_rate": 4.879431825493855e-05, "loss": 1.8418, "step": 1500 }, { "epoch": 0.06434488860291161, "grad_norm": 0.4775325059890747, "learning_rate": 4.8392156231902713e-05, "loss": 1.7771, "step": 2000 }, { "epoch": 0.08043111075363951, "grad_norm": 0.49685001373291016, "learning_rate": 4.7989994208866876e-05, "loss": 1.7226, "step": 2500 }, { "epoch": 0.09651733290436741, "grad_norm": 0.5552434325218201, "learning_rate": 4.7587832185831025e-05, "loss": 1.6767, "step": 3000 }, { "epoch": 0.11260355505509531, "grad_norm": 0.6779139637947083, "learning_rate": 4.718567016279519e-05, "loss": 1.6588, "step": 3500 }, { "epoch": 0.12868977720582322, "grad_norm": 0.5552022457122803, "learning_rate": 4.6783508139759344e-05, "loss": 1.603, "step": 4000 }, { "epoch": 0.1447759993565511, "grad_norm": 0.5302042365074158, "learning_rate": 4.638134611672351e-05, "loss": 1.5776, "step": 4500 }, { "epoch": 0.16086222150727902, "grad_norm": 0.5810815691947937, "learning_rate": 4.597918409368766e-05, "loss": 1.5333, "step": 5000 }, { "epoch": 0.1769484436580069, "grad_norm": 0.5819700956344604, "learning_rate": 4.5577022070651826e-05, "loss": 1.5168, "step": 5500 }, { "epoch": 0.19303466580873482, "grad_norm": 0.6134072542190552, "learning_rate": 4.517486004761599e-05, "loss": 1.4748, "step": 6000 }, { "epoch": 0.2091208879594627, "grad_norm": 0.5746152400970459, "learning_rate": 4.4772698024580144e-05, "loss": 1.4622, "step": 6500 }, { "epoch": 0.22520711011019062, "grad_norm": 0.7663710713386536, "learning_rate": 4.437053600154431e-05, "loss": 1.4767, "step": 7000 }, { "epoch": 0.24129333226091854, "grad_norm": 0.7993176579475403, "learning_rate": 4.396837397850846e-05, "loss": 1.4527, "step": 7500 }, { "epoch": 0.25737955441164645, "grad_norm": 0.6892676949501038, "learning_rate": 4.3566211955472626e-05, "loss": 1.4325, "step": 8000 }, { "epoch": 0.2734657765623743, "grad_norm": 0.6928556561470032, "learning_rate": 4.316404993243678e-05, "loss": 1.4038, "step": 8500 }, { "epoch": 0.2895519987131022, "grad_norm": 0.7578593492507935, "learning_rate": 4.2761887909400944e-05, "loss": 1.3945, "step": 9000 }, { "epoch": 0.30563822086383013, "grad_norm": 0.7504703402519226, "learning_rate": 4.23597258863651e-05, "loss": 1.3644, "step": 9500 }, { "epoch": 0.32172444301455805, "grad_norm": 0.8370710611343384, "learning_rate": 4.1957563863329256e-05, "loss": 1.3619, "step": 10000 }, { "epoch": 0.3378106651652859, "grad_norm": 0.8501142263412476, "learning_rate": 4.155540184029342e-05, "loss": 1.3448, "step": 10500 }, { "epoch": 0.3538968873160138, "grad_norm": 0.9001900553703308, "learning_rate": 4.1153239817257575e-05, "loss": 1.3004, "step": 11000 }, { "epoch": 0.36998310946674173, "grad_norm": 1.0658681392669678, "learning_rate": 4.075107779422174e-05, "loss": 1.2789, "step": 11500 }, { "epoch": 0.38606933161746965, "grad_norm": 1.1038371324539185, "learning_rate": 4.0348915771185894e-05, "loss": 1.2651, "step": 12000 }, { "epoch": 0.40215555376819756, "grad_norm": 1.2004213333129883, "learning_rate": 3.994755807219613e-05, "loss": 1.2216, "step": 12500 }, { "epoch": 0.4182417759189254, "grad_norm": 1.235543966293335, "learning_rate": 3.9545396049160286e-05, "loss": 1.1955, "step": 13000 }, { "epoch": 0.43432799806965333, "grad_norm": 1.5088828802108765, "learning_rate": 3.914323402612445e-05, "loss": 1.1836, "step": 13500 }, { "epoch": 0.45041422022038125, "grad_norm": 1.264153242111206, "learning_rate": 3.8741072003088605e-05, "loss": 1.1658, "step": 14000 }, { "epoch": 0.46650044237110916, "grad_norm": 1.3023343086242676, "learning_rate": 3.833971430409884e-05, "loss": 1.1481, "step": 14500 }, { "epoch": 0.48258666452183707, "grad_norm": 1.3824670314788818, "learning_rate": 3.7938356605109064e-05, "loss": 1.1221, "step": 15000 }, { "epoch": 0.49867288667256493, "grad_norm": 1.4364969730377197, "learning_rate": 3.75369989061193e-05, "loss": 1.1057, "step": 15500 }, { "epoch": 0.5147591088232929, "grad_norm": 2.051701545715332, "learning_rate": 3.7134836883083456e-05, "loss": 1.0873, "step": 16000 }, { "epoch": 0.5308453309740208, "grad_norm": 1.4329720735549927, "learning_rate": 3.673267486004762e-05, "loss": 1.0607, "step": 16500 }, { "epoch": 0.5469315531247486, "grad_norm": 1.4981014728546143, "learning_rate": 3.6330512837011775e-05, "loss": 1.0516, "step": 17000 }, { "epoch": 0.5630177752754766, "grad_norm": 1.3012079000473022, "learning_rate": 3.592835081397594e-05, "loss": 1.0317, "step": 17500 }, { "epoch": 0.5791039974262044, "grad_norm": 1.401825189590454, "learning_rate": 3.552699311498617e-05, "loss": 1.0183, "step": 18000 }, { "epoch": 0.5951902195769324, "grad_norm": 2.0783369541168213, "learning_rate": 3.512483109195033e-05, "loss": 0.9985, "step": 18500 }, { "epoch": 0.6112764417276603, "grad_norm": 2.3940794467926025, "learning_rate": 3.4722669068914486e-05, "loss": 0.9698, "step": 19000 }, { "epoch": 0.6273626638783881, "grad_norm": 1.4747998714447021, "learning_rate": 3.432050704587865e-05, "loss": 0.9657, "step": 19500 }, { "epoch": 0.6434488860291161, "grad_norm": 3.0782012939453125, "learning_rate": 3.391914934688888e-05, "loss": 0.9379, "step": 20000 }, { "epoch": 0.659535108179844, "grad_norm": 2.4914307594299316, "learning_rate": 3.3516987323853034e-05, "loss": 0.915, "step": 20500 }, { "epoch": 0.6756213303305718, "grad_norm": 2.772120237350464, "learning_rate": 3.3115629624863264e-05, "loss": 0.9047, "step": 21000 }, { "epoch": 0.6917075524812998, "grad_norm": 2.519575595855713, "learning_rate": 3.271346760182743e-05, "loss": 0.8688, "step": 21500 }, { "epoch": 0.7077937746320276, "grad_norm": 4.085098743438721, "learning_rate": 3.231130557879158e-05, "loss": 0.8581, "step": 22000 }, { "epoch": 0.7238799967827556, "grad_norm": 1.4670002460479736, "learning_rate": 3.1909143555755745e-05, "loss": 0.8354, "step": 22500 }, { "epoch": 0.7399662189334835, "grad_norm": 2.4749488830566406, "learning_rate": 3.1507785856765975e-05, "loss": 0.8108, "step": 23000 }, { "epoch": 0.7560524410842113, "grad_norm": 1.8635029792785645, "learning_rate": 3.110562383373014e-05, "loss": 0.7773, "step": 23500 }, { "epoch": 0.7721386632349393, "grad_norm": 3.5713748931884766, "learning_rate": 3.0703461810694294e-05, "loss": 0.756, "step": 24000 }, { "epoch": 0.7882248853856672, "grad_norm": 1.8903526067733765, "learning_rate": 3.0301299787658456e-05, "loss": 0.7326, "step": 24500 }, { "epoch": 0.8043111075363951, "grad_norm": 8.286703109741211, "learning_rate": 2.9899942088668686e-05, "loss": 0.6948, "step": 25000 }, { "epoch": 0.820397329687123, "grad_norm": 2.2209272384643555, "learning_rate": 2.9497780065632845e-05, "loss": 0.6914, "step": 25500 }, { "epoch": 0.8364835518378508, "grad_norm": 2.2284536361694336, "learning_rate": 2.9095618042597e-05, "loss": 0.6585, "step": 26000 }, { "epoch": 0.8525697739885788, "grad_norm": 3.4615938663482666, "learning_rate": 2.869345601956116e-05, "loss": 0.633, "step": 26500 }, { "epoch": 0.8686559961393067, "grad_norm": 3.1158838272094727, "learning_rate": 2.829209832057139e-05, "loss": 0.6181, "step": 27000 }, { "epoch": 0.8847422182900346, "grad_norm": 2.3320417404174805, "learning_rate": 2.7889936297535553e-05, "loss": 0.5993, "step": 27500 }, { "epoch": 0.9008284404407625, "grad_norm": 1.8331427574157715, "learning_rate": 2.7487774274499712e-05, "loss": 0.5839, "step": 28000 }, { "epoch": 0.9169146625914903, "grad_norm": 3.2398369312286377, "learning_rate": 2.708561225146387e-05, "loss": 0.562, "step": 28500 }, { "epoch": 0.9330008847422183, "grad_norm": 1.6575061082839966, "learning_rate": 2.66842545524741e-05, "loss": 0.5313, "step": 29000 }, { "epoch": 0.9490871068929462, "grad_norm": 2.1604230403900146, "learning_rate": 2.6282092529438264e-05, "loss": 0.5203, "step": 29500 }, { "epoch": 0.9651733290436741, "grad_norm": 3.3743808269500732, "learning_rate": 2.5879930506402423e-05, "loss": 0.4938, "step": 30000 }, { "epoch": 0.981259551194402, "grad_norm": 3.766514301300049, "learning_rate": 2.5477768483366583e-05, "loss": 0.4724, "step": 30500 }, { "epoch": 0.9973457733451299, "grad_norm": 2.26712703704834, "learning_rate": 2.5075606460330742e-05, "loss": 0.4656, "step": 31000 }, { "epoch": 1.0, "eval_loss": 0.26554691791534424, "eval_runtime": 1917.4803, "eval_samples_per_second": 345.81, "eval_steps_per_second": 43.227, "step": 31083 }, { "epoch": 1.0134159092737072, "grad_norm": 2.1041958332061768, "learning_rate": 2.467424876134097e-05, "loss": 0.4381, "step": 31500 }, { "epoch": 1.029502131424435, "grad_norm": 1.7629106044769287, "learning_rate": 2.427208673830513e-05, "loss": 0.4298, "step": 32000 }, { "epoch": 1.0455883535751629, "grad_norm": 2.5032904148101807, "learning_rate": 2.386992471526929e-05, "loss": 0.4188, "step": 32500 }, { "epoch": 1.0616745757258907, "grad_norm": 1.6467881202697754, "learning_rate": 2.3467762692233446e-05, "loss": 0.3986, "step": 33000 }, { "epoch": 1.0777607978766186, "grad_norm": 1.957220435142517, "learning_rate": 2.3065600669197606e-05, "loss": 0.382, "step": 33500 }, { "epoch": 1.0938470200273467, "grad_norm": 1.6566946506500244, "learning_rate": 2.2663438646161765e-05, "loss": 0.3689, "step": 34000 }, { "epoch": 1.1099332421780745, "grad_norm": 2.081613540649414, "learning_rate": 2.2261276623125928e-05, "loss": 0.3603, "step": 34500 }, { "epoch": 1.1260194643288024, "grad_norm": 2.155226945877075, "learning_rate": 2.1859918924136157e-05, "loss": 0.3478, "step": 35000 }, { "epoch": 1.1421056864795303, "grad_norm": 1.9459590911865234, "learning_rate": 2.1457756901100317e-05, "loss": 0.3315, "step": 35500 }, { "epoch": 1.1581919086302581, "grad_norm": 2.3381567001342773, "learning_rate": 2.1055594878064476e-05, "loss": 0.3259, "step": 36000 }, { "epoch": 1.1742781307809862, "grad_norm": 1.4302254915237427, "learning_rate": 2.0653432855028635e-05, "loss": 0.3168, "step": 36500 }, { "epoch": 1.190364352931714, "grad_norm": 1.1770597696304321, "learning_rate": 2.0251270831992795e-05, "loss": 0.3082, "step": 37000 }, { "epoch": 1.206450575082442, "grad_norm": 1.7475298643112183, "learning_rate": 1.9849913133003024e-05, "loss": 0.3014, "step": 37500 }, { "epoch": 1.2225367972331698, "grad_norm": 1.2397468090057373, "learning_rate": 1.9447751109967187e-05, "loss": 0.288, "step": 38000 }, { "epoch": 1.2386230193838976, "grad_norm": 1.6603740453720093, "learning_rate": 1.9045589086931343e-05, "loss": 0.2797, "step": 38500 }, { "epoch": 1.2547092415346257, "grad_norm": 1.7009538412094116, "learning_rate": 1.8643427063895502e-05, "loss": 0.275, "step": 39000 }, { "epoch": 1.2707954636853536, "grad_norm": 1.4941717386245728, "learning_rate": 1.8241265040859662e-05, "loss": 0.2623, "step": 39500 }, { "epoch": 1.2868816858360814, "grad_norm": 1.941115140914917, "learning_rate": 1.7839907341869895e-05, "loss": 0.2572, "step": 40000 }, { "epoch": 1.3029679079868093, "grad_norm": 1.487726092338562, "learning_rate": 1.7437745318834054e-05, "loss": 0.2502, "step": 40500 }, { "epoch": 1.3190541301375371, "grad_norm": 1.4628674983978271, "learning_rate": 1.7035583295798213e-05, "loss": 0.2437, "step": 41000 }, { "epoch": 1.3351403522882652, "grad_norm": 1.401607632637024, "learning_rate": 1.663342127276237e-05, "loss": 0.2421, "step": 41500 }, { "epoch": 1.351226574438993, "grad_norm": 1.1497563123703003, "learning_rate": 1.623125924972653e-05, "loss": 0.231, "step": 42000 }, { "epoch": 1.367312796589721, "grad_norm": 1.322836995124817, "learning_rate": 1.5829097226690688e-05, "loss": 0.2261, "step": 42500 }, { "epoch": 1.3833990187404488, "grad_norm": 1.5328525304794312, "learning_rate": 1.542773952770092e-05, "loss": 0.2177, "step": 43000 }, { "epoch": 1.3994852408911767, "grad_norm": 1.7748241424560547, "learning_rate": 1.502557750466508e-05, "loss": 0.2186, "step": 43500 }, { "epoch": 1.4155714630419047, "grad_norm": 1.6542141437530518, "learning_rate": 1.4623415481629241e-05, "loss": 0.2138, "step": 44000 }, { "epoch": 1.4316576851926326, "grad_norm": 1.3098843097686768, "learning_rate": 1.4221253458593397e-05, "loss": 0.211, "step": 44500 }, { "epoch": 1.4477439073433604, "grad_norm": 1.345651626586914, "learning_rate": 1.3819091435557557e-05, "loss": 0.2027, "step": 45000 }, { "epoch": 1.4638301294940883, "grad_norm": 1.4520297050476074, "learning_rate": 1.3416929412521718e-05, "loss": 0.2039, "step": 45500 }, { "epoch": 1.4799163516448162, "grad_norm": 1.5913499593734741, "learning_rate": 1.3014767389485877e-05, "loss": 0.1939, "step": 46000 }, { "epoch": 1.4960025737955442, "grad_norm": 1.1803226470947266, "learning_rate": 1.2612605366450037e-05, "loss": 0.1887, "step": 46500 }, { "epoch": 1.5120887959462719, "grad_norm": 1.1462236642837524, "learning_rate": 1.2210443343414194e-05, "loss": 0.1883, "step": 47000 }, { "epoch": 1.528175018097, "grad_norm": 0.8483968377113342, "learning_rate": 1.1808281320378355e-05, "loss": 0.1809, "step": 47500 }, { "epoch": 1.5442612402477278, "grad_norm": 1.1205823421478271, "learning_rate": 1.1406119297342515e-05, "loss": 0.1813, "step": 48000 }, { "epoch": 1.5603474623984557, "grad_norm": 1.417622447013855, "learning_rate": 1.1003957274306672e-05, "loss": 0.1788, "step": 48500 }, { "epoch": 1.5764336845491838, "grad_norm": 1.179103970527649, "learning_rate": 1.0602599575316904e-05, "loss": 0.1809, "step": 49000 }, { "epoch": 1.5925199066999114, "grad_norm": 1.1092889308929443, "learning_rate": 1.0200437552281065e-05, "loss": 0.1734, "step": 49500 }, { "epoch": 1.6086061288506395, "grad_norm": 1.0196574926376343, "learning_rate": 9.798275529245222e-06, "loss": 0.1688, "step": 50000 }, { "epoch": 1.6246923510013673, "grad_norm": 1.1376862525939941, "learning_rate": 9.396113506209382e-06, "loss": 0.1703, "step": 50500 }, { "epoch": 1.6407785731520952, "grad_norm": 0.8885149955749512, "learning_rate": 8.995560131265685e-06, "loss": 0.1691, "step": 51000 }, { "epoch": 1.6568647953028233, "grad_norm": 1.2574944496154785, "learning_rate": 8.593398108229844e-06, "loss": 0.1615, "step": 51500 }, { "epoch": 1.672951017453551, "grad_norm": 1.2620723247528076, "learning_rate": 8.191236085194004e-06, "loss": 0.1593, "step": 52000 }, { "epoch": 1.689037239604279, "grad_norm": 1.551480770111084, "learning_rate": 7.789074062158163e-06, "loss": 0.1639, "step": 52500 }, { "epoch": 1.7051234617550068, "grad_norm": 1.5938962697982788, "learning_rate": 7.386912039122322e-06, "loss": 0.1587, "step": 53000 }, { "epoch": 1.7212096839057347, "grad_norm": 1.0503953695297241, "learning_rate": 6.984750016086482e-06, "loss": 0.1599, "step": 53500 }, { "epoch": 1.7372959060564628, "grad_norm": 1.1205036640167236, "learning_rate": 6.583392317096712e-06, "loss": 0.1541, "step": 54000 }, { "epoch": 1.7533821282071904, "grad_norm": 0.7524433732032776, "learning_rate": 6.181230294060872e-06, "loss": 0.1521, "step": 54500 }, { "epoch": 1.7694683503579185, "grad_norm": 0.9619775414466858, "learning_rate": 5.779068271025031e-06, "loss": 0.1521, "step": 55000 }, { "epoch": 1.7855545725086464, "grad_norm": 0.9406844973564148, "learning_rate": 5.37690624798919e-06, "loss": 0.1509, "step": 55500 }, { "epoch": 1.8016407946593742, "grad_norm": 0.9363726377487183, "learning_rate": 4.975548548999421e-06, "loss": 0.1513, "step": 56000 }, { "epoch": 1.8177270168101023, "grad_norm": 0.9941402673721313, "learning_rate": 4.573386525963581e-06, "loss": 0.1484, "step": 56500 }, { "epoch": 1.83381323896083, "grad_norm": 1.3756345510482788, "learning_rate": 4.17122450292774e-06, "loss": 0.1509, "step": 57000 }, { "epoch": 1.849899461111558, "grad_norm": 1.0644595623016357, "learning_rate": 3.7690624798918986e-06, "loss": 0.1486, "step": 57500 }, { "epoch": 1.8659856832622859, "grad_norm": 1.070890188217163, "learning_rate": 3.3669004568560584e-06, "loss": 0.1462, "step": 58000 }, { "epoch": 1.8820719054130137, "grad_norm": 1.3034768104553223, "learning_rate": 2.9647384338202173e-06, "loss": 0.1481, "step": 58500 }, { "epoch": 1.8981581275637418, "grad_norm": 1.127517580986023, "learning_rate": 2.5625764107843767e-06, "loss": 0.1451, "step": 59000 }, { "epoch": 1.9142443497144694, "grad_norm": 0.9431403279304504, "learning_rate": 2.1604143877485364e-06, "loss": 0.1458, "step": 59500 }, { "epoch": 1.9303305718651975, "grad_norm": 1.271483302116394, "learning_rate": 1.7590566887587673e-06, "loss": 0.1463, "step": 60000 }, { "epoch": 1.9464167940159254, "grad_norm": 0.7327952980995178, "learning_rate": 1.3568946657229264e-06, "loss": 0.1434, "step": 60500 }, { "epoch": 1.9625030161666532, "grad_norm": 1.0670543909072876, "learning_rate": 9.547326426870858e-07, "loss": 0.1424, "step": 61000 }, { "epoch": 1.9785892383173813, "grad_norm": 1.2705425024032593, "learning_rate": 5.525706196512451e-07, "loss": 0.1431, "step": 61500 }, { "epoch": 1.994675460468109, "grad_norm": 0.9267213344573975, "learning_rate": 1.5040859661540443e-07, "loss": 0.1418, "step": 62000 } ], "logging_steps": 500, "max_steps": 62164, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.3425693542685082e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }