{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 54000, "global_step": 14319, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.028284098051539912, "grad_norm": 0.04601588801994645, "learning_rate": 9.42737430167598e-06, "loss": 0.1706, "step": 135 }, { "epoch": 0.056568196103079824, "grad_norm": 0.08599903987598387, "learning_rate": 1.885474860335196e-05, "loss": 0.1407, "step": 270 }, { "epoch": 0.08485229415461974, "grad_norm": 0.07811249522270598, "learning_rate": 2.8282122905027936e-05, "loss": 0.133, "step": 405 }, { "epoch": 0.11313639220615965, "grad_norm": 0.08106903455078629, "learning_rate": 3.770949720670392e-05, "loss": 0.1302, "step": 540 }, { "epoch": 0.14142049025769957, "grad_norm": 0.07762084811197388, "learning_rate": 4.713687150837989e-05, "loss": 0.128, "step": 675 }, { "epoch": 0.16970458830923948, "grad_norm": 0.06702784826347409, "learning_rate": 5.656424581005587e-05, "loss": 0.1269, "step": 810 }, { "epoch": 0.19798868636077938, "grad_norm": 0.06832201155426712, "learning_rate": 6.599162011173185e-05, "loss": 0.1258, "step": 945 }, { "epoch": 0.2262727844123193, "grad_norm": 0.07124429027326014, "learning_rate": 7.541899441340783e-05, "loss": 0.1251, "step": 1080 }, { "epoch": 0.2545568824638592, "grad_norm": 0.059836090071897996, "learning_rate": 8.48463687150838e-05, "loss": 0.125, "step": 1215 }, { "epoch": 0.28284098051539913, "grad_norm": 0.06065917805260472, "learning_rate": 9.427374301675978e-05, "loss": 0.1244, "step": 1350 }, { "epoch": 0.31112507856693905, "grad_norm": 0.05448515282733843, "learning_rate": 9.999582667896216e-05, "loss": 0.1241, "step": 1485 }, { "epoch": 0.33940917661847897, "grad_norm": 0.05195941534715265, "learning_rate": 9.994749800860066e-05, "loss": 0.1236, "step": 1620 }, { "epoch": 0.36769327467001883, "grad_norm": 0.0533796560627076, "learning_rate": 9.984507669983246e-05, "loss": 0.123, "step": 1755 }, { "epoch": 0.39597737272155875, "grad_norm": 0.05673889785368016, "learning_rate": 9.968867367390571e-05, "loss": 0.1222, "step": 1890 }, { "epoch": 0.42426147077309867, "grad_norm": 0.052072201102599504, "learning_rate": 9.947845831372577e-05, "loss": 0.1217, "step": 2025 }, { "epoch": 0.4525455688246386, "grad_norm": 0.054891014733737455, "learning_rate": 9.921465828041518e-05, "loss": 0.1218, "step": 2160 }, { "epoch": 0.4808296668761785, "grad_norm": 0.05168215652940432, "learning_rate": 9.889755926675904e-05, "loss": 0.121, "step": 2295 }, { "epoch": 0.5091137649277184, "grad_norm": 0.051281710130397264, "learning_rate": 9.85275046878025e-05, "loss": 0.121, "step": 2430 }, { "epoch": 0.5373978629792583, "grad_norm": 0.05193059833614012, "learning_rate": 9.810489530893578e-05, "loss": 0.1205, "step": 2565 }, { "epoch": 0.5656819610307983, "grad_norm": 0.052032831215777654, "learning_rate": 9.763018881186927e-05, "loss": 0.1195, "step": 2700 }, { "epoch": 0.5939660590823381, "grad_norm": 0.055670900157506434, "learning_rate": 9.710389929896887e-05, "loss": 0.1197, "step": 2835 }, { "epoch": 0.6222501571338781, "grad_norm": 0.05191679567872077, "learning_rate": 9.652659673648816e-05, "loss": 0.1191, "step": 2970 }, { "epoch": 0.650534255185418, "grad_norm": 0.053751440231836235, "learning_rate": 9.589890633730087e-05, "loss": 0.1185, "step": 3105 }, { "epoch": 0.6788183532369579, "grad_norm": 0.05742218717400969, "learning_rate": 9.522150788380149e-05, "loss": 0.1181, "step": 3240 }, { "epoch": 0.7071024512884978, "grad_norm": 0.05641344503893988, "learning_rate": 9.449513499170775e-05, "loss": 0.118, "step": 3375 }, { "epoch": 0.7353865493400377, "grad_norm": 0.05184155469686776, "learning_rate": 9.372057431556227e-05, "loss": 0.1177, "step": 3510 }, { "epoch": 0.7636706473915776, "grad_norm": 0.04878199937516276, "learning_rate": 9.289866469679355e-05, "loss": 0.1175, "step": 3645 }, { "epoch": 0.7919547454431175, "grad_norm": 0.05310159239349626, "learning_rate": 9.203029625525912e-05, "loss": 0.1169, "step": 3780 }, { "epoch": 0.8202388434946575, "grad_norm": 0.0534469001920457, "learning_rate": 9.111640942525466e-05, "loss": 0.1175, "step": 3915 }, { "epoch": 0.8485229415461973, "grad_norm": 0.05052535987329732, "learning_rate": 9.015799393703315e-05, "loss": 0.1169, "step": 4050 }, { "epoch": 0.8768070395977373, "grad_norm": 0.05309924206712465, "learning_rate": 8.915608774493695e-05, "loss": 0.1166, "step": 4185 }, { "epoch": 0.9050911376492772, "grad_norm": 0.05471927034944372, "learning_rate": 8.811177590330367e-05, "loss": 0.1158, "step": 4320 }, { "epoch": 0.933375235700817, "grad_norm": 0.051718680639674705, "learning_rate": 8.702618939136322e-05, "loss": 0.1156, "step": 4455 }, { "epoch": 0.961659333752357, "grad_norm": 0.055160448975554825, "learning_rate": 8.590050388839863e-05, "loss": 0.1155, "step": 4590 }, { "epoch": 0.9899434318038969, "grad_norm": 0.05086520628842916, "learning_rate": 8.473593850049731e-05, "loss": 0.1155, "step": 4725 }, { "epoch": 1.0182275298554369, "grad_norm": 0.05154998381506173, "learning_rate": 8.353375444027128e-05, "loss": 0.1066, "step": 4860 }, { "epoch": 1.0465116279069768, "grad_norm": 0.05189766725554911, "learning_rate": 8.22952536609767e-05, "loss": 0.1021, "step": 4995 }, { "epoch": 1.0747957259585166, "grad_norm": 0.052617917188326715, "learning_rate": 8.102177744651149e-05, "loss": 0.1024, "step": 5130 }, { "epoch": 1.1030798240100566, "grad_norm": 0.05214363158387452, "learning_rate": 7.971470495881836e-05, "loss": 0.1025, "step": 5265 }, { "epoch": 1.1313639220615965, "grad_norm": 0.054295844912421495, "learning_rate": 7.837545174426639e-05, "loss": 0.1023, "step": 5400 }, { "epoch": 1.1596480201131363, "grad_norm": 0.05197457231465077, "learning_rate": 7.700546820062839e-05, "loss": 0.1025, "step": 5535 }, { "epoch": 1.1879321181646763, "grad_norm": 0.056484265602417545, "learning_rate": 7.560623800631472e-05, "loss": 0.1023, "step": 5670 }, { "epoch": 1.2162162162162162, "grad_norm": 0.052111946846749885, "learning_rate": 7.417927651356462e-05, "loss": 0.1024, "step": 5805 }, { "epoch": 1.2445003142677562, "grad_norm": 0.05566575920944282, "learning_rate": 7.272612910733475e-05, "loss": 0.1017, "step": 5940 }, { "epoch": 1.2727844123192962, "grad_norm": 0.05811617199922452, "learning_rate": 7.124836953166298e-05, "loss": 0.1019, "step": 6075 }, { "epoch": 1.301068510370836, "grad_norm": 0.061671271903986996, "learning_rate": 6.974759818531935e-05, "loss": 0.1019, "step": 6210 }, { "epoch": 1.329352608422376, "grad_norm": 0.05466121007796382, "learning_rate": 6.822544038859025e-05, "loss": 0.1016, "step": 6345 }, { "epoch": 1.3576367064739157, "grad_norm": 0.05425557430418602, "learning_rate": 6.668354462307296e-05, "loss": 0.1011, "step": 6480 }, { "epoch": 1.3859208045254556, "grad_norm": 0.05874672603708157, "learning_rate": 6.512358074638657e-05, "loss": 0.1012, "step": 6615 }, { "epoch": 1.4142049025769956, "grad_norm": 0.05823531384414933, "learning_rate": 6.354723818373301e-05, "loss": 0.1008, "step": 6750 }, { "epoch": 1.4424890006285356, "grad_norm": 0.05608262826782312, "learning_rate": 6.195622409826653e-05, "loss": 0.1007, "step": 6885 }, { "epoch": 1.4707730986800756, "grad_norm": 0.05408687704162592, "learning_rate": 6.035226154225313e-05, "loss": 0.1002, "step": 7020 }, { "epoch": 1.4990571967316153, "grad_norm": 0.054980789427209784, "learning_rate": 5.8737087591022275e-05, "loss": 0.1004, "step": 7155 }, { "epoch": 1.5273412947831553, "grad_norm": 0.05745041355343903, "learning_rate": 5.7112451461731854e-05, "loss": 0.0999, "step": 7290 }, { "epoch": 1.555625392834695, "grad_norm": 0.05803905423764401, "learning_rate": 5.5480112618983404e-05, "loss": 0.0995, "step": 7425 }, { "epoch": 1.583909490886235, "grad_norm": 0.056971104280436516, "learning_rate": 5.384183886933983e-05, "loss": 0.0997, "step": 7560 }, { "epoch": 1.612193588937775, "grad_norm": 0.056543402577003486, "learning_rate": 5.2199404446808475e-05, "loss": 0.0988, "step": 7695 }, { "epoch": 1.640477686989315, "grad_norm": 0.05414144319537392, "learning_rate": 5.0554588091363683e-05, "loss": 0.0988, "step": 7830 }, { "epoch": 1.668761785040855, "grad_norm": 0.05756652870031753, "learning_rate": 4.890917112258916e-05, "loss": 0.0988, "step": 7965 }, { "epoch": 1.6970458830923947, "grad_norm": 0.054317396460825465, "learning_rate": 4.726493551052682e-05, "loss": 0.0985, "step": 8100 }, { "epoch": 1.7253299811439347, "grad_norm": 0.05780938044176143, "learning_rate": 4.562366194582113e-05, "loss": 0.0979, "step": 8235 }, { "epoch": 1.7536140791954744, "grad_norm": 0.05615442700243257, "learning_rate": 4.398712791124905e-05, "loss": 0.0976, "step": 8370 }, { "epoch": 1.7818981772470144, "grad_norm": 0.0550653325962579, "learning_rate": 4.235710575672401e-05, "loss": 0.0975, "step": 8505 }, { "epoch": 1.8101822752985544, "grad_norm": 0.055514099512198385, "learning_rate": 4.073536077985884e-05, "loss": 0.0974, "step": 8640 }, { "epoch": 1.8384663733500943, "grad_norm": 0.05542114420833896, "learning_rate": 3.9123649314166065e-05, "loss": 0.0968, "step": 8775 }, { "epoch": 1.8667504714016343, "grad_norm": 0.05466973411282308, "learning_rate": 3.752371682696652e-05, "loss": 0.0966, "step": 8910 }, { "epoch": 1.895034569453174, "grad_norm": 0.05615670182195563, "learning_rate": 3.5937296029065625e-05, "loss": 0.0967, "step": 9045 }, { "epoch": 1.923318667504714, "grad_norm": 0.05525832187066413, "learning_rate": 3.4366104998245154e-05, "loss": 0.096, "step": 9180 }, { "epoch": 1.9516027655562538, "grad_norm": 0.05493108241819906, "learning_rate": 3.28118453186021e-05, "loss": 0.0957, "step": 9315 }, { "epoch": 1.9798868636077938, "grad_norm": 0.05505518318771863, "learning_rate": 3.1276200237750355e-05, "loss": 0.0955, "step": 9450 }, { "epoch": 2.0081709616593337, "grad_norm": 0.05306378687028771, "learning_rate": 2.976083284388031e-05, "loss": 0.0906, "step": 9585 }, { "epoch": 2.0364550597108737, "grad_norm": 0.05130179788880526, "learning_rate": 2.8267384264651188e-05, "loss": 0.0784, "step": 9720 }, { "epoch": 2.0647391577624137, "grad_norm": 0.05335109388781897, "learning_rate": 2.679747188986622e-05, "loss": 0.0782, "step": 9855 }, { "epoch": 2.0930232558139537, "grad_norm": 0.0513888628362633, "learning_rate": 2.53526876198557e-05, "loss": 0.0778, "step": 9990 }, { "epoch": 2.121307353865493, "grad_norm": 0.055136967997034346, "learning_rate": 2.3934596141465028e-05, "loss": 0.0778, "step": 10125 }, { "epoch": 2.149591451917033, "grad_norm": 0.05620080185680983, "learning_rate": 2.254473323351446e-05, "loss": 0.0777, "step": 10260 }, { "epoch": 2.177875549968573, "grad_norm": 0.05464669812207657, "learning_rate": 2.1184604103566198e-05, "loss": 0.0774, "step": 10395 }, { "epoch": 2.206159648020113, "grad_norm": 0.05398354993752342, "learning_rate": 1.9855681757799664e-05, "loss": 0.0774, "step": 10530 }, { "epoch": 2.234443746071653, "grad_norm": 0.055897809997969714, "learning_rate": 1.8559405405760584e-05, "loss": 0.0772, "step": 10665 }, { "epoch": 2.262727844123193, "grad_norm": 0.05732086913703312, "learning_rate": 1.729717890171157e-05, "loss": 0.0767, "step": 10800 }, { "epoch": 2.291011942174733, "grad_norm": 0.0565406180756469, "learning_rate": 1.607036922427203e-05, "loss": 0.0765, "step": 10935 }, { "epoch": 2.3192960402262726, "grad_norm": 0.057074660751889154, "learning_rate": 1.4880304995994099e-05, "loss": 0.0765, "step": 11070 }, { "epoch": 2.3475801382778125, "grad_norm": 0.05609805473293312, "learning_rate": 1.3728275044477673e-05, "loss": 0.0762, "step": 11205 }, { "epoch": 2.3758642363293525, "grad_norm": 0.05516011004588835, "learning_rate": 1.2615527006583178e-05, "loss": 0.0763, "step": 11340 }, { "epoch": 2.4041483343808925, "grad_norm": 0.05722853773387842, "learning_rate": 1.1543265977253332e-05, "loss": 0.0762, "step": 11475 }, { "epoch": 2.4324324324324325, "grad_norm": 0.0587961606821977, "learning_rate": 1.0512653204407463e-05, "loss": 0.0757, "step": 11610 }, { "epoch": 2.4607165304839724, "grad_norm": 0.0569771841520313, "learning_rate": 9.524804831321604e-06, "loss": 0.0759, "step": 11745 }, { "epoch": 2.4890006285355124, "grad_norm": 0.059353282770618576, "learning_rate": 8.580790687856661e-06, "loss": 0.0756, "step": 11880 }, { "epoch": 2.517284726587052, "grad_norm": 0.05454050877182314, "learning_rate": 7.68163313184333e-06, "loss": 0.0754, "step": 12015 }, { "epoch": 2.5455688246385924, "grad_norm": 0.05815652945221045, "learning_rate": 6.828305941878904e-06, "loss": 0.0752, "step": 12150 }, { "epoch": 2.573852922690132, "grad_norm": 0.057648681945551325, "learning_rate": 6.021733262734758e-06, "loss": 0.075, "step": 12285 }, { "epoch": 2.602137020741672, "grad_norm": 0.05540948539481964, "learning_rate": 5.262788604516944e-06, "loss": 0.075, "step": 12420 }, { "epoch": 2.630421118793212, "grad_norm": 0.06718456332465579, "learning_rate": 4.552293896663451e-06, "loss": 0.075, "step": 12555 }, { "epoch": 2.658705216844752, "grad_norm": 0.07105890518243332, "learning_rate": 3.8910185978029314e-06, "loss": 0.0748, "step": 12690 }, { "epoch": 2.686989314896292, "grad_norm": 0.05567353880578137, "learning_rate": 3.2796788624387066e-06, "loss": 0.0748, "step": 12825 }, { "epoch": 2.7152734129478313, "grad_norm": 0.05612203710501599, "learning_rate": 2.71893676536063e-06, "loss": 0.0749, "step": 12960 }, { "epoch": 2.7435575109993717, "grad_norm": 0.05673870284934767, "learning_rate": 2.209399584624794e-06, "loss": 0.0748, "step": 13095 }, { "epoch": 2.7718416090509113, "grad_norm": 0.05761628274356501, "learning_rate": 1.7516191438774588e-06, "loss": 0.0747, "step": 13230 }, { "epoch": 2.8001257071024512, "grad_norm": 0.054886010400605305, "learning_rate": 1.3460912147355787e-06, "loss": 0.0746, "step": 13365 }, { "epoch": 2.828409805153991, "grad_norm": 0.05340561070795086, "learning_rate": 9.932549798711443e-07, "loss": 0.0746, "step": 13500 }, { "epoch": 2.856693903205531, "grad_norm": 0.059674346591722494, "learning_rate": 6.934925573807704e-07, "loss": 0.0747, "step": 13635 }, { "epoch": 2.884978001257071, "grad_norm": 0.058825608417044505, "learning_rate": 4.4712858695560856e-07, "loss": 0.0742, "step": 13770 }, { "epoch": 2.9132620993086107, "grad_norm": 0.05775417712295375, "learning_rate": 2.5442987829985556e-07, "loss": 0.0745, "step": 13905 }, { "epoch": 2.941546197360151, "grad_norm": 0.056228990397294835, "learning_rate": 1.1560512217849707e-07, "loss": 0.0747, "step": 14040 }, { "epoch": 2.9698302954116906, "grad_norm": 0.056212393072501816, "learning_rate": 3.080466440732455e-08, "loss": 0.0743, "step": 14175 }, { "epoch": 2.9981143934632306, "grad_norm": 0.056538060201727615, "learning_rate": 1.2034302991903445e-10, "loss": 0.0745, "step": 14310 } ], "logging_steps": 135, "max_steps": 14319, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 54000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.413317465141412e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }