{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0037833671968051565, "eval_steps": 102, "global_step": 81, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 4.6708236997594524e-05, "grad_norm": 0.24815663695335388, "learning_rate": 0.0, "loss": 1.5212, "mean_token_accuracy": 0.6624278724193573, "num_tokens": 6610.0, "step": 1 }, { "epoch": 9.341647399518905e-05, "grad_norm": 0.22391362488269806, "learning_rate": 1.0000000000000002e-06, "loss": 1.3169, "mean_token_accuracy": 0.6899301111698151, "num_tokens": 13855.0, "step": 2 }, { "epoch": 0.00014012471099278357, "grad_norm": 0.24716292321681976, "learning_rate": 2.0000000000000003e-06, "loss": 1.5019, "mean_token_accuracy": 0.6771045923233032, "num_tokens": 19204.0, "step": 3 }, { "epoch": 0.0001868329479903781, "grad_norm": 0.20678554475307465, "learning_rate": 3e-06, "loss": 1.2039, "mean_token_accuracy": 0.7312012910842896, "num_tokens": 27787.0, "step": 4 }, { "epoch": 0.00023354118498797262, "grad_norm": 0.16259518265724182, "learning_rate": 4.000000000000001e-06, "loss": 1.234, "mean_token_accuracy": 0.7289687991142273, "num_tokens": 38962.0, "step": 5 }, { "epoch": 0.00028024942198556714, "grad_norm": 0.18290941417217255, "learning_rate": 5e-06, "loss": 1.3001, "mean_token_accuracy": 0.685863584280014, "num_tokens": 46738.0, "step": 6 }, { "epoch": 0.00032695765898316167, "grad_norm": 0.22102956473827362, "learning_rate": 4.998754476483098e-06, "loss": 1.1942, "mean_token_accuracy": 0.7294453680515289, "num_tokens": 54364.0, "step": 7 }, { "epoch": 0.0003736658959807562, "grad_norm": 0.23882372677326202, "learning_rate": 4.9950192123145654e-06, "loss": 1.3233, "mean_token_accuracy": 0.7093064486980438, "num_tokens": 60646.0, "step": 8 }, { "epoch": 0.0004203741329783507, "grad_norm": 0.20760418474674225, "learning_rate": 4.988798125270709e-06, "loss": 1.8058, "mean_token_accuracy": 0.6353311538696289, "num_tokens": 70548.0, "step": 9 }, { "epoch": 0.00046708236997594524, "grad_norm": 0.18925875425338745, "learning_rate": 4.980097740412761e-06, "loss": 1.6046, "mean_token_accuracy": 0.6708937883377075, "num_tokens": 78213.0, "step": 10 }, { "epoch": 0.0005137906069735398, "grad_norm": 0.2128538340330124, "learning_rate": 4.968927183242991e-06, "loss": 1.4572, "mean_token_accuracy": 0.6536244750022888, "num_tokens": 85224.0, "step": 11 }, { "epoch": 0.0005604988439711343, "grad_norm": 0.19441735744476318, "learning_rate": 4.955298170133318e-06, "loss": 1.2837, "mean_token_accuracy": 0.702611893415451, "num_tokens": 93493.0, "step": 12 }, { "epoch": 0.0006072070809687288, "grad_norm": 0.2139551192522049, "learning_rate": 4.93922499603645e-06, "loss": 1.2754, "mean_token_accuracy": 0.6683319509029388, "num_tokens": 102255.0, "step": 13 }, { "epoch": 0.0006539153179663233, "grad_norm": 0.2441701591014862, "learning_rate": 4.920724519492452e-06, "loss": 1.3914, "mean_token_accuracy": 0.6826636791229248, "num_tokens": 109224.0, "step": 14 }, { "epoch": 0.0007006235549639179, "grad_norm": 0.21002362668514252, "learning_rate": 4.89981614494647e-06, "loss": 1.7201, "mean_token_accuracy": 0.6224022209644318, "num_tokens": 115222.0, "step": 15 }, { "epoch": 0.0007473317919615124, "grad_norm": 0.22687377035617828, "learning_rate": 4.876521802396143e-06, "loss": 1.5563, "mean_token_accuracy": 0.6480969190597534, "num_tokens": 121037.0, "step": 16 }, { "epoch": 0.0007940400289591069, "grad_norm": 0.23102138936519623, "learning_rate": 4.850865924390067e-06, "loss": 1.9085, "mean_token_accuracy": 0.5840953290462494, "num_tokens": 126199.0, "step": 17 }, { "epoch": 0.0008407482659567014, "grad_norm": 0.20352718234062195, "learning_rate": 4.822875420401423e-06, "loss": 1.6604, "mean_token_accuracy": 0.6558521389961243, "num_tokens": 133602.0, "step": 18 }, { "epoch": 0.000887456502954296, "grad_norm": 0.2476186901330948, "learning_rate": 4.792579648603658e-06, "loss": 1.3569, "mean_token_accuracy": 0.6907783448696136, "num_tokens": 139069.0, "step": 19 }, { "epoch": 0.0009341647399518905, "grad_norm": 0.2237555831670761, "learning_rate": 4.760010385077814e-06, "loss": 1.3358, "mean_token_accuracy": 0.6829327940940857, "num_tokens": 144727.0, "step": 20 }, { "epoch": 0.000980872976949485, "grad_norm": 0.1982390135526657, "learning_rate": 4.725201790483807e-06, "loss": 1.3553, "mean_token_accuracy": 0.6921784579753876, "num_tokens": 151639.0, "step": 21 }, { "epoch": 0.0010275812139470795, "grad_norm": 0.21326391398906708, "learning_rate": 4.688190374230609e-06, "loss": 1.2572, "mean_token_accuracy": 0.7189462780952454, "num_tokens": 160851.0, "step": 22 }, { "epoch": 0.0010742894509446742, "grad_norm": 0.2238379716873169, "learning_rate": 4.649014956182927e-06, "loss": 1.5684, "mean_token_accuracy": 0.6826211810112, "num_tokens": 167413.0, "step": 23 }, { "epoch": 0.0011209976879422686, "grad_norm": 0.20658670365810394, "learning_rate": 4.607716625944519e-06, "loss": 1.3036, "mean_token_accuracy": 0.6903411149978638, "num_tokens": 174102.0, "step": 24 }, { "epoch": 0.0011677059249398632, "grad_norm": 0.21213635802268982, "learning_rate": 4.5643386997608765e-06, "loss": 1.4276, "mean_token_accuracy": 0.6844733953475952, "num_tokens": 182236.0, "step": 25 }, { "epoch": 0.0012144141619374576, "grad_norm": 0.24115009605884552, "learning_rate": 4.518926675086462e-06, "loss": 1.9658, "mean_token_accuracy": 0.5898375511169434, "num_tokens": 189613.0, "step": 26 }, { "epoch": 0.0012611223989350522, "grad_norm": 0.22647137939929962, "learning_rate": 4.471528182864168e-06, "loss": 1.5963, "mean_token_accuracy": 0.6704545319080353, "num_tokens": 196900.0, "step": 27 }, { "epoch": 0.0013078306359326467, "grad_norm": 0.17949607968330383, "learning_rate": 4.422192937567027e-06, "loss": 1.2397, "mean_token_accuracy": 0.7182678580284119, "num_tokens": 204513.0, "step": 28 }, { "epoch": 0.0013545388729302413, "grad_norm": 0.2529963552951813, "learning_rate": 4.3709726850546015e-06, "loss": 1.7072, "mean_token_accuracy": 0.6386008858680725, "num_tokens": 210803.0, "step": 29 }, { "epoch": 0.0014012471099278357, "grad_norm": 0.1803470253944397, "learning_rate": 4.3179211482987196e-06, "loss": 1.2488, "mean_token_accuracy": 0.7105345129966736, "num_tokens": 218783.0, "step": 30 }, { "epoch": 0.0014479553469254303, "grad_norm": 0.1690954566001892, "learning_rate": 4.2630939710354985e-06, "loss": 1.1768, "mean_token_accuracy": 0.7205143570899963, "num_tokens": 228472.0, "step": 31 }, { "epoch": 0.0014946635839230248, "grad_norm": 0.2781330943107605, "learning_rate": 4.206548659402743e-06, "loss": 1.2211, "mean_token_accuracy": 0.7092916369438171, "num_tokens": 233617.0, "step": 32 }, { "epoch": 0.0015413718209206194, "grad_norm": 0.194204643368721, "learning_rate": 4.148344521623957e-06, "loss": 1.4836, "mean_token_accuracy": 0.6891669929027557, "num_tokens": 241676.0, "step": 33 }, { "epoch": 0.0015880800579182138, "grad_norm": 0.1841108798980713, "learning_rate": 4.088542605802202e-06, "loss": 1.3113, "mean_token_accuracy": 0.7001610100269318, "num_tokens": 251517.0, "step": 34 }, { "epoch": 0.0016347882949158084, "grad_norm": 0.19068853557109833, "learning_rate": 4.0272056358890665e-06, "loss": 1.3375, "mean_token_accuracy": 0.7067949175834656, "num_tokens": 260718.0, "step": 35 }, { "epoch": 0.0016814965319134029, "grad_norm": 0.22810104489326477, "learning_rate": 3.964397945895903e-06, "loss": 1.5774, "mean_token_accuracy": 0.6743293404579163, "num_tokens": 267063.0, "step": 36 }, { "epoch": 0.0017282047689109975, "grad_norm": 0.2199760228395462, "learning_rate": 3.900185412416337e-06, "loss": 1.7639, "mean_token_accuracy": 0.6289554536342621, "num_tokens": 274278.0, "step": 37 }, { "epoch": 0.001774913005908592, "grad_norm": 0.23150336742401123, "learning_rate": 3.834635385530813e-06, "loss": 2.4011, "mean_token_accuracy": 0.5110038071870804, "num_tokens": 279752.0, "step": 38 }, { "epoch": 0.0018216212429061865, "grad_norm": 0.2343943864107132, "learning_rate": 3.7678166181656624e-06, "loss": 1.3846, "mean_token_accuracy": 0.6946818828582764, "num_tokens": 287145.0, "step": 39 }, { "epoch": 0.001868329479903781, "grad_norm": 0.1842251569032669, "learning_rate": 3.6997991939807804e-06, "loss": 1.5058, "mean_token_accuracy": 0.6877120733261108, "num_tokens": 295574.0, "step": 40 }, { "epoch": 0.0019150377169013756, "grad_norm": 0.21155694127082825, "learning_rate": 3.63065445386154e-06, "loss": 1.7073, "mean_token_accuracy": 0.6352330446243286, "num_tokens": 302707.0, "step": 41 }, { "epoch": 0.00196174595389897, "grad_norm": 0.25946730375289917, "learning_rate": 3.5604549210920576e-06, "loss": 1.5031, "mean_token_accuracy": 0.6669047772884369, "num_tokens": 307612.0, "step": 42 }, { "epoch": 0.0020084541908965644, "grad_norm": 0.18682092428207397, "learning_rate": 3.489274225288284e-06, "loss": 1.3015, "mean_token_accuracy": 0.7169822454452515, "num_tokens": 319133.0, "step": 43 }, { "epoch": 0.002055162427894159, "grad_norm": 0.2042531669139862, "learning_rate": 3.4171870251706995e-06, "loss": 2.0483, "mean_token_accuracy": 0.5778546035289764, "num_tokens": 326268.0, "step": 44 }, { "epoch": 0.0021018706648917537, "grad_norm": 0.2018388956785202, "learning_rate": 3.344268930257633e-06, "loss": 1.4825, "mean_token_accuracy": 0.6688913702964783, "num_tokens": 332923.0, "step": 45 }, { "epoch": 0.0021485789018893483, "grad_norm": 0.21121808886528015, "learning_rate": 3.2705964215613145e-06, "loss": 1.4353, "mean_token_accuracy": 0.68389692902565, "num_tokens": 338889.0, "step": 46 }, { "epoch": 0.0021952871388869425, "grad_norm": 0.2037765383720398, "learning_rate": 3.196246771369853e-06, "loss": 1.1499, "mean_token_accuracy": 0.7377910912036896, "num_tokens": 346176.0, "step": 47 }, { "epoch": 0.002241995375884537, "grad_norm": 0.1926920861005783, "learning_rate": 3.121297962199279e-06, "loss": 1.4606, "mean_token_accuracy": 0.6756649613380432, "num_tokens": 354291.0, "step": 48 }, { "epoch": 0.0022887036128821318, "grad_norm": 0.23878051340579987, "learning_rate": 3.0458286050006548e-06, "loss": 1.6098, "mean_token_accuracy": 0.6611288487911224, "num_tokens": 361893.0, "step": 49 }, { "epoch": 0.0023354118498797264, "grad_norm": 0.2237703949213028, "learning_rate": 2.96991785670804e-06, "loss": 1.7951, "mean_token_accuracy": 0.6226321458816528, "num_tokens": 369849.0, "step": 50 }, { "epoch": 0.0023821200868773206, "grad_norm": 0.21574032306671143, "learning_rate": 2.8936453372138006e-06, "loss": 1.2809, "mean_token_accuracy": 0.71575528383255, "num_tokens": 376459.0, "step": 51 }, { "epoch": 0.0024288283238749152, "grad_norm": 0.1896604299545288, "learning_rate": 2.8170910458583355e-06, "loss": 1.7823, "mean_token_accuracy": 0.6189461052417755, "num_tokens": 383519.0, "step": 52 }, { "epoch": 0.00247553656087251, "grad_norm": 0.17899923026561737, "learning_rate": 2.740335277521815e-06, "loss": 1.6108, "mean_token_accuracy": 0.6556365489959717, "num_tokens": 391616.0, "step": 53 }, { "epoch": 0.0025222447978701045, "grad_norm": 0.22684413194656372, "learning_rate": 2.6634585384059415e-06, "loss": 1.7163, "mean_token_accuracy": 0.629599392414093, "num_tokens": 399812.0, "step": 54 }, { "epoch": 0.002568953034867699, "grad_norm": 0.18395479023456573, "learning_rate": 2.5865414615940594e-06, "loss": 1.2312, "mean_token_accuracy": 0.7152631878852844, "num_tokens": 407387.0, "step": 55 }, { "epoch": 0.0026156612718652933, "grad_norm": 0.17716501653194427, "learning_rate": 2.509664722478186e-06, "loss": 1.6189, "mean_token_accuracy": 0.6399194896221161, "num_tokens": 415514.0, "step": 56 }, { "epoch": 0.002662369508862888, "grad_norm": 0.2080097496509552, "learning_rate": 2.4329089541416655e-06, "loss": 1.5846, "mean_token_accuracy": 0.663497120141983, "num_tokens": 422039.0, "step": 57 }, { "epoch": 0.0027090777458604826, "grad_norm": 0.19784171879291534, "learning_rate": 2.3563546627862e-06, "loss": 1.3426, "mean_token_accuracy": 0.6939470767974854, "num_tokens": 428566.0, "step": 58 }, { "epoch": 0.0027557859828580772, "grad_norm": 0.1710215061903, "learning_rate": 2.2800821432919614e-06, "loss": 1.5268, "mean_token_accuracy": 0.6385573744773865, "num_tokens": 436877.0, "step": 59 }, { "epoch": 0.0028024942198556714, "grad_norm": 0.19530710577964783, "learning_rate": 2.204171394999346e-06, "loss": 1.7253, "mean_token_accuracy": 0.6284772753715515, "num_tokens": 444515.0, "step": 60 }, { "epoch": 0.002849202456853266, "grad_norm": 0.20333561301231384, "learning_rate": 2.1287020378007216e-06, "loss": 1.6405, "mean_token_accuracy": 0.6564561724662781, "num_tokens": 450241.0, "step": 61 }, { "epoch": 0.0028959106938508607, "grad_norm": 0.24220742285251617, "learning_rate": 2.0537532286301483e-06, "loss": 1.7357, "mean_token_accuracy": 0.613721638917923, "num_tokens": 455245.0, "step": 62 }, { "epoch": 0.0029426189308484553, "grad_norm": 0.25850263237953186, "learning_rate": 1.9794035784386857e-06, "loss": 1.2092, "mean_token_accuracy": 0.7280539572238922, "num_tokens": 461165.0, "step": 63 }, { "epoch": 0.0029893271678460495, "grad_norm": 0.1687914878129959, "learning_rate": 1.9057310697423676e-06, "loss": 1.4988, "mean_token_accuracy": 0.6855064630508423, "num_tokens": 470696.0, "step": 64 }, { "epoch": 0.003036035404843644, "grad_norm": 0.22230161726474762, "learning_rate": 1.8328129748293017e-06, "loss": 1.9035, "mean_token_accuracy": 0.5855222791433334, "num_tokens": 475858.0, "step": 65 }, { "epoch": 0.0030827436418412388, "grad_norm": 0.21983900666236877, "learning_rate": 1.7607257747117174e-06, "loss": 1.7954, "mean_token_accuracy": 0.6393535435199738, "num_tokens": 481280.0, "step": 66 }, { "epoch": 0.0031294518788388334, "grad_norm": 0.28015828132629395, "learning_rate": 1.6895450789079434e-06, "loss": 1.254, "mean_token_accuracy": 0.7204155027866364, "num_tokens": 488210.0, "step": 67 }, { "epoch": 0.0031761601158364276, "grad_norm": 0.21209849417209625, "learning_rate": 1.6193455461384617e-06, "loss": 1.9336, "mean_token_accuracy": 0.6008668541908264, "num_tokens": 494581.0, "step": 68 }, { "epoch": 0.0032228683528340222, "grad_norm": 0.25260934233665466, "learning_rate": 1.5502008060192202e-06, "loss": 1.7713, "mean_token_accuracy": 0.592128798365593, "num_tokens": 499120.0, "step": 69 }, { "epoch": 0.003269576589831617, "grad_norm": 0.19961196184158325, "learning_rate": 1.4821833818343378e-06, "loss": 1.3307, "mean_token_accuracy": 0.7077135443687439, "num_tokens": 506700.0, "step": 70 }, { "epoch": 0.0033162848268292115, "grad_norm": 0.21448221802711487, "learning_rate": 1.4153646144691887e-06, "loss": 1.4005, "mean_token_accuracy": 0.6793873608112335, "num_tokens": 513622.0, "step": 71 }, { "epoch": 0.0033629930638268057, "grad_norm": 0.19134606420993805, "learning_rate": 1.3498145875836636e-06, "loss": 1.8944, "mean_token_accuracy": 0.6288950145244598, "num_tokens": 520502.0, "step": 72 }, { "epoch": 0.0034097013008244003, "grad_norm": 0.20466452836990356, "learning_rate": 1.285602054104097e-06, "loss": 1.9276, "mean_token_accuracy": 0.57016322016716, "num_tokens": 528001.0, "step": 73 }, { "epoch": 0.003456409537821995, "grad_norm": 0.20611968636512756, "learning_rate": 1.2227943641109345e-06, "loss": 1.8754, "mean_token_accuracy": 0.5942300260066986, "num_tokens": 533840.0, "step": 74 }, { "epoch": 0.0035031177748195896, "grad_norm": 0.22362089157104492, "learning_rate": 1.1614573941977975e-06, "loss": 2.0853, "mean_token_accuracy": 0.5632732808589935, "num_tokens": 540459.0, "step": 75 }, { "epoch": 0.003549826011817184, "grad_norm": 0.16187940537929535, "learning_rate": 1.1016554783760433e-06, "loss": 1.3277, "mean_token_accuracy": 0.7023499011993408, "num_tokens": 548772.0, "step": 76 }, { "epoch": 0.0035965342488147784, "grad_norm": 0.22978803515434265, "learning_rate": 2.2033109567520866e-06, "loss": 1.8168, "mean_token_accuracy": 0.6008757054805756, "num_tokens": 554584.0, "step": 77 }, { "epoch": 0.003643242485812373, "grad_norm": 0.1978112757205963, "learning_rate": 2.0419772472095698e-06, "loss": 1.5754, "mean_token_accuracy": 0.6501273214817047, "num_tokens": 561654.0, "step": 78 }, { "epoch": 0.0036899507228099677, "grad_norm": 0.2072610855102539, "learning_rate": 1.652483217564065e-06, "loss": 1.7213, "mean_token_accuracy": 0.6490782797336578, "num_tokens": 567900.0, "step": 79 }, { "epoch": 0.003736658959807562, "grad_norm": 0.22125592827796936, "learning_rate": 1.2629891879185599e-06, "loss": 1.4386, "mean_token_accuracy": 0.6827996075153351, "num_tokens": 576574.0, "step": 80 }, { "epoch": 0.0037833671968051565, "grad_norm": 0.18630705773830414, "learning_rate": 1.1016554783760433e-06, "loss": 1.0299, "mean_token_accuracy": 0.7590331435203552, "num_tokens": 583830.0, "step": 81 } ], "logging_steps": 1, "max_steps": 102, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2266017525923840.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }