{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.0037833671968051565,
  "eval_steps": 102,
  "global_step": 81,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 4.6708236997594524e-05,
      "grad_norm": 0.24815663695335388,
      "learning_rate": 0.0,
      "loss": 1.5212,
      "mean_token_accuracy": 0.6624278724193573,
      "num_tokens": 6610.0,
      "step": 1
    },
    {
      "epoch": 9.341647399518905e-05,
      "grad_norm": 0.22391362488269806,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 1.3169,
      "mean_token_accuracy": 0.6899301111698151,
      "num_tokens": 13855.0,
      "step": 2
    },
    {
      "epoch": 0.00014012471099278357,
      "grad_norm": 0.24716292321681976,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 1.5019,
      "mean_token_accuracy": 0.6771045923233032,
      "num_tokens": 19204.0,
      "step": 3
    },
    {
      "epoch": 0.0001868329479903781,
      "grad_norm": 0.20678554475307465,
      "learning_rate": 3e-06,
      "loss": 1.2039,
      "mean_token_accuracy": 0.7312012910842896,
      "num_tokens": 27787.0,
      "step": 4
    },
    {
      "epoch": 0.00023354118498797262,
      "grad_norm": 0.16259518265724182,
      "learning_rate": 4.000000000000001e-06,
      "loss": 1.234,
      "mean_token_accuracy": 0.7289687991142273,
      "num_tokens": 38962.0,
      "step": 5
    },
    {
      "epoch": 0.00028024942198556714,
      "grad_norm": 0.18290941417217255,
      "learning_rate": 5e-06,
      "loss": 1.3001,
      "mean_token_accuracy": 0.685863584280014,
      "num_tokens": 46738.0,
      "step": 6
    },
    {
      "epoch": 0.00032695765898316167,
      "grad_norm": 0.22102956473827362,
      "learning_rate": 4.998754476483098e-06,
      "loss": 1.1942,
      "mean_token_accuracy": 0.7294453680515289,
      "num_tokens": 54364.0,
      "step": 7
    },
    {
      "epoch": 0.0003736658959807562,
      "grad_norm": 0.23882372677326202,
      "learning_rate": 4.9950192123145654e-06,
      "loss": 1.3233,
      "mean_token_accuracy": 0.7093064486980438,
      "num_tokens": 60646.0,
      "step": 8
    },
    {
      "epoch": 0.0004203741329783507,
      "grad_norm": 0.20760418474674225,
      "learning_rate": 4.988798125270709e-06,
      "loss": 1.8058,
      "mean_token_accuracy": 0.6353311538696289,
      "num_tokens": 70548.0,
      "step": 9
    },
    {
      "epoch": 0.00046708236997594524,
      "grad_norm": 0.18925875425338745,
      "learning_rate": 4.980097740412761e-06,
      "loss": 1.6046,
      "mean_token_accuracy": 0.6708937883377075,
      "num_tokens": 78213.0,
      "step": 10
    },
    {
      "epoch": 0.0005137906069735398,
      "grad_norm": 0.2128538340330124,
      "learning_rate": 4.968927183242991e-06,
      "loss": 1.4572,
      "mean_token_accuracy": 0.6536244750022888,
      "num_tokens": 85224.0,
      "step": 11
    },
    {
      "epoch": 0.0005604988439711343,
      "grad_norm": 0.19441735744476318,
      "learning_rate": 4.955298170133318e-06,
      "loss": 1.2837,
      "mean_token_accuracy": 0.702611893415451,
      "num_tokens": 93493.0,
      "step": 12
    },
    {
      "epoch": 0.0006072070809687288,
      "grad_norm": 0.2139551192522049,
      "learning_rate": 4.93922499603645e-06,
      "loss": 1.2754,
      "mean_token_accuracy": 0.6683319509029388,
      "num_tokens": 102255.0,
      "step": 13
    },
    {
      "epoch": 0.0006539153179663233,
      "grad_norm": 0.2441701591014862,
      "learning_rate": 4.920724519492452e-06,
      "loss": 1.3914,
      "mean_token_accuracy": 0.6826636791229248,
      "num_tokens": 109224.0,
      "step": 14
    },
    {
      "epoch": 0.0007006235549639179,
      "grad_norm": 0.21002362668514252,
      "learning_rate": 4.89981614494647e-06,
      "loss": 1.7201,
      "mean_token_accuracy": 0.6224022209644318,
      "num_tokens": 115222.0,
      "step": 15
    },
    {
      "epoch": 0.0007473317919615124,
      "grad_norm": 0.22687377035617828,
      "learning_rate": 4.876521802396143e-06,
      "loss": 1.5563,
      "mean_token_accuracy": 0.6480969190597534,
      "num_tokens": 121037.0,
      "step": 16
    },
    {
      "epoch": 0.0007940400289591069,
      "grad_norm": 0.23102138936519623,
      "learning_rate": 4.850865924390067e-06,
      "loss": 1.9085,
      "mean_token_accuracy": 0.5840953290462494,
      "num_tokens": 126199.0,
      "step": 17
    },
    {
      "epoch": 0.0008407482659567014,
      "grad_norm": 0.20352718234062195,
      "learning_rate": 4.822875420401423e-06,
      "loss": 1.6604,
      "mean_token_accuracy": 0.6558521389961243,
      "num_tokens": 133602.0,
      "step": 18
    },
    {
      "epoch": 0.000887456502954296,
      "grad_norm": 0.2476186901330948,
      "learning_rate": 4.792579648603658e-06,
      "loss": 1.3569,
      "mean_token_accuracy": 0.6907783448696136,
      "num_tokens": 139069.0,
      "step": 19
    },
    {
      "epoch": 0.0009341647399518905,
      "grad_norm": 0.2237555831670761,
      "learning_rate": 4.760010385077814e-06,
      "loss": 1.3358,
      "mean_token_accuracy": 0.6829327940940857,
      "num_tokens": 144727.0,
      "step": 20
    },
    {
      "epoch": 0.000980872976949485,
      "grad_norm": 0.1982390135526657,
      "learning_rate": 4.725201790483807e-06,
      "loss": 1.3553,
      "mean_token_accuracy": 0.6921784579753876,
      "num_tokens": 151639.0,
      "step": 21
    },
    {
      "epoch": 0.0010275812139470795,
      "grad_norm": 0.21326391398906708,
      "learning_rate": 4.688190374230609e-06,
      "loss": 1.2572,
      "mean_token_accuracy": 0.7189462780952454,
      "num_tokens": 160851.0,
      "step": 22
    },
    {
      "epoch": 0.0010742894509446742,
      "grad_norm": 0.2238379716873169,
      "learning_rate": 4.649014956182927e-06,
      "loss": 1.5684,
      "mean_token_accuracy": 0.6826211810112,
      "num_tokens": 167413.0,
      "step": 23
    },
    {
      "epoch": 0.0011209976879422686,
      "grad_norm": 0.20658670365810394,
      "learning_rate": 4.607716625944519e-06,
      "loss": 1.3036,
      "mean_token_accuracy": 0.6903411149978638,
      "num_tokens": 174102.0,
      "step": 24
    },
    {
      "epoch": 0.0011677059249398632,
      "grad_norm": 0.21213635802268982,
      "learning_rate": 4.5643386997608765e-06,
      "loss": 1.4276,
      "mean_token_accuracy": 0.6844733953475952,
      "num_tokens": 182236.0,
      "step": 25
    },
    {
      "epoch": 0.0012144141619374576,
      "grad_norm": 0.24115009605884552,
      "learning_rate": 4.518926675086462e-06,
      "loss": 1.9658,
      "mean_token_accuracy": 0.5898375511169434,
      "num_tokens": 189613.0,
      "step": 26
    },
    {
      "epoch": 0.0012611223989350522,
      "grad_norm": 0.22647137939929962,
      "learning_rate": 4.471528182864168e-06,
      "loss": 1.5963,
      "mean_token_accuracy": 0.6704545319080353,
      "num_tokens": 196900.0,
      "step": 27
    },
    {
      "epoch": 0.0013078306359326467,
      "grad_norm": 0.17949607968330383,
      "learning_rate": 4.422192937567027e-06,
      "loss": 1.2397,
      "mean_token_accuracy": 0.7182678580284119,
      "num_tokens": 204513.0,
      "step": 28
    },
    {
      "epoch": 0.0013545388729302413,
      "grad_norm": 0.2529963552951813,
      "learning_rate": 4.3709726850546015e-06,
      "loss": 1.7072,
      "mean_token_accuracy": 0.6386008858680725,
      "num_tokens": 210803.0,
      "step": 29
    },
    {
      "epoch": 0.0014012471099278357,
      "grad_norm": 0.1803470253944397,
      "learning_rate": 4.3179211482987196e-06,
      "loss": 1.2488,
      "mean_token_accuracy": 0.7105345129966736,
      "num_tokens": 218783.0,
      "step": 30
    },
    {
      "epoch": 0.0014479553469254303,
      "grad_norm": 0.1690954566001892,
      "learning_rate": 4.2630939710354985e-06,
      "loss": 1.1768,
      "mean_token_accuracy": 0.7205143570899963,
      "num_tokens": 228472.0,
      "step": 31
    },
    {
      "epoch": 0.0014946635839230248,
      "grad_norm": 0.2781330943107605,
      "learning_rate": 4.206548659402743e-06,
      "loss": 1.2211,
      "mean_token_accuracy": 0.7092916369438171,
      "num_tokens": 233617.0,
      "step": 32
    },
    {
      "epoch": 0.0015413718209206194,
      "grad_norm": 0.194204643368721,
      "learning_rate": 4.148344521623957e-06,
      "loss": 1.4836,
      "mean_token_accuracy": 0.6891669929027557,
      "num_tokens": 241676.0,
      "step": 33
    },
    {
      "epoch": 0.0015880800579182138,
      "grad_norm": 0.1841108798980713,
      "learning_rate": 4.088542605802202e-06,
      "loss": 1.3113,
      "mean_token_accuracy": 0.7001610100269318,
      "num_tokens": 251517.0,
      "step": 34
    },
    {
      "epoch": 0.0016347882949158084,
      "grad_norm": 0.19068853557109833,
      "learning_rate": 4.0272056358890665e-06,
      "loss": 1.3375,
      "mean_token_accuracy": 0.7067949175834656,
      "num_tokens": 260718.0,
      "step": 35
    },
    {
      "epoch": 0.0016814965319134029,
      "grad_norm": 0.22810104489326477,
      "learning_rate": 3.964397945895903e-06,
      "loss": 1.5774,
      "mean_token_accuracy": 0.6743293404579163,
      "num_tokens": 267063.0,
      "step": 36
    },
    {
      "epoch": 0.0017282047689109975,
      "grad_norm": 0.2199760228395462,
      "learning_rate": 3.900185412416337e-06,
      "loss": 1.7639,
      "mean_token_accuracy": 0.6289554536342621,
      "num_tokens": 274278.0,
      "step": 37
    },
    {
      "epoch": 0.001774913005908592,
      "grad_norm": 0.23150336742401123,
      "learning_rate": 3.834635385530813e-06,
      "loss": 2.4011,
      "mean_token_accuracy": 0.5110038071870804,
      "num_tokens": 279752.0,
      "step": 38
    },
    {
      "epoch": 0.0018216212429061865,
      "grad_norm": 0.2343943864107132,
      "learning_rate": 3.7678166181656624e-06,
      "loss": 1.3846,
      "mean_token_accuracy": 0.6946818828582764,
      "num_tokens": 287145.0,
      "step": 39
    },
    {
      "epoch": 0.001868329479903781,
      "grad_norm": 0.1842251569032669,
      "learning_rate": 3.6997991939807804e-06,
      "loss": 1.5058,
      "mean_token_accuracy": 0.6877120733261108,
      "num_tokens": 295574.0,
      "step": 40
    },
    {
      "epoch": 0.0019150377169013756,
      "grad_norm": 0.21155694127082825,
      "learning_rate": 3.63065445386154e-06,
      "loss": 1.7073,
      "mean_token_accuracy": 0.6352330446243286,
      "num_tokens": 302707.0,
      "step": 41
    },
    {
      "epoch": 0.00196174595389897,
      "grad_norm": 0.25946730375289917,
      "learning_rate": 3.5604549210920576e-06,
      "loss": 1.5031,
      "mean_token_accuracy": 0.6669047772884369,
      "num_tokens": 307612.0,
      "step": 42
    },
    {
      "epoch": 0.0020084541908965644,
      "grad_norm": 0.18682092428207397,
      "learning_rate": 3.489274225288284e-06,
      "loss": 1.3015,
      "mean_token_accuracy": 0.7169822454452515,
      "num_tokens": 319133.0,
      "step": 43
    },
    {
      "epoch": 0.002055162427894159,
      "grad_norm": 0.2042531669139862,
      "learning_rate": 3.4171870251706995e-06,
      "loss": 2.0483,
      "mean_token_accuracy": 0.5778546035289764,
      "num_tokens": 326268.0,
      "step": 44
    },
    {
      "epoch": 0.0021018706648917537,
      "grad_norm": 0.2018388956785202,
      "learning_rate": 3.344268930257633e-06,
      "loss": 1.4825,
      "mean_token_accuracy": 0.6688913702964783,
      "num_tokens": 332923.0,
      "step": 45
    },
    {
      "epoch": 0.0021485789018893483,
      "grad_norm": 0.21121808886528015,
      "learning_rate": 3.2705964215613145e-06,
      "loss": 1.4353,
      "mean_token_accuracy": 0.68389692902565,
      "num_tokens": 338889.0,
      "step": 46
    },
    {
      "epoch": 0.0021952871388869425,
      "grad_norm": 0.2037765383720398,
      "learning_rate": 3.196246771369853e-06,
      "loss": 1.1499,
      "mean_token_accuracy": 0.7377910912036896,
      "num_tokens": 346176.0,
      "step": 47
    },
    {
      "epoch": 0.002241995375884537,
      "grad_norm": 0.1926920861005783,
      "learning_rate": 3.121297962199279e-06,
      "loss": 1.4606,
      "mean_token_accuracy": 0.6756649613380432,
      "num_tokens": 354291.0,
      "step": 48
    },
    {
      "epoch": 0.0022887036128821318,
      "grad_norm": 0.23878051340579987,
      "learning_rate": 3.0458286050006548e-06,
      "loss": 1.6098,
      "mean_token_accuracy": 0.6611288487911224,
      "num_tokens": 361893.0,
      "step": 49
    },
    {
      "epoch": 0.0023354118498797264,
      "grad_norm": 0.2237703949213028,
      "learning_rate": 2.96991785670804e-06,
      "loss": 1.7951,
      "mean_token_accuracy": 0.6226321458816528,
      "num_tokens": 369849.0,
      "step": 50
    },
    {
      "epoch": 0.0023821200868773206,
      "grad_norm": 0.21574032306671143,
      "learning_rate": 2.8936453372138006e-06,
      "loss": 1.2809,
      "mean_token_accuracy": 0.71575528383255,
      "num_tokens": 376459.0,
      "step": 51
    },
    {
      "epoch": 0.0024288283238749152,
      "grad_norm": 0.1896604299545288,
      "learning_rate": 2.8170910458583355e-06,
      "loss": 1.7823,
      "mean_token_accuracy": 0.6189461052417755,
      "num_tokens": 383519.0,
      "step": 52
    },
    {
      "epoch": 0.00247553656087251,
      "grad_norm": 0.17899923026561737,
      "learning_rate": 2.740335277521815e-06,
      "loss": 1.6108,
      "mean_token_accuracy": 0.6556365489959717,
      "num_tokens": 391616.0,
      "step": 53
    },
    {
      "epoch": 0.0025222447978701045,
      "grad_norm": 0.22684413194656372,
      "learning_rate": 2.6634585384059415e-06,
      "loss": 1.7163,
      "mean_token_accuracy": 0.629599392414093,
      "num_tokens": 399812.0,
      "step": 54
    },
    {
      "epoch": 0.002568953034867699,
      "grad_norm": 0.18395479023456573,
      "learning_rate": 2.5865414615940594e-06,
      "loss": 1.2312,
      "mean_token_accuracy": 0.7152631878852844,
      "num_tokens": 407387.0,
      "step": 55
    },
    {
      "epoch": 0.0026156612718652933,
      "grad_norm": 0.17716501653194427,
      "learning_rate": 2.509664722478186e-06,
      "loss": 1.6189,
      "mean_token_accuracy": 0.6399194896221161,
      "num_tokens": 415514.0,
      "step": 56
    },
    {
      "epoch": 0.002662369508862888,
      "grad_norm": 0.2080097496509552,
      "learning_rate": 2.4329089541416655e-06,
      "loss": 1.5846,
      "mean_token_accuracy": 0.663497120141983,
      "num_tokens": 422039.0,
      "step": 57
    },
    {
      "epoch": 0.0027090777458604826,
      "grad_norm": 0.19784171879291534,
      "learning_rate": 2.3563546627862e-06,
      "loss": 1.3426,
      "mean_token_accuracy": 0.6939470767974854,
      "num_tokens": 428566.0,
      "step": 58
    },
    {
      "epoch": 0.0027557859828580772,
      "grad_norm": 0.1710215061903,
      "learning_rate": 2.2800821432919614e-06,
      "loss": 1.5268,
      "mean_token_accuracy": 0.6385573744773865,
      "num_tokens": 436877.0,
      "step": 59
    },
    {
      "epoch": 0.0028024942198556714,
      "grad_norm": 0.19530710577964783,
      "learning_rate": 2.204171394999346e-06,
      "loss": 1.7253,
      "mean_token_accuracy": 0.6284772753715515,
      "num_tokens": 444515.0,
      "step": 60
    },
    {
      "epoch": 0.002849202456853266,
      "grad_norm": 0.20333561301231384,
      "learning_rate": 2.1287020378007216e-06,
      "loss": 1.6405,
      "mean_token_accuracy": 0.6564561724662781,
      "num_tokens": 450241.0,
      "step": 61
    },
    {
      "epoch": 0.0028959106938508607,
      "grad_norm": 0.24220742285251617,
      "learning_rate": 2.0537532286301483e-06,
      "loss": 1.7357,
      "mean_token_accuracy": 0.613721638917923,
      "num_tokens": 455245.0,
      "step": 62
    },
    {
      "epoch": 0.0029426189308484553,
      "grad_norm": 0.25850263237953186,
      "learning_rate": 1.9794035784386857e-06,
      "loss": 1.2092,
      "mean_token_accuracy": 0.7280539572238922,
      "num_tokens": 461165.0,
      "step": 63
    },
    {
      "epoch": 0.0029893271678460495,
      "grad_norm": 0.1687914878129959,
      "learning_rate": 1.9057310697423676e-06,
      "loss": 1.4988,
      "mean_token_accuracy": 0.6855064630508423,
      "num_tokens": 470696.0,
      "step": 64
    },
    {
      "epoch": 0.003036035404843644,
      "grad_norm": 0.22230161726474762,
      "learning_rate": 1.8328129748293017e-06,
      "loss": 1.9035,
      "mean_token_accuracy": 0.5855222791433334,
      "num_tokens": 475858.0,
      "step": 65
    },
    {
      "epoch": 0.0030827436418412388,
      "grad_norm": 0.21983900666236877,
      "learning_rate": 1.7607257747117174e-06,
      "loss": 1.7954,
      "mean_token_accuracy": 0.6393535435199738,
      "num_tokens": 481280.0,
      "step": 66
    },
    {
      "epoch": 0.0031294518788388334,
      "grad_norm": 0.28015828132629395,
      "learning_rate": 1.6895450789079434e-06,
      "loss": 1.254,
      "mean_token_accuracy": 0.7204155027866364,
      "num_tokens": 488210.0,
      "step": 67
    },
    {
      "epoch": 0.0031761601158364276,
      "grad_norm": 0.21209849417209625,
      "learning_rate": 1.6193455461384617e-06,
      "loss": 1.9336,
      "mean_token_accuracy": 0.6008668541908264,
      "num_tokens": 494581.0,
      "step": 68
    },
    {
      "epoch": 0.0032228683528340222,
      "grad_norm": 0.25260934233665466,
      "learning_rate": 1.5502008060192202e-06,
      "loss": 1.7713,
      "mean_token_accuracy": 0.592128798365593,
      "num_tokens": 499120.0,
      "step": 69
    },
    {
      "epoch": 0.003269576589831617,
      "grad_norm": 0.19961196184158325,
      "learning_rate": 1.4821833818343378e-06,
      "loss": 1.3307,
      "mean_token_accuracy": 0.7077135443687439,
      "num_tokens": 506700.0,
      "step": 70
    },
    {
      "epoch": 0.0033162848268292115,
      "grad_norm": 0.21448221802711487,
      "learning_rate": 1.4153646144691887e-06,
      "loss": 1.4005,
      "mean_token_accuracy": 0.6793873608112335,
      "num_tokens": 513622.0,
      "step": 71
    },
    {
      "epoch": 0.0033629930638268057,
      "grad_norm": 0.19134606420993805,
      "learning_rate": 1.3498145875836636e-06,
      "loss": 1.8944,
      "mean_token_accuracy": 0.6288950145244598,
      "num_tokens": 520502.0,
      "step": 72
    },
    {
      "epoch": 0.0034097013008244003,
      "grad_norm": 0.20466452836990356,
      "learning_rate": 1.285602054104097e-06,
      "loss": 1.9276,
      "mean_token_accuracy": 0.57016322016716,
      "num_tokens": 528001.0,
      "step": 73
    },
    {
      "epoch": 0.003456409537821995,
      "grad_norm": 0.20611968636512756,
      "learning_rate": 1.2227943641109345e-06,
      "loss": 1.8754,
      "mean_token_accuracy": 0.5942300260066986,
      "num_tokens": 533840.0,
      "step": 74
    },
    {
      "epoch": 0.0035031177748195896,
      "grad_norm": 0.22362089157104492,
      "learning_rate": 1.1614573941977975e-06,
      "loss": 2.0853,
      "mean_token_accuracy": 0.5632732808589935,
      "num_tokens": 540459.0,
      "step": 75
    },
    {
      "epoch": 0.003549826011817184,
      "grad_norm": 0.16187940537929535,
      "learning_rate": 1.1016554783760433e-06,
      "loss": 1.3277,
      "mean_token_accuracy": 0.7023499011993408,
      "num_tokens": 548772.0,
      "step": 76
    },
    {
      "epoch": 0.0035965342488147784,
      "grad_norm": 0.22978803515434265,
      "learning_rate": 2.2033109567520866e-06,
      "loss": 1.8168,
      "mean_token_accuracy": 0.6008757054805756,
      "num_tokens": 554584.0,
      "step": 77
    },
    {
      "epoch": 0.003643242485812373,
      "grad_norm": 0.1978112757205963,
      "learning_rate": 2.0419772472095698e-06,
      "loss": 1.5754,
      "mean_token_accuracy": 0.6501273214817047,
      "num_tokens": 561654.0,
      "step": 78
    },
    {
      "epoch": 0.0036899507228099677,
      "grad_norm": 0.2072610855102539,
      "learning_rate": 1.652483217564065e-06,
      "loss": 1.7213,
      "mean_token_accuracy": 0.6490782797336578,
      "num_tokens": 567900.0,
      "step": 79
    },
    {
      "epoch": 0.003736658959807562,
      "grad_norm": 0.22125592827796936,
      "learning_rate": 1.2629891879185599e-06,
      "loss": 1.4386,
      "mean_token_accuracy": 0.6827996075153351,
      "num_tokens": 576574.0,
      "step": 80
    },
    {
      "epoch": 0.0037833671968051565,
      "grad_norm": 0.18630705773830414,
      "learning_rate": 1.1016554783760433e-06,
      "loss": 1.0299,
      "mean_token_accuracy": 0.7590331435203552,
      "num_tokens": 583830.0,
      "step": 81
    }
  ],
  "logging_steps": 1,
  "max_steps": 102,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2266017525923840.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}