diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4244 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.759526938239159, + "eval_steps": 10240, + "global_step": 2100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001314060446780552, + "grad_norm": 6.0887322425842285, + "learning_rate": 1.4492753623188408e-07, + "loss": 1.0770764350891113, + "memory(GiB)": 39.15, + "step": 1, + "token_acc": 0.7469458987783595, + "train_speed(iter/s)": 0.013712 + }, + { + "epoch": 0.006570302233902759, + "grad_norm": 6.300453186035156, + "learning_rate": 7.246376811594204e-07, + "loss": 1.0886579751968384, + "memory(GiB)": 84.21, + "step": 5, + "token_acc": 0.7434342087721637, + "train_speed(iter/s)": 0.026613 + }, + { + "epoch": 0.013140604467805518, + "grad_norm": 5.09555196762085, + "learning_rate": 1.4492753623188408e-06, + "loss": 1.0676928520202638, + "memory(GiB)": 84.21, + "step": 10, + "token_acc": 0.740495867768595, + "train_speed(iter/s)": 0.030047 + }, + { + "epoch": 0.01971090670170828, + "grad_norm": 3.2556324005126953, + "learning_rate": 2.173913043478261e-06, + "loss": 0.9635882377624512, + "memory(GiB)": 84.21, + "step": 15, + "token_acc": 0.7701234008830522, + "train_speed(iter/s)": 0.031287 + }, + { + "epoch": 0.026281208935611037, + "grad_norm": 1.6578456163406372, + "learning_rate": 2.8985507246376816e-06, + "loss": 0.8276536941528321, + "memory(GiB)": 84.21, + "step": 20, + "token_acc": 0.7827149763702537, + "train_speed(iter/s)": 0.031906 + }, + { + "epoch": 0.0328515111695138, + "grad_norm": 1.177905797958374, + "learning_rate": 3.6231884057971017e-06, + "loss": 0.7361048221588135, + "memory(GiB)": 84.21, + "step": 25, + "token_acc": 0.7906462683962538, + "train_speed(iter/s)": 0.032398 + }, + { + "epoch": 0.03942181340341656, + "grad_norm": 0.6714381575584412, + "learning_rate": 4.347826086956522e-06, + "loss": 0.6780746459960938, + "memory(GiB)": 84.21, + "step": 30, + "token_acc": 0.8038660725039143, + "train_speed(iter/s)": 0.032405 + }, + { + "epoch": 0.045992115637319315, + "grad_norm": 0.6768696904182434, + "learning_rate": 5.072463768115943e-06, + "loss": 0.6487759590148926, + "memory(GiB)": 84.21, + "step": 35, + "token_acc": 0.833314147576839, + "train_speed(iter/s)": 0.03272 + }, + { + "epoch": 0.052562417871222074, + "grad_norm": 0.5112195611000061, + "learning_rate": 5.797101449275363e-06, + "loss": 0.6321969032287598, + "memory(GiB)": 84.21, + "step": 40, + "token_acc": 0.8408800826596973, + "train_speed(iter/s)": 0.032903 + }, + { + "epoch": 0.05913272010512484, + "grad_norm": 0.4675757586956024, + "learning_rate": 6.521739130434783e-06, + "loss": 0.6117629528045654, + "memory(GiB)": 84.21, + "step": 45, + "token_acc": 0.8229637648856907, + "train_speed(iter/s)": 0.033031 + }, + { + "epoch": 0.0657030223390276, + "grad_norm": 0.4107670783996582, + "learning_rate": 7.246376811594203e-06, + "loss": 0.5980951309204101, + "memory(GiB)": 84.21, + "step": 50, + "token_acc": 0.8370074882776961, + "train_speed(iter/s)": 0.033154 + }, + { + "epoch": 0.07227332457293036, + "grad_norm": 0.37559813261032104, + "learning_rate": 7.971014492753623e-06, + "loss": 0.5822395801544189, + "memory(GiB)": 84.21, + "step": 55, + "token_acc": 0.8410229088971763, + "train_speed(iter/s)": 0.033226 + }, + { + "epoch": 0.07884362680683311, + "grad_norm": 0.4154057502746582, + "learning_rate": 8.695652173913044e-06, + "loss": 0.5758543968200683, + "memory(GiB)": 84.21, + "step": 60, + "token_acc": 0.8529284789178299, + "train_speed(iter/s)": 0.033321 + }, + { + "epoch": 0.08541392904073587, + "grad_norm": 0.42753836512565613, + "learning_rate": 9.420289855072464e-06, + "loss": 0.5728845596313477, + "memory(GiB)": 84.21, + "step": 65, + "token_acc": 0.8387224954055531, + "train_speed(iter/s)": 0.033349 + }, + { + "epoch": 0.09198423127463863, + "grad_norm": 0.42457839846611023, + "learning_rate": 9.999994966333388e-06, + "loss": 0.564476203918457, + "memory(GiB)": 84.21, + "step": 70, + "token_acc": 0.8388107377603047, + "train_speed(iter/s)": 0.033448 + }, + { + "epoch": 0.09855453350854139, + "grad_norm": 0.36299943923950195, + "learning_rate": 9.999818789066164e-06, + "loss": 0.555049991607666, + "memory(GiB)": 84.21, + "step": 75, + "token_acc": 0.8408914844169001, + "train_speed(iter/s)": 0.033455 + }, + { + "epoch": 0.10512483574244415, + "grad_norm": 0.3913320302963257, + "learning_rate": 9.99939093860338e-06, + "loss": 0.5565983772277832, + "memory(GiB)": 84.21, + "step": 80, + "token_acc": 0.8320722084099016, + "train_speed(iter/s)": 0.03343 + }, + { + "epoch": 0.1116951379763469, + "grad_norm": 0.36235758662223816, + "learning_rate": 9.998711436481519e-06, + "loss": 0.5525528907775878, + "memory(GiB)": 84.21, + "step": 85, + "token_acc": 0.8355197947641537, + "train_speed(iter/s)": 0.033468 + }, + { + "epoch": 0.11826544021024968, + "grad_norm": 0.38250720500946045, + "learning_rate": 9.99778031690431e-06, + "loss": 0.5516636848449707, + "memory(GiB)": 84.21, + "step": 90, + "token_acc": 0.8384240551461849, + "train_speed(iter/s)": 0.033529 + }, + { + "epoch": 0.12483574244415244, + "grad_norm": 0.37428662180900574, + "learning_rate": 9.996597626741023e-06, + "loss": 0.5437192440032959, + "memory(GiB)": 84.21, + "step": 95, + "token_acc": 0.8406958239587334, + "train_speed(iter/s)": 0.033585 + }, + { + "epoch": 0.1314060446780552, + "grad_norm": 0.3782438635826111, + "learning_rate": 9.995163425524097e-06, + "loss": 0.5443241119384765, + "memory(GiB)": 84.21, + "step": 100, + "token_acc": 0.834660268295343, + "train_speed(iter/s)": 0.03366 + }, + { + "epoch": 0.13797634691195795, + "grad_norm": 0.38486766815185547, + "learning_rate": 9.993477785446151e-06, + "loss": 0.5410516738891602, + "memory(GiB)": 86.38, + "step": 105, + "token_acc": 0.8449431198379305, + "train_speed(iter/s)": 0.033686 + }, + { + "epoch": 0.1445466491458607, + "grad_norm": 0.38819748163223267, + "learning_rate": 9.991540791356342e-06, + "loss": 0.5370469093322754, + "memory(GiB)": 86.38, + "step": 110, + "token_acc": 0.8543880362062181, + "train_speed(iter/s)": 0.033712 + }, + { + "epoch": 0.15111695137976347, + "grad_norm": 0.39973896741867065, + "learning_rate": 9.989352540756103e-06, + "loss": 0.5358469486236572, + "memory(GiB)": 86.38, + "step": 115, + "token_acc": 0.8282656701206047, + "train_speed(iter/s)": 0.033757 + }, + { + "epoch": 0.15768725361366623, + "grad_norm": 0.34199291467666626, + "learning_rate": 9.986913143794232e-06, + "loss": 0.5350133895874023, + "memory(GiB)": 86.38, + "step": 120, + "token_acc": 0.8469218989280245, + "train_speed(iter/s)": 0.033788 + }, + { + "epoch": 0.164257555847569, + "grad_norm": 0.41273701190948486, + "learning_rate": 9.984222723261344e-06, + "loss": 0.5307738304138183, + "memory(GiB)": 86.38, + "step": 125, + "token_acc": 0.8481556913328807, + "train_speed(iter/s)": 0.033799 + }, + { + "epoch": 0.17082785808147175, + "grad_norm": 0.4566132724285126, + "learning_rate": 9.981281414583693e-06, + "loss": 0.5298214912414551, + "memory(GiB)": 86.38, + "step": 130, + "token_acc": 0.840121171322787, + "train_speed(iter/s)": 0.033812 + }, + { + "epoch": 0.1773981603153745, + "grad_norm": 0.3990865647792816, + "learning_rate": 9.978089365816357e-06, + "loss": 0.5284788131713867, + "memory(GiB)": 86.38, + "step": 135, + "token_acc": 0.844040404040404, + "train_speed(iter/s)": 0.033875 + }, + { + "epoch": 0.18396846254927726, + "grad_norm": 0.36237913370132446, + "learning_rate": 9.974646737635781e-06, + "loss": 0.530832576751709, + "memory(GiB)": 86.38, + "step": 140, + "token_acc": 0.8354903823319877, + "train_speed(iter/s)": 0.033929 + }, + { + "epoch": 0.19053876478318002, + "grad_norm": 0.4100829064846039, + "learning_rate": 9.970953703331692e-06, + "loss": 0.5266030788421631, + "memory(GiB)": 86.38, + "step": 145, + "token_acc": 0.8457928481723842, + "train_speed(iter/s)": 0.033964 + }, + { + "epoch": 0.19710906701708278, + "grad_norm": 0.3652012050151825, + "learning_rate": 9.967010448798376e-06, + "loss": 0.5251831531524658, + "memory(GiB)": 86.38, + "step": 150, + "token_acc": 0.8387645380732939, + "train_speed(iter/s)": 0.033991 + }, + { + "epoch": 0.20367936925098554, + "grad_norm": 0.39163169264793396, + "learning_rate": 9.962817172525323e-06, + "loss": 0.5267560958862305, + "memory(GiB)": 86.38, + "step": 155, + "token_acc": 0.8567956034664975, + "train_speed(iter/s)": 0.03401 + }, + { + "epoch": 0.2102496714848883, + "grad_norm": 0.41479626297950745, + "learning_rate": 9.958374085587228e-06, + "loss": 0.519415283203125, + "memory(GiB)": 86.38, + "step": 160, + "token_acc": 0.8440078352228884, + "train_speed(iter/s)": 0.034022 + }, + { + "epoch": 0.21681997371879105, + "grad_norm": 0.3581003248691559, + "learning_rate": 9.953681411633376e-06, + "loss": 0.5208570480346679, + "memory(GiB)": 86.38, + "step": 165, + "token_acc": 0.8545801997287634, + "train_speed(iter/s)": 0.034026 + }, + { + "epoch": 0.2233902759526938, + "grad_norm": 0.44018271565437317, + "learning_rate": 9.948739386876376e-06, + "loss": 0.5224351406097412, + "memory(GiB)": 86.38, + "step": 170, + "token_acc": 0.8500869565217392, + "train_speed(iter/s)": 0.034009 + }, + { + "epoch": 0.22996057818659657, + "grad_norm": 0.40481236577033997, + "learning_rate": 9.943548260080277e-06, + "loss": 0.5226601600646973, + "memory(GiB)": 86.38, + "step": 175, + "token_acc": 0.8479028560807881, + "train_speed(iter/s)": 0.033953 + }, + { + "epoch": 0.23653088042049936, + "grad_norm": 0.3878992199897766, + "learning_rate": 9.938108292548044e-06, + "loss": 0.5180087566375733, + "memory(GiB)": 86.38, + "step": 180, + "token_acc": 0.8407539640869474, + "train_speed(iter/s)": 0.033944 + }, + { + "epoch": 0.24310118265440211, + "grad_norm": 0.3512628674507141, + "learning_rate": 9.932419758108403e-06, + "loss": 0.5186543464660645, + "memory(GiB)": 86.38, + "step": 185, + "token_acc": 0.8444778362133734, + "train_speed(iter/s)": 0.03394 + }, + { + "epoch": 0.24967148488830487, + "grad_norm": 0.4015056788921356, + "learning_rate": 9.92648294310206e-06, + "loss": 0.5142830848693848, + "memory(GiB)": 86.38, + "step": 190, + "token_acc": 0.8457510387614549, + "train_speed(iter/s)": 0.033952 + }, + { + "epoch": 0.25624178712220763, + "grad_norm": 0.4097774624824524, + "learning_rate": 9.920298146367287e-06, + "loss": 0.5161718368530274, + "memory(GiB)": 86.38, + "step": 195, + "token_acc": 0.8548741619958237, + "train_speed(iter/s)": 0.033951 + }, + { + "epoch": 0.2628120893561104, + "grad_norm": 0.3440331816673279, + "learning_rate": 9.913865679224876e-06, + "loss": 0.5165815353393555, + "memory(GiB)": 86.38, + "step": 200, + "token_acc": 0.8569646310273844, + "train_speed(iter/s)": 0.033922 + }, + { + "epoch": 0.26938239159001315, + "grad_norm": 0.37692517042160034, + "learning_rate": 9.907185865462476e-06, + "loss": 0.5182360649108887, + "memory(GiB)": 86.38, + "step": 205, + "token_acc": 0.85995085995086, + "train_speed(iter/s)": 0.033952 + }, + { + "epoch": 0.2759526938239159, + "grad_norm": 0.37486883997917175, + "learning_rate": 9.90025904131829e-06, + "loss": 0.5185696125030518, + "memory(GiB)": 86.38, + "step": 210, + "token_acc": 0.8403378378378379, + "train_speed(iter/s)": 0.033943 + }, + { + "epoch": 0.28252299605781866, + "grad_norm": 0.37737980484962463, + "learning_rate": 9.893085555464143e-06, + "loss": 0.5123628616333008, + "memory(GiB)": 86.38, + "step": 215, + "token_acc": 0.8524216190921853, + "train_speed(iter/s)": 0.033965 + }, + { + "epoch": 0.2890932982917214, + "grad_norm": 0.4532665014266968, + "learning_rate": 9.885665768987947e-06, + "loss": 0.5087783813476563, + "memory(GiB)": 86.38, + "step": 220, + "token_acc": 0.8544157346702661, + "train_speed(iter/s)": 0.033986 + }, + { + "epoch": 0.2956636005256242, + "grad_norm": 0.3860194683074951, + "learning_rate": 9.878000055375512e-06, + "loss": 0.5123799324035645, + "memory(GiB)": 86.38, + "step": 225, + "token_acc": 0.842546362339515, + "train_speed(iter/s)": 0.03397 + }, + { + "epoch": 0.30223390275952694, + "grad_norm": 0.3862650692462921, + "learning_rate": 9.87008880049175e-06, + "loss": 0.50973482131958, + "memory(GiB)": 86.38, + "step": 230, + "token_acc": 0.8520688830423344, + "train_speed(iter/s)": 0.033961 + }, + { + "epoch": 0.3088042049934297, + "grad_norm": 0.37506306171417236, + "learning_rate": 9.861932402561253e-06, + "loss": 0.5082354545593262, + "memory(GiB)": 86.38, + "step": 235, + "token_acc": 0.8468783963289458, + "train_speed(iter/s)": 0.033958 + }, + { + "epoch": 0.31537450722733246, + "grad_norm": 0.3809449076652527, + "learning_rate": 9.853531272148248e-06, + "loss": 0.5086749076843262, + "memory(GiB)": 86.38, + "step": 240, + "token_acc": 0.8515756420320736, + "train_speed(iter/s)": 0.033967 + }, + { + "epoch": 0.3219448094612352, + "grad_norm": 0.4132705628871918, + "learning_rate": 9.844885832135928e-06, + "loss": 0.5116987228393555, + "memory(GiB)": 86.38, + "step": 245, + "token_acc": 0.8335253065925876, + "train_speed(iter/s)": 0.033937 + }, + { + "epoch": 0.328515111695138, + "grad_norm": 0.4488829970359802, + "learning_rate": 9.83599651770517e-06, + "loss": 0.5052802085876464, + "memory(GiB)": 86.38, + "step": 250, + "token_acc": 0.8370962333743154, + "train_speed(iter/s)": 0.033926 + }, + { + "epoch": 0.33508541392904073, + "grad_norm": 0.39081957936286926, + "learning_rate": 9.826863776312621e-06, + "loss": 0.5067138671875, + "memory(GiB)": 86.38, + "step": 255, + "token_acc": 0.8409980116734013, + "train_speed(iter/s)": 0.033929 + }, + { + "epoch": 0.3416557161629435, + "grad_norm": 0.35503068566322327, + "learning_rate": 9.817488067668186e-06, + "loss": 0.503065824508667, + "memory(GiB)": 86.38, + "step": 260, + "token_acc": 0.8459525843656557, + "train_speed(iter/s)": 0.033938 + }, + { + "epoch": 0.34822601839684625, + "grad_norm": 0.36853545904159546, + "learning_rate": 9.807869863711878e-06, + "loss": 0.5073853015899659, + "memory(GiB)": 86.38, + "step": 265, + "token_acc": 0.8587078651685394, + "train_speed(iter/s)": 0.033943 + }, + { + "epoch": 0.354796320630749, + "grad_norm": 0.36008450388908386, + "learning_rate": 9.798009648590073e-06, + "loss": 0.5045706748962402, + "memory(GiB)": 86.38, + "step": 270, + "token_acc": 0.861764007597341, + "train_speed(iter/s)": 0.033959 + }, + { + "epoch": 0.36136662286465177, + "grad_norm": 0.3388707637786865, + "learning_rate": 9.787907918631125e-06, + "loss": 0.5048944473266601, + "memory(GiB)": 86.38, + "step": 275, + "token_acc": 0.8515256760109154, + "train_speed(iter/s)": 0.033951 + }, + { + "epoch": 0.3679369250985545, + "grad_norm": 0.36713555455207825, + "learning_rate": 9.777565182320396e-06, + "loss": 0.501971435546875, + "memory(GiB)": 86.38, + "step": 280, + "token_acc": 0.8557236741555861, + "train_speed(iter/s)": 0.033953 + }, + { + "epoch": 0.3745072273324573, + "grad_norm": 0.3958764970302582, + "learning_rate": 9.766981960274653e-06, + "loss": 0.5066198825836181, + "memory(GiB)": 86.38, + "step": 285, + "token_acc": 0.8477457935158585, + "train_speed(iter/s)": 0.033957 + }, + { + "epoch": 0.38107752956636004, + "grad_norm": 0.3786795139312744, + "learning_rate": 9.756158785215866e-06, + "loss": 0.5043275833129883, + "memory(GiB)": 86.38, + "step": 290, + "token_acc": 0.8627160493827161, + "train_speed(iter/s)": 0.033965 + }, + { + "epoch": 0.3876478318002628, + "grad_norm": 0.3754529058933258, + "learning_rate": 9.745096201944391e-06, + "loss": 0.5016345977783203, + "memory(GiB)": 86.38, + "step": 295, + "token_acc": 0.8560241897968678, + "train_speed(iter/s)": 0.033947 + }, + { + "epoch": 0.39421813403416556, + "grad_norm": 0.32459399104118347, + "learning_rate": 9.733794767311545e-06, + "loss": 0.5030747890472412, + "memory(GiB)": 86.38, + "step": 300, + "token_acc": 0.8558913059618383, + "train_speed(iter/s)": 0.033938 + }, + { + "epoch": 0.4007884362680683, + "grad_norm": 0.37864384055137634, + "learning_rate": 9.72225505019158e-06, + "loss": 0.5041725158691406, + "memory(GiB)": 86.38, + "step": 305, + "token_acc": 0.8588684699566385, + "train_speed(iter/s)": 0.033755 + }, + { + "epoch": 0.4073587385019711, + "grad_norm": 0.39976298809051514, + "learning_rate": 9.710477631453044e-06, + "loss": 0.49967308044433595, + "memory(GiB)": 86.38, + "step": 310, + "token_acc": 0.8473580002474941, + "train_speed(iter/s)": 0.033765 + }, + { + "epoch": 0.41392904073587383, + "grad_norm": 0.4079159200191498, + "learning_rate": 9.698463103929542e-06, + "loss": 0.5030883312225342, + "memory(GiB)": 86.38, + "step": 315, + "token_acc": 0.8564925878083287, + "train_speed(iter/s)": 0.033776 + }, + { + "epoch": 0.4204993429697766, + "grad_norm": 0.4643027186393738, + "learning_rate": 9.686212072389904e-06, + "loss": 0.5033651351928711, + "memory(GiB)": 86.38, + "step": 320, + "token_acc": 0.8536913611894386, + "train_speed(iter/s)": 0.033774 + }, + { + "epoch": 0.42706964520367935, + "grad_norm": 0.37644535303115845, + "learning_rate": 9.673725153507727e-06, + "loss": 0.4978950500488281, + "memory(GiB)": 86.38, + "step": 325, + "token_acc": 0.8490523718739487, + "train_speed(iter/s)": 0.033788 + }, + { + "epoch": 0.4336399474375821, + "grad_norm": 0.3504714369773865, + "learning_rate": 9.66100297583035e-06, + "loss": 0.503141212463379, + "memory(GiB)": 86.38, + "step": 330, + "token_acc": 0.8508279539713725, + "train_speed(iter/s)": 0.033789 + }, + { + "epoch": 0.44021024967148487, + "grad_norm": 0.3424312174320221, + "learning_rate": 9.6480461797472e-06, + "loss": 0.5007185459136962, + "memory(GiB)": 86.38, + "step": 335, + "token_acc": 0.8463611859838275, + "train_speed(iter/s)": 0.03379 + }, + { + "epoch": 0.4467805519053876, + "grad_norm": 0.3270646631717682, + "learning_rate": 9.63485541745757e-06, + "loss": 0.4969663143157959, + "memory(GiB)": 86.38, + "step": 340, + "token_acc": 0.8463258785942492, + "train_speed(iter/s)": 0.033791 + }, + { + "epoch": 0.4533508541392904, + "grad_norm": 0.3828498423099518, + "learning_rate": 9.62143135293779e-06, + "loss": 0.49769058227539065, + "memory(GiB)": 86.38, + "step": 345, + "token_acc": 0.8501317996645099, + "train_speed(iter/s)": 0.033805 + }, + { + "epoch": 0.45992115637319314, + "grad_norm": 0.38863444328308105, + "learning_rate": 9.607774661907783e-06, + "loss": 0.49465193748474123, + "memory(GiB)": 86.38, + "step": 350, + "token_acc": 0.8597788232418891, + "train_speed(iter/s)": 0.033787 + }, + { + "epoch": 0.4664914586070959, + "grad_norm": 0.34471848607063293, + "learning_rate": 9.593886031797081e-06, + "loss": 0.4969064712524414, + "memory(GiB)": 86.38, + "step": 355, + "token_acc": 0.8570174985804986, + "train_speed(iter/s)": 0.033791 + }, + { + "epoch": 0.4730617608409987, + "grad_norm": 0.32791054248809814, + "learning_rate": 9.579766161710209e-06, + "loss": 0.5029778480529785, + "memory(GiB)": 86.38, + "step": 360, + "token_acc": 0.8601830935679468, + "train_speed(iter/s)": 0.033798 + }, + { + "epoch": 0.47963206307490147, + "grad_norm": 0.3596540093421936, + "learning_rate": 9.565415762391485e-06, + "loss": 0.49364757537841797, + "memory(GiB)": 86.38, + "step": 365, + "token_acc": 0.8599964223958023, + "train_speed(iter/s)": 0.033808 + }, + { + "epoch": 0.48620236530880423, + "grad_norm": 0.3652913570404053, + "learning_rate": 9.550835556189264e-06, + "loss": 0.4974925994873047, + "memory(GiB)": 86.38, + "step": 370, + "token_acc": 0.8650134518657153, + "train_speed(iter/s)": 0.033823 + }, + { + "epoch": 0.492772667542707, + "grad_norm": 0.3590964674949646, + "learning_rate": 9.536026277019562e-06, + "loss": 0.49645166397094725, + "memory(GiB)": 86.38, + "step": 375, + "token_acc": 0.8576561956647734, + "train_speed(iter/s)": 0.033837 + }, + { + "epoch": 0.49934296977660975, + "grad_norm": 0.3402176797389984, + "learning_rate": 9.520988670329114e-06, + "loss": 0.4980118751525879, + "memory(GiB)": 86.38, + "step": 380, + "token_acc": 0.8511267926246301, + "train_speed(iter/s)": 0.033834 + }, + { + "epoch": 0.5059132720105125, + "grad_norm": 0.3765329122543335, + "learning_rate": 9.505723493057862e-06, + "loss": 0.49571590423583983, + "memory(GiB)": 86.38, + "step": 385, + "token_acc": 0.8535285568175701, + "train_speed(iter/s)": 0.033843 + }, + { + "epoch": 0.5124835742444153, + "grad_norm": 0.3668725788593292, + "learning_rate": 9.490231513600842e-06, + "loss": 0.4947934150695801, + "memory(GiB)": 86.38, + "step": 390, + "token_acc": 0.8614418845456899, + "train_speed(iter/s)": 0.033846 + }, + { + "epoch": 0.519053876478318, + "grad_norm": 0.3342001140117645, + "learning_rate": 9.474513511769513e-06, + "loss": 0.4992271900177002, + "memory(GiB)": 86.38, + "step": 395, + "token_acc": 0.8471820311423454, + "train_speed(iter/s)": 0.033854 + }, + { + "epoch": 0.5256241787122208, + "grad_norm": 0.3347104787826538, + "learning_rate": 9.458570278752501e-06, + "loss": 0.4942744731903076, + "memory(GiB)": 86.38, + "step": 400, + "token_acc": 0.8615504682622268, + "train_speed(iter/s)": 0.033853 + }, + { + "epoch": 0.5321944809461235, + "grad_norm": 0.3521013855934143, + "learning_rate": 9.442402617075765e-06, + "loss": 0.4942043304443359, + "memory(GiB)": 86.38, + "step": 405, + "token_acc": 0.8467462686567164, + "train_speed(iter/s)": 0.033851 + }, + { + "epoch": 0.5387647831800263, + "grad_norm": 0.35290876030921936, + "learning_rate": 9.426011340562222e-06, + "loss": 0.4902125358581543, + "memory(GiB)": 86.38, + "step": 410, + "token_acc": 0.8508040849865007, + "train_speed(iter/s)": 0.033855 + }, + { + "epoch": 0.545335085413929, + "grad_norm": 0.3326910436153412, + "learning_rate": 9.409397274290756e-06, + "loss": 0.4964996337890625, + "memory(GiB)": 86.38, + "step": 415, + "token_acc": 0.8513913558318532, + "train_speed(iter/s)": 0.03386 + }, + { + "epoch": 0.5519053876478318, + "grad_norm": 0.3406986892223358, + "learning_rate": 9.392561254554712e-06, + "loss": 0.4953129768371582, + "memory(GiB)": 86.38, + "step": 420, + "token_acc": 0.8444802578565673, + "train_speed(iter/s)": 0.03387 + }, + { + "epoch": 0.5584756898817346, + "grad_norm": 0.33178892731666565, + "learning_rate": 9.375504128819779e-06, + "loss": 0.4913620471954346, + "memory(GiB)": 86.38, + "step": 425, + "token_acc": 0.8482620320855615, + "train_speed(iter/s)": 0.033876 + }, + { + "epoch": 0.5650459921156373, + "grad_norm": 0.33092719316482544, + "learning_rate": 9.358226755681342e-06, + "loss": 0.4906820297241211, + "memory(GiB)": 86.38, + "step": 430, + "token_acc": 0.8481144343302991, + "train_speed(iter/s)": 0.033885 + }, + { + "epoch": 0.5716162943495401, + "grad_norm": 0.34297481179237366, + "learning_rate": 9.340730004821266e-06, + "loss": 0.49637956619262696, + "memory(GiB)": 86.38, + "step": 435, + "token_acc": 0.8484118291347207, + "train_speed(iter/s)": 0.03389 + }, + { + "epoch": 0.5781865965834428, + "grad_norm": 0.32844671607017517, + "learning_rate": 9.323014756964104e-06, + "loss": 0.4932809352874756, + "memory(GiB)": 86.38, + "step": 440, + "token_acc": 0.8545686404967842, + "train_speed(iter/s)": 0.03389 + }, + { + "epoch": 0.5847568988173456, + "grad_norm": 0.3436914086341858, + "learning_rate": 9.305081903832784e-06, + "loss": 0.49259676933288576, + "memory(GiB)": 86.38, + "step": 445, + "token_acc": 0.8611830312686716, + "train_speed(iter/s)": 0.03388 + }, + { + "epoch": 0.5913272010512484, + "grad_norm": 0.32494404911994934, + "learning_rate": 9.286932348103716e-06, + "loss": 0.4914635181427002, + "memory(GiB)": 86.38, + "step": 450, + "token_acc": 0.8426534209261336, + "train_speed(iter/s)": 0.033884 + }, + { + "epoch": 0.5978975032851511, + "grad_norm": 0.31298619508743286, + "learning_rate": 9.268567003361341e-06, + "loss": 0.49518795013427735, + "memory(GiB)": 86.38, + "step": 455, + "token_acc": 0.8555702841334794, + "train_speed(iter/s)": 0.033881 + }, + { + "epoch": 0.6044678055190539, + "grad_norm": 0.3161918818950653, + "learning_rate": 9.249986794052168e-06, + "loss": 0.4909826278686523, + "memory(GiB)": 86.38, + "step": 460, + "token_acc": 0.8514960996623588, + "train_speed(iter/s)": 0.033883 + }, + { + "epoch": 0.6110381077529566, + "grad_norm": 0.32942476868629456, + "learning_rate": 9.231192655438222e-06, + "loss": 0.49195499420166017, + "memory(GiB)": 86.38, + "step": 465, + "token_acc": 0.8575532549189658, + "train_speed(iter/s)": 0.033886 + }, + { + "epoch": 0.6176084099868594, + "grad_norm": 0.3199692666530609, + "learning_rate": 9.21218553354997e-06, + "loss": 0.48216657638549804, + "memory(GiB)": 86.38, + "step": 470, + "token_acc": 0.8621787172711987, + "train_speed(iter/s)": 0.033885 + }, + { + "epoch": 0.6241787122207622, + "grad_norm": 0.33308735489845276, + "learning_rate": 9.192966385138714e-06, + "loss": 0.49132823944091797, + "memory(GiB)": 86.38, + "step": 475, + "token_acc": 0.8502202643171806, + "train_speed(iter/s)": 0.033894 + }, + { + "epoch": 0.6307490144546649, + "grad_norm": 0.34672704339027405, + "learning_rate": 9.17353617762841e-06, + "loss": 0.49529352188110354, + "memory(GiB)": 86.38, + "step": 480, + "token_acc": 0.8439504061564771, + "train_speed(iter/s)": 0.033881 + }, + { + "epoch": 0.6373193166885677, + "grad_norm": 0.391335666179657, + "learning_rate": 9.153895889066988e-06, + "loss": 0.4896709442138672, + "memory(GiB)": 86.38, + "step": 485, + "token_acc": 0.8555057299451918, + "train_speed(iter/s)": 0.033888 + }, + { + "epoch": 0.6438896189224704, + "grad_norm": 0.32497450709342957, + "learning_rate": 9.134046508077116e-06, + "loss": 0.48676557540893556, + "memory(GiB)": 86.38, + "step": 490, + "token_acc": 0.8605180168536422, + "train_speed(iter/s)": 0.033894 + }, + { + "epoch": 0.6504599211563732, + "grad_norm": 0.3421924114227295, + "learning_rate": 9.113989033806434e-06, + "loss": 0.49125194549560547, + "memory(GiB)": 86.38, + "step": 495, + "token_acc": 0.8528348991524867, + "train_speed(iter/s)": 0.033897 + }, + { + "epoch": 0.657030223390276, + "grad_norm": 0.3321194350719452, + "learning_rate": 9.093724475877262e-06, + "loss": 0.4898836135864258, + "memory(GiB)": 86.38, + "step": 500, + "token_acc": 0.8522178943084704, + "train_speed(iter/s)": 0.033898 + }, + { + "epoch": 0.6636005256241787, + "grad_norm": 0.32021504640579224, + "learning_rate": 9.073253854335777e-06, + "loss": 0.48738608360290525, + "memory(GiB)": 86.38, + "step": 505, + "token_acc": 0.8417130814391088, + "train_speed(iter/s)": 0.033901 + }, + { + "epoch": 0.6701708278580815, + "grad_norm": 0.32002168893814087, + "learning_rate": 9.052578199600675e-06, + "loss": 0.49272966384887695, + "memory(GiB)": 86.38, + "step": 510, + "token_acc": 0.8602219376867264, + "train_speed(iter/s)": 0.033901 + }, + { + "epoch": 0.6767411300919842, + "grad_norm": 0.31045857071876526, + "learning_rate": 9.03169855241129e-06, + "loss": 0.4898507118225098, + "memory(GiB)": 86.38, + "step": 515, + "token_acc": 0.8575417434522812, + "train_speed(iter/s)": 0.033907 + }, + { + "epoch": 0.683311432325887, + "grad_norm": 0.3088115453720093, + "learning_rate": 9.01061596377522e-06, + "loss": 0.4901163578033447, + "memory(GiB)": 86.38, + "step": 520, + "token_acc": 0.8511583445793972, + "train_speed(iter/s)": 0.033899 + }, + { + "epoch": 0.6898817345597897, + "grad_norm": 0.34883564710617065, + "learning_rate": 8.989331494915417e-06, + "loss": 0.49116034507751466, + "memory(GiB)": 86.38, + "step": 525, + "token_acc": 0.8551282847735603, + "train_speed(iter/s)": 0.033901 + }, + { + "epoch": 0.6964520367936925, + "grad_norm": 0.32082292437553406, + "learning_rate": 8.967846217216771e-06, + "loss": 0.48834967613220215, + "memory(GiB)": 86.38, + "step": 530, + "token_acc": 0.8506810071870131, + "train_speed(iter/s)": 0.033906 + }, + { + "epoch": 0.7030223390275953, + "grad_norm": 0.3607739806175232, + "learning_rate": 8.946161212172172e-06, + "loss": 0.48694772720336915, + "memory(GiB)": 86.38, + "step": 535, + "token_acc": 0.8500481340959284, + "train_speed(iter/s)": 0.033908 + }, + { + "epoch": 0.709592641261498, + "grad_norm": 0.3413682281970978, + "learning_rate": 8.924277571328091e-06, + "loss": 0.48662757873535156, + "memory(GiB)": 86.38, + "step": 540, + "token_acc": 0.8603295945861269, + "train_speed(iter/s)": 0.033909 + }, + { + "epoch": 0.7161629434954008, + "grad_norm": 0.3510483503341675, + "learning_rate": 8.902196396229605e-06, + "loss": 0.48763227462768555, + "memory(GiB)": 86.38, + "step": 545, + "token_acc": 0.8508162458340395, + "train_speed(iter/s)": 0.03392 + }, + { + "epoch": 0.7227332457293035, + "grad_norm": 0.31174516677856445, + "learning_rate": 8.879918798364984e-06, + "loss": 0.48741979598999025, + "memory(GiB)": 86.38, + "step": 550, + "token_acc": 0.8652033455768465, + "train_speed(iter/s)": 0.033921 + }, + { + "epoch": 0.7293035479632063, + "grad_norm": 0.37009692192077637, + "learning_rate": 8.857445899109716e-06, + "loss": 0.48439769744873046, + "memory(GiB)": 86.38, + "step": 555, + "token_acc": 0.8583586264357556, + "train_speed(iter/s)": 0.033917 + }, + { + "epoch": 0.735873850197109, + "grad_norm": 0.32648202776908875, + "learning_rate": 8.83477882967007e-06, + "loss": 0.4858428955078125, + "memory(GiB)": 86.38, + "step": 560, + "token_acc": 0.8660503897045496, + "train_speed(iter/s)": 0.03392 + }, + { + "epoch": 0.7424441524310118, + "grad_norm": 0.3123824894428253, + "learning_rate": 8.81191873102616e-06, + "loss": 0.4876396179199219, + "memory(GiB)": 86.38, + "step": 565, + "token_acc": 0.8565744150136596, + "train_speed(iter/s)": 0.033919 + }, + { + "epoch": 0.7490144546649146, + "grad_norm": 0.3010823428630829, + "learning_rate": 8.788866753874504e-06, + "loss": 0.48569602966308595, + "memory(GiB)": 86.38, + "step": 570, + "token_acc": 0.846796506265936, + "train_speed(iter/s)": 0.033922 + }, + { + "epoch": 0.7555847568988173, + "grad_norm": 0.32120397686958313, + "learning_rate": 8.765624058570106e-06, + "loss": 0.4865298271179199, + "memory(GiB)": 86.38, + "step": 575, + "token_acc": 0.8490352484639431, + "train_speed(iter/s)": 0.033924 + }, + { + "epoch": 0.7621550591327201, + "grad_norm": 0.33722633123397827, + "learning_rate": 8.742191815068048e-06, + "loss": 0.4867109298706055, + "memory(GiB)": 86.38, + "step": 580, + "token_acc": 0.8612191958495461, + "train_speed(iter/s)": 0.033917 + }, + { + "epoch": 0.7687253613666228, + "grad_norm": 0.32410791516304016, + "learning_rate": 8.718571202864598e-06, + "loss": 0.4851318359375, + "memory(GiB)": 86.38, + "step": 585, + "token_acc": 0.8603109706993743, + "train_speed(iter/s)": 0.033921 + }, + { + "epoch": 0.7752956636005256, + "grad_norm": 0.326885461807251, + "learning_rate": 8.69476341093784e-06, + "loss": 0.4805999755859375, + "memory(GiB)": 86.38, + "step": 590, + "token_acc": 0.8454463103616473, + "train_speed(iter/s)": 0.033929 + }, + { + "epoch": 0.7818659658344284, + "grad_norm": 0.3168047070503235, + "learning_rate": 8.67076963768782e-06, + "loss": 0.48687124252319336, + "memory(GiB)": 86.38, + "step": 595, + "token_acc": 0.8451851851851852, + "train_speed(iter/s)": 0.033931 + }, + { + "epoch": 0.7884362680683311, + "grad_norm": 0.3170868456363678, + "learning_rate": 8.646591090876225e-06, + "loss": 0.48125357627868653, + "memory(GiB)": 86.38, + "step": 600, + "token_acc": 0.8502272038776129, + "train_speed(iter/s)": 0.033925 + }, + { + "epoch": 0.7950065703022339, + "grad_norm": 0.3512137532234192, + "learning_rate": 8.622228987565597e-06, + "loss": 0.48726634979248046, + "memory(GiB)": 86.38, + "step": 605, + "token_acc": 0.8433869839048286, + "train_speed(iter/s)": 0.033836 + }, + { + "epoch": 0.8015768725361366, + "grad_norm": 0.34979116916656494, + "learning_rate": 8.597684554058053e-06, + "loss": 0.4839656829833984, + "memory(GiB)": 86.38, + "step": 610, + "token_acc": 0.8488303749853062, + "train_speed(iter/s)": 0.033836 + }, + { + "epoch": 0.8081471747700394, + "grad_norm": 0.33397239446640015, + "learning_rate": 8.572959025833573e-06, + "loss": 0.4833966255187988, + "memory(GiB)": 86.38, + "step": 615, + "token_acc": 0.8552229366501528, + "train_speed(iter/s)": 0.03383 + }, + { + "epoch": 0.8147174770039421, + "grad_norm": 0.31006062030792236, + "learning_rate": 8.548053647487808e-06, + "loss": 0.4889863967895508, + "memory(GiB)": 86.38, + "step": 620, + "token_acc": 0.8452540855160062, + "train_speed(iter/s)": 0.033832 + }, + { + "epoch": 0.8212877792378449, + "grad_norm": 0.3102535307407379, + "learning_rate": 8.522969672669419e-06, + "loss": 0.48553314208984377, + "memory(GiB)": 86.38, + "step": 625, + "token_acc": 0.8545072273324573, + "train_speed(iter/s)": 0.033831 + }, + { + "epoch": 0.8278580814717477, + "grad_norm": 0.3058727979660034, + "learning_rate": 8.49770836401699e-06, + "loss": 0.47721147537231445, + "memory(GiB)": 86.38, + "step": 630, + "token_acc": 0.8590224444841341, + "train_speed(iter/s)": 0.033834 + }, + { + "epoch": 0.8344283837056504, + "grad_norm": 0.3120846152305603, + "learning_rate": 8.47227099309546e-06, + "loss": 0.48225932121276854, + "memory(GiB)": 86.38, + "step": 635, + "token_acc": 0.854253918870408, + "train_speed(iter/s)": 0.033836 + }, + { + "epoch": 0.8409986859395532, + "grad_norm": 0.3198888301849365, + "learning_rate": 8.446658840332115e-06, + "loss": 0.4882974624633789, + "memory(GiB)": 86.38, + "step": 640, + "token_acc": 0.8472647079746746, + "train_speed(iter/s)": 0.033841 + }, + { + "epoch": 0.8475689881734559, + "grad_norm": 0.3015914857387543, + "learning_rate": 8.420873194952153e-06, + "loss": 0.483825159072876, + "memory(GiB)": 86.38, + "step": 645, + "token_acc": 0.8493750329623965, + "train_speed(iter/s)": 0.033848 + }, + { + "epoch": 0.8541392904073587, + "grad_norm": 0.33040115237236023, + "learning_rate": 8.394915354913763e-06, + "loss": 0.48243865966796873, + "memory(GiB)": 86.38, + "step": 650, + "token_acc": 0.8504132231404958, + "train_speed(iter/s)": 0.033849 + }, + { + "epoch": 0.8607095926412615, + "grad_norm": 0.3229842782020569, + "learning_rate": 8.368786626842815e-06, + "loss": 0.4843127250671387, + "memory(GiB)": 86.38, + "step": 655, + "token_acc": 0.8529356357927786, + "train_speed(iter/s)": 0.033853 + }, + { + "epoch": 0.8672798948751642, + "grad_norm": 0.31925421953201294, + "learning_rate": 8.342488325967068e-06, + "loss": 0.48301048278808595, + "memory(GiB)": 86.38, + "step": 660, + "token_acc": 0.8582582960770733, + "train_speed(iter/s)": 0.033854 + }, + { + "epoch": 0.873850197109067, + "grad_norm": 0.30799737572669983, + "learning_rate": 8.31602177604999e-06, + "loss": 0.48166284561157224, + "memory(GiB)": 86.38, + "step": 665, + "token_acc": 0.8686445412895295, + "train_speed(iter/s)": 0.033865 + }, + { + "epoch": 0.8804204993429697, + "grad_norm": 0.31392061710357666, + "learning_rate": 8.289388309324094e-06, + "loss": 0.483530855178833, + "memory(GiB)": 86.38, + "step": 670, + "token_acc": 0.8583989950896426, + "train_speed(iter/s)": 0.033868 + }, + { + "epoch": 0.8869908015768725, + "grad_norm": 0.33349302411079407, + "learning_rate": 8.262589266423908e-06, + "loss": 0.48435115814208984, + "memory(GiB)": 86.38, + "step": 675, + "token_acc": 0.8416313213703099, + "train_speed(iter/s)": 0.033874 + }, + { + "epoch": 0.8935611038107752, + "grad_norm": 0.3091382086277008, + "learning_rate": 8.235625996318475e-06, + "loss": 0.4799081802368164, + "memory(GiB)": 86.38, + "step": 680, + "token_acc": 0.8609777777777777, + "train_speed(iter/s)": 0.033877 + }, + { + "epoch": 0.900131406044678, + "grad_norm": 0.3427553176879883, + "learning_rate": 8.208499856243453e-06, + "loss": 0.48143601417541504, + "memory(GiB)": 86.38, + "step": 685, + "token_acc": 0.8536925941249482, + "train_speed(iter/s)": 0.033876 + }, + { + "epoch": 0.9067017082785808, + "grad_norm": 0.3548396825790405, + "learning_rate": 8.1812122116328e-06, + "loss": 0.48082866668701174, + "memory(GiB)": 86.38, + "step": 690, + "token_acc": 0.8531232091690545, + "train_speed(iter/s)": 0.033877 + }, + { + "epoch": 0.9132720105124835, + "grad_norm": 0.3253563940525055, + "learning_rate": 8.15376443605004e-06, + "loss": 0.4795668601989746, + "memory(GiB)": 86.38, + "step": 695, + "token_acc": 0.853655830467103, + "train_speed(iter/s)": 0.033883 + }, + { + "epoch": 0.9198423127463863, + "grad_norm": 0.2970241606235504, + "learning_rate": 8.126157911119124e-06, + "loss": 0.479010009765625, + "memory(GiB)": 86.38, + "step": 700, + "token_acc": 0.859375, + "train_speed(iter/s)": 0.033891 + }, + { + "epoch": 0.926412614980289, + "grad_norm": 0.3558485805988312, + "learning_rate": 8.098394026454886e-06, + "loss": 0.4783782482147217, + "memory(GiB)": 86.38, + "step": 705, + "token_acc": 0.8596869328493648, + "train_speed(iter/s)": 0.033894 + }, + { + "epoch": 0.9329829172141918, + "grad_norm": 0.3010825514793396, + "learning_rate": 8.070474179593088e-06, + "loss": 0.47974371910095215, + "memory(GiB)": 86.38, + "step": 710, + "token_acc": 0.8615735767991407, + "train_speed(iter/s)": 0.033899 + }, + { + "epoch": 0.9395532194480947, + "grad_norm": 0.31274092197418213, + "learning_rate": 8.042399775920084e-06, + "loss": 0.48296613693237306, + "memory(GiB)": 86.38, + "step": 715, + "token_acc": 0.8443671593590858, + "train_speed(iter/s)": 0.033904 + }, + { + "epoch": 0.9461235216819974, + "grad_norm": 0.30195385217666626, + "learning_rate": 8.014172228602063e-06, + "loss": 0.48566722869873047, + "memory(GiB)": 86.38, + "step": 720, + "token_acc": 0.8442668136714443, + "train_speed(iter/s)": 0.033907 + }, + { + "epoch": 0.9526938239159002, + "grad_norm": 0.29728612303733826, + "learning_rate": 7.985792958513932e-06, + "loss": 0.4842525005340576, + "memory(GiB)": 86.38, + "step": 725, + "token_acc": 0.8693410760843802, + "train_speed(iter/s)": 0.03391 + }, + { + "epoch": 0.9592641261498029, + "grad_norm": 0.3458816707134247, + "learning_rate": 7.957263394167778e-06, + "loss": 0.47885870933532715, + "memory(GiB)": 86.38, + "step": 730, + "token_acc": 0.8596500419111484, + "train_speed(iter/s)": 0.033899 + }, + { + "epoch": 0.9658344283837057, + "grad_norm": 0.3230541944503784, + "learning_rate": 7.928584971640974e-06, + "loss": 0.4798708915710449, + "memory(GiB)": 86.38, + "step": 735, + "token_acc": 0.8699983578739942, + "train_speed(iter/s)": 0.033899 + }, + { + "epoch": 0.9724047306176085, + "grad_norm": 0.3110128939151764, + "learning_rate": 7.899759134503888e-06, + "loss": 0.4790318489074707, + "memory(GiB)": 86.38, + "step": 740, + "token_acc": 0.8630462405391968, + "train_speed(iter/s)": 0.033907 + }, + { + "epoch": 0.9789750328515112, + "grad_norm": 0.3367188274860382, + "learning_rate": 7.870787333747216e-06, + "loss": 0.47907276153564454, + "memory(GiB)": 86.38, + "step": 745, + "token_acc": 0.8586263243898582, + "train_speed(iter/s)": 0.03391 + }, + { + "epoch": 0.985545335085414, + "grad_norm": 0.3082112967967987, + "learning_rate": 7.841671027708945e-06, + "loss": 0.481706428527832, + "memory(GiB)": 86.38, + "step": 750, + "token_acc": 0.8511583011583012, + "train_speed(iter/s)": 0.033906 + }, + { + "epoch": 0.9921156373193167, + "grad_norm": 0.332453191280365, + "learning_rate": 7.81241168200095e-06, + "loss": 0.4739673137664795, + "memory(GiB)": 86.38, + "step": 755, + "token_acc": 0.8551256316190212, + "train_speed(iter/s)": 0.033908 + }, + { + "epoch": 0.9986859395532195, + "grad_norm": 0.28533536195755005, + "learning_rate": 7.783010769435216e-06, + "loss": 0.4861409664154053, + "memory(GiB)": 86.38, + "step": 760, + "token_acc": 0.8556530110172211, + "train_speed(iter/s)": 0.033911 + }, + { + "epoch": 1.0052562417871223, + "grad_norm": 0.33001649379730225, + "learning_rate": 7.753469769949701e-06, + "loss": 0.46169567108154297, + "memory(GiB)": 86.38, + "step": 765, + "token_acc": 0.8602941176470589, + "train_speed(iter/s)": 0.033926 + }, + { + "epoch": 1.011826544021025, + "grad_norm": 0.305500328540802, + "learning_rate": 7.723790170533848e-06, + "loss": 0.46022186279296873, + "memory(GiB)": 86.38, + "step": 770, + "token_acc": 0.8599308445173768, + "train_speed(iter/s)": 0.033928 + }, + { + "epoch": 1.0183968462549278, + "grad_norm": 0.2889300584793091, + "learning_rate": 7.693973465153724e-06, + "loss": 0.46282401084899905, + "memory(GiB)": 86.38, + "step": 775, + "token_acc": 0.862350683914093, + "train_speed(iter/s)": 0.03393 + }, + { + "epoch": 1.0249671484888305, + "grad_norm": 0.33990442752838135, + "learning_rate": 7.664021154676828e-06, + "loss": 0.4604497909545898, + "memory(GiB)": 86.38, + "step": 780, + "token_acc": 0.8684050268504678, + "train_speed(iter/s)": 0.033928 + }, + { + "epoch": 1.0315374507227333, + "grad_norm": 0.31965604424476624, + "learning_rate": 7.633934746796545e-06, + "loss": 0.46096210479736327, + "memory(GiB)": 86.38, + "step": 785, + "token_acc": 0.8674027168912702, + "train_speed(iter/s)": 0.033927 + }, + { + "epoch": 1.038107752956636, + "grad_norm": 0.32439425587654114, + "learning_rate": 7.603715755956243e-06, + "loss": 0.45728340148925783, + "memory(GiB)": 86.38, + "step": 790, + "token_acc": 0.8674415479709755, + "train_speed(iter/s)": 0.033927 + }, + { + "epoch": 1.0446780551905388, + "grad_norm": 0.3270528018474579, + "learning_rate": 7.573365703273045e-06, + "loss": 0.46488609313964846, + "memory(GiB)": 86.38, + "step": 795, + "token_acc": 0.850026525198939, + "train_speed(iter/s)": 0.033931 + }, + { + "epoch": 1.0512483574244416, + "grad_norm": 0.2934127748012543, + "learning_rate": 7.542886116461272e-06, + "loss": 0.45778141021728513, + "memory(GiB)": 86.38, + "step": 800, + "token_acc": 0.8622505823964347, + "train_speed(iter/s)": 0.033934 + }, + { + "epoch": 1.0578186596583443, + "grad_norm": 0.31371569633483887, + "learning_rate": 7.512278529755529e-06, + "loss": 0.45838513374328616, + "memory(GiB)": 86.38, + "step": 805, + "token_acc": 0.8530397056400681, + "train_speed(iter/s)": 0.033931 + }, + { + "epoch": 1.064388961892247, + "grad_norm": 0.2872871160507202, + "learning_rate": 7.481544483833485e-06, + "loss": 0.4574404239654541, + "memory(GiB)": 86.38, + "step": 810, + "token_acc": 0.8523446658851114, + "train_speed(iter/s)": 0.033933 + }, + { + "epoch": 1.0709592641261498, + "grad_norm": 0.2994791865348816, + "learning_rate": 7.450685525738315e-06, + "loss": 0.45713510513305666, + "memory(GiB)": 86.38, + "step": 815, + "token_acc": 0.8546161825726141, + "train_speed(iter/s)": 0.033938 + }, + { + "epoch": 1.0775295663600526, + "grad_norm": 0.29632824659347534, + "learning_rate": 7.419703208800839e-06, + "loss": 0.45964574813842773, + "memory(GiB)": 86.38, + "step": 820, + "token_acc": 0.8663826261908989, + "train_speed(iter/s)": 0.033941 + }, + { + "epoch": 1.0840998685939554, + "grad_norm": 0.30519089102745056, + "learning_rate": 7.388599092561315e-06, + "loss": 0.4573044776916504, + "memory(GiB)": 86.38, + "step": 825, + "token_acc": 0.8629596640793994, + "train_speed(iter/s)": 0.033938 + }, + { + "epoch": 1.090670170827858, + "grad_norm": 0.29544419050216675, + "learning_rate": 7.357374742690956e-06, + "loss": 0.45876827239990237, + "memory(GiB)": 86.38, + "step": 830, + "token_acc": 0.8560570320280967, + "train_speed(iter/s)": 0.033938 + }, + { + "epoch": 1.0972404730617609, + "grad_norm": 0.3168863356113434, + "learning_rate": 7.326031730913107e-06, + "loss": 0.4636601448059082, + "memory(GiB)": 86.38, + "step": 835, + "token_acc": 0.8670317181527017, + "train_speed(iter/s)": 0.033943 + }, + { + "epoch": 1.1038107752956636, + "grad_norm": 0.30908459424972534, + "learning_rate": 7.2945716349241305e-06, + "loss": 0.4574262619018555, + "memory(GiB)": 86.38, + "step": 840, + "token_acc": 0.8620744343412984, + "train_speed(iter/s)": 0.03394 + }, + { + "epoch": 1.1103810775295664, + "grad_norm": 0.3176266551017761, + "learning_rate": 7.262996038314001e-06, + "loss": 0.461370849609375, + "memory(GiB)": 86.38, + "step": 845, + "token_acc": 0.8680333119795003, + "train_speed(iter/s)": 0.033941 + }, + { + "epoch": 1.1169513797634691, + "grad_norm": 0.302416056394577, + "learning_rate": 7.231306530486579e-06, + "loss": 0.45732645988464354, + "memory(GiB)": 86.38, + "step": 850, + "token_acc": 0.8647487633428794, + "train_speed(iter/s)": 0.033935 + }, + { + "epoch": 1.123521681997372, + "grad_norm": 0.30254605412483215, + "learning_rate": 7.199504706579617e-06, + "loss": 0.46079111099243164, + "memory(GiB)": 86.38, + "step": 855, + "token_acc": 0.8521696665271383, + "train_speed(iter/s)": 0.033936 + }, + { + "epoch": 1.1300919842312747, + "grad_norm": 0.29616811871528625, + "learning_rate": 7.167592167384461e-06, + "loss": 0.45458307266235354, + "memory(GiB)": 86.38, + "step": 860, + "token_acc": 0.8740381023533806, + "train_speed(iter/s)": 0.033936 + }, + { + "epoch": 1.1366622864651774, + "grad_norm": 0.2893197238445282, + "learning_rate": 7.135570519265473e-06, + "loss": 0.4566815853118896, + "memory(GiB)": 86.38, + "step": 865, + "token_acc": 0.8525364274150027, + "train_speed(iter/s)": 0.033926 + }, + { + "epoch": 1.1432325886990802, + "grad_norm": 0.30079302191734314, + "learning_rate": 7.1034413740791705e-06, + "loss": 0.4587052345275879, + "memory(GiB)": 86.38, + "step": 870, + "token_acc": 0.8628954358850519, + "train_speed(iter/s)": 0.033926 + }, + { + "epoch": 1.149802890932983, + "grad_norm": 0.3086967170238495, + "learning_rate": 7.071206349093097e-06, + "loss": 0.45635190010070803, + "memory(GiB)": 86.38, + "step": 875, + "token_acc": 0.859285550721319, + "train_speed(iter/s)": 0.033929 + }, + { + "epoch": 1.1563731931668857, + "grad_norm": 0.3067159354686737, + "learning_rate": 7.038867066904407e-06, + "loss": 0.45715036392211916, + "memory(GiB)": 86.38, + "step": 880, + "token_acc": 0.8618468146027202, + "train_speed(iter/s)": 0.033924 + }, + { + "epoch": 1.1629434954007885, + "grad_norm": 0.28498393297195435, + "learning_rate": 7.006425155358195e-06, + "loss": 0.4554757118225098, + "memory(GiB)": 86.38, + "step": 885, + "token_acc": 0.8687036756920284, + "train_speed(iter/s)": 0.033921 + }, + { + "epoch": 1.1695137976346912, + "grad_norm": 0.2907336950302124, + "learning_rate": 6.9738822474655555e-06, + "loss": 0.45355930328369143, + "memory(GiB)": 86.38, + "step": 890, + "token_acc": 0.8599979554283378, + "train_speed(iter/s)": 0.033924 + }, + { + "epoch": 1.176084099868594, + "grad_norm": 0.29509079456329346, + "learning_rate": 6.941239981321379e-06, + "loss": 0.45787954330444336, + "memory(GiB)": 86.38, + "step": 895, + "token_acc": 0.8733064370446197, + "train_speed(iter/s)": 0.033929 + }, + { + "epoch": 1.1826544021024967, + "grad_norm": 0.2977595031261444, + "learning_rate": 6.908500000021905e-06, + "loss": 0.456469202041626, + "memory(GiB)": 86.38, + "step": 900, + "token_acc": 0.8686904761904762, + "train_speed(iter/s)": 0.03393 + }, + { + "epoch": 1.1892247043363995, + "grad_norm": 0.28251177072525024, + "learning_rate": 6.875663951582e-06, + "loss": 0.45859241485595703, + "memory(GiB)": 86.38, + "step": 905, + "token_acc": 0.8703482454975884, + "train_speed(iter/s)": 0.033871 + }, + { + "epoch": 1.1957950065703022, + "grad_norm": 0.30164870619773865, + "learning_rate": 6.842733488852218e-06, + "loss": 0.45961766242980956, + "memory(GiB)": 86.38, + "step": 910, + "token_acc": 0.8695999018163967, + "train_speed(iter/s)": 0.033871 + }, + { + "epoch": 1.202365308804205, + "grad_norm": 0.2958962023258209, + "learning_rate": 6.80971026943559e-06, + "loss": 0.45937299728393555, + "memory(GiB)": 86.38, + "step": 915, + "token_acc": 0.8646010935387148, + "train_speed(iter/s)": 0.033864 + }, + { + "epoch": 1.2089356110381078, + "grad_norm": 0.305772989988327, + "learning_rate": 6.776595955604192e-06, + "loss": 0.4570772171020508, + "memory(GiB)": 86.38, + "step": 920, + "token_acc": 0.861623201438849, + "train_speed(iter/s)": 0.033866 + }, + { + "epoch": 1.2155059132720105, + "grad_norm": 0.29926493763923645, + "learning_rate": 6.743392214215473e-06, + "loss": 0.45430717468261717, + "memory(GiB)": 86.38, + "step": 925, + "token_acc": 0.8663976363767385, + "train_speed(iter/s)": 0.033865 + }, + { + "epoch": 1.2220762155059133, + "grad_norm": 0.3044881522655487, + "learning_rate": 6.710100716628345e-06, + "loss": 0.455517578125, + "memory(GiB)": 86.38, + "step": 930, + "token_acc": 0.8511478910838227, + "train_speed(iter/s)": 0.033863 + }, + { + "epoch": 1.228646517739816, + "grad_norm": 0.3772009313106537, + "learning_rate": 6.676723138619056e-06, + "loss": 0.46090059280395507, + "memory(GiB)": 86.38, + "step": 935, + "token_acc": 0.8711496746203905, + "train_speed(iter/s)": 0.033858 + }, + { + "epoch": 1.2352168199737188, + "grad_norm": 0.29388174414634705, + "learning_rate": 6.6432611602968445e-06, + "loss": 0.456877326965332, + "memory(GiB)": 86.38, + "step": 940, + "token_acc": 0.8651419558359621, + "train_speed(iter/s)": 0.03386 + }, + { + "epoch": 1.2417871222076216, + "grad_norm": 0.29652050137519836, + "learning_rate": 6.609716466019356e-06, + "loss": 0.45618433952331544, + "memory(GiB)": 86.38, + "step": 945, + "token_acc": 0.8603668915085418, + "train_speed(iter/s)": 0.033859 + }, + { + "epoch": 1.2483574244415243, + "grad_norm": 0.28154268860816956, + "learning_rate": 6.576090744307866e-06, + "loss": 0.45843868255615233, + "memory(GiB)": 86.38, + "step": 950, + "token_acc": 0.8659341793046529, + "train_speed(iter/s)": 0.033861 + }, + { + "epoch": 1.254927726675427, + "grad_norm": 0.284541517496109, + "learning_rate": 6.542385687762287e-06, + "loss": 0.4614737033843994, + "memory(GiB)": 86.38, + "step": 955, + "token_acc": 0.8557089929269114, + "train_speed(iter/s)": 0.033856 + }, + { + "epoch": 1.2614980289093298, + "grad_norm": 0.2883804142475128, + "learning_rate": 6.508602992975963e-06, + "loss": 0.4575353622436523, + "memory(GiB)": 86.38, + "step": 960, + "token_acc": 0.862012703222423, + "train_speed(iter/s)": 0.03385 + }, + { + "epoch": 1.2680683311432326, + "grad_norm": 0.2853713035583496, + "learning_rate": 6.474744360450274e-06, + "loss": 0.4590480804443359, + "memory(GiB)": 86.38, + "step": 965, + "token_acc": 0.8613731343283582, + "train_speed(iter/s)": 0.033851 + }, + { + "epoch": 1.2746386333771353, + "grad_norm": 0.2936136722564697, + "learning_rate": 6.44081149450904e-06, + "loss": 0.45726985931396485, + "memory(GiB)": 86.38, + "step": 970, + "token_acc": 0.8545799374647599, + "train_speed(iter/s)": 0.033849 + }, + { + "epoch": 1.281208935611038, + "grad_norm": 0.31412455439567566, + "learning_rate": 6.406806103212725e-06, + "loss": 0.45641331672668456, + "memory(GiB)": 86.38, + "step": 975, + "token_acc": 0.8715530697190427, + "train_speed(iter/s)": 0.033845 + }, + { + "epoch": 1.2877792378449409, + "grad_norm": 0.31974250078201294, + "learning_rate": 6.372729898272463e-06, + "loss": 0.46121625900268554, + "memory(GiB)": 86.38, + "step": 980, + "token_acc": 0.8484265561803295, + "train_speed(iter/s)": 0.033852 + }, + { + "epoch": 1.2943495400788436, + "grad_norm": 0.29389360547065735, + "learning_rate": 6.338584594963898e-06, + "loss": 0.4556922435760498, + "memory(GiB)": 86.38, + "step": 985, + "token_acc": 0.8639753820476712, + "train_speed(iter/s)": 0.033848 + }, + { + "epoch": 1.3009198423127464, + "grad_norm": 0.30771321058273315, + "learning_rate": 6.30437191204084e-06, + "loss": 0.46083745956420896, + "memory(GiB)": 86.38, + "step": 990, + "token_acc": 0.8666952159549737, + "train_speed(iter/s)": 0.033849 + }, + { + "epoch": 1.3074901445466491, + "grad_norm": 0.29386404156684875, + "learning_rate": 6.270093571648752e-06, + "loss": 0.45865530967712403, + "memory(GiB)": 86.38, + "step": 995, + "token_acc": 0.8546142578125, + "train_speed(iter/s)": 0.033849 + }, + { + "epoch": 1.314060446780552, + "grad_norm": 0.2929444909095764, + "learning_rate": 6.23575129923806e-06, + "loss": 0.45972671508789065, + "memory(GiB)": 86.38, + "step": 1000, + "token_acc": 0.8530415342981528, + "train_speed(iter/s)": 0.033851 + }, + { + "epoch": 1.3206307490144547, + "grad_norm": 0.2973506450653076, + "learning_rate": 6.2013468234773034e-06, + "loss": 0.45803632736206057, + "memory(GiB)": 86.38, + "step": 1005, + "token_acc": 0.857928142355208, + "train_speed(iter/s)": 0.033853 + }, + { + "epoch": 1.3272010512483574, + "grad_norm": 0.30529940128326416, + "learning_rate": 6.166881876166119e-06, + "loss": 0.4576756000518799, + "memory(GiB)": 86.38, + "step": 1010, + "token_acc": 0.8755669493196608, + "train_speed(iter/s)": 0.03385 + }, + { + "epoch": 1.3337713534822602, + "grad_norm": 0.293550968170166, + "learning_rate": 6.132358192148065e-06, + "loss": 0.4561765670776367, + "memory(GiB)": 86.38, + "step": 1015, + "token_acc": 0.8672781599610868, + "train_speed(iter/s)": 0.033849 + }, + { + "epoch": 1.340341655716163, + "grad_norm": 0.29839423298835754, + "learning_rate": 6.097777509223299e-06, + "loss": 0.455903148651123, + "memory(GiB)": 86.38, + "step": 1020, + "token_acc": 0.8684119278779473, + "train_speed(iter/s)": 0.033847 + }, + { + "epoch": 1.3469119579500657, + "grad_norm": 0.3058245778083801, + "learning_rate": 6.063141568061104e-06, + "loss": 0.4578727722167969, + "memory(GiB)": 86.38, + "step": 1025, + "token_acc": 0.8626132709733996, + "train_speed(iter/s)": 0.033852 + }, + { + "epoch": 1.3534822601839684, + "grad_norm": 0.2938694357872009, + "learning_rate": 6.02845211211226e-06, + "loss": 0.45619792938232423, + "memory(GiB)": 86.38, + "step": 1030, + "token_acc": 0.864321608040201, + "train_speed(iter/s)": 0.033855 + }, + { + "epoch": 1.3600525624178712, + "grad_norm": 0.33827096223831177, + "learning_rate": 5.993710887521302e-06, + "loss": 0.45999650955200194, + "memory(GiB)": 86.38, + "step": 1035, + "token_acc": 0.8575886524822695, + "train_speed(iter/s)": 0.033856 + }, + { + "epoch": 1.366622864651774, + "grad_norm": 0.2824879586696625, + "learning_rate": 5.958919643038609e-06, + "loss": 0.45719089508056643, + "memory(GiB)": 86.38, + "step": 1040, + "token_acc": 0.8549390889830508, + "train_speed(iter/s)": 0.033856 + }, + { + "epoch": 1.3731931668856767, + "grad_norm": 0.2904459238052368, + "learning_rate": 5.924080129932386e-06, + "loss": 0.4534614562988281, + "memory(GiB)": 86.38, + "step": 1045, + "token_acc": 0.8642217245240762, + "train_speed(iter/s)": 0.033848 + }, + { + "epoch": 1.3797634691195795, + "grad_norm": 0.31164076924324036, + "learning_rate": 5.8891941019005095e-06, + "loss": 0.4557456970214844, + "memory(GiB)": 86.38, + "step": 1050, + "token_acc": 0.8531942479962282, + "train_speed(iter/s)": 0.033847 + }, + { + "epoch": 1.3863337713534822, + "grad_norm": 0.2827838063240051, + "learning_rate": 5.854263314982252e-06, + "loss": 0.4562164306640625, + "memory(GiB)": 86.38, + "step": 1055, + "token_acc": 0.8564340588988476, + "train_speed(iter/s)": 0.033846 + }, + { + "epoch": 1.392904073587385, + "grad_norm": 0.29443469643592834, + "learning_rate": 5.819289527469897e-06, + "loss": 0.45438013076782224, + "memory(GiB)": 86.38, + "step": 1060, + "token_acc": 0.8631507279773751, + "train_speed(iter/s)": 0.033851 + }, + { + "epoch": 1.3994743758212878, + "grad_norm": 0.2858130633831024, + "learning_rate": 5.784274499820214e-06, + "loss": 0.45337843894958496, + "memory(GiB)": 86.38, + "step": 1065, + "token_acc": 0.8435270132517839, + "train_speed(iter/s)": 0.033852 + }, + { + "epoch": 1.4060446780551905, + "grad_norm": 0.2949610650539398, + "learning_rate": 5.749219994565863e-06, + "loss": 0.4539140224456787, + "memory(GiB)": 86.38, + "step": 1070, + "token_acc": 0.8618331826401446, + "train_speed(iter/s)": 0.033854 + }, + { + "epoch": 1.4126149802890933, + "grad_norm": 0.2909865081310272, + "learning_rate": 5.714127776226667e-06, + "loss": 0.4557938575744629, + "memory(GiB)": 86.38, + "step": 1075, + "token_acc": 0.8680278588011191, + "train_speed(iter/s)": 0.033856 + }, + { + "epoch": 1.419185282522996, + "grad_norm": 0.28090617060661316, + "learning_rate": 5.6789996112207865e-06, + "loss": 0.4519779205322266, + "memory(GiB)": 86.38, + "step": 1080, + "token_acc": 0.8621539840860697, + "train_speed(iter/s)": 0.033857 + }, + { + "epoch": 1.4257555847568988, + "grad_norm": 0.26703914999961853, + "learning_rate": 5.64383726777582e-06, + "loss": 0.4575533866882324, + "memory(GiB)": 86.38, + "step": 1085, + "token_acc": 0.8600892222150385, + "train_speed(iter/s)": 0.03386 + }, + { + "epoch": 1.4323258869908015, + "grad_norm": 0.29428642988204956, + "learning_rate": 5.608642515839777e-06, + "loss": 0.4562852382659912, + "memory(GiB)": 86.38, + "step": 1090, + "token_acc": 0.8570395907473309, + "train_speed(iter/s)": 0.033858 + }, + { + "epoch": 1.4388961892247043, + "grad_norm": 0.2922196090221405, + "learning_rate": 5.573417126992004e-06, + "loss": 0.455198860168457, + "memory(GiB)": 86.38, + "step": 1095, + "token_acc": 0.8534050553582619, + "train_speed(iter/s)": 0.033859 + }, + { + "epoch": 1.445466491458607, + "grad_norm": 0.2833230793476105, + "learning_rate": 5.538162874353994e-06, + "loss": 0.45499043464660643, + "memory(GiB)": 86.38, + "step": 1100, + "token_acc": 0.8599968372779505, + "train_speed(iter/s)": 0.033861 + }, + { + "epoch": 1.4520367936925098, + "grad_norm": 0.30704233050346375, + "learning_rate": 5.502881532500149e-06, + "loss": 0.4561596870422363, + "memory(GiB)": 86.38, + "step": 1105, + "token_acc": 0.8647945610404966, + "train_speed(iter/s)": 0.033863 + }, + { + "epoch": 1.4586070959264126, + "grad_norm": 0.2708365321159363, + "learning_rate": 5.467574877368441e-06, + "loss": 0.45220632553100587, + "memory(GiB)": 86.38, + "step": 1110, + "token_acc": 0.86642938687798, + "train_speed(iter/s)": 0.033866 + }, + { + "epoch": 1.4651773981603153, + "grad_norm": 0.28449153900146484, + "learning_rate": 5.432244686171025e-06, + "loss": 0.45653414726257324, + "memory(GiB)": 86.38, + "step": 1115, + "token_acc": 0.8675830627892519, + "train_speed(iter/s)": 0.033865 + }, + { + "epoch": 1.471747700394218, + "grad_norm": 0.28766512870788574, + "learning_rate": 5.396892737304779e-06, + "loss": 0.4552262783050537, + "memory(GiB)": 86.38, + "step": 1120, + "token_acc": 0.8638403990024938, + "train_speed(iter/s)": 0.033865 + }, + { + "epoch": 1.4783180026281209, + "grad_norm": 0.28682559728622437, + "learning_rate": 5.361520810261779e-06, + "loss": 0.45450830459594727, + "memory(GiB)": 86.38, + "step": 1125, + "token_acc": 0.860114404576183, + "train_speed(iter/s)": 0.033865 + }, + { + "epoch": 1.4848883048620236, + "grad_norm": 0.30013778805732727, + "learning_rate": 5.3261306855397395e-06, + "loss": 0.45503602027893064, + "memory(GiB)": 86.38, + "step": 1130, + "token_acc": 0.8707037643207856, + "train_speed(iter/s)": 0.033864 + }, + { + "epoch": 1.4914586070959264, + "grad_norm": 0.28545552492141724, + "learning_rate": 5.290724144552379e-06, + "loss": 0.45638151168823243, + "memory(GiB)": 86.38, + "step": 1135, + "token_acc": 0.8654683330992838, + "train_speed(iter/s)": 0.033865 + }, + { + "epoch": 1.4980289093298291, + "grad_norm": 0.2808593213558197, + "learning_rate": 5.255302969539753e-06, + "loss": 0.454376745223999, + "memory(GiB)": 86.38, + "step": 1140, + "token_acc": 0.8695363037301251, + "train_speed(iter/s)": 0.033865 + }, + { + "epoch": 1.5045992115637319, + "grad_norm": 0.30250662565231323, + "learning_rate": 5.219868943478542e-06, + "loss": 0.45623059272766114, + "memory(GiB)": 86.38, + "step": 1145, + "token_acc": 0.8605342850962578, + "train_speed(iter/s)": 0.033865 + }, + { + "epoch": 1.5111695137976346, + "grad_norm": 0.296613484621048, + "learning_rate": 5.184423849992299e-06, + "loss": 0.4548806190490723, + "memory(GiB)": 86.38, + "step": 1150, + "token_acc": 0.8635175178664808, + "train_speed(iter/s)": 0.033865 + }, + { + "epoch": 1.5177398160315374, + "grad_norm": 0.28246545791625977, + "learning_rate": 5.1489694732616805e-06, + "loss": 0.4554699420928955, + "memory(GiB)": 86.38, + "step": 1155, + "token_acc": 0.862121567707111, + "train_speed(iter/s)": 0.033866 + }, + { + "epoch": 1.5243101182654402, + "grad_norm": 0.26761719584465027, + "learning_rate": 5.11350759793462e-06, + "loss": 0.45384392738342283, + "memory(GiB)": 86.38, + "step": 1160, + "token_acc": 0.8527565417365902, + "train_speed(iter/s)": 0.033865 + }, + { + "epoch": 1.530880420499343, + "grad_norm": 0.2766062021255493, + "learning_rate": 5.078040009036509e-06, + "loss": 0.45311508178710935, + "memory(GiB)": 86.38, + "step": 1165, + "token_acc": 0.860136895026955, + "train_speed(iter/s)": 0.033865 + }, + { + "epoch": 1.5374507227332457, + "grad_norm": 0.2843003571033478, + "learning_rate": 5.042568491880338e-06, + "loss": 0.455690860748291, + "memory(GiB)": 86.38, + "step": 1170, + "token_acc": 0.8672405980969642, + "train_speed(iter/s)": 0.033867 + }, + { + "epoch": 1.5440210249671484, + "grad_norm": 0.2944943308830261, + "learning_rate": 5.007094831976832e-06, + "loss": 0.45423293113708496, + "memory(GiB)": 86.38, + "step": 1175, + "token_acc": 0.865735444638449, + "train_speed(iter/s)": 0.033867 + }, + { + "epoch": 1.5505913272010512, + "grad_norm": 0.2819548547267914, + "learning_rate": 4.9716208149445776e-06, + "loss": 0.45132970809936523, + "memory(GiB)": 86.38, + "step": 1180, + "token_acc": 0.8634401381427476, + "train_speed(iter/s)": 0.033869 + }, + { + "epoch": 1.557161629434954, + "grad_norm": 0.27042356133461, + "learning_rate": 4.936148226420133e-06, + "loss": 0.45566673278808595, + "memory(GiB)": 86.38, + "step": 1185, + "token_acc": 0.8692132269099202, + "train_speed(iter/s)": 0.033865 + }, + { + "epoch": 1.563731931668857, + "grad_norm": 0.29058489203453064, + "learning_rate": 4.900678851968152e-06, + "loss": 0.4520698070526123, + "memory(GiB)": 86.38, + "step": 1190, + "token_acc": 0.8643418665591615, + "train_speed(iter/s)": 0.033866 + }, + { + "epoch": 1.5703022339027597, + "grad_norm": 0.274539053440094, + "learning_rate": 4.865214476991506e-06, + "loss": 0.4568329811096191, + "memory(GiB)": 86.38, + "step": 1195, + "token_acc": 0.8561119477911646, + "train_speed(iter/s)": 0.033867 + }, + { + "epoch": 1.5768725361366625, + "grad_norm": 0.2732899785041809, + "learning_rate": 4.829756886641408e-06, + "loss": 0.45705676078796387, + "memory(GiB)": 86.38, + "step": 1200, + "token_acc": 0.8784363482569029, + "train_speed(iter/s)": 0.033866 + }, + { + "epoch": 1.5834428383705652, + "grad_norm": 0.27467477321624756, + "learning_rate": 4.794307865727555e-06, + "loss": 0.45558509826660154, + "memory(GiB)": 86.38, + "step": 1205, + "token_acc": 0.8533988533988534, + "train_speed(iter/s)": 0.033825 + }, + { + "epoch": 1.590013140604468, + "grad_norm": 0.2909936308860779, + "learning_rate": 4.758869198628296e-06, + "loss": 0.45391244888305665, + "memory(GiB)": 86.38, + "step": 1210, + "token_acc": 0.8756224804363292, + "train_speed(iter/s)": 0.033824 + }, + { + "epoch": 1.5965834428383707, + "grad_norm": 0.2969980835914612, + "learning_rate": 4.7234426692007985e-06, + "loss": 0.454874324798584, + "memory(GiB)": 86.38, + "step": 1215, + "token_acc": 0.8589074167649206, + "train_speed(iter/s)": 0.033825 + }, + { + "epoch": 1.6031537450722735, + "grad_norm": 0.2968142032623291, + "learning_rate": 4.688030060691264e-06, + "loss": 0.4513202667236328, + "memory(GiB)": 86.38, + "step": 1220, + "token_acc": 0.8506660149089575, + "train_speed(iter/s)": 0.033824 + }, + { + "epoch": 1.6097240473061762, + "grad_norm": 0.28620168566703796, + "learning_rate": 4.6526331556451674e-06, + "loss": 0.44993081092834475, + "memory(GiB)": 86.38, + "step": 1225, + "token_acc": 0.8493528096896605, + "train_speed(iter/s)": 0.033826 + }, + { + "epoch": 1.616294349540079, + "grad_norm": 0.2923036515712738, + "learning_rate": 4.617253735817522e-06, + "loss": 0.4529541492462158, + "memory(GiB)": 86.38, + "step": 1230, + "token_acc": 0.8594011423296601, + "train_speed(iter/s)": 0.033823 + }, + { + "epoch": 1.6228646517739818, + "grad_norm": 0.29773661494255066, + "learning_rate": 4.5818935820832014e-06, + "loss": 0.4512050151824951, + "memory(GiB)": 86.38, + "step": 1235, + "token_acc": 0.8610426631879017, + "train_speed(iter/s)": 0.033826 + }, + { + "epoch": 1.6294349540078845, + "grad_norm": 0.2810444235801697, + "learning_rate": 4.546554474347291e-06, + "loss": 0.4555663108825684, + "memory(GiB)": 86.38, + "step": 1240, + "token_acc": 0.8596368270149729, + "train_speed(iter/s)": 0.033828 + }, + { + "epoch": 1.6360052562417873, + "grad_norm": 0.2784985601902008, + "learning_rate": 4.511238191455491e-06, + "loss": 0.45386524200439454, + "memory(GiB)": 86.38, + "step": 1245, + "token_acc": 0.866062264796442, + "train_speed(iter/s)": 0.033827 + }, + { + "epoch": 1.64257555847569, + "grad_norm": 0.27828744053840637, + "learning_rate": 4.475946511104588e-06, + "loss": 0.45246143341064454, + "memory(GiB)": 86.38, + "step": 1250, + "token_acc": 0.8584367661858436, + "train_speed(iter/s)": 0.033828 + }, + { + "epoch": 1.6491458607095928, + "grad_norm": 0.2854389250278473, + "learning_rate": 4.440681209752955e-06, + "loss": 0.4526336669921875, + "memory(GiB)": 86.38, + "step": 1255, + "token_acc": 0.851116058685848, + "train_speed(iter/s)": 0.033825 + }, + { + "epoch": 1.6557161629434956, + "grad_norm": 0.29449641704559326, + "learning_rate": 4.405444062531145e-06, + "loss": 0.4575493812561035, + "memory(GiB)": 86.38, + "step": 1260, + "token_acc": 0.8626177520332339, + "train_speed(iter/s)": 0.033821 + }, + { + "epoch": 1.6622864651773983, + "grad_norm": 0.28538015484809875, + "learning_rate": 4.37023684315253e-06, + "loss": 0.45549468994140624, + "memory(GiB)": 86.38, + "step": 1265, + "token_acc": 0.8691069738087724, + "train_speed(iter/s)": 0.03382 + }, + { + "epoch": 1.668856767411301, + "grad_norm": 0.27826598286628723, + "learning_rate": 4.335061323824019e-06, + "loss": 0.44781084060668946, + "memory(GiB)": 86.38, + "step": 1270, + "token_acc": 0.8674536256323777, + "train_speed(iter/s)": 0.033821 + }, + { + "epoch": 1.6754270696452038, + "grad_norm": 0.2691604495048523, + "learning_rate": 4.299919275156857e-06, + "loss": 0.4545548439025879, + "memory(GiB)": 86.38, + "step": 1275, + "token_acc": 0.8644137364892598, + "train_speed(iter/s)": 0.033824 + }, + { + "epoch": 1.6819973718791066, + "grad_norm": 0.27578890323638916, + "learning_rate": 4.264812466077486e-06, + "loss": 0.4538686752319336, + "memory(GiB)": 86.38, + "step": 1280, + "token_acc": 0.8544989775051125, + "train_speed(iter/s)": 0.033826 + }, + { + "epoch": 1.6885676741130093, + "grad_norm": 0.2718227803707123, + "learning_rate": 4.229742663738521e-06, + "loss": 0.4527297496795654, + "memory(GiB)": 86.38, + "step": 1285, + "token_acc": 0.8661887694145759, + "train_speed(iter/s)": 0.033825 + }, + { + "epoch": 1.695137976346912, + "grad_norm": 0.2723022997379303, + "learning_rate": 4.194711633429782e-06, + "loss": 0.4542956829071045, + "memory(GiB)": 86.38, + "step": 1290, + "token_acc": 0.8571600048013444, + "train_speed(iter/s)": 0.033826 + }, + { + "epoch": 1.7017082785808149, + "grad_norm": 0.2890985310077667, + "learning_rate": 4.159721138489445e-06, + "loss": 0.449599027633667, + "memory(GiB)": 86.38, + "step": 1295, + "token_acc": 0.8626619837713455, + "train_speed(iter/s)": 0.033827 + }, + { + "epoch": 1.7082785808147176, + "grad_norm": 0.279776394367218, + "learning_rate": 4.124772940215279e-06, + "loss": 0.4549734115600586, + "memory(GiB)": 86.38, + "step": 1300, + "token_acc": 0.8570111173728162, + "train_speed(iter/s)": 0.033827 + }, + { + "epoch": 1.7148488830486204, + "grad_norm": 0.2932436168193817, + "learning_rate": 4.0898687977759895e-06, + "loss": 0.45325145721435545, + "memory(GiB)": 86.38, + "step": 1305, + "token_acc": 0.8666294359547139, + "train_speed(iter/s)": 0.033831 + }, + { + "epoch": 1.7214191852825231, + "grad_norm": 0.2910197675228119, + "learning_rate": 4.0550104681226635e-06, + "loss": 0.45451927185058594, + "memory(GiB)": 86.38, + "step": 1310, + "token_acc": 0.8454388043379204, + "train_speed(iter/s)": 0.033828 + }, + { + "epoch": 1.727989487516426, + "grad_norm": 0.2771059274673462, + "learning_rate": 4.020199705900335e-06, + "loss": 0.45571699142456057, + "memory(GiB)": 86.38, + "step": 1315, + "token_acc": 0.8857914854356136, + "train_speed(iter/s)": 0.033827 + }, + { + "epoch": 1.7345597897503287, + "grad_norm": 0.27845674753189087, + "learning_rate": 3.985438263359667e-06, + "loss": 0.4508528709411621, + "memory(GiB)": 86.38, + "step": 1320, + "token_acc": 0.8715719063545151, + "train_speed(iter/s)": 0.033829 + }, + { + "epoch": 1.7411300919842314, + "grad_norm": 0.2838834524154663, + "learning_rate": 3.950727890268736e-06, + "loss": 0.45130367279052735, + "memory(GiB)": 86.38, + "step": 1325, + "token_acc": 0.8547756346523497, + "train_speed(iter/s)": 0.033827 + }, + { + "epoch": 1.7477003942181342, + "grad_norm": 0.27185139060020447, + "learning_rate": 3.91607033382497e-06, + "loss": 0.4526374340057373, + "memory(GiB)": 86.38, + "step": 1330, + "token_acc": 0.8633136094674556, + "train_speed(iter/s)": 0.033824 + }, + { + "epoch": 1.754270696452037, + "grad_norm": 0.28836262226104736, + "learning_rate": 3.88146733856719e-06, + "loss": 0.4543032646179199, + "memory(GiB)": 86.38, + "step": 1335, + "token_acc": 0.8587059705221084, + "train_speed(iter/s)": 0.033827 + }, + { + "epoch": 1.7608409986859397, + "grad_norm": 0.27373170852661133, + "learning_rate": 3.8469206462878e-06, + "loss": 0.4514758586883545, + "memory(GiB)": 86.38, + "step": 1340, + "token_acc": 0.861223101957546, + "train_speed(iter/s)": 0.033826 + }, + { + "epoch": 1.7674113009198424, + "grad_norm": 0.26478344202041626, + "learning_rate": 3.8124319959451133e-06, + "loss": 0.45225229263305666, + "memory(GiB)": 86.38, + "step": 1345, + "token_acc": 0.8613606419930531, + "train_speed(iter/s)": 0.033822 + }, + { + "epoch": 1.7739816031537452, + "grad_norm": 0.31700122356414795, + "learning_rate": 3.778003123575815e-06, + "loss": 0.45349550247192383, + "memory(GiB)": 86.38, + "step": 1350, + "token_acc": 0.8643513203214696, + "train_speed(iter/s)": 0.033818 + }, + { + "epoch": 1.780551905387648, + "grad_norm": 0.26822659373283386, + "learning_rate": 3.743635762207582e-06, + "loss": 0.44829654693603516, + "memory(GiB)": 86.38, + "step": 1355, + "token_acc": 0.8705515383524741, + "train_speed(iter/s)": 0.03382 + }, + { + "epoch": 1.7871222076215507, + "grad_norm": 0.2593797445297241, + "learning_rate": 3.7093316417718407e-06, + "loss": 0.45132102966308596, + "memory(GiB)": 86.38, + "step": 1360, + "token_acc": 0.8722857336129338, + "train_speed(iter/s)": 0.033822 + }, + { + "epoch": 1.7936925098554535, + "grad_norm": 0.2924158275127411, + "learning_rate": 3.675092489016693e-06, + "loss": 0.4512333869934082, + "memory(GiB)": 86.38, + "step": 1365, + "token_acc": 0.86383098856632, + "train_speed(iter/s)": 0.033825 + }, + { + "epoch": 1.8002628120893562, + "grad_norm": 0.2746325135231018, + "learning_rate": 3.640920027420001e-06, + "loss": 0.4558290481567383, + "memory(GiB)": 86.38, + "step": 1370, + "token_acc": 0.8634496357561483, + "train_speed(iter/s)": 0.033824 + }, + { + "epoch": 1.806833114323259, + "grad_norm": 0.27387329936027527, + "learning_rate": 3.6068159771026267e-06, + "loss": 0.4523761749267578, + "memory(GiB)": 86.38, + "step": 1375, + "token_acc": 0.8614295741693964, + "train_speed(iter/s)": 0.033825 + }, + { + "epoch": 1.8134034165571618, + "grad_norm": 0.2677063047885895, + "learning_rate": 3.5727820547418525e-06, + "loss": 0.4497382640838623, + "memory(GiB)": 86.38, + "step": 1380, + "token_acc": 0.8671490051768228, + "train_speed(iter/s)": 0.033826 + }, + { + "epoch": 1.8199737187910645, + "grad_norm": 0.26505404710769653, + "learning_rate": 3.5388199734849626e-06, + "loss": 0.45242948532104493, + "memory(GiB)": 86.38, + "step": 1385, + "token_acc": 0.8609067954770008, + "train_speed(iter/s)": 0.033825 + }, + { + "epoch": 1.8265440210249673, + "grad_norm": 0.28987395763397217, + "learning_rate": 3.504931442863023e-06, + "loss": 0.45121097564697266, + "memory(GiB)": 86.38, + "step": 1390, + "token_acc": 0.8593791633359978, + "train_speed(iter/s)": 0.033826 + }, + { + "epoch": 1.83311432325887, + "grad_norm": 0.2953889071941376, + "learning_rate": 3.4711181687048114e-06, + "loss": 0.4545147895812988, + "memory(GiB)": 86.38, + "step": 1395, + "token_acc": 0.8520735098537057, + "train_speed(iter/s)": 0.033826 + }, + { + "epoch": 1.8396846254927728, + "grad_norm": 0.27598556876182556, + "learning_rate": 3.4373818530509686e-06, + "loss": 0.45116052627563474, + "memory(GiB)": 86.38, + "step": 1400, + "token_acc": 0.866229439933375, + "train_speed(iter/s)": 0.033827 + }, + { + "epoch": 1.8462549277266755, + "grad_norm": 0.27450037002563477, + "learning_rate": 3.40372419406831e-06, + "loss": 0.4568813323974609, + "memory(GiB)": 86.38, + "step": 1405, + "token_acc": 0.86804211035818, + "train_speed(iter/s)": 0.033826 + }, + { + "epoch": 1.8528252299605783, + "grad_norm": 0.2719385027885437, + "learning_rate": 3.3701468859643583e-06, + "loss": 0.4519033432006836, + "memory(GiB)": 86.38, + "step": 1410, + "token_acc": 0.8648355441589822, + "train_speed(iter/s)": 0.033827 + }, + { + "epoch": 1.859395532194481, + "grad_norm": 0.2851196825504303, + "learning_rate": 3.336651618902054e-06, + "loss": 0.4524543762207031, + "memory(GiB)": 86.38, + "step": 1415, + "token_acc": 0.8464139526606158, + "train_speed(iter/s)": 0.033828 + }, + { + "epoch": 1.8659658344283838, + "grad_norm": 0.2691018879413605, + "learning_rate": 3.303240078914679e-06, + "loss": 0.45388317108154297, + "memory(GiB)": 86.38, + "step": 1420, + "token_acc": 0.8622832288312715, + "train_speed(iter/s)": 0.033828 + }, + { + "epoch": 1.8725361366622866, + "grad_norm": 0.2715182900428772, + "learning_rate": 3.2699139478209987e-06, + "loss": 0.4549809455871582, + "memory(GiB)": 86.38, + "step": 1425, + "token_acc": 0.8636651870640456, + "train_speed(iter/s)": 0.03383 + }, + { + "epoch": 1.8791064388961893, + "grad_norm": 0.2916743755340576, + "learning_rate": 3.2366749031405875e-06, + "loss": 0.4505608558654785, + "memory(GiB)": 86.38, + "step": 1430, + "token_acc": 0.8645326192794547, + "train_speed(iter/s)": 0.033829 + }, + { + "epoch": 1.885676741130092, + "grad_norm": 0.2814328968524933, + "learning_rate": 3.203524618009403e-06, + "loss": 0.4522216796875, + "memory(GiB)": 86.38, + "step": 1435, + "token_acc": 0.8565567219054724, + "train_speed(iter/s)": 0.033831 + }, + { + "epoch": 1.8922470433639949, + "grad_norm": 0.29333144426345825, + "learning_rate": 3.1704647610955618e-06, + "loss": 0.4518414497375488, + "memory(GiB)": 86.38, + "step": 1440, + "token_acc": 0.8547056199821588, + "train_speed(iter/s)": 0.033834 + }, + { + "epoch": 1.8988173455978976, + "grad_norm": 0.26604127883911133, + "learning_rate": 3.137496996515339e-06, + "loss": 0.4495247840881348, + "memory(GiB)": 86.38, + "step": 1445, + "token_acc": 0.8561262009251571, + "train_speed(iter/s)": 0.033832 + }, + { + "epoch": 1.9053876478318004, + "grad_norm": 0.26928678154945374, + "learning_rate": 3.1046229837494123e-06, + "loss": 0.44922027587890623, + "memory(GiB)": 86.38, + "step": 1450, + "token_acc": 0.8630366102954841, + "train_speed(iter/s)": 0.033835 + }, + { + "epoch": 1.9119579500657031, + "grad_norm": 0.2921224534511566, + "learning_rate": 3.0718443775593233e-06, + "loss": 0.44977540969848634, + "memory(GiB)": 86.38, + "step": 1455, + "token_acc": 0.8656272709255467, + "train_speed(iter/s)": 0.033835 + }, + { + "epoch": 1.9185282522996059, + "grad_norm": 0.2801390290260315, + "learning_rate": 3.0391628279041797e-06, + "loss": 0.45065975189208984, + "memory(GiB)": 86.38, + "step": 1460, + "token_acc": 0.8713450292397661, + "train_speed(iter/s)": 0.033836 + }, + { + "epoch": 1.9250985545335086, + "grad_norm": 0.28972676396369934, + "learning_rate": 3.0065799798576146e-06, + "loss": 0.4490159034729004, + "memory(GiB)": 86.38, + "step": 1465, + "token_acc": 0.8605760938308515, + "train_speed(iter/s)": 0.033833 + }, + { + "epoch": 1.9316688567674114, + "grad_norm": 0.2788577675819397, + "learning_rate": 2.9740974735249627e-06, + "loss": 0.45141172409057617, + "memory(GiB)": 86.38, + "step": 1470, + "token_acc": 0.8731429833765947, + "train_speed(iter/s)": 0.033833 + }, + { + "epoch": 1.9382391590013142, + "grad_norm": 0.27176031470298767, + "learning_rate": 2.941716943960716e-06, + "loss": 0.4523900508880615, + "memory(GiB)": 86.38, + "step": 1475, + "token_acc": 0.8687188222411486, + "train_speed(iter/s)": 0.033835 + }, + { + "epoch": 1.944809461235217, + "grad_norm": 0.2714715003967285, + "learning_rate": 2.9094400210862206e-06, + "loss": 0.4515875816345215, + "memory(GiB)": 86.38, + "step": 1480, + "token_acc": 0.8687481415402915, + "train_speed(iter/s)": 0.033837 + }, + { + "epoch": 1.9513797634691197, + "grad_norm": 0.272011399269104, + "learning_rate": 2.8772683296076197e-06, + "loss": 0.44769134521484377, + "memory(GiB)": 86.38, + "step": 1485, + "token_acc": 0.8557253110726099, + "train_speed(iter/s)": 0.033839 + }, + { + "epoch": 1.9579500657030224, + "grad_norm": 0.2830789089202881, + "learning_rate": 2.8452034889340874e-06, + "loss": 0.4503666877746582, + "memory(GiB)": 86.38, + "step": 1490, + "token_acc": 0.8650519031141869, + "train_speed(iter/s)": 0.033839 + }, + { + "epoch": 1.9645203679369252, + "grad_norm": 0.27117088437080383, + "learning_rate": 2.8132471130962997e-06, + "loss": 0.44952926635742185, + "memory(GiB)": 86.38, + "step": 1495, + "token_acc": 0.8653084323712507, + "train_speed(iter/s)": 0.033838 + }, + { + "epoch": 1.971090670170828, + "grad_norm": 0.2866286337375641, + "learning_rate": 2.781400810665201e-06, + "loss": 0.45142645835876466, + "memory(GiB)": 86.38, + "step": 1500, + "token_acc": 0.8606049336804265, + "train_speed(iter/s)": 0.033839 + }, + { + "epoch": 1.9776609724047307, + "grad_norm": 0.25524598360061646, + "learning_rate": 2.749666184671032e-06, + "loss": 0.45200319290161134, + "memory(GiB)": 86.38, + "step": 1505, + "token_acc": 0.8672011511974509, + "train_speed(iter/s)": 0.033802 + }, + { + "epoch": 1.9842312746386335, + "grad_norm": 0.269008070230484, + "learning_rate": 2.7180448325226283e-06, + "loss": 0.449237060546875, + "memory(GiB)": 86.38, + "step": 1510, + "token_acc": 0.8631796690307328, + "train_speed(iter/s)": 0.0338 + }, + { + "epoch": 1.9908015768725362, + "grad_norm": 0.2759488821029663, + "learning_rate": 2.686538345927027e-06, + "loss": 0.454377269744873, + "memory(GiB)": 86.38, + "step": 1515, + "token_acc": 0.8493589743589743, + "train_speed(iter/s)": 0.033795 + }, + { + "epoch": 1.997371879106439, + "grad_norm": 0.2774396538734436, + "learning_rate": 2.6551483108093378e-06, + "loss": 0.45154151916503904, + "memory(GiB)": 86.38, + "step": 1520, + "token_acc": 0.854857977170162, + "train_speed(iter/s)": 0.033795 + }, + { + "epoch": 2.0039421813403417, + "grad_norm": 0.2865091860294342, + "learning_rate": 2.623876307232919e-06, + "loss": 0.43844971656799314, + "memory(GiB)": 86.38, + "step": 1525, + "token_acc": 0.8616791354945968, + "train_speed(iter/s)": 0.0338 + }, + { + "epoch": 2.0105124835742445, + "grad_norm": 0.28435423970222473, + "learning_rate": 2.5927239093198273e-06, + "loss": 0.4346470832824707, + "memory(GiB)": 86.38, + "step": 1530, + "token_acc": 0.8676384460206937, + "train_speed(iter/s)": 0.033799 + }, + { + "epoch": 2.0170827858081473, + "grad_norm": 0.2833334505558014, + "learning_rate": 2.5616926851716055e-06, + "loss": 0.43649768829345703, + "memory(GiB)": 86.38, + "step": 1535, + "token_acc": 0.8553893161942894, + "train_speed(iter/s)": 0.033798 + }, + { + "epoch": 2.02365308804205, + "grad_norm": 0.2661850154399872, + "learning_rate": 2.5307841967903337e-06, + "loss": 0.4341902732849121, + "memory(GiB)": 86.38, + "step": 1540, + "token_acc": 0.853354760948172, + "train_speed(iter/s)": 0.033796 + }, + { + "epoch": 2.0302233902759528, + "grad_norm": 0.2832602262496948, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.4348430633544922, + "memory(GiB)": 86.38, + "step": 1545, + "token_acc": 0.8529144141733126, + "train_speed(iter/s)": 0.033797 + }, + { + "epoch": 2.0367936925098555, + "grad_norm": 0.26590895652770996, + "learning_rate": 2.4693416443682074e-06, + "loss": 0.431856632232666, + "memory(GiB)": 86.38, + "step": 1550, + "token_acc": 0.8682563338301044, + "train_speed(iter/s)": 0.033797 + }, + { + "epoch": 2.0433639947437583, + "grad_norm": 0.28006982803344727, + "learning_rate": 2.4388106731281496e-06, + "loss": 0.43282361030578614, + "memory(GiB)": 86.38, + "step": 1555, + "token_acc": 0.869759845139435, + "train_speed(iter/s)": 0.0338 + }, + { + "epoch": 2.049934296977661, + "grad_norm": 0.2961016893386841, + "learning_rate": 2.40840862310094e-06, + "loss": 0.43299617767333987, + "memory(GiB)": 86.38, + "step": 1560, + "token_acc": 0.8845442367799962, + "train_speed(iter/s)": 0.033797 + }, + { + "epoch": 2.056504599211564, + "grad_norm": 0.2669562101364136, + "learning_rate": 2.378137024618262e-06, + "loss": 0.4347973823547363, + "memory(GiB)": 86.38, + "step": 1565, + "token_acc": 0.8502078945947406, + "train_speed(iter/s)": 0.033798 + }, + { + "epoch": 2.0630749014454666, + "grad_norm": 0.2754296362400055, + "learning_rate": 2.3479974014453255e-06, + "loss": 0.43701701164245604, + "memory(GiB)": 86.38, + "step": 1570, + "token_acc": 0.8600905562742561, + "train_speed(iter/s)": 0.033799 + }, + { + "epoch": 2.0696452036793693, + "grad_norm": 0.2642713189125061, + "learning_rate": 2.317991270704167e-06, + "loss": 0.43048667907714844, + "memory(GiB)": 86.38, + "step": 1575, + "token_acc": 0.8709290926914279, + "train_speed(iter/s)": 0.033802 + }, + { + "epoch": 2.076215505913272, + "grad_norm": 0.2664032280445099, + "learning_rate": 2.2881201427972894e-06, + "loss": 0.43495759963989256, + "memory(GiB)": 86.38, + "step": 1580, + "token_acc": 0.8594156340829127, + "train_speed(iter/s)": 0.033803 + }, + { + "epoch": 2.082785808147175, + "grad_norm": 0.27893051505088806, + "learning_rate": 2.2583855213316326e-06, + "loss": 0.4322032928466797, + "memory(GiB)": 86.38, + "step": 1585, + "token_acc": 0.8674502122102514, + "train_speed(iter/s)": 0.0338 + }, + { + "epoch": 2.0893561103810776, + "grad_norm": 0.25695356726646423, + "learning_rate": 2.228788903042877e-06, + "loss": 0.4315330505371094, + "memory(GiB)": 86.38, + "step": 1590, + "token_acc": 0.8767547253233116, + "train_speed(iter/s)": 0.033798 + }, + { + "epoch": 2.0959264126149804, + "grad_norm": 0.2659642696380615, + "learning_rate": 2.1993317777201197e-06, + "loss": 0.43229498863220217, + "memory(GiB)": 86.38, + "step": 1595, + "token_acc": 0.8707460370247201, + "train_speed(iter/s)": 0.033796 + }, + { + "epoch": 2.102496714848883, + "grad_norm": 0.2697013020515442, + "learning_rate": 2.170015628130871e-06, + "loss": 0.4357916355133057, + "memory(GiB)": 86.38, + "step": 1600, + "token_acc": 0.8637946662850055, + "train_speed(iter/s)": 0.033796 + }, + { + "epoch": 2.109067017082786, + "grad_norm": 0.27165451645851135, + "learning_rate": 2.1408419299464245e-06, + "loss": 0.4324627876281738, + "memory(GiB)": 86.38, + "step": 1605, + "token_acc": 0.8698603817087229, + "train_speed(iter/s)": 0.033795 + }, + { + "epoch": 2.1156373193166886, + "grad_norm": 0.2767409384250641, + "learning_rate": 2.111812151667567e-06, + "loss": 0.433492374420166, + "memory(GiB)": 86.38, + "step": 1610, + "token_acc": 0.8622224420157262, + "train_speed(iter/s)": 0.033795 + }, + { + "epoch": 2.1222076215505914, + "grad_norm": 0.2886437177658081, + "learning_rate": 2.0829277545506736e-06, + "loss": 0.4330601692199707, + "memory(GiB)": 86.38, + "step": 1615, + "token_acc": 0.87356944538498, + "train_speed(iter/s)": 0.033795 + }, + { + "epoch": 2.128777923784494, + "grad_norm": 0.27543848752975464, + "learning_rate": 2.0541901925341446e-06, + "loss": 0.4322654724121094, + "memory(GiB)": 86.38, + "step": 1620, + "token_acc": 0.8578295433536698, + "train_speed(iter/s)": 0.033796 + }, + { + "epoch": 2.135348226018397, + "grad_norm": 0.2620643675327301, + "learning_rate": 2.0256009121652147e-06, + "loss": 0.43578500747680665, + "memory(GiB)": 86.38, + "step": 1625, + "token_acc": 0.868349382355802, + "train_speed(iter/s)": 0.033789 + }, + { + "epoch": 2.1419185282522997, + "grad_norm": 0.28385990858078003, + "learning_rate": 1.9971613525271523e-06, + "loss": 0.43427586555480957, + "memory(GiB)": 86.38, + "step": 1630, + "token_acc": 0.8664960419022677, + "train_speed(iter/s)": 0.033789 + }, + { + "epoch": 2.1484888304862024, + "grad_norm": 0.2743207514286041, + "learning_rate": 1.9688729451668116e-06, + "loss": 0.43171100616455077, + "memory(GiB)": 86.38, + "step": 1635, + "token_acc": 0.8658852104123765, + "train_speed(iter/s)": 0.033789 + }, + { + "epoch": 2.155059132720105, + "grad_norm": 0.27282217144966125, + "learning_rate": 1.940737114022572e-06, + "loss": 0.43387999534606936, + "memory(GiB)": 86.38, + "step": 1640, + "token_acc": 0.8552638446683021, + "train_speed(iter/s)": 0.033785 + }, + { + "epoch": 2.161629434954008, + "grad_norm": 0.26848945021629333, + "learning_rate": 1.9127552753526683e-06, + "loss": 0.4308422565460205, + "memory(GiB)": 86.38, + "step": 1645, + "token_acc": 0.8723747980613893, + "train_speed(iter/s)": 0.033783 + }, + { + "epoch": 2.1681997371879107, + "grad_norm": 0.2596457600593567, + "learning_rate": 1.884928837663902e-06, + "loss": 0.4331303596496582, + "memory(GiB)": 86.38, + "step": 1650, + "token_acc": 0.8620848945234307, + "train_speed(iter/s)": 0.033785 + }, + { + "epoch": 2.1747700394218135, + "grad_norm": 0.2749711871147156, + "learning_rate": 1.8572592016407337e-06, + "loss": 0.4339931488037109, + "memory(GiB)": 86.38, + "step": 1655, + "token_acc": 0.8706686188384578, + "train_speed(iter/s)": 0.033784 + }, + { + "epoch": 2.181340341655716, + "grad_norm": 0.26862356066703796, + "learning_rate": 1.8297477600747854e-06, + "loss": 0.43131422996520996, + "memory(GiB)": 86.38, + "step": 1660, + "token_acc": 0.8703601718250908, + "train_speed(iter/s)": 0.033783 + }, + { + "epoch": 2.187910643889619, + "grad_norm": 0.28293994069099426, + "learning_rate": 1.8023958977947303e-06, + "loss": 0.4327284812927246, + "memory(GiB)": 86.38, + "step": 1665, + "token_acc": 0.8674884437596302, + "train_speed(iter/s)": 0.033781 + }, + { + "epoch": 2.1944809461235217, + "grad_norm": 0.2755849063396454, + "learning_rate": 1.7752049915965807e-06, + "loss": 0.43210086822509763, + "memory(GiB)": 86.38, + "step": 1670, + "token_acc": 0.8653022928516977, + "train_speed(iter/s)": 0.033781 + }, + { + "epoch": 2.2010512483574245, + "grad_norm": 0.2687658965587616, + "learning_rate": 1.7481764101743925e-06, + "loss": 0.4309385776519775, + "memory(GiB)": 86.38, + "step": 1675, + "token_acc": 0.8708192896033187, + "train_speed(iter/s)": 0.033778 + }, + { + "epoch": 2.2076215505913273, + "grad_norm": 0.2643987536430359, + "learning_rate": 1.7213115140513687e-06, + "loss": 0.43217859268188474, + "memory(GiB)": 86.38, + "step": 1680, + "token_acc": 0.8690569923081582, + "train_speed(iter/s)": 0.033776 + }, + { + "epoch": 2.21419185282523, + "grad_norm": 0.27602747082710266, + "learning_rate": 1.694611655511365e-06, + "loss": 0.42904300689697267, + "memory(GiB)": 86.38, + "step": 1685, + "token_acc": 0.8896275737429807, + "train_speed(iter/s)": 0.033776 + }, + { + "epoch": 2.2207621550591328, + "grad_norm": 0.25782617926597595, + "learning_rate": 1.668078178530837e-06, + "loss": 0.4349325180053711, + "memory(GiB)": 86.38, + "step": 1690, + "token_acc": 0.8658529694298469, + "train_speed(iter/s)": 0.033775 + }, + { + "epoch": 2.2273324572930355, + "grad_norm": 0.26953521370887756, + "learning_rate": 1.6417124187111778e-06, + "loss": 0.4276991844177246, + "memory(GiB)": 86.38, + "step": 1695, + "token_acc": 0.8727225739759659, + "train_speed(iter/s)": 0.033775 + }, + { + "epoch": 2.2339027595269383, + "grad_norm": 0.2712646424770355, + "learning_rate": 1.6155157032114926e-06, + "loss": 0.4300542831420898, + "memory(GiB)": 86.38, + "step": 1700, + "token_acc": 0.8694365753855838, + "train_speed(iter/s)": 0.033776 + }, + { + "epoch": 2.240473061760841, + "grad_norm": 0.28259536623954773, + "learning_rate": 1.589489350681791e-06, + "loss": 0.43476276397705077, + "memory(GiB)": 86.38, + "step": 1705, + "token_acc": 0.8633074766964344, + "train_speed(iter/s)": 0.033776 + }, + { + "epoch": 2.247043363994744, + "grad_norm": 0.2692559063434601, + "learning_rate": 1.5636346711966154e-06, + "loss": 0.4304978847503662, + "memory(GiB)": 86.38, + "step": 1710, + "token_acc": 0.8691604140423901, + "train_speed(iter/s)": 0.033776 + }, + { + "epoch": 2.2536136662286466, + "grad_norm": 0.26556524634361267, + "learning_rate": 1.5379529661890956e-06, + "loss": 0.4372213363647461, + "memory(GiB)": 86.38, + "step": 1715, + "token_acc": 0.8606243830207305, + "train_speed(iter/s)": 0.033778 + }, + { + "epoch": 2.2601839684625493, + "grad_norm": 0.26940152049064636, + "learning_rate": 1.512445528385434e-06, + "loss": 0.4369645118713379, + "memory(GiB)": 86.38, + "step": 1720, + "token_acc": 0.857104328673529, + "train_speed(iter/s)": 0.033779 + }, + { + "epoch": 2.266754270696452, + "grad_norm": 0.2632419466972351, + "learning_rate": 1.4871136417398407e-06, + "loss": 0.43130922317504883, + "memory(GiB)": 86.38, + "step": 1725, + "token_acc": 0.8684261345349211, + "train_speed(iter/s)": 0.033776 + }, + { + "epoch": 2.273324572930355, + "grad_norm": 0.27120915055274963, + "learning_rate": 1.4619585813699032e-06, + "loss": 0.436324405670166, + "memory(GiB)": 86.38, + "step": 1730, + "token_acc": 0.8729593158849442, + "train_speed(iter/s)": 0.033776 + }, + { + "epoch": 2.2798948751642576, + "grad_norm": 0.28977081179618835, + "learning_rate": 1.436981613492394e-06, + "loss": 0.434481143951416, + "memory(GiB)": 86.38, + "step": 1735, + "token_acc": 0.863697705802969, + "train_speed(iter/s)": 0.033771 + }, + { + "epoch": 2.2864651773981604, + "grad_norm": 0.27072688937187195, + "learning_rate": 1.412183995359544e-06, + "loss": 0.43726301193237305, + "memory(GiB)": 86.38, + "step": 1740, + "token_acc": 0.8646803900325027, + "train_speed(iter/s)": 0.033769 + }, + { + "epoch": 2.293035479632063, + "grad_norm": 0.2683422863483429, + "learning_rate": 1.3875669751957548e-06, + "loss": 0.4344059467315674, + "memory(GiB)": 86.38, + "step": 1745, + "token_acc": 0.8645030938249779, + "train_speed(iter/s)": 0.03377 + }, + { + "epoch": 2.299605781865966, + "grad_norm": 0.2548208236694336, + "learning_rate": 1.3631317921347564e-06, + "loss": 0.4341590881347656, + "memory(GiB)": 86.38, + "step": 1750, + "token_acc": 0.8695078031212485, + "train_speed(iter/s)": 0.033772 + }, + { + "epoch": 2.3061760840998686, + "grad_norm": 0.25699329376220703, + "learning_rate": 1.3388796761572493e-06, + "loss": 0.43475918769836425, + "memory(GiB)": 86.38, + "step": 1755, + "token_acc": 0.8668202539091221, + "train_speed(iter/s)": 0.033773 + }, + { + "epoch": 2.3127463863337714, + "grad_norm": 0.284801721572876, + "learning_rate": 1.3148118480289834e-06, + "loss": 0.43476195335388185, + "memory(GiB)": 86.38, + "step": 1760, + "token_acc": 0.872836719337848, + "train_speed(iter/s)": 0.033772 + }, + { + "epoch": 2.319316688567674, + "grad_norm": 0.2635682225227356, + "learning_rate": 1.2909295192393057e-06, + "loss": 0.4339436531066895, + "memory(GiB)": 86.38, + "step": 1765, + "token_acc": 0.8698166676305592, + "train_speed(iter/s)": 0.033774 + }, + { + "epoch": 2.325886990801577, + "grad_norm": 0.25871872901916504, + "learning_rate": 1.2672338919401866e-06, + "loss": 0.4373739719390869, + "memory(GiB)": 86.38, + "step": 1770, + "token_acc": 0.8606108452163616, + "train_speed(iter/s)": 0.033775 + }, + { + "epoch": 2.3324572930354797, + "grad_norm": 0.26007142663002014, + "learning_rate": 1.2437261588857037e-06, + "loss": 0.432224702835083, + "memory(GiB)": 86.38, + "step": 1775, + "token_acc": 0.8673443326352352, + "train_speed(iter/s)": 0.033776 + }, + { + "epoch": 2.3390275952693824, + "grad_norm": 0.26318100094795227, + "learning_rate": 1.2204075033720025e-06, + "loss": 0.4342185020446777, + "memory(GiB)": 86.38, + "step": 1780, + "token_acc": 0.8722417109878918, + "train_speed(iter/s)": 0.033777 + }, + { + "epoch": 2.345597897503285, + "grad_norm": 0.25941622257232666, + "learning_rate": 1.197279099177731e-06, + "loss": 0.43193416595458983, + "memory(GiB)": 86.38, + "step": 1785, + "token_acc": 0.8598272926295305, + "train_speed(iter/s)": 0.033777 + }, + { + "epoch": 2.352168199737188, + "grad_norm": 0.2658545970916748, + "learning_rate": 1.1743421105049612e-06, + "loss": 0.432745361328125, + "memory(GiB)": 86.38, + "step": 1790, + "token_acc": 0.8685264027451229, + "train_speed(iter/s)": 0.033774 + }, + { + "epoch": 2.3587385019710907, + "grad_norm": 0.2550273537635803, + "learning_rate": 1.1515976919205869e-06, + "loss": 0.43065509796142576, + "memory(GiB)": 86.38, + "step": 1795, + "token_acc": 0.8694100591056094, + "train_speed(iter/s)": 0.033774 + }, + { + "epoch": 2.3653088042049935, + "grad_norm": 0.27043265104293823, + "learning_rate": 1.1290469882981987e-06, + "loss": 0.4335516929626465, + "memory(GiB)": 86.38, + "step": 1800, + "token_acc": 0.8756407695892418, + "train_speed(iter/s)": 0.033772 + }, + { + "epoch": 2.371879106438896, + "grad_norm": 0.2714201509952545, + "learning_rate": 1.1066911347604653e-06, + "loss": 0.43355650901794435, + "memory(GiB)": 86.38, + "step": 1805, + "token_acc": 0.8681369627127624, + "train_speed(iter/s)": 0.033741 + }, + { + "epoch": 2.378449408672799, + "grad_norm": 0.2614336311817169, + "learning_rate": 1.0845312566219924e-06, + "loss": 0.4319025993347168, + "memory(GiB)": 86.38, + "step": 1810, + "token_acc": 0.869137266528313, + "train_speed(iter/s)": 0.033739 + }, + { + "epoch": 2.3850197109067017, + "grad_norm": 0.258635014295578, + "learning_rate": 1.0625684693326727e-06, + "loss": 0.4368411064147949, + "memory(GiB)": 86.38, + "step": 1815, + "token_acc": 0.8626625620405856, + "train_speed(iter/s)": 0.033739 + }, + { + "epoch": 2.3915900131406045, + "grad_norm": 0.2613593637943268, + "learning_rate": 1.0408038784215462e-06, + "loss": 0.43021059036254883, + "memory(GiB)": 86.38, + "step": 1820, + "token_acc": 0.8690397350993377, + "train_speed(iter/s)": 0.033738 + }, + { + "epoch": 2.3981603153745072, + "grad_norm": 0.2565341889858246, + "learning_rate": 1.019238579441148e-06, + "loss": 0.43543272018432616, + "memory(GiB)": 86.38, + "step": 1825, + "token_acc": 0.8536611843890789, + "train_speed(iter/s)": 0.033739 + }, + { + "epoch": 2.40473061760841, + "grad_norm": 0.27332931756973267, + "learning_rate": 9.978736579123577e-07, + "loss": 0.43721885681152345, + "memory(GiB)": 86.38, + "step": 1830, + "token_acc": 0.8662766830870279, + "train_speed(iter/s)": 0.03374 + }, + { + "epoch": 2.4113009198423128, + "grad_norm": 0.25557610392570496, + "learning_rate": 9.7671018926977e-07, + "loss": 0.4312717914581299, + "memory(GiB)": 86.38, + "step": 1835, + "token_acc": 0.8687478440841669, + "train_speed(iter/s)": 0.033742 + }, + { + "epoch": 2.4178712220762155, + "grad_norm": 0.2686271071434021, + "learning_rate": 9.5574923880755e-07, + "loss": 0.43270196914672854, + "memory(GiB)": 86.38, + "step": 1840, + "token_acc": 0.8766914011348756, + "train_speed(iter/s)": 0.033742 + }, + { + "epoch": 2.4244415243101183, + "grad_norm": 0.27892932295799255, + "learning_rate": 9.349918616258113e-07, + "loss": 0.43126745223999025, + "memory(GiB)": 86.38, + "step": 1845, + "token_acc": 0.8783595334685599, + "train_speed(iter/s)": 0.033743 + }, + { + "epoch": 2.431011826544021, + "grad_norm": 0.258810430765152, + "learning_rate": 9.144391025775123e-07, + "loss": 0.4329942226409912, + "memory(GiB)": 86.38, + "step": 1850, + "token_acc": 0.8575249047268837, + "train_speed(iter/s)": 0.033743 + }, + { + "epoch": 2.437582128777924, + "grad_norm": 0.2641558051109314, + "learning_rate": 8.940919962158584e-07, + "loss": 0.4300084114074707, + "memory(GiB)": 86.38, + "step": 1855, + "token_acc": 0.8717879906071788, + "train_speed(iter/s)": 0.033743 + }, + { + "epoch": 2.4441524310118266, + "grad_norm": 0.27095404267311096, + "learning_rate": 8.739515667422211e-07, + "loss": 0.42922472953796387, + "memory(GiB)": 86.38, + "step": 1860, + "token_acc": 0.8675078864353313, + "train_speed(iter/s)": 0.033745 + }, + { + "epoch": 2.4507227332457293, + "grad_norm": 0.25571873784065247, + "learning_rate": 8.540188279545942e-07, + "loss": 0.4320818901062012, + "memory(GiB)": 86.38, + "step": 1865, + "token_acc": 0.8654411764705883, + "train_speed(iter/s)": 0.033748 + }, + { + "epoch": 2.457293035479632, + "grad_norm": 0.260748952627182, + "learning_rate": 8.342947831965537e-07, + "loss": 0.4332849979400635, + "memory(GiB)": 86.38, + "step": 1870, + "token_acc": 0.8596434777012678, + "train_speed(iter/s)": 0.033746 + }, + { + "epoch": 2.463863337713535, + "grad_norm": 0.26162466406822205, + "learning_rate": 8.147804253067581e-07, + "loss": 0.4324943065643311, + "memory(GiB)": 86.38, + "step": 1875, + "token_acc": 0.8610932130584192, + "train_speed(iter/s)": 0.033746 + }, + { + "epoch": 2.4704336399474376, + "grad_norm": 0.2608964443206787, + "learning_rate": 7.954767365689675e-07, + "loss": 0.43703885078430177, + "memory(GiB)": 86.38, + "step": 1880, + "token_acc": 0.8689590565933764, + "train_speed(iter/s)": 0.033743 + }, + { + "epoch": 2.4770039421813403, + "grad_norm": 0.2531532943248749, + "learning_rate": 7.763846886626048e-07, + "loss": 0.4334650993347168, + "memory(GiB)": 86.38, + "step": 1885, + "token_acc": 0.8626056024899955, + "train_speed(iter/s)": 0.033744 + }, + { + "epoch": 2.483574244415243, + "grad_norm": 0.25901561975479126, + "learning_rate": 7.575052426138424e-07, + "loss": 0.43249049186706545, + "memory(GiB)": 86.38, + "step": 1890, + "token_acc": 0.8670503026093859, + "train_speed(iter/s)": 0.033743 + }, + { + "epoch": 2.490144546649146, + "grad_norm": 0.2607087194919586, + "learning_rate": 7.388393487472223e-07, + "loss": 0.4332951545715332, + "memory(GiB)": 86.38, + "step": 1895, + "token_acc": 0.8609271523178808, + "train_speed(iter/s)": 0.033744 + }, + { + "epoch": 2.4967148488830486, + "grad_norm": 0.24934022128582, + "learning_rate": 7.203879466378311e-07, + "loss": 0.43254899978637695, + "memory(GiB)": 86.38, + "step": 1900, + "token_acc": 0.8726016035388443, + "train_speed(iter/s)": 0.033742 + }, + { + "epoch": 2.5032851511169514, + "grad_norm": 0.274565726518631, + "learning_rate": 7.021519650639952e-07, + "loss": 0.42643136978149415, + "memory(GiB)": 86.38, + "step": 1905, + "token_acc": 0.8752810932364643, + "train_speed(iter/s)": 0.033745 + }, + { + "epoch": 2.509855453350854, + "grad_norm": 0.2578328847885132, + "learning_rate": 6.841323219605333e-07, + "loss": 0.43291406631469725, + "memory(GiB)": 86.38, + "step": 1910, + "token_acc": 0.8636747967479674, + "train_speed(iter/s)": 0.033744 + }, + { + "epoch": 2.516425755584757, + "grad_norm": 0.248977929353714, + "learning_rate": 6.663299243725512e-07, + "loss": 0.42647299766540525, + "memory(GiB)": 86.38, + "step": 1915, + "token_acc": 0.8544173576906291, + "train_speed(iter/s)": 0.033743 + }, + { + "epoch": 2.5229960578186597, + "grad_norm": 0.2570980191230774, + "learning_rate": 6.487456684097848e-07, + "loss": 0.43337106704711914, + "memory(GiB)": 86.38, + "step": 1920, + "token_acc": 0.8634151992585728, + "train_speed(iter/s)": 0.033741 + }, + { + "epoch": 2.5295663600525624, + "grad_norm": 0.2565690875053406, + "learning_rate": 6.313804392014905e-07, + "loss": 0.4316126823425293, + "memory(GiB)": 86.38, + "step": 1925, + "token_acc": 0.8810017459624618, + "train_speed(iter/s)": 0.033741 + }, + { + "epoch": 2.536136662286465, + "grad_norm": 0.2750677168369293, + "learning_rate": 6.142351108518929e-07, + "loss": 0.4336524963378906, + "memory(GiB)": 86.38, + "step": 1930, + "token_acc": 0.8709823449524672, + "train_speed(iter/s)": 0.03374 + }, + { + "epoch": 2.542706964520368, + "grad_norm": 0.2661495506763458, + "learning_rate": 5.973105463961864e-07, + "loss": 0.43224172592163085, + "memory(GiB)": 86.38, + "step": 1935, + "token_acc": 0.8723215768783567, + "train_speed(iter/s)": 0.033739 + }, + { + "epoch": 2.5492772667542707, + "grad_norm": 0.2650693655014038, + "learning_rate": 5.806075977570886e-07, + "loss": 0.43565120697021487, + "memory(GiB)": 86.38, + "step": 1940, + "token_acc": 0.868112798264642, + "train_speed(iter/s)": 0.033737 + }, + { + "epoch": 2.5558475689881734, + "grad_norm": 0.26757803559303284, + "learning_rate": 5.641271057019637e-07, + "loss": 0.4298720359802246, + "memory(GiB)": 86.38, + "step": 1945, + "token_acc": 0.8649701539428213, + "train_speed(iter/s)": 0.03374 + }, + { + "epoch": 2.562417871222076, + "grad_norm": 0.2645432949066162, + "learning_rate": 5.478698998004967e-07, + "loss": 0.4320925235748291, + "memory(GiB)": 86.38, + "step": 1950, + "token_acc": 0.8747405689171042, + "train_speed(iter/s)": 0.033741 + }, + { + "epoch": 2.568988173455979, + "grad_norm": 0.2562493085861206, + "learning_rate": 5.318367983829393e-07, + "loss": 0.43427433967590334, + "memory(GiB)": 86.38, + "step": 1955, + "token_acc": 0.8623294224281183, + "train_speed(iter/s)": 0.033742 + }, + { + "epoch": 2.5755584756898817, + "grad_norm": 0.2527105212211609, + "learning_rate": 5.160286084989119e-07, + "loss": 0.4341059684753418, + "memory(GiB)": 86.38, + "step": 1960, + "token_acc": 0.8693252448908557, + "train_speed(iter/s)": 0.033744 + }, + { + "epoch": 2.5821287779237845, + "grad_norm": 0.2574499249458313, + "learning_rate": 5.004461258767873e-07, + "loss": 0.43187813758850097, + "memory(GiB)": 86.38, + "step": 1965, + "token_acc": 0.866506053867062, + "train_speed(iter/s)": 0.033744 + }, + { + "epoch": 2.5886990801576872, + "grad_norm": 0.2509396970272064, + "learning_rate": 4.850901348836328e-07, + "loss": 0.4363058090209961, + "memory(GiB)": 86.38, + "step": 1970, + "token_acc": 0.8651067174557108, + "train_speed(iter/s)": 0.033744 + }, + { + "epoch": 2.59526938239159, + "grad_norm": 0.25620976090431213, + "learning_rate": 4.699614084857257e-07, + "loss": 0.43309574127197265, + "memory(GiB)": 86.38, + "step": 1975, + "token_acc": 0.8586676260718354, + "train_speed(iter/s)": 0.033748 + }, + { + "epoch": 2.6018396846254928, + "grad_norm": 0.26240846514701843, + "learning_rate": 4.5506070820964973e-07, + "loss": 0.4343746185302734, + "memory(GiB)": 86.38, + "step": 1980, + "token_acc": 0.8798804986092511, + "train_speed(iter/s)": 0.033746 + }, + { + "epoch": 2.6084099868593955, + "grad_norm": 0.2610469460487366, + "learning_rate": 4.4038878410396003e-07, + "loss": 0.43410425186157225, + "memory(GiB)": 86.38, + "step": 1985, + "token_acc": 0.8795611253711813, + "train_speed(iter/s)": 0.033747 + }, + { + "epoch": 2.6149802890932983, + "grad_norm": 0.26572689414024353, + "learning_rate": 4.2594637470142587e-07, + "loss": 0.4306765556335449, + "memory(GiB)": 86.38, + "step": 1990, + "token_acc": 0.8712702886577899, + "train_speed(iter/s)": 0.033746 + }, + { + "epoch": 2.621550591327201, + "grad_norm": 0.2641327381134033, + "learning_rate": 4.1173420698186027e-07, + "loss": 0.4300968647003174, + "memory(GiB)": 86.38, + "step": 1995, + "token_acc": 0.8624918094168305, + "train_speed(iter/s)": 0.033745 + }, + { + "epoch": 2.628120893561104, + "grad_norm": 0.2534749507904053, + "learning_rate": 3.9775299633552535e-07, + "loss": 0.43284106254577637, + "memory(GiB)": 86.38, + "step": 2000, + "token_acc": 0.8643312431984246, + "train_speed(iter/s)": 0.033744 + }, + { + "epoch": 2.6346911957950065, + "grad_norm": 0.2566858232021332, + "learning_rate": 3.840034465271164e-07, + "loss": 0.4347895622253418, + "memory(GiB)": 86.38, + "step": 2005, + "token_acc": 0.8744274109814939, + "train_speed(iter/s)": 0.033742 + }, + { + "epoch": 2.6412614980289093, + "grad_norm": 0.2541216313838959, + "learning_rate": 3.7048624966034506e-07, + "loss": 0.4313460350036621, + "memory(GiB)": 86.38, + "step": 2010, + "token_acc": 0.8607475533545572, + "train_speed(iter/s)": 0.033742 + }, + { + "epoch": 2.647831800262812, + "grad_norm": 0.2455441802740097, + "learning_rate": 3.572020861430997e-07, + "loss": 0.429301118850708, + "memory(GiB)": 86.38, + "step": 2015, + "token_acc": 0.8699587080717235, + "train_speed(iter/s)": 0.033745 + }, + { + "epoch": 2.654402102496715, + "grad_norm": 0.2605392336845398, + "learning_rate": 3.4415162465318843e-07, + "loss": 0.43214893341064453, + "memory(GiB)": 86.38, + "step": 2020, + "token_acc": 0.8749286122215877, + "train_speed(iter/s)": 0.033746 + }, + { + "epoch": 2.6609724047306176, + "grad_norm": 0.25466424226760864, + "learning_rate": 3.313355221046888e-07, + "loss": 0.4351536273956299, + "memory(GiB)": 86.38, + "step": 2025, + "token_acc": 0.8606416722999324, + "train_speed(iter/s)": 0.033747 + }, + { + "epoch": 2.6675427069645203, + "grad_norm": 0.26080095767974854, + "learning_rate": 3.1875442361487987e-07, + "loss": 0.43200006484985354, + "memory(GiB)": 86.38, + "step": 2030, + "token_acc": 0.8577261487147047, + "train_speed(iter/s)": 0.033746 + }, + { + "epoch": 2.674113009198423, + "grad_norm": 0.2586158215999603, + "learning_rate": 3.0640896247176257e-07, + "loss": 0.4336066246032715, + "memory(GiB)": 86.38, + "step": 2035, + "token_acc": 0.8641052229438411, + "train_speed(iter/s)": 0.033748 + }, + { + "epoch": 2.680683311432326, + "grad_norm": 0.25608712434768677, + "learning_rate": 2.942997601021924e-07, + "loss": 0.431638240814209, + "memory(GiB)": 86.38, + "step": 2040, + "token_acc": 0.8686192034065534, + "train_speed(iter/s)": 0.033749 + }, + { + "epoch": 2.6872536136662286, + "grad_norm": 0.2741917669773102, + "learning_rate": 2.824274260405896e-07, + "loss": 0.43211984634399414, + "memory(GiB)": 86.38, + "step": 2045, + "token_acc": 0.8671380975045897, + "train_speed(iter/s)": 0.03375 + }, + { + "epoch": 2.6938239159001314, + "grad_norm": 0.25155529379844666, + "learning_rate": 2.7079255789826565e-07, + "loss": 0.4306828022003174, + "memory(GiB)": 86.38, + "step": 2050, + "token_acc": 0.8573598004121922, + "train_speed(iter/s)": 0.03375 + }, + { + "epoch": 2.700394218134034, + "grad_norm": 0.2564401924610138, + "learning_rate": 2.593957413333331e-07, + "loss": 0.435395622253418, + "memory(GiB)": 86.38, + "step": 2055, + "token_acc": 0.8754750443374715, + "train_speed(iter/s)": 0.033751 + }, + { + "epoch": 2.706964520367937, + "grad_norm": 0.26199498772621155, + "learning_rate": 2.4823755002123253e-07, + "loss": 0.43353948593139646, + "memory(GiB)": 86.38, + "step": 2060, + "token_acc": 0.864589503613316, + "train_speed(iter/s)": 0.033752 + }, + { + "epoch": 2.7135348226018396, + "grad_norm": 0.25014081597328186, + "learning_rate": 2.373185456258531e-07, + "loss": 0.43132529258728025, + "memory(GiB)": 86.38, + "step": 2065, + "token_acc": 0.8730107001249253, + "train_speed(iter/s)": 0.033753 + }, + { + "epoch": 2.7201051248357424, + "grad_norm": 0.26392775774002075, + "learning_rate": 2.266392777712595e-07, + "loss": 0.4323751926422119, + "memory(GiB)": 86.38, + "step": 2070, + "token_acc": 0.8591085068536152, + "train_speed(iter/s)": 0.033755 + }, + { + "epoch": 2.726675427069645, + "grad_norm": 0.2512003779411316, + "learning_rate": 2.1620028401402815e-07, + "loss": 0.42936067581176757, + "memory(GiB)": 86.38, + "step": 2075, + "token_acc": 0.8530398736675878, + "train_speed(iter/s)": 0.033755 + }, + { + "epoch": 2.733245729303548, + "grad_norm": 0.2585814893245697, + "learning_rate": 2.060020898161863e-07, + "loss": 0.4324427604675293, + "memory(GiB)": 86.38, + "step": 2080, + "token_acc": 0.8682577296321372, + "train_speed(iter/s)": 0.033755 + }, + { + "epoch": 2.7398160315374507, + "grad_norm": 0.2547103464603424, + "learning_rate": 1.9604520851876196e-07, + "loss": 0.42908296585083006, + "memory(GiB)": 86.38, + "step": 2085, + "token_acc": 0.876189898744922, + "train_speed(iter/s)": 0.033753 + }, + { + "epoch": 2.7463863337713534, + "grad_norm": 0.25863251090049744, + "learning_rate": 1.863301413159474e-07, + "loss": 0.43100652694702146, + "memory(GiB)": 86.38, + "step": 2090, + "token_acc": 0.8747226144845672, + "train_speed(iter/s)": 0.033756 + }, + { + "epoch": 2.752956636005256, + "grad_norm": 0.2541860044002533, + "learning_rate": 1.768573772298665e-07, + "loss": 0.43143587112426757, + "memory(GiB)": 86.38, + "step": 2095, + "token_acc": 0.8809481163054511, + "train_speed(iter/s)": 0.033757 + }, + { + "epoch": 2.759526938239159, + "grad_norm": 0.24803873896598816, + "learning_rate": 1.6762739308596343e-07, + "loss": 0.4299370765686035, + "memory(GiB)": 86.38, + "step": 2100, + "token_acc": 0.8700363353231249, + "train_speed(iter/s)": 0.033756 + } + ], + "logging_steps": 5, + "max_steps": 2283, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 300, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.4827819580022116e+20, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}