{ "best_global_step": 14000, "best_metric": 1.92009127, "best_model_checkpoint": "/home/pawangcs/devichand/ms-swift-chatas/exp_output_minicpm_no_img_imgchat/v11-20250809-190924/checkpoint-14000", "epoch": 4.96915695681974, "eval_steps": 500, "global_step": 14500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003427004797806717, "grad_norm": 4.963552474975586, "learning_rate": 9.999999884087794e-05, "loss": 4.6799492835998535, "memory(GiB)": 31.51, "step": 1, "token_acc": 0.2870159453302961, "train_speed(iter/s)": 0.316559 }, { "epoch": 0.0017135023989033585, "grad_norm": 2.4697046279907227, "learning_rate": 9.999997102195137e-05, "loss": 3.903712272644043, "memory(GiB)": 54.96, "step": 5, "token_acc": 0.34418865805727117, "train_speed(iter/s)": 0.757358 }, { "epoch": 0.003427004797806717, "grad_norm": 1.9197832345962524, "learning_rate": 9.999988408783905e-05, "loss": 3.2450187683105467, "memory(GiB)": 54.96, "step": 10, "token_acc": 0.39524011899702505, "train_speed(iter/s)": 0.968129 }, { "epoch": 0.005140507196710075, "grad_norm": 1.6383864879608154, "learning_rate": 9.999973919776382e-05, "loss": 2.8740705490112304, "memory(GiB)": 54.96, "step": 15, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.056957 }, { "epoch": 0.006854009595613434, "grad_norm": 1.4952691793441772, "learning_rate": 9.99995363518936e-05, "loss": 2.742406463623047, "memory(GiB)": 54.96, "step": 20, "token_acc": 0.4467046379170057, "train_speed(iter/s)": 1.100865 }, { "epoch": 0.008567511994516792, "grad_norm": 1.639898419380188, "learning_rate": 9.999927555046355e-05, "loss": 2.7475746154785154, "memory(GiB)": 54.96, "step": 25, "token_acc": 0.4462118041021348, "train_speed(iter/s)": 1.128969 }, { "epoch": 0.01028101439342015, "grad_norm": 1.6686553955078125, "learning_rate": 9.999895679377595e-05, "loss": 2.6557409286499025, "memory(GiB)": 54.96, "step": 30, "token_acc": 0.4576271186440678, "train_speed(iter/s)": 1.15441 }, { "epoch": 0.011994516792323509, "grad_norm": 1.5342313051223755, "learning_rate": 9.999858008220029e-05, "loss": 2.7645309448242186, "memory(GiB)": 54.96, "step": 35, "token_acc": 0.4495924495924496, "train_speed(iter/s)": 1.163347 }, { "epoch": 0.013708019191226868, "grad_norm": 1.5538337230682373, "learning_rate": 9.99981454161732e-05, "loss": 2.626693344116211, "memory(GiB)": 69.72, "step": 40, "token_acc": 0.46480067854113655, "train_speed(iter/s)": 1.167367 }, { "epoch": 0.015421521590130226, "grad_norm": 1.3050143718719482, "learning_rate": 9.999765279619855e-05, "loss": 2.6281137466430664, "memory(GiB)": 69.72, "step": 45, "token_acc": 0.4478476821192053, "train_speed(iter/s)": 1.171322 }, { "epoch": 0.017135023989033583, "grad_norm": 1.6585451364517212, "learning_rate": 9.99971022228473e-05, "loss": 2.6257543563842773, "memory(GiB)": 69.72, "step": 50, "token_acc": 0.4501675041876047, "train_speed(iter/s)": 1.184802 }, { "epoch": 0.018848526387936944, "grad_norm": 1.2543548345565796, "learning_rate": 9.999649369675768e-05, "loss": 2.6810981750488283, "memory(GiB)": 69.72, "step": 55, "token_acc": 0.4629144851657941, "train_speed(iter/s)": 1.191383 }, { "epoch": 0.0205620287868403, "grad_norm": 1.4155943393707275, "learning_rate": 9.9995827218635e-05, "loss": 2.7071456909179688, "memory(GiB)": 69.72, "step": 60, "token_acc": 0.45621090259159963, "train_speed(iter/s)": 1.18736 }, { "epoch": 0.02227553118574366, "grad_norm": 1.5393792390823364, "learning_rate": 9.999510278925182e-05, "loss": 2.6428159713745116, "memory(GiB)": 69.72, "step": 65, "token_acc": 0.45918367346938777, "train_speed(iter/s)": 1.193354 }, { "epoch": 0.023989033584647018, "grad_norm": 1.508380651473999, "learning_rate": 9.999432040944783e-05, "loss": 2.545497512817383, "memory(GiB)": 69.72, "step": 70, "token_acc": 0.4757822546078011, "train_speed(iter/s)": 1.204209 }, { "epoch": 0.02570253598355038, "grad_norm": 1.606309413909912, "learning_rate": 9.99934800801299e-05, "loss": 2.508285903930664, "memory(GiB)": 69.72, "step": 75, "token_acc": 0.47923875432525953, "train_speed(iter/s)": 1.207441 }, { "epoch": 0.027416038382453736, "grad_norm": 1.2237852811813354, "learning_rate": 9.999258180227209e-05, "loss": 2.6128358840942383, "memory(GiB)": 69.72, "step": 80, "token_acc": 0.4613733905579399, "train_speed(iter/s)": 1.205907 }, { "epoch": 0.029129540781357093, "grad_norm": 1.8111507892608643, "learning_rate": 9.99916255769156e-05, "loss": 2.6486885070800783, "memory(GiB)": 69.72, "step": 85, "token_acc": 0.45609436435124506, "train_speed(iter/s)": 1.203679 }, { "epoch": 0.030843043180260453, "grad_norm": 1.6752275228500366, "learning_rate": 9.999061140516881e-05, "loss": 2.507363700866699, "memory(GiB)": 69.72, "step": 90, "token_acc": 0.48720173535791755, "train_speed(iter/s)": 1.210082 }, { "epoch": 0.03255654557916381, "grad_norm": 1.8334081172943115, "learning_rate": 9.998953928820727e-05, "loss": 2.550128936767578, "memory(GiB)": 69.72, "step": 95, "token_acc": 0.4745762711864407, "train_speed(iter/s)": 1.207351 }, { "epoch": 0.03427004797806717, "grad_norm": 1.4305516481399536, "learning_rate": 9.998840922727371e-05, "loss": 2.5760061264038088, "memory(GiB)": 87.66, "step": 100, "token_acc": 0.4654300168634064, "train_speed(iter/s)": 1.200469 }, { "epoch": 0.03598355037697053, "grad_norm": 1.2939285039901733, "learning_rate": 9.9987221223678e-05, "loss": 2.5824636459350585, "memory(GiB)": 87.66, "step": 105, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.204674 }, { "epoch": 0.03769705277587389, "grad_norm": 1.2965565919876099, "learning_rate": 9.998597527879717e-05, "loss": 2.58685245513916, "memory(GiB)": 87.66, "step": 110, "token_acc": 0.47638272103350826, "train_speed(iter/s)": 1.197748 }, { "epoch": 0.03941055517477725, "grad_norm": 1.4692373275756836, "learning_rate": 9.998467139407544e-05, "loss": 2.483348274230957, "memory(GiB)": 87.66, "step": 115, "token_acc": 0.4832155477031802, "train_speed(iter/s)": 1.197898 }, { "epoch": 0.0411240575736806, "grad_norm": 1.321492314338684, "learning_rate": 9.998330957102415e-05, "loss": 2.5091089248657226, "memory(GiB)": 87.66, "step": 120, "token_acc": 0.46923398911678527, "train_speed(iter/s)": 1.200078 }, { "epoch": 0.04283755997258396, "grad_norm": 1.4497312307357788, "learning_rate": 9.998188981122183e-05, "loss": 2.495448684692383, "memory(GiB)": 87.66, "step": 125, "token_acc": 0.48687782805429863, "train_speed(iter/s)": 1.198457 }, { "epoch": 0.04455106237148732, "grad_norm": 1.3354806900024414, "learning_rate": 9.998041211631417e-05, "loss": 2.555104637145996, "memory(GiB)": 87.66, "step": 130, "token_acc": 0.4580703336339044, "train_speed(iter/s)": 1.205398 }, { "epoch": 0.046264564770390676, "grad_norm": 1.7531394958496094, "learning_rate": 9.997887648801396e-05, "loss": 2.587027359008789, "memory(GiB)": 87.66, "step": 135, "token_acc": 0.4696513129573827, "train_speed(iter/s)": 1.206073 }, { "epoch": 0.047978067169294036, "grad_norm": 1.6935189962387085, "learning_rate": 9.997728292810122e-05, "loss": 2.5349538803100584, "memory(GiB)": 87.66, "step": 140, "token_acc": 0.47285464098073554, "train_speed(iter/s)": 1.205216 }, { "epoch": 0.0496915695681974, "grad_norm": 1.319119930267334, "learning_rate": 9.997563143842308e-05, "loss": 2.5872756958007814, "memory(GiB)": 87.66, "step": 145, "token_acc": 0.4660314830157415, "train_speed(iter/s)": 1.206578 }, { "epoch": 0.05140507196710076, "grad_norm": 1.3608235120773315, "learning_rate": 9.997392202089377e-05, "loss": 2.6414863586425783, "memory(GiB)": 87.66, "step": 150, "token_acc": 0.46983857264231094, "train_speed(iter/s)": 1.204836 }, { "epoch": 0.05311857436600411, "grad_norm": 1.2583929300308228, "learning_rate": 9.997215467749477e-05, "loss": 2.5688735961914064, "memory(GiB)": 87.66, "step": 155, "token_acc": 0.48328396106644095, "train_speed(iter/s)": 1.204271 }, { "epoch": 0.05483207676490747, "grad_norm": 1.326547622680664, "learning_rate": 9.997032941027462e-05, "loss": 2.5065399169921876, "memory(GiB)": 87.66, "step": 160, "token_acc": 0.49165596919127086, "train_speed(iter/s)": 1.20722 }, { "epoch": 0.05654557916381083, "grad_norm": 1.3818352222442627, "learning_rate": 9.996844622134902e-05, "loss": 2.5955501556396485, "memory(GiB)": 87.66, "step": 165, "token_acc": 0.472400513478819, "train_speed(iter/s)": 1.206246 }, { "epoch": 0.058259081562714185, "grad_norm": 1.3249913454055786, "learning_rate": 9.996650511290082e-05, "loss": 2.6052120208740233, "memory(GiB)": 87.66, "step": 170, "token_acc": 0.4742268041237113, "train_speed(iter/s)": 1.209995 }, { "epoch": 0.059972583961617545, "grad_norm": 1.24324369430542, "learning_rate": 9.996450608718002e-05, "loss": 2.570241928100586, "memory(GiB)": 87.66, "step": 175, "token_acc": 0.4542079207920792, "train_speed(iter/s)": 1.213079 }, { "epoch": 0.061686086360520906, "grad_norm": 1.253162145614624, "learning_rate": 9.996244914650372e-05, "loss": 2.5835597991943358, "memory(GiB)": 87.66, "step": 180, "token_acc": 0.457286432160804, "train_speed(iter/s)": 1.210395 }, { "epoch": 0.06339958875942427, "grad_norm": 1.3544718027114868, "learning_rate": 9.996033429325617e-05, "loss": 2.513744926452637, "memory(GiB)": 87.66, "step": 185, "token_acc": 0.4768962510897995, "train_speed(iter/s)": 1.211349 }, { "epoch": 0.06511309115832763, "grad_norm": 1.230699896812439, "learning_rate": 9.995816152988873e-05, "loss": 2.5308584213256835, "memory(GiB)": 87.66, "step": 190, "token_acc": 0.47096227214921577, "train_speed(iter/s)": 1.208525 }, { "epoch": 0.06682659355723097, "grad_norm": 1.3964964151382446, "learning_rate": 9.995593085891991e-05, "loss": 2.564719581604004, "memory(GiB)": 87.66, "step": 195, "token_acc": 0.46993524514338575, "train_speed(iter/s)": 1.210525 }, { "epoch": 0.06854009595613433, "grad_norm": 1.6201012134552002, "learning_rate": 9.995364228293533e-05, "loss": 2.578047180175781, "memory(GiB)": 87.66, "step": 200, "token_acc": 0.4709326641572564, "train_speed(iter/s)": 1.212112 }, { "epoch": 0.0702535983550377, "grad_norm": 1.4579530954360962, "learning_rate": 9.995129580458773e-05, "loss": 2.535914993286133, "memory(GiB)": 87.66, "step": 205, "token_acc": 0.48203221809169766, "train_speed(iter/s)": 1.204315 }, { "epoch": 0.07196710075394105, "grad_norm": 1.4185296297073364, "learning_rate": 9.994889142659695e-05, "loss": 2.6034374237060547, "memory(GiB)": 87.66, "step": 210, "token_acc": 0.4606035205364627, "train_speed(iter/s)": 1.206031 }, { "epoch": 0.07368060315284441, "grad_norm": 1.3260804414749146, "learning_rate": 9.994642915174997e-05, "loss": 2.5803178787231444, "memory(GiB)": 87.66, "step": 215, "token_acc": 0.47303609341825903, "train_speed(iter/s)": 1.20442 }, { "epoch": 0.07539410555174778, "grad_norm": 1.2348705530166626, "learning_rate": 9.994390898290086e-05, "loss": 2.5485214233398437, "memory(GiB)": 87.66, "step": 220, "token_acc": 0.4727108705457826, "train_speed(iter/s)": 1.205745 }, { "epoch": 0.07710760795065114, "grad_norm": 1.3087815046310425, "learning_rate": 9.994133092297081e-05, "loss": 2.487457275390625, "memory(GiB)": 87.66, "step": 225, "token_acc": 0.4639872662156785, "train_speed(iter/s)": 1.201123 }, { "epoch": 0.0788211103495545, "grad_norm": 1.4699616432189941, "learning_rate": 9.99386949749481e-05, "loss": 2.490413475036621, "memory(GiB)": 87.66, "step": 230, "token_acc": 0.48089171974522293, "train_speed(iter/s)": 1.202065 }, { "epoch": 0.08053461274845784, "grad_norm": 1.407140851020813, "learning_rate": 9.993600114188812e-05, "loss": 2.558310890197754, "memory(GiB)": 87.66, "step": 235, "token_acc": 0.46523388116308473, "train_speed(iter/s)": 1.202485 }, { "epoch": 0.0822481151473612, "grad_norm": 1.2033467292785645, "learning_rate": 9.993324942691336e-05, "loss": 2.53837890625, "memory(GiB)": 87.66, "step": 240, "token_acc": 0.46382978723404256, "train_speed(iter/s)": 1.200899 }, { "epoch": 0.08396161754626456, "grad_norm": 1.2666771411895752, "learning_rate": 9.993043983321338e-05, "loss": 2.569959259033203, "memory(GiB)": 87.66, "step": 245, "token_acc": 0.47185970636215335, "train_speed(iter/s)": 1.201564 }, { "epoch": 0.08567511994516792, "grad_norm": 1.47136652469635, "learning_rate": 9.992757236404483e-05, "loss": 2.5225528717041015, "memory(GiB)": 87.66, "step": 250, "token_acc": 0.4750874125874126, "train_speed(iter/s)": 1.203403 }, { "epoch": 0.08738862234407128, "grad_norm": 1.7947242259979248, "learning_rate": 9.992464702273148e-05, "loss": 2.533863830566406, "memory(GiB)": 87.66, "step": 255, "token_acc": 0.48626144879267275, "train_speed(iter/s)": 1.199789 }, { "epoch": 0.08910212474297464, "grad_norm": 1.2617156505584717, "learning_rate": 9.992166381266413e-05, "loss": 2.645303726196289, "memory(GiB)": 87.66, "step": 260, "token_acc": 0.45305770887166236, "train_speed(iter/s)": 1.19941 }, { "epoch": 0.090815627141878, "grad_norm": 1.5156151056289673, "learning_rate": 9.991862273730073e-05, "loss": 2.516830825805664, "memory(GiB)": 87.66, "step": 265, "token_acc": 0.471846355303361, "train_speed(iter/s)": 1.201021 }, { "epoch": 0.09252912954078135, "grad_norm": 1.2194738388061523, "learning_rate": 9.99155238001662e-05, "loss": 2.5057514190673826, "memory(GiB)": 87.66, "step": 270, "token_acc": 0.4816414686825054, "train_speed(iter/s)": 1.203571 }, { "epoch": 0.09424263193968471, "grad_norm": 1.2770516872406006, "learning_rate": 9.991236700485263e-05, "loss": 2.5224559783935545, "memory(GiB)": 87.66, "step": 275, "token_acc": 0.47106434641845407, "train_speed(iter/s)": 1.203552 }, { "epoch": 0.09595613433858807, "grad_norm": 1.3374152183532715, "learning_rate": 9.990915235501913e-05, "loss": 2.4192970275878904, "memory(GiB)": 87.66, "step": 280, "token_acc": 0.4807293825113966, "train_speed(iter/s)": 1.200578 }, { "epoch": 0.09766963673749143, "grad_norm": 1.406653642654419, "learning_rate": 9.990587985439183e-05, "loss": 2.460310173034668, "memory(GiB)": 87.66, "step": 285, "token_acc": 0.5004194630872483, "train_speed(iter/s)": 1.20038 }, { "epoch": 0.0993831391363948, "grad_norm": 1.3873275518417358, "learning_rate": 9.9902549506764e-05, "loss": 2.532239532470703, "memory(GiB)": 87.66, "step": 290, "token_acc": 0.4988558352402746, "train_speed(iter/s)": 1.200887 }, { "epoch": 0.10109664153529815, "grad_norm": 1.2837986946105957, "learning_rate": 9.989916131599591e-05, "loss": 2.3827672958374024, "memory(GiB)": 87.66, "step": 295, "token_acc": 0.5098882201203784, "train_speed(iter/s)": 1.201252 }, { "epoch": 0.10281014393420151, "grad_norm": 1.3599152565002441, "learning_rate": 9.989571528601487e-05, "loss": 2.5518108367919923, "memory(GiB)": 87.66, "step": 300, "token_acc": 0.4681912681912682, "train_speed(iter/s)": 1.203327 }, { "epoch": 0.10452364633310486, "grad_norm": 1.0804427862167358, "learning_rate": 9.989221142081526e-05, "loss": 2.4857067108154296, "memory(GiB)": 87.66, "step": 305, "token_acc": 0.47590601354042217, "train_speed(iter/s)": 1.202566 }, { "epoch": 0.10623714873200822, "grad_norm": 1.4274872541427612, "learning_rate": 9.988864972445848e-05, "loss": 2.4979413986206054, "memory(GiB)": 87.66, "step": 310, "token_acc": 0.4841168996188056, "train_speed(iter/s)": 1.203768 }, { "epoch": 0.10795065113091158, "grad_norm": 1.2808573246002197, "learning_rate": 9.988503020107298e-05, "loss": 2.6247100830078125, "memory(GiB)": 87.66, "step": 315, "token_acc": 0.4632263307598537, "train_speed(iter/s)": 1.203242 }, { "epoch": 0.10966415352981494, "grad_norm": 1.7363955974578857, "learning_rate": 9.988135285485422e-05, "loss": 2.6593313217163086, "memory(GiB)": 87.66, "step": 320, "token_acc": 0.45465890183028285, "train_speed(iter/s)": 1.203089 }, { "epoch": 0.1113776559287183, "grad_norm": 1.2582919597625732, "learning_rate": 9.987761769006471e-05, "loss": 2.574449157714844, "memory(GiB)": 87.66, "step": 325, "token_acc": 0.46214201647704983, "train_speed(iter/s)": 1.202757 }, { "epoch": 0.11309115832762166, "grad_norm": 1.2567654848098755, "learning_rate": 9.987382471103395e-05, "loss": 2.4789525985717775, "memory(GiB)": 87.66, "step": 330, "token_acc": 0.49022164276401564, "train_speed(iter/s)": 1.203761 }, { "epoch": 0.11480466072652502, "grad_norm": 1.3611072301864624, "learning_rate": 9.986997392215846e-05, "loss": 2.4648691177368165, "memory(GiB)": 87.66, "step": 335, "token_acc": 0.47387173396674587, "train_speed(iter/s)": 1.204872 }, { "epoch": 0.11651816312542837, "grad_norm": 1.4040131568908691, "learning_rate": 9.986606532790178e-05, "loss": 2.6041484832763673, "memory(GiB)": 87.66, "step": 340, "token_acc": 0.47257212515237706, "train_speed(iter/s)": 1.206294 }, { "epoch": 0.11823166552433173, "grad_norm": 1.3243968486785889, "learning_rate": 9.986209893279444e-05, "loss": 2.519818878173828, "memory(GiB)": 87.66, "step": 345, "token_acc": 0.46811337466784764, "train_speed(iter/s)": 1.204376 }, { "epoch": 0.11994516792323509, "grad_norm": 1.3620216846466064, "learning_rate": 9.985807474143398e-05, "loss": 2.4966924667358397, "memory(GiB)": 87.66, "step": 350, "token_acc": 0.48434148434148433, "train_speed(iter/s)": 1.204392 }, { "epoch": 0.12165867032213845, "grad_norm": 1.1787054538726807, "learning_rate": 9.985399275848492e-05, "loss": 2.5471403121948244, "memory(GiB)": 87.66, "step": 355, "token_acc": 0.47379454926624737, "train_speed(iter/s)": 1.204744 }, { "epoch": 0.12337217272104181, "grad_norm": 1.2229851484298706, "learning_rate": 9.984985298867882e-05, "loss": 2.355154037475586, "memory(GiB)": 87.66, "step": 360, "token_acc": 0.5121845082680592, "train_speed(iter/s)": 1.203347 }, { "epoch": 0.12508567511994517, "grad_norm": 1.1907895803451538, "learning_rate": 9.984565543681412e-05, "loss": 2.6052608489990234, "memory(GiB)": 107.04, "step": 365, "token_acc": 0.46141793967880923, "train_speed(iter/s)": 1.201379 }, { "epoch": 0.12679917751884853, "grad_norm": 1.4068623781204224, "learning_rate": 9.984140010775632e-05, "loss": 2.508440399169922, "memory(GiB)": 107.04, "step": 370, "token_acc": 0.46904315196998125, "train_speed(iter/s)": 1.20246 }, { "epoch": 0.1285126799177519, "grad_norm": 1.2812556028366089, "learning_rate": 9.983708700643786e-05, "loss": 2.497690963745117, "memory(GiB)": 107.04, "step": 375, "token_acc": 0.4769874476987448, "train_speed(iter/s)": 1.203348 }, { "epoch": 0.13022618231665525, "grad_norm": 1.301218867301941, "learning_rate": 9.983271613785816e-05, "loss": 2.518081474304199, "memory(GiB)": 107.04, "step": 380, "token_acc": 0.46313963573287076, "train_speed(iter/s)": 1.204025 }, { "epoch": 0.13193968471555861, "grad_norm": 1.4108742475509644, "learning_rate": 9.982828750708359e-05, "loss": 2.521461296081543, "memory(GiB)": 107.04, "step": 385, "token_acc": 0.47820567075751164, "train_speed(iter/s)": 1.200145 }, { "epoch": 0.13365318711446195, "grad_norm": 1.318585991859436, "learning_rate": 9.982380111924745e-05, "loss": 2.560490608215332, "memory(GiB)": 107.04, "step": 390, "token_acc": 0.4805194805194805, "train_speed(iter/s)": 1.199871 }, { "epoch": 0.1353666895133653, "grad_norm": 1.0891865491867065, "learning_rate": 9.981925697955004e-05, "loss": 2.5543228149414063, "memory(GiB)": 107.04, "step": 395, "token_acc": 0.4604993860008187, "train_speed(iter/s)": 1.199677 }, { "epoch": 0.13708019191226867, "grad_norm": 1.4444414377212524, "learning_rate": 9.981465509325856e-05, "loss": 2.5815128326416015, "memory(GiB)": 107.04, "step": 400, "token_acc": 0.46881720430107526, "train_speed(iter/s)": 1.200204 }, { "epoch": 0.13879369431117203, "grad_norm": 1.2715994119644165, "learning_rate": 9.980999546570715e-05, "loss": 2.4680997848510744, "memory(GiB)": 107.04, "step": 405, "token_acc": 0.4799655320982335, "train_speed(iter/s)": 1.200246 }, { "epoch": 0.1405071967100754, "grad_norm": 1.1026136875152588, "learning_rate": 9.980527810229691e-05, "loss": 2.4910659790039062, "memory(GiB)": 107.04, "step": 410, "token_acc": 0.48855291576673865, "train_speed(iter/s)": 1.201135 }, { "epoch": 0.14222069910897875, "grad_norm": 1.2569341659545898, "learning_rate": 9.980050300849582e-05, "loss": 2.5464752197265623, "memory(GiB)": 107.04, "step": 415, "token_acc": 0.4670094258783205, "train_speed(iter/s)": 1.201354 }, { "epoch": 0.1439342015078821, "grad_norm": 1.4204517602920532, "learning_rate": 9.979567018983878e-05, "loss": 2.4989944458007813, "memory(GiB)": 107.04, "step": 420, "token_acc": 0.4835758835758836, "train_speed(iter/s)": 1.201739 }, { "epoch": 0.14564770390678547, "grad_norm": 1.368656039237976, "learning_rate": 9.979077965192765e-05, "loss": 2.4992691040039063, "memory(GiB)": 107.04, "step": 425, "token_acc": 0.473142345568487, "train_speed(iter/s)": 1.202739 }, { "epoch": 0.14736120630568883, "grad_norm": 1.277221918106079, "learning_rate": 9.978583140043113e-05, "loss": 2.582627296447754, "memory(GiB)": 107.04, "step": 430, "token_acc": 0.4828193832599119, "train_speed(iter/s)": 1.204266 }, { "epoch": 0.1490747087045922, "grad_norm": 1.3156546354293823, "learning_rate": 9.978082544108487e-05, "loss": 2.4771116256713865, "memory(GiB)": 107.04, "step": 435, "token_acc": 0.47356426618049224, "train_speed(iter/s)": 1.204647 }, { "epoch": 0.15078821110349555, "grad_norm": 1.2151230573654175, "learning_rate": 9.97757617796914e-05, "loss": 2.4117715835571287, "memory(GiB)": 107.04, "step": 440, "token_acc": 0.4973474801061008, "train_speed(iter/s)": 1.205968 }, { "epoch": 0.1525017135023989, "grad_norm": 1.1947633028030396, "learning_rate": 9.977064042212008e-05, "loss": 2.5334638595581054, "memory(GiB)": 107.04, "step": 445, "token_acc": 0.47806661251015437, "train_speed(iter/s)": 1.205426 }, { "epoch": 0.15421521590130227, "grad_norm": 1.4307478666305542, "learning_rate": 9.97654613743072e-05, "loss": 2.603252983093262, "memory(GiB)": 107.04, "step": 450, "token_acc": 0.46218842416561046, "train_speed(iter/s)": 1.205513 }, { "epoch": 0.15592871830020563, "grad_norm": 1.3171420097351074, "learning_rate": 9.976022464225592e-05, "loss": 2.4788990020751953, "memory(GiB)": 107.04, "step": 455, "token_acc": 0.4887589928057554, "train_speed(iter/s)": 1.205497 }, { "epoch": 0.157642220699109, "grad_norm": 1.2103610038757324, "learning_rate": 9.975493023203626e-05, "loss": 2.5018871307373045, "memory(GiB)": 107.04, "step": 460, "token_acc": 0.4725516029863856, "train_speed(iter/s)": 1.205335 }, { "epoch": 0.15935572309801233, "grad_norm": 1.1404234170913696, "learning_rate": 9.974957814978507e-05, "loss": 2.540557861328125, "memory(GiB)": 107.04, "step": 465, "token_acc": 0.4810590631364562, "train_speed(iter/s)": 1.204549 }, { "epoch": 0.16106922549691569, "grad_norm": 1.1332801580429077, "learning_rate": 9.974416840170606e-05, "loss": 2.5165504455566405, "memory(GiB)": 107.04, "step": 470, "token_acc": 0.47874149659863946, "train_speed(iter/s)": 1.205519 }, { "epoch": 0.16278272789581905, "grad_norm": 1.183814525604248, "learning_rate": 9.973870099406981e-05, "loss": 2.4643630981445312, "memory(GiB)": 107.04, "step": 475, "token_acc": 0.4962655601659751, "train_speed(iter/s)": 1.206052 }, { "epoch": 0.1644962302947224, "grad_norm": 1.3166519403457642, "learning_rate": 9.973317593321369e-05, "loss": 2.5922359466552733, "memory(GiB)": 107.04, "step": 480, "token_acc": 0.4768416409351566, "train_speed(iter/s)": 1.20582 }, { "epoch": 0.16620973269362577, "grad_norm": 1.235674262046814, "learning_rate": 9.972759322554195e-05, "loss": 2.480392837524414, "memory(GiB)": 107.04, "step": 485, "token_acc": 0.4810508182601206, "train_speed(iter/s)": 1.205851 }, { "epoch": 0.16792323509252913, "grad_norm": 1.232388973236084, "learning_rate": 9.97219528775256e-05, "loss": 2.6111339569091796, "memory(GiB)": 107.04, "step": 490, "token_acc": 0.4607438016528926, "train_speed(iter/s)": 1.206751 }, { "epoch": 0.1696367374914325, "grad_norm": 1.0716609954833984, "learning_rate": 9.971625489570248e-05, "loss": 2.4871824264526365, "memory(GiB)": 107.04, "step": 495, "token_acc": 0.4857829395274329, "train_speed(iter/s)": 1.207146 }, { "epoch": 0.17135023989033585, "grad_norm": 1.3028372526168823, "learning_rate": 9.971049928667729e-05, "loss": 2.5782838821411134, "memory(GiB)": 107.04, "step": 500, "token_acc": 0.47720618987871183, "train_speed(iter/s)": 1.204958 }, { "epoch": 0.17135023989033585, "eval_loss": 2.321640968322754, "eval_runtime": 3.6878, "eval_samples_per_second": 27.116, "eval_steps_per_second": 27.116, "eval_token_acc": 0.4777777777777778, "step": 500 }, { "epoch": 0.1730637422892392, "grad_norm": 1.2093425989151, "learning_rate": 9.970468605712144e-05, "loss": 2.502716827392578, "memory(GiB)": 107.04, "step": 505, "token_acc": 0.48424162836506895, "train_speed(iter/s)": 1.190213 }, { "epoch": 0.17477724468814257, "grad_norm": 1.294241189956665, "learning_rate": 9.969881521377319e-05, "loss": 2.5557674407958983, "memory(GiB)": 107.04, "step": 510, "token_acc": 0.49361702127659574, "train_speed(iter/s)": 1.18865 }, { "epoch": 0.17649074708704593, "grad_norm": 1.2488949298858643, "learning_rate": 9.969288676343758e-05, "loss": 2.4770881652832033, "memory(GiB)": 107.04, "step": 515, "token_acc": 0.4943775100401606, "train_speed(iter/s)": 1.187923 }, { "epoch": 0.1782042494859493, "grad_norm": 1.2035720348358154, "learning_rate": 9.968690071298638e-05, "loss": 2.464649200439453, "memory(GiB)": 107.04, "step": 520, "token_acc": 0.48495897903372837, "train_speed(iter/s)": 1.187578 }, { "epoch": 0.17991775188485265, "grad_norm": 1.4203979969024658, "learning_rate": 9.968085706935816e-05, "loss": 2.4021087646484376, "memory(GiB)": 107.04, "step": 525, "token_acc": 0.4981052631578947, "train_speed(iter/s)": 1.187779 }, { "epoch": 0.181631254283756, "grad_norm": 1.2177263498306274, "learning_rate": 9.967475583955824e-05, "loss": 2.424332046508789, "memory(GiB)": 107.04, "step": 530, "token_acc": 0.48980496453900707, "train_speed(iter/s)": 1.188497 }, { "epoch": 0.18334475668265934, "grad_norm": 1.324190616607666, "learning_rate": 9.96685970306587e-05, "loss": 2.5900314331054686, "memory(GiB)": 107.04, "step": 535, "token_acc": 0.47168141592920354, "train_speed(iter/s)": 1.186284 }, { "epoch": 0.1850582590815627, "grad_norm": 1.258746862411499, "learning_rate": 9.966238064979834e-05, "loss": 2.5319984436035154, "memory(GiB)": 107.04, "step": 540, "token_acc": 0.4910362920857018, "train_speed(iter/s)": 1.186486 }, { "epoch": 0.18677176148046606, "grad_norm": 1.1349014043807983, "learning_rate": 9.96561067041827e-05, "loss": 2.3855648040771484, "memory(GiB)": 107.04, "step": 545, "token_acc": 0.4980426272292301, "train_speed(iter/s)": 1.185724 }, { "epoch": 0.18848526387936942, "grad_norm": 1.247622013092041, "learning_rate": 9.964977520108408e-05, "loss": 2.558890724182129, "memory(GiB)": 107.04, "step": 550, "token_acc": 0.46898705934850515, "train_speed(iter/s)": 1.186612 }, { "epoch": 0.19019876627827279, "grad_norm": 1.1651710271835327, "learning_rate": 9.964338614784141e-05, "loss": 2.4813091278076174, "memory(GiB)": 107.04, "step": 555, "token_acc": 0.4900332225913621, "train_speed(iter/s)": 1.18654 }, { "epoch": 0.19191226867717615, "grad_norm": 1.1621208190917969, "learning_rate": 9.963693955186042e-05, "loss": 2.637445831298828, "memory(GiB)": 107.04, "step": 560, "token_acc": 0.4596707818930041, "train_speed(iter/s)": 1.184059 }, { "epoch": 0.1936257710760795, "grad_norm": 1.207560658454895, "learning_rate": 9.96304354206135e-05, "loss": 2.474043273925781, "memory(GiB)": 107.04, "step": 565, "token_acc": 0.4820804195804196, "train_speed(iter/s)": 1.185131 }, { "epoch": 0.19533927347498287, "grad_norm": 1.2527809143066406, "learning_rate": 9.96238737616397e-05, "loss": 2.5015649795532227, "memory(GiB)": 107.04, "step": 570, "token_acc": 0.4894894894894895, "train_speed(iter/s)": 1.182841 }, { "epoch": 0.19705277587388623, "grad_norm": 1.2823240756988525, "learning_rate": 9.961725458254483e-05, "loss": 2.488645553588867, "memory(GiB)": 107.04, "step": 575, "token_acc": 0.4782795698924731, "train_speed(iter/s)": 1.183578 }, { "epoch": 0.1987662782727896, "grad_norm": 1.4486905336380005, "learning_rate": 9.961057789100128e-05, "loss": 2.522553062438965, "memory(GiB)": 107.04, "step": 580, "token_acc": 0.477797513321492, "train_speed(iter/s)": 1.184895 }, { "epoch": 0.20047978067169295, "grad_norm": 1.2198811769485474, "learning_rate": 9.960384369474818e-05, "loss": 2.4064138412475584, "memory(GiB)": 107.04, "step": 585, "token_acc": 0.5039028620988725, "train_speed(iter/s)": 1.183296 }, { "epoch": 0.2021932830705963, "grad_norm": 1.0906304121017456, "learning_rate": 9.959705200159128e-05, "loss": 2.5081449508666993, "memory(GiB)": 107.04, "step": 590, "token_acc": 0.4808861859252824, "train_speed(iter/s)": 1.183854 }, { "epoch": 0.20390678546949967, "grad_norm": 1.128794550895691, "learning_rate": 9.959020281940297e-05, "loss": 2.532901382446289, "memory(GiB)": 107.04, "step": 595, "token_acc": 0.4764705882352941, "train_speed(iter/s)": 1.183672 }, { "epoch": 0.20562028786840303, "grad_norm": 1.3799782991409302, "learning_rate": 9.95832961561223e-05, "loss": 2.484122657775879, "memory(GiB)": 107.04, "step": 600, "token_acc": 0.4855687606112054, "train_speed(iter/s)": 1.184172 }, { "epoch": 0.20733379026730636, "grad_norm": 1.2213202714920044, "learning_rate": 9.957633201975492e-05, "loss": 2.5398059844970704, "memory(GiB)": 107.04, "step": 605, "token_acc": 0.47468354430379744, "train_speed(iter/s)": 1.184434 }, { "epoch": 0.20904729266620972, "grad_norm": 1.3055410385131836, "learning_rate": 9.956931041837312e-05, "loss": 2.436742401123047, "memory(GiB)": 107.04, "step": 610, "token_acc": 0.4879668049792531, "train_speed(iter/s)": 1.182191 }, { "epoch": 0.21076079506511308, "grad_norm": 1.1349438428878784, "learning_rate": 9.95622313601158e-05, "loss": 2.493661117553711, "memory(GiB)": 107.04, "step": 615, "token_acc": 0.47721822541966424, "train_speed(iter/s)": 1.182673 }, { "epoch": 0.21247429746401644, "grad_norm": 1.4202725887298584, "learning_rate": 9.955509485318843e-05, "loss": 2.4772903442382814, "memory(GiB)": 107.04, "step": 620, "token_acc": 0.47664720600500415, "train_speed(iter/s)": 1.183061 }, { "epoch": 0.2141877998629198, "grad_norm": 1.235129714012146, "learning_rate": 9.954790090586314e-05, "loss": 2.482021713256836, "memory(GiB)": 107.04, "step": 625, "token_acc": 0.4780045351473923, "train_speed(iter/s)": 1.183957 }, { "epoch": 0.21590130226182316, "grad_norm": 1.1287906169891357, "learning_rate": 9.954064952647854e-05, "loss": 2.533871078491211, "memory(GiB)": 107.04, "step": 630, "token_acc": 0.4853187919463087, "train_speed(iter/s)": 1.184954 }, { "epoch": 0.21761480466072652, "grad_norm": 1.075061321258545, "learning_rate": 9.953334072343987e-05, "loss": 2.545399856567383, "memory(GiB)": 107.04, "step": 635, "token_acc": 0.47224558452481075, "train_speed(iter/s)": 1.183743 }, { "epoch": 0.21932830705962988, "grad_norm": 1.3835502862930298, "learning_rate": 9.952597450521897e-05, "loss": 2.556488800048828, "memory(GiB)": 107.04, "step": 640, "token_acc": 0.45827929096411585, "train_speed(iter/s)": 1.184701 }, { "epoch": 0.22104180945853324, "grad_norm": 1.3940082788467407, "learning_rate": 9.951855088035413e-05, "loss": 2.5208831787109376, "memory(GiB)": 107.04, "step": 645, "token_acc": 0.47782002534854245, "train_speed(iter/s)": 1.183393 }, { "epoch": 0.2227553118574366, "grad_norm": 1.2291702032089233, "learning_rate": 9.951106985745024e-05, "loss": 2.463373565673828, "memory(GiB)": 107.04, "step": 650, "token_acc": 0.4775132275132275, "train_speed(iter/s)": 1.184214 }, { "epoch": 0.22446881425633997, "grad_norm": 1.347756266593933, "learning_rate": 9.950353144517877e-05, "loss": 2.5355144500732423, "memory(GiB)": 107.04, "step": 655, "token_acc": 0.4743752647183397, "train_speed(iter/s)": 1.184087 }, { "epoch": 0.22618231665524333, "grad_norm": 1.2864567041397095, "learning_rate": 9.949593565227762e-05, "loss": 2.472720146179199, "memory(GiB)": 107.04, "step": 660, "token_acc": 0.4872651356993737, "train_speed(iter/s)": 1.182799 }, { "epoch": 0.2278958190541467, "grad_norm": 1.1586545705795288, "learning_rate": 9.948828248755122e-05, "loss": 2.518563461303711, "memory(GiB)": 107.04, "step": 665, "token_acc": 0.46293245469522243, "train_speed(iter/s)": 1.183317 }, { "epoch": 0.22960932145305005, "grad_norm": 1.3029749393463135, "learning_rate": 9.948057195987057e-05, "loss": 2.599252128601074, "memory(GiB)": 107.04, "step": 670, "token_acc": 0.46543372963452223, "train_speed(iter/s)": 1.180917 }, { "epoch": 0.23132282385195338, "grad_norm": 1.2748653888702393, "learning_rate": 9.947280407817308e-05, "loss": 2.4968685150146483, "memory(GiB)": 107.04, "step": 675, "token_acc": 0.47860262008733623, "train_speed(iter/s)": 1.181222 }, { "epoch": 0.23303632625085674, "grad_norm": 1.1449968814849854, "learning_rate": 9.946497885146269e-05, "loss": 2.4727148056030273, "memory(GiB)": 107.04, "step": 680, "token_acc": 0.48674010607915136, "train_speed(iter/s)": 1.181577 }, { "epoch": 0.2347498286497601, "grad_norm": 1.0794786214828491, "learning_rate": 9.945709628880976e-05, "loss": 2.616939353942871, "memory(GiB)": 107.04, "step": 685, "token_acc": 0.4627926421404682, "train_speed(iter/s)": 1.182272 }, { "epoch": 0.23646333104866346, "grad_norm": 1.184170126914978, "learning_rate": 9.94491563993512e-05, "loss": 2.562537956237793, "memory(GiB)": 107.04, "step": 690, "token_acc": 0.46487603305785125, "train_speed(iter/s)": 1.182758 }, { "epoch": 0.23817683344756682, "grad_norm": 1.2025244235992432, "learning_rate": 9.944115919229025e-05, "loss": 2.5975284576416016, "memory(GiB)": 107.04, "step": 695, "token_acc": 0.48403361344537815, "train_speed(iter/s)": 1.183471 }, { "epoch": 0.23989033584647018, "grad_norm": 1.1473746299743652, "learning_rate": 9.943310467689666e-05, "loss": 2.5795188903808595, "memory(GiB)": 107.04, "step": 700, "token_acc": 0.4620997162545602, "train_speed(iter/s)": 1.183239 }, { "epoch": 0.24160383824537354, "grad_norm": 1.2044447660446167, "learning_rate": 9.942499286250664e-05, "loss": 2.50523681640625, "memory(GiB)": 107.04, "step": 705, "token_acc": 0.4862270450751252, "train_speed(iter/s)": 1.18341 }, { "epoch": 0.2433173406442769, "grad_norm": 1.1171855926513672, "learning_rate": 9.941682375852271e-05, "loss": 2.3973468780517577, "memory(GiB)": 107.04, "step": 710, "token_acc": 0.48898305084745763, "train_speed(iter/s)": 1.180274 }, { "epoch": 0.24503084304318026, "grad_norm": 1.261797547340393, "learning_rate": 9.94085973744139e-05, "loss": 2.462741470336914, "memory(GiB)": 107.04, "step": 715, "token_acc": 0.4851650647722524, "train_speed(iter/s)": 1.1808 }, { "epoch": 0.24674434544208362, "grad_norm": 1.2870444059371948, "learning_rate": 9.940031371971559e-05, "loss": 2.5753931045532226, "memory(GiB)": 107.04, "step": 720, "token_acc": 0.46157148990983254, "train_speed(iter/s)": 1.18144 }, { "epoch": 0.24845784784098698, "grad_norm": 1.196175217628479, "learning_rate": 9.939197280402952e-05, "loss": 2.5084272384643556, "memory(GiB)": 107.04, "step": 725, "token_acc": 0.4734660033167496, "train_speed(iter/s)": 1.179644 }, { "epoch": 0.25017135023989034, "grad_norm": 1.1984542608261108, "learning_rate": 9.938357463702386e-05, "loss": 2.4408279418945313, "memory(GiB)": 107.04, "step": 730, "token_acc": 0.4875555555555556, "train_speed(iter/s)": 1.180484 }, { "epoch": 0.2518848526387937, "grad_norm": 1.2635442018508911, "learning_rate": 9.937511922843308e-05, "loss": 2.4381027221679688, "memory(GiB)": 107.04, "step": 735, "token_acc": 0.47655894401456533, "train_speed(iter/s)": 1.179334 }, { "epoch": 0.25359835503769707, "grad_norm": 1.3986057043075562, "learning_rate": 9.936660658805806e-05, "loss": 2.4227922439575194, "memory(GiB)": 107.04, "step": 740, "token_acc": 0.4821991888237945, "train_speed(iter/s)": 1.18006 }, { "epoch": 0.2553118574366004, "grad_norm": 1.1378962993621826, "learning_rate": 9.935803672576596e-05, "loss": 2.4402996063232423, "memory(GiB)": 107.04, "step": 745, "token_acc": 0.5041868664609961, "train_speed(iter/s)": 1.180719 }, { "epoch": 0.2570253598355038, "grad_norm": 1.2280017137527466, "learning_rate": 9.93494096514903e-05, "loss": 2.600652885437012, "memory(GiB)": 107.04, "step": 750, "token_acc": 0.4653896386591206, "train_speed(iter/s)": 1.178752 }, { "epoch": 0.2587388622344071, "grad_norm": 1.8681074380874634, "learning_rate": 9.934072537523093e-05, "loss": 2.573705291748047, "memory(GiB)": 107.04, "step": 755, "token_acc": 0.48223121907332434, "train_speed(iter/s)": 1.179404 }, { "epoch": 0.2604523646333105, "grad_norm": 1.2438080310821533, "learning_rate": 9.933198390705396e-05, "loss": 2.448863410949707, "memory(GiB)": 107.04, "step": 760, "token_acc": 0.49666221628838453, "train_speed(iter/s)": 1.179559 }, { "epoch": 0.26216586703221384, "grad_norm": 1.288278579711914, "learning_rate": 9.932318525709185e-05, "loss": 2.635662078857422, "memory(GiB)": 107.04, "step": 765, "token_acc": 0.4624796084828711, "train_speed(iter/s)": 1.179121 }, { "epoch": 0.26387936943111723, "grad_norm": 1.2884206771850586, "learning_rate": 9.931432943554329e-05, "loss": 2.4294502258300783, "memory(GiB)": 107.04, "step": 770, "token_acc": 0.47874535699546017, "train_speed(iter/s)": 1.179514 }, { "epoch": 0.26559287183002056, "grad_norm": 1.2971642017364502, "learning_rate": 9.930541645267323e-05, "loss": 2.5408504486083983, "memory(GiB)": 107.04, "step": 775, "token_acc": 0.46750524109014674, "train_speed(iter/s)": 1.18004 }, { "epoch": 0.2673063742289239, "grad_norm": 1.1014388799667358, "learning_rate": 9.929644631881297e-05, "loss": 2.5004741668701174, "memory(GiB)": 107.04, "step": 780, "token_acc": 0.474990153603781, "train_speed(iter/s)": 1.179215 }, { "epoch": 0.2690198766278273, "grad_norm": 1.2679884433746338, "learning_rate": 9.928741904435992e-05, "loss": 2.5923603057861326, "memory(GiB)": 107.04, "step": 785, "token_acc": 0.46908776480760916, "train_speed(iter/s)": 1.179673 }, { "epoch": 0.2707333790267306, "grad_norm": 1.2329357862472534, "learning_rate": 9.927833463977782e-05, "loss": 2.4765838623046874, "memory(GiB)": 107.04, "step": 790, "token_acc": 0.47347826086956524, "train_speed(iter/s)": 1.179957 }, { "epoch": 0.272446881425634, "grad_norm": 1.1546977758407593, "learning_rate": 9.926919311559661e-05, "loss": 2.418170166015625, "memory(GiB)": 107.04, "step": 795, "token_acc": 0.47900599828620394, "train_speed(iter/s)": 1.180282 }, { "epoch": 0.27416038382453733, "grad_norm": 1.2930309772491455, "learning_rate": 9.925999448241244e-05, "loss": 2.4681064605712892, "memory(GiB)": 107.04, "step": 800, "token_acc": 0.48525469168900803, "train_speed(iter/s)": 1.181219 }, { "epoch": 0.2758738862234407, "grad_norm": 1.023927927017212, "learning_rate": 9.925073875088763e-05, "loss": 2.4732891082763673, "memory(GiB)": 107.04, "step": 805, "token_acc": 0.4848861283643892, "train_speed(iter/s)": 1.180806 }, { "epoch": 0.27758738862234406, "grad_norm": 1.3010534048080444, "learning_rate": 9.924142593175069e-05, "loss": 2.512874221801758, "memory(GiB)": 107.04, "step": 810, "token_acc": 0.48452278589853826, "train_speed(iter/s)": 1.181705 }, { "epoch": 0.27930089102124744, "grad_norm": 1.1734291315078735, "learning_rate": 9.923205603579635e-05, "loss": 2.555068016052246, "memory(GiB)": 107.04, "step": 815, "token_acc": 0.48080357142857144, "train_speed(iter/s)": 1.18254 }, { "epoch": 0.2810143934201508, "grad_norm": 1.1736445426940918, "learning_rate": 9.922262907388542e-05, "loss": 2.545096588134766, "memory(GiB)": 107.04, "step": 820, "token_acc": 0.4698151950718686, "train_speed(iter/s)": 1.182428 }, { "epoch": 0.28272789581905416, "grad_norm": 1.0682395696640015, "learning_rate": 9.921314505694492e-05, "loss": 2.558370018005371, "memory(GiB)": 107.04, "step": 825, "token_acc": 0.4693186627168853, "train_speed(iter/s)": 1.182571 }, { "epoch": 0.2844413982179575, "grad_norm": 1.180648684501648, "learning_rate": 9.920360399596799e-05, "loss": 2.467837905883789, "memory(GiB)": 107.04, "step": 830, "token_acc": 0.4857507443641004, "train_speed(iter/s)": 1.183294 }, { "epoch": 0.2861549006168609, "grad_norm": 2.080698013305664, "learning_rate": 9.919400590201386e-05, "loss": 2.5844657897949217, "memory(GiB)": 107.04, "step": 835, "token_acc": 0.47658643326039385, "train_speed(iter/s)": 1.183586 }, { "epoch": 0.2878684030157642, "grad_norm": 1.2099300622940063, "learning_rate": 9.918435078620792e-05, "loss": 2.5908416748046874, "memory(GiB)": 107.04, "step": 840, "token_acc": 0.469215291750503, "train_speed(iter/s)": 1.183721 }, { "epoch": 0.2895819054146676, "grad_norm": 1.0201884508132935, "learning_rate": 9.917463865974161e-05, "loss": 2.4965965270996096, "memory(GiB)": 107.04, "step": 845, "token_acc": 0.47034425549564496, "train_speed(iter/s)": 1.183859 }, { "epoch": 0.29129540781357094, "grad_norm": 1.1658945083618164, "learning_rate": 9.916486953387246e-05, "loss": 2.353656005859375, "memory(GiB)": 107.04, "step": 850, "token_acc": 0.5, "train_speed(iter/s)": 1.184313 }, { "epoch": 0.29300891021247427, "grad_norm": 1.1799263954162598, "learning_rate": 9.91550434199241e-05, "loss": 2.569581985473633, "memory(GiB)": 107.04, "step": 855, "token_acc": 0.483150800336984, "train_speed(iter/s)": 1.184508 }, { "epoch": 0.29472241261137766, "grad_norm": 1.2661787271499634, "learning_rate": 9.914516032928618e-05, "loss": 2.4798137664794924, "memory(GiB)": 107.04, "step": 860, "token_acc": 0.4924731182795699, "train_speed(iter/s)": 1.185289 }, { "epoch": 0.296435915010281, "grad_norm": 1.2165000438690186, "learning_rate": 9.91352202734144e-05, "loss": 2.4704492568969725, "memory(GiB)": 107.04, "step": 865, "token_acc": 0.4795463750506278, "train_speed(iter/s)": 1.185323 }, { "epoch": 0.2981494174091844, "grad_norm": 1.3711497783660889, "learning_rate": 9.912522326383052e-05, "loss": 2.530714416503906, "memory(GiB)": 107.04, "step": 870, "token_acc": 0.46453287197231835, "train_speed(iter/s)": 1.185393 }, { "epoch": 0.2998629198080877, "grad_norm": 1.097999930381775, "learning_rate": 9.911516931212228e-05, "loss": 2.4248580932617188, "memory(GiB)": 107.29, "step": 875, "token_acc": 0.4932489451476793, "train_speed(iter/s)": 1.184859 }, { "epoch": 0.3015764222069911, "grad_norm": 1.1377798318862915, "learning_rate": 9.910505842994343e-05, "loss": 2.5094181060791017, "memory(GiB)": 107.29, "step": 880, "token_acc": 0.47917525773195874, "train_speed(iter/s)": 1.185052 }, { "epoch": 0.30328992460589443, "grad_norm": 1.138631820678711, "learning_rate": 9.909489062901372e-05, "loss": 2.4675504684448244, "memory(GiB)": 107.29, "step": 885, "token_acc": 0.4816067653276956, "train_speed(iter/s)": 1.184468 }, { "epoch": 0.3050034270047978, "grad_norm": 1.0629688501358032, "learning_rate": 9.908466592111887e-05, "loss": 2.507090759277344, "memory(GiB)": 107.29, "step": 890, "token_acc": 0.46869639794168094, "train_speed(iter/s)": 1.185174 }, { "epoch": 0.30671692940370116, "grad_norm": 1.1706143617630005, "learning_rate": 9.907438431811059e-05, "loss": 2.4931135177612305, "memory(GiB)": 107.29, "step": 895, "token_acc": 0.4909680407596109, "train_speed(iter/s)": 1.185568 }, { "epoch": 0.30843043180260454, "grad_norm": 1.13985276222229, "learning_rate": 9.906404583190645e-05, "loss": 2.4697280883789063, "memory(GiB)": 107.29, "step": 900, "token_acc": 0.4814977973568282, "train_speed(iter/s)": 1.186009 }, { "epoch": 0.3101439342015079, "grad_norm": 1.1766681671142578, "learning_rate": 9.905365047449008e-05, "loss": 2.50244026184082, "memory(GiB)": 107.29, "step": 905, "token_acc": 0.4735099337748344, "train_speed(iter/s)": 1.186367 }, { "epoch": 0.31185743660041126, "grad_norm": 1.1171972751617432, "learning_rate": 9.904319825791094e-05, "loss": 2.47772102355957, "memory(GiB)": 107.29, "step": 910, "token_acc": 0.4736612973443622, "train_speed(iter/s)": 1.187132 }, { "epoch": 0.3135709389993146, "grad_norm": 1.1783778667449951, "learning_rate": 9.903268919428442e-05, "loss": 2.586382293701172, "memory(GiB)": 107.29, "step": 915, "token_acc": 0.4671221178479932, "train_speed(iter/s)": 1.187849 }, { "epoch": 0.315284441398218, "grad_norm": 1.15598726272583, "learning_rate": 9.902212329579182e-05, "loss": 2.4426319122314455, "memory(GiB)": 107.29, "step": 920, "token_acc": 0.4781696053736356, "train_speed(iter/s)": 1.188244 }, { "epoch": 0.3169979437971213, "grad_norm": 1.1066339015960693, "learning_rate": 9.901150057468028e-05, "loss": 2.4981735229492186, "memory(GiB)": 107.29, "step": 925, "token_acc": 0.48252032520325205, "train_speed(iter/s)": 1.188702 }, { "epoch": 0.31871144619602465, "grad_norm": 1.0880540609359741, "learning_rate": 9.900082104326284e-05, "loss": 2.537156677246094, "memory(GiB)": 107.29, "step": 930, "token_acc": 0.45820063694267515, "train_speed(iter/s)": 1.189224 }, { "epoch": 0.32042494859492804, "grad_norm": 1.1282163858413696, "learning_rate": 9.89900847139184e-05, "loss": 2.5515966415405273, "memory(GiB)": 107.29, "step": 935, "token_acc": 0.4717842323651452, "train_speed(iter/s)": 1.189429 }, { "epoch": 0.32213845099383137, "grad_norm": 1.1894043684005737, "learning_rate": 9.897929159909168e-05, "loss": 2.558162879943848, "memory(GiB)": 107.29, "step": 940, "token_acc": 0.47141673570836784, "train_speed(iter/s)": 1.187902 }, { "epoch": 0.32385195339273476, "grad_norm": 1.013088583946228, "learning_rate": 9.896844171129316e-05, "loss": 2.471614646911621, "memory(GiB)": 107.29, "step": 945, "token_acc": 0.4763795156808257, "train_speed(iter/s)": 1.187681 }, { "epoch": 0.3255654557916381, "grad_norm": 1.2161592245101929, "learning_rate": 9.895753506309922e-05, "loss": 2.497726249694824, "memory(GiB)": 107.29, "step": 950, "token_acc": 0.47087166009636444, "train_speed(iter/s)": 1.187222 }, { "epoch": 0.3272789581905415, "grad_norm": 1.1651531457901, "learning_rate": 9.894657166715201e-05, "loss": 2.417892837524414, "memory(GiB)": 107.29, "step": 955, "token_acc": 0.4838709677419355, "train_speed(iter/s)": 1.187483 }, { "epoch": 0.3289924605894448, "grad_norm": 1.0287449359893799, "learning_rate": 9.893555153615942e-05, "loss": 2.4650337219238283, "memory(GiB)": 107.29, "step": 960, "token_acc": 0.4947275922671353, "train_speed(iter/s)": 1.187666 }, { "epoch": 0.3307059629883482, "grad_norm": 1.1438298225402832, "learning_rate": 9.892447468289514e-05, "loss": 2.5536678314208983, "memory(GiB)": 112.5, "step": 965, "token_acc": 0.4677351328553353, "train_speed(iter/s)": 1.187039 }, { "epoch": 0.33241946538725153, "grad_norm": 1.0947679281234741, "learning_rate": 9.891334112019858e-05, "loss": 2.437881851196289, "memory(GiB)": 112.5, "step": 970, "token_acc": 0.48113964686998395, "train_speed(iter/s)": 1.187106 }, { "epoch": 0.3341329677861549, "grad_norm": 1.231793999671936, "learning_rate": 9.89021508609749e-05, "loss": 2.518886375427246, "memory(GiB)": 112.5, "step": 975, "token_acc": 0.4750415973377704, "train_speed(iter/s)": 1.187192 }, { "epoch": 0.33584647018505825, "grad_norm": 1.108077883720398, "learning_rate": 9.889090391819497e-05, "loss": 2.527558517456055, "memory(GiB)": 112.5, "step": 980, "token_acc": 0.4675324675324675, "train_speed(iter/s)": 1.187581 }, { "epoch": 0.33755997258396164, "grad_norm": 1.2914066314697266, "learning_rate": 9.887960030489539e-05, "loss": 2.4260705947875976, "memory(GiB)": 112.5, "step": 985, "token_acc": 0.4878048780487805, "train_speed(iter/s)": 1.1873 }, { "epoch": 0.339273474982865, "grad_norm": 1.1249085664749146, "learning_rate": 9.886824003417841e-05, "loss": 2.4947824478149414, "memory(GiB)": 112.5, "step": 990, "token_acc": 0.473044183218484, "train_speed(iter/s)": 1.187282 }, { "epoch": 0.3409869773817683, "grad_norm": 1.22458815574646, "learning_rate": 9.885682311921197e-05, "loss": 2.471230125427246, "memory(GiB)": 112.5, "step": 995, "token_acc": 0.47807757166947723, "train_speed(iter/s)": 1.18805 }, { "epoch": 0.3427004797806717, "grad_norm": 1.0531980991363525, "learning_rate": 9.884534957322966e-05, "loss": 2.5519657135009766, "memory(GiB)": 112.5, "step": 1000, "token_acc": 0.46494156928213687, "train_speed(iter/s)": 1.188467 }, { "epoch": 0.3427004797806717, "eval_loss": 2.0715537071228027, "eval_runtime": 3.7137, "eval_samples_per_second": 26.927, "eval_steps_per_second": 26.927, "eval_token_acc": 0.5023255813953489, "step": 1000 }, { "epoch": 0.34441398217957503, "grad_norm": 1.0632538795471191, "learning_rate": 9.883381940953074e-05, "loss": 2.4449430465698243, "memory(GiB)": 112.5, "step": 1005, "token_acc": 0.4830508474576271, "train_speed(iter/s)": 1.181625 }, { "epoch": 0.3461274845784784, "grad_norm": 1.3233312368392944, "learning_rate": 9.882223264148007e-05, "loss": 2.464262580871582, "memory(GiB)": 112.5, "step": 1010, "token_acc": 0.492194275802255, "train_speed(iter/s)": 1.181676 }, { "epoch": 0.34784098697738175, "grad_norm": 1.0508471727371216, "learning_rate": 9.881058928250811e-05, "loss": 2.4272560119628905, "memory(GiB)": 112.5, "step": 1015, "token_acc": 0.4915984489444205, "train_speed(iter/s)": 1.181833 }, { "epoch": 0.34955448937628514, "grad_norm": 1.158613681793213, "learning_rate": 9.879888934611093e-05, "loss": 2.467195510864258, "memory(GiB)": 112.5, "step": 1020, "token_acc": 0.4851190476190476, "train_speed(iter/s)": 1.182608 }, { "epoch": 0.35126799177518847, "grad_norm": 1.3239381313323975, "learning_rate": 9.878713284585021e-05, "loss": 2.5926055908203125, "memory(GiB)": 112.5, "step": 1025, "token_acc": 0.46680672268907564, "train_speed(iter/s)": 1.181584 }, { "epoch": 0.35298149417409186, "grad_norm": 1.1437301635742188, "learning_rate": 9.877531979535315e-05, "loss": 2.5061677932739257, "memory(GiB)": 112.5, "step": 1030, "token_acc": 0.48196573489630296, "train_speed(iter/s)": 1.182179 }, { "epoch": 0.3546949965729952, "grad_norm": 1.1104246377944946, "learning_rate": 9.876345020831253e-05, "loss": 2.4491600036621093, "memory(GiB)": 112.5, "step": 1035, "token_acc": 0.49385525716886663, "train_speed(iter/s)": 1.182628 }, { "epoch": 0.3564084989718986, "grad_norm": 1.1326972246170044, "learning_rate": 9.875152409848662e-05, "loss": 2.460665702819824, "memory(GiB)": 112.5, "step": 1040, "token_acc": 0.4741156169111303, "train_speed(iter/s)": 1.1829 }, { "epoch": 0.3581220013708019, "grad_norm": 1.1639753580093384, "learning_rate": 9.873954147969926e-05, "loss": 2.4081695556640623, "memory(GiB)": 112.5, "step": 1045, "token_acc": 0.4924178601516428, "train_speed(iter/s)": 1.182952 }, { "epoch": 0.3598355037697053, "grad_norm": 1.63519287109375, "learning_rate": 9.872750236583977e-05, "loss": 2.502071762084961, "memory(GiB)": 112.5, "step": 1050, "token_acc": 0.48212801330008315, "train_speed(iter/s)": 1.182804 }, { "epoch": 0.36154900616860863, "grad_norm": 1.1909750699996948, "learning_rate": 9.871540677086293e-05, "loss": 2.537074661254883, "memory(GiB)": 112.5, "step": 1055, "token_acc": 0.483539974348012, "train_speed(iter/s)": 1.183342 }, { "epoch": 0.363262508567512, "grad_norm": 1.1886450052261353, "learning_rate": 9.870325470878902e-05, "loss": 2.524631690979004, "memory(GiB)": 112.5, "step": 1060, "token_acc": 0.4662327095199349, "train_speed(iter/s)": 1.183456 }, { "epoch": 0.36497601096641535, "grad_norm": 1.0836304426193237, "learning_rate": 9.869104619370376e-05, "loss": 2.5331483840942384, "memory(GiB)": 112.5, "step": 1065, "token_acc": 0.4835960801022582, "train_speed(iter/s)": 1.183449 }, { "epoch": 0.3666895133653187, "grad_norm": 1.226065754890442, "learning_rate": 9.867878123975832e-05, "loss": 2.4290719985961915, "memory(GiB)": 112.5, "step": 1070, "token_acc": 0.48521959459459457, "train_speed(iter/s)": 1.183943 }, { "epoch": 0.3684030157642221, "grad_norm": 1.3345614671707153, "learning_rate": 9.866645986116927e-05, "loss": 2.4910892486572265, "memory(GiB)": 112.5, "step": 1075, "token_acc": 0.47750424448217316, "train_speed(iter/s)": 1.184387 }, { "epoch": 0.3701165181631254, "grad_norm": 1.5596996545791626, "learning_rate": 9.865408207221858e-05, "loss": 2.538698196411133, "memory(GiB)": 112.5, "step": 1080, "token_acc": 0.4908393694077546, "train_speed(iter/s)": 1.184794 }, { "epoch": 0.3718300205620288, "grad_norm": 1.1085871458053589, "learning_rate": 9.864164788725363e-05, "loss": 2.535078239440918, "memory(GiB)": 112.5, "step": 1085, "token_acc": 0.4790444258172674, "train_speed(iter/s)": 1.185032 }, { "epoch": 0.37354352296093213, "grad_norm": 1.2119961977005005, "learning_rate": 9.862915732068715e-05, "loss": 2.413924217224121, "memory(GiB)": 112.5, "step": 1090, "token_acc": 0.4831111111111111, "train_speed(iter/s)": 1.185561 }, { "epoch": 0.3752570253598355, "grad_norm": 1.186673641204834, "learning_rate": 9.861661038699723e-05, "loss": 2.571664047241211, "memory(GiB)": 112.5, "step": 1095, "token_acc": 0.47623054270088344, "train_speed(iter/s)": 1.18473 }, { "epoch": 0.37697052775873885, "grad_norm": 1.2185087203979492, "learning_rate": 9.860400710072731e-05, "loss": 2.5130756378173826, "memory(GiB)": 112.5, "step": 1100, "token_acc": 0.48534635879218474, "train_speed(iter/s)": 1.184912 }, { "epoch": 0.37868403015764224, "grad_norm": 1.0953351259231567, "learning_rate": 9.859134747648611e-05, "loss": 2.417750930786133, "memory(GiB)": 112.5, "step": 1105, "token_acc": 0.4907292954264524, "train_speed(iter/s)": 1.185354 }, { "epoch": 0.38039753255654557, "grad_norm": 1.0650701522827148, "learning_rate": 9.857863152894768e-05, "loss": 2.3910507202148437, "memory(GiB)": 112.5, "step": 1110, "token_acc": 0.4851316907391674, "train_speed(iter/s)": 1.185689 }, { "epoch": 0.38211103495544896, "grad_norm": 1.134871244430542, "learning_rate": 9.856585927285137e-05, "loss": 2.663742446899414, "memory(GiB)": 112.5, "step": 1115, "token_acc": 0.4600465477114042, "train_speed(iter/s)": 1.185669 }, { "epoch": 0.3838245373543523, "grad_norm": 1.418212890625, "learning_rate": 9.85530307230018e-05, "loss": 2.4259565353393553, "memory(GiB)": 112.5, "step": 1120, "token_acc": 0.4876279863481229, "train_speed(iter/s)": 1.185831 }, { "epoch": 0.3855380397532557, "grad_norm": 1.3134695291519165, "learning_rate": 9.854014589426878e-05, "loss": 2.5041067123413088, "memory(GiB)": 112.5, "step": 1125, "token_acc": 0.48447986577181207, "train_speed(iter/s)": 1.18643 }, { "epoch": 0.387251542152159, "grad_norm": 1.2502626180648804, "learning_rate": 9.852720480158743e-05, "loss": 2.5034212112426757, "memory(GiB)": 112.5, "step": 1130, "token_acc": 0.4808247422680412, "train_speed(iter/s)": 1.186599 }, { "epoch": 0.38896504455106234, "grad_norm": 1.1662282943725586, "learning_rate": 9.851420745995805e-05, "loss": 2.4486515045166017, "memory(GiB)": 112.5, "step": 1135, "token_acc": 0.4765279007971656, "train_speed(iter/s)": 1.187077 }, { "epoch": 0.39067854694996573, "grad_norm": 1.3025171756744385, "learning_rate": 9.850115388444613e-05, "loss": 2.567709541320801, "memory(GiB)": 112.5, "step": 1140, "token_acc": 0.4761491760624458, "train_speed(iter/s)": 1.187535 }, { "epoch": 0.39239204934886907, "grad_norm": 1.1560261249542236, "learning_rate": 9.848804409018235e-05, "loss": 2.436934471130371, "memory(GiB)": 112.5, "step": 1145, "token_acc": 0.4928825622775801, "train_speed(iter/s)": 1.187643 }, { "epoch": 0.39410555174777245, "grad_norm": 1.0981005430221558, "learning_rate": 9.84748780923626e-05, "loss": 2.5011260986328123, "memory(GiB)": 112.5, "step": 1150, "token_acc": 0.4776304888152444, "train_speed(iter/s)": 1.188087 }, { "epoch": 0.3958190541466758, "grad_norm": 1.106001377105713, "learning_rate": 9.846165590624783e-05, "loss": 2.475779342651367, "memory(GiB)": 112.5, "step": 1155, "token_acc": 0.4776414721013059, "train_speed(iter/s)": 1.188269 }, { "epoch": 0.3975325565455792, "grad_norm": 1.0769869089126587, "learning_rate": 9.844837754716419e-05, "loss": 2.2842002868652345, "memory(GiB)": 112.5, "step": 1160, "token_acc": 0.5042543663233319, "train_speed(iter/s)": 1.188588 }, { "epoch": 0.3992460589444825, "grad_norm": 1.570983648300171, "learning_rate": 9.843504303050292e-05, "loss": 2.501449966430664, "memory(GiB)": 112.5, "step": 1165, "token_acc": 0.48123154787009703, "train_speed(iter/s)": 1.188361 }, { "epoch": 0.4009595613433859, "grad_norm": 1.2547701597213745, "learning_rate": 9.842165237172034e-05, "loss": 2.476102828979492, "memory(GiB)": 112.5, "step": 1170, "token_acc": 0.4798801369863014, "train_speed(iter/s)": 1.188045 }, { "epoch": 0.40267306374228923, "grad_norm": 1.198660135269165, "learning_rate": 9.840820558633788e-05, "loss": 2.6096214294433593, "memory(GiB)": 112.5, "step": 1175, "token_acc": 0.47534429142603285, "train_speed(iter/s)": 1.188269 }, { "epoch": 0.4043865661411926, "grad_norm": 1.2324682474136353, "learning_rate": 9.839470268994195e-05, "loss": 2.4547271728515625, "memory(GiB)": 112.5, "step": 1180, "token_acc": 0.4898224339540927, "train_speed(iter/s)": 1.188243 }, { "epoch": 0.40610006854009595, "grad_norm": 1.0649844408035278, "learning_rate": 9.838114369818412e-05, "loss": 2.4506484985351564, "memory(GiB)": 112.5, "step": 1185, "token_acc": 0.4689199689199689, "train_speed(iter/s)": 1.188245 }, { "epoch": 0.40781357093899934, "grad_norm": 1.3361562490463257, "learning_rate": 9.836752862678087e-05, "loss": 2.5612396240234374, "memory(GiB)": 112.5, "step": 1190, "token_acc": 0.45787988714228134, "train_speed(iter/s)": 1.188564 }, { "epoch": 0.40952707333790267, "grad_norm": 0.9884452819824219, "learning_rate": 9.835385749151376e-05, "loss": 2.498444366455078, "memory(GiB)": 112.5, "step": 1195, "token_acc": 0.4672780325135473, "train_speed(iter/s)": 1.188511 }, { "epoch": 0.41124057573680606, "grad_norm": 1.2644041776657104, "learning_rate": 9.834013030822926e-05, "loss": 2.361698341369629, "memory(GiB)": 112.5, "step": 1200, "token_acc": 0.49778172138420584, "train_speed(iter/s)": 1.188661 }, { "epoch": 0.4129540781357094, "grad_norm": 1.04469633102417, "learning_rate": 9.832634709283888e-05, "loss": 2.382892036437988, "memory(GiB)": 112.5, "step": 1205, "token_acc": 0.49051724137931035, "train_speed(iter/s)": 1.188917 }, { "epoch": 0.4146675805346127, "grad_norm": 1.0489764213562012, "learning_rate": 9.831250786131904e-05, "loss": 2.4960182189941404, "memory(GiB)": 112.5, "step": 1210, "token_acc": 0.4622203461376108, "train_speed(iter/s)": 1.188634 }, { "epoch": 0.4163810829335161, "grad_norm": 1.361219048500061, "learning_rate": 9.829861262971112e-05, "loss": 2.46813850402832, "memory(GiB)": 112.5, "step": 1215, "token_acc": 0.48533333333333334, "train_speed(iter/s)": 1.188736 }, { "epoch": 0.41809458533241944, "grad_norm": 1.1651471853256226, "learning_rate": 9.828466141412134e-05, "loss": 2.5644916534423827, "memory(GiB)": 112.5, "step": 1220, "token_acc": 0.48172905525846704, "train_speed(iter/s)": 1.188343 }, { "epoch": 0.41980808773132283, "grad_norm": 1.1304482221603394, "learning_rate": 9.82706542307209e-05, "loss": 2.5272552490234377, "memory(GiB)": 112.5, "step": 1225, "token_acc": 0.48794679966749793, "train_speed(iter/s)": 1.188668 }, { "epoch": 0.42152159013022616, "grad_norm": 1.1784542798995972, "learning_rate": 9.825659109574583e-05, "loss": 2.4949974060058593, "memory(GiB)": 112.5, "step": 1230, "token_acc": 0.47878787878787876, "train_speed(iter/s)": 1.188178 }, { "epoch": 0.42323509252912955, "grad_norm": 1.0670850276947021, "learning_rate": 9.824247202549699e-05, "loss": 2.4830886840820314, "memory(GiB)": 112.5, "step": 1235, "token_acc": 0.4722106142916841, "train_speed(iter/s)": 1.188633 }, { "epoch": 0.4249485949280329, "grad_norm": 1.0241999626159668, "learning_rate": 9.822829703634013e-05, "loss": 2.509450912475586, "memory(GiB)": 112.5, "step": 1240, "token_acc": 0.4799005387484459, "train_speed(iter/s)": 1.188665 }, { "epoch": 0.4266620973269363, "grad_norm": 1.0372072458267212, "learning_rate": 9.821406614470578e-05, "loss": 2.422958183288574, "memory(GiB)": 112.5, "step": 1245, "token_acc": 0.48986083499005967, "train_speed(iter/s)": 1.188546 }, { "epoch": 0.4283755997258396, "grad_norm": 1.0603771209716797, "learning_rate": 9.819977936708931e-05, "loss": 2.4764820098876954, "memory(GiB)": 112.5, "step": 1250, "token_acc": 0.490272373540856, "train_speed(iter/s)": 1.18884 }, { "epoch": 0.430089102124743, "grad_norm": 1.0865055322647095, "learning_rate": 9.818543672005078e-05, "loss": 2.506429672241211, "memory(GiB)": 112.5, "step": 1255, "token_acc": 0.4698184888138455, "train_speed(iter/s)": 1.189225 }, { "epoch": 0.4318026045236463, "grad_norm": 1.024645447731018, "learning_rate": 9.817103822021511e-05, "loss": 2.4287574768066404, "memory(GiB)": 112.5, "step": 1260, "token_acc": 0.49206349206349204, "train_speed(iter/s)": 1.18923 }, { "epoch": 0.4335161069225497, "grad_norm": 0.9992151856422424, "learning_rate": 9.815658388427189e-05, "loss": 2.4817684173583983, "memory(GiB)": 112.5, "step": 1265, "token_acc": 0.4684343434343434, "train_speed(iter/s)": 1.189683 }, { "epoch": 0.43522960932145305, "grad_norm": 1.2159701585769653, "learning_rate": 9.814207372897548e-05, "loss": 2.41900577545166, "memory(GiB)": 112.5, "step": 1270, "token_acc": 0.48787185354691076, "train_speed(iter/s)": 1.189411 }, { "epoch": 0.4369431117203564, "grad_norm": 1.0813621282577515, "learning_rate": 9.812750777114492e-05, "loss": 2.4038930892944337, "memory(GiB)": 112.5, "step": 1275, "token_acc": 0.49572264745610084, "train_speed(iter/s)": 1.189681 }, { "epoch": 0.43865661411925977, "grad_norm": 1.1763622760772705, "learning_rate": 9.811288602766392e-05, "loss": 2.4971340179443358, "memory(GiB)": 112.5, "step": 1280, "token_acc": 0.4793880152996175, "train_speed(iter/s)": 1.189735 }, { "epoch": 0.4403701165181631, "grad_norm": 0.9866830706596375, "learning_rate": 9.809820851548086e-05, "loss": 2.5354270935058594, "memory(GiB)": 112.5, "step": 1285, "token_acc": 0.46817599316531394, "train_speed(iter/s)": 1.189831 }, { "epoch": 0.4420836189170665, "grad_norm": 1.079627275466919, "learning_rate": 9.808347525160878e-05, "loss": 2.6047460556030275, "memory(GiB)": 112.5, "step": 1290, "token_acc": 0.4737521514629948, "train_speed(iter/s)": 1.190035 }, { "epoch": 0.4437971213159698, "grad_norm": 1.0686455965042114, "learning_rate": 9.806868625312533e-05, "loss": 2.5527927398681642, "memory(GiB)": 112.5, "step": 1295, "token_acc": 0.4780426599749059, "train_speed(iter/s)": 1.190782 }, { "epoch": 0.4455106237148732, "grad_norm": 1.0224637985229492, "learning_rate": 9.805384153717274e-05, "loss": 2.4891841888427733, "memory(GiB)": 112.5, "step": 1300, "token_acc": 0.4826086956521739, "train_speed(iter/s)": 1.190314 }, { "epoch": 0.44722412611377654, "grad_norm": 1.0033811330795288, "learning_rate": 9.803894112095786e-05, "loss": 2.5255237579345704, "memory(GiB)": 112.5, "step": 1305, "token_acc": 0.4780130293159609, "train_speed(iter/s)": 1.190465 }, { "epoch": 0.44893762851267993, "grad_norm": 1.2227169275283813, "learning_rate": 9.80239850217521e-05, "loss": 2.515412521362305, "memory(GiB)": 112.5, "step": 1310, "token_acc": 0.47592067988668557, "train_speed(iter/s)": 1.19057 }, { "epoch": 0.45065113091158326, "grad_norm": 1.0970875024795532, "learning_rate": 9.800897325689141e-05, "loss": 2.438773345947266, "memory(GiB)": 112.5, "step": 1315, "token_acc": 0.4874476987447699, "train_speed(iter/s)": 1.190655 }, { "epoch": 0.45236463331048665, "grad_norm": 1.0927454233169556, "learning_rate": 9.799390584377622e-05, "loss": 2.651423454284668, "memory(GiB)": 112.5, "step": 1320, "token_acc": 0.45770519262981574, "train_speed(iter/s)": 1.191019 }, { "epoch": 0.45407813570939, "grad_norm": 1.0489068031311035, "learning_rate": 9.797878279987152e-05, "loss": 2.5297290802001955, "memory(GiB)": 112.5, "step": 1325, "token_acc": 0.4602717167558666, "train_speed(iter/s)": 1.190932 }, { "epoch": 0.4557916381082934, "grad_norm": 0.9802359938621521, "learning_rate": 9.796360414270675e-05, "loss": 2.4645509719848633, "memory(GiB)": 112.5, "step": 1330, "token_acc": 0.48175787728026537, "train_speed(iter/s)": 1.19118 }, { "epoch": 0.4575051405071967, "grad_norm": 1.465628981590271, "learning_rate": 9.794836988987585e-05, "loss": 2.4714914321899415, "memory(GiB)": 112.5, "step": 1335, "token_acc": 0.4833187006145742, "train_speed(iter/s)": 1.191393 }, { "epoch": 0.4592186429061001, "grad_norm": 1.0763601064682007, "learning_rate": 9.793308005903716e-05, "loss": 2.53787784576416, "memory(GiB)": 112.5, "step": 1340, "token_acc": 0.4797809604043808, "train_speed(iter/s)": 1.190166 }, { "epoch": 0.4609321453050034, "grad_norm": 1.1457759141921997, "learning_rate": 9.791773466791348e-05, "loss": 2.4996364593505858, "memory(GiB)": 112.5, "step": 1345, "token_acc": 0.4888429752066116, "train_speed(iter/s)": 1.190606 }, { "epoch": 0.46264564770390676, "grad_norm": 1.037041187286377, "learning_rate": 9.790233373429195e-05, "loss": 2.4335580825805665, "memory(GiB)": 112.5, "step": 1350, "token_acc": 0.498994772818657, "train_speed(iter/s)": 1.19102 }, { "epoch": 0.46435915010281015, "grad_norm": 1.8047376871109009, "learning_rate": 9.788687727602414e-05, "loss": 2.3664791107177736, "memory(GiB)": 112.5, "step": 1355, "token_acc": 0.48268303375712407, "train_speed(iter/s)": 1.191315 }, { "epoch": 0.4660726525017135, "grad_norm": 1.8988566398620605, "learning_rate": 9.7871365311026e-05, "loss": 2.5260210037231445, "memory(GiB)": 112.5, "step": 1360, "token_acc": 0.4689075630252101, "train_speed(iter/s)": 1.191745 }, { "epoch": 0.46778615490061687, "grad_norm": 1.119966983795166, "learning_rate": 9.785579785727778e-05, "loss": 2.4252777099609375, "memory(GiB)": 112.5, "step": 1365, "token_acc": 0.4978595890410959, "train_speed(iter/s)": 1.191579 }, { "epoch": 0.4694996572995202, "grad_norm": 1.0393651723861694, "learning_rate": 9.784017493282403e-05, "loss": 2.3908510208129883, "memory(GiB)": 112.5, "step": 1370, "token_acc": 0.49638451722671206, "train_speed(iter/s)": 1.191975 }, { "epoch": 0.4712131596984236, "grad_norm": 1.1788526773452759, "learning_rate": 9.782449655577364e-05, "loss": 2.430359649658203, "memory(GiB)": 112.5, "step": 1375, "token_acc": 0.49166666666666664, "train_speed(iter/s)": 1.192178 }, { "epoch": 0.4729266620973269, "grad_norm": 1.3297359943389893, "learning_rate": 9.780876274429978e-05, "loss": 2.5318336486816406, "memory(GiB)": 112.5, "step": 1380, "token_acc": 0.4766770324300311, "train_speed(iter/s)": 1.192218 }, { "epoch": 0.4746401644962303, "grad_norm": 1.0843183994293213, "learning_rate": 9.779297351663981e-05, "loss": 2.4453582763671875, "memory(GiB)": 112.5, "step": 1385, "token_acc": 0.4902826855123675, "train_speed(iter/s)": 1.192469 }, { "epoch": 0.47635366689513364, "grad_norm": 1.1989637613296509, "learning_rate": 9.777712889109542e-05, "loss": 2.447914886474609, "memory(GiB)": 112.5, "step": 1390, "token_acc": 0.4743747437474375, "train_speed(iter/s)": 1.192641 }, { "epoch": 0.47806716929403703, "grad_norm": 1.1116583347320557, "learning_rate": 9.776122888603244e-05, "loss": 2.5436847686767576, "memory(GiB)": 112.5, "step": 1395, "token_acc": 0.47746967071057195, "train_speed(iter/s)": 1.192863 }, { "epoch": 0.47978067169294036, "grad_norm": 1.0853110551834106, "learning_rate": 9.774527351988092e-05, "loss": 2.484409713745117, "memory(GiB)": 112.5, "step": 1400, "token_acc": 0.48153093012906095, "train_speed(iter/s)": 1.193156 }, { "epoch": 0.48149417409184375, "grad_norm": 1.2044814825057983, "learning_rate": 9.772926281113506e-05, "loss": 2.478936767578125, "memory(GiB)": 112.5, "step": 1405, "token_acc": 0.47692307692307695, "train_speed(iter/s)": 1.192439 }, { "epoch": 0.4832076764907471, "grad_norm": 1.2951000928878784, "learning_rate": 9.771319677835325e-05, "loss": 2.5087558746337892, "memory(GiB)": 112.5, "step": 1410, "token_acc": 0.47265625, "train_speed(iter/s)": 1.192679 }, { "epoch": 0.4849211788896505, "grad_norm": 1.1424113512039185, "learning_rate": 9.769707544015796e-05, "loss": 2.6043310165405273, "memory(GiB)": 112.5, "step": 1415, "token_acc": 0.4679722562219502, "train_speed(iter/s)": 1.19271 }, { "epoch": 0.4866346812885538, "grad_norm": 1.0965598821640015, "learning_rate": 9.76808988152358e-05, "loss": 2.598417282104492, "memory(GiB)": 112.5, "step": 1420, "token_acc": 0.47036082474226804, "train_speed(iter/s)": 1.192774 }, { "epoch": 0.48834818368745714, "grad_norm": 1.1011883020401, "learning_rate": 9.766466692233742e-05, "loss": 2.6262588500976562, "memory(GiB)": 112.5, "step": 1425, "token_acc": 0.45875251509054327, "train_speed(iter/s)": 1.193048 }, { "epoch": 0.4900616860863605, "grad_norm": 1.0113840103149414, "learning_rate": 9.76483797802776e-05, "loss": 2.602730369567871, "memory(GiB)": 112.5, "step": 1430, "token_acc": 0.46450809464508097, "train_speed(iter/s)": 1.192263 }, { "epoch": 0.49177518848526386, "grad_norm": 1.044803261756897, "learning_rate": 9.763203740793514e-05, "loss": 2.4511566162109375, "memory(GiB)": 112.5, "step": 1435, "token_acc": 0.48209255533199197, "train_speed(iter/s)": 1.192385 }, { "epoch": 0.49348869088416725, "grad_norm": 1.139608383178711, "learning_rate": 9.761563982425279e-05, "loss": 2.476406478881836, "memory(GiB)": 112.5, "step": 1440, "token_acc": 0.49051724137931035, "train_speed(iter/s)": 1.192621 }, { "epoch": 0.4952021932830706, "grad_norm": 1.3843530416488647, "learning_rate": 9.759918704823737e-05, "loss": 2.4901700973510743, "memory(GiB)": 112.5, "step": 1445, "token_acc": 0.4853775643823658, "train_speed(iter/s)": 1.192969 }, { "epoch": 0.49691569568197397, "grad_norm": 1.112481713294983, "learning_rate": 9.758267909895966e-05, "loss": 2.41970272064209, "memory(GiB)": 112.5, "step": 1450, "token_acc": 0.48079413034095814, "train_speed(iter/s)": 1.192947 }, { "epoch": 0.4986291980808773, "grad_norm": 1.0654983520507812, "learning_rate": 9.75661159955544e-05, "loss": 2.525897979736328, "memory(GiB)": 112.5, "step": 1455, "token_acc": 0.46306231943871234, "train_speed(iter/s)": 1.192902 }, { "epoch": 0.5003427004797807, "grad_norm": 1.0768482685089111, "learning_rate": 9.754949775722022e-05, "loss": 2.4530139923095704, "memory(GiB)": 112.5, "step": 1460, "token_acc": 0.4843330349149508, "train_speed(iter/s)": 1.193402 }, { "epoch": 0.502056202878684, "grad_norm": 1.0348197221755981, "learning_rate": 9.75328244032197e-05, "loss": 2.5606609344482423, "memory(GiB)": 112.5, "step": 1465, "token_acc": 0.45098039215686275, "train_speed(iter/s)": 1.193655 }, { "epoch": 0.5037697052775874, "grad_norm": 1.0435640811920166, "learning_rate": 9.751609595287927e-05, "loss": 2.498982810974121, "memory(GiB)": 112.5, "step": 1470, "token_acc": 0.48860759493670886, "train_speed(iter/s)": 1.19356 }, { "epoch": 0.5054832076764908, "grad_norm": 1.0384700298309326, "learning_rate": 9.749931242558927e-05, "loss": 2.5362998962402346, "memory(GiB)": 112.5, "step": 1475, "token_acc": 0.49495798319327733, "train_speed(iter/s)": 1.193881 }, { "epoch": 0.5071967100753941, "grad_norm": 1.208359718322754, "learning_rate": 9.748247384080385e-05, "loss": 2.504754638671875, "memory(GiB)": 112.5, "step": 1480, "token_acc": 0.4913907284768212, "train_speed(iter/s)": 1.194033 }, { "epoch": 0.5089102124742975, "grad_norm": 1.0172767639160156, "learning_rate": 9.746558021804096e-05, "loss": 2.516647720336914, "memory(GiB)": 112.5, "step": 1485, "token_acc": 0.48442622950819675, "train_speed(iter/s)": 1.194372 }, { "epoch": 0.5106237148732008, "grad_norm": 1.2060613632202148, "learning_rate": 9.744863157688239e-05, "loss": 2.4624229431152345, "memory(GiB)": 112.5, "step": 1490, "token_acc": 0.48752151462994836, "train_speed(iter/s)": 1.19421 }, { "epoch": 0.5123372172721041, "grad_norm": 1.1291205883026123, "learning_rate": 9.74316279369737e-05, "loss": 2.4899768829345703, "memory(GiB)": 112.5, "step": 1495, "token_acc": 0.47924528301886793, "train_speed(iter/s)": 1.193695 }, { "epoch": 0.5140507196710076, "grad_norm": 1.0703821182250977, "learning_rate": 9.741456931802415e-05, "loss": 2.492710304260254, "memory(GiB)": 112.5, "step": 1500, "token_acc": 0.4759806081974438, "train_speed(iter/s)": 1.193856 }, { "epoch": 0.5140507196710076, "eval_loss": 2.1491615772247314, "eval_runtime": 3.6925, "eval_samples_per_second": 27.082, "eval_steps_per_second": 27.082, "eval_token_acc": 0.4755661501787843, "step": 1500 }, { "epoch": 0.5157642220699109, "grad_norm": 1.3122109174728394, "learning_rate": 9.739745573980676e-05, "loss": 2.4482900619506838, "memory(GiB)": 112.5, "step": 1505, "token_acc": 0.4952681388012618, "train_speed(iter/s)": 1.189058 }, { "epoch": 0.5174777244688142, "grad_norm": 1.1234946250915527, "learning_rate": 9.738028722215827e-05, "loss": 2.47161979675293, "memory(GiB)": 112.5, "step": 1510, "token_acc": 0.4810177705977383, "train_speed(iter/s)": 1.189125 }, { "epoch": 0.5191912268677176, "grad_norm": 1.051485300064087, "learning_rate": 9.736306378497908e-05, "loss": 2.4923768997192384, "memory(GiB)": 112.5, "step": 1515, "token_acc": 0.48423153692614773, "train_speed(iter/s)": 1.189226 }, { "epoch": 0.520904729266621, "grad_norm": 1.1596146821975708, "learning_rate": 9.734578544823325e-05, "loss": 2.6125261306762697, "memory(GiB)": 112.5, "step": 1520, "token_acc": 0.463256191636216, "train_speed(iter/s)": 1.189358 }, { "epoch": 0.5226182316655243, "grad_norm": 1.0802867412567139, "learning_rate": 9.732845223194849e-05, "loss": 2.481121635437012, "memory(GiB)": 112.5, "step": 1525, "token_acc": 0.4741666666666667, "train_speed(iter/s)": 1.189575 }, { "epoch": 0.5243317340644277, "grad_norm": 1.0049901008605957, "learning_rate": 9.73110641562161e-05, "loss": 2.4285234451293944, "memory(GiB)": 112.5, "step": 1530, "token_acc": 0.4869179600886918, "train_speed(iter/s)": 1.189819 }, { "epoch": 0.526045236463331, "grad_norm": 1.0273510217666626, "learning_rate": 9.729362124119098e-05, "loss": 2.5080245971679687, "memory(GiB)": 112.5, "step": 1535, "token_acc": 0.4806560134566863, "train_speed(iter/s)": 1.189706 }, { "epoch": 0.5277587388622345, "grad_norm": 1.1244173049926758, "learning_rate": 9.72761235070916e-05, "loss": 2.3940916061401367, "memory(GiB)": 112.5, "step": 1540, "token_acc": 0.49006050129645634, "train_speed(iter/s)": 1.189552 }, { "epoch": 0.5294722412611378, "grad_norm": 1.198725938796997, "learning_rate": 9.725857097419998e-05, "loss": 2.436057472229004, "memory(GiB)": 112.5, "step": 1545, "token_acc": 0.4823943661971831, "train_speed(iter/s)": 1.189814 }, { "epoch": 0.5311857436600411, "grad_norm": 1.141127347946167, "learning_rate": 9.724096366286162e-05, "loss": 2.479618453979492, "memory(GiB)": 112.5, "step": 1550, "token_acc": 0.47868712702472294, "train_speed(iter/s)": 1.189897 }, { "epoch": 0.5328992460589445, "grad_norm": 1.0794557332992554, "learning_rate": 9.722330159348556e-05, "loss": 2.602509689331055, "memory(GiB)": 112.5, "step": 1555, "token_acc": 0.475118996105582, "train_speed(iter/s)": 1.190126 }, { "epoch": 0.5346127484578478, "grad_norm": 1.1655759811401367, "learning_rate": 9.720558478654429e-05, "loss": 2.4426624298095705, "memory(GiB)": 112.5, "step": 1560, "token_acc": 0.4915555555555556, "train_speed(iter/s)": 1.190475 }, { "epoch": 0.5363262508567512, "grad_norm": 1.0848139524459839, "learning_rate": 9.718781326257374e-05, "loss": 2.462363052368164, "memory(GiB)": 112.5, "step": 1565, "token_acc": 0.47865072587532026, "train_speed(iter/s)": 1.190581 }, { "epoch": 0.5380397532556546, "grad_norm": 1.4192438125610352, "learning_rate": 9.71699870421733e-05, "loss": 2.4655384063720702, "memory(GiB)": 112.5, "step": 1570, "token_acc": 0.4907207596029348, "train_speed(iter/s)": 1.19059 }, { "epoch": 0.5397532556545579, "grad_norm": 1.2127572298049927, "learning_rate": 9.715210614600569e-05, "loss": 2.4991710662841795, "memory(GiB)": 112.5, "step": 1575, "token_acc": 0.4879032258064516, "train_speed(iter/s)": 1.190657 }, { "epoch": 0.5414667580534612, "grad_norm": 1.4569380283355713, "learning_rate": 9.713417059479709e-05, "loss": 2.5154052734375, "memory(GiB)": 112.5, "step": 1580, "token_acc": 0.4881516587677725, "train_speed(iter/s)": 1.190935 }, { "epoch": 0.5431802604523647, "grad_norm": 1.0734378099441528, "learning_rate": 9.711618040933695e-05, "loss": 2.3676876068115233, "memory(GiB)": 112.5, "step": 1585, "token_acc": 0.5067303517151541, "train_speed(iter/s)": 1.191186 }, { "epoch": 0.544893762851268, "grad_norm": 1.1251024007797241, "learning_rate": 9.709813561047814e-05, "loss": 2.4399463653564455, "memory(GiB)": 112.5, "step": 1590, "token_acc": 0.4754168448054724, "train_speed(iter/s)": 1.191367 }, { "epoch": 0.5466072652501713, "grad_norm": 1.018345832824707, "learning_rate": 9.708003621913672e-05, "loss": 2.4586999893188475, "memory(GiB)": 112.5, "step": 1595, "token_acc": 0.4876668014557218, "train_speed(iter/s)": 1.19177 }, { "epoch": 0.5483207676490747, "grad_norm": 1.471872091293335, "learning_rate": 9.706188225629216e-05, "loss": 2.6027040481567383, "memory(GiB)": 112.5, "step": 1600, "token_acc": 0.47061258278145696, "train_speed(iter/s)": 1.191997 }, { "epoch": 0.5500342700479781, "grad_norm": 1.1526862382888794, "learning_rate": 9.704367374298707e-05, "loss": 2.521046829223633, "memory(GiB)": 112.5, "step": 1605, "token_acc": 0.4612822647793505, "train_speed(iter/s)": 1.192044 }, { "epoch": 0.5517477724468814, "grad_norm": 1.246643304824829, "learning_rate": 9.702541070032736e-05, "loss": 2.3680171966552734, "memory(GiB)": 112.5, "step": 1610, "token_acc": 0.49405267629566696, "train_speed(iter/s)": 1.191449 }, { "epoch": 0.5534612748457848, "grad_norm": 1.0267210006713867, "learning_rate": 9.70070931494821e-05, "loss": 2.515140151977539, "memory(GiB)": 112.5, "step": 1615, "token_acc": 0.4844066423653301, "train_speed(iter/s)": 1.191641 }, { "epoch": 0.5551747772446881, "grad_norm": 1.2657138109207153, "learning_rate": 9.698872111168358e-05, "loss": 2.566476058959961, "memory(GiB)": 112.5, "step": 1620, "token_acc": 0.474331777683496, "train_speed(iter/s)": 1.191962 }, { "epoch": 0.5568882796435916, "grad_norm": 1.2420560121536255, "learning_rate": 9.697029460822725e-05, "loss": 2.523607635498047, "memory(GiB)": 112.5, "step": 1625, "token_acc": 0.4769433465085639, "train_speed(iter/s)": 1.192308 }, { "epoch": 0.5586017820424949, "grad_norm": 1.1801214218139648, "learning_rate": 9.695181366047164e-05, "loss": 2.471462059020996, "memory(GiB)": 112.5, "step": 1630, "token_acc": 0.4944665781319168, "train_speed(iter/s)": 1.192309 }, { "epoch": 0.5603152844413982, "grad_norm": 1.134597897529602, "learning_rate": 9.693327828983843e-05, "loss": 2.519057273864746, "memory(GiB)": 112.5, "step": 1635, "token_acc": 0.48963182395260263, "train_speed(iter/s)": 1.192391 }, { "epoch": 0.5620287868403016, "grad_norm": 1.1885380744934082, "learning_rate": 9.691468851781239e-05, "loss": 2.4392068862915037, "memory(GiB)": 112.5, "step": 1640, "token_acc": 0.5019388194743645, "train_speed(iter/s)": 1.192684 }, { "epoch": 0.5637422892392049, "grad_norm": 1.1517177820205688, "learning_rate": 9.689604436594133e-05, "loss": 2.476618766784668, "memory(GiB)": 112.5, "step": 1645, "token_acc": 0.46613545816733065, "train_speed(iter/s)": 1.192939 }, { "epoch": 0.5654557916381083, "grad_norm": 1.057328224182129, "learning_rate": 9.687734585583609e-05, "loss": 2.4335960388183593, "memory(GiB)": 112.5, "step": 1650, "token_acc": 0.4808437365475678, "train_speed(iter/s)": 1.193183 }, { "epoch": 0.5671692940370117, "grad_norm": 1.1696370840072632, "learning_rate": 9.685859300917052e-05, "loss": 2.450973701477051, "memory(GiB)": 112.5, "step": 1655, "token_acc": 0.5051362215274676, "train_speed(iter/s)": 1.193124 }, { "epoch": 0.568882796435915, "grad_norm": 1.2326666116714478, "learning_rate": 9.683978584768147e-05, "loss": 2.513360786437988, "memory(GiB)": 112.5, "step": 1660, "token_acc": 0.48233809924306137, "train_speed(iter/s)": 1.193281 }, { "epoch": 0.5705962988348183, "grad_norm": 0.9975337386131287, "learning_rate": 9.682092439316873e-05, "loss": 2.5402853012084963, "memory(GiB)": 112.5, "step": 1665, "token_acc": 0.4777777777777778, "train_speed(iter/s)": 1.192431 }, { "epoch": 0.5723098012337218, "grad_norm": 0.9944484829902649, "learning_rate": 9.6802008667495e-05, "loss": 2.5221866607666015, "memory(GiB)": 112.5, "step": 1670, "token_acc": 0.4740527182866557, "train_speed(iter/s)": 1.192611 }, { "epoch": 0.5740233036326251, "grad_norm": 1.1460825204849243, "learning_rate": 9.678303869258593e-05, "loss": 2.433994102478027, "memory(GiB)": 112.5, "step": 1675, "token_acc": 0.4891398783666377, "train_speed(iter/s)": 1.192869 }, { "epoch": 0.5757368060315284, "grad_norm": 1.170066475868225, "learning_rate": 9.676401449043005e-05, "loss": 2.479436492919922, "memory(GiB)": 112.5, "step": 1680, "token_acc": 0.4889629321116202, "train_speed(iter/s)": 1.192873 }, { "epoch": 0.5774503084304318, "grad_norm": 1.06419837474823, "learning_rate": 9.674493608307873e-05, "loss": 2.5335357666015623, "memory(GiB)": 112.5, "step": 1685, "token_acc": 0.4806652806652807, "train_speed(iter/s)": 1.193052 }, { "epoch": 0.5791638108293352, "grad_norm": 1.1477761268615723, "learning_rate": 9.672580349264614e-05, "loss": 2.5206165313720703, "memory(GiB)": 112.5, "step": 1690, "token_acc": 0.47250423011844334, "train_speed(iter/s)": 1.193075 }, { "epoch": 0.5808773132282385, "grad_norm": 1.064406394958496, "learning_rate": 9.67066167413093e-05, "loss": 2.3915510177612305, "memory(GiB)": 112.5, "step": 1695, "token_acc": 0.49604221635883905, "train_speed(iter/s)": 1.193309 }, { "epoch": 0.5825908156271419, "grad_norm": 1.1307398080825806, "learning_rate": 9.668737585130801e-05, "loss": 2.3663217544555666, "memory(GiB)": 112.5, "step": 1700, "token_acc": 0.49793729372937295, "train_speed(iter/s)": 1.193567 }, { "epoch": 0.5843043180260452, "grad_norm": 1.0778461694717407, "learning_rate": 9.666808084494478e-05, "loss": 2.4351247787475585, "memory(GiB)": 112.5, "step": 1705, "token_acc": 0.49627174813587405, "train_speed(iter/s)": 1.193139 }, { "epoch": 0.5860178204249485, "grad_norm": 1.1922672986984253, "learning_rate": 9.66487317445849e-05, "loss": 2.466790199279785, "memory(GiB)": 112.5, "step": 1710, "token_acc": 0.4754168448054724, "train_speed(iter/s)": 1.193331 }, { "epoch": 0.587731322823852, "grad_norm": 1.1069507598876953, "learning_rate": 9.662932857265635e-05, "loss": 2.561270523071289, "memory(GiB)": 112.5, "step": 1715, "token_acc": 0.4755423224159932, "train_speed(iter/s)": 1.192846 }, { "epoch": 0.5894448252227553, "grad_norm": 1.2803523540496826, "learning_rate": 9.660987135164972e-05, "loss": 2.448762130737305, "memory(GiB)": 112.5, "step": 1720, "token_acc": 0.4812300319488818, "train_speed(iter/s)": 1.192831 }, { "epoch": 0.5911583276216587, "grad_norm": 1.108947992324829, "learning_rate": 9.659036010411835e-05, "loss": 2.427801513671875, "memory(GiB)": 112.5, "step": 1725, "token_acc": 0.48829141370338247, "train_speed(iter/s)": 1.192967 }, { "epoch": 0.592871830020562, "grad_norm": 1.1427477598190308, "learning_rate": 9.657079485267812e-05, "loss": 2.4689075469970705, "memory(GiB)": 112.5, "step": 1730, "token_acc": 0.4762107051826678, "train_speed(iter/s)": 1.192624 }, { "epoch": 0.5945853324194654, "grad_norm": 1.1456891298294067, "learning_rate": 9.655117562000758e-05, "loss": 2.4884700775146484, "memory(GiB)": 112.5, "step": 1735, "token_acc": 0.4790446146913024, "train_speed(iter/s)": 1.192766 }, { "epoch": 0.5962988348183688, "grad_norm": 1.3419657945632935, "learning_rate": 9.653150242884778e-05, "loss": 2.454232406616211, "memory(GiB)": 112.5, "step": 1740, "token_acc": 0.48212728857890147, "train_speed(iter/s)": 1.19305 }, { "epoch": 0.5980123372172721, "grad_norm": 1.003048062324524, "learning_rate": 9.651177530200237e-05, "loss": 2.476537322998047, "memory(GiB)": 112.5, "step": 1745, "token_acc": 0.48370927318295737, "train_speed(iter/s)": 1.193122 }, { "epoch": 0.5997258396161754, "grad_norm": 1.4113993644714355, "learning_rate": 9.649199426233748e-05, "loss": 2.567608642578125, "memory(GiB)": 112.5, "step": 1750, "token_acc": 0.4664082687338501, "train_speed(iter/s)": 1.193375 }, { "epoch": 0.6014393420150789, "grad_norm": 1.297283411026001, "learning_rate": 9.647215933278176e-05, "loss": 2.4818794250488283, "memory(GiB)": 112.5, "step": 1755, "token_acc": 0.47710241465445463, "train_speed(iter/s)": 1.193536 }, { "epoch": 0.6031528444139822, "grad_norm": 1.112648844718933, "learning_rate": 9.64522705363263e-05, "loss": 2.4245418548583983, "memory(GiB)": 112.5, "step": 1760, "token_acc": 0.486652977412731, "train_speed(iter/s)": 1.193667 }, { "epoch": 0.6048663468128855, "grad_norm": 1.0891033411026, "learning_rate": 9.643232789602464e-05, "loss": 2.4204099655151365, "memory(GiB)": 112.5, "step": 1765, "token_acc": 0.4963314630988347, "train_speed(iter/s)": 1.193998 }, { "epoch": 0.6065798492117889, "grad_norm": 1.056281566619873, "learning_rate": 9.641233143499274e-05, "loss": 2.4291101455688477, "memory(GiB)": 112.5, "step": 1770, "token_acc": 0.485972850678733, "train_speed(iter/s)": 1.193729 }, { "epoch": 0.6082933516106922, "grad_norm": 1.0502032041549683, "learning_rate": 9.639228117640893e-05, "loss": 2.4426504135131837, "memory(GiB)": 112.5, "step": 1775, "token_acc": 0.4841168996188056, "train_speed(iter/s)": 1.194067 }, { "epoch": 0.6100068540095956, "grad_norm": 1.1227986812591553, "learning_rate": 9.637217714351392e-05, "loss": 2.4836374282836915, "memory(GiB)": 112.5, "step": 1780, "token_acc": 0.4825371336812525, "train_speed(iter/s)": 1.194194 }, { "epoch": 0.611720356408499, "grad_norm": 1.1237542629241943, "learning_rate": 9.635201935961073e-05, "loss": 2.5867425918579103, "memory(GiB)": 112.5, "step": 1785, "token_acc": 0.46860514117151286, "train_speed(iter/s)": 1.194248 }, { "epoch": 0.6134338588074023, "grad_norm": 1.3703871965408325, "learning_rate": 9.633180784806468e-05, "loss": 2.380452537536621, "memory(GiB)": 112.5, "step": 1790, "token_acc": 0.50199203187251, "train_speed(iter/s)": 1.194413 }, { "epoch": 0.6151473612063056, "grad_norm": 1.1152923107147217, "learning_rate": 9.631154263230337e-05, "loss": 2.5561544418334963, "memory(GiB)": 112.5, "step": 1795, "token_acc": 0.47151767151767154, "train_speed(iter/s)": 1.194379 }, { "epoch": 0.6168608636052091, "grad_norm": 1.1641733646392822, "learning_rate": 9.629122373581669e-05, "loss": 2.4307966232299805, "memory(GiB)": 112.5, "step": 1800, "token_acc": 0.4965217391304348, "train_speed(iter/s)": 1.194698 }, { "epoch": 0.6185743660041124, "grad_norm": 1.144924283027649, "learning_rate": 9.627085118215667e-05, "loss": 2.4129718780517577, "memory(GiB)": 112.5, "step": 1805, "token_acc": 0.4943868739205527, "train_speed(iter/s)": 1.194728 }, { "epoch": 0.6202878684030158, "grad_norm": 1.0424916744232178, "learning_rate": 9.625042499493762e-05, "loss": 2.465420150756836, "memory(GiB)": 112.5, "step": 1810, "token_acc": 0.48172757475083056, "train_speed(iter/s)": 1.194405 }, { "epoch": 0.6220013708019191, "grad_norm": 1.109460711479187, "learning_rate": 9.622994519783599e-05, "loss": 2.522054100036621, "memory(GiB)": 112.5, "step": 1815, "token_acc": 0.4723700368399509, "train_speed(iter/s)": 1.194443 }, { "epoch": 0.6237148732008225, "grad_norm": 1.1251331567764282, "learning_rate": 9.620941181459034e-05, "loss": 2.5717758178710937, "memory(GiB)": 112.5, "step": 1820, "token_acc": 0.46846489488298293, "train_speed(iter/s)": 1.194744 }, { "epoch": 0.6254283755997259, "grad_norm": 1.0535207986831665, "learning_rate": 9.618882486900135e-05, "loss": 2.4533926010131837, "memory(GiB)": 112.5, "step": 1825, "token_acc": 0.486088379705401, "train_speed(iter/s)": 1.194824 }, { "epoch": 0.6271418779986292, "grad_norm": 1.255971074104309, "learning_rate": 9.616818438493182e-05, "loss": 2.459266471862793, "memory(GiB)": 112.5, "step": 1830, "token_acc": 0.48585111014366567, "train_speed(iter/s)": 1.19485 }, { "epoch": 0.6288553803975325, "grad_norm": 1.0951584577560425, "learning_rate": 9.614749038630661e-05, "loss": 2.385970687866211, "memory(GiB)": 112.5, "step": 1835, "token_acc": 0.49049429657794674, "train_speed(iter/s)": 1.194403 }, { "epoch": 0.630568882796436, "grad_norm": 1.096276044845581, "learning_rate": 9.612674289711255e-05, "loss": 2.5752532958984373, "memory(GiB)": 112.5, "step": 1840, "token_acc": 0.4616052060737527, "train_speed(iter/s)": 1.194677 }, { "epoch": 0.6322823851953393, "grad_norm": 1.113045334815979, "learning_rate": 9.610594194139854e-05, "loss": 2.4305858612060547, "memory(GiB)": 112.5, "step": 1845, "token_acc": 0.4957044673539519, "train_speed(iter/s)": 1.194736 }, { "epoch": 0.6339958875942426, "grad_norm": 1.106731653213501, "learning_rate": 9.60850875432754e-05, "loss": 2.5621519088745117, "memory(GiB)": 112.5, "step": 1850, "token_acc": 0.4736614853195164, "train_speed(iter/s)": 1.194931 }, { "epoch": 0.635709389993146, "grad_norm": 1.150020718574524, "learning_rate": 9.606417972691592e-05, "loss": 2.479771041870117, "memory(GiB)": 112.5, "step": 1855, "token_acc": 0.48077792853912255, "train_speed(iter/s)": 1.194901 }, { "epoch": 0.6374228923920493, "grad_norm": 1.212924599647522, "learning_rate": 9.604321851655484e-05, "loss": 2.4242820739746094, "memory(GiB)": 112.5, "step": 1860, "token_acc": 0.49008373732921995, "train_speed(iter/s)": 1.195111 }, { "epoch": 0.6391363947909527, "grad_norm": 1.3640737533569336, "learning_rate": 9.602220393648875e-05, "loss": 2.5154193878173827, "memory(GiB)": 112.5, "step": 1865, "token_acc": 0.4731182795698925, "train_speed(iter/s)": 1.195237 }, { "epoch": 0.6408498971898561, "grad_norm": 1.0711596012115479, "learning_rate": 9.600113601107608e-05, "loss": 2.445130157470703, "memory(GiB)": 112.5, "step": 1870, "token_acc": 0.4949092518813634, "train_speed(iter/s)": 1.194958 }, { "epoch": 0.6425633995887594, "grad_norm": 1.0745368003845215, "learning_rate": 9.598001476473715e-05, "loss": 2.4586872100830077, "memory(GiB)": 112.5, "step": 1875, "token_acc": 0.48231079265562027, "train_speed(iter/s)": 1.195112 }, { "epoch": 0.6442769019876627, "grad_norm": 1.078946828842163, "learning_rate": 9.595884022195404e-05, "loss": 2.5108760833740233, "memory(GiB)": 112.5, "step": 1880, "token_acc": 0.4854030501089325, "train_speed(iter/s)": 1.195156 }, { "epoch": 0.6459904043865662, "grad_norm": 1.0040407180786133, "learning_rate": 9.593761240727066e-05, "loss": 2.4536890029907226, "memory(GiB)": 112.5, "step": 1885, "token_acc": 0.48284734133790735, "train_speed(iter/s)": 1.195307 }, { "epoch": 0.6477039067854695, "grad_norm": 1.1001267433166504, "learning_rate": 9.591633134529261e-05, "loss": 2.594244194030762, "memory(GiB)": 112.5, "step": 1890, "token_acc": 0.4601479046836483, "train_speed(iter/s)": 1.195367 }, { "epoch": 0.6494174091843729, "grad_norm": 1.1639182567596436, "learning_rate": 9.589499706068724e-05, "loss": 2.404505157470703, "memory(GiB)": 112.5, "step": 1895, "token_acc": 0.5002050020500205, "train_speed(iter/s)": 1.195697 }, { "epoch": 0.6511309115832762, "grad_norm": 1.1700315475463867, "learning_rate": 9.587360957818359e-05, "loss": 2.499865913391113, "memory(GiB)": 112.5, "step": 1900, "token_acc": 0.4846547314578005, "train_speed(iter/s)": 1.195422 }, { "epoch": 0.6528444139821796, "grad_norm": 1.1545811891555786, "learning_rate": 9.585216892257238e-05, "loss": 2.497202682495117, "memory(GiB)": 112.5, "step": 1905, "token_acc": 0.46524064171123, "train_speed(iter/s)": 1.195688 }, { "epoch": 0.654557916381083, "grad_norm": 1.0655843019485474, "learning_rate": 9.583067511870592e-05, "loss": 2.428143119812012, "memory(GiB)": 112.5, "step": 1910, "token_acc": 0.4875643224699828, "train_speed(iter/s)": 1.195968 }, { "epoch": 0.6562714187799863, "grad_norm": 1.1836930513381958, "learning_rate": 9.580912819149814e-05, "loss": 2.4873283386230467, "memory(GiB)": 112.5, "step": 1915, "token_acc": 0.4853700516351119, "train_speed(iter/s)": 1.196064 }, { "epoch": 0.6579849211788896, "grad_norm": 1.0977143049240112, "learning_rate": 9.578752816592458e-05, "loss": 2.4978458404541017, "memory(GiB)": 112.5, "step": 1920, "token_acc": 0.4857659286037054, "train_speed(iter/s)": 1.196342 }, { "epoch": 0.659698423577793, "grad_norm": 1.1002583503723145, "learning_rate": 9.57658750670223e-05, "loss": 2.6058937072753907, "memory(GiB)": 112.5, "step": 1925, "token_acc": 0.47264500205676674, "train_speed(iter/s)": 1.196367 }, { "epoch": 0.6614119259766964, "grad_norm": 1.1927456855773926, "learning_rate": 9.574416891988986e-05, "loss": 2.4976097106933595, "memory(GiB)": 112.5, "step": 1930, "token_acc": 0.4728979940247546, "train_speed(iter/s)": 1.19621 }, { "epoch": 0.6631254283755997, "grad_norm": 1.1601786613464355, "learning_rate": 9.572240974968738e-05, "loss": 2.4901100158691407, "memory(GiB)": 112.5, "step": 1935, "token_acc": 0.47277556440903057, "train_speed(iter/s)": 1.196196 }, { "epoch": 0.6648389307745031, "grad_norm": 1.0629874467849731, "learning_rate": 9.570059758163634e-05, "loss": 2.477762794494629, "memory(GiB)": 112.5, "step": 1940, "token_acc": 0.48184121621621623, "train_speed(iter/s)": 1.195779 }, { "epoch": 0.6665524331734064, "grad_norm": 1.106680989265442, "learning_rate": 9.567873244101973e-05, "loss": 2.351163864135742, "memory(GiB)": 112.5, "step": 1945, "token_acc": 0.5052219321148825, "train_speed(iter/s)": 1.196027 }, { "epoch": 0.6682659355723098, "grad_norm": 1.2521049976348877, "learning_rate": 9.56568143531819e-05, "loss": 2.5394241333007814, "memory(GiB)": 112.5, "step": 1950, "token_acc": 0.48088360237892946, "train_speed(iter/s)": 1.196005 }, { "epoch": 0.6699794379712132, "grad_norm": 1.0552915334701538, "learning_rate": 9.56348433435286e-05, "loss": 2.502278518676758, "memory(GiB)": 112.5, "step": 1955, "token_acc": 0.46485355648535565, "train_speed(iter/s)": 1.195945 }, { "epoch": 0.6716929403701165, "grad_norm": 1.1263126134872437, "learning_rate": 9.561281943752691e-05, "loss": 2.4716108322143553, "memory(GiB)": 112.5, "step": 1960, "token_acc": 0.4871347579590057, "train_speed(iter/s)": 1.195906 }, { "epoch": 0.6734064427690198, "grad_norm": 0.990192174911499, "learning_rate": 9.55907426607052e-05, "loss": 2.43975830078125, "memory(GiB)": 112.5, "step": 1965, "token_acc": 0.47412705090450147, "train_speed(iter/s)": 1.196354 }, { "epoch": 0.6751199451679233, "grad_norm": 1.099224328994751, "learning_rate": 9.556861303865317e-05, "loss": 2.4576766967773436, "memory(GiB)": 112.5, "step": 1970, "token_acc": 0.4893520616221115, "train_speed(iter/s)": 1.196443 }, { "epoch": 0.6768334475668266, "grad_norm": 1.1035367250442505, "learning_rate": 9.554643059702175e-05, "loss": 2.460598182678223, "memory(GiB)": 112.5, "step": 1975, "token_acc": 0.4780976220275344, "train_speed(iter/s)": 1.196325 }, { "epoch": 0.67854694996573, "grad_norm": 1.1674121618270874, "learning_rate": 9.552419536152309e-05, "loss": 2.443644714355469, "memory(GiB)": 112.5, "step": 1980, "token_acc": 0.48294930875576036, "train_speed(iter/s)": 1.196491 }, { "epoch": 0.6802604523646333, "grad_norm": 1.1150026321411133, "learning_rate": 9.550190735793054e-05, "loss": 2.421084976196289, "memory(GiB)": 112.5, "step": 1985, "token_acc": 0.4900414937759336, "train_speed(iter/s)": 1.19641 }, { "epoch": 0.6819739547635366, "grad_norm": 1.150238037109375, "learning_rate": 9.54795666120786e-05, "loss": 2.5589981079101562, "memory(GiB)": 112.5, "step": 1990, "token_acc": 0.47120843471208435, "train_speed(iter/s)": 1.196435 }, { "epoch": 0.6836874571624401, "grad_norm": 1.0342082977294922, "learning_rate": 9.545717314986293e-05, "loss": 2.4489181518554686, "memory(GiB)": 112.5, "step": 1995, "token_acc": 0.48219402374130166, "train_speed(iter/s)": 1.196674 }, { "epoch": 0.6854009595613434, "grad_norm": 1.0235646963119507, "learning_rate": 9.54347269972403e-05, "loss": 2.3635936737060548, "memory(GiB)": 112.5, "step": 2000, "token_acc": 0.5155574762316335, "train_speed(iter/s)": 1.196766 }, { "epoch": 0.6854009595613434, "eval_loss": 2.113537311553955, "eval_runtime": 3.7359, "eval_samples_per_second": 26.767, "eval_steps_per_second": 26.767, "eval_token_acc": 0.5108853410740203, "step": 2000 }, { "epoch": 0.6871144619602467, "grad_norm": 1.0279896259307861, "learning_rate": 9.541222818022851e-05, "loss": 2.506847381591797, "memory(GiB)": 112.5, "step": 2005, "token_acc": 0.48412438625204585, "train_speed(iter/s)": 1.193041 }, { "epoch": 0.6888279643591501, "grad_norm": 0.9627888202667236, "learning_rate": 9.538967672490645e-05, "loss": 2.4192226409912108, "memory(GiB)": 112.5, "step": 2010, "token_acc": 0.482195224130708, "train_speed(iter/s)": 1.193037 }, { "epoch": 0.6905414667580535, "grad_norm": 1.1154391765594482, "learning_rate": 9.5367072657414e-05, "loss": 2.4203912734985353, "memory(GiB)": 112.5, "step": 2015, "token_acc": 0.48031155344006926, "train_speed(iter/s)": 1.193329 }, { "epoch": 0.6922549691569568, "grad_norm": 1.1595253944396973, "learning_rate": 9.534441600395203e-05, "loss": 2.4176708221435548, "memory(GiB)": 112.5, "step": 2020, "token_acc": 0.49462827675118176, "train_speed(iter/s)": 1.193511 }, { "epoch": 0.6939684715558602, "grad_norm": 1.0932029485702515, "learning_rate": 9.532170679078239e-05, "loss": 2.3983619689941404, "memory(GiB)": 112.5, "step": 2025, "token_acc": 0.4865568083261058, "train_speed(iter/s)": 1.193402 }, { "epoch": 0.6956819739547635, "grad_norm": 1.0985567569732666, "learning_rate": 9.529894504422778e-05, "loss": 2.6209991455078123, "memory(GiB)": 112.5, "step": 2030, "token_acc": 0.4739833843463052, "train_speed(iter/s)": 1.193415 }, { "epoch": 0.6973954763536669, "grad_norm": 1.047693133354187, "learning_rate": 9.527613079067186e-05, "loss": 2.469858169555664, "memory(GiB)": 112.5, "step": 2035, "token_acc": 0.4813212740857255, "train_speed(iter/s)": 1.193492 }, { "epoch": 0.6991089787525703, "grad_norm": 1.0876046419143677, "learning_rate": 9.525326405655915e-05, "loss": 2.5222536087036134, "memory(GiB)": 112.5, "step": 2040, "token_acc": 0.47381756756756754, "train_speed(iter/s)": 1.193463 }, { "epoch": 0.7008224811514736, "grad_norm": 1.0597151517868042, "learning_rate": 9.523034486839494e-05, "loss": 2.4156185150146485, "memory(GiB)": 112.5, "step": 2045, "token_acc": 0.4855195911413969, "train_speed(iter/s)": 1.193509 }, { "epoch": 0.7025359835503769, "grad_norm": 1.164616346359253, "learning_rate": 9.520737325274544e-05, "loss": 2.4098243713378906, "memory(GiB)": 112.5, "step": 2050, "token_acc": 0.49663827879874495, "train_speed(iter/s)": 1.19335 }, { "epoch": 0.7042494859492803, "grad_norm": 0.9504627585411072, "learning_rate": 9.518434923623747e-05, "loss": 2.5200151443481444, "memory(GiB)": 112.5, "step": 2055, "token_acc": 0.4691211401425178, "train_speed(iter/s)": 1.193528 }, { "epoch": 0.7059629883481837, "grad_norm": 1.03997802734375, "learning_rate": 9.516127284555873e-05, "loss": 2.49122314453125, "memory(GiB)": 112.5, "step": 2060, "token_acc": 0.48753825972890247, "train_speed(iter/s)": 1.193811 }, { "epoch": 0.707676490747087, "grad_norm": 1.1306170225143433, "learning_rate": 9.513814410745754e-05, "loss": 2.4745656967163088, "memory(GiB)": 112.5, "step": 2065, "token_acc": 0.49207029575653666, "train_speed(iter/s)": 1.193407 }, { "epoch": 0.7093899931459904, "grad_norm": 1.1814864873886108, "learning_rate": 9.511496304874295e-05, "loss": 2.4358003616333006, "memory(GiB)": 112.5, "step": 2070, "token_acc": 0.49349013019739607, "train_speed(iter/s)": 1.193529 }, { "epoch": 0.7111034955448937, "grad_norm": 1.0155982971191406, "learning_rate": 9.509172969628461e-05, "loss": 2.433399772644043, "memory(GiB)": 112.5, "step": 2075, "token_acc": 0.4800488599348534, "train_speed(iter/s)": 1.193425 }, { "epoch": 0.7128169979437972, "grad_norm": 1.0593079328536987, "learning_rate": 9.506844407701282e-05, "loss": 2.619617462158203, "memory(GiB)": 112.5, "step": 2080, "token_acc": 0.4707390004271679, "train_speed(iter/s)": 1.193404 }, { "epoch": 0.7145305003427005, "grad_norm": 1.1212072372436523, "learning_rate": 9.504510621791846e-05, "loss": 2.334707832336426, "memory(GiB)": 112.5, "step": 2085, "token_acc": 0.5039665970772442, "train_speed(iter/s)": 1.193402 }, { "epoch": 0.7162440027416038, "grad_norm": 1.1150355339050293, "learning_rate": 9.502171614605295e-05, "loss": 2.50414924621582, "memory(GiB)": 112.5, "step": 2090, "token_acc": 0.4813417190775681, "train_speed(iter/s)": 1.19298 }, { "epoch": 0.7179575051405072, "grad_norm": 1.1114534139633179, "learning_rate": 9.499827388852825e-05, "loss": 2.5338878631591797, "memory(GiB)": 112.5, "step": 2095, "token_acc": 0.48901782014090345, "train_speed(iter/s)": 1.193311 }, { "epoch": 0.7196710075394106, "grad_norm": 1.2613255977630615, "learning_rate": 9.497477947251677e-05, "loss": 2.357622528076172, "memory(GiB)": 112.5, "step": 2100, "token_acc": 0.49403973509933774, "train_speed(iter/s)": 1.193538 }, { "epoch": 0.7213845099383139, "grad_norm": 1.06936514377594, "learning_rate": 9.495123292525139e-05, "loss": 2.4400045394897463, "memory(GiB)": 112.5, "step": 2105, "token_acc": 0.4813636363636364, "train_speed(iter/s)": 1.193649 }, { "epoch": 0.7230980123372173, "grad_norm": 1.2661341428756714, "learning_rate": 9.492763427402547e-05, "loss": 2.4582372665405274, "memory(GiB)": 112.5, "step": 2110, "token_acc": 0.4800175669740887, "train_speed(iter/s)": 1.19371 }, { "epoch": 0.7248115147361206, "grad_norm": 1.0023120641708374, "learning_rate": 9.49039835461927e-05, "loss": 2.434611701965332, "memory(GiB)": 112.5, "step": 2115, "token_acc": 0.4871578947368421, "train_speed(iter/s)": 1.193775 }, { "epoch": 0.726525017135024, "grad_norm": 1.1407883167266846, "learning_rate": 9.488028076916717e-05, "loss": 2.4555715560913085, "memory(GiB)": 112.5, "step": 2120, "token_acc": 0.49779151943462896, "train_speed(iter/s)": 1.193919 }, { "epoch": 0.7282385195339274, "grad_norm": 1.6360605955123901, "learning_rate": 9.485652597042328e-05, "loss": 2.5068126678466798, "memory(GiB)": 112.5, "step": 2125, "token_acc": 0.48670572329878325, "train_speed(iter/s)": 1.194192 }, { "epoch": 0.7299520219328307, "grad_norm": 1.2966521978378296, "learning_rate": 9.483271917749573e-05, "loss": 2.3955528259277346, "memory(GiB)": 112.5, "step": 2130, "token_acc": 0.49538203190596136, "train_speed(iter/s)": 1.194178 }, { "epoch": 0.731665524331734, "grad_norm": 1.424309492111206, "learning_rate": 9.480886041797952e-05, "loss": 2.4681446075439455, "memory(GiB)": 112.5, "step": 2135, "token_acc": 0.46994295743747255, "train_speed(iter/s)": 1.194259 }, { "epoch": 0.7333790267306374, "grad_norm": 1.1689362525939941, "learning_rate": 9.478494971952985e-05, "loss": 2.5358821868896486, "memory(GiB)": 112.5, "step": 2140, "token_acc": 0.45537657814540705, "train_speed(iter/s)": 1.194225 }, { "epoch": 0.7350925291295408, "grad_norm": 1.2838635444641113, "learning_rate": 9.476098710986212e-05, "loss": 2.546974945068359, "memory(GiB)": 112.5, "step": 2145, "token_acc": 0.46833829154271145, "train_speed(iter/s)": 1.194411 }, { "epoch": 0.7368060315284441, "grad_norm": 1.1939327716827393, "learning_rate": 9.473697261675196e-05, "loss": 2.4260997772216797, "memory(GiB)": 112.5, "step": 2150, "token_acc": 0.4876279863481229, "train_speed(iter/s)": 1.194505 }, { "epoch": 0.7385195339273475, "grad_norm": 1.2477980852127075, "learning_rate": 9.471290626803503e-05, "loss": 2.5270490646362305, "memory(GiB)": 112.5, "step": 2155, "token_acc": 0.47339983374896094, "train_speed(iter/s)": 1.194164 }, { "epoch": 0.7402330363262508, "grad_norm": 1.061293363571167, "learning_rate": 9.468878809160724e-05, "loss": 2.487618637084961, "memory(GiB)": 112.5, "step": 2160, "token_acc": 0.47162828947368424, "train_speed(iter/s)": 1.194356 }, { "epoch": 0.7419465387251543, "grad_norm": 1.1112269163131714, "learning_rate": 9.466461811542445e-05, "loss": 2.5234073638916015, "memory(GiB)": 112.5, "step": 2165, "token_acc": 0.4761519805982215, "train_speed(iter/s)": 1.193865 }, { "epoch": 0.7436600411240576, "grad_norm": 1.0237507820129395, "learning_rate": 9.46403963675026e-05, "loss": 2.5107305526733397, "memory(GiB)": 112.5, "step": 2170, "token_acc": 0.48461862621154655, "train_speed(iter/s)": 1.194136 }, { "epoch": 0.7453735435229609, "grad_norm": 1.1107593774795532, "learning_rate": 9.461612287591767e-05, "loss": 2.4590293884277346, "memory(GiB)": 112.5, "step": 2175, "token_acc": 0.4945994599459946, "train_speed(iter/s)": 1.194278 }, { "epoch": 0.7470870459218643, "grad_norm": 1.2323973178863525, "learning_rate": 9.459179766880559e-05, "loss": 2.5018760681152346, "memory(GiB)": 112.5, "step": 2180, "token_acc": 0.4834595456357114, "train_speed(iter/s)": 1.194224 }, { "epoch": 0.7488005483207677, "grad_norm": 1.0504121780395508, "learning_rate": 9.456742077436227e-05, "loss": 2.372061538696289, "memory(GiB)": 112.5, "step": 2185, "token_acc": 0.5010725010725011, "train_speed(iter/s)": 1.194407 }, { "epoch": 0.750514050719671, "grad_norm": 1.0788344144821167, "learning_rate": 9.454299222084347e-05, "loss": 2.3881084442138674, "memory(GiB)": 112.5, "step": 2190, "token_acc": 0.49391833188531714, "train_speed(iter/s)": 1.194627 }, { "epoch": 0.7522275531185744, "grad_norm": 1.0345773696899414, "learning_rate": 9.451851203656486e-05, "loss": 2.4998008728027346, "memory(GiB)": 112.5, "step": 2195, "token_acc": 0.48595744680851066, "train_speed(iter/s)": 1.194704 }, { "epoch": 0.7539410555174777, "grad_norm": 1.133090853691101, "learning_rate": 9.449398024990197e-05, "loss": 2.496778869628906, "memory(GiB)": 112.5, "step": 2200, "token_acc": 0.47520661157024796, "train_speed(iter/s)": 1.194797 }, { "epoch": 0.755654557916381, "grad_norm": 1.064302682876587, "learning_rate": 9.446939688929013e-05, "loss": 2.43002872467041, "memory(GiB)": 112.5, "step": 2205, "token_acc": 0.4852758191621734, "train_speed(iter/s)": 1.194205 }, { "epoch": 0.7573680603152845, "grad_norm": 1.1083664894104004, "learning_rate": 9.444476198322448e-05, "loss": 2.4128345489501952, "memory(GiB)": 112.5, "step": 2210, "token_acc": 0.49144764288694204, "train_speed(iter/s)": 1.194357 }, { "epoch": 0.7590815627141878, "grad_norm": 1.1567012071609497, "learning_rate": 9.442007556025983e-05, "loss": 2.561954879760742, "memory(GiB)": 112.5, "step": 2215, "token_acc": 0.47987884032886197, "train_speed(iter/s)": 1.194529 }, { "epoch": 0.7607950651130911, "grad_norm": 1.2598642110824585, "learning_rate": 9.439533764901078e-05, "loss": 2.4633323669433596, "memory(GiB)": 112.5, "step": 2220, "token_acc": 0.48020434227330777, "train_speed(iter/s)": 1.194626 }, { "epoch": 0.7625085675119945, "grad_norm": 0.946242094039917, "learning_rate": 9.437054827815159e-05, "loss": 2.4926570892333983, "memory(GiB)": 112.5, "step": 2225, "token_acc": 0.4805668016194332, "train_speed(iter/s)": 1.194654 }, { "epoch": 0.7642220699108979, "grad_norm": 1.0197423696517944, "learning_rate": 9.434570747641617e-05, "loss": 2.5405982971191405, "memory(GiB)": 112.5, "step": 2230, "token_acc": 0.48035117056856186, "train_speed(iter/s)": 1.194836 }, { "epoch": 0.7659355723098012, "grad_norm": 1.014859914779663, "learning_rate": 9.432081527259801e-05, "loss": 2.5398494720458986, "memory(GiB)": 112.5, "step": 2235, "token_acc": 0.4798951048951049, "train_speed(iter/s)": 1.194666 }, { "epoch": 0.7676490747087046, "grad_norm": 1.158779501914978, "learning_rate": 9.429587169555026e-05, "loss": 2.4079048156738283, "memory(GiB)": 112.5, "step": 2240, "token_acc": 0.49267643142476697, "train_speed(iter/s)": 1.194631 }, { "epoch": 0.7693625771076079, "grad_norm": 1.0444304943084717, "learning_rate": 9.427087677418551e-05, "loss": 2.520260238647461, "memory(GiB)": 112.5, "step": 2245, "token_acc": 0.4727112676056338, "train_speed(iter/s)": 1.19482 }, { "epoch": 0.7710760795065114, "grad_norm": 1.0765740871429443, "learning_rate": 9.424583053747595e-05, "loss": 2.517685127258301, "memory(GiB)": 112.5, "step": 2250, "token_acc": 0.48291666666666666, "train_speed(iter/s)": 1.195078 }, { "epoch": 0.7727895819054147, "grad_norm": 1.1033846139907837, "learning_rate": 9.422073301445322e-05, "loss": 2.5611595153808593, "memory(GiB)": 112.5, "step": 2255, "token_acc": 0.461441213653603, "train_speed(iter/s)": 1.19529 }, { "epoch": 0.774503084304318, "grad_norm": 1.096051812171936, "learning_rate": 9.419558423420842e-05, "loss": 2.3568328857421874, "memory(GiB)": 112.5, "step": 2260, "token_acc": 0.49302915082382764, "train_speed(iter/s)": 1.195417 }, { "epoch": 0.7762165867032214, "grad_norm": 1.272713541984558, "learning_rate": 9.417038422589203e-05, "loss": 2.491974449157715, "memory(GiB)": 112.5, "step": 2265, "token_acc": 0.4749034749034749, "train_speed(iter/s)": 1.195469 }, { "epoch": 0.7779300891021247, "grad_norm": 1.2392749786376953, "learning_rate": 9.414513301871395e-05, "loss": 2.4851306915283202, "memory(GiB)": 112.5, "step": 2270, "token_acc": 0.4889629321116202, "train_speed(iter/s)": 1.195461 }, { "epoch": 0.7796435915010281, "grad_norm": 1.0416293144226074, "learning_rate": 9.411983064194341e-05, "loss": 2.4389926910400392, "memory(GiB)": 112.5, "step": 2275, "token_acc": 0.4861288945795988, "train_speed(iter/s)": 1.195115 }, { "epoch": 0.7813570938999315, "grad_norm": 1.19965398311615, "learning_rate": 9.409447712490893e-05, "loss": 2.347130012512207, "memory(GiB)": 112.5, "step": 2280, "token_acc": 0.5101860053144376, "train_speed(iter/s)": 1.1952 }, { "epoch": 0.7830705962988348, "grad_norm": 1.0721126794815063, "learning_rate": 9.406907249699835e-05, "loss": 2.413678741455078, "memory(GiB)": 112.5, "step": 2285, "token_acc": 0.48388429752066114, "train_speed(iter/s)": 1.195311 }, { "epoch": 0.7847840986977381, "grad_norm": 1.008293867111206, "learning_rate": 9.404361678765871e-05, "loss": 2.42053165435791, "memory(GiB)": 112.5, "step": 2290, "token_acc": 0.48244147157190637, "train_speed(iter/s)": 1.195347 }, { "epoch": 0.7864976010966416, "grad_norm": 1.1763334274291992, "learning_rate": 9.401811002639631e-05, "loss": 2.494576072692871, "memory(GiB)": 112.5, "step": 2295, "token_acc": 0.4748839172646686, "train_speed(iter/s)": 1.19555 }, { "epoch": 0.7882111034955449, "grad_norm": 0.9985781908035278, "learning_rate": 9.399255224277659e-05, "loss": 2.4558155059814455, "memory(GiB)": 112.5, "step": 2300, "token_acc": 0.49254349627174815, "train_speed(iter/s)": 1.195144 }, { "epoch": 0.7899246058944482, "grad_norm": 1.2314133644104004, "learning_rate": 9.396694346642411e-05, "loss": 2.4168697357177735, "memory(GiB)": 112.5, "step": 2305, "token_acc": 0.4928971157985364, "train_speed(iter/s)": 1.19516 }, { "epoch": 0.7916381082933516, "grad_norm": 1.091518521308899, "learning_rate": 9.39412837270226e-05, "loss": 2.6016658782958983, "memory(GiB)": 112.94, "step": 2310, "token_acc": 0.463318948607297, "train_speed(iter/s)": 1.194898 }, { "epoch": 0.793351610692255, "grad_norm": 1.078382134437561, "learning_rate": 9.39155730543148e-05, "loss": 2.4975631713867186, "memory(GiB)": 112.94, "step": 2315, "token_acc": 0.4791318864774624, "train_speed(iter/s)": 1.194917 }, { "epoch": 0.7950651130911583, "grad_norm": 1.048224687576294, "learning_rate": 9.388981147810254e-05, "loss": 2.4485118865966795, "memory(GiB)": 112.94, "step": 2320, "token_acc": 0.4949664429530201, "train_speed(iter/s)": 1.195063 }, { "epoch": 0.7967786154900617, "grad_norm": 1.1993794441223145, "learning_rate": 9.386399902824661e-05, "loss": 2.4417020797729494, "memory(GiB)": 112.94, "step": 2325, "token_acc": 0.4892055267702936, "train_speed(iter/s)": 1.194928 }, { "epoch": 0.798492117888965, "grad_norm": 1.202965497970581, "learning_rate": 9.383813573466677e-05, "loss": 2.4955171585083007, "memory(GiB)": 112.94, "step": 2330, "token_acc": 0.4772535804549284, "train_speed(iter/s)": 1.195002 }, { "epoch": 0.8002056202878685, "grad_norm": 0.9930698871612549, "learning_rate": 9.381222162734178e-05, "loss": 2.4521297454833983, "memory(GiB)": 112.94, "step": 2335, "token_acc": 0.47929249352890424, "train_speed(iter/s)": 1.1953 }, { "epoch": 0.8019191226867718, "grad_norm": 1.2014509439468384, "learning_rate": 9.378625673630921e-05, "loss": 2.478650665283203, "memory(GiB)": 112.94, "step": 2340, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.19521 }, { "epoch": 0.8036326250856751, "grad_norm": 1.1250066757202148, "learning_rate": 9.376024109166555e-05, "loss": 2.4040248870849608, "memory(GiB)": 112.94, "step": 2345, "token_acc": 0.48973350808213195, "train_speed(iter/s)": 1.195357 }, { "epoch": 0.8053461274845785, "grad_norm": 1.219016194343567, "learning_rate": 9.373417472356612e-05, "loss": 2.510386848449707, "memory(GiB)": 112.94, "step": 2350, "token_acc": 0.48428835489833644, "train_speed(iter/s)": 1.195543 }, { "epoch": 0.8070596298834818, "grad_norm": 1.2147624492645264, "learning_rate": 9.370805766222498e-05, "loss": 2.3621189117431642, "memory(GiB)": 112.94, "step": 2355, "token_acc": 0.506695464362851, "train_speed(iter/s)": 1.19567 }, { "epoch": 0.8087731322823852, "grad_norm": 1.0085160732269287, "learning_rate": 9.368188993791501e-05, "loss": 2.4573307037353516, "memory(GiB)": 112.94, "step": 2360, "token_acc": 0.47296137339055794, "train_speed(iter/s)": 1.195751 }, { "epoch": 0.8104866346812886, "grad_norm": 1.1723625659942627, "learning_rate": 9.36556715809678e-05, "loss": 2.4406238555908204, "memory(GiB)": 112.94, "step": 2365, "token_acc": 0.4960134284515317, "train_speed(iter/s)": 1.195685 }, { "epoch": 0.8122001370801919, "grad_norm": 1.018062949180603, "learning_rate": 9.362940262177362e-05, "loss": 2.4054519653320314, "memory(GiB)": 112.94, "step": 2370, "token_acc": 0.4849608570251339, "train_speed(iter/s)": 1.195787 }, { "epoch": 0.8139136394790952, "grad_norm": 0.9889655113220215, "learning_rate": 9.360308309078141e-05, "loss": 2.508740997314453, "memory(GiB)": 112.94, "step": 2375, "token_acc": 0.4770604953308973, "train_speed(iter/s)": 1.195785 }, { "epoch": 0.8156271418779987, "grad_norm": 1.1460593938827515, "learning_rate": 9.357671301849867e-05, "loss": 2.4740955352783205, "memory(GiB)": 112.94, "step": 2380, "token_acc": 0.48316571928290336, "train_speed(iter/s)": 1.19602 }, { "epoch": 0.817340644276902, "grad_norm": 1.1547998189926147, "learning_rate": 9.355029243549158e-05, "loss": 2.4473844528198243, "memory(GiB)": 112.94, "step": 2385, "token_acc": 0.4839812046134131, "train_speed(iter/s)": 1.196135 }, { "epoch": 0.8190541466758053, "grad_norm": 1.1192626953125, "learning_rate": 9.35238213723848e-05, "loss": 2.408906936645508, "memory(GiB)": 112.94, "step": 2390, "token_acc": 0.4882139619220308, "train_speed(iter/s)": 1.19619 }, { "epoch": 0.8207676490747087, "grad_norm": 1.0839086771011353, "learning_rate": 9.349729985986151e-05, "loss": 2.5533851623535155, "memory(GiB)": 112.94, "step": 2395, "token_acc": 0.46147540983606555, "train_speed(iter/s)": 1.19617 }, { "epoch": 0.8224811514736121, "grad_norm": 1.040085792541504, "learning_rate": 9.347072792866338e-05, "loss": 2.445387268066406, "memory(GiB)": 112.94, "step": 2400, "token_acc": 0.49747687132043733, "train_speed(iter/s)": 1.196253 }, { "epoch": 0.8241946538725154, "grad_norm": 1.2076857089996338, "learning_rate": 9.344410560959054e-05, "loss": 2.5358224868774415, "memory(GiB)": 112.94, "step": 2405, "token_acc": 0.47491946617579384, "train_speed(iter/s)": 1.196384 }, { "epoch": 0.8259081562714188, "grad_norm": 1.1590632200241089, "learning_rate": 9.341743293350147e-05, "loss": 2.557430076599121, "memory(GiB)": 112.94, "step": 2410, "token_acc": 0.4886128364389234, "train_speed(iter/s)": 1.196637 }, { "epoch": 0.8276216586703221, "grad_norm": 1.0910804271697998, "learning_rate": 9.339070993131309e-05, "loss": 2.526027297973633, "memory(GiB)": 112.94, "step": 2415, "token_acc": 0.48315098468271334, "train_speed(iter/s)": 1.196865 }, { "epoch": 0.8293351610692254, "grad_norm": 1.175915002822876, "learning_rate": 9.336393663400058e-05, "loss": 2.472036933898926, "memory(GiB)": 112.94, "step": 2420, "token_acc": 0.49206349206349204, "train_speed(iter/s)": 1.197006 }, { "epoch": 0.8310486634681289, "grad_norm": 1.0283546447753906, "learning_rate": 9.333711307259749e-05, "loss": 2.489332580566406, "memory(GiB)": 112.94, "step": 2425, "token_acc": 0.47761824324324326, "train_speed(iter/s)": 1.197191 }, { "epoch": 0.8327621658670322, "grad_norm": 1.4150633811950684, "learning_rate": 9.331023927819559e-05, "loss": 2.561343765258789, "memory(GiB)": 112.94, "step": 2430, "token_acc": 0.46804979253112033, "train_speed(iter/s)": 1.197212 }, { "epoch": 0.8344756682659356, "grad_norm": 1.1499344110488892, "learning_rate": 9.328331528194487e-05, "loss": 2.446832275390625, "memory(GiB)": 112.94, "step": 2435, "token_acc": 0.48982300884955754, "train_speed(iter/s)": 1.197241 }, { "epoch": 0.8361891706648389, "grad_norm": 1.0168741941452026, "learning_rate": 9.325634111505355e-05, "loss": 2.4511817932128905, "memory(GiB)": 112.94, "step": 2440, "token_acc": 0.4802065404475043, "train_speed(iter/s)": 1.197325 }, { "epoch": 0.8379026730637423, "grad_norm": 1.172275185585022, "learning_rate": 9.322931680878794e-05, "loss": 2.5671987533569336, "memory(GiB)": 119.12, "step": 2445, "token_acc": 0.4578313253012048, "train_speed(iter/s)": 1.19676 }, { "epoch": 0.8396161754626457, "grad_norm": 1.0001485347747803, "learning_rate": 9.320224239447256e-05, "loss": 2.485711669921875, "memory(GiB)": 119.12, "step": 2450, "token_acc": 0.4904296044236495, "train_speed(iter/s)": 1.197039 }, { "epoch": 0.841329677861549, "grad_norm": 1.0410354137420654, "learning_rate": 9.317511790348991e-05, "loss": 2.531973457336426, "memory(GiB)": 119.12, "step": 2455, "token_acc": 0.47597158378604265, "train_speed(iter/s)": 1.19713 }, { "epoch": 0.8430431802604523, "grad_norm": 1.019578456878662, "learning_rate": 9.31479433672806e-05, "loss": 2.3927581787109373, "memory(GiB)": 119.12, "step": 2460, "token_acc": 0.49556025369978857, "train_speed(iter/s)": 1.197231 }, { "epoch": 0.8447566826593558, "grad_norm": 1.092120885848999, "learning_rate": 9.312071881734323e-05, "loss": 2.472391128540039, "memory(GiB)": 119.12, "step": 2465, "token_acc": 0.47249190938511326, "train_speed(iter/s)": 1.197401 }, { "epoch": 0.8464701850582591, "grad_norm": 1.0255887508392334, "learning_rate": 9.309344428523439e-05, "loss": 2.579201507568359, "memory(GiB)": 119.12, "step": 2470, "token_acc": 0.46078813889972686, "train_speed(iter/s)": 1.19724 }, { "epoch": 0.8481836874571624, "grad_norm": 1.226416826248169, "learning_rate": 9.306611980256856e-05, "loss": 2.4260723114013674, "memory(GiB)": 119.12, "step": 2475, "token_acc": 0.4882096069868996, "train_speed(iter/s)": 1.197311 }, { "epoch": 0.8498971898560658, "grad_norm": 0.9382858276367188, "learning_rate": 9.303874540101818e-05, "loss": 2.4013452529907227, "memory(GiB)": 119.12, "step": 2480, "token_acc": 0.484094616639478, "train_speed(iter/s)": 1.19726 }, { "epoch": 0.8516106922549691, "grad_norm": 0.985362708568573, "learning_rate": 9.301132111231349e-05, "loss": 2.4481563568115234, "memory(GiB)": 119.12, "step": 2485, "token_acc": 0.48186528497409326, "train_speed(iter/s)": 1.197404 }, { "epoch": 0.8533241946538725, "grad_norm": 1.0447067022323608, "learning_rate": 9.29838469682426e-05, "loss": 2.518269920349121, "memory(GiB)": 119.12, "step": 2490, "token_acc": 0.47019311502938704, "train_speed(iter/s)": 1.197491 }, { "epoch": 0.8550376970527759, "grad_norm": 1.2224953174591064, "learning_rate": 9.295632300065138e-05, "loss": 2.498805046081543, "memory(GiB)": 119.12, "step": 2495, "token_acc": 0.48051372896368466, "train_speed(iter/s)": 1.197484 }, { "epoch": 0.8567511994516792, "grad_norm": 1.1818112134933472, "learning_rate": 9.292874924144348e-05, "loss": 2.5021665573120115, "memory(GiB)": 119.12, "step": 2500, "token_acc": 0.4794229953330505, "train_speed(iter/s)": 1.197677 }, { "epoch": 0.8567511994516792, "eval_loss": 2.0169012546539307, "eval_runtime": 3.7081, "eval_samples_per_second": 26.968, "eval_steps_per_second": 26.968, "eval_token_acc": 0.5063291139240507, "step": 2500 }, { "epoch": 0.8584647018505825, "grad_norm": 0.9785412549972534, "learning_rate": 9.290112572258025e-05, "loss": 2.4811985015869142, "memory(GiB)": 119.12, "step": 2505, "token_acc": 0.4878205128205128, "train_speed(iter/s)": 1.194115 }, { "epoch": 0.860178204249486, "grad_norm": 0.9883909821510315, "learning_rate": 9.287345247608071e-05, "loss": 2.4674570083618166, "memory(GiB)": 119.12, "step": 2510, "token_acc": 0.4755389718076285, "train_speed(iter/s)": 1.194072 }, { "epoch": 0.8618917066483893, "grad_norm": 1.1025676727294922, "learning_rate": 9.284572953402151e-05, "loss": 2.470703887939453, "memory(GiB)": 119.12, "step": 2515, "token_acc": 0.48430873621713316, "train_speed(iter/s)": 1.193357 }, { "epoch": 0.8636052090472927, "grad_norm": 1.240106463432312, "learning_rate": 9.281795692853697e-05, "loss": 2.4937580108642576, "memory(GiB)": 119.12, "step": 2520, "token_acc": 0.48312686885946177, "train_speed(iter/s)": 1.193453 }, { "epoch": 0.865318711446196, "grad_norm": 1.1166032552719116, "learning_rate": 9.279013469181888e-05, "loss": 2.448097991943359, "memory(GiB)": 119.12, "step": 2525, "token_acc": 0.4833948339483395, "train_speed(iter/s)": 1.193467 }, { "epoch": 0.8670322138450994, "grad_norm": 1.0423080921173096, "learning_rate": 9.27622628561166e-05, "loss": 2.4847747802734377, "memory(GiB)": 119.12, "step": 2530, "token_acc": 0.5100925147182507, "train_speed(iter/s)": 1.193803 }, { "epoch": 0.8687457162440028, "grad_norm": 1.1249302625656128, "learning_rate": 9.273434145373703e-05, "loss": 2.466493606567383, "memory(GiB)": 119.12, "step": 2535, "token_acc": 0.48429319371727747, "train_speed(iter/s)": 1.193943 }, { "epoch": 0.8704592186429061, "grad_norm": 1.0647779703140259, "learning_rate": 9.270637051704444e-05, "loss": 2.447389030456543, "memory(GiB)": 119.12, "step": 2540, "token_acc": 0.5006587615283268, "train_speed(iter/s)": 1.194 }, { "epoch": 0.8721727210418094, "grad_norm": 1.0153447389602661, "learning_rate": 9.26783500784606e-05, "loss": 2.5020092010498045, "memory(GiB)": 119.12, "step": 2545, "token_acc": 0.48157349896480334, "train_speed(iter/s)": 1.194004 }, { "epoch": 0.8738862234407128, "grad_norm": 0.9783706665039062, "learning_rate": 9.265028017046458e-05, "loss": 2.441141128540039, "memory(GiB)": 119.12, "step": 2550, "token_acc": 0.4907369287772746, "train_speed(iter/s)": 1.194147 }, { "epoch": 0.8755997258396162, "grad_norm": 1.2087771892547607, "learning_rate": 9.262216082559283e-05, "loss": 2.4830291748046873, "memory(GiB)": 119.12, "step": 2555, "token_acc": 0.48031496062992124, "train_speed(iter/s)": 1.194213 }, { "epoch": 0.8773132282385195, "grad_norm": 1.016451120376587, "learning_rate": 9.25939920764391e-05, "loss": 2.438629722595215, "memory(GiB)": 119.12, "step": 2560, "token_acc": 0.48896434634974534, "train_speed(iter/s)": 1.194398 }, { "epoch": 0.8790267306374229, "grad_norm": 0.9134995937347412, "learning_rate": 9.256577395565443e-05, "loss": 2.4313236236572267, "memory(GiB)": 119.12, "step": 2565, "token_acc": 0.49416180150125105, "train_speed(iter/s)": 1.194542 }, { "epoch": 0.8807402330363262, "grad_norm": 1.2038213014602661, "learning_rate": 9.253750649594702e-05, "loss": 2.316659927368164, "memory(GiB)": 119.12, "step": 2570, "token_acc": 0.49834044570886676, "train_speed(iter/s)": 1.194255 }, { "epoch": 0.8824537354352296, "grad_norm": 1.1491506099700928, "learning_rate": 9.250918973008233e-05, "loss": 2.429360198974609, "memory(GiB)": 119.12, "step": 2575, "token_acc": 0.5022123893805309, "train_speed(iter/s)": 1.194453 }, { "epoch": 0.884167237834133, "grad_norm": 1.1289997100830078, "learning_rate": 9.248082369088295e-05, "loss": 2.4129505157470703, "memory(GiB)": 119.12, "step": 2580, "token_acc": 0.49039692701664533, "train_speed(iter/s)": 1.194663 }, { "epoch": 0.8858807402330363, "grad_norm": 1.0729957818984985, "learning_rate": 9.245240841122857e-05, "loss": 2.5376518249511717, "memory(GiB)": 119.12, "step": 2585, "token_acc": 0.46530789245446663, "train_speed(iter/s)": 1.194782 }, { "epoch": 0.8875942426319396, "grad_norm": 1.1321606636047363, "learning_rate": 9.242394392405595e-05, "loss": 2.4013927459716795, "memory(GiB)": 119.12, "step": 2590, "token_acc": 0.49316879682679593, "train_speed(iter/s)": 1.19496 }, { "epoch": 0.8893077450308431, "grad_norm": 1.1319377422332764, "learning_rate": 9.239543026235893e-05, "loss": 2.442028045654297, "memory(GiB)": 119.12, "step": 2595, "token_acc": 0.4735376044568245, "train_speed(iter/s)": 1.195114 }, { "epoch": 0.8910212474297464, "grad_norm": 1.304236650466919, "learning_rate": 9.23668674591883e-05, "loss": 2.4181793212890623, "memory(GiB)": 119.12, "step": 2600, "token_acc": 0.5083041958041958, "train_speed(iter/s)": 1.195165 }, { "epoch": 0.8927347498286498, "grad_norm": 1.2041863203048706, "learning_rate": 9.233825554765186e-05, "loss": 2.550347328186035, "memory(GiB)": 119.12, "step": 2605, "token_acc": 0.46987951807228917, "train_speed(iter/s)": 1.195393 }, { "epoch": 0.8944482522275531, "grad_norm": 1.171813726425171, "learning_rate": 9.230959456091426e-05, "loss": 2.5138071060180662, "memory(GiB)": 119.12, "step": 2610, "token_acc": 0.4995715509854327, "train_speed(iter/s)": 1.195152 }, { "epoch": 0.8961617546264565, "grad_norm": 1.2137060165405273, "learning_rate": 9.22808845321971e-05, "loss": 2.504001235961914, "memory(GiB)": 119.12, "step": 2615, "token_acc": 0.4756637168141593, "train_speed(iter/s)": 1.195297 }, { "epoch": 0.8978752570253599, "grad_norm": 1.0690782070159912, "learning_rate": 9.225212549477882e-05, "loss": 2.5784013748168944, "memory(GiB)": 119.12, "step": 2620, "token_acc": 0.46904469044690444, "train_speed(iter/s)": 1.195251 }, { "epoch": 0.8995887594242632, "grad_norm": 1.1792352199554443, "learning_rate": 9.222331748199462e-05, "loss": 2.4886926651000976, "memory(GiB)": 119.12, "step": 2625, "token_acc": 0.46785566155122355, "train_speed(iter/s)": 1.195373 }, { "epoch": 0.9013022618231665, "grad_norm": 0.9265967607498169, "learning_rate": 9.219446052723652e-05, "loss": 2.4455965042114256, "memory(GiB)": 119.12, "step": 2630, "token_acc": 0.4911452184179457, "train_speed(iter/s)": 1.195325 }, { "epoch": 0.9030157642220699, "grad_norm": 1.2720205783843994, "learning_rate": 9.216555466395327e-05, "loss": 2.512978744506836, "memory(GiB)": 119.12, "step": 2635, "token_acc": 0.4762516046213094, "train_speed(iter/s)": 1.195155 }, { "epoch": 0.9047292666209733, "grad_norm": 1.0480738878250122, "learning_rate": 9.213659992565024e-05, "loss": 2.4229045867919923, "memory(GiB)": 119.12, "step": 2640, "token_acc": 0.4899074852817494, "train_speed(iter/s)": 1.19519 }, { "epoch": 0.9064427690198766, "grad_norm": 1.171968936920166, "learning_rate": 9.210759634588954e-05, "loss": 2.417769432067871, "memory(GiB)": 119.12, "step": 2645, "token_acc": 0.48133848133848134, "train_speed(iter/s)": 1.195272 }, { "epoch": 0.90815627141878, "grad_norm": 1.0042797327041626, "learning_rate": 9.207854395828985e-05, "loss": 2.5299110412597656, "memory(GiB)": 119.12, "step": 2650, "token_acc": 0.4679213002566296, "train_speed(iter/s)": 1.195438 }, { "epoch": 0.9098697738176833, "grad_norm": 1.0661354064941406, "learning_rate": 9.204944279652643e-05, "loss": 2.396357536315918, "memory(GiB)": 119.12, "step": 2655, "token_acc": 0.5024498886414254, "train_speed(iter/s)": 1.19556 }, { "epoch": 0.9115832762165867, "grad_norm": 0.9386459589004517, "learning_rate": 9.202029289433108e-05, "loss": 2.486680030822754, "memory(GiB)": 119.12, "step": 2660, "token_acc": 0.46994106090373283, "train_speed(iter/s)": 1.195701 }, { "epoch": 0.9132967786154901, "grad_norm": 1.0919015407562256, "learning_rate": 9.199109428549205e-05, "loss": 2.563818359375, "memory(GiB)": 119.12, "step": 2665, "token_acc": 0.4629934210526316, "train_speed(iter/s)": 1.195846 }, { "epoch": 0.9150102810143934, "grad_norm": 1.1241503953933716, "learning_rate": 9.196184700385413e-05, "loss": 2.482889175415039, "memory(GiB)": 119.12, "step": 2670, "token_acc": 0.4922691182615963, "train_speed(iter/s)": 1.195909 }, { "epoch": 0.9167237834132967, "grad_norm": 1.0982451438903809, "learning_rate": 9.193255108331849e-05, "loss": 2.5045536041259764, "memory(GiB)": 119.12, "step": 2675, "token_acc": 0.472233481961897, "train_speed(iter/s)": 1.196052 }, { "epoch": 0.9184372858122002, "grad_norm": 1.0509921312332153, "learning_rate": 9.190320655784265e-05, "loss": 2.449513626098633, "memory(GiB)": 119.12, "step": 2680, "token_acc": 0.48760330578512395, "train_speed(iter/s)": 1.196114 }, { "epoch": 0.9201507882111035, "grad_norm": 1.0328330993652344, "learning_rate": 9.187381346144053e-05, "loss": 2.43176326751709, "memory(GiB)": 119.12, "step": 2685, "token_acc": 0.4916247906197655, "train_speed(iter/s)": 1.196134 }, { "epoch": 0.9218642906100069, "grad_norm": 1.2855898141860962, "learning_rate": 9.184437182818227e-05, "loss": 2.513711357116699, "memory(GiB)": 119.12, "step": 2690, "token_acc": 0.4799163179916318, "train_speed(iter/s)": 1.196301 }, { "epoch": 0.9235777930089102, "grad_norm": 1.0858205556869507, "learning_rate": 9.181488169219432e-05, "loss": 2.4517818450927735, "memory(GiB)": 119.12, "step": 2695, "token_acc": 0.4979937583593402, "train_speed(iter/s)": 1.19649 }, { "epoch": 0.9252912954078135, "grad_norm": 1.2214590311050415, "learning_rate": 9.178534308765936e-05, "loss": 2.4320600509643553, "memory(GiB)": 119.12, "step": 2700, "token_acc": 0.4895287958115183, "train_speed(iter/s)": 1.196542 }, { "epoch": 0.927004797806717, "grad_norm": 1.0480893850326538, "learning_rate": 9.175575604881624e-05, "loss": 2.4320999145507813, "memory(GiB)": 119.12, "step": 2705, "token_acc": 0.48376068376068376, "train_speed(iter/s)": 1.196559 }, { "epoch": 0.9287183002056203, "grad_norm": 1.1293511390686035, "learning_rate": 9.172612060995994e-05, "loss": 2.509407615661621, "memory(GiB)": 119.12, "step": 2710, "token_acc": 0.4815429282455413, "train_speed(iter/s)": 1.196726 }, { "epoch": 0.9304318026045236, "grad_norm": 1.0537109375, "learning_rate": 9.16964368054415e-05, "loss": 2.4082218170166017, "memory(GiB)": 119.12, "step": 2715, "token_acc": 0.49236141422959406, "train_speed(iter/s)": 1.196867 }, { "epoch": 0.932145305003427, "grad_norm": 0.9345617890357971, "learning_rate": 9.166670466966816e-05, "loss": 2.479477119445801, "memory(GiB)": 119.12, "step": 2720, "token_acc": 0.48992747784045126, "train_speed(iter/s)": 1.196965 }, { "epoch": 0.9338588074023304, "grad_norm": 1.31269371509552, "learning_rate": 9.163692423710303e-05, "loss": 2.472517395019531, "memory(GiB)": 119.12, "step": 2725, "token_acc": 0.496046608406159, "train_speed(iter/s)": 1.19716 }, { "epoch": 0.9355723098012337, "grad_norm": 1.3020341396331787, "learning_rate": 9.160709554226528e-05, "loss": 2.5515768051147463, "memory(GiB)": 119.12, "step": 2730, "token_acc": 0.47419928825622776, "train_speed(iter/s)": 1.197257 }, { "epoch": 0.9372858122001371, "grad_norm": 1.017764925956726, "learning_rate": 9.157721861972999e-05, "loss": 2.4863983154296876, "memory(GiB)": 119.12, "step": 2735, "token_acc": 0.4833261152013859, "train_speed(iter/s)": 1.197386 }, { "epoch": 0.9389993145990404, "grad_norm": 1.2967298030853271, "learning_rate": 9.154729350412816e-05, "loss": 2.4479953765869142, "memory(GiB)": 119.12, "step": 2740, "token_acc": 0.4916629112212708, "train_speed(iter/s)": 1.197582 }, { "epoch": 0.9407128169979438, "grad_norm": 0.975490927696228, "learning_rate": 9.151732023014668e-05, "loss": 2.495541000366211, "memory(GiB)": 119.12, "step": 2745, "token_acc": 0.47322175732217575, "train_speed(iter/s)": 1.197736 }, { "epoch": 0.9424263193968472, "grad_norm": 1.2684855461120605, "learning_rate": 9.148729883252818e-05, "loss": 2.4956525802612304, "memory(GiB)": 119.12, "step": 2750, "token_acc": 0.48, "train_speed(iter/s)": 1.197825 }, { "epoch": 0.9441398217957505, "grad_norm": 1.0655467510223389, "learning_rate": 9.145722934607118e-05, "loss": 2.386100959777832, "memory(GiB)": 119.12, "step": 2755, "token_acc": 0.4983682983682984, "train_speed(iter/s)": 1.197788 }, { "epoch": 0.9458533241946538, "grad_norm": 1.0164494514465332, "learning_rate": 9.142711180562982e-05, "loss": 2.4896171569824217, "memory(GiB)": 119.12, "step": 2760, "token_acc": 0.488120050020842, "train_speed(iter/s)": 1.197838 }, { "epoch": 0.9475668265935572, "grad_norm": 1.0745728015899658, "learning_rate": 9.139694624611403e-05, "loss": 2.4319271087646483, "memory(GiB)": 119.12, "step": 2765, "token_acc": 0.5002161694768699, "train_speed(iter/s)": 1.197946 }, { "epoch": 0.9492803289924606, "grad_norm": 1.1509394645690918, "learning_rate": 9.136673270248937e-05, "loss": 2.3495336532592774, "memory(GiB)": 119.12, "step": 2770, "token_acc": 0.5081615120274914, "train_speed(iter/s)": 1.198041 }, { "epoch": 0.950993831391364, "grad_norm": 1.0483930110931396, "learning_rate": 9.133647120977704e-05, "loss": 2.4857181549072265, "memory(GiB)": 119.12, "step": 2775, "token_acc": 0.48365180467091295, "train_speed(iter/s)": 1.198125 }, { "epoch": 0.9527073337902673, "grad_norm": 1.1095170974731445, "learning_rate": 9.130616180305378e-05, "loss": 2.4442560195922853, "memory(GiB)": 119.12, "step": 2780, "token_acc": 0.4842292314526877, "train_speed(iter/s)": 1.198435 }, { "epoch": 0.9544208361891706, "grad_norm": 1.008376121520996, "learning_rate": 9.127580451745188e-05, "loss": 2.5422365188598635, "memory(GiB)": 119.12, "step": 2785, "token_acc": 0.4819115676641358, "train_speed(iter/s)": 1.198576 }, { "epoch": 0.9561343385880741, "grad_norm": 1.1318079233169556, "learning_rate": 9.124539938815917e-05, "loss": 2.455126190185547, "memory(GiB)": 119.12, "step": 2790, "token_acc": 0.47752073330423395, "train_speed(iter/s)": 1.198412 }, { "epoch": 0.9578478409869774, "grad_norm": 1.2430880069732666, "learning_rate": 9.121494645041886e-05, "loss": 2.4126663208007812, "memory(GiB)": 119.12, "step": 2795, "token_acc": 0.4941790445604175, "train_speed(iter/s)": 1.198341 }, { "epoch": 0.9595613433858807, "grad_norm": 1.6070297956466675, "learning_rate": 9.118444573952965e-05, "loss": 2.5326313018798827, "memory(GiB)": 119.12, "step": 2800, "token_acc": 0.4761133603238866, "train_speed(iter/s)": 1.198368 }, { "epoch": 0.9612748457847841, "grad_norm": 1.0152499675750732, "learning_rate": 9.115389729084557e-05, "loss": 2.5299152374267577, "memory(GiB)": 119.12, "step": 2805, "token_acc": 0.4855224506924045, "train_speed(iter/s)": 1.198064 }, { "epoch": 0.9629883481836875, "grad_norm": 1.063612937927246, "learning_rate": 9.1123301139776e-05, "loss": 2.441819763183594, "memory(GiB)": 119.12, "step": 2810, "token_acc": 0.497489539748954, "train_speed(iter/s)": 1.198117 }, { "epoch": 0.9647018505825908, "grad_norm": 1.0393316745758057, "learning_rate": 9.10926573217856e-05, "loss": 2.5461408615112306, "memory(GiB)": 119.12, "step": 2815, "token_acc": 0.48083475298126066, "train_speed(iter/s)": 1.197926 }, { "epoch": 0.9664153529814942, "grad_norm": 1.369353175163269, "learning_rate": 9.106196587239431e-05, "loss": 2.4450273513793945, "memory(GiB)": 119.12, "step": 2820, "token_acc": 0.4855679702048417, "train_speed(iter/s)": 1.198157 }, { "epoch": 0.9681288553803975, "grad_norm": 1.1728565692901611, "learning_rate": 9.103122682717726e-05, "loss": 2.4422222137451173, "memory(GiB)": 119.12, "step": 2825, "token_acc": 0.4822101297614064, "train_speed(iter/s)": 1.198168 }, { "epoch": 0.969842357779301, "grad_norm": 1.0293534994125366, "learning_rate": 9.100044022176475e-05, "loss": 2.48060302734375, "memory(GiB)": 119.12, "step": 2830, "token_acc": 0.47540983606557374, "train_speed(iter/s)": 1.198296 }, { "epoch": 0.9715558601782043, "grad_norm": 1.1197803020477295, "learning_rate": 9.096960609184219e-05, "loss": 2.4800878524780274, "memory(GiB)": 119.12, "step": 2835, "token_acc": 0.4856131031429836, "train_speed(iter/s)": 1.198375 }, { "epoch": 0.9732693625771076, "grad_norm": 1.0781974792480469, "learning_rate": 9.093872447315013e-05, "loss": 2.413246917724609, "memory(GiB)": 119.12, "step": 2840, "token_acc": 0.49054325955734407, "train_speed(iter/s)": 1.198508 }, { "epoch": 0.9749828649760109, "grad_norm": 0.9640703201293945, "learning_rate": 9.090779540148411e-05, "loss": 2.45723819732666, "memory(GiB)": 119.12, "step": 2845, "token_acc": 0.4837228714524207, "train_speed(iter/s)": 1.198642 }, { "epoch": 0.9766963673749143, "grad_norm": 1.1468006372451782, "learning_rate": 9.087681891269468e-05, "loss": 2.5888288497924803, "memory(GiB)": 119.12, "step": 2850, "token_acc": 0.46229228802726885, "train_speed(iter/s)": 1.198768 }, { "epoch": 0.9784098697738177, "grad_norm": 1.0947856903076172, "learning_rate": 9.084579504268742e-05, "loss": 2.5616954803466796, "memory(GiB)": 119.12, "step": 2855, "token_acc": 0.47650709219858156, "train_speed(iter/s)": 1.198839 }, { "epoch": 0.980123372172721, "grad_norm": 1.052904486656189, "learning_rate": 9.081472382742274e-05, "loss": 2.5207950592041017, "memory(GiB)": 119.12, "step": 2860, "token_acc": 0.48176748176748174, "train_speed(iter/s)": 1.198961 }, { "epoch": 0.9818368745716244, "grad_norm": 1.0575549602508545, "learning_rate": 9.078360530291596e-05, "loss": 2.4126367568969727, "memory(GiB)": 119.12, "step": 2865, "token_acc": 0.48980496453900707, "train_speed(iter/s)": 1.199226 }, { "epoch": 0.9835503769705277, "grad_norm": 1.1024768352508545, "learning_rate": 9.075243950523726e-05, "loss": 2.4973588943481446, "memory(GiB)": 119.12, "step": 2870, "token_acc": 0.46758817921830315, "train_speed(iter/s)": 1.199441 }, { "epoch": 0.9852638793694312, "grad_norm": 1.194924235343933, "learning_rate": 9.07212264705116e-05, "loss": 2.4870426177978517, "memory(GiB)": 119.12, "step": 2875, "token_acc": 0.4730514096185738, "train_speed(iter/s)": 1.199583 }, { "epoch": 0.9869773817683345, "grad_norm": 1.041340947151184, "learning_rate": 9.06899662349187e-05, "loss": 2.503125, "memory(GiB)": 119.12, "step": 2880, "token_acc": 0.47865353037766833, "train_speed(iter/s)": 1.199454 }, { "epoch": 0.9886908841672378, "grad_norm": 1.3264368772506714, "learning_rate": 9.065865883469298e-05, "loss": 2.4393341064453127, "memory(GiB)": 119.12, "step": 2885, "token_acc": 0.5019124521886953, "train_speed(iter/s)": 1.199633 }, { "epoch": 0.9904043865661412, "grad_norm": 1.1134300231933594, "learning_rate": 9.062730430612354e-05, "loss": 2.4788162231445314, "memory(GiB)": 119.12, "step": 2890, "token_acc": 0.4773102310231023, "train_speed(iter/s)": 1.199502 }, { "epoch": 0.9921178889650446, "grad_norm": 1.0972917079925537, "learning_rate": 9.059590268555408e-05, "loss": 2.460530471801758, "memory(GiB)": 119.12, "step": 2895, "token_acc": 0.4891903348876643, "train_speed(iter/s)": 1.199608 }, { "epoch": 0.9938313913639479, "grad_norm": 1.1109519004821777, "learning_rate": 9.056445400938293e-05, "loss": 2.4048389434814452, "memory(GiB)": 119.12, "step": 2900, "token_acc": 0.5050228310502283, "train_speed(iter/s)": 1.199512 }, { "epoch": 0.9955448937628513, "grad_norm": 1.13070547580719, "learning_rate": 9.053295831406292e-05, "loss": 2.479254150390625, "memory(GiB)": 119.12, "step": 2905, "token_acc": 0.4905349794238683, "train_speed(iter/s)": 1.199583 }, { "epoch": 0.9972583961617546, "grad_norm": 1.0645884275436401, "learning_rate": 9.050141563610143e-05, "loss": 2.4895185470581054, "memory(GiB)": 119.12, "step": 2910, "token_acc": 0.4846577498033045, "train_speed(iter/s)": 1.199682 }, { "epoch": 0.9989718985606579, "grad_norm": 2.4150378704071045, "learning_rate": 9.046982601206024e-05, "loss": 2.5314704895019533, "memory(GiB)": 119.12, "step": 2915, "token_acc": 0.47512437810945274, "train_speed(iter/s)": 1.199773 }, { "epoch": 1.0006854009595614, "grad_norm": 0.9650641083717346, "learning_rate": 9.04381894785556e-05, "loss": 2.401732635498047, "memory(GiB)": 119.12, "step": 2920, "token_acc": 0.4831514000949217, "train_speed(iter/s)": 1.200031 }, { "epoch": 1.0023989033584646, "grad_norm": 1.1351155042648315, "learning_rate": 9.040650607225812e-05, "loss": 2.314501953125, "memory(GiB)": 119.12, "step": 2925, "token_acc": 0.5089666951323655, "train_speed(iter/s)": 1.200103 }, { "epoch": 1.004112405757368, "grad_norm": 1.0562803745269775, "learning_rate": 9.037477582989269e-05, "loss": 2.4943511962890623, "memory(GiB)": 119.12, "step": 2930, "token_acc": 0.483741469289442, "train_speed(iter/s)": 1.199635 }, { "epoch": 1.0058259081562715, "grad_norm": 1.2441335916519165, "learning_rate": 9.034299878823855e-05, "loss": 2.454339027404785, "memory(GiB)": 119.12, "step": 2935, "token_acc": 0.4760452961672474, "train_speed(iter/s)": 1.199813 }, { "epoch": 1.0075394105551747, "grad_norm": 1.0559449195861816, "learning_rate": 9.031117498412918e-05, "loss": 2.4573013305664064, "memory(GiB)": 119.12, "step": 2940, "token_acc": 0.49451476793248944, "train_speed(iter/s)": 1.199413 }, { "epoch": 1.0092529129540782, "grad_norm": 1.0225869417190552, "learning_rate": 9.027930445445224e-05, "loss": 2.3427839279174805, "memory(GiB)": 119.12, "step": 2945, "token_acc": 0.49800088849400265, "train_speed(iter/s)": 1.199461 }, { "epoch": 1.0109664153529816, "grad_norm": 1.1661312580108643, "learning_rate": 9.024738723614956e-05, "loss": 2.3859167098999023, "memory(GiB)": 119.12, "step": 2950, "token_acc": 0.49956709956709955, "train_speed(iter/s)": 1.199637 }, { "epoch": 1.0126799177518848, "grad_norm": 1.1669907569885254, "learning_rate": 9.021542336621709e-05, "loss": 2.4322172164916993, "memory(GiB)": 119.12, "step": 2955, "token_acc": 0.48586118251928023, "train_speed(iter/s)": 1.199605 }, { "epoch": 1.0143934201507883, "grad_norm": 1.2747477293014526, "learning_rate": 9.018341288170485e-05, "loss": 2.4285396575927733, "memory(GiB)": 119.12, "step": 2960, "token_acc": 0.49120549120549123, "train_speed(iter/s)": 1.199592 }, { "epoch": 1.0161069225496915, "grad_norm": 1.1881349086761475, "learning_rate": 9.015135581971688e-05, "loss": 2.4247390747070314, "memory(GiB)": 119.12, "step": 2965, "token_acc": 0.4995843724023275, "train_speed(iter/s)": 1.199713 }, { "epoch": 1.017820424948595, "grad_norm": 1.2921404838562012, "learning_rate": 9.011925221741126e-05, "loss": 2.4849319458007812, "memory(GiB)": 119.12, "step": 2970, "token_acc": 0.4728122344944775, "train_speed(iter/s)": 1.199735 }, { "epoch": 1.0195339273474984, "grad_norm": 0.9554177522659302, "learning_rate": 9.008710211199996e-05, "loss": 2.336498260498047, "memory(GiB)": 119.12, "step": 2975, "token_acc": 0.5080412371134021, "train_speed(iter/s)": 1.199822 }, { "epoch": 1.0212474297464016, "grad_norm": 1.001139521598816, "learning_rate": 9.005490554074886e-05, "loss": 2.4595008850097657, "memory(GiB)": 119.12, "step": 2980, "token_acc": 0.48191318327974275, "train_speed(iter/s)": 1.199371 }, { "epoch": 1.022960932145305, "grad_norm": 1.2134971618652344, "learning_rate": 9.002266254097773e-05, "loss": 2.4221031188964846, "memory(GiB)": 119.12, "step": 2985, "token_acc": 0.49735664904432697, "train_speed(iter/s)": 1.199479 }, { "epoch": 1.0246744345442083, "grad_norm": 1.2965521812438965, "learning_rate": 8.999037315006014e-05, "loss": 2.358458709716797, "memory(GiB)": 119.12, "step": 2990, "token_acc": 0.49326599326599324, "train_speed(iter/s)": 1.199492 }, { "epoch": 1.0263879369431117, "grad_norm": 1.1573594808578491, "learning_rate": 8.995803740542341e-05, "loss": 2.4721904754638673, "memory(GiB)": 119.12, "step": 2995, "token_acc": 0.48134481344813446, "train_speed(iter/s)": 1.199638 }, { "epoch": 1.0281014393420151, "grad_norm": 1.1859855651855469, "learning_rate": 8.992565534454864e-05, "loss": 2.4309911727905273, "memory(GiB)": 119.12, "step": 3000, "token_acc": 0.49517009659806804, "train_speed(iter/s)": 1.199791 }, { "epoch": 1.0281014393420151, "eval_loss": 2.122789144515991, "eval_runtime": 3.7169, "eval_samples_per_second": 26.904, "eval_steps_per_second": 26.904, "eval_token_acc": 0.47705314009661837, "step": 3000 }, { "epoch": 1.0298149417409184, "grad_norm": 1.2586079835891724, "learning_rate": 8.989322700497058e-05, "loss": 2.4111600875854493, "memory(GiB)": 119.12, "step": 3005, "token_acc": 0.48458574181117536, "train_speed(iter/s)": 1.197406 }, { "epoch": 1.0315284441398218, "grad_norm": 1.0333932638168335, "learning_rate": 8.986075242427761e-05, "loss": 2.4596321105957033, "memory(GiB)": 119.12, "step": 3010, "token_acc": 0.4796511627906977, "train_speed(iter/s)": 1.197155 }, { "epoch": 1.0332419465387253, "grad_norm": 1.2709873914718628, "learning_rate": 8.982823164011175e-05, "loss": 2.440017509460449, "memory(GiB)": 119.12, "step": 3015, "token_acc": 0.47831858407079647, "train_speed(iter/s)": 1.197327 }, { "epoch": 1.0349554489376285, "grad_norm": 1.2657424211502075, "learning_rate": 8.979566469016855e-05, "loss": 2.4927669525146485, "memory(GiB)": 119.12, "step": 3020, "token_acc": 0.47878787878787876, "train_speed(iter/s)": 1.197455 }, { "epoch": 1.036668951336532, "grad_norm": 1.33733069896698, "learning_rate": 8.976305161219707e-05, "loss": 2.4255504608154297, "memory(GiB)": 119.12, "step": 3025, "token_acc": 0.496, "train_speed(iter/s)": 1.197109 }, { "epoch": 1.0383824537354351, "grad_norm": 1.2494611740112305, "learning_rate": 8.973039244399986e-05, "loss": 2.469639015197754, "memory(GiB)": 119.12, "step": 3030, "token_acc": 0.4826982041173894, "train_speed(iter/s)": 1.197235 }, { "epoch": 1.0400959561343386, "grad_norm": 1.1489266157150269, "learning_rate": 8.969768722343286e-05, "loss": 2.398925018310547, "memory(GiB)": 119.12, "step": 3035, "token_acc": 0.49875, "train_speed(iter/s)": 1.197019 }, { "epoch": 1.041809458533242, "grad_norm": 1.236856460571289, "learning_rate": 8.96649359884054e-05, "loss": 2.4451925277709963, "memory(GiB)": 119.12, "step": 3040, "token_acc": 0.47971854304635764, "train_speed(iter/s)": 1.196861 }, { "epoch": 1.0435229609321452, "grad_norm": 1.0748924016952515, "learning_rate": 8.963213877688019e-05, "loss": 2.4513214111328123, "memory(GiB)": 119.12, "step": 3045, "token_acc": 0.48056832427914753, "train_speed(iter/s)": 1.196944 }, { "epoch": 1.0452364633310487, "grad_norm": 1.2248098850250244, "learning_rate": 8.959929562687317e-05, "loss": 2.5176198959350584, "memory(GiB)": 119.12, "step": 3050, "token_acc": 0.4682156948706708, "train_speed(iter/s)": 1.197151 }, { "epoch": 1.046949965729952, "grad_norm": 1.1472731828689575, "learning_rate": 8.956640657645356e-05, "loss": 2.4643354415893555, "memory(GiB)": 119.12, "step": 3055, "token_acc": 0.47800085433575396, "train_speed(iter/s)": 1.197163 }, { "epoch": 1.0486634681288554, "grad_norm": 1.1701513528823853, "learning_rate": 8.953347166374381e-05, "loss": 2.573923873901367, "memory(GiB)": 119.12, "step": 3060, "token_acc": 0.4726175075463562, "train_speed(iter/s)": 1.197233 }, { "epoch": 1.0503769705277588, "grad_norm": 1.1786283254623413, "learning_rate": 8.950049092691946e-05, "loss": 2.3646806716918944, "memory(GiB)": 119.12, "step": 3065, "token_acc": 0.49227557411273487, "train_speed(iter/s)": 1.19732 }, { "epoch": 1.052090472926662, "grad_norm": 1.0703591108322144, "learning_rate": 8.946746440420921e-05, "loss": 2.407078742980957, "memory(GiB)": 119.12, "step": 3070, "token_acc": 0.4780242779405609, "train_speed(iter/s)": 1.197352 }, { "epoch": 1.0538039753255655, "grad_norm": 1.2114431858062744, "learning_rate": 8.943439213389487e-05, "loss": 2.4052236557006834, "memory(GiB)": 119.12, "step": 3075, "token_acc": 0.48333333333333334, "train_speed(iter/s)": 1.197431 }, { "epoch": 1.055517477724469, "grad_norm": 1.1484239101409912, "learning_rate": 8.940127415431117e-05, "loss": 2.4522762298583984, "memory(GiB)": 119.12, "step": 3080, "token_acc": 0.48506825938566556, "train_speed(iter/s)": 1.197415 }, { "epoch": 1.0572309801233721, "grad_norm": 1.2356456518173218, "learning_rate": 8.936811050384593e-05, "loss": 2.386565399169922, "memory(GiB)": 119.12, "step": 3085, "token_acc": 0.49434290687554394, "train_speed(iter/s)": 1.197632 }, { "epoch": 1.0589444825222756, "grad_norm": 1.206954002380371, "learning_rate": 8.933490122093986e-05, "loss": 2.461305618286133, "memory(GiB)": 119.12, "step": 3090, "token_acc": 0.4861612515042118, "train_speed(iter/s)": 1.197479 }, { "epoch": 1.0606579849211788, "grad_norm": 1.1470848321914673, "learning_rate": 8.930164634408656e-05, "loss": 2.449658966064453, "memory(GiB)": 119.12, "step": 3095, "token_acc": 0.4809421841541756, "train_speed(iter/s)": 1.197611 }, { "epoch": 1.0623714873200822, "grad_norm": 1.2401891946792603, "learning_rate": 8.926834591183249e-05, "loss": 2.433313751220703, "memory(GiB)": 119.12, "step": 3100, "token_acc": 0.47980008329862556, "train_speed(iter/s)": 1.197529 }, { "epoch": 1.0640849897189857, "grad_norm": 1.1167203187942505, "learning_rate": 8.923499996277691e-05, "loss": 2.5055873870849608, "memory(GiB)": 119.12, "step": 3105, "token_acc": 0.4692400482509047, "train_speed(iter/s)": 1.197402 }, { "epoch": 1.065798492117889, "grad_norm": 1.1031544208526611, "learning_rate": 8.920160853557184e-05, "loss": 2.4650991439819334, "memory(GiB)": 119.12, "step": 3110, "token_acc": 0.49727767695099817, "train_speed(iter/s)": 1.197522 }, { "epoch": 1.0675119945167924, "grad_norm": 1.1172529458999634, "learning_rate": 8.916817166892204e-05, "loss": 2.45572566986084, "memory(GiB)": 119.12, "step": 3115, "token_acc": 0.4795249795249795, "train_speed(iter/s)": 1.197703 }, { "epoch": 1.0692254969156956, "grad_norm": 1.1844639778137207, "learning_rate": 8.913468940158487e-05, "loss": 2.338591766357422, "memory(GiB)": 119.12, "step": 3120, "token_acc": 0.49471210340775557, "train_speed(iter/s)": 1.197801 }, { "epoch": 1.070938999314599, "grad_norm": 1.146248698234558, "learning_rate": 8.91011617723704e-05, "loss": 2.3699462890625, "memory(GiB)": 119.12, "step": 3125, "token_acc": 0.4862424763542562, "train_speed(iter/s)": 1.197799 }, { "epoch": 1.0726525017135025, "grad_norm": 1.1704654693603516, "learning_rate": 8.906758882014122e-05, "loss": 2.355485725402832, "memory(GiB)": 119.12, "step": 3130, "token_acc": 0.49868995633187774, "train_speed(iter/s)": 1.197509 }, { "epoch": 1.0743660041124057, "grad_norm": 1.2544697523117065, "learning_rate": 8.90339705838125e-05, "loss": 2.351310157775879, "memory(GiB)": 119.12, "step": 3135, "token_acc": 0.49760765550239233, "train_speed(iter/s)": 1.197645 }, { "epoch": 1.0760795065113091, "grad_norm": 1.0938721895217896, "learning_rate": 8.900030710235184e-05, "loss": 2.4411798477172852, "memory(GiB)": 119.12, "step": 3140, "token_acc": 0.4806716126513081, "train_speed(iter/s)": 1.197501 }, { "epoch": 1.0777930089102126, "grad_norm": 1.1261005401611328, "learning_rate": 8.896659841477935e-05, "loss": 2.304806137084961, "memory(GiB)": 119.12, "step": 3145, "token_acc": 0.5121503897294819, "train_speed(iter/s)": 1.19764 }, { "epoch": 1.0795065113091158, "grad_norm": 1.3616900444030762, "learning_rate": 8.89328445601675e-05, "loss": 2.4999359130859373, "memory(GiB)": 119.12, "step": 3150, "token_acc": 0.474331777683496, "train_speed(iter/s)": 1.197608 }, { "epoch": 1.0812200137080192, "grad_norm": 1.2088708877563477, "learning_rate": 8.889904557764111e-05, "loss": 2.4433805465698244, "memory(GiB)": 119.12, "step": 3155, "token_acc": 0.47872797593467986, "train_speed(iter/s)": 1.197646 }, { "epoch": 1.0829335161069225, "grad_norm": 1.1998867988586426, "learning_rate": 8.886520150637734e-05, "loss": 2.4071550369262695, "memory(GiB)": 119.12, "step": 3160, "token_acc": 0.4965263588067021, "train_speed(iter/s)": 1.197643 }, { "epoch": 1.084647018505826, "grad_norm": 1.015133023262024, "learning_rate": 8.88313123856056e-05, "loss": 2.363317108154297, "memory(GiB)": 119.12, "step": 3165, "token_acc": 0.4885132206328565, "train_speed(iter/s)": 1.197607 }, { "epoch": 1.0863605209047293, "grad_norm": 1.2342053651809692, "learning_rate": 8.879737825460748e-05, "loss": 2.456891632080078, "memory(GiB)": 119.12, "step": 3170, "token_acc": 0.48135593220338985, "train_speed(iter/s)": 1.197698 }, { "epoch": 1.0880740233036326, "grad_norm": 1.2349830865859985, "learning_rate": 8.876339915271684e-05, "loss": 2.404074478149414, "memory(GiB)": 119.12, "step": 3175, "token_acc": 0.4870801033591731, "train_speed(iter/s)": 1.197791 }, { "epoch": 1.089787525702536, "grad_norm": 1.2558836936950684, "learning_rate": 8.872937511931953e-05, "loss": 2.4405139923095702, "memory(GiB)": 119.12, "step": 3180, "token_acc": 0.5070360598065083, "train_speed(iter/s)": 1.197724 }, { "epoch": 1.0915010281014395, "grad_norm": 1.2971521615982056, "learning_rate": 8.869530619385357e-05, "loss": 2.4306468963623047, "memory(GiB)": 119.12, "step": 3185, "token_acc": 0.5029802842732691, "train_speed(iter/s)": 1.19786 }, { "epoch": 1.0932145305003427, "grad_norm": 1.404508113861084, "learning_rate": 8.866119241580906e-05, "loss": 2.4396820068359375, "memory(GiB)": 119.12, "step": 3190, "token_acc": 0.48026030368763556, "train_speed(iter/s)": 1.197704 }, { "epoch": 1.0949280328992461, "grad_norm": 1.3512502908706665, "learning_rate": 8.862703382472796e-05, "loss": 2.4796401977539064, "memory(GiB)": 119.12, "step": 3195, "token_acc": 0.4816731349719707, "train_speed(iter/s)": 1.197821 }, { "epoch": 1.0966415352981493, "grad_norm": 1.2369909286499023, "learning_rate": 8.859283046020427e-05, "loss": 2.4042503356933596, "memory(GiB)": 119.12, "step": 3200, "token_acc": 0.5017528483786152, "train_speed(iter/s)": 1.197842 }, { "epoch": 1.0983550376970528, "grad_norm": 1.222968578338623, "learning_rate": 8.855858236188386e-05, "loss": 2.418155479431152, "memory(GiB)": 119.12, "step": 3205, "token_acc": 0.4822231303637107, "train_speed(iter/s)": 1.197922 }, { "epoch": 1.1000685400959562, "grad_norm": 1.1935580968856812, "learning_rate": 8.852428956946443e-05, "loss": 2.459965705871582, "memory(GiB)": 119.12, "step": 3210, "token_acc": 0.49190371991247267, "train_speed(iter/s)": 1.197872 }, { "epoch": 1.1017820424948594, "grad_norm": 1.3342095613479614, "learning_rate": 8.848995212269556e-05, "loss": 2.4559539794921874, "memory(GiB)": 119.12, "step": 3215, "token_acc": 0.4862778730703259, "train_speed(iter/s)": 1.197856 }, { "epoch": 1.103495544893763, "grad_norm": 1.301409363746643, "learning_rate": 8.845557006137848e-05, "loss": 2.436818313598633, "memory(GiB)": 119.12, "step": 3220, "token_acc": 0.4892450442851118, "train_speed(iter/s)": 1.197987 }, { "epoch": 1.1052090472926661, "grad_norm": 1.2110590934753418, "learning_rate": 8.842114342536623e-05, "loss": 2.362786865234375, "memory(GiB)": 119.12, "step": 3225, "token_acc": 0.4944469124833407, "train_speed(iter/s)": 1.198089 }, { "epoch": 1.1069225496915696, "grad_norm": 1.1984931230545044, "learning_rate": 8.838667225456346e-05, "loss": 2.481568145751953, "memory(GiB)": 119.12, "step": 3230, "token_acc": 0.4831896551724138, "train_speed(iter/s)": 1.198012 }, { "epoch": 1.108636052090473, "grad_norm": 1.2102590799331665, "learning_rate": 8.835215658892646e-05, "loss": 2.3199668884277345, "memory(GiB)": 119.12, "step": 3235, "token_acc": 0.5166809238665526, "train_speed(iter/s)": 1.198124 }, { "epoch": 1.1103495544893762, "grad_norm": 1.3648126125335693, "learning_rate": 8.831759646846312e-05, "loss": 2.3656742095947267, "memory(GiB)": 119.12, "step": 3240, "token_acc": 0.48043676069153773, "train_speed(iter/s)": 1.198228 }, { "epoch": 1.1120630568882797, "grad_norm": 1.3762019872665405, "learning_rate": 8.828299193323281e-05, "loss": 2.313608741760254, "memory(GiB)": 120.47, "step": 3245, "token_acc": 0.5002178649237473, "train_speed(iter/s)": 1.197632 }, { "epoch": 1.1137765592871829, "grad_norm": 1.3695141077041626, "learning_rate": 8.824834302334641e-05, "loss": 2.4211273193359375, "memory(GiB)": 120.47, "step": 3250, "token_acc": 0.503569928601428, "train_speed(iter/s)": 1.197771 }, { "epoch": 1.1154900616860863, "grad_norm": 1.2168091535568237, "learning_rate": 8.821364977896625e-05, "loss": 2.4718809127807617, "memory(GiB)": 120.47, "step": 3255, "token_acc": 0.48011480114801147, "train_speed(iter/s)": 1.197747 }, { "epoch": 1.1172035640849898, "grad_norm": 1.1193376779556274, "learning_rate": 8.8178912240306e-05, "loss": 2.4394344329833983, "memory(GiB)": 120.47, "step": 3260, "token_acc": 0.4925775978407557, "train_speed(iter/s)": 1.197871 }, { "epoch": 1.118917066483893, "grad_norm": 1.3146957159042358, "learning_rate": 8.814413044763073e-05, "loss": 2.4569908142089845, "memory(GiB)": 120.47, "step": 3265, "token_acc": 0.48342059336823734, "train_speed(iter/s)": 1.198018 }, { "epoch": 1.1206305688827964, "grad_norm": 1.21794593334198, "learning_rate": 8.810930444125676e-05, "loss": 2.4477298736572264, "memory(GiB)": 120.47, "step": 3270, "token_acc": 0.4874567474048443, "train_speed(iter/s)": 1.197635 }, { "epoch": 1.1223440712816999, "grad_norm": 1.3804867267608643, "learning_rate": 8.807443426155168e-05, "loss": 2.4060157775878905, "memory(GiB)": 120.47, "step": 3275, "token_acc": 0.48149760557248583, "train_speed(iter/s)": 1.197736 }, { "epoch": 1.124057573680603, "grad_norm": 1.436147928237915, "learning_rate": 8.80395199489343e-05, "loss": 2.482632064819336, "memory(GiB)": 120.47, "step": 3280, "token_acc": 0.49635036496350365, "train_speed(iter/s)": 1.197678 }, { "epoch": 1.1257710760795065, "grad_norm": 1.3237684965133667, "learning_rate": 8.800456154387456e-05, "loss": 2.4854822158813477, "memory(GiB)": 120.47, "step": 3285, "token_acc": 0.4648711943793911, "train_speed(iter/s)": 1.197804 }, { "epoch": 1.12748457847841, "grad_norm": 1.2404135465621948, "learning_rate": 8.796955908689348e-05, "loss": 2.434207534790039, "memory(GiB)": 120.47, "step": 3290, "token_acc": 0.4909244406922752, "train_speed(iter/s)": 1.197946 }, { "epoch": 1.1291980808773132, "grad_norm": 1.1661313772201538, "learning_rate": 8.793451261856323e-05, "loss": 2.3528175354003906, "memory(GiB)": 120.47, "step": 3295, "token_acc": 0.5050460728389644, "train_speed(iter/s)": 1.19765 }, { "epoch": 1.1309115832762167, "grad_norm": 1.1046831607818604, "learning_rate": 8.789942217950691e-05, "loss": 2.4002958297729493, "memory(GiB)": 120.47, "step": 3300, "token_acc": 0.49392888117953165, "train_speed(iter/s)": 1.197702 }, { "epoch": 1.1326250856751199, "grad_norm": 1.1340787410736084, "learning_rate": 8.786428781039861e-05, "loss": 2.402260398864746, "memory(GiB)": 120.47, "step": 3305, "token_acc": 0.48503485034850347, "train_speed(iter/s)": 1.19762 }, { "epoch": 1.1343385880740233, "grad_norm": 1.312214970588684, "learning_rate": 8.782910955196337e-05, "loss": 2.4376211166381836, "memory(GiB)": 120.47, "step": 3310, "token_acc": 0.48895497026338147, "train_speed(iter/s)": 1.197737 }, { "epoch": 1.1360520904729268, "grad_norm": 1.1523048877716064, "learning_rate": 8.779388744497707e-05, "loss": 2.3913124084472654, "memory(GiB)": 120.47, "step": 3315, "token_acc": 0.4954760878931495, "train_speed(iter/s)": 1.197845 }, { "epoch": 1.13776559287183, "grad_norm": 1.1759029626846313, "learning_rate": 8.775862153026644e-05, "loss": 2.386087417602539, "memory(GiB)": 120.47, "step": 3320, "token_acc": 0.4958368026644463, "train_speed(iter/s)": 1.197792 }, { "epoch": 1.1394790952707334, "grad_norm": 2.039208173751831, "learning_rate": 8.772331184870894e-05, "loss": 2.3096950531005858, "memory(GiB)": 120.47, "step": 3325, "token_acc": 0.49489569462938304, "train_speed(iter/s)": 1.197857 }, { "epoch": 1.1411925976696367, "grad_norm": 1.324501395225525, "learning_rate": 8.768795844123285e-05, "loss": 2.3869667053222656, "memory(GiB)": 120.47, "step": 3330, "token_acc": 0.4968314321926489, "train_speed(iter/s)": 1.197793 }, { "epoch": 1.14290610006854, "grad_norm": 1.1686738729476929, "learning_rate": 8.765256134881703e-05, "loss": 2.338774299621582, "memory(GiB)": 120.47, "step": 3335, "token_acc": 0.49517966695880805, "train_speed(iter/s)": 1.197994 }, { "epoch": 1.1446196024674435, "grad_norm": 1.1431082487106323, "learning_rate": 8.761712061249105e-05, "loss": 2.384490203857422, "memory(GiB)": 120.47, "step": 3340, "token_acc": 0.4829290206648697, "train_speed(iter/s)": 1.198095 }, { "epoch": 1.1463331048663468, "grad_norm": 1.2709463834762573, "learning_rate": 8.758163627333506e-05, "loss": 2.4619136810302735, "memory(GiB)": 120.47, "step": 3345, "token_acc": 0.490590248075278, "train_speed(iter/s)": 1.19793 }, { "epoch": 1.1480466072652502, "grad_norm": 1.2266747951507568, "learning_rate": 8.75461083724797e-05, "loss": 2.4062944412231446, "memory(GiB)": 120.47, "step": 3350, "token_acc": 0.49529512403763903, "train_speed(iter/s)": 1.198109 }, { "epoch": 1.1497601096641534, "grad_norm": 1.2032591104507446, "learning_rate": 8.751053695110618e-05, "loss": 2.432042694091797, "memory(GiB)": 120.47, "step": 3355, "token_acc": 0.48703403565640196, "train_speed(iter/s)": 1.197998 }, { "epoch": 1.1514736120630569, "grad_norm": 1.1546566486358643, "learning_rate": 8.747492205044607e-05, "loss": 2.4591522216796875, "memory(GiB)": 120.47, "step": 3360, "token_acc": 0.48370221327967805, "train_speed(iter/s)": 1.197966 }, { "epoch": 1.1531871144619603, "grad_norm": 1.16510009765625, "learning_rate": 8.743926371178144e-05, "loss": 2.3603134155273438, "memory(GiB)": 120.47, "step": 3365, "token_acc": 0.4989491382934006, "train_speed(iter/s)": 1.198088 }, { "epoch": 1.1549006168608635, "grad_norm": 1.2009190320968628, "learning_rate": 8.740356197644459e-05, "loss": 2.4360992431640627, "memory(GiB)": 120.47, "step": 3370, "token_acc": 0.48996879179670083, "train_speed(iter/s)": 1.198153 }, { "epoch": 1.156614119259767, "grad_norm": 1.3205829858779907, "learning_rate": 8.736781688581823e-05, "loss": 2.4377920150756838, "memory(GiB)": 120.47, "step": 3375, "token_acc": 0.48060344827586204, "train_speed(iter/s)": 1.198263 }, { "epoch": 1.1583276216586702, "grad_norm": 1.267646312713623, "learning_rate": 8.733202848133526e-05, "loss": 2.338418388366699, "memory(GiB)": 120.47, "step": 3380, "token_acc": 0.4884210526315789, "train_speed(iter/s)": 1.19839 }, { "epoch": 1.1600411240575736, "grad_norm": 1.1179590225219727, "learning_rate": 8.729619680447883e-05, "loss": 2.3411725997924804, "memory(GiB)": 120.47, "step": 3385, "token_acc": 0.496160409556314, "train_speed(iter/s)": 1.19841 }, { "epoch": 1.161754626456477, "grad_norm": 1.2047507762908936, "learning_rate": 8.726032189678217e-05, "loss": 2.327042007446289, "memory(GiB)": 120.47, "step": 3390, "token_acc": 0.4836167565325591, "train_speed(iter/s)": 1.198363 }, { "epoch": 1.1634681288553803, "grad_norm": 1.3126161098480225, "learning_rate": 8.722440379982873e-05, "loss": 2.529039001464844, "memory(GiB)": 120.47, "step": 3395, "token_acc": 0.48046550290939316, "train_speed(iter/s)": 1.198463 }, { "epoch": 1.1651816312542838, "grad_norm": 1.2438774108886719, "learning_rate": 8.718844255525193e-05, "loss": 2.4384433746337892, "memory(GiB)": 120.47, "step": 3400, "token_acc": 0.48020585906571656, "train_speed(iter/s)": 1.198279 }, { "epoch": 1.1668951336531872, "grad_norm": 1.2080820798873901, "learning_rate": 8.715243820473524e-05, "loss": 2.4657211303710938, "memory(GiB)": 120.47, "step": 3405, "token_acc": 0.48550428746427116, "train_speed(iter/s)": 1.198171 }, { "epoch": 1.1686086360520904, "grad_norm": 1.3672468662261963, "learning_rate": 8.711639079001211e-05, "loss": 2.4511924743652345, "memory(GiB)": 120.47, "step": 3410, "token_acc": 0.4935787671232877, "train_speed(iter/s)": 1.198258 }, { "epoch": 1.1703221384509939, "grad_norm": 1.1559926271438599, "learning_rate": 8.708030035286587e-05, "loss": 2.3130477905273437, "memory(GiB)": 120.47, "step": 3415, "token_acc": 0.5116079105760963, "train_speed(iter/s)": 1.198257 }, { "epoch": 1.1720356408498973, "grad_norm": 1.3532333374023438, "learning_rate": 8.704416693512977e-05, "loss": 2.377058982849121, "memory(GiB)": 120.47, "step": 3420, "token_acc": 0.4918664383561644, "train_speed(iter/s)": 1.198307 }, { "epoch": 1.1737491432488005, "grad_norm": 1.2202341556549072, "learning_rate": 8.70079905786868e-05, "loss": 2.3803077697753907, "memory(GiB)": 120.47, "step": 3425, "token_acc": 0.5030826140567201, "train_speed(iter/s)": 1.198249 }, { "epoch": 1.175462645647704, "grad_norm": 1.219962477684021, "learning_rate": 8.697177132546981e-05, "loss": 2.4552886962890623, "memory(GiB)": 120.47, "step": 3430, "token_acc": 0.49760557248585113, "train_speed(iter/s)": 1.198502 }, { "epoch": 1.1771761480466072, "grad_norm": 1.3189717531204224, "learning_rate": 8.693550921746132e-05, "loss": 2.4478004455566404, "memory(GiB)": 120.47, "step": 3435, "token_acc": 0.4853068280034572, "train_speed(iter/s)": 1.198539 }, { "epoch": 1.1788896504455106, "grad_norm": 1.1409518718719482, "learning_rate": 8.689920429669352e-05, "loss": 2.528594970703125, "memory(GiB)": 120.47, "step": 3440, "token_acc": 0.4738863287250384, "train_speed(iter/s)": 1.198567 }, { "epoch": 1.180603152844414, "grad_norm": 1.3399080038070679, "learning_rate": 8.686285660524825e-05, "loss": 2.408614921569824, "memory(GiB)": 120.47, "step": 3445, "token_acc": 0.48722390645300995, "train_speed(iter/s)": 1.198719 }, { "epoch": 1.1823166552433173, "grad_norm": 1.4946085214614868, "learning_rate": 8.682646618525691e-05, "loss": 2.374177932739258, "memory(GiB)": 120.47, "step": 3450, "token_acc": 0.4905411350637923, "train_speed(iter/s)": 1.198718 }, { "epoch": 1.1840301576422207, "grad_norm": 1.172160267829895, "learning_rate": 8.679003307890045e-05, "loss": 2.4099781036376955, "memory(GiB)": 120.47, "step": 3455, "token_acc": 0.4950207468879668, "train_speed(iter/s)": 1.198817 }, { "epoch": 1.185743660041124, "grad_norm": 1.221497893333435, "learning_rate": 8.675355732840927e-05, "loss": 2.414739799499512, "memory(GiB)": 120.47, "step": 3460, "token_acc": 0.47536842105263155, "train_speed(iter/s)": 1.198905 }, { "epoch": 1.1874571624400274, "grad_norm": 1.3421334028244019, "learning_rate": 8.671703897606323e-05, "loss": 2.3541580200195313, "memory(GiB)": 120.47, "step": 3465, "token_acc": 0.5, "train_speed(iter/s)": 1.198929 }, { "epoch": 1.1891706648389309, "grad_norm": 1.2026309967041016, "learning_rate": 8.668047806419152e-05, "loss": 2.323431968688965, "memory(GiB)": 120.47, "step": 3470, "token_acc": 0.4974704890387858, "train_speed(iter/s)": 1.199129 }, { "epoch": 1.190884167237834, "grad_norm": 1.5423046350479126, "learning_rate": 8.664387463517274e-05, "loss": 2.380302619934082, "memory(GiB)": 120.47, "step": 3475, "token_acc": 0.49825479930191974, "train_speed(iter/s)": 1.199223 }, { "epoch": 1.1925976696367375, "grad_norm": 1.182039737701416, "learning_rate": 8.660722873143469e-05, "loss": 2.3233882904052736, "memory(GiB)": 120.47, "step": 3480, "token_acc": 0.4997875053123672, "train_speed(iter/s)": 1.199343 }, { "epoch": 1.1943111720356407, "grad_norm": 1.1608885526657104, "learning_rate": 8.657054039545445e-05, "loss": 2.391773986816406, "memory(GiB)": 120.47, "step": 3485, "token_acc": 0.5050240279598078, "train_speed(iter/s)": 1.199409 }, { "epoch": 1.1960246744345442, "grad_norm": 1.3142822980880737, "learning_rate": 8.65338096697583e-05, "loss": 2.4000722885131838, "memory(GiB)": 120.47, "step": 3490, "token_acc": 0.4981195152528207, "train_speed(iter/s)": 1.19937 }, { "epoch": 1.1977381768334476, "grad_norm": 1.1369225978851318, "learning_rate": 8.64970365969216e-05, "loss": 2.4011404037475588, "memory(GiB)": 120.47, "step": 3495, "token_acc": 0.4932088285229202, "train_speed(iter/s)": 1.199529 }, { "epoch": 1.1994516792323509, "grad_norm": 1.1893559694290161, "learning_rate": 8.646022121956884e-05, "loss": 2.379560089111328, "memory(GiB)": 120.47, "step": 3500, "token_acc": 0.513232104121475, "train_speed(iter/s)": 1.199626 }, { "epoch": 1.1994516792323509, "eval_loss": 2.1244728565216064, "eval_runtime": 3.7069, "eval_samples_per_second": 26.977, "eval_steps_per_second": 26.977, "eval_token_acc": 0.49259757738896365, "step": 3500 }, { "epoch": 1.2011651816312543, "grad_norm": 1.2252236604690552, "learning_rate": 8.642336358037353e-05, "loss": 2.4258888244628904, "memory(GiB)": 120.47, "step": 3505, "token_acc": 0.48914473684210524, "train_speed(iter/s)": 1.197291 }, { "epoch": 1.2028786840301575, "grad_norm": 1.105453610420227, "learning_rate": 8.638646372205816e-05, "loss": 2.4231851577758787, "memory(GiB)": 120.47, "step": 3510, "token_acc": 0.49424797613975285, "train_speed(iter/s)": 1.197147 }, { "epoch": 1.204592186429061, "grad_norm": 1.3435906171798706, "learning_rate": 8.634952168739418e-05, "loss": 2.461010551452637, "memory(GiB)": 120.47, "step": 3515, "token_acc": 0.4804096170970614, "train_speed(iter/s)": 1.197194 }, { "epoch": 1.2063056888279644, "grad_norm": 1.1007102727890015, "learning_rate": 8.63125375192019e-05, "loss": 2.4736509323120117, "memory(GiB)": 120.47, "step": 3520, "token_acc": 0.48854961832061067, "train_speed(iter/s)": 1.197323 }, { "epoch": 1.2080191912268676, "grad_norm": 1.2469522953033447, "learning_rate": 8.62755112603505e-05, "loss": 2.4852224349975587, "memory(GiB)": 120.47, "step": 3525, "token_acc": 0.4764628244610647, "train_speed(iter/s)": 1.197371 }, { "epoch": 1.209732693625771, "grad_norm": 2.1678285598754883, "learning_rate": 8.62384429537579e-05, "loss": 2.4341863632202148, "memory(GiB)": 120.47, "step": 3530, "token_acc": 0.5020242914979757, "train_speed(iter/s)": 1.197378 }, { "epoch": 1.2114461960246745, "grad_norm": 1.203878402709961, "learning_rate": 8.62013326423908e-05, "loss": 2.4693174362182617, "memory(GiB)": 120.47, "step": 3535, "token_acc": 0.48826291079812206, "train_speed(iter/s)": 1.19737 }, { "epoch": 1.2131596984235777, "grad_norm": 1.1283100843429565, "learning_rate": 8.61641803692646e-05, "loss": 2.4228885650634764, "memory(GiB)": 120.47, "step": 3540, "token_acc": 0.47276422764227644, "train_speed(iter/s)": 1.197479 }, { "epoch": 1.2148732008224812, "grad_norm": 1.1651345491409302, "learning_rate": 8.612698617744329e-05, "loss": 2.5077369689941404, "memory(GiB)": 120.47, "step": 3545, "token_acc": 0.4759472115793955, "train_speed(iter/s)": 1.197409 }, { "epoch": 1.2165867032213846, "grad_norm": 1.1583133935928345, "learning_rate": 8.608975011003948e-05, "loss": 2.4180652618408205, "memory(GiB)": 120.47, "step": 3550, "token_acc": 0.4810396250532595, "train_speed(iter/s)": 1.197555 }, { "epoch": 1.2183002056202878, "grad_norm": 1.2440030574798584, "learning_rate": 8.605247221021431e-05, "loss": 2.48699951171875, "memory(GiB)": 120.47, "step": 3555, "token_acc": 0.48184818481848185, "train_speed(iter/s)": 1.197649 }, { "epoch": 1.2200137080191913, "grad_norm": 1.312588095664978, "learning_rate": 8.60151525211774e-05, "loss": 2.456516456604004, "memory(GiB)": 120.47, "step": 3560, "token_acc": 0.47897392767031116, "train_speed(iter/s)": 1.197676 }, { "epoch": 1.2217272104180945, "grad_norm": 1.3296427726745605, "learning_rate": 8.597779108618684e-05, "loss": 2.359532928466797, "memory(GiB)": 120.47, "step": 3565, "token_acc": 0.5092783505154639, "train_speed(iter/s)": 1.197688 }, { "epoch": 1.223440712816998, "grad_norm": 1.2546695470809937, "learning_rate": 8.594038794854909e-05, "loss": 2.390875816345215, "memory(GiB)": 120.47, "step": 3570, "token_acc": 0.5144958892254435, "train_speed(iter/s)": 1.1977 }, { "epoch": 1.2251542152159014, "grad_norm": 1.3882033824920654, "learning_rate": 8.590294315161896e-05, "loss": 2.4897205352783205, "memory(GiB)": 120.47, "step": 3575, "token_acc": 0.47980894485453757, "train_speed(iter/s)": 1.197405 }, { "epoch": 1.2268677176148046, "grad_norm": 1.2242581844329834, "learning_rate": 8.586545673879948e-05, "loss": 2.4553014755249025, "memory(GiB)": 120.47, "step": 3580, "token_acc": 0.48751642575558474, "train_speed(iter/s)": 1.197487 }, { "epoch": 1.228581220013708, "grad_norm": 1.3005099296569824, "learning_rate": 8.582792875354204e-05, "loss": 2.3937511444091797, "memory(GiB)": 120.47, "step": 3585, "token_acc": 0.5014925373134328, "train_speed(iter/s)": 1.197502 }, { "epoch": 1.2302947224126113, "grad_norm": 1.7024785280227661, "learning_rate": 8.57903592393461e-05, "loss": 2.4094690322875976, "memory(GiB)": 120.47, "step": 3590, "token_acc": 0.5013134851138353, "train_speed(iter/s)": 1.197588 }, { "epoch": 1.2320082248115147, "grad_norm": 1.1915119886398315, "learning_rate": 8.575274823975935e-05, "loss": 2.4345340728759766, "memory(GiB)": 120.47, "step": 3595, "token_acc": 0.49262792714657416, "train_speed(iter/s)": 1.197734 }, { "epoch": 1.2337217272104182, "grad_norm": 1.1381174325942993, "learning_rate": 8.571509579837749e-05, "loss": 2.4626235961914062, "memory(GiB)": 120.47, "step": 3600, "token_acc": 0.475251677852349, "train_speed(iter/s)": 1.197568 }, { "epoch": 1.2354352296093214, "grad_norm": 1.2376484870910645, "learning_rate": 8.567740195884433e-05, "loss": 2.4150108337402343, "memory(GiB)": 120.47, "step": 3605, "token_acc": 0.49603008775595486, "train_speed(iter/s)": 1.19761 }, { "epoch": 1.2371487320082248, "grad_norm": 1.115372657775879, "learning_rate": 8.563966676485158e-05, "loss": 2.340071105957031, "memory(GiB)": 120.47, "step": 3610, "token_acc": 0.492616899097621, "train_speed(iter/s)": 1.197615 }, { "epoch": 1.238862234407128, "grad_norm": 1.2997362613677979, "learning_rate": 8.560189026013896e-05, "loss": 2.354812240600586, "memory(GiB)": 120.47, "step": 3615, "token_acc": 0.5041758241758242, "train_speed(iter/s)": 1.197411 }, { "epoch": 1.2405757368060315, "grad_norm": 1.1486895084381104, "learning_rate": 8.556407248849404e-05, "loss": 2.3973415374755858, "memory(GiB)": 120.47, "step": 3620, "token_acc": 0.48896434634974534, "train_speed(iter/s)": 1.197412 }, { "epoch": 1.242289239204935, "grad_norm": 1.3266569375991821, "learning_rate": 8.552621349375223e-05, "loss": 2.3992815017700195, "memory(GiB)": 120.47, "step": 3625, "token_acc": 0.49845882870981945, "train_speed(iter/s)": 1.197506 }, { "epoch": 1.2440027416038382, "grad_norm": 1.3450454473495483, "learning_rate": 8.548831331979672e-05, "loss": 2.4526771545410155, "memory(GiB)": 120.47, "step": 3630, "token_acc": 0.4857142857142857, "train_speed(iter/s)": 1.197737 }, { "epoch": 1.2457162440027416, "grad_norm": 1.2307863235473633, "learning_rate": 8.545037201055844e-05, "loss": 2.431321144104004, "memory(GiB)": 120.47, "step": 3635, "token_acc": 0.4918681318681319, "train_speed(iter/s)": 1.197901 }, { "epoch": 1.247429746401645, "grad_norm": 1.1673346757888794, "learning_rate": 8.541238961001599e-05, "loss": 2.4336469650268553, "memory(GiB)": 120.47, "step": 3640, "token_acc": 0.48555881121808286, "train_speed(iter/s)": 1.197908 }, { "epoch": 1.2491432488005483, "grad_norm": 1.1660102605819702, "learning_rate": 8.53743661621956e-05, "loss": 2.485552978515625, "memory(GiB)": 120.47, "step": 3645, "token_acc": 0.4883303411131059, "train_speed(iter/s)": 1.197958 }, { "epoch": 1.2508567511994517, "grad_norm": 1.3283594846725464, "learning_rate": 8.533630171117107e-05, "loss": 2.4668209075927736, "memory(GiB)": 120.47, "step": 3650, "token_acc": 0.488122962272939, "train_speed(iter/s)": 1.198109 }, { "epoch": 1.2525702535983552, "grad_norm": 1.142560601234436, "learning_rate": 8.529819630106376e-05, "loss": 2.3958389282226564, "memory(GiB)": 120.47, "step": 3655, "token_acc": 0.4960167714884696, "train_speed(iter/s)": 1.197971 }, { "epoch": 1.2542837559972584, "grad_norm": 1.2372616529464722, "learning_rate": 8.526004997604252e-05, "loss": 2.415126419067383, "memory(GiB)": 120.47, "step": 3660, "token_acc": 0.4972539079002957, "train_speed(iter/s)": 1.197978 }, { "epoch": 1.2559972583961618, "grad_norm": 1.1813700199127197, "learning_rate": 8.522186278032352e-05, "loss": 2.3507526397705076, "memory(GiB)": 120.47, "step": 3665, "token_acc": 0.5151122625215889, "train_speed(iter/s)": 1.197998 }, { "epoch": 1.257710760795065, "grad_norm": 1.1272108554840088, "learning_rate": 8.518363475817041e-05, "loss": 2.4758705139160155, "memory(GiB)": 120.47, "step": 3670, "token_acc": 0.46641477749790095, "train_speed(iter/s)": 1.198041 }, { "epoch": 1.2594242631939685, "grad_norm": 1.3255842924118042, "learning_rate": 8.514536595389415e-05, "loss": 2.3776153564453124, "memory(GiB)": 120.47, "step": 3675, "token_acc": 0.5017079419299744, "train_speed(iter/s)": 1.197975 }, { "epoch": 1.261137765592872, "grad_norm": 1.1379079818725586, "learning_rate": 8.510705641185293e-05, "loss": 2.352573013305664, "memory(GiB)": 120.47, "step": 3680, "token_acc": 0.49934065934065935, "train_speed(iter/s)": 1.198012 }, { "epoch": 1.2628512679917752, "grad_norm": 1.221144437789917, "learning_rate": 8.506870617645217e-05, "loss": 2.564453125, "memory(GiB)": 120.47, "step": 3685, "token_acc": 0.461767204757859, "train_speed(iter/s)": 1.197957 }, { "epoch": 1.2645647703906786, "grad_norm": 1.2257323265075684, "learning_rate": 8.503031529214449e-05, "loss": 2.502775955200195, "memory(GiB)": 120.47, "step": 3690, "token_acc": 0.4749791492910759, "train_speed(iter/s)": 1.197978 }, { "epoch": 1.2662782727895818, "grad_norm": 1.2505922317504883, "learning_rate": 8.49918838034296e-05, "loss": 2.4513452529907225, "memory(GiB)": 120.47, "step": 3695, "token_acc": 0.47973238882329794, "train_speed(iter/s)": 1.198025 }, { "epoch": 1.2679917751884853, "grad_norm": 1.2422032356262207, "learning_rate": 8.495341175485428e-05, "loss": 2.4468400955200194, "memory(GiB)": 120.47, "step": 3700, "token_acc": 0.48544558889386474, "train_speed(iter/s)": 1.198139 }, { "epoch": 1.2697052775873887, "grad_norm": 1.2203465700149536, "learning_rate": 8.491489919101233e-05, "loss": 2.428867149353027, "memory(GiB)": 120.47, "step": 3705, "token_acc": 0.4780033840947547, "train_speed(iter/s)": 1.198253 }, { "epoch": 1.271418779986292, "grad_norm": 1.3768031597137451, "learning_rate": 8.48763461565445e-05, "loss": 2.3769031524658204, "memory(GiB)": 120.47, "step": 3710, "token_acc": 0.4931623931623932, "train_speed(iter/s)": 1.198385 }, { "epoch": 1.2731322823851954, "grad_norm": 1.4688218832015991, "learning_rate": 8.483775269613848e-05, "loss": 2.332452964782715, "memory(GiB)": 120.47, "step": 3715, "token_acc": 0.494268684089867, "train_speed(iter/s)": 1.198465 }, { "epoch": 1.2748457847840986, "grad_norm": 1.5635221004486084, "learning_rate": 8.479911885452874e-05, "loss": 2.4059534072875977, "memory(GiB)": 120.47, "step": 3720, "token_acc": 0.491220556745182, "train_speed(iter/s)": 1.198496 }, { "epoch": 1.276559287183002, "grad_norm": 1.1297029256820679, "learning_rate": 8.47604446764967e-05, "loss": 2.4305593490600588, "memory(GiB)": 120.47, "step": 3725, "token_acc": 0.4846960167714885, "train_speed(iter/s)": 1.198612 }, { "epoch": 1.2782727895819055, "grad_norm": 1.2264817953109741, "learning_rate": 8.472173020687037e-05, "loss": 2.443625640869141, "memory(GiB)": 120.47, "step": 3730, "token_acc": 0.4949166327775518, "train_speed(iter/s)": 1.198533 }, { "epoch": 1.2799862919808087, "grad_norm": 1.1617134809494019, "learning_rate": 8.46829754905246e-05, "loss": 2.425181198120117, "memory(GiB)": 120.47, "step": 3735, "token_acc": 0.49287872248597325, "train_speed(iter/s)": 1.198642 }, { "epoch": 1.2816997943797122, "grad_norm": 1.2212659120559692, "learning_rate": 8.464418057238079e-05, "loss": 2.401826858520508, "memory(GiB)": 120.47, "step": 3740, "token_acc": 0.5027173913043478, "train_speed(iter/s)": 1.198761 }, { "epoch": 1.2834132967786154, "grad_norm": 1.2919607162475586, "learning_rate": 8.460534549740699e-05, "loss": 2.371310234069824, "memory(GiB)": 120.47, "step": 3745, "token_acc": 0.49640591966173364, "train_speed(iter/s)": 1.198877 }, { "epoch": 1.2851267991775188, "grad_norm": 1.2194546461105347, "learning_rate": 8.456647031061779e-05, "loss": 2.267724609375, "memory(GiB)": 120.47, "step": 3750, "token_acc": 0.5083892617449665, "train_speed(iter/s)": 1.198956 }, { "epoch": 1.2868403015764223, "grad_norm": 1.2517609596252441, "learning_rate": 8.45275550570743e-05, "loss": 2.452052116394043, "memory(GiB)": 120.47, "step": 3755, "token_acc": 0.47453310696095075, "train_speed(iter/s)": 1.199118 }, { "epoch": 1.2885538039753255, "grad_norm": 1.0844171047210693, "learning_rate": 8.448859978188401e-05, "loss": 2.2406633377075194, "memory(GiB)": 120.47, "step": 3760, "token_acc": 0.5042467590523022, "train_speed(iter/s)": 1.199171 }, { "epoch": 1.290267306374229, "grad_norm": 1.143375039100647, "learning_rate": 8.444960453020086e-05, "loss": 2.3409629821777345, "memory(GiB)": 120.47, "step": 3765, "token_acc": 0.4865424430641822, "train_speed(iter/s)": 1.199187 }, { "epoch": 1.2919808087731321, "grad_norm": 1.0780965089797974, "learning_rate": 8.441056934722506e-05, "loss": 2.360483932495117, "memory(GiB)": 120.47, "step": 3770, "token_acc": 0.5020145044319098, "train_speed(iter/s)": 1.199243 }, { "epoch": 1.2936943111720356, "grad_norm": 1.3521125316619873, "learning_rate": 8.437149427820319e-05, "loss": 2.392995071411133, "memory(GiB)": 120.47, "step": 3775, "token_acc": 0.5056986070071761, "train_speed(iter/s)": 1.199299 }, { "epoch": 1.295407813570939, "grad_norm": 1.1138265132904053, "learning_rate": 8.433237936842801e-05, "loss": 2.2771507263183595, "memory(GiB)": 120.47, "step": 3780, "token_acc": 0.5096629213483146, "train_speed(iter/s)": 1.199341 }, { "epoch": 1.2971213159698425, "grad_norm": 1.1213287115097046, "learning_rate": 8.429322466323845e-05, "loss": 2.336609649658203, "memory(GiB)": 120.47, "step": 3785, "token_acc": 0.5042992261392949, "train_speed(iter/s)": 1.19938 }, { "epoch": 1.2988348183687457, "grad_norm": 1.323590636253357, "learning_rate": 8.425403020801961e-05, "loss": 2.3138282775878904, "memory(GiB)": 120.47, "step": 3790, "token_acc": 0.5076001842468908, "train_speed(iter/s)": 1.199527 }, { "epoch": 1.3005483207676491, "grad_norm": 1.2860774993896484, "learning_rate": 8.421479604820263e-05, "loss": 2.4689443588256834, "memory(GiB)": 120.47, "step": 3795, "token_acc": 0.497235219055721, "train_speed(iter/s)": 1.199457 }, { "epoch": 1.3022618231665524, "grad_norm": 1.2725262641906738, "learning_rate": 8.41755222292647e-05, "loss": 2.3529630661010743, "memory(GiB)": 120.47, "step": 3800, "token_acc": 0.4867813680234998, "train_speed(iter/s)": 1.199249 }, { "epoch": 1.3039753255654558, "grad_norm": 1.3001890182495117, "learning_rate": 8.413620879672897e-05, "loss": 2.3951400756835937, "memory(GiB)": 120.47, "step": 3805, "token_acc": 0.49849462365591396, "train_speed(iter/s)": 1.199209 }, { "epoch": 1.3056888279643593, "grad_norm": 1.250755786895752, "learning_rate": 8.409685579616446e-05, "loss": 2.4024961471557615, "memory(GiB)": 120.47, "step": 3810, "token_acc": 0.4932987462170342, "train_speed(iter/s)": 1.19916 }, { "epoch": 1.3074023303632625, "grad_norm": 1.308695912361145, "learning_rate": 8.405746327318616e-05, "loss": 2.343105125427246, "memory(GiB)": 120.47, "step": 3815, "token_acc": 0.5017094017094017, "train_speed(iter/s)": 1.199266 }, { "epoch": 1.309115832762166, "grad_norm": 1.30150306224823, "learning_rate": 8.401803127345475e-05, "loss": 2.4246654510498047, "memory(GiB)": 120.47, "step": 3820, "token_acc": 0.49148936170212765, "train_speed(iter/s)": 1.199341 }, { "epoch": 1.3108293351610691, "grad_norm": 1.3977800607681274, "learning_rate": 8.397855984267675e-05, "loss": 2.346688079833984, "memory(GiB)": 120.47, "step": 3825, "token_acc": 0.5130912162162162, "train_speed(iter/s)": 1.199063 }, { "epoch": 1.3125428375599726, "grad_norm": 1.2000664472579956, "learning_rate": 8.393904902660436e-05, "loss": 2.386330795288086, "memory(GiB)": 120.47, "step": 3830, "token_acc": 0.5025231286795626, "train_speed(iter/s)": 1.199088 }, { "epoch": 1.314256339958876, "grad_norm": 1.1693263053894043, "learning_rate": 8.389949887103546e-05, "loss": 2.391347885131836, "memory(GiB)": 120.47, "step": 3835, "token_acc": 0.5084459459459459, "train_speed(iter/s)": 1.19893 }, { "epoch": 1.3159698423577793, "grad_norm": 1.2863795757293701, "learning_rate": 8.385990942181345e-05, "loss": 2.430052947998047, "memory(GiB)": 120.47, "step": 3840, "token_acc": 0.4934952298352125, "train_speed(iter/s)": 1.199076 }, { "epoch": 1.3176833447566827, "grad_norm": 1.1274211406707764, "learning_rate": 8.382028072482737e-05, "loss": 2.409532356262207, "memory(GiB)": 120.47, "step": 3845, "token_acc": 0.4935117599351176, "train_speed(iter/s)": 1.199201 }, { "epoch": 1.319396847155586, "grad_norm": 1.2044168710708618, "learning_rate": 8.378061282601171e-05, "loss": 2.4346981048583984, "memory(GiB)": 120.47, "step": 3850, "token_acc": 0.48886086591004624, "train_speed(iter/s)": 1.199026 }, { "epoch": 1.3211103495544894, "grad_norm": 1.421698808670044, "learning_rate": 8.374090577134639e-05, "loss": 2.357512092590332, "memory(GiB)": 120.47, "step": 3855, "token_acc": 0.4900173611111111, "train_speed(iter/s)": 1.199077 }, { "epoch": 1.3228238519533928, "grad_norm": 1.2738149166107178, "learning_rate": 8.370115960685672e-05, "loss": 2.376637840270996, "memory(GiB)": 120.47, "step": 3860, "token_acc": 0.4983974358974359, "train_speed(iter/s)": 1.199068 }, { "epoch": 1.324537354352296, "grad_norm": 1.2729289531707764, "learning_rate": 8.366137437861337e-05, "loss": 2.481694221496582, "memory(GiB)": 120.47, "step": 3865, "token_acc": 0.4762734000870701, "train_speed(iter/s)": 1.199147 }, { "epoch": 1.3262508567511995, "grad_norm": 1.2514145374298096, "learning_rate": 8.362155013273227e-05, "loss": 2.3517059326171874, "memory(GiB)": 120.47, "step": 3870, "token_acc": 0.5091383812010444, "train_speed(iter/s)": 1.199272 }, { "epoch": 1.3279643591501027, "grad_norm": 1.4784657955169678, "learning_rate": 8.358168691537456e-05, "loss": 2.408245849609375, "memory(GiB)": 120.47, "step": 3875, "token_acc": 0.4987384356602187, "train_speed(iter/s)": 1.199308 }, { "epoch": 1.3296778615490061, "grad_norm": 1.171868085861206, "learning_rate": 8.354178477274659e-05, "loss": 2.4298679351806642, "memory(GiB)": 120.47, "step": 3880, "token_acc": 0.4864643150123052, "train_speed(iter/s)": 1.199435 }, { "epoch": 1.3313913639479096, "grad_norm": 1.2706754207611084, "learning_rate": 8.350184375109979e-05, "loss": 2.3622737884521485, "memory(GiB)": 120.47, "step": 3885, "token_acc": 0.505020920502092, "train_speed(iter/s)": 1.199461 }, { "epoch": 1.333104866346813, "grad_norm": 1.201836109161377, "learning_rate": 8.34618638967307e-05, "loss": 2.3039499282836915, "memory(GiB)": 120.47, "step": 3890, "token_acc": 0.5164835164835165, "train_speed(iter/s)": 1.199394 }, { "epoch": 1.3348183687457162, "grad_norm": 1.2829241752624512, "learning_rate": 8.342184525598082e-05, "loss": 2.3741403579711915, "memory(GiB)": 120.47, "step": 3895, "token_acc": 0.5034013605442177, "train_speed(iter/s)": 1.199486 }, { "epoch": 1.3365318711446195, "grad_norm": 1.3760724067687988, "learning_rate": 8.338178787523667e-05, "loss": 2.3863277435302734, "memory(GiB)": 120.47, "step": 3900, "token_acc": 0.4985100042571307, "train_speed(iter/s)": 1.199553 }, { "epoch": 1.338245373543523, "grad_norm": 1.3376798629760742, "learning_rate": 8.334169180092958e-05, "loss": 2.390571403503418, "memory(GiB)": 120.47, "step": 3905, "token_acc": 0.5023022185014651, "train_speed(iter/s)": 1.199693 }, { "epoch": 1.3399588759424264, "grad_norm": 1.364235758781433, "learning_rate": 8.330155707953586e-05, "loss": 2.3748708724975587, "memory(GiB)": 120.47, "step": 3910, "token_acc": 0.5031446540880503, "train_speed(iter/s)": 1.199763 }, { "epoch": 1.3416723783413298, "grad_norm": 1.2469782829284668, "learning_rate": 8.326138375757649e-05, "loss": 2.3122180938720702, "memory(GiB)": 121.15, "step": 3915, "token_acc": 0.4934393638170974, "train_speed(iter/s)": 1.19963 }, { "epoch": 1.343385880740233, "grad_norm": 1.2481683492660522, "learning_rate": 8.322117188161728e-05, "loss": 2.3892915725708006, "memory(GiB)": 121.15, "step": 3920, "token_acc": 0.490932096161957, "train_speed(iter/s)": 1.199793 }, { "epoch": 1.3450993831391365, "grad_norm": 1.185382604598999, "learning_rate": 8.318092149826868e-05, "loss": 2.39760799407959, "memory(GiB)": 121.15, "step": 3925, "token_acc": 0.47900599828620394, "train_speed(iter/s)": 1.199909 }, { "epoch": 1.3468128855380397, "grad_norm": 1.451438546180725, "learning_rate": 8.314063265418581e-05, "loss": 2.390066719055176, "memory(GiB)": 121.15, "step": 3930, "token_acc": 0.48417858690940613, "train_speed(iter/s)": 1.199682 }, { "epoch": 1.3485263879369431, "grad_norm": 1.3795408010482788, "learning_rate": 8.310030539606834e-05, "loss": 2.3807886123657225, "memory(GiB)": 121.15, "step": 3935, "token_acc": 0.49123557075673363, "train_speed(iter/s)": 1.199762 }, { "epoch": 1.3502398903358466, "grad_norm": 1.2840182781219482, "learning_rate": 8.30599397706605e-05, "loss": 2.424063491821289, "memory(GiB)": 121.15, "step": 3940, "token_acc": 0.4975951027547005, "train_speed(iter/s)": 1.199752 }, { "epoch": 1.3519533927347498, "grad_norm": 1.231553316116333, "learning_rate": 8.301953582475093e-05, "loss": 2.4554344177246095, "memory(GiB)": 121.15, "step": 3945, "token_acc": 0.4890901605599012, "train_speed(iter/s)": 1.199768 }, { "epoch": 1.3536668951336532, "grad_norm": 1.315907597541809, "learning_rate": 8.297909360517277e-05, "loss": 2.33245849609375, "memory(GiB)": 121.15, "step": 3950, "token_acc": 0.502127659574468, "train_speed(iter/s)": 1.199802 }, { "epoch": 1.3553803975325565, "grad_norm": 1.140169382095337, "learning_rate": 8.293861315880347e-05, "loss": 2.425136947631836, "memory(GiB)": 121.15, "step": 3955, "token_acc": 0.48321591380024864, "train_speed(iter/s)": 1.199558 }, { "epoch": 1.35709389993146, "grad_norm": 1.2283692359924316, "learning_rate": 8.28980945325648e-05, "loss": 2.3609466552734375, "memory(GiB)": 121.15, "step": 3960, "token_acc": 0.49569707401032703, "train_speed(iter/s)": 1.199635 }, { "epoch": 1.3588074023303633, "grad_norm": 1.3140783309936523, "learning_rate": 8.285753777342281e-05, "loss": 2.5152914047241213, "memory(GiB)": 121.15, "step": 3965, "token_acc": 0.48899647887323944, "train_speed(iter/s)": 1.199695 }, { "epoch": 1.3605209047292666, "grad_norm": 1.3486734628677368, "learning_rate": 8.28169429283877e-05, "loss": 2.4717735290527343, "memory(GiB)": 121.15, "step": 3970, "token_acc": 0.46608140462889064, "train_speed(iter/s)": 1.199738 }, { "epoch": 1.36223440712817, "grad_norm": 1.3648747205734253, "learning_rate": 8.277631004451387e-05, "loss": 2.39815616607666, "memory(GiB)": 121.15, "step": 3975, "token_acc": 0.4859525899912204, "train_speed(iter/s)": 1.199763 }, { "epoch": 1.3639479095270732, "grad_norm": 1.229526400566101, "learning_rate": 8.273563916889977e-05, "loss": 2.423750877380371, "memory(GiB)": 121.15, "step": 3980, "token_acc": 0.4963027403218791, "train_speed(iter/s)": 1.199818 }, { "epoch": 1.3656614119259767, "grad_norm": 1.3261311054229736, "learning_rate": 8.269493034868792e-05, "loss": 2.348065185546875, "memory(GiB)": 121.15, "step": 3985, "token_acc": 0.5055878408582923, "train_speed(iter/s)": 1.199857 }, { "epoch": 1.3673749143248801, "grad_norm": 1.2266411781311035, "learning_rate": 8.265418363106478e-05, "loss": 2.5331987380981444, "memory(GiB)": 121.15, "step": 3990, "token_acc": 0.46561264822134385, "train_speed(iter/s)": 1.199761 }, { "epoch": 1.3690884167237833, "grad_norm": 1.4348913431167603, "learning_rate": 8.261339906326079e-05, "loss": 2.3430339813232424, "memory(GiB)": 121.15, "step": 3995, "token_acc": 0.511879949979158, "train_speed(iter/s)": 1.199657 }, { "epoch": 1.3708019191226868, "grad_norm": 1.19913649559021, "learning_rate": 8.257257669255025e-05, "loss": 2.35363712310791, "memory(GiB)": 121.15, "step": 4000, "token_acc": 0.5021496130696474, "train_speed(iter/s)": 1.199631 }, { "epoch": 1.3708019191226868, "eval_loss": 1.9994819164276123, "eval_runtime": 3.6769, "eval_samples_per_second": 27.197, "eval_steps_per_second": 27.197, "eval_token_acc": 0.5006839945280438, "step": 4000 }, { "epoch": 1.37251542152159, "grad_norm": 1.2362052202224731, "learning_rate": 8.253171656625123e-05, "loss": 2.4738813400268556, "memory(GiB)": 121.15, "step": 4005, "token_acc": 0.4815278647463995, "train_speed(iter/s)": 1.197788 }, { "epoch": 1.3742289239204935, "grad_norm": 1.2269550561904907, "learning_rate": 8.249081873172562e-05, "loss": 2.3731548309326174, "memory(GiB)": 121.15, "step": 4010, "token_acc": 0.4900083263946711, "train_speed(iter/s)": 1.197683 }, { "epoch": 1.375942426319397, "grad_norm": 1.1061843633651733, "learning_rate": 8.244988323637898e-05, "loss": 2.3413619995117188, "memory(GiB)": 121.15, "step": 4015, "token_acc": 0.494331983805668, "train_speed(iter/s)": 1.197816 }, { "epoch": 1.3776559287183003, "grad_norm": 1.233312964439392, "learning_rate": 8.240891012766056e-05, "loss": 2.4487377166748048, "memory(GiB)": 121.15, "step": 4020, "token_acc": 0.48197820620285, "train_speed(iter/s)": 1.19792 }, { "epoch": 1.3793694311172036, "grad_norm": 1.2667466402053833, "learning_rate": 8.23678994530632e-05, "loss": 2.352078437805176, "memory(GiB)": 121.15, "step": 4025, "token_acc": 0.4983079526226734, "train_speed(iter/s)": 1.197856 }, { "epoch": 1.381082933516107, "grad_norm": 1.4398390054702759, "learning_rate": 8.232685126012323e-05, "loss": 2.3293838500976562, "memory(GiB)": 121.15, "step": 4030, "token_acc": 0.5013623978201635, "train_speed(iter/s)": 1.197947 }, { "epoch": 1.3827964359150102, "grad_norm": 1.305124282836914, "learning_rate": 8.228576559642056e-05, "loss": 2.475699234008789, "memory(GiB)": 121.15, "step": 4035, "token_acc": 0.49013722126929676, "train_speed(iter/s)": 1.197889 }, { "epoch": 1.3845099383139137, "grad_norm": 1.2989954948425293, "learning_rate": 8.224464250957845e-05, "loss": 2.4683361053466797, "memory(GiB)": 121.15, "step": 4040, "token_acc": 0.49605645496056455, "train_speed(iter/s)": 1.197515 }, { "epoch": 1.3862234407128171, "grad_norm": 1.29116952419281, "learning_rate": 8.220348204726358e-05, "loss": 2.366946220397949, "memory(GiB)": 121.15, "step": 4045, "token_acc": 0.49774774774774777, "train_speed(iter/s)": 1.197634 }, { "epoch": 1.3879369431117203, "grad_norm": 1.2287814617156982, "learning_rate": 8.216228425718596e-05, "loss": 2.407224655151367, "memory(GiB)": 121.15, "step": 4050, "token_acc": 0.49333888426311406, "train_speed(iter/s)": 1.197738 }, { "epoch": 1.3896504455106238, "grad_norm": 1.2524367570877075, "learning_rate": 8.212104918709885e-05, "loss": 2.48535270690918, "memory(GiB)": 121.15, "step": 4055, "token_acc": 0.4890295358649789, "train_speed(iter/s)": 1.197777 }, { "epoch": 1.391363947909527, "grad_norm": 1.3030301332473755, "learning_rate": 8.207977688479869e-05, "loss": 2.4218732833862306, "memory(GiB)": 121.15, "step": 4060, "token_acc": 0.5008810572687225, "train_speed(iter/s)": 1.197887 }, { "epoch": 1.3930774503084304, "grad_norm": 1.2048760652542114, "learning_rate": 8.203846739812516e-05, "loss": 2.49257755279541, "memory(GiB)": 121.15, "step": 4065, "token_acc": 0.4774166314900802, "train_speed(iter/s)": 1.197914 }, { "epoch": 1.3947909527073339, "grad_norm": 1.2661006450653076, "learning_rate": 8.199712077496095e-05, "loss": 2.4581308364868164, "memory(GiB)": 121.15, "step": 4070, "token_acc": 0.49612403100775193, "train_speed(iter/s)": 1.197971 }, { "epoch": 1.396504455106237, "grad_norm": 1.224412202835083, "learning_rate": 8.195573706323187e-05, "loss": 2.457468605041504, "memory(GiB)": 121.15, "step": 4075, "token_acc": 0.4965433102887353, "train_speed(iter/s)": 1.197939 }, { "epoch": 1.3982179575051406, "grad_norm": 1.2082946300506592, "learning_rate": 8.19143163109067e-05, "loss": 2.3584529876708986, "memory(GiB)": 121.15, "step": 4080, "token_acc": 0.469327731092437, "train_speed(iter/s)": 1.197955 }, { "epoch": 1.3999314599040438, "grad_norm": 1.2929818630218506, "learning_rate": 8.18728585659971e-05, "loss": 2.4957002639770507, "memory(GiB)": 121.15, "step": 4085, "token_acc": 0.4742779783393502, "train_speed(iter/s)": 1.198109 }, { "epoch": 1.4016449623029472, "grad_norm": 1.3828281164169312, "learning_rate": 8.183136387655767e-05, "loss": 2.3549983978271483, "memory(GiB)": 121.15, "step": 4090, "token_acc": 0.49955516014234874, "train_speed(iter/s)": 1.198029 }, { "epoch": 1.4033584647018507, "grad_norm": 1.365759015083313, "learning_rate": 8.17898322906858e-05, "loss": 2.3910179138183594, "memory(GiB)": 121.15, "step": 4095, "token_acc": 0.49897582957804176, "train_speed(iter/s)": 1.198063 }, { "epoch": 1.4050719671007539, "grad_norm": 1.2733948230743408, "learning_rate": 8.174826385652172e-05, "loss": 2.3976566314697267, "memory(GiB)": 121.15, "step": 4100, "token_acc": 0.4975214060387562, "train_speed(iter/s)": 1.198029 }, { "epoch": 1.4067854694996573, "grad_norm": 1.4681867361068726, "learning_rate": 8.170665862224824e-05, "loss": 2.359284210205078, "memory(GiB)": 121.15, "step": 4105, "token_acc": 0.5111499781372978, "train_speed(iter/s)": 1.19789 }, { "epoch": 1.4084989718985605, "grad_norm": 1.2736233472824097, "learning_rate": 8.166501663609095e-05, "loss": 2.433635711669922, "memory(GiB)": 121.15, "step": 4110, "token_acc": 0.4864526886202584, "train_speed(iter/s)": 1.197908 }, { "epoch": 1.410212474297464, "grad_norm": 1.197859525680542, "learning_rate": 8.162333794631798e-05, "loss": 2.4704187393188475, "memory(GiB)": 121.15, "step": 4115, "token_acc": 0.49777418049372724, "train_speed(iter/s)": 1.19788 }, { "epoch": 1.4119259766963674, "grad_norm": 1.6448601484298706, "learning_rate": 8.158162260124e-05, "loss": 2.3604339599609374, "memory(GiB)": 121.15, "step": 4120, "token_acc": 0.5138760407030527, "train_speed(iter/s)": 1.197973 }, { "epoch": 1.4136394790952707, "grad_norm": 1.247015118598938, "learning_rate": 8.153987064921018e-05, "loss": 2.3329326629638674, "memory(GiB)": 121.15, "step": 4125, "token_acc": 0.4941533131225639, "train_speed(iter/s)": 1.198052 }, { "epoch": 1.415352981494174, "grad_norm": 1.279115915298462, "learning_rate": 8.149808213862416e-05, "loss": 2.3491403579711916, "memory(GiB)": 121.15, "step": 4130, "token_acc": 0.49362244897959184, "train_speed(iter/s)": 1.197633 }, { "epoch": 1.4170664838930773, "grad_norm": 1.2791599035263062, "learning_rate": 8.145625711791988e-05, "loss": 2.50787296295166, "memory(GiB)": 121.15, "step": 4135, "token_acc": 0.48293515358361777, "train_speed(iter/s)": 1.197809 }, { "epoch": 1.4187799862919808, "grad_norm": 1.319530725479126, "learning_rate": 8.141439563557765e-05, "loss": 2.4397747039794924, "memory(GiB)": 121.15, "step": 4140, "token_acc": 0.5066489361702128, "train_speed(iter/s)": 1.197791 }, { "epoch": 1.4204934886908842, "grad_norm": 1.2269868850708008, "learning_rate": 8.137249774012004e-05, "loss": 2.432621192932129, "memory(GiB)": 121.15, "step": 4145, "token_acc": 0.48656237464901725, "train_speed(iter/s)": 1.197891 }, { "epoch": 1.4222069910897877, "grad_norm": 1.182410717010498, "learning_rate": 8.133056348011181e-05, "loss": 2.371144676208496, "memory(GiB)": 121.15, "step": 4150, "token_acc": 0.49803750545137376, "train_speed(iter/s)": 1.198021 }, { "epoch": 1.4239204934886909, "grad_norm": 1.0814718008041382, "learning_rate": 8.12885929041599e-05, "loss": 2.3792715072631836, "memory(GiB)": 121.15, "step": 4155, "token_acc": 0.5046768707482994, "train_speed(iter/s)": 1.198021 }, { "epoch": 1.4256339958875943, "grad_norm": 1.327575922012329, "learning_rate": 8.12465860609133e-05, "loss": 2.336556243896484, "memory(GiB)": 121.15, "step": 4160, "token_acc": 0.5110262934690416, "train_speed(iter/s)": 1.198068 }, { "epoch": 1.4273474982864975, "grad_norm": 1.2345808744430542, "learning_rate": 8.120454299906308e-05, "loss": 2.404781723022461, "memory(GiB)": 121.15, "step": 4165, "token_acc": 0.5014836795252225, "train_speed(iter/s)": 1.198129 }, { "epoch": 1.429061000685401, "grad_norm": 1.2586119174957275, "learning_rate": 8.116246376734228e-05, "loss": 2.492977523803711, "memory(GiB)": 121.15, "step": 4170, "token_acc": 0.4981443298969072, "train_speed(iter/s)": 1.198142 }, { "epoch": 1.4307745030843044, "grad_norm": 1.2660249471664429, "learning_rate": 8.112034841452585e-05, "loss": 2.449069023132324, "memory(GiB)": 121.15, "step": 4175, "token_acc": 0.49225122349102773, "train_speed(iter/s)": 1.198124 }, { "epoch": 1.4324880054832076, "grad_norm": 1.5121350288391113, "learning_rate": 8.107819698943063e-05, "loss": 2.4274925231933593, "memory(GiB)": 121.15, "step": 4180, "token_acc": 0.489664676159853, "train_speed(iter/s)": 1.198106 }, { "epoch": 1.434201507882111, "grad_norm": 1.360559105873108, "learning_rate": 8.103600954091523e-05, "loss": 2.4377761840820313, "memory(GiB)": 121.15, "step": 4185, "token_acc": 0.483539974348012, "train_speed(iter/s)": 1.198086 }, { "epoch": 1.4359150102810143, "grad_norm": 1.2750414609909058, "learning_rate": 8.099378611788009e-05, "loss": 2.3538990020751953, "memory(GiB)": 121.15, "step": 4190, "token_acc": 0.5056028686687584, "train_speed(iter/s)": 1.198093 }, { "epoch": 1.4376285126799178, "grad_norm": 1.3249727487564087, "learning_rate": 8.095152676926729e-05, "loss": 2.456207275390625, "memory(GiB)": 121.15, "step": 4195, "token_acc": 0.4849128771780705, "train_speed(iter/s)": 1.198102 }, { "epoch": 1.4393420150788212, "grad_norm": 1.1921268701553345, "learning_rate": 8.090923154406056e-05, "loss": 2.3330955505371094, "memory(GiB)": 121.15, "step": 4200, "token_acc": 0.4934839270199826, "train_speed(iter/s)": 1.198175 }, { "epoch": 1.4410555174777244, "grad_norm": 1.2943819761276245, "learning_rate": 8.086690049128524e-05, "loss": 2.4237091064453127, "memory(GiB)": 121.15, "step": 4205, "token_acc": 0.4953229398663697, "train_speed(iter/s)": 1.198229 }, { "epoch": 1.4427690198766279, "grad_norm": 1.2634509801864624, "learning_rate": 8.082453366000818e-05, "loss": 2.349758529663086, "memory(GiB)": 121.15, "step": 4210, "token_acc": 0.4934725848563969, "train_speed(iter/s)": 1.198224 }, { "epoch": 1.444482522275531, "grad_norm": 1.1872868537902832, "learning_rate": 8.078213109933767e-05, "loss": 2.4358057022094726, "memory(GiB)": 121.15, "step": 4215, "token_acc": 0.4868898749495764, "train_speed(iter/s)": 1.198282 }, { "epoch": 1.4461960246744345, "grad_norm": 1.272073745727539, "learning_rate": 8.07396928584235e-05, "loss": 2.422246551513672, "memory(GiB)": 121.15, "step": 4220, "token_acc": 0.4985398414685023, "train_speed(iter/s)": 1.197885 }, { "epoch": 1.447909527073338, "grad_norm": 1.1708053350448608, "learning_rate": 8.069721898645672e-05, "loss": 2.352522277832031, "memory(GiB)": 121.15, "step": 4225, "token_acc": 0.4946013289036545, "train_speed(iter/s)": 1.197954 }, { "epoch": 1.4496230294722412, "grad_norm": 1.218586802482605, "learning_rate": 8.065470953266976e-05, "loss": 2.316191864013672, "memory(GiB)": 121.15, "step": 4230, "token_acc": 0.5078053259871441, "train_speed(iter/s)": 1.198011 }, { "epoch": 1.4513365318711446, "grad_norm": 1.326109528541565, "learning_rate": 8.061216454633624e-05, "loss": 2.373508834838867, "memory(GiB)": 121.15, "step": 4235, "token_acc": 0.5012562814070352, "train_speed(iter/s)": 1.198154 }, { "epoch": 1.4530500342700479, "grad_norm": 1.3339958190917969, "learning_rate": 8.056958407677102e-05, "loss": 2.417826271057129, "memory(GiB)": 121.15, "step": 4240, "token_acc": 0.4805804524114383, "train_speed(iter/s)": 1.198047 }, { "epoch": 1.4547635366689513, "grad_norm": 1.389556646347046, "learning_rate": 8.052696817333002e-05, "loss": 2.385920524597168, "memory(GiB)": 121.15, "step": 4245, "token_acc": 0.5011286681715575, "train_speed(iter/s)": 1.197961 }, { "epoch": 1.4564770390678548, "grad_norm": 1.2374475002288818, "learning_rate": 8.048431688541028e-05, "loss": 2.5059770584106444, "memory(GiB)": 121.15, "step": 4250, "token_acc": 0.4742558326629123, "train_speed(iter/s)": 1.197956 }, { "epoch": 1.458190541466758, "grad_norm": 1.2501296997070312, "learning_rate": 8.044163026244985e-05, "loss": 2.478226661682129, "memory(GiB)": 121.15, "step": 4255, "token_acc": 0.4937093275488069, "train_speed(iter/s)": 1.19796 }, { "epoch": 1.4599040438656614, "grad_norm": 1.1827924251556396, "learning_rate": 8.039890835392773e-05, "loss": 2.4130016326904298, "memory(GiB)": 121.15, "step": 4260, "token_acc": 0.4984709480122324, "train_speed(iter/s)": 1.198083 }, { "epoch": 1.4616175462645646, "grad_norm": 1.0497323274612427, "learning_rate": 8.035615120936381e-05, "loss": 2.3048015594482423, "memory(GiB)": 121.15, "step": 4265, "token_acc": 0.5110584518167457, "train_speed(iter/s)": 1.198175 }, { "epoch": 1.463331048663468, "grad_norm": 1.3040748834609985, "learning_rate": 8.031335887831886e-05, "loss": 2.4846405029296874, "memory(GiB)": 121.15, "step": 4270, "token_acc": 0.4881618596642273, "train_speed(iter/s)": 1.198012 }, { "epoch": 1.4650445510623715, "grad_norm": 1.243501901626587, "learning_rate": 8.027053141039438e-05, "loss": 2.383168411254883, "memory(GiB)": 121.15, "step": 4275, "token_acc": 0.4963235294117647, "train_speed(iter/s)": 1.198158 }, { "epoch": 1.466758053461275, "grad_norm": 1.3033121824264526, "learning_rate": 8.022766885523265e-05, "loss": 2.3856462478637694, "memory(GiB)": 121.15, "step": 4280, "token_acc": 0.47996695580338705, "train_speed(iter/s)": 1.198157 }, { "epoch": 1.4684715558601782, "grad_norm": 1.1398913860321045, "learning_rate": 8.018477126251658e-05, "loss": 2.41900749206543, "memory(GiB)": 121.15, "step": 4285, "token_acc": 0.4890131841789852, "train_speed(iter/s)": 1.198271 }, { "epoch": 1.4701850582590816, "grad_norm": 1.3075584173202515, "learning_rate": 8.014183868196974e-05, "loss": 2.366316032409668, "memory(GiB)": 121.15, "step": 4290, "token_acc": 0.4980611805256355, "train_speed(iter/s)": 1.198208 }, { "epoch": 1.4718985606579849, "grad_norm": 1.312857747077942, "learning_rate": 8.009887116335619e-05, "loss": 2.4643953323364256, "memory(GiB)": 121.15, "step": 4295, "token_acc": 0.4800509337860781, "train_speed(iter/s)": 1.198133 }, { "epoch": 1.4736120630568883, "grad_norm": 1.4190046787261963, "learning_rate": 8.005586875648055e-05, "loss": 2.375961685180664, "memory(GiB)": 121.15, "step": 4300, "token_acc": 0.49959216965742254, "train_speed(iter/s)": 1.198091 }, { "epoch": 1.4753255654557917, "grad_norm": 1.4326435327529907, "learning_rate": 8.001283151118785e-05, "loss": 2.447621154785156, "memory(GiB)": 121.15, "step": 4305, "token_acc": 0.49036402569593146, "train_speed(iter/s)": 1.197853 }, { "epoch": 1.477039067854695, "grad_norm": 1.4578076601028442, "learning_rate": 7.996975947736349e-05, "loss": 2.5901275634765626, "memory(GiB)": 121.15, "step": 4310, "token_acc": 0.4684300341296928, "train_speed(iter/s)": 1.197759 }, { "epoch": 1.4787525702535984, "grad_norm": 1.279104232788086, "learning_rate": 7.992665270493323e-05, "loss": 2.370095443725586, "memory(GiB)": 121.15, "step": 4315, "token_acc": 0.498016747465844, "train_speed(iter/s)": 1.19785 }, { "epoch": 1.4804660726525016, "grad_norm": 1.2100595235824585, "learning_rate": 7.988351124386306e-05, "loss": 2.3290212631225584, "memory(GiB)": 121.15, "step": 4320, "token_acc": 0.49934411893310016, "train_speed(iter/s)": 1.197859 }, { "epoch": 1.482179575051405, "grad_norm": 1.4547137022018433, "learning_rate": 7.98403351441592e-05, "loss": 2.3975290298461913, "memory(GiB)": 121.15, "step": 4325, "token_acc": 0.49546044098573283, "train_speed(iter/s)": 1.197903 }, { "epoch": 1.4838930774503085, "grad_norm": 1.3260672092437744, "learning_rate": 7.979712445586802e-05, "loss": 2.4701911926269533, "memory(GiB)": 121.15, "step": 4330, "token_acc": 0.47440273037542663, "train_speed(iter/s)": 1.197694 }, { "epoch": 1.4856065798492117, "grad_norm": 1.2933984994888306, "learning_rate": 7.975387922907597e-05, "loss": 2.412765693664551, "memory(GiB)": 121.15, "step": 4335, "token_acc": 0.4807849550286182, "train_speed(iter/s)": 1.197736 }, { "epoch": 1.4873200822481152, "grad_norm": 1.087294101715088, "learning_rate": 7.971059951390954e-05, "loss": 2.378759002685547, "memory(GiB)": 121.15, "step": 4340, "token_acc": 0.502680412371134, "train_speed(iter/s)": 1.197833 }, { "epoch": 1.4890335846470184, "grad_norm": 1.329214334487915, "learning_rate": 7.96672853605352e-05, "loss": 2.370139312744141, "memory(GiB)": 121.15, "step": 4345, "token_acc": 0.49684968496849685, "train_speed(iter/s)": 1.197879 }, { "epoch": 1.4907470870459218, "grad_norm": 1.3279253244400024, "learning_rate": 7.962393681915934e-05, "loss": 2.337952423095703, "memory(GiB)": 121.15, "step": 4350, "token_acc": 0.49377949377949376, "train_speed(iter/s)": 1.197985 }, { "epoch": 1.4924605894448253, "grad_norm": 1.185733437538147, "learning_rate": 7.958055394002821e-05, "loss": 2.305440330505371, "memory(GiB)": 121.15, "step": 4355, "token_acc": 0.5020261143628996, "train_speed(iter/s)": 1.198092 }, { "epoch": 1.4941740918437285, "grad_norm": 1.6660346984863281, "learning_rate": 7.953713677342785e-05, "loss": 2.4737911224365234, "memory(GiB)": 121.15, "step": 4360, "token_acc": 0.489556135770235, "train_speed(iter/s)": 1.197836 }, { "epoch": 1.495887594242632, "grad_norm": 1.2695832252502441, "learning_rate": 7.949368536968404e-05, "loss": 2.3665702819824217, "memory(GiB)": 121.15, "step": 4365, "token_acc": 0.5029787234042553, "train_speed(iter/s)": 1.197765 }, { "epoch": 1.4976010966415352, "grad_norm": 1.2121082544326782, "learning_rate": 7.945019977916225e-05, "loss": 2.4364673614501955, "memory(GiB)": 121.15, "step": 4370, "token_acc": 0.4804446274186908, "train_speed(iter/s)": 1.197866 }, { "epoch": 1.4993145990404386, "grad_norm": 1.1675889492034912, "learning_rate": 7.940668005226764e-05, "loss": 2.3651748657226563, "memory(GiB)": 121.15, "step": 4375, "token_acc": 0.49401860877270715, "train_speed(iter/s)": 1.197848 }, { "epoch": 1.501028101439342, "grad_norm": 1.3454729318618774, "learning_rate": 7.936312623944482e-05, "loss": 2.388148880004883, "memory(GiB)": 121.15, "step": 4380, "token_acc": 0.5018344883815736, "train_speed(iter/s)": 1.197899 }, { "epoch": 1.5027416038382455, "grad_norm": 1.3749786615371704, "learning_rate": 7.931953839117798e-05, "loss": 2.403509330749512, "memory(GiB)": 121.15, "step": 4385, "token_acc": 0.49635036496350365, "train_speed(iter/s)": 1.197816 }, { "epoch": 1.5044551062371487, "grad_norm": 1.2846324443817139, "learning_rate": 7.927591655799078e-05, "loss": 2.497965621948242, "memory(GiB)": 121.15, "step": 4390, "token_acc": 0.4770231801545344, "train_speed(iter/s)": 1.197906 }, { "epoch": 1.506168608636052, "grad_norm": 1.2617073059082031, "learning_rate": 7.92322607904462e-05, "loss": 2.3905803680419924, "memory(GiB)": 121.15, "step": 4395, "token_acc": 0.49597286986011024, "train_speed(iter/s)": 1.19787 }, { "epoch": 1.5078821110349554, "grad_norm": 1.1989659070968628, "learning_rate": 7.918857113914664e-05, "loss": 2.470463180541992, "memory(GiB)": 121.15, "step": 4400, "token_acc": 0.4827011254689454, "train_speed(iter/s)": 1.197877 }, { "epoch": 1.5095956134338588, "grad_norm": 1.1720634698867798, "learning_rate": 7.914484765473372e-05, "loss": 2.3945993423461913, "memory(GiB)": 121.15, "step": 4405, "token_acc": 0.4931567328918322, "train_speed(iter/s)": 1.197902 }, { "epoch": 1.5113091158327623, "grad_norm": 1.1962370872497559, "learning_rate": 7.910109038788827e-05, "loss": 2.3785959243774415, "memory(GiB)": 121.15, "step": 4410, "token_acc": 0.48632218844984804, "train_speed(iter/s)": 1.197777 }, { "epoch": 1.5130226182316655, "grad_norm": 1.1295472383499146, "learning_rate": 7.905729938933032e-05, "loss": 2.4054628372192384, "memory(GiB)": 121.15, "step": 4415, "token_acc": 0.4973523421588595, "train_speed(iter/s)": 1.197842 }, { "epoch": 1.5147361206305687, "grad_norm": 1.3177980184555054, "learning_rate": 7.901347470981898e-05, "loss": 2.4768320083618165, "memory(GiB)": 121.15, "step": 4420, "token_acc": 0.4923413566739606, "train_speed(iter/s)": 1.197928 }, { "epoch": 1.5164496230294722, "grad_norm": 1.0973713397979736, "learning_rate": 7.896961640015239e-05, "loss": 2.2660429000854494, "memory(GiB)": 121.15, "step": 4425, "token_acc": 0.49637835534725183, "train_speed(iter/s)": 1.197959 }, { "epoch": 1.5181631254283756, "grad_norm": 1.462823748588562, "learning_rate": 7.892572451116767e-05, "loss": 2.442591094970703, "memory(GiB)": 121.15, "step": 4430, "token_acc": 0.47568881685575365, "train_speed(iter/s)": 1.198004 }, { "epoch": 1.519876627827279, "grad_norm": 1.2731252908706665, "learning_rate": 7.888179909374089e-05, "loss": 2.3682199478149415, "memory(GiB)": 121.15, "step": 4435, "token_acc": 0.488666085440279, "train_speed(iter/s)": 1.198067 }, { "epoch": 1.5215901302261823, "grad_norm": 1.2113103866577148, "learning_rate": 7.883784019878696e-05, "loss": 2.3509458541870116, "memory(GiB)": 121.15, "step": 4440, "token_acc": 0.49149024491490245, "train_speed(iter/s)": 1.198155 }, { "epoch": 1.5233036326250857, "grad_norm": 1.3235459327697754, "learning_rate": 7.879384787725959e-05, "loss": 2.4128894805908203, "memory(GiB)": 121.15, "step": 4445, "token_acc": 0.5017482517482518, "train_speed(iter/s)": 1.19815 }, { "epoch": 1.525017135023989, "grad_norm": 1.383519172668457, "learning_rate": 7.874982218015126e-05, "loss": 2.4938375473022463, "memory(GiB)": 121.15, "step": 4450, "token_acc": 0.49044056525353286, "train_speed(iter/s)": 1.198181 }, { "epoch": 1.5267306374228924, "grad_norm": 1.221901297569275, "learning_rate": 7.870576315849309e-05, "loss": 2.4347549438476563, "memory(GiB)": 121.15, "step": 4455, "token_acc": 0.48464163822525597, "train_speed(iter/s)": 1.198282 }, { "epoch": 1.5284441398217958, "grad_norm": 1.324179768562317, "learning_rate": 7.866167086335492e-05, "loss": 2.417927360534668, "memory(GiB)": 121.15, "step": 4460, "token_acc": 0.4902758932609679, "train_speed(iter/s)": 1.198288 }, { "epoch": 1.530157642220699, "grad_norm": 1.3754081726074219, "learning_rate": 7.861754534584502e-05, "loss": 2.434174156188965, "memory(GiB)": 121.15, "step": 4465, "token_acc": 0.4953436807095344, "train_speed(iter/s)": 1.19814 }, { "epoch": 1.5318711446196025, "grad_norm": 1.1258294582366943, "learning_rate": 7.857338665711031e-05, "loss": 2.3335807800292967, "memory(GiB)": 121.15, "step": 4470, "token_acc": 0.49269311064718163, "train_speed(iter/s)": 1.198042 }, { "epoch": 1.5335846470185057, "grad_norm": 1.2428501844406128, "learning_rate": 7.852919484833607e-05, "loss": 2.4096012115478516, "memory(GiB)": 121.15, "step": 4475, "token_acc": 0.4954760878931495, "train_speed(iter/s)": 1.198173 }, { "epoch": 1.5352981494174092, "grad_norm": 1.2230764627456665, "learning_rate": 7.848496997074599e-05, "loss": 2.5218873977661134, "memory(GiB)": 121.15, "step": 4480, "token_acc": 0.48466003316749584, "train_speed(iter/s)": 1.198196 }, { "epoch": 1.5370116518163126, "grad_norm": 1.2757443189620972, "learning_rate": 7.84407120756021e-05, "loss": 2.331070327758789, "memory(GiB)": 121.15, "step": 4485, "token_acc": 0.49936682144364714, "train_speed(iter/s)": 1.198102 }, { "epoch": 1.538725154215216, "grad_norm": 1.3167136907577515, "learning_rate": 7.83964212142047e-05, "loss": 2.3805973052978517, "memory(GiB)": 121.15, "step": 4490, "token_acc": 0.5036527718091964, "train_speed(iter/s)": 1.198248 }, { "epoch": 1.5404386566141193, "grad_norm": 1.23412024974823, "learning_rate": 7.835209743789229e-05, "loss": 2.3636409759521486, "memory(GiB)": 121.15, "step": 4495, "token_acc": 0.5019676432006996, "train_speed(iter/s)": 1.198331 }, { "epoch": 1.5421521590130225, "grad_norm": 1.2242361307144165, "learning_rate": 7.830774079804156e-05, "loss": 2.3725099563598633, "memory(GiB)": 121.15, "step": 4500, "token_acc": 0.5029021558872305, "train_speed(iter/s)": 1.198157 }, { "epoch": 1.5421521590130225, "eval_loss": 2.1724958419799805, "eval_runtime": 3.6649, "eval_samples_per_second": 27.286, "eval_steps_per_second": 27.286, "eval_token_acc": 0.48481012658227846, "step": 4500 }, { "epoch": 1.543865661411926, "grad_norm": 1.267007827758789, "learning_rate": 7.826335134606724e-05, "loss": 2.3231597900390626, "memory(GiB)": 121.15, "step": 4505, "token_acc": 0.49920102269095556, "train_speed(iter/s)": 1.196523 }, { "epoch": 1.5455791638108294, "grad_norm": 1.3915871381759644, "learning_rate": 7.821892913342211e-05, "loss": 2.445119285583496, "memory(GiB)": 121.15, "step": 4510, "token_acc": 0.4912595248767369, "train_speed(iter/s)": 1.196578 }, { "epoch": 1.5472926662097328, "grad_norm": 1.2158557176589966, "learning_rate": 7.817447421159697e-05, "loss": 2.361173629760742, "memory(GiB)": 121.15, "step": 4515, "token_acc": 0.5090751944684528, "train_speed(iter/s)": 1.196706 }, { "epoch": 1.549006168608636, "grad_norm": 1.4937447309494019, "learning_rate": 7.812998663212043e-05, "loss": 2.3255733489990233, "memory(GiB)": 121.15, "step": 4520, "token_acc": 0.5006753714542999, "train_speed(iter/s)": 1.19662 }, { "epoch": 1.5507196710075393, "grad_norm": 1.3768820762634277, "learning_rate": 7.80854664465591e-05, "loss": 2.4701255798339843, "memory(GiB)": 121.15, "step": 4525, "token_acc": 0.4788153023447141, "train_speed(iter/s)": 1.196659 }, { "epoch": 1.5524331734064427, "grad_norm": 1.1223810911178589, "learning_rate": 7.804091370651724e-05, "loss": 2.3499710083007814, "memory(GiB)": 121.15, "step": 4530, "token_acc": 0.49187161317215505, "train_speed(iter/s)": 1.196651 }, { "epoch": 1.5541466758053462, "grad_norm": 1.2632102966308594, "learning_rate": 7.799632846363694e-05, "loss": 2.3007246017456056, "memory(GiB)": 121.15, "step": 4535, "token_acc": 0.5062797747942832, "train_speed(iter/s)": 1.196747 }, { "epoch": 1.5558601782042496, "grad_norm": 1.3010098934173584, "learning_rate": 7.795171076959794e-05, "loss": 2.513018035888672, "memory(GiB)": 121.15, "step": 4540, "token_acc": 0.4718430034129693, "train_speed(iter/s)": 1.19683 }, { "epoch": 1.5575736806031528, "grad_norm": 1.1733958721160889, "learning_rate": 7.790706067611754e-05, "loss": 2.307486343383789, "memory(GiB)": 121.15, "step": 4545, "token_acc": 0.5132819194515853, "train_speed(iter/s)": 1.196748 }, { "epoch": 1.559287183002056, "grad_norm": 1.3037654161453247, "learning_rate": 7.786237823495069e-05, "loss": 2.4843271255493162, "memory(GiB)": 121.15, "step": 4550, "token_acc": 0.4937007874015748, "train_speed(iter/s)": 1.196825 }, { "epoch": 1.5610006854009595, "grad_norm": 1.1355501413345337, "learning_rate": 7.781766349788979e-05, "loss": 2.4149763107299806, "memory(GiB)": 121.15, "step": 4555, "token_acc": 0.4972905377240517, "train_speed(iter/s)": 1.196612 }, { "epoch": 1.562714187799863, "grad_norm": 1.4498003721237183, "learning_rate": 7.777291651676466e-05, "loss": 2.389247703552246, "memory(GiB)": 121.15, "step": 4560, "token_acc": 0.4946013289036545, "train_speed(iter/s)": 1.196709 }, { "epoch": 1.5644276901987664, "grad_norm": 1.2720869779586792, "learning_rate": 7.772813734344248e-05, "loss": 2.484358215332031, "memory(GiB)": 121.15, "step": 4565, "token_acc": 0.49386373254337707, "train_speed(iter/s)": 1.196713 }, { "epoch": 1.5661411925976696, "grad_norm": 1.306341290473938, "learning_rate": 7.768332602982784e-05, "loss": 2.4091053009033203, "memory(GiB)": 121.15, "step": 4570, "token_acc": 0.48457424928013165, "train_speed(iter/s)": 1.196683 }, { "epoch": 1.567854694996573, "grad_norm": 1.292869210243225, "learning_rate": 7.763848262786244e-05, "loss": 2.4068748474121096, "memory(GiB)": 121.15, "step": 4575, "token_acc": 0.49716033202271737, "train_speed(iter/s)": 1.196739 }, { "epoch": 1.5695681973954763, "grad_norm": 1.323853611946106, "learning_rate": 7.75936071895253e-05, "loss": 2.456185722351074, "memory(GiB)": 121.15, "step": 4580, "token_acc": 0.48543273350471294, "train_speed(iter/s)": 1.196823 }, { "epoch": 1.5712816997943797, "grad_norm": 1.152788758277893, "learning_rate": 7.754869976683252e-05, "loss": 2.3217933654785154, "memory(GiB)": 121.15, "step": 4585, "token_acc": 0.502869757174393, "train_speed(iter/s)": 1.19654 }, { "epoch": 1.5729952021932831, "grad_norm": 1.2503676414489746, "learning_rate": 7.750376041183726e-05, "loss": 2.435915946960449, "memory(GiB)": 121.15, "step": 4590, "token_acc": 0.49174078780177893, "train_speed(iter/s)": 1.19655 }, { "epoch": 1.5747087045921866, "grad_norm": 1.2281100749969482, "learning_rate": 7.745878917662974e-05, "loss": 2.4342931747436523, "memory(GiB)": 121.15, "step": 4595, "token_acc": 0.4977836879432624, "train_speed(iter/s)": 1.196589 }, { "epoch": 1.5764222069910898, "grad_norm": 1.2097660303115845, "learning_rate": 7.741378611333708e-05, "loss": 2.3825870513916017, "memory(GiB)": 121.15, "step": 4600, "token_acc": 0.5006036217303823, "train_speed(iter/s)": 1.196714 }, { "epoch": 1.578135709389993, "grad_norm": 1.213644027709961, "learning_rate": 7.736875127412332e-05, "loss": 2.4300676345825196, "memory(GiB)": 121.15, "step": 4605, "token_acc": 0.48839137645107794, "train_speed(iter/s)": 1.196816 }, { "epoch": 1.5798492117888965, "grad_norm": 1.384742021560669, "learning_rate": 7.732368471118936e-05, "loss": 2.354480171203613, "memory(GiB)": 121.15, "step": 4610, "token_acc": 0.4930615784908933, "train_speed(iter/s)": 1.196858 }, { "epoch": 1.5815627141878, "grad_norm": 1.117099642753601, "learning_rate": 7.727858647677281e-05, "loss": 2.332767677307129, "memory(GiB)": 121.15, "step": 4615, "token_acc": 0.5032706459525756, "train_speed(iter/s)": 1.196903 }, { "epoch": 1.5832762165867034, "grad_norm": 1.1627610921859741, "learning_rate": 7.723345662314804e-05, "loss": 2.4164911270141602, "memory(GiB)": 121.15, "step": 4620, "token_acc": 0.48333333333333334, "train_speed(iter/s)": 1.196996 }, { "epoch": 1.5849897189856066, "grad_norm": 1.3300153017044067, "learning_rate": 7.718829520262604e-05, "loss": 2.440071868896484, "memory(GiB)": 121.15, "step": 4625, "token_acc": 0.4805945499587118, "train_speed(iter/s)": 1.197079 }, { "epoch": 1.5867032213845098, "grad_norm": 1.221786379814148, "learning_rate": 7.714310226755441e-05, "loss": 2.469926452636719, "memory(GiB)": 121.15, "step": 4630, "token_acc": 0.46809440559440557, "train_speed(iter/s)": 1.197201 }, { "epoch": 1.5884167237834133, "grad_norm": 1.2182178497314453, "learning_rate": 7.709787787031728e-05, "loss": 2.4945531845092774, "memory(GiB)": 121.15, "step": 4635, "token_acc": 0.4877663772691397, "train_speed(iter/s)": 1.197126 }, { "epoch": 1.5901302261823167, "grad_norm": 1.2302736043930054, "learning_rate": 7.705262206333523e-05, "loss": 2.421390724182129, "memory(GiB)": 121.15, "step": 4640, "token_acc": 0.50418410041841, "train_speed(iter/s)": 1.197257 }, { "epoch": 1.5918437285812201, "grad_norm": 1.2372007369995117, "learning_rate": 7.700733489906527e-05, "loss": 2.343089294433594, "memory(GiB)": 121.15, "step": 4645, "token_acc": 0.5134665508253693, "train_speed(iter/s)": 1.197221 }, { "epoch": 1.5935572309801234, "grad_norm": 1.2642532587051392, "learning_rate": 7.696201643000074e-05, "loss": 2.4774394989013673, "memory(GiB)": 121.15, "step": 4650, "token_acc": 0.48438818565400843, "train_speed(iter/s)": 1.196948 }, { "epoch": 1.5952707333790266, "grad_norm": 1.4041543006896973, "learning_rate": 7.691666670867127e-05, "loss": 2.4312694549560545, "memory(GiB)": 121.15, "step": 4655, "token_acc": 0.4832104832104832, "train_speed(iter/s)": 1.197035 }, { "epoch": 1.59698423577793, "grad_norm": 1.1615498065948486, "learning_rate": 7.687128578764273e-05, "loss": 2.3431299209594725, "memory(GiB)": 121.15, "step": 4660, "token_acc": 0.5043706293706294, "train_speed(iter/s)": 1.197044 }, { "epoch": 1.5986977381768335, "grad_norm": 1.2503665685653687, "learning_rate": 7.68258737195171e-05, "loss": 2.346792984008789, "memory(GiB)": 121.15, "step": 4665, "token_acc": 0.5014299332697807, "train_speed(iter/s)": 1.197158 }, { "epoch": 1.600411240575737, "grad_norm": 1.2230453491210938, "learning_rate": 7.678043055693258e-05, "loss": 2.3909639358520507, "memory(GiB)": 121.15, "step": 4670, "token_acc": 0.4894532931554025, "train_speed(iter/s)": 1.197264 }, { "epoch": 1.6021247429746401, "grad_norm": 1.3277429342269897, "learning_rate": 7.673495635256326e-05, "loss": 2.423063850402832, "memory(GiB)": 121.15, "step": 4675, "token_acc": 0.48590021691973967, "train_speed(iter/s)": 1.197357 }, { "epoch": 1.6038382453735434, "grad_norm": 1.2097318172454834, "learning_rate": 7.668945115911934e-05, "loss": 2.3887651443481444, "memory(GiB)": 121.15, "step": 4680, "token_acc": 0.4971302428256071, "train_speed(iter/s)": 1.197484 }, { "epoch": 1.6055517477724468, "grad_norm": 1.2819020748138428, "learning_rate": 7.664391502934687e-05, "loss": 2.3886045455932616, "memory(GiB)": 121.15, "step": 4685, "token_acc": 0.5072961373390558, "train_speed(iter/s)": 1.197394 }, { "epoch": 1.6072652501713502, "grad_norm": 1.3107560873031616, "learning_rate": 7.659834801602777e-05, "loss": 2.3885826110839843, "memory(GiB)": 121.15, "step": 4690, "token_acc": 0.5031525851197982, "train_speed(iter/s)": 1.197535 }, { "epoch": 1.6089787525702537, "grad_norm": 1.3957180976867676, "learning_rate": 7.655275017197976e-05, "loss": 2.48367977142334, "memory(GiB)": 121.15, "step": 4695, "token_acc": 0.49210963455149503, "train_speed(iter/s)": 1.197602 }, { "epoch": 1.610692254969157, "grad_norm": 1.3271739482879639, "learning_rate": 7.650712155005632e-05, "loss": 2.442237663269043, "memory(GiB)": 121.15, "step": 4700, "token_acc": 0.5004440497335702, "train_speed(iter/s)": 1.197462 }, { "epoch": 1.6124057573680604, "grad_norm": 1.3151746988296509, "learning_rate": 7.64614622031466e-05, "loss": 2.4514976501464845, "memory(GiB)": 121.15, "step": 4705, "token_acc": 0.4840686274509804, "train_speed(iter/s)": 1.197498 }, { "epoch": 1.6141192597669636, "grad_norm": 1.2759171724319458, "learning_rate": 7.641577218417532e-05, "loss": 2.417926216125488, "memory(GiB)": 121.15, "step": 4710, "token_acc": 0.4946512623020967, "train_speed(iter/s)": 1.19759 }, { "epoch": 1.615832762165867, "grad_norm": 1.1665434837341309, "learning_rate": 7.63700515461028e-05, "loss": 2.402243423461914, "memory(GiB)": 121.15, "step": 4715, "token_acc": 0.4805900621118012, "train_speed(iter/s)": 1.197206 }, { "epoch": 1.6175462645647705, "grad_norm": 1.1989632844924927, "learning_rate": 7.632430034192482e-05, "loss": 2.385320854187012, "memory(GiB)": 121.15, "step": 4720, "token_acc": 0.48748921484037966, "train_speed(iter/s)": 1.197246 }, { "epoch": 1.619259766963674, "grad_norm": 1.24038827419281, "learning_rate": 7.627851862467263e-05, "loss": 2.310911178588867, "memory(GiB)": 121.15, "step": 4725, "token_acc": 0.5053497942386831, "train_speed(iter/s)": 1.19715 }, { "epoch": 1.6209732693625771, "grad_norm": 1.300613522529602, "learning_rate": 7.623270644741279e-05, "loss": 2.5714017868041994, "memory(GiB)": 121.15, "step": 4730, "token_acc": 0.48450586264656614, "train_speed(iter/s)": 1.197172 }, { "epoch": 1.6226867717614804, "grad_norm": 1.2583612203598022, "learning_rate": 7.618686386324723e-05, "loss": 2.5071739196777343, "memory(GiB)": 121.15, "step": 4735, "token_acc": 0.48236371965185526, "train_speed(iter/s)": 1.197169 }, { "epoch": 1.6244002741603838, "grad_norm": 1.2039369344711304, "learning_rate": 7.614099092531308e-05, "loss": 2.3251901626586915, "memory(GiB)": 121.15, "step": 4740, "token_acc": 0.5140358950759318, "train_speed(iter/s)": 1.197265 }, { "epoch": 1.6261137765592872, "grad_norm": 1.2146568298339844, "learning_rate": 7.609508768678268e-05, "loss": 2.443262481689453, "memory(GiB)": 121.15, "step": 4745, "token_acc": 0.49274160099543757, "train_speed(iter/s)": 1.197316 }, { "epoch": 1.6278272789581907, "grad_norm": 1.201899766921997, "learning_rate": 7.604915420086347e-05, "loss": 2.4560928344726562, "memory(GiB)": 121.15, "step": 4750, "token_acc": 0.4755364806866953, "train_speed(iter/s)": 1.197281 }, { "epoch": 1.629540781357094, "grad_norm": 1.3217486143112183, "learning_rate": 7.600319052079796e-05, "loss": 2.387775993347168, "memory(GiB)": 121.15, "step": 4755, "token_acc": 0.5, "train_speed(iter/s)": 1.197379 }, { "epoch": 1.6312542837559971, "grad_norm": 1.3063288927078247, "learning_rate": 7.595719669986368e-05, "loss": 2.4642318725585937, "memory(GiB)": 121.15, "step": 4760, "token_acc": 0.48057813911472447, "train_speed(iter/s)": 1.197514 }, { "epoch": 1.6329677861549006, "grad_norm": 1.215790867805481, "learning_rate": 7.591117279137306e-05, "loss": 2.344537353515625, "memory(GiB)": 121.15, "step": 4765, "token_acc": 0.509067903838043, "train_speed(iter/s)": 1.197555 }, { "epoch": 1.634681288553804, "grad_norm": 1.456971526145935, "learning_rate": 7.58651188486734e-05, "loss": 2.298064422607422, "memory(GiB)": 121.15, "step": 4770, "token_acc": 0.4874888293118856, "train_speed(iter/s)": 1.197713 }, { "epoch": 1.6363947909527075, "grad_norm": 1.2006725072860718, "learning_rate": 7.581903492514688e-05, "loss": 2.4400835037231445, "memory(GiB)": 121.15, "step": 4775, "token_acc": 0.4901202749140893, "train_speed(iter/s)": 1.197832 }, { "epoch": 1.6381082933516107, "grad_norm": 1.2613848447799683, "learning_rate": 7.577292107421037e-05, "loss": 2.3947027206420897, "memory(GiB)": 121.15, "step": 4780, "token_acc": 0.48324140857021636, "train_speed(iter/s)": 1.19774 }, { "epoch": 1.639821795750514, "grad_norm": 1.1873891353607178, "learning_rate": 7.572677734931546e-05, "loss": 2.493109130859375, "memory(GiB)": 121.15, "step": 4785, "token_acc": 0.4767580452920143, "train_speed(iter/s)": 1.19778 }, { "epoch": 1.6415352981494173, "grad_norm": 1.1597732305526733, "learning_rate": 7.568060380394831e-05, "loss": 2.4144960403442384, "memory(GiB)": 121.15, "step": 4790, "token_acc": 0.4961306964746346, "train_speed(iter/s)": 1.197683 }, { "epoch": 1.6432488005483208, "grad_norm": 1.3092656135559082, "learning_rate": 7.563440049162973e-05, "loss": 2.274212646484375, "memory(GiB)": 121.15, "step": 4795, "token_acc": 0.5141488898563343, "train_speed(iter/s)": 1.197751 }, { "epoch": 1.6449623029472242, "grad_norm": 1.2231327295303345, "learning_rate": 7.558816746591499e-05, "loss": 2.420313262939453, "memory(GiB)": 121.15, "step": 4800, "token_acc": 0.504757785467128, "train_speed(iter/s)": 1.197541 }, { "epoch": 1.6466758053461275, "grad_norm": 1.167057752609253, "learning_rate": 7.554190478039378e-05, "loss": 2.4123043060302733, "memory(GiB)": 121.15, "step": 4805, "token_acc": 0.4959183673469388, "train_speed(iter/s)": 1.197561 }, { "epoch": 1.648389307745031, "grad_norm": 1.2618101835250854, "learning_rate": 7.549561248869021e-05, "loss": 2.484409141540527, "memory(GiB)": 121.15, "step": 4810, "token_acc": 0.4812239221140473, "train_speed(iter/s)": 1.197661 }, { "epoch": 1.6501028101439341, "grad_norm": 1.3207452297210693, "learning_rate": 7.544929064446274e-05, "loss": 2.4632183074951173, "memory(GiB)": 121.15, "step": 4815, "token_acc": 0.48621553884711777, "train_speed(iter/s)": 1.197632 }, { "epoch": 1.6518163125428376, "grad_norm": 1.2185578346252441, "learning_rate": 7.540293930140393e-05, "loss": 2.462504005432129, "memory(GiB)": 121.15, "step": 4820, "token_acc": 0.4866108786610879, "train_speed(iter/s)": 1.19759 }, { "epoch": 1.653529814941741, "grad_norm": 1.2456600666046143, "learning_rate": 7.53565585132407e-05, "loss": 2.4001928329467774, "memory(GiB)": 121.15, "step": 4825, "token_acc": 0.4847107438016529, "train_speed(iter/s)": 1.197608 }, { "epoch": 1.6552433173406442, "grad_norm": 1.5484812259674072, "learning_rate": 7.531014833373407e-05, "loss": 2.450650215148926, "memory(GiB)": 121.15, "step": 4830, "token_acc": 0.4879518072289157, "train_speed(iter/s)": 1.197441 }, { "epoch": 1.6569568197395477, "grad_norm": 1.4388456344604492, "learning_rate": 7.526370881667906e-05, "loss": 2.345694732666016, "memory(GiB)": 121.15, "step": 4835, "token_acc": 0.5088652482269503, "train_speed(iter/s)": 1.197409 }, { "epoch": 1.658670322138451, "grad_norm": 1.215116024017334, "learning_rate": 7.521724001590472e-05, "loss": 2.52197208404541, "memory(GiB)": 121.15, "step": 4840, "token_acc": 0.47324281150159747, "train_speed(iter/s)": 1.197318 }, { "epoch": 1.6603838245373543, "grad_norm": 1.3573802709579468, "learning_rate": 7.517074198527407e-05, "loss": 2.4009368896484373, "memory(GiB)": 121.15, "step": 4845, "token_acc": 0.49383802816901406, "train_speed(iter/s)": 1.197297 }, { "epoch": 1.6620973269362578, "grad_norm": 1.3217761516571045, "learning_rate": 7.512421477868402e-05, "loss": 2.2715732574462892, "memory(GiB)": 121.15, "step": 4850, "token_acc": 0.5028621752531924, "train_speed(iter/s)": 1.19734 }, { "epoch": 1.6638108293351612, "grad_norm": 1.3204689025878906, "learning_rate": 7.507765845006525e-05, "loss": 2.3557043075561523, "memory(GiB)": 121.15, "step": 4855, "token_acc": 0.5053619302949062, "train_speed(iter/s)": 1.197412 }, { "epoch": 1.6655243317340644, "grad_norm": 1.3034210205078125, "learning_rate": 7.503107305338224e-05, "loss": 2.3246623992919924, "memory(GiB)": 121.15, "step": 4860, "token_acc": 0.49414414414414415, "train_speed(iter/s)": 1.197525 }, { "epoch": 1.6672378341329677, "grad_norm": 1.2075061798095703, "learning_rate": 7.498445864263312e-05, "loss": 2.311963653564453, "memory(GiB)": 121.15, "step": 4865, "token_acc": 0.4956597222222222, "train_speed(iter/s)": 1.197449 }, { "epoch": 1.668951336531871, "grad_norm": 1.335494875907898, "learning_rate": 7.493781527184968e-05, "loss": 2.465910720825195, "memory(GiB)": 121.15, "step": 4870, "token_acc": 0.48500428449014565, "train_speed(iter/s)": 1.197464 }, { "epoch": 1.6706648389307746, "grad_norm": 1.1777746677398682, "learning_rate": 7.489114299509732e-05, "loss": 2.4988330841064452, "memory(GiB)": 121.15, "step": 4875, "token_acc": 0.4797047970479705, "train_speed(iter/s)": 1.197465 }, { "epoch": 1.672378341329678, "grad_norm": 1.2024515867233276, "learning_rate": 7.484444186647485e-05, "loss": 2.4079551696777344, "memory(GiB)": 121.15, "step": 4880, "token_acc": 0.48181083265966046, "train_speed(iter/s)": 1.197297 }, { "epoch": 1.6740918437285812, "grad_norm": 1.1642180681228638, "learning_rate": 7.47977119401146e-05, "loss": 2.4010700225830077, "memory(GiB)": 121.15, "step": 4885, "token_acc": 0.5042844901456727, "train_speed(iter/s)": 1.197361 }, { "epoch": 1.6758053461274844, "grad_norm": 1.2267218828201294, "learning_rate": 7.475095327018223e-05, "loss": 2.367637825012207, "memory(GiB)": 121.15, "step": 4890, "token_acc": 0.5015002143163309, "train_speed(iter/s)": 1.197289 }, { "epoch": 1.6775188485263879, "grad_norm": 1.4971561431884766, "learning_rate": 7.470416591087677e-05, "loss": 2.3228084564208986, "memory(GiB)": 121.15, "step": 4895, "token_acc": 0.5080190723883832, "train_speed(iter/s)": 1.197311 }, { "epoch": 1.6792323509252913, "grad_norm": 1.2891055345535278, "learning_rate": 7.465734991643045e-05, "loss": 2.3747154235839845, "memory(GiB)": 121.15, "step": 4900, "token_acc": 0.4878366637706342, "train_speed(iter/s)": 1.197333 }, { "epoch": 1.6809458533241948, "grad_norm": 1.3809009790420532, "learning_rate": 7.461050534110871e-05, "loss": 2.4666973114013673, "memory(GiB)": 121.15, "step": 4905, "token_acc": 0.4758228362454287, "train_speed(iter/s)": 1.19722 }, { "epoch": 1.682659355723098, "grad_norm": 1.5265142917633057, "learning_rate": 7.456363223921017e-05, "loss": 2.363654899597168, "memory(GiB)": 121.15, "step": 4910, "token_acc": 0.4919627749576988, "train_speed(iter/s)": 1.197334 }, { "epoch": 1.6843728581220012, "grad_norm": 1.3125158548355103, "learning_rate": 7.451673066506643e-05, "loss": 2.427523612976074, "memory(GiB)": 121.15, "step": 4915, "token_acc": 0.4886897140418267, "train_speed(iter/s)": 1.197398 }, { "epoch": 1.6860863605209047, "grad_norm": 1.4760777950286865, "learning_rate": 7.446980067304214e-05, "loss": 2.3164108276367186, "memory(GiB)": 121.15, "step": 4920, "token_acc": 0.4941808415398389, "train_speed(iter/s)": 1.197487 }, { "epoch": 1.687799862919808, "grad_norm": 1.5499227046966553, "learning_rate": 7.44228423175349e-05, "loss": 2.358456039428711, "memory(GiB)": 121.15, "step": 4925, "token_acc": 0.5125177137458667, "train_speed(iter/s)": 1.19756 }, { "epoch": 1.6895133653187115, "grad_norm": 1.3509254455566406, "learning_rate": 7.437585565297516e-05, "loss": 2.454172134399414, "memory(GiB)": 121.15, "step": 4930, "token_acc": 0.4847027972027972, "train_speed(iter/s)": 1.197512 }, { "epoch": 1.6912268677176148, "grad_norm": 1.3138846158981323, "learning_rate": 7.432884073382618e-05, "loss": 2.426289367675781, "memory(GiB)": 121.15, "step": 4935, "token_acc": 0.4844591794446747, "train_speed(iter/s)": 1.197533 }, { "epoch": 1.6929403701165182, "grad_norm": 1.359837293624878, "learning_rate": 7.4281797614584e-05, "loss": 2.3388341903686523, "memory(GiB)": 121.15, "step": 4940, "token_acc": 0.5043327556325823, "train_speed(iter/s)": 1.197561 }, { "epoch": 1.6946538725154214, "grad_norm": 1.1416168212890625, "learning_rate": 7.423472634977736e-05, "loss": 2.2894289016723635, "memory(GiB)": 121.15, "step": 4945, "token_acc": 0.506215173596228, "train_speed(iter/s)": 1.197671 }, { "epoch": 1.6963673749143249, "grad_norm": 1.5766090154647827, "learning_rate": 7.418762699396751e-05, "loss": 2.5473079681396484, "memory(GiB)": 121.15, "step": 4950, "token_acc": 0.47230571060541005, "train_speed(iter/s)": 1.197748 }, { "epoch": 1.6980808773132283, "grad_norm": 1.2188444137573242, "learning_rate": 7.414049960174845e-05, "loss": 2.4649171829223633, "memory(GiB)": 121.15, "step": 4955, "token_acc": 0.48721023181454837, "train_speed(iter/s)": 1.197605 }, { "epoch": 1.6997943797121315, "grad_norm": 1.3534198999404907, "learning_rate": 7.409334422774652e-05, "loss": 2.552993965148926, "memory(GiB)": 121.15, "step": 4960, "token_acc": 0.4766069086139047, "train_speed(iter/s)": 1.19768 }, { "epoch": 1.701507882111035, "grad_norm": 1.3040398359298706, "learning_rate": 7.404616092662053e-05, "loss": 2.4946054458618163, "memory(GiB)": 121.15, "step": 4965, "token_acc": 0.47532362459546923, "train_speed(iter/s)": 1.197731 }, { "epoch": 1.7032213845099382, "grad_norm": 1.1915825605392456, "learning_rate": 7.399894975306174e-05, "loss": 2.4039541244506837, "memory(GiB)": 121.15, "step": 4970, "token_acc": 0.49151857835218093, "train_speed(iter/s)": 1.197451 }, { "epoch": 1.7049348869088417, "grad_norm": 1.3802520036697388, "learning_rate": 7.395171076179362e-05, "loss": 2.289769744873047, "memory(GiB)": 121.15, "step": 4975, "token_acc": 0.5086607520067596, "train_speed(iter/s)": 1.197472 }, { "epoch": 1.706648389307745, "grad_norm": 1.439793586730957, "learning_rate": 7.390444400757191e-05, "loss": 2.3692941665649414, "memory(GiB)": 121.15, "step": 4980, "token_acc": 0.4943868739205527, "train_speed(iter/s)": 1.197513 }, { "epoch": 1.7083618917066485, "grad_norm": 1.2987219095230103, "learning_rate": 7.385714954518454e-05, "loss": 2.424397850036621, "memory(GiB)": 121.15, "step": 4985, "token_acc": 0.48625792811839325, "train_speed(iter/s)": 1.197627 }, { "epoch": 1.7100753941055518, "grad_norm": 1.4580899477005005, "learning_rate": 7.380982742945161e-05, "loss": 2.424925422668457, "memory(GiB)": 121.15, "step": 4990, "token_acc": 0.5027075812274369, "train_speed(iter/s)": 1.197704 }, { "epoch": 1.711788896504455, "grad_norm": 1.253578543663025, "learning_rate": 7.376247771522517e-05, "loss": 2.439616394042969, "memory(GiB)": 121.15, "step": 4995, "token_acc": 0.4965263588067021, "train_speed(iter/s)": 1.197646 }, { "epoch": 1.7135023989033584, "grad_norm": 1.2150100469589233, "learning_rate": 7.371510045738934e-05, "loss": 2.4213104248046875, "memory(GiB)": 121.15, "step": 5000, "token_acc": 0.4893797790994053, "train_speed(iter/s)": 1.197699 }, { "epoch": 1.7135023989033584, "eval_loss": 1.9223049879074097, "eval_runtime": 4.1655, "eval_samples_per_second": 24.007, "eval_steps_per_second": 24.007, "eval_token_acc": 0.528169014084507, "step": 5000 }, { "epoch": 1.7152159013022619, "grad_norm": 1.2026276588439941, "learning_rate": 7.366769571086014e-05, "loss": 2.421061706542969, "memory(GiB)": 121.15, "step": 5005, "token_acc": 0.5001578781180929, "train_speed(iter/s)": 1.196155 }, { "epoch": 1.7169294037011653, "grad_norm": 1.1675009727478027, "learning_rate": 7.362026353058545e-05, "loss": 2.4344688415527345, "memory(GiB)": 121.15, "step": 5010, "token_acc": 0.4983065198983912, "train_speed(iter/s)": 1.19624 }, { "epoch": 1.7186429061000685, "grad_norm": 1.422363519668579, "learning_rate": 7.357280397154492e-05, "loss": 2.4616539001464846, "memory(GiB)": 121.15, "step": 5015, "token_acc": 0.4758650149508757, "train_speed(iter/s)": 1.196071 }, { "epoch": 1.7203564084989718, "grad_norm": 1.2639539241790771, "learning_rate": 7.352531708875e-05, "loss": 2.351002311706543, "memory(GiB)": 121.15, "step": 5020, "token_acc": 0.5072836332476436, "train_speed(iter/s)": 1.196145 }, { "epoch": 1.7220699108978752, "grad_norm": 1.590264081954956, "learning_rate": 7.347780293724378e-05, "loss": 2.235446548461914, "memory(GiB)": 121.15, "step": 5025, "token_acc": 0.5185185185185185, "train_speed(iter/s)": 1.196196 }, { "epoch": 1.7237834132967786, "grad_norm": 1.2797834873199463, "learning_rate": 7.343026157210093e-05, "loss": 2.299345779418945, "memory(GiB)": 121.15, "step": 5030, "token_acc": 0.498903989478299, "train_speed(iter/s)": 1.196168 }, { "epoch": 1.725496915695682, "grad_norm": 1.219597578048706, "learning_rate": 7.338269304842771e-05, "loss": 2.4794694900512697, "memory(GiB)": 121.15, "step": 5035, "token_acc": 0.48799668874172186, "train_speed(iter/s)": 1.196243 }, { "epoch": 1.7272104180945853, "grad_norm": 1.2706630229949951, "learning_rate": 7.333509742136185e-05, "loss": 2.3702627182006837, "memory(GiB)": 121.15, "step": 5040, "token_acc": 0.5114076625053809, "train_speed(iter/s)": 1.196094 }, { "epoch": 1.7289239204934885, "grad_norm": 1.147678017616272, "learning_rate": 7.328747474607244e-05, "loss": 2.3807491302490233, "memory(GiB)": 121.15, "step": 5045, "token_acc": 0.5085168259243872, "train_speed(iter/s)": 1.196107 }, { "epoch": 1.730637422892392, "grad_norm": 1.2982813119888306, "learning_rate": 7.323982507776001e-05, "loss": 2.5088169097900392, "memory(GiB)": 121.15, "step": 5050, "token_acc": 0.48125, "train_speed(iter/s)": 1.196158 }, { "epoch": 1.7323509252912954, "grad_norm": 1.260150671005249, "learning_rate": 7.319214847165632e-05, "loss": 2.395477294921875, "memory(GiB)": 121.15, "step": 5055, "token_acc": 0.47485080988917305, "train_speed(iter/s)": 1.196124 }, { "epoch": 1.7340644276901989, "grad_norm": 1.4429035186767578, "learning_rate": 7.314444498302439e-05, "loss": 2.317232131958008, "memory(GiB)": 121.15, "step": 5060, "token_acc": 0.506029477445288, "train_speed(iter/s)": 1.196055 }, { "epoch": 1.735777930089102, "grad_norm": 1.3191522359848022, "learning_rate": 7.309671466715834e-05, "loss": 2.414741325378418, "memory(GiB)": 121.15, "step": 5065, "token_acc": 0.48784507622579315, "train_speed(iter/s)": 1.196078 }, { "epoch": 1.7374914324880055, "grad_norm": 1.3446946144104004, "learning_rate": 7.304895757938349e-05, "loss": 2.3309322357177735, "memory(GiB)": 121.15, "step": 5070, "token_acc": 0.4948154292824554, "train_speed(iter/s)": 1.196141 }, { "epoch": 1.7392049348869087, "grad_norm": 1.3731722831726074, "learning_rate": 7.300117377505606e-05, "loss": 2.4624984741210936, "memory(GiB)": 121.15, "step": 5075, "token_acc": 0.4835030549898167, "train_speed(iter/s)": 1.196267 }, { "epoch": 1.7409184372858122, "grad_norm": 1.4737749099731445, "learning_rate": 7.295336330956335e-05, "loss": 2.4912227630615233, "memory(GiB)": 121.15, "step": 5080, "token_acc": 0.48315098468271334, "train_speed(iter/s)": 1.196147 }, { "epoch": 1.7426319396847156, "grad_norm": 1.2870268821716309, "learning_rate": 7.29055262383235e-05, "loss": 2.4697399139404297, "memory(GiB)": 121.15, "step": 5085, "token_acc": 0.4804804804804805, "train_speed(iter/s)": 1.196199 }, { "epoch": 1.744345442083619, "grad_norm": 1.2554540634155273, "learning_rate": 7.285766261678551e-05, "loss": 2.3569877624511717, "memory(GiB)": 121.15, "step": 5090, "token_acc": 0.4932520679146713, "train_speed(iter/s)": 1.196293 }, { "epoch": 1.7460589444825223, "grad_norm": 1.4504461288452148, "learning_rate": 7.280977250042916e-05, "loss": 2.497797966003418, "memory(GiB)": 121.15, "step": 5095, "token_acc": 0.47994880546075086, "train_speed(iter/s)": 1.196288 }, { "epoch": 1.7477724468814255, "grad_norm": 1.245123028755188, "learning_rate": 7.276185594476494e-05, "loss": 2.4098968505859375, "memory(GiB)": 121.15, "step": 5100, "token_acc": 0.4856548856548857, "train_speed(iter/s)": 1.196284 }, { "epoch": 1.749485949280329, "grad_norm": 1.2245147228240967, "learning_rate": 7.271391300533398e-05, "loss": 2.358654022216797, "memory(GiB)": 121.15, "step": 5105, "token_acc": 0.5010679196924391, "train_speed(iter/s)": 1.196147 }, { "epoch": 1.7511994516792324, "grad_norm": 1.3872697353363037, "learning_rate": 7.266594373770797e-05, "loss": 2.411460113525391, "memory(GiB)": 121.15, "step": 5110, "token_acc": 0.49911504424778763, "train_speed(iter/s)": 1.196221 }, { "epoch": 1.7529129540781359, "grad_norm": 1.2773380279541016, "learning_rate": 7.261794819748918e-05, "loss": 2.4140913009643556, "memory(GiB)": 121.15, "step": 5115, "token_acc": 0.4821576763485477, "train_speed(iter/s)": 1.196277 }, { "epoch": 1.754626456477039, "grad_norm": 1.376909852027893, "learning_rate": 7.256992644031027e-05, "loss": 2.458011245727539, "memory(GiB)": 121.15, "step": 5120, "token_acc": 0.47845711940910957, "train_speed(iter/s)": 1.196349 }, { "epoch": 1.7563399588759423, "grad_norm": 1.3013631105422974, "learning_rate": 7.252187852183432e-05, "loss": 2.449685478210449, "memory(GiB)": 121.15, "step": 5125, "token_acc": 0.4836379090522737, "train_speed(iter/s)": 1.196398 }, { "epoch": 1.7580534612748457, "grad_norm": 1.17084538936615, "learning_rate": 7.247380449775472e-05, "loss": 2.356489372253418, "memory(GiB)": 121.15, "step": 5130, "token_acc": 0.5033641715727503, "train_speed(iter/s)": 1.196465 }, { "epoch": 1.7597669636737492, "grad_norm": 1.6032733917236328, "learning_rate": 7.242570442379512e-05, "loss": 2.4103275299072267, "memory(GiB)": 121.15, "step": 5135, "token_acc": 0.49898744430943703, "train_speed(iter/s)": 1.196532 }, { "epoch": 1.7614804660726526, "grad_norm": 1.3157172203063965, "learning_rate": 7.237757835570938e-05, "loss": 2.4408206939697266, "memory(GiB)": 121.15, "step": 5140, "token_acc": 0.49331662489557226, "train_speed(iter/s)": 1.196544 }, { "epoch": 1.7631939684715559, "grad_norm": 1.1885268688201904, "learning_rate": 7.232942634928149e-05, "loss": 2.369022178649902, "memory(GiB)": 121.15, "step": 5145, "token_acc": 0.5132340777502068, "train_speed(iter/s)": 1.196484 }, { "epoch": 1.764907470870459, "grad_norm": 1.1398416757583618, "learning_rate": 7.22812484603255e-05, "loss": 2.428313446044922, "memory(GiB)": 121.15, "step": 5150, "token_acc": 0.4818430300663803, "train_speed(iter/s)": 1.19642 }, { "epoch": 1.7666209732693625, "grad_norm": 1.2544193267822266, "learning_rate": 7.223304474468541e-05, "loss": 2.412771224975586, "memory(GiB)": 121.15, "step": 5155, "token_acc": 0.4894168466522678, "train_speed(iter/s)": 1.196404 }, { "epoch": 1.768334475668266, "grad_norm": 1.4088751077651978, "learning_rate": 7.218481525823528e-05, "loss": 2.449677276611328, "memory(GiB)": 121.15, "step": 5160, "token_acc": 0.5015815634884772, "train_speed(iter/s)": 1.196447 }, { "epoch": 1.7700479780671694, "grad_norm": 1.243284821510315, "learning_rate": 7.213656005687893e-05, "loss": 2.5565948486328125, "memory(GiB)": 121.15, "step": 5165, "token_acc": 0.47590870667793744, "train_speed(iter/s)": 1.196518 }, { "epoch": 1.7717614804660726, "grad_norm": 1.2429063320159912, "learning_rate": 7.208827919654999e-05, "loss": 2.3683643341064453, "memory(GiB)": 121.15, "step": 5170, "token_acc": 0.4934405416842996, "train_speed(iter/s)": 1.196567 }, { "epoch": 1.7734749828649758, "grad_norm": 1.1386752128601074, "learning_rate": 7.203997273321192e-05, "loss": 2.3264528274536134, "memory(GiB)": 121.15, "step": 5175, "token_acc": 0.5169971671388102, "train_speed(iter/s)": 1.196625 }, { "epoch": 1.7751884852638793, "grad_norm": 1.42714262008667, "learning_rate": 7.199164072285778e-05, "loss": 2.423206901550293, "memory(GiB)": 121.15, "step": 5180, "token_acc": 0.49518810148731407, "train_speed(iter/s)": 1.196704 }, { "epoch": 1.7769019876627827, "grad_norm": 1.2523210048675537, "learning_rate": 7.194328322151024e-05, "loss": 2.443472671508789, "memory(GiB)": 121.15, "step": 5185, "token_acc": 0.48068134607395097, "train_speed(iter/s)": 1.19678 }, { "epoch": 1.7786154900616862, "grad_norm": 1.3115876913070679, "learning_rate": 7.189490028522158e-05, "loss": 2.4518531799316405, "memory(GiB)": 121.15, "step": 5190, "token_acc": 0.5054852320675105, "train_speed(iter/s)": 1.196698 }, { "epoch": 1.7803289924605894, "grad_norm": 1.2455567121505737, "learning_rate": 7.184649197007351e-05, "loss": 2.431066131591797, "memory(GiB)": 121.15, "step": 5195, "token_acc": 0.48316571928290336, "train_speed(iter/s)": 1.196748 }, { "epoch": 1.7820424948594928, "grad_norm": 1.1394755840301514, "learning_rate": 7.179805833217715e-05, "loss": 2.39774169921875, "memory(GiB)": 121.15, "step": 5200, "token_acc": 0.49237113402061855, "train_speed(iter/s)": 1.196762 }, { "epoch": 1.783755997258396, "grad_norm": 1.4447901248931885, "learning_rate": 7.174959942767303e-05, "loss": 2.3093379974365233, "memory(GiB)": 121.15, "step": 5205, "token_acc": 0.5085714285714286, "train_speed(iter/s)": 1.196884 }, { "epoch": 1.7854694996572995, "grad_norm": 1.1830451488494873, "learning_rate": 7.170111531273089e-05, "loss": 2.4190546035766602, "memory(GiB)": 121.15, "step": 5210, "token_acc": 0.49524203558129914, "train_speed(iter/s)": 1.196675 }, { "epoch": 1.787183002056203, "grad_norm": 1.3000568151474, "learning_rate": 7.165260604354976e-05, "loss": 2.3712133407592773, "memory(GiB)": 121.15, "step": 5215, "token_acc": 0.5092235092235092, "train_speed(iter/s)": 1.196697 }, { "epoch": 1.7888965044551064, "grad_norm": 1.268646478652954, "learning_rate": 7.160407167635779e-05, "loss": 2.389765167236328, "memory(GiB)": 121.15, "step": 5220, "token_acc": 0.4866920152091255, "train_speed(iter/s)": 1.196757 }, { "epoch": 1.7906100068540096, "grad_norm": 1.2979974746704102, "learning_rate": 7.155551226741223e-05, "loss": 2.389493942260742, "memory(GiB)": 121.15, "step": 5225, "token_acc": 0.5103857566765578, "train_speed(iter/s)": 1.19674 }, { "epoch": 1.7923235092529128, "grad_norm": 1.2965034246444702, "learning_rate": 7.150692787299936e-05, "loss": 2.426810455322266, "memory(GiB)": 121.15, "step": 5230, "token_acc": 0.49468954248366015, "train_speed(iter/s)": 1.19682 }, { "epoch": 1.7940370116518163, "grad_norm": 1.7855819463729858, "learning_rate": 7.14583185494344e-05, "loss": 2.4149410247802736, "memory(GiB)": 121.15, "step": 5235, "token_acc": 0.49786507258753204, "train_speed(iter/s)": 1.196774 }, { "epoch": 1.7957505140507197, "grad_norm": 1.2461841106414795, "learning_rate": 7.14096843530615e-05, "loss": 2.39089412689209, "memory(GiB)": 121.15, "step": 5240, "token_acc": 0.48201127004768096, "train_speed(iter/s)": 1.196887 }, { "epoch": 1.7974640164496232, "grad_norm": 1.3028051853179932, "learning_rate": 7.136102534025361e-05, "loss": 2.413047218322754, "memory(GiB)": 121.15, "step": 5245, "token_acc": 0.49605645496056455, "train_speed(iter/s)": 1.196928 }, { "epoch": 1.7991775188485264, "grad_norm": 1.3146750926971436, "learning_rate": 7.13123415674125e-05, "loss": 2.348434257507324, "memory(GiB)": 121.15, "step": 5250, "token_acc": 0.4973129392310872, "train_speed(iter/s)": 1.196788 }, { "epoch": 1.8008910212474296, "grad_norm": 3.940899133682251, "learning_rate": 7.126363309096856e-05, "loss": 2.4444360733032227, "memory(GiB)": 121.15, "step": 5255, "token_acc": 0.4858509366281387, "train_speed(iter/s)": 1.196684 }, { "epoch": 1.802604523646333, "grad_norm": 1.414834976196289, "learning_rate": 7.121489996738085e-05, "loss": 2.4238956451416014, "memory(GiB)": 121.15, "step": 5260, "token_acc": 0.48103874690849135, "train_speed(iter/s)": 1.196598 }, { "epoch": 1.8043180260452365, "grad_norm": 1.2292766571044922, "learning_rate": 7.116614225313702e-05, "loss": 2.338291549682617, "memory(GiB)": 121.15, "step": 5265, "token_acc": 0.49568221070811747, "train_speed(iter/s)": 1.196517 }, { "epoch": 1.80603152844414, "grad_norm": 1.1891450881958008, "learning_rate": 7.111736000475324e-05, "loss": 2.360321044921875, "memory(GiB)": 121.15, "step": 5270, "token_acc": 0.49935539321014183, "train_speed(iter/s)": 1.196408 }, { "epoch": 1.8077450308430432, "grad_norm": 1.262709140777588, "learning_rate": 7.106855327877401e-05, "loss": 2.426603317260742, "memory(GiB)": 121.15, "step": 5275, "token_acc": 0.4843485617597293, "train_speed(iter/s)": 1.19647 }, { "epoch": 1.8094585332419464, "grad_norm": 1.4121195077896118, "learning_rate": 7.101972213177233e-05, "loss": 2.437047576904297, "memory(GiB)": 121.15, "step": 5280, "token_acc": 0.4816573154941735, "train_speed(iter/s)": 1.196535 }, { "epoch": 1.8111720356408498, "grad_norm": 1.2512810230255127, "learning_rate": 7.097086662034944e-05, "loss": 2.4278633117675783, "memory(GiB)": 121.15, "step": 5285, "token_acc": 0.4778834229020256, "train_speed(iter/s)": 1.196539 }, { "epoch": 1.8128855380397533, "grad_norm": 1.1546963453292847, "learning_rate": 7.092198680113483e-05, "loss": 2.3532032012939452, "memory(GiB)": 121.15, "step": 5290, "token_acc": 0.5069060773480663, "train_speed(iter/s)": 1.196574 }, { "epoch": 1.8145990404386567, "grad_norm": 1.2150464057922363, "learning_rate": 7.087308273078617e-05, "loss": 2.4294673919677736, "memory(GiB)": 121.15, "step": 5295, "token_acc": 0.4886267902274642, "train_speed(iter/s)": 1.196491 }, { "epoch": 1.81631254283756, "grad_norm": 1.3174079656600952, "learning_rate": 7.082415446598927e-05, "loss": 2.375569725036621, "memory(GiB)": 121.15, "step": 5300, "token_acc": 0.5045083726921425, "train_speed(iter/s)": 1.196633 }, { "epoch": 1.8180260452364634, "grad_norm": 1.297182559967041, "learning_rate": 7.077520206345791e-05, "loss": 2.253119468688965, "memory(GiB)": 121.15, "step": 5305, "token_acc": 0.5176413726437893, "train_speed(iter/s)": 1.196684 }, { "epoch": 1.8197395476353666, "grad_norm": 1.4139959812164307, "learning_rate": 7.072622557993394e-05, "loss": 2.3813846588134764, "memory(GiB)": 121.15, "step": 5310, "token_acc": 0.5051592642440557, "train_speed(iter/s)": 1.196759 }, { "epoch": 1.82145305003427, "grad_norm": 1.2398375272750854, "learning_rate": 7.067722507218703e-05, "loss": 2.348771667480469, "memory(GiB)": 121.15, "step": 5315, "token_acc": 0.49027072758037227, "train_speed(iter/s)": 1.196825 }, { "epoch": 1.8231665524331735, "grad_norm": 1.3225500583648682, "learning_rate": 7.062820059701478e-05, "loss": 2.305897521972656, "memory(GiB)": 121.15, "step": 5320, "token_acc": 0.5106290672451194, "train_speed(iter/s)": 1.196883 }, { "epoch": 1.8248800548320767, "grad_norm": 1.5555260181427002, "learning_rate": 7.057915221124253e-05, "loss": 2.441262054443359, "memory(GiB)": 121.15, "step": 5325, "token_acc": 0.5102725366876311, "train_speed(iter/s)": 1.196833 }, { "epoch": 1.8265935572309802, "grad_norm": 1.1822102069854736, "learning_rate": 7.053007997172333e-05, "loss": 2.4682416915893555, "memory(GiB)": 121.15, "step": 5330, "token_acc": 0.4859504132231405, "train_speed(iter/s)": 1.1969 }, { "epoch": 1.8283070596298834, "grad_norm": 1.353039264678955, "learning_rate": 7.048098393533789e-05, "loss": 2.498739814758301, "memory(GiB)": 121.15, "step": 5335, "token_acc": 0.47984224364592465, "train_speed(iter/s)": 1.196841 }, { "epoch": 1.8300205620287868, "grad_norm": 1.2120466232299805, "learning_rate": 7.04318641589945e-05, "loss": 2.4457645416259766, "memory(GiB)": 121.15, "step": 5340, "token_acc": 0.4951945080091533, "train_speed(iter/s)": 1.196921 }, { "epoch": 1.8317340644276903, "grad_norm": 1.1471744775772095, "learning_rate": 7.038272069962898e-05, "loss": 2.3350343704223633, "memory(GiB)": 121.15, "step": 5345, "token_acc": 0.5005978477481068, "train_speed(iter/s)": 1.196938 }, { "epoch": 1.8334475668265937, "grad_norm": 1.2102179527282715, "learning_rate": 7.03335536142046e-05, "loss": 2.359125328063965, "memory(GiB)": 121.15, "step": 5350, "token_acc": 0.4896640826873385, "train_speed(iter/s)": 1.197006 }, { "epoch": 1.835161069225497, "grad_norm": 1.1468292474746704, "learning_rate": 7.028436295971199e-05, "loss": 2.3943155288696287, "memory(GiB)": 121.15, "step": 5355, "token_acc": 0.49873949579831933, "train_speed(iter/s)": 1.196914 }, { "epoch": 1.8368745716244002, "grad_norm": 1.2948565483093262, "learning_rate": 7.023514879316912e-05, "loss": 2.410904884338379, "memory(GiB)": 121.15, "step": 5360, "token_acc": 0.48828451882845186, "train_speed(iter/s)": 1.196976 }, { "epoch": 1.8385880740233036, "grad_norm": 1.2564729452133179, "learning_rate": 7.01859111716212e-05, "loss": 2.417181968688965, "memory(GiB)": 121.15, "step": 5365, "token_acc": 0.48138178555406014, "train_speed(iter/s)": 1.197042 }, { "epoch": 1.840301576422207, "grad_norm": 1.316306233406067, "learning_rate": 7.01366501521407e-05, "loss": 2.310393714904785, "memory(GiB)": 121.15, "step": 5370, "token_acc": 0.5127877237851662, "train_speed(iter/s)": 1.197045 }, { "epoch": 1.8420150788211105, "grad_norm": 1.354409098625183, "learning_rate": 7.008736579182705e-05, "loss": 2.44771671295166, "memory(GiB)": 121.15, "step": 5375, "token_acc": 0.48467600700525393, "train_speed(iter/s)": 1.197139 }, { "epoch": 1.8437285812200137, "grad_norm": 1.160861849784851, "learning_rate": 7.003805814780692e-05, "loss": 2.3886240005493162, "memory(GiB)": 121.15, "step": 5380, "token_acc": 0.4925249169435216, "train_speed(iter/s)": 1.196942 }, { "epoch": 1.845442083618917, "grad_norm": 1.4434077739715576, "learning_rate": 6.998872727723383e-05, "loss": 2.478921890258789, "memory(GiB)": 121.15, "step": 5385, "token_acc": 0.4739075095460331, "train_speed(iter/s)": 1.197009 }, { "epoch": 1.8471555860178204, "grad_norm": 1.3403353691101074, "learning_rate": 6.993937323728828e-05, "loss": 2.3476911544799806, "memory(GiB)": 121.15, "step": 5390, "token_acc": 0.4950452391210685, "train_speed(iter/s)": 1.197083 }, { "epoch": 1.8488690884167238, "grad_norm": 1.2672699689865112, "learning_rate": 6.988999608517766e-05, "loss": 2.417894744873047, "memory(GiB)": 121.15, "step": 5395, "token_acc": 0.4844961240310077, "train_speed(iter/s)": 1.197133 }, { "epoch": 1.8505825908156273, "grad_norm": 1.2731293439865112, "learning_rate": 6.984059587813607e-05, "loss": 2.3179910659790037, "memory(GiB)": 121.15, "step": 5400, "token_acc": 0.49978876214617657, "train_speed(iter/s)": 1.197074 }, { "epoch": 1.8522960932145305, "grad_norm": 1.2332401275634766, "learning_rate": 6.979117267342438e-05, "loss": 2.4797264099121095, "memory(GiB)": 121.15, "step": 5405, "token_acc": 0.48070607553366174, "train_speed(iter/s)": 1.196694 }, { "epoch": 1.8540095956134337, "grad_norm": 1.2826714515686035, "learning_rate": 6.974172652833013e-05, "loss": 2.411201286315918, "memory(GiB)": 121.15, "step": 5410, "token_acc": 0.4936151475121092, "train_speed(iter/s)": 1.196786 }, { "epoch": 1.8557230980123371, "grad_norm": 1.246603012084961, "learning_rate": 6.969225750016743e-05, "loss": 2.3893699645996094, "memory(GiB)": 121.15, "step": 5415, "token_acc": 0.49685816876122085, "train_speed(iter/s)": 1.196872 }, { "epoch": 1.8574366004112406, "grad_norm": 1.236885666847229, "learning_rate": 6.964276564627687e-05, "loss": 2.422398567199707, "memory(GiB)": 121.15, "step": 5420, "token_acc": 0.4831107619795758, "train_speed(iter/s)": 1.196975 }, { "epoch": 1.859150102810144, "grad_norm": 1.3109166622161865, "learning_rate": 6.959325102402562e-05, "loss": 2.3902442932128904, "memory(GiB)": 121.15, "step": 5425, "token_acc": 0.5137887144675435, "train_speed(iter/s)": 1.197008 }, { "epoch": 1.8608636052090473, "grad_norm": 1.3245840072631836, "learning_rate": 6.95437136908071e-05, "loss": 2.4304855346679686, "memory(GiB)": 121.15, "step": 5430, "token_acc": 0.48013967699694454, "train_speed(iter/s)": 1.19707 }, { "epoch": 1.8625771076079507, "grad_norm": 1.295019507408142, "learning_rate": 6.949415370404115e-05, "loss": 2.344515800476074, "memory(GiB)": 121.15, "step": 5435, "token_acc": 0.5151888341543513, "train_speed(iter/s)": 1.197158 }, { "epoch": 1.864290610006854, "grad_norm": 1.4940534830093384, "learning_rate": 6.944457112117384e-05, "loss": 2.3558521270751953, "memory(GiB)": 121.15, "step": 5440, "token_acc": 0.4940970703979012, "train_speed(iter/s)": 1.197182 }, { "epoch": 1.8660041124057574, "grad_norm": 1.197662591934204, "learning_rate": 6.939496599967742e-05, "loss": 2.487857437133789, "memory(GiB)": 121.15, "step": 5445, "token_acc": 0.4774405250205086, "train_speed(iter/s)": 1.197217 }, { "epoch": 1.8677176148046608, "grad_norm": 1.2381094694137573, "learning_rate": 6.934533839705028e-05, "loss": 2.37841911315918, "memory(GiB)": 121.15, "step": 5450, "token_acc": 0.49915325994919557, "train_speed(iter/s)": 1.197311 }, { "epoch": 1.869431117203564, "grad_norm": 1.2393836975097656, "learning_rate": 6.929568837081686e-05, "loss": 2.4325183868408202, "memory(GiB)": 121.15, "step": 5455, "token_acc": 0.49793046357615894, "train_speed(iter/s)": 1.197395 }, { "epoch": 1.8711446196024675, "grad_norm": 1.3933783769607544, "learning_rate": 6.92460159785276e-05, "loss": 2.3841537475585937, "memory(GiB)": 121.15, "step": 5460, "token_acc": 0.4899184899184899, "train_speed(iter/s)": 1.197432 }, { "epoch": 1.8728581220013707, "grad_norm": 1.2289466857910156, "learning_rate": 6.919632127775885e-05, "loss": 2.406384086608887, "memory(GiB)": 121.15, "step": 5465, "token_acc": 0.4948081264108352, "train_speed(iter/s)": 1.197371 }, { "epoch": 1.8745716244002741, "grad_norm": 1.2226201295852661, "learning_rate": 6.914660432611285e-05, "loss": 2.3882381439208986, "memory(GiB)": 121.15, "step": 5470, "token_acc": 0.4903442485306465, "train_speed(iter/s)": 1.19714 }, { "epoch": 1.8762851267991776, "grad_norm": 1.1552906036376953, "learning_rate": 6.909686518121758e-05, "loss": 2.3681474685668946, "memory(GiB)": 121.15, "step": 5475, "token_acc": 0.5107505070993915, "train_speed(iter/s)": 1.197027 }, { "epoch": 1.877998629198081, "grad_norm": 1.1944262981414795, "learning_rate": 6.904710390072681e-05, "loss": 2.3038875579833986, "memory(GiB)": 121.15, "step": 5480, "token_acc": 0.5034275921165381, "train_speed(iter/s)": 1.197053 }, { "epoch": 1.8797121315969842, "grad_norm": 1.3570761680603027, "learning_rate": 6.899732054231989e-05, "loss": 2.4280797958374025, "memory(GiB)": 121.15, "step": 5485, "token_acc": 0.492436974789916, "train_speed(iter/s)": 1.197078 }, { "epoch": 1.8814256339958875, "grad_norm": 1.2633355855941772, "learning_rate": 6.894751516370183e-05, "loss": 2.4124073028564452, "memory(GiB)": 121.15, "step": 5490, "token_acc": 0.5027391487568479, "train_speed(iter/s)": 1.197188 }, { "epoch": 1.883139136394791, "grad_norm": 1.2537999153137207, "learning_rate": 6.889768782260313e-05, "loss": 2.428542900085449, "memory(GiB)": 121.15, "step": 5495, "token_acc": 0.48672181105790163, "train_speed(iter/s)": 1.197197 }, { "epoch": 1.8848526387936944, "grad_norm": 1.3347861766815186, "learning_rate": 6.884783857677977e-05, "loss": 2.3488109588623045, "memory(GiB)": 121.15, "step": 5500, "token_acc": 0.477579451458424, "train_speed(iter/s)": 1.197234 }, { "epoch": 1.8848526387936944, "eval_loss": 2.0847299098968506, "eval_runtime": 3.6922, "eval_samples_per_second": 27.084, "eval_steps_per_second": 27.084, "eval_token_acc": 0.4821882951653944, "step": 5500 }, { "epoch": 1.8865661411925978, "grad_norm": 1.363897681236267, "learning_rate": 6.879796748401308e-05, "loss": 2.3394805908203127, "memory(GiB)": 121.15, "step": 5505, "token_acc": 0.5024809791597751, "train_speed(iter/s)": 1.195983 }, { "epoch": 1.888279643591501, "grad_norm": 1.3506464958190918, "learning_rate": 6.874807460210974e-05, "loss": 2.4142616271972654, "memory(GiB)": 121.15, "step": 5510, "token_acc": 0.5070892410341952, "train_speed(iter/s)": 1.195917 }, { "epoch": 1.8899931459904042, "grad_norm": 1.2402634620666504, "learning_rate": 6.869815998890172e-05, "loss": 2.360979461669922, "memory(GiB)": 121.15, "step": 5515, "token_acc": 0.4930817610062893, "train_speed(iter/s)": 1.195873 }, { "epoch": 1.8917066483893077, "grad_norm": 1.4457182884216309, "learning_rate": 6.864822370224611e-05, "loss": 2.410553741455078, "memory(GiB)": 121.15, "step": 5520, "token_acc": 0.4780550774526678, "train_speed(iter/s)": 1.195901 }, { "epoch": 1.8934201507882111, "grad_norm": 1.2935283184051514, "learning_rate": 6.859826580002515e-05, "loss": 2.4739006042480467, "memory(GiB)": 121.15, "step": 5525, "token_acc": 0.49727767695099817, "train_speed(iter/s)": 1.195906 }, { "epoch": 1.8951336531871146, "grad_norm": 1.410256266593933, "learning_rate": 6.854828634014616e-05, "loss": 2.364804267883301, "memory(GiB)": 121.15, "step": 5530, "token_acc": 0.49477351916376305, "train_speed(iter/s)": 1.195968 }, { "epoch": 1.8968471555860178, "grad_norm": 1.1479358673095703, "learning_rate": 6.849828538054144e-05, "loss": 2.3632839202880858, "memory(GiB)": 121.15, "step": 5535, "token_acc": 0.49360217138425744, "train_speed(iter/s)": 1.19595 }, { "epoch": 1.898560657984921, "grad_norm": 1.4037853479385376, "learning_rate": 6.844826297916815e-05, "loss": 2.4949310302734373, "memory(GiB)": 121.15, "step": 5540, "token_acc": 0.4906303236797274, "train_speed(iter/s)": 1.195747 }, { "epoch": 1.9002741603838245, "grad_norm": 1.3124510049819946, "learning_rate": 6.839821919400841e-05, "loss": 2.4014610290527343, "memory(GiB)": 121.15, "step": 5545, "token_acc": 0.5106094808126411, "train_speed(iter/s)": 1.195776 }, { "epoch": 1.901987662782728, "grad_norm": 1.4767038822174072, "learning_rate": 6.834815408306902e-05, "loss": 2.2915904998779295, "memory(GiB)": 121.15, "step": 5550, "token_acc": 0.5082536924413553, "train_speed(iter/s)": 1.195713 }, { "epoch": 1.9037011651816313, "grad_norm": 1.2998137474060059, "learning_rate": 6.829806770438161e-05, "loss": 2.398386001586914, "memory(GiB)": 121.15, "step": 5555, "token_acc": 0.4952870608397601, "train_speed(iter/s)": 1.195698 }, { "epoch": 1.9054146675805346, "grad_norm": 1.2826026678085327, "learning_rate": 6.824796011600234e-05, "loss": 2.3865453720092775, "memory(GiB)": 121.15, "step": 5560, "token_acc": 0.5039435450394355, "train_speed(iter/s)": 1.195729 }, { "epoch": 1.907128169979438, "grad_norm": 1.3569114208221436, "learning_rate": 6.819783137601204e-05, "loss": 2.4408380508422853, "memory(GiB)": 121.15, "step": 5565, "token_acc": 0.4836683417085427, "train_speed(iter/s)": 1.195565 }, { "epoch": 1.9088416723783412, "grad_norm": 1.2808315753936768, "learning_rate": 6.814768154251605e-05, "loss": 2.373945426940918, "memory(GiB)": 121.15, "step": 5570, "token_acc": 0.5013134851138353, "train_speed(iter/s)": 1.195547 }, { "epoch": 1.9105551747772447, "grad_norm": 1.3185986280441284, "learning_rate": 6.80975106736441e-05, "loss": 2.405092239379883, "memory(GiB)": 121.15, "step": 5575, "token_acc": 0.49453551912568305, "train_speed(iter/s)": 1.195578 }, { "epoch": 1.9122686771761481, "grad_norm": 1.2197626829147339, "learning_rate": 6.80473188275504e-05, "loss": 2.3534908294677734, "memory(GiB)": 121.15, "step": 5580, "token_acc": 0.49236641221374045, "train_speed(iter/s)": 1.195566 }, { "epoch": 1.9139821795750516, "grad_norm": 1.334638237953186, "learning_rate": 6.799710606241338e-05, "loss": 2.3431064605712892, "memory(GiB)": 121.15, "step": 5585, "token_acc": 0.5041459369817579, "train_speed(iter/s)": 1.195449 }, { "epoch": 1.9156956819739548, "grad_norm": 1.2378220558166504, "learning_rate": 6.794687243643575e-05, "loss": 2.3658401489257814, "memory(GiB)": 121.15, "step": 5590, "token_acc": 0.50239651416122, "train_speed(iter/s)": 1.195487 }, { "epoch": 1.917409184372858, "grad_norm": 1.353344202041626, "learning_rate": 6.789661800784445e-05, "loss": 2.3320526123046874, "memory(GiB)": 121.15, "step": 5595, "token_acc": 0.49633699633699635, "train_speed(iter/s)": 1.195576 }, { "epoch": 1.9191226867717615, "grad_norm": 1.3538494110107422, "learning_rate": 6.784634283489047e-05, "loss": 2.3570690155029297, "memory(GiB)": 121.15, "step": 5600, "token_acc": 0.502601908065915, "train_speed(iter/s)": 1.195527 }, { "epoch": 1.920836189170665, "grad_norm": 1.1680636405944824, "learning_rate": 6.779604697584884e-05, "loss": 2.322873115539551, "memory(GiB)": 121.15, "step": 5605, "token_acc": 0.4917267713194739, "train_speed(iter/s)": 1.195655 }, { "epoch": 1.9225496915695683, "grad_norm": 1.2156356573104858, "learning_rate": 6.774573048901864e-05, "loss": 2.3429725646972654, "memory(GiB)": 121.15, "step": 5610, "token_acc": 0.49934469200524245, "train_speed(iter/s)": 1.195624 }, { "epoch": 1.9242631939684716, "grad_norm": 1.236417531967163, "learning_rate": 6.769539343272277e-05, "loss": 2.3153091430664063, "memory(GiB)": 121.15, "step": 5615, "token_acc": 0.5016196205460435, "train_speed(iter/s)": 1.195452 }, { "epoch": 1.9259766963673748, "grad_norm": 1.425718069076538, "learning_rate": 6.764503586530806e-05, "loss": 2.3464561462402345, "memory(GiB)": 121.15, "step": 5620, "token_acc": 0.5006347862886161, "train_speed(iter/s)": 1.195413 }, { "epoch": 1.9276901987662782, "grad_norm": 1.293174386024475, "learning_rate": 6.759465784514502e-05, "loss": 2.2919158935546875, "memory(GiB)": 121.15, "step": 5625, "token_acc": 0.4970513900589722, "train_speed(iter/s)": 1.195448 }, { "epoch": 1.9294037011651817, "grad_norm": 1.2050981521606445, "learning_rate": 6.7544259430628e-05, "loss": 2.3376819610595705, "memory(GiB)": 121.15, "step": 5630, "token_acc": 0.4968152866242038, "train_speed(iter/s)": 1.195492 }, { "epoch": 1.9311172035640851, "grad_norm": 1.2958369255065918, "learning_rate": 6.749384068017482e-05, "loss": 2.388623809814453, "memory(GiB)": 121.15, "step": 5635, "token_acc": 0.5045474231268947, "train_speed(iter/s)": 1.195546 }, { "epoch": 1.9328307059629883, "grad_norm": 1.6173927783966064, "learning_rate": 6.7443401652227e-05, "loss": 2.4136940002441407, "memory(GiB)": 121.15, "step": 5640, "token_acc": 0.47690857681432614, "train_speed(iter/s)": 1.195294 }, { "epoch": 1.9345442083618916, "grad_norm": 1.3506683111190796, "learning_rate": 6.739294240524955e-05, "loss": 2.3561931610107423, "memory(GiB)": 121.15, "step": 5645, "token_acc": 0.4888597640891219, "train_speed(iter/s)": 1.195263 }, { "epoch": 1.936257710760795, "grad_norm": 1.328127384185791, "learning_rate": 6.734246299773083e-05, "loss": 2.402357482910156, "memory(GiB)": 121.15, "step": 5650, "token_acc": 0.4926315789473684, "train_speed(iter/s)": 1.195214 }, { "epoch": 1.9379712131596984, "grad_norm": 1.2230862379074097, "learning_rate": 6.729196348818267e-05, "loss": 2.3842859268188477, "memory(GiB)": 121.15, "step": 5655, "token_acc": 0.49281903980303654, "train_speed(iter/s)": 1.19527 }, { "epoch": 1.939684715558602, "grad_norm": 1.4028003215789795, "learning_rate": 6.724144393514016e-05, "loss": 2.3484066009521483, "memory(GiB)": 121.15, "step": 5660, "token_acc": 0.489202657807309, "train_speed(iter/s)": 1.195111 }, { "epoch": 1.9413982179575051, "grad_norm": 1.3623894453048706, "learning_rate": 6.719090439716161e-05, "loss": 2.350746726989746, "memory(GiB)": 121.15, "step": 5665, "token_acc": 0.4894714224323163, "train_speed(iter/s)": 1.195233 }, { "epoch": 1.9431117203564083, "grad_norm": 1.2848446369171143, "learning_rate": 6.71403449328285e-05, "loss": 2.398483085632324, "memory(GiB)": 121.15, "step": 5670, "token_acc": 0.5058528428093646, "train_speed(iter/s)": 1.195321 }, { "epoch": 1.9448252227553118, "grad_norm": 1.2890852689743042, "learning_rate": 6.708976560074545e-05, "loss": 2.3159353256225588, "memory(GiB)": 121.15, "step": 5675, "token_acc": 0.49895876718034154, "train_speed(iter/s)": 1.195189 }, { "epoch": 1.9465387251542152, "grad_norm": 1.332423210144043, "learning_rate": 6.703916645954002e-05, "loss": 2.42370662689209, "memory(GiB)": 121.15, "step": 5680, "token_acc": 0.4906382978723404, "train_speed(iter/s)": 1.195239 }, { "epoch": 1.9482522275531187, "grad_norm": 1.1948307752609253, "learning_rate": 6.698854756786284e-05, "loss": 2.472254180908203, "memory(GiB)": 121.15, "step": 5685, "token_acc": 0.4758135444151275, "train_speed(iter/s)": 1.195302 }, { "epoch": 1.9499657299520219, "grad_norm": 1.2592694759368896, "learning_rate": 6.693790898438734e-05, "loss": 2.363982009887695, "memory(GiB)": 121.15, "step": 5690, "token_acc": 0.47787979966611016, "train_speed(iter/s)": 1.195055 }, { "epoch": 1.9516792323509253, "grad_norm": 1.2436282634735107, "learning_rate": 6.688725076780984e-05, "loss": 2.608768844604492, "memory(GiB)": 121.15, "step": 5695, "token_acc": 0.46748831279218017, "train_speed(iter/s)": 1.195092 }, { "epoch": 1.9533927347498286, "grad_norm": 1.3399641513824463, "learning_rate": 6.683657297684935e-05, "loss": 2.435945510864258, "memory(GiB)": 121.15, "step": 5700, "token_acc": 0.4805299076676034, "train_speed(iter/s)": 1.195083 }, { "epoch": 1.955106237148732, "grad_norm": 1.3390629291534424, "learning_rate": 6.678587567024765e-05, "loss": 2.507559585571289, "memory(GiB)": 121.15, "step": 5705, "token_acc": 0.4840305892937472, "train_speed(iter/s)": 1.195129 }, { "epoch": 1.9568197395476354, "grad_norm": 1.408202052116394, "learning_rate": 6.673515890676911e-05, "loss": 2.3633291244506838, "memory(GiB)": 121.15, "step": 5710, "token_acc": 0.4973214285714286, "train_speed(iter/s)": 1.195217 }, { "epoch": 1.9585332419465389, "grad_norm": 1.4465725421905518, "learning_rate": 6.66844227452006e-05, "loss": 2.3444522857666015, "memory(GiB)": 121.15, "step": 5715, "token_acc": 0.5062298107983387, "train_speed(iter/s)": 1.194949 }, { "epoch": 1.960246744345442, "grad_norm": 1.3422907590866089, "learning_rate": 6.663366724435151e-05, "loss": 2.4549114227294924, "memory(GiB)": 121.15, "step": 5720, "token_acc": 0.4869747899159664, "train_speed(iter/s)": 1.195009 }, { "epoch": 1.9619602467443453, "grad_norm": 1.4035427570343018, "learning_rate": 6.658289246305374e-05, "loss": 2.470903015136719, "memory(GiB)": 121.15, "step": 5725, "token_acc": 0.488279698053238, "train_speed(iter/s)": 1.194967 }, { "epoch": 1.9636737491432488, "grad_norm": 1.3312467336654663, "learning_rate": 6.653209846016136e-05, "loss": 2.4429283142089844, "memory(GiB)": 121.15, "step": 5730, "token_acc": 0.48681366191093817, "train_speed(iter/s)": 1.194958 }, { "epoch": 1.9653872515421522, "grad_norm": 1.4129302501678467, "learning_rate": 6.648128529455086e-05, "loss": 2.5493087768554688, "memory(GiB)": 121.15, "step": 5735, "token_acc": 0.4785276073619632, "train_speed(iter/s)": 1.195044 }, { "epoch": 1.9671007539410557, "grad_norm": 1.4012486934661865, "learning_rate": 6.64304530251209e-05, "loss": 2.3948997497558593, "memory(GiB)": 121.15, "step": 5740, "token_acc": 0.4955710955710956, "train_speed(iter/s)": 1.194977 }, { "epoch": 1.9688142563399589, "grad_norm": 1.191076636314392, "learning_rate": 6.637960171079224e-05, "loss": 2.475513458251953, "memory(GiB)": 121.15, "step": 5745, "token_acc": 0.48263614838200475, "train_speed(iter/s)": 1.194795 }, { "epoch": 1.970527758738862, "grad_norm": 1.2743687629699707, "learning_rate": 6.63287314105078e-05, "loss": 2.3600849151611327, "memory(GiB)": 121.15, "step": 5750, "token_acc": 0.5, "train_speed(iter/s)": 1.194676 }, { "epoch": 1.9722412611377655, "grad_norm": 1.2223845720291138, "learning_rate": 6.627784218323243e-05, "loss": 2.3628768920898438, "memory(GiB)": 121.15, "step": 5755, "token_acc": 0.4935483870967742, "train_speed(iter/s)": 1.194741 }, { "epoch": 1.973954763536669, "grad_norm": 1.2257802486419678, "learning_rate": 6.622693408795297e-05, "loss": 2.477371025085449, "memory(GiB)": 121.15, "step": 5760, "token_acc": 0.48403152218996265, "train_speed(iter/s)": 1.194793 }, { "epoch": 1.9756682659355724, "grad_norm": 1.5564388036727905, "learning_rate": 6.617600718367809e-05, "loss": 2.4117862701416017, "memory(GiB)": 121.15, "step": 5765, "token_acc": 0.4915254237288136, "train_speed(iter/s)": 1.194832 }, { "epoch": 1.9773817683344757, "grad_norm": 1.2757925987243652, "learning_rate": 6.612506152943829e-05, "loss": 2.3885501861572265, "memory(GiB)": 121.15, "step": 5770, "token_acc": 0.4857379082265399, "train_speed(iter/s)": 1.194826 }, { "epoch": 1.9790952707333789, "grad_norm": 1.2676723003387451, "learning_rate": 6.607409718428582e-05, "loss": 2.4736915588378907, "memory(GiB)": 121.15, "step": 5775, "token_acc": 0.4966996699669967, "train_speed(iter/s)": 1.194924 }, { "epoch": 1.9808087731322823, "grad_norm": 1.304724097251892, "learning_rate": 6.602311420729453e-05, "loss": 2.4538082122802733, "memory(GiB)": 121.15, "step": 5780, "token_acc": 0.4957081545064378, "train_speed(iter/s)": 1.194995 }, { "epoch": 1.9825222755311858, "grad_norm": 1.1920771598815918, "learning_rate": 6.597211265755994e-05, "loss": 2.468863296508789, "memory(GiB)": 121.15, "step": 5785, "token_acc": 0.4767879548306148, "train_speed(iter/s)": 1.195044 }, { "epoch": 1.9842357779300892, "grad_norm": 1.2430585622787476, "learning_rate": 6.592109259419907e-05, "loss": 2.401581382751465, "memory(GiB)": 121.15, "step": 5790, "token_acc": 0.5057947019867549, "train_speed(iter/s)": 1.194924 }, { "epoch": 1.9859492803289924, "grad_norm": 1.1658742427825928, "learning_rate": 6.587005407635034e-05, "loss": 2.396575927734375, "memory(GiB)": 121.15, "step": 5795, "token_acc": 0.4865934065934066, "train_speed(iter/s)": 1.194906 }, { "epoch": 1.9876627827278959, "grad_norm": 1.3530640602111816, "learning_rate": 6.581899716317367e-05, "loss": 2.504709815979004, "memory(GiB)": 121.15, "step": 5800, "token_acc": 0.4781144781144781, "train_speed(iter/s)": 1.194746 }, { "epoch": 1.989376285126799, "grad_norm": 1.2307320833206177, "learning_rate": 6.576792191385024e-05, "loss": 2.4839851379394533, "memory(GiB)": 121.15, "step": 5805, "token_acc": 0.4867294520547945, "train_speed(iter/s)": 1.194831 }, { "epoch": 1.9910897875257025, "grad_norm": 1.3626519441604614, "learning_rate": 6.57168283875825e-05, "loss": 2.4445009231567383, "memory(GiB)": 121.15, "step": 5810, "token_acc": 0.47415254237288135, "train_speed(iter/s)": 1.194892 }, { "epoch": 1.992803289924606, "grad_norm": 1.167412519454956, "learning_rate": 6.566571664359404e-05, "loss": 2.2358657836914064, "memory(GiB)": 121.15, "step": 5815, "token_acc": 0.517377914650242, "train_speed(iter/s)": 1.194732 }, { "epoch": 1.9945167923235092, "grad_norm": 1.193569302558899, "learning_rate": 6.561458674112964e-05, "loss": 2.3409467697143556, "memory(GiB)": 121.15, "step": 5820, "token_acc": 0.48451809856083733, "train_speed(iter/s)": 1.194809 }, { "epoch": 1.9962302947224126, "grad_norm": 1.176957607269287, "learning_rate": 6.556343873945508e-05, "loss": 2.441213607788086, "memory(GiB)": 121.15, "step": 5825, "token_acc": 0.48324140857021636, "train_speed(iter/s)": 1.194689 }, { "epoch": 1.9979437971213159, "grad_norm": 1.3127630949020386, "learning_rate": 6.551227269785711e-05, "loss": 2.355439376831055, "memory(GiB)": 121.15, "step": 5830, "token_acc": 0.49914748508098894, "train_speed(iter/s)": 1.194752 }, { "epoch": 1.9996572995202193, "grad_norm": 1.295285940170288, "learning_rate": 6.546108867564345e-05, "loss": 2.4319395065307616, "memory(GiB)": 121.15, "step": 5835, "token_acc": 0.4933675652545999, "train_speed(iter/s)": 1.194771 }, { "epoch": 2.0013708019191228, "grad_norm": 1.213097095489502, "learning_rate": 6.540988673214261e-05, "loss": 2.368752288818359, "memory(GiB)": 121.15, "step": 5840, "token_acc": 0.5042016806722689, "train_speed(iter/s)": 1.194717 }, { "epoch": 2.003084304318026, "grad_norm": 1.3773741722106934, "learning_rate": 6.535866692670387e-05, "loss": 2.3942144393920897, "memory(GiB)": 121.15, "step": 5845, "token_acc": 0.4946649594536918, "train_speed(iter/s)": 1.194754 }, { "epoch": 2.004797806716929, "grad_norm": 1.3043005466461182, "learning_rate": 6.530742931869724e-05, "loss": 2.377419090270996, "memory(GiB)": 121.15, "step": 5850, "token_acc": 0.5089637079142982, "train_speed(iter/s)": 1.194657 }, { "epoch": 2.0065113091158326, "grad_norm": 1.2014689445495605, "learning_rate": 6.525617396751338e-05, "loss": 2.4077945709228517, "memory(GiB)": 121.15, "step": 5855, "token_acc": 0.49230769230769234, "train_speed(iter/s)": 1.19458 }, { "epoch": 2.008224811514736, "grad_norm": 1.1757944822311401, "learning_rate": 6.520490093256344e-05, "loss": 2.297549247741699, "memory(GiB)": 121.15, "step": 5860, "token_acc": 0.5214922952149229, "train_speed(iter/s)": 1.194676 }, { "epoch": 2.0099383139136395, "grad_norm": 1.1674103736877441, "learning_rate": 6.515361027327918e-05, "loss": 2.302643966674805, "memory(GiB)": 121.15, "step": 5865, "token_acc": 0.5063394683026585, "train_speed(iter/s)": 1.194563 }, { "epoch": 2.011651816312543, "grad_norm": 1.678919792175293, "learning_rate": 6.510230204911268e-05, "loss": 2.244329833984375, "memory(GiB)": 121.15, "step": 5870, "token_acc": 0.502615518744551, "train_speed(iter/s)": 1.194626 }, { "epoch": 2.0133653187114464, "grad_norm": 1.3396438360214233, "learning_rate": 6.505097631953646e-05, "loss": 2.353954315185547, "memory(GiB)": 121.15, "step": 5875, "token_acc": 0.5064599483204134, "train_speed(iter/s)": 1.194563 }, { "epoch": 2.0150788211103494, "grad_norm": 1.3637374639511108, "learning_rate": 6.499963314404328e-05, "loss": 2.3956182479858397, "memory(GiB)": 121.15, "step": 5880, "token_acc": 0.5045230263157895, "train_speed(iter/s)": 1.194582 }, { "epoch": 2.016792323509253, "grad_norm": 1.4306305646896362, "learning_rate": 6.494827258214615e-05, "loss": 2.327225112915039, "memory(GiB)": 121.15, "step": 5885, "token_acc": 0.5151515151515151, "train_speed(iter/s)": 1.194651 }, { "epoch": 2.0185058259081563, "grad_norm": 1.2123055458068848, "learning_rate": 6.489689469337823e-05, "loss": 2.3161336898803713, "memory(GiB)": 121.15, "step": 5890, "token_acc": 0.5033869602032176, "train_speed(iter/s)": 1.194445 }, { "epoch": 2.0202193283070597, "grad_norm": 1.4665040969848633, "learning_rate": 6.484549953729275e-05, "loss": 2.3159923553466797, "memory(GiB)": 121.15, "step": 5895, "token_acc": 0.5063291139240507, "train_speed(iter/s)": 1.194437 }, { "epoch": 2.021932830705963, "grad_norm": 1.4545989036560059, "learning_rate": 6.479408717346296e-05, "loss": 2.4197515487670898, "memory(GiB)": 121.15, "step": 5900, "token_acc": 0.4801352493660186, "train_speed(iter/s)": 1.194497 }, { "epoch": 2.023646333104866, "grad_norm": 1.2483831644058228, "learning_rate": 6.474265766148206e-05, "loss": 2.213439178466797, "memory(GiB)": 121.15, "step": 5905, "token_acc": 0.5197788175244576, "train_speed(iter/s)": 1.194567 }, { "epoch": 2.0253598355037696, "grad_norm": 1.3602564334869385, "learning_rate": 6.469121106096315e-05, "loss": 2.4104232788085938, "memory(GiB)": 121.15, "step": 5910, "token_acc": 0.48668384879725085, "train_speed(iter/s)": 1.194666 }, { "epoch": 2.027073337902673, "grad_norm": 1.5078716278076172, "learning_rate": 6.463974743153908e-05, "loss": 2.338236618041992, "memory(GiB)": 121.15, "step": 5915, "token_acc": 0.49363057324840764, "train_speed(iter/s)": 1.194666 }, { "epoch": 2.0287868403015765, "grad_norm": 1.3241503238677979, "learning_rate": 6.458826683286249e-05, "loss": 2.2757888793945313, "memory(GiB)": 121.15, "step": 5920, "token_acc": 0.5204545454545455, "train_speed(iter/s)": 1.194774 }, { "epoch": 2.03050034270048, "grad_norm": 1.3866418600082397, "learning_rate": 6.453676932460566e-05, "loss": 2.2967607498168947, "memory(GiB)": 121.15, "step": 5925, "token_acc": 0.5045008183306056, "train_speed(iter/s)": 1.194618 }, { "epoch": 2.032213845099383, "grad_norm": 1.3310922384262085, "learning_rate": 6.448525496646049e-05, "loss": 2.374751663208008, "memory(GiB)": 121.15, "step": 5930, "token_acc": 0.49722814498933904, "train_speed(iter/s)": 1.19466 }, { "epoch": 2.0339273474982864, "grad_norm": 1.4557808637619019, "learning_rate": 6.443372381813841e-05, "loss": 2.2613115310668945, "memory(GiB)": 121.15, "step": 5935, "token_acc": 0.525347689546882, "train_speed(iter/s)": 1.194633 }, { "epoch": 2.03564084989719, "grad_norm": 1.3277899026870728, "learning_rate": 6.43821759393703e-05, "loss": 2.3875825881958006, "memory(GiB)": 121.15, "step": 5940, "token_acc": 0.48951048951048953, "train_speed(iter/s)": 1.194616 }, { "epoch": 2.0373543522960933, "grad_norm": 1.393363356590271, "learning_rate": 6.433061138990644e-05, "loss": 2.3806076049804688, "memory(GiB)": 121.15, "step": 5945, "token_acc": 0.5024674742036788, "train_speed(iter/s)": 1.194711 }, { "epoch": 2.0390678546949967, "grad_norm": 1.5312933921813965, "learning_rate": 6.427903022951641e-05, "loss": 2.3256467819213866, "memory(GiB)": 121.15, "step": 5950, "token_acc": 0.5132631578947369, "train_speed(iter/s)": 1.19469 }, { "epoch": 2.0407813570938997, "grad_norm": 1.3604727983474731, "learning_rate": 6.422743251798911e-05, "loss": 2.2348602294921873, "memory(GiB)": 121.15, "step": 5955, "token_acc": 0.5134798838656159, "train_speed(iter/s)": 1.194787 }, { "epoch": 2.042494859492803, "grad_norm": 1.4921821355819702, "learning_rate": 6.417581831513254e-05, "loss": 2.3274629592895506, "memory(GiB)": 121.15, "step": 5960, "token_acc": 0.5078192875760209, "train_speed(iter/s)": 1.194933 }, { "epoch": 2.0442083618917066, "grad_norm": 1.2611708641052246, "learning_rate": 6.412418768077383e-05, "loss": 2.3224058151245117, "memory(GiB)": 121.15, "step": 5965, "token_acc": 0.5088087248322147, "train_speed(iter/s)": 1.195002 }, { "epoch": 2.04592186429061, "grad_norm": 1.4681154489517212, "learning_rate": 6.407254067475926e-05, "loss": 2.259663200378418, "memory(GiB)": 121.15, "step": 5970, "token_acc": 0.5105913503971756, "train_speed(iter/s)": 1.195038 }, { "epoch": 2.0476353666895135, "grad_norm": 1.3170493841171265, "learning_rate": 6.402087735695396e-05, "loss": 2.323006248474121, "memory(GiB)": 121.15, "step": 5975, "token_acc": 0.5039777247414479, "train_speed(iter/s)": 1.195001 }, { "epoch": 2.0493488690884165, "grad_norm": 1.3461486101150513, "learning_rate": 6.396919778724201e-05, "loss": 2.3625247955322264, "memory(GiB)": 121.15, "step": 5980, "token_acc": 0.5077996715927751, "train_speed(iter/s)": 1.19495 }, { "epoch": 2.05106237148732, "grad_norm": 1.487100601196289, "learning_rate": 6.391750202552634e-05, "loss": 2.349253273010254, "memory(GiB)": 121.15, "step": 5985, "token_acc": 0.49794913863822804, "train_speed(iter/s)": 1.194824 }, { "epoch": 2.0527758738862234, "grad_norm": 1.7152727842330933, "learning_rate": 6.386579013172867e-05, "loss": 2.4067924499511717, "memory(GiB)": 121.15, "step": 5990, "token_acc": 0.4810181190681622, "train_speed(iter/s)": 1.194932 }, { "epoch": 2.054489376285127, "grad_norm": 1.4990715980529785, "learning_rate": 6.381406216578934e-05, "loss": 2.3525279998779296, "memory(GiB)": 121.15, "step": 5995, "token_acc": 0.5133985538068907, "train_speed(iter/s)": 1.194975 }, { "epoch": 2.0562028786840303, "grad_norm": 1.3149112462997437, "learning_rate": 6.376231818766741e-05, "loss": 2.3053606033325194, "memory(GiB)": 121.15, "step": 6000, "token_acc": 0.5051546391752577, "train_speed(iter/s)": 1.195003 }, { "epoch": 2.0562028786840303, "eval_loss": 2.3596043586730957, "eval_runtime": 3.7357, "eval_samples_per_second": 26.769, "eval_steps_per_second": 26.769, "eval_token_acc": 0.46115906288532676, "step": 6000 }, { "epoch": 2.0579163810829337, "grad_norm": 1.3646070957183838, "learning_rate": 6.371055825734046e-05, "loss": 2.2754533767700194, "memory(GiB)": 121.15, "step": 6005, "token_acc": 0.48724872487248727, "train_speed(iter/s)": 1.193812 }, { "epoch": 2.0596298834818367, "grad_norm": 1.3094472885131836, "learning_rate": 6.365878243480453e-05, "loss": 2.2878211975097655, "memory(GiB)": 121.15, "step": 6010, "token_acc": 0.5107505070993915, "train_speed(iter/s)": 1.193762 }, { "epoch": 2.06134338588074, "grad_norm": 1.4706716537475586, "learning_rate": 6.360699078007414e-05, "loss": 2.382808494567871, "memory(GiB)": 121.15, "step": 6015, "token_acc": 0.5044025157232704, "train_speed(iter/s)": 1.193709 }, { "epoch": 2.0630568882796436, "grad_norm": 1.3082791566848755, "learning_rate": 6.355518335318212e-05, "loss": 2.3272689819335937, "memory(GiB)": 121.15, "step": 6020, "token_acc": 0.5050416483998247, "train_speed(iter/s)": 1.193817 }, { "epoch": 2.064770390678547, "grad_norm": 1.4495173692703247, "learning_rate": 6.350336021417962e-05, "loss": 2.263610076904297, "memory(GiB)": 121.15, "step": 6025, "token_acc": 0.5198041833555852, "train_speed(iter/s)": 1.193815 }, { "epoch": 2.0664838930774505, "grad_norm": 1.285322666168213, "learning_rate": 6.345152142313592e-05, "loss": 2.403004837036133, "memory(GiB)": 121.15, "step": 6030, "token_acc": 0.49958745874587457, "train_speed(iter/s)": 1.193865 }, { "epoch": 2.0681973954763535, "grad_norm": 1.4422615766525269, "learning_rate": 6.339966704013857e-05, "loss": 2.386665153503418, "memory(GiB)": 121.15, "step": 6035, "token_acc": 0.4993581514762516, "train_speed(iter/s)": 1.193841 }, { "epoch": 2.069910897875257, "grad_norm": 1.4099221229553223, "learning_rate": 6.33477971252931e-05, "loss": 2.4236684799194337, "memory(GiB)": 121.15, "step": 6040, "token_acc": 0.4847428073234525, "train_speed(iter/s)": 1.193874 }, { "epoch": 2.0716244002741604, "grad_norm": 1.3678282499313354, "learning_rate": 6.329591173872305e-05, "loss": 2.3122406005859375, "memory(GiB)": 121.15, "step": 6045, "token_acc": 0.49912663755458514, "train_speed(iter/s)": 1.193951 }, { "epoch": 2.073337902673064, "grad_norm": 1.602415680885315, "learning_rate": 6.324401094056991e-05, "loss": 2.347946548461914, "memory(GiB)": 121.15, "step": 6050, "token_acc": 0.5075396825396825, "train_speed(iter/s)": 1.193999 }, { "epoch": 2.0750514050719673, "grad_norm": 1.4000916481018066, "learning_rate": 6.319209479099305e-05, "loss": 2.321035385131836, "memory(GiB)": 121.15, "step": 6055, "token_acc": 0.496628322094407, "train_speed(iter/s)": 1.193981 }, { "epoch": 2.0767649074708703, "grad_norm": 1.4017448425292969, "learning_rate": 6.31401633501696e-05, "loss": 2.316959571838379, "memory(GiB)": 121.15, "step": 6060, "token_acc": 0.5179195804195804, "train_speed(iter/s)": 1.194088 }, { "epoch": 2.0784784098697737, "grad_norm": 1.4070162773132324, "learning_rate": 6.308821667829446e-05, "loss": 2.2759517669677733, "memory(GiB)": 121.15, "step": 6065, "token_acc": 0.5019710906701709, "train_speed(iter/s)": 1.194089 }, { "epoch": 2.080191912268677, "grad_norm": 1.3886533975601196, "learning_rate": 6.303625483558014e-05, "loss": 2.288944625854492, "memory(GiB)": 121.15, "step": 6070, "token_acc": 0.4980271810609382, "train_speed(iter/s)": 1.194126 }, { "epoch": 2.0819054146675806, "grad_norm": 1.4215689897537231, "learning_rate": 6.298427788225673e-05, "loss": 2.3788185119628906, "memory(GiB)": 121.15, "step": 6075, "token_acc": 0.5051063829787235, "train_speed(iter/s)": 1.194186 }, { "epoch": 2.083618917066484, "grad_norm": 1.478683352470398, "learning_rate": 6.29322858785719e-05, "loss": 2.3072942733764648, "memory(GiB)": 121.15, "step": 6080, "token_acc": 0.5110629067245119, "train_speed(iter/s)": 1.194055 }, { "epoch": 2.085332419465387, "grad_norm": 1.3190534114837646, "learning_rate": 6.288027888479072e-05, "loss": 2.3548717498779297, "memory(GiB)": 121.15, "step": 6085, "token_acc": 0.49246231155778897, "train_speed(iter/s)": 1.194096 }, { "epoch": 2.0870459218642905, "grad_norm": 1.5177698135375977, "learning_rate": 6.28282569611956e-05, "loss": 2.2912126541137696, "memory(GiB)": 121.15, "step": 6090, "token_acc": 0.51994851994852, "train_speed(iter/s)": 1.194119 }, { "epoch": 2.088759424263194, "grad_norm": 1.4868731498718262, "learning_rate": 6.277622016808631e-05, "loss": 2.3680370330810545, "memory(GiB)": 121.15, "step": 6095, "token_acc": 0.49892749892749894, "train_speed(iter/s)": 1.194152 }, { "epoch": 2.0904729266620974, "grad_norm": 1.394860863685608, "learning_rate": 6.272416856577987e-05, "loss": 2.2492464065551756, "memory(GiB)": 121.15, "step": 6100, "token_acc": 0.5160560832202623, "train_speed(iter/s)": 1.194173 }, { "epoch": 2.092186429061001, "grad_norm": 1.3571327924728394, "learning_rate": 6.267210221461038e-05, "loss": 2.2895179748535157, "memory(GiB)": 121.15, "step": 6105, "token_acc": 0.49674337820234477, "train_speed(iter/s)": 1.193958 }, { "epoch": 2.093899931459904, "grad_norm": 1.3643492460250854, "learning_rate": 6.262002117492913e-05, "loss": 2.360524559020996, "memory(GiB)": 121.15, "step": 6110, "token_acc": 0.521553563807085, "train_speed(iter/s)": 1.193946 }, { "epoch": 2.0956134338588073, "grad_norm": 1.318539023399353, "learning_rate": 6.25679255071044e-05, "loss": 2.261493110656738, "memory(GiB)": 121.15, "step": 6115, "token_acc": 0.5026606631191158, "train_speed(iter/s)": 1.193921 }, { "epoch": 2.0973269362577107, "grad_norm": 1.3517218828201294, "learning_rate": 6.251581527152137e-05, "loss": 2.382615852355957, "memory(GiB)": 121.15, "step": 6120, "token_acc": 0.4931682322801025, "train_speed(iter/s)": 1.19393 }, { "epoch": 2.099040438656614, "grad_norm": 1.3209675550460815, "learning_rate": 6.24636905285822e-05, "loss": 2.4310724258422853, "memory(GiB)": 121.15, "step": 6125, "token_acc": 0.49166989538938394, "train_speed(iter/s)": 1.193976 }, { "epoch": 2.1007539410555176, "grad_norm": 1.5317579507827759, "learning_rate": 6.241155133870585e-05, "loss": 2.3705745697021485, "memory(GiB)": 121.15, "step": 6130, "token_acc": 0.517641804376954, "train_speed(iter/s)": 1.193874 }, { "epoch": 2.102467443454421, "grad_norm": 1.442338228225708, "learning_rate": 6.235939776232795e-05, "loss": 2.3232004165649416, "memory(GiB)": 121.15, "step": 6135, "token_acc": 0.49892749892749894, "train_speed(iter/s)": 1.193896 }, { "epoch": 2.104180945853324, "grad_norm": 1.475232720375061, "learning_rate": 6.230722985990085e-05, "loss": 2.3246023178100588, "memory(GiB)": 121.15, "step": 6140, "token_acc": 0.4870617696160267, "train_speed(iter/s)": 1.193961 }, { "epoch": 2.1058944482522275, "grad_norm": 3.5899665355682373, "learning_rate": 6.225504769189356e-05, "loss": 2.4624704360961913, "memory(GiB)": 121.15, "step": 6145, "token_acc": 0.49270216962524654, "train_speed(iter/s)": 1.193962 }, { "epoch": 2.107607950651131, "grad_norm": 1.3306244611740112, "learning_rate": 6.220285131879153e-05, "loss": 2.3768774032592774, "memory(GiB)": 121.15, "step": 6150, "token_acc": 0.5027414592998735, "train_speed(iter/s)": 1.194016 }, { "epoch": 2.1093214530500344, "grad_norm": 1.5376673936843872, "learning_rate": 6.215064080109675e-05, "loss": 2.3142349243164064, "memory(GiB)": 121.15, "step": 6155, "token_acc": 0.5033869602032176, "train_speed(iter/s)": 1.193846 }, { "epoch": 2.111034955448938, "grad_norm": 1.3781427145004272, "learning_rate": 6.209841619932757e-05, "loss": 2.3479839324951173, "memory(GiB)": 121.15, "step": 6160, "token_acc": 0.50237992211164, "train_speed(iter/s)": 1.193881 }, { "epoch": 2.112748457847841, "grad_norm": 1.608240008354187, "learning_rate": 6.204617757401866e-05, "loss": 2.2578853607177733, "memory(GiB)": 121.15, "step": 6165, "token_acc": 0.5205592105263158, "train_speed(iter/s)": 1.193886 }, { "epoch": 2.1144619602467443, "grad_norm": 1.4715975522994995, "learning_rate": 6.199392498572097e-05, "loss": 2.327267074584961, "memory(GiB)": 121.15, "step": 6170, "token_acc": 0.4965007776049767, "train_speed(iter/s)": 1.193844 }, { "epoch": 2.1161754626456477, "grad_norm": 1.8062294721603394, "learning_rate": 6.19416584950016e-05, "loss": 2.3256359100341797, "memory(GiB)": 121.15, "step": 6175, "token_acc": 0.5035366931918656, "train_speed(iter/s)": 1.193922 }, { "epoch": 2.117888965044551, "grad_norm": 1.7505638599395752, "learning_rate": 6.188937816244383e-05, "loss": 2.2837875366210936, "memory(GiB)": 121.15, "step": 6180, "token_acc": 0.5015371102327624, "train_speed(iter/s)": 1.193771 }, { "epoch": 2.1196024674434546, "grad_norm": 1.4742703437805176, "learning_rate": 6.183708404864689e-05, "loss": 2.2756250381469725, "memory(GiB)": 121.15, "step": 6185, "token_acc": 0.5046563192904656, "train_speed(iter/s)": 1.193789 }, { "epoch": 2.1213159698423576, "grad_norm": 1.446751594543457, "learning_rate": 6.178477621422606e-05, "loss": 2.3693687438964846, "memory(GiB)": 121.15, "step": 6190, "token_acc": 0.49428320140721194, "train_speed(iter/s)": 1.193691 }, { "epoch": 2.123029472241261, "grad_norm": 1.3394274711608887, "learning_rate": 6.173245471981252e-05, "loss": 2.3177669525146483, "memory(GiB)": 121.15, "step": 6195, "token_acc": 0.5043192102015631, "train_speed(iter/s)": 1.193726 }, { "epoch": 2.1247429746401645, "grad_norm": 1.3195737600326538, "learning_rate": 6.168011962605324e-05, "loss": 2.3528289794921875, "memory(GiB)": 121.15, "step": 6200, "token_acc": 0.4957644211375555, "train_speed(iter/s)": 1.193666 }, { "epoch": 2.126456477039068, "grad_norm": 1.3354506492614746, "learning_rate": 6.162777099361094e-05, "loss": 2.377069664001465, "memory(GiB)": 121.15, "step": 6205, "token_acc": 0.4986094557012316, "train_speed(iter/s)": 1.193682 }, { "epoch": 2.1281699794379714, "grad_norm": 1.4279505014419556, "learning_rate": 6.157540888316415e-05, "loss": 2.265804672241211, "memory(GiB)": 121.15, "step": 6210, "token_acc": 0.5112781954887218, "train_speed(iter/s)": 1.193756 }, { "epoch": 2.1298834818368744, "grad_norm": 1.2742176055908203, "learning_rate": 6.152303335540688e-05, "loss": 2.26376895904541, "memory(GiB)": 121.15, "step": 6215, "token_acc": 0.5140343527440302, "train_speed(iter/s)": 1.19376 }, { "epoch": 2.131596984235778, "grad_norm": 1.3930140733718872, "learning_rate": 6.147064447104876e-05, "loss": 2.3215747833251954, "memory(GiB)": 121.15, "step": 6220, "token_acc": 0.49873524451939294, "train_speed(iter/s)": 1.193809 }, { "epoch": 2.1333104866346813, "grad_norm": 1.4721319675445557, "learning_rate": 6.141824229081491e-05, "loss": 2.33490104675293, "memory(GiB)": 121.15, "step": 6225, "token_acc": 0.5245683930942895, "train_speed(iter/s)": 1.193843 }, { "epoch": 2.1350239890335847, "grad_norm": 1.4913055896759033, "learning_rate": 6.136582687544585e-05, "loss": 2.1955589294433593, "memory(GiB)": 121.15, "step": 6230, "token_acc": 0.5205959684487291, "train_speed(iter/s)": 1.193669 }, { "epoch": 2.136737491432488, "grad_norm": 1.5069217681884766, "learning_rate": 6.13133982856974e-05, "loss": 2.313599967956543, "memory(GiB)": 121.15, "step": 6235, "token_acc": 0.5088417329796641, "train_speed(iter/s)": 1.193724 }, { "epoch": 2.138450993831391, "grad_norm": 1.5174211263656616, "learning_rate": 6.126095658234076e-05, "loss": 2.332576560974121, "memory(GiB)": 121.15, "step": 6240, "token_acc": 0.5117647058823529, "train_speed(iter/s)": 1.193784 }, { "epoch": 2.1401644962302946, "grad_norm": 1.4564101696014404, "learning_rate": 6.12085018261622e-05, "loss": 2.2936542510986326, "memory(GiB)": 121.15, "step": 6245, "token_acc": 0.503305785123967, "train_speed(iter/s)": 1.193777 }, { "epoch": 2.141877998629198, "grad_norm": 1.3655526638031006, "learning_rate": 6.11560340779632e-05, "loss": 2.2239227294921875, "memory(GiB)": 121.15, "step": 6250, "token_acc": 0.5042054006197433, "train_speed(iter/s)": 1.193827 }, { "epoch": 2.1435915010281015, "grad_norm": 1.481973648071289, "learning_rate": 6.110355339856028e-05, "loss": 2.2681858062744142, "memory(GiB)": 121.15, "step": 6255, "token_acc": 0.5150709219858156, "train_speed(iter/s)": 1.193894 }, { "epoch": 2.145305003427005, "grad_norm": 1.236138939857483, "learning_rate": 6.105105984878493e-05, "loss": 2.292881393432617, "memory(GiB)": 121.15, "step": 6260, "token_acc": 0.509567387687188, "train_speed(iter/s)": 1.193993 }, { "epoch": 2.1470185058259084, "grad_norm": 1.7196894884109497, "learning_rate": 6.09985534894836e-05, "loss": 2.3781902313232424, "memory(GiB)": 121.15, "step": 6265, "token_acc": 0.49528301886792453, "train_speed(iter/s)": 1.19398 }, { "epoch": 2.1487320082248114, "grad_norm": 1.5046684741973877, "learning_rate": 6.094603438151756e-05, "loss": 2.2723134994506835, "memory(GiB)": 121.15, "step": 6270, "token_acc": 0.5010836584308626, "train_speed(iter/s)": 1.193992 }, { "epoch": 2.150445510623715, "grad_norm": 1.3622146844863892, "learning_rate": 6.089350258576284e-05, "loss": 2.2931142807006837, "memory(GiB)": 121.15, "step": 6275, "token_acc": 0.5183157894736842, "train_speed(iter/s)": 1.193893 }, { "epoch": 2.1521590130226183, "grad_norm": 1.4759321212768555, "learning_rate": 6.0840958163110215e-05, "loss": 2.437862014770508, "memory(GiB)": 121.15, "step": 6280, "token_acc": 0.48691741618969747, "train_speed(iter/s)": 1.193904 }, { "epoch": 2.1538725154215217, "grad_norm": 1.6412752866744995, "learning_rate": 6.078840117446507e-05, "loss": 2.3469661712646483, "memory(GiB)": 121.15, "step": 6285, "token_acc": 0.4947411003236246, "train_speed(iter/s)": 1.193862 }, { "epoch": 2.155586017820425, "grad_norm": 1.6018892526626587, "learning_rate": 6.073583168074737e-05, "loss": 2.360525131225586, "memory(GiB)": 121.15, "step": 6290, "token_acc": 0.49305244285073957, "train_speed(iter/s)": 1.193917 }, { "epoch": 2.157299520219328, "grad_norm": 1.7983801364898682, "learning_rate": 6.068324974289157e-05, "loss": 2.3974145889282226, "memory(GiB)": 121.15, "step": 6295, "token_acc": 0.48008213552361395, "train_speed(iter/s)": 1.19392 }, { "epoch": 2.1590130226182316, "grad_norm": 1.51160728931427, "learning_rate": 6.063065542184654e-05, "loss": 2.3776723861694338, "memory(GiB)": 121.15, "step": 6300, "token_acc": 0.4952991452991453, "train_speed(iter/s)": 1.193944 }, { "epoch": 2.160726525017135, "grad_norm": 1.5331616401672363, "learning_rate": 6.0578048778575514e-05, "loss": 2.2665882110595703, "memory(GiB)": 121.15, "step": 6305, "token_acc": 0.5171035095513106, "train_speed(iter/s)": 1.19395 }, { "epoch": 2.1624400274160385, "grad_norm": 1.3656672239303589, "learning_rate": 6.0525429874056016e-05, "loss": 2.391295623779297, "memory(GiB)": 121.15, "step": 6310, "token_acc": 0.49331103678929766, "train_speed(iter/s)": 1.194002 }, { "epoch": 2.164153529814942, "grad_norm": 1.3319144248962402, "learning_rate": 6.047279876927976e-05, "loss": 2.3309677124023436, "memory(GiB)": 121.15, "step": 6315, "token_acc": 0.5201005025125628, "train_speed(iter/s)": 1.194071 }, { "epoch": 2.165867032213845, "grad_norm": 1.4424887895584106, "learning_rate": 6.0420155525252617e-05, "loss": 2.3866649627685548, "memory(GiB)": 121.15, "step": 6320, "token_acc": 0.49656287909421754, "train_speed(iter/s)": 1.193761 }, { "epoch": 2.1675805346127484, "grad_norm": 1.4315173625946045, "learning_rate": 6.036750020299453e-05, "loss": 2.289365768432617, "memory(GiB)": 121.15, "step": 6325, "token_acc": 0.5093141405588484, "train_speed(iter/s)": 1.193863 }, { "epoch": 2.169294037011652, "grad_norm": 1.355168104171753, "learning_rate": 6.031483286353945e-05, "loss": 2.377447509765625, "memory(GiB)": 121.15, "step": 6330, "token_acc": 0.49640861931364727, "train_speed(iter/s)": 1.193908 }, { "epoch": 2.1710075394105552, "grad_norm": 1.4001637697219849, "learning_rate": 6.026215356793523e-05, "loss": 2.353282356262207, "memory(GiB)": 121.15, "step": 6335, "token_acc": 0.512608353033885, "train_speed(iter/s)": 1.193958 }, { "epoch": 2.1727210418094587, "grad_norm": 1.6647001504898071, "learning_rate": 6.0209462377243595e-05, "loss": 2.3578800201416015, "memory(GiB)": 121.15, "step": 6340, "token_acc": 0.49734982332155475, "train_speed(iter/s)": 1.194011 }, { "epoch": 2.1744345442083617, "grad_norm": 1.50197172164917, "learning_rate": 6.0156759352540074e-05, "loss": 2.451688766479492, "memory(GiB)": 121.15, "step": 6345, "token_acc": 0.48677884615384615, "train_speed(iter/s)": 1.193809 }, { "epoch": 2.176148046607265, "grad_norm": 1.4372060298919678, "learning_rate": 6.0104044554913896e-05, "loss": 2.4346343994140627, "memory(GiB)": 121.15, "step": 6350, "token_acc": 0.4787234042553192, "train_speed(iter/s)": 1.193877 }, { "epoch": 2.1778615490061686, "grad_norm": 1.5170360803604126, "learning_rate": 6.005131804546792e-05, "loss": 2.3698673248291016, "memory(GiB)": 121.15, "step": 6355, "token_acc": 0.4976545842217484, "train_speed(iter/s)": 1.193877 }, { "epoch": 2.179575051405072, "grad_norm": 1.4658695459365845, "learning_rate": 5.999857988531864e-05, "loss": 2.3896505355834963, "memory(GiB)": 121.15, "step": 6360, "token_acc": 0.4933788979068774, "train_speed(iter/s)": 1.193923 }, { "epoch": 2.1812885538039755, "grad_norm": 1.6552778482437134, "learning_rate": 5.994583013559598e-05, "loss": 2.4203422546386717, "memory(GiB)": 121.15, "step": 6365, "token_acc": 0.4967291757522896, "train_speed(iter/s)": 1.193999 }, { "epoch": 2.183002056202879, "grad_norm": 1.514603614807129, "learning_rate": 5.989306885744334e-05, "loss": 2.343790626525879, "memory(GiB)": 121.15, "step": 6370, "token_acc": 0.5039647577092511, "train_speed(iter/s)": 1.193907 }, { "epoch": 2.184715558601782, "grad_norm": 1.4361199140548706, "learning_rate": 5.984029611201749e-05, "loss": 2.263724136352539, "memory(GiB)": 121.15, "step": 6375, "token_acc": 0.5176201372997712, "train_speed(iter/s)": 1.193951 }, { "epoch": 2.1864290610006853, "grad_norm": 1.5530214309692383, "learning_rate": 5.9787511960488464e-05, "loss": 2.394741249084473, "memory(GiB)": 121.15, "step": 6380, "token_acc": 0.497310715763343, "train_speed(iter/s)": 1.193936 }, { "epoch": 2.188142563399589, "grad_norm": 1.3276317119598389, "learning_rate": 5.973471646403952e-05, "loss": 2.3248424530029297, "memory(GiB)": 121.15, "step": 6385, "token_acc": 0.5113954418232707, "train_speed(iter/s)": 1.193956 }, { "epoch": 2.1898560657984922, "grad_norm": 1.4704241752624512, "learning_rate": 5.9681909683867086e-05, "loss": 2.4329437255859374, "memory(GiB)": 121.15, "step": 6390, "token_acc": 0.496512641673932, "train_speed(iter/s)": 1.194012 }, { "epoch": 2.1915695681973957, "grad_norm": 1.4750797748565674, "learning_rate": 5.9629091681180694e-05, "loss": 2.311421585083008, "memory(GiB)": 121.15, "step": 6395, "token_acc": 0.5108562064727571, "train_speed(iter/s)": 1.194009 }, { "epoch": 2.1932830705962987, "grad_norm": 1.4777847528457642, "learning_rate": 5.957626251720281e-05, "loss": 2.332541275024414, "memory(GiB)": 121.15, "step": 6400, "token_acc": 0.5106571936056838, "train_speed(iter/s)": 1.194074 }, { "epoch": 2.194996572995202, "grad_norm": 1.4420772790908813, "learning_rate": 5.952342225316887e-05, "loss": 2.3511531829833983, "memory(GiB)": 121.15, "step": 6405, "token_acc": 0.5006206040546132, "train_speed(iter/s)": 1.194072 }, { "epoch": 2.1967100753941056, "grad_norm": 1.5623418092727661, "learning_rate": 5.947057095032721e-05, "loss": 2.3536205291748047, "memory(GiB)": 121.15, "step": 6410, "token_acc": 0.489469250210615, "train_speed(iter/s)": 1.193866 }, { "epoch": 2.198423577793009, "grad_norm": 1.4335229396820068, "learning_rate": 5.941770866993893e-05, "loss": 2.3317745208740233, "memory(GiB)": 121.15, "step": 6415, "token_acc": 0.49861276258422516, "train_speed(iter/s)": 1.193848 }, { "epoch": 2.2001370801919125, "grad_norm": 1.4165736436843872, "learning_rate": 5.9364835473277844e-05, "loss": 2.42581787109375, "memory(GiB)": 121.15, "step": 6420, "token_acc": 0.49132014533710133, "train_speed(iter/s)": 1.193878 }, { "epoch": 2.2018505825908155, "grad_norm": 1.3954169750213623, "learning_rate": 5.931195142163046e-05, "loss": 2.259684371948242, "memory(GiB)": 121.15, "step": 6425, "token_acc": 0.5053523639607493, "train_speed(iter/s)": 1.193939 }, { "epoch": 2.203564084989719, "grad_norm": 1.373735785484314, "learning_rate": 5.925905657629582e-05, "loss": 2.366745185852051, "memory(GiB)": 121.15, "step": 6430, "token_acc": 0.5065040650406504, "train_speed(iter/s)": 1.194076 }, { "epoch": 2.2052775873886223, "grad_norm": 1.4291170835494995, "learning_rate": 5.920615099858551e-05, "loss": 2.296539306640625, "memory(GiB)": 121.15, "step": 6435, "token_acc": 0.5115569123419101, "train_speed(iter/s)": 1.194062 }, { "epoch": 2.206991089787526, "grad_norm": 1.3152799606323242, "learning_rate": 5.915323474982354e-05, "loss": 2.23785285949707, "memory(GiB)": 121.15, "step": 6440, "token_acc": 0.516048353480617, "train_speed(iter/s)": 1.19403 }, { "epoch": 2.2087045921864292, "grad_norm": 1.4754137992858887, "learning_rate": 5.9100307891346316e-05, "loss": 2.2943069458007814, "memory(GiB)": 121.15, "step": 6445, "token_acc": 0.5087719298245614, "train_speed(iter/s)": 1.194111 }, { "epoch": 2.2104180945853322, "grad_norm": 1.4741127490997314, "learning_rate": 5.9047370484502484e-05, "loss": 2.257503890991211, "memory(GiB)": 121.15, "step": 6450, "token_acc": 0.5150880134115675, "train_speed(iter/s)": 1.19417 }, { "epoch": 2.2121315969842357, "grad_norm": 1.737919807434082, "learning_rate": 5.899442259065297e-05, "loss": 2.457481575012207, "memory(GiB)": 121.15, "step": 6455, "token_acc": 0.4960285941223193, "train_speed(iter/s)": 1.194239 }, { "epoch": 2.213845099383139, "grad_norm": 1.4437376260757446, "learning_rate": 5.894146427117089e-05, "loss": 2.3245738983154296, "memory(GiB)": 121.15, "step": 6460, "token_acc": 0.5022294284556141, "train_speed(iter/s)": 1.194224 }, { "epoch": 2.2155586017820426, "grad_norm": 1.440878987312317, "learning_rate": 5.8888495587441306e-05, "loss": 2.2563243865966798, "memory(GiB)": 121.15, "step": 6465, "token_acc": 0.5126306620209059, "train_speed(iter/s)": 1.194328 }, { "epoch": 2.217272104180946, "grad_norm": 1.4185523986816406, "learning_rate": 5.883551660086143e-05, "loss": 2.4040626525878905, "memory(GiB)": 121.15, "step": 6470, "token_acc": 0.49598214285714287, "train_speed(iter/s)": 1.194435 }, { "epoch": 2.2189856065798494, "grad_norm": 1.4470362663269043, "learning_rate": 5.878252737284038e-05, "loss": 2.3720489501953126, "memory(GiB)": 121.15, "step": 6475, "token_acc": 0.5015079707022835, "train_speed(iter/s)": 1.194466 }, { "epoch": 2.2206991089787524, "grad_norm": 1.5621999502182007, "learning_rate": 5.8729527964799104e-05, "loss": 2.2859106063842773, "memory(GiB)": 121.15, "step": 6480, "token_acc": 0.49727024567788897, "train_speed(iter/s)": 1.194524 }, { "epoch": 2.222412611377656, "grad_norm": 1.5171304941177368, "learning_rate": 5.867651843817038e-05, "loss": 2.2680782318115233, "memory(GiB)": 121.15, "step": 6485, "token_acc": 0.50371017023134, "train_speed(iter/s)": 1.194607 }, { "epoch": 2.2241261137765593, "grad_norm": 1.409926414489746, "learning_rate": 5.862349885439874e-05, "loss": 2.2544750213623046, "memory(GiB)": 121.15, "step": 6490, "token_acc": 0.5, "train_speed(iter/s)": 1.194535 }, { "epoch": 2.2258396161754628, "grad_norm": 1.5348302125930786, "learning_rate": 5.8570469274940335e-05, "loss": 2.3210906982421875, "memory(GiB)": 121.15, "step": 6495, "token_acc": 0.5035639412997903, "train_speed(iter/s)": 1.194491 }, { "epoch": 2.2275531185743658, "grad_norm": 1.4120264053344727, "learning_rate": 5.851742976126289e-05, "loss": 2.3163169860839843, "memory(GiB)": 121.15, "step": 6500, "token_acc": 0.5079297042434634, "train_speed(iter/s)": 1.194561 }, { "epoch": 2.2275531185743658, "eval_loss": 2.0792226791381836, "eval_runtime": 3.7216, "eval_samples_per_second": 26.87, "eval_steps_per_second": 26.87, "eval_token_acc": 0.49247606019151846, "step": 6500 }, { "epoch": 2.229266620973269, "grad_norm": 1.5365630388259888, "learning_rate": 5.8464380374845704e-05, "loss": 2.412842559814453, "memory(GiB)": 121.15, "step": 6505, "token_acc": 0.5019505851755527, "train_speed(iter/s)": 1.193274 }, { "epoch": 2.2309801233721727, "grad_norm": 1.5109455585479736, "learning_rate": 5.841132117717948e-05, "loss": 2.24912052154541, "memory(GiB)": 121.15, "step": 6510, "token_acc": 0.5002101723413199, "train_speed(iter/s)": 1.19328 }, { "epoch": 2.232693625771076, "grad_norm": 1.6099090576171875, "learning_rate": 5.8358252229766274e-05, "loss": 2.482375907897949, "memory(GiB)": 121.15, "step": 6515, "token_acc": 0.4845360824742268, "train_speed(iter/s)": 1.193317 }, { "epoch": 2.2344071281699796, "grad_norm": 1.610846757888794, "learning_rate": 5.830517359411949e-05, "loss": 2.4521636962890625, "memory(GiB)": 121.15, "step": 6520, "token_acc": 0.48064085447263016, "train_speed(iter/s)": 1.193407 }, { "epoch": 2.236120630568883, "grad_norm": 1.420902967453003, "learning_rate": 5.825208533176373e-05, "loss": 2.3287588119506837, "memory(GiB)": 121.15, "step": 6525, "token_acc": 0.49484092863284607, "train_speed(iter/s)": 1.193485 }, { "epoch": 2.237834132967786, "grad_norm": 1.4001245498657227, "learning_rate": 5.819898750423476e-05, "loss": 2.276485061645508, "memory(GiB)": 121.15, "step": 6530, "token_acc": 0.5009149130832571, "train_speed(iter/s)": 1.193548 }, { "epoch": 2.2395476353666894, "grad_norm": 1.438793659210205, "learning_rate": 5.814588017307946e-05, "loss": 2.270552635192871, "memory(GiB)": 121.15, "step": 6535, "token_acc": 0.5252263906856404, "train_speed(iter/s)": 1.193556 }, { "epoch": 2.241261137765593, "grad_norm": 1.5286474227905273, "learning_rate": 5.809276339985568e-05, "loss": 2.3361160278320314, "memory(GiB)": 121.15, "step": 6540, "token_acc": 0.5063131313131313, "train_speed(iter/s)": 1.193556 }, { "epoch": 2.2429746401644963, "grad_norm": 1.4792217016220093, "learning_rate": 5.803963724613223e-05, "loss": 2.385980224609375, "memory(GiB)": 121.15, "step": 6545, "token_acc": 0.49759930161501525, "train_speed(iter/s)": 1.193461 }, { "epoch": 2.2446881425633998, "grad_norm": 1.4262676239013672, "learning_rate": 5.7986501773488824e-05, "loss": 2.2639497756958007, "memory(GiB)": 121.15, "step": 6550, "token_acc": 0.5342172797262618, "train_speed(iter/s)": 1.193513 }, { "epoch": 2.2464016449623028, "grad_norm": 1.4480303525924683, "learning_rate": 5.793335704351596e-05, "loss": 2.397037124633789, "memory(GiB)": 121.15, "step": 6555, "token_acc": 0.5008291873963516, "train_speed(iter/s)": 1.193568 }, { "epoch": 2.248115147361206, "grad_norm": 1.5789005756378174, "learning_rate": 5.7880203117814826e-05, "loss": 2.2337726593017577, "memory(GiB)": 121.15, "step": 6560, "token_acc": 0.5183654729109275, "train_speed(iter/s)": 1.193623 }, { "epoch": 2.2498286497601097, "grad_norm": 1.6324741840362549, "learning_rate": 5.782704005799732e-05, "loss": 2.364979553222656, "memory(GiB)": 121.15, "step": 6565, "token_acc": 0.48483554036736437, "train_speed(iter/s)": 1.193717 }, { "epoch": 2.251542152159013, "grad_norm": 1.710494041442871, "learning_rate": 5.777386792568593e-05, "loss": 2.358463478088379, "memory(GiB)": 121.15, "step": 6570, "token_acc": 0.49868766404199477, "train_speed(iter/s)": 1.193741 }, { "epoch": 2.2532556545579165, "grad_norm": 1.492855429649353, "learning_rate": 5.772068678251362e-05, "loss": 2.3041534423828125, "memory(GiB)": 121.15, "step": 6575, "token_acc": 0.5176470588235295, "train_speed(iter/s)": 1.193609 }, { "epoch": 2.25496915695682, "grad_norm": 1.5220158100128174, "learning_rate": 5.7667496690123826e-05, "loss": 2.4065872192382813, "memory(GiB)": 121.15, "step": 6580, "token_acc": 0.49472830494728304, "train_speed(iter/s)": 1.19369 }, { "epoch": 2.256682659355723, "grad_norm": 1.6159882545471191, "learning_rate": 5.761429771017035e-05, "loss": 2.389237976074219, "memory(GiB)": 121.15, "step": 6585, "token_acc": 0.494641384995878, "train_speed(iter/s)": 1.193712 }, { "epoch": 2.2583961617546264, "grad_norm": 1.4515048265457153, "learning_rate": 5.7561089904317315e-05, "loss": 2.29199104309082, "memory(GiB)": 121.15, "step": 6590, "token_acc": 0.5077586206896552, "train_speed(iter/s)": 1.193759 }, { "epoch": 2.26010966415353, "grad_norm": 1.4616087675094604, "learning_rate": 5.7507873334239016e-05, "loss": 2.3054637908935547, "memory(GiB)": 121.15, "step": 6595, "token_acc": 0.5081148564294632, "train_speed(iter/s)": 1.193767 }, { "epoch": 2.2618231665524333, "grad_norm": 1.7176591157913208, "learning_rate": 5.745464806161999e-05, "loss": 2.356603813171387, "memory(GiB)": 121.15, "step": 6600, "token_acc": 0.5091145833333334, "train_speed(iter/s)": 1.193539 }, { "epoch": 2.2635366689513363, "grad_norm": 1.5203980207443237, "learning_rate": 5.74014141481548e-05, "loss": 2.4175251007080076, "memory(GiB)": 121.15, "step": 6605, "token_acc": 0.49017969076473045, "train_speed(iter/s)": 1.193531 }, { "epoch": 2.2652501713502398, "grad_norm": 1.4895308017730713, "learning_rate": 5.734817165554803e-05, "loss": 2.2703189849853516, "memory(GiB)": 121.15, "step": 6610, "token_acc": 0.5181576616474757, "train_speed(iter/s)": 1.193585 }, { "epoch": 2.266963673749143, "grad_norm": 1.5693156719207764, "learning_rate": 5.729492064551425e-05, "loss": 2.3685855865478516, "memory(GiB)": 121.15, "step": 6615, "token_acc": 0.48947811447811446, "train_speed(iter/s)": 1.193653 }, { "epoch": 2.2686771761480466, "grad_norm": 1.4438668489456177, "learning_rate": 5.724166117977785e-05, "loss": 2.3231412887573244, "memory(GiB)": 121.15, "step": 6620, "token_acc": 0.5082996859578286, "train_speed(iter/s)": 1.193682 }, { "epoch": 2.27039067854695, "grad_norm": 1.3775097131729126, "learning_rate": 5.718839332007305e-05, "loss": 2.360209655761719, "memory(GiB)": 121.15, "step": 6625, "token_acc": 0.4993875051041241, "train_speed(iter/s)": 1.193538 }, { "epoch": 2.2721041809458535, "grad_norm": 1.4166795015335083, "learning_rate": 5.7135117128143814e-05, "loss": 2.3021272659301757, "memory(GiB)": 121.15, "step": 6630, "token_acc": 0.5014786649767639, "train_speed(iter/s)": 1.193635 }, { "epoch": 2.2738176833447565, "grad_norm": 1.5534600019454956, "learning_rate": 5.708183266574372e-05, "loss": 2.438323974609375, "memory(GiB)": 121.15, "step": 6635, "token_acc": 0.4880034275921165, "train_speed(iter/s)": 1.193539 }, { "epoch": 2.27553118574366, "grad_norm": 1.4750494956970215, "learning_rate": 5.702853999463598e-05, "loss": 2.3245033264160155, "memory(GiB)": 121.15, "step": 6640, "token_acc": 0.5071090047393365, "train_speed(iter/s)": 1.193538 }, { "epoch": 2.2772446881425634, "grad_norm": 1.6494193077087402, "learning_rate": 5.6975239176593274e-05, "loss": 2.4118824005126953, "memory(GiB)": 121.15, "step": 6645, "token_acc": 0.4896907216494845, "train_speed(iter/s)": 1.193564 }, { "epoch": 2.278958190541467, "grad_norm": 1.4238402843475342, "learning_rate": 5.692193027339778e-05, "loss": 2.270964431762695, "memory(GiB)": 121.15, "step": 6650, "token_acc": 0.5141874462596733, "train_speed(iter/s)": 1.19338 }, { "epoch": 2.2806716929403703, "grad_norm": 1.4437165260314941, "learning_rate": 5.686861334684099e-05, "loss": 2.306829833984375, "memory(GiB)": 121.15, "step": 6655, "token_acc": 0.5027368421052631, "train_speed(iter/s)": 1.193447 }, { "epoch": 2.2823851953392733, "grad_norm": 1.4748270511627197, "learning_rate": 5.6815288458723735e-05, "loss": 2.3140209197998045, "memory(GiB)": 121.15, "step": 6660, "token_acc": 0.5050372317126588, "train_speed(iter/s)": 1.193516 }, { "epoch": 2.2840986977381768, "grad_norm": 1.5188193321228027, "learning_rate": 5.676195567085607e-05, "loss": 2.3243118286132813, "memory(GiB)": 121.15, "step": 6665, "token_acc": 0.5063721325403568, "train_speed(iter/s)": 1.19352 }, { "epoch": 2.28581220013708, "grad_norm": 1.492263913154602, "learning_rate": 5.67086150450572e-05, "loss": 2.318635940551758, "memory(GiB)": 121.15, "step": 6670, "token_acc": 0.5120101137800253, "train_speed(iter/s)": 1.193594 }, { "epoch": 2.2875257025359836, "grad_norm": 1.5583804845809937, "learning_rate": 5.665526664315539e-05, "loss": 2.3803634643554688, "memory(GiB)": 121.15, "step": 6675, "token_acc": 0.5033250207813799, "train_speed(iter/s)": 1.193567 }, { "epoch": 2.289239204934887, "grad_norm": 1.5670968294143677, "learning_rate": 5.6601910526987964e-05, "loss": 2.3633245468139648, "memory(GiB)": 121.15, "step": 6680, "token_acc": 0.5082322357019065, "train_speed(iter/s)": 1.19363 }, { "epoch": 2.29095270733379, "grad_norm": 1.3600131273269653, "learning_rate": 5.6548546758401176e-05, "loss": 2.3016504287719726, "memory(GiB)": 121.15, "step": 6685, "token_acc": 0.5096801346801347, "train_speed(iter/s)": 1.193595 }, { "epoch": 2.2926662097326935, "grad_norm": 1.6411832571029663, "learning_rate": 5.649517539925011e-05, "loss": 2.280206298828125, "memory(GiB)": 121.15, "step": 6690, "token_acc": 0.5139009556907037, "train_speed(iter/s)": 1.193605 }, { "epoch": 2.294379712131597, "grad_norm": 1.4359835386276245, "learning_rate": 5.6441796511398725e-05, "loss": 2.279168891906738, "memory(GiB)": 121.15, "step": 6695, "token_acc": 0.5177335640138409, "train_speed(iter/s)": 1.193652 }, { "epoch": 2.2960932145305004, "grad_norm": 1.4841585159301758, "learning_rate": 5.6388410156719606e-05, "loss": 2.303262138366699, "memory(GiB)": 121.15, "step": 6700, "token_acc": 0.5118110236220472, "train_speed(iter/s)": 1.193648 }, { "epoch": 2.297806716929404, "grad_norm": 1.422268033027649, "learning_rate": 5.6335016397094106e-05, "loss": 2.2229610443115235, "memory(GiB)": 121.15, "step": 6705, "token_acc": 0.5280312907431551, "train_speed(iter/s)": 1.193636 }, { "epoch": 2.299520219328307, "grad_norm": 1.5904232263565063, "learning_rate": 5.628161529441207e-05, "loss": 2.289058494567871, "memory(GiB)": 121.15, "step": 6710, "token_acc": 0.49461474730737365, "train_speed(iter/s)": 1.193706 }, { "epoch": 2.3012337217272103, "grad_norm": 1.6476119756698608, "learning_rate": 5.622820691057188e-05, "loss": 2.284168815612793, "memory(GiB)": 121.15, "step": 6715, "token_acc": 0.5166527893422148, "train_speed(iter/s)": 1.193777 }, { "epoch": 2.3029472241261137, "grad_norm": 1.6753212213516235, "learning_rate": 5.617479130748039e-05, "loss": 2.3606264114379885, "memory(GiB)": 121.15, "step": 6720, "token_acc": 0.5066666666666667, "train_speed(iter/s)": 1.193846 }, { "epoch": 2.304660726525017, "grad_norm": 1.4239219427108765, "learning_rate": 5.612136854705278e-05, "loss": 2.3327003479003907, "memory(GiB)": 121.15, "step": 6725, "token_acc": 0.5121951219512195, "train_speed(iter/s)": 1.193871 }, { "epoch": 2.3063742289239206, "grad_norm": 1.4680436849594116, "learning_rate": 5.606793869121255e-05, "loss": 2.2677286148071287, "memory(GiB)": 121.15, "step": 6730, "token_acc": 0.5182546749777382, "train_speed(iter/s)": 1.193936 }, { "epoch": 2.308087731322824, "grad_norm": 1.46982741355896, "learning_rate": 5.601450180189143e-05, "loss": 2.4115150451660154, "memory(GiB)": 121.15, "step": 6735, "token_acc": 0.48899647887323944, "train_speed(iter/s)": 1.194048 }, { "epoch": 2.309801233721727, "grad_norm": 1.4602432250976562, "learning_rate": 5.596105794102927e-05, "loss": 2.32318172454834, "memory(GiB)": 121.15, "step": 6740, "token_acc": 0.5034662045060658, "train_speed(iter/s)": 1.194096 }, { "epoch": 2.3115147361206305, "grad_norm": 1.3879858255386353, "learning_rate": 5.590760717057404e-05, "loss": 2.4406694412231444, "memory(GiB)": 121.15, "step": 6745, "token_acc": 0.5127753303964758, "train_speed(iter/s)": 1.194169 }, { "epoch": 2.313228238519534, "grad_norm": 1.6694260835647583, "learning_rate": 5.58541495524817e-05, "loss": 2.304374885559082, "memory(GiB)": 121.15, "step": 6750, "token_acc": 0.5078318219291014, "train_speed(iter/s)": 1.194149 }, { "epoch": 2.3149417409184374, "grad_norm": 1.5484799146652222, "learning_rate": 5.580068514871612e-05, "loss": 2.428537368774414, "memory(GiB)": 121.15, "step": 6755, "token_acc": 0.48926507018992565, "train_speed(iter/s)": 1.194127 }, { "epoch": 2.3166552433173404, "grad_norm": 1.3541953563690186, "learning_rate": 5.5747214021249094e-05, "loss": 2.21937255859375, "memory(GiB)": 121.15, "step": 6760, "token_acc": 0.5167106420404574, "train_speed(iter/s)": 1.194192 }, { "epoch": 2.318368745716244, "grad_norm": 1.4458922147750854, "learning_rate": 5.569373623206017e-05, "loss": 2.2606239318847656, "memory(GiB)": 121.15, "step": 6765, "token_acc": 0.512154233025985, "train_speed(iter/s)": 1.194205 }, { "epoch": 2.3200822481151473, "grad_norm": 1.6431487798690796, "learning_rate": 5.564025184313664e-05, "loss": 2.3124153137207033, "memory(GiB)": 121.15, "step": 6770, "token_acc": 0.5101952277657267, "train_speed(iter/s)": 1.194257 }, { "epoch": 2.3217957505140507, "grad_norm": 1.5888065099716187, "learning_rate": 5.558676091647341e-05, "loss": 2.3579177856445312, "memory(GiB)": 121.15, "step": 6775, "token_acc": 0.4886008046490836, "train_speed(iter/s)": 1.194318 }, { "epoch": 2.323509252912954, "grad_norm": 1.4371185302734375, "learning_rate": 5.5533263514072994e-05, "loss": 2.4156286239624025, "memory(GiB)": 121.15, "step": 6780, "token_acc": 0.5125215889464594, "train_speed(iter/s)": 1.194302 }, { "epoch": 2.3252227553118576, "grad_norm": 1.61894953250885, "learning_rate": 5.547975969794542e-05, "loss": 2.3965375900268553, "memory(GiB)": 121.15, "step": 6785, "token_acc": 0.4897792313982011, "train_speed(iter/s)": 1.194368 }, { "epoch": 2.3269362577107606, "grad_norm": 1.5263527631759644, "learning_rate": 5.542624953010812e-05, "loss": 2.3688819885253904, "memory(GiB)": 121.15, "step": 6790, "token_acc": 0.4964973730297723, "train_speed(iter/s)": 1.194348 }, { "epoch": 2.328649760109664, "grad_norm": 1.4234838485717773, "learning_rate": 5.53727330725859e-05, "loss": 2.29298095703125, "memory(GiB)": 121.15, "step": 6795, "token_acc": 0.5037593984962406, "train_speed(iter/s)": 1.194263 }, { "epoch": 2.3303632625085675, "grad_norm": 1.5556721687316895, "learning_rate": 5.531921038741089e-05, "loss": 2.272321319580078, "memory(GiB)": 121.15, "step": 6800, "token_acc": 0.5150040551500406, "train_speed(iter/s)": 1.194296 }, { "epoch": 2.332076764907471, "grad_norm": 1.4571304321289062, "learning_rate": 5.526568153662237e-05, "loss": 2.3399433135986327, "memory(GiB)": 121.15, "step": 6805, "token_acc": 0.4829694323144105, "train_speed(iter/s)": 1.19431 }, { "epoch": 2.3337902673063744, "grad_norm": 1.929189682006836, "learning_rate": 5.521214658226683e-05, "loss": 2.406355857849121, "memory(GiB)": 121.15, "step": 6810, "token_acc": 0.502771855010661, "train_speed(iter/s)": 1.194402 }, { "epoch": 2.3355037697052774, "grad_norm": 1.5805915594100952, "learning_rate": 5.5158605586397804e-05, "loss": 2.3629268646240233, "memory(GiB)": 121.15, "step": 6815, "token_acc": 0.4997908824759515, "train_speed(iter/s)": 1.19434 }, { "epoch": 2.337217272104181, "grad_norm": 1.3887803554534912, "learning_rate": 5.510505861107583e-05, "loss": 2.3353837966918944, "memory(GiB)": 121.15, "step": 6820, "token_acc": 0.48225214198286415, "train_speed(iter/s)": 1.194378 }, { "epoch": 2.3389307745030843, "grad_norm": 1.3848775625228882, "learning_rate": 5.505150571836839e-05, "loss": 2.3289485931396485, "memory(GiB)": 121.15, "step": 6825, "token_acc": 0.5032996040475143, "train_speed(iter/s)": 1.194468 }, { "epoch": 2.3406442769019877, "grad_norm": 1.4520514011383057, "learning_rate": 5.49979469703498e-05, "loss": 2.3436309814453127, "memory(GiB)": 121.15, "step": 6830, "token_acc": 0.5063291139240507, "train_speed(iter/s)": 1.19454 }, { "epoch": 2.342357779300891, "grad_norm": 1.6270135641098022, "learning_rate": 5.4944382429101226e-05, "loss": 2.388681411743164, "memory(GiB)": 121.15, "step": 6835, "token_acc": 0.5019222554463905, "train_speed(iter/s)": 1.194489 }, { "epoch": 2.3440712816997946, "grad_norm": 1.3604955673217773, "learning_rate": 5.4890812156710446e-05, "loss": 2.5034769058227537, "memory(GiB)": 121.15, "step": 6840, "token_acc": 0.4684796044499382, "train_speed(iter/s)": 1.194596 }, { "epoch": 2.3457847840986976, "grad_norm": 1.4090074300765991, "learning_rate": 5.483723621527197e-05, "loss": 2.3893781661987306, "memory(GiB)": 121.15, "step": 6845, "token_acc": 0.4806135492117597, "train_speed(iter/s)": 1.194661 }, { "epoch": 2.347498286497601, "grad_norm": 1.5924240350723267, "learning_rate": 5.4783654666886864e-05, "loss": 2.389159393310547, "memory(GiB)": 121.15, "step": 6850, "token_acc": 0.4803010181496237, "train_speed(iter/s)": 1.194674 }, { "epoch": 2.3492117888965045, "grad_norm": 1.638968825340271, "learning_rate": 5.473006757366263e-05, "loss": 2.305162811279297, "memory(GiB)": 121.15, "step": 6855, "token_acc": 0.5023118957545187, "train_speed(iter/s)": 1.194774 }, { "epoch": 2.350925291295408, "grad_norm": 1.5005874633789062, "learning_rate": 5.467647499771326e-05, "loss": 2.367520332336426, "memory(GiB)": 121.15, "step": 6860, "token_acc": 0.5063613231552163, "train_speed(iter/s)": 1.194664 }, { "epoch": 2.352638793694311, "grad_norm": 1.539304256439209, "learning_rate": 5.46228770011591e-05, "loss": 2.36169319152832, "memory(GiB)": 121.15, "step": 6865, "token_acc": 0.49386084583901774, "train_speed(iter/s)": 1.19471 }, { "epoch": 2.3543522960932144, "grad_norm": 1.7755825519561768, "learning_rate": 5.4569273646126774e-05, "loss": 2.3879039764404295, "memory(GiB)": 121.15, "step": 6870, "token_acc": 0.5025619128949615, "train_speed(iter/s)": 1.194775 }, { "epoch": 2.356065798492118, "grad_norm": 1.5661402940750122, "learning_rate": 5.4515664994749075e-05, "loss": 2.3658180236816406, "memory(GiB)": 121.15, "step": 6875, "token_acc": 0.5048666948793906, "train_speed(iter/s)": 1.194786 }, { "epoch": 2.3577793008910213, "grad_norm": 1.3104101419448853, "learning_rate": 5.446205110916498e-05, "loss": 2.38977108001709, "memory(GiB)": 121.15, "step": 6880, "token_acc": 0.5052809463455852, "train_speed(iter/s)": 1.194883 }, { "epoch": 2.3594928032899247, "grad_norm": 1.4004194736480713, "learning_rate": 5.440843205151953e-05, "loss": 2.263169288635254, "memory(GiB)": 121.15, "step": 6885, "token_acc": 0.5133222314737719, "train_speed(iter/s)": 1.19489 }, { "epoch": 2.361206305688828, "grad_norm": 1.500577688217163, "learning_rate": 5.435480788396374e-05, "loss": 2.3313901901245115, "memory(GiB)": 121.15, "step": 6890, "token_acc": 0.49620758483033933, "train_speed(iter/s)": 1.194909 }, { "epoch": 2.362919808087731, "grad_norm": 1.4060882329940796, "learning_rate": 5.430117866865457e-05, "loss": 2.3845380783081054, "memory(GiB)": 121.15, "step": 6895, "token_acc": 0.5122055674518201, "train_speed(iter/s)": 1.19497 }, { "epoch": 2.3646333104866346, "grad_norm": 1.5456217527389526, "learning_rate": 5.424754446775481e-05, "loss": 2.3208663940429686, "memory(GiB)": 121.15, "step": 6900, "token_acc": 0.5033768572714993, "train_speed(iter/s)": 1.195046 }, { "epoch": 2.366346812885538, "grad_norm": 1.5201215744018555, "learning_rate": 5.4193905343433035e-05, "loss": 2.359409713745117, "memory(GiB)": 121.15, "step": 6905, "token_acc": 0.5002084201750729, "train_speed(iter/s)": 1.195137 }, { "epoch": 2.3680603152844415, "grad_norm": 1.4560779333114624, "learning_rate": 5.414026135786354e-05, "loss": 2.3164196014404297, "memory(GiB)": 121.15, "step": 6910, "token_acc": 0.49622166246851385, "train_speed(iter/s)": 1.19518 }, { "epoch": 2.369773817683345, "grad_norm": 1.3674039840698242, "learning_rate": 5.408661257322627e-05, "loss": 2.3375999450683596, "memory(GiB)": 121.15, "step": 6915, "token_acc": 0.501779359430605, "train_speed(iter/s)": 1.195088 }, { "epoch": 2.371487320082248, "grad_norm": 1.4442931413650513, "learning_rate": 5.4032959051706656e-05, "loss": 2.3510105133056642, "memory(GiB)": 121.15, "step": 6920, "token_acc": 0.5080440304826418, "train_speed(iter/s)": 1.19513 }, { "epoch": 2.3732008224811514, "grad_norm": 1.5852452516555786, "learning_rate": 5.397930085549571e-05, "loss": 2.3874078750610352, "memory(GiB)": 121.15, "step": 6925, "token_acc": 0.4898045879354291, "train_speed(iter/s)": 1.195076 }, { "epoch": 2.374914324880055, "grad_norm": 1.3398293256759644, "learning_rate": 5.392563804678983e-05, "loss": 2.2960575103759764, "memory(GiB)": 121.15, "step": 6930, "token_acc": 0.5080469769464985, "train_speed(iter/s)": 1.195079 }, { "epoch": 2.3766278272789583, "grad_norm": 1.6207385063171387, "learning_rate": 5.387197068779072e-05, "loss": 2.345469284057617, "memory(GiB)": 121.15, "step": 6935, "token_acc": 0.4969749351771824, "train_speed(iter/s)": 1.195151 }, { "epoch": 2.3783413296778617, "grad_norm": 1.4732491970062256, "learning_rate": 5.381829884070541e-05, "loss": 2.3201534271240236, "memory(GiB)": 121.15, "step": 6940, "token_acc": 0.5017761989342806, "train_speed(iter/s)": 1.194934 }, { "epoch": 2.3800548320767647, "grad_norm": 1.5832836627960205, "learning_rate": 5.376462256774614e-05, "loss": 2.3886138916015627, "memory(GiB)": 121.15, "step": 6945, "token_acc": 0.49213483146067416, "train_speed(iter/s)": 1.194977 }, { "epoch": 2.381768334475668, "grad_norm": 1.52571439743042, "learning_rate": 5.371094193113022e-05, "loss": 2.390026092529297, "memory(GiB)": 121.15, "step": 6950, "token_acc": 0.4973821989528796, "train_speed(iter/s)": 1.19501 }, { "epoch": 2.3834818368745716, "grad_norm": 1.609789252281189, "learning_rate": 5.365725699308006e-05, "loss": 2.392902946472168, "memory(GiB)": 121.15, "step": 6955, "token_acc": 0.49761801645734083, "train_speed(iter/s)": 1.195051 }, { "epoch": 2.385195339273475, "grad_norm": 1.5214262008666992, "learning_rate": 5.3603567815823076e-05, "loss": 2.3147933959960936, "memory(GiB)": 121.15, "step": 6960, "token_acc": 0.5071927661323469, "train_speed(iter/s)": 1.195093 }, { "epoch": 2.3869088416723785, "grad_norm": 1.34889554977417, "learning_rate": 5.354987446159156e-05, "loss": 2.2500625610351563, "memory(GiB)": 121.15, "step": 6965, "token_acc": 0.5145210229735587, "train_speed(iter/s)": 1.194922 }, { "epoch": 2.3886223440712815, "grad_norm": 1.5107507705688477, "learning_rate": 5.3496176992622636e-05, "loss": 2.4435401916503907, "memory(GiB)": 121.15, "step": 6970, "token_acc": 0.48482220294882916, "train_speed(iter/s)": 1.194914 }, { "epoch": 2.390335846470185, "grad_norm": 1.4289759397506714, "learning_rate": 5.344247547115825e-05, "loss": 2.302109146118164, "memory(GiB)": 121.15, "step": 6975, "token_acc": 0.5223427331887202, "train_speed(iter/s)": 1.194802 }, { "epoch": 2.3920493488690884, "grad_norm": 1.5578374862670898, "learning_rate": 5.3388769959444995e-05, "loss": 2.3175228118896483, "memory(GiB)": 121.15, "step": 6980, "token_acc": 0.4993526111350885, "train_speed(iter/s)": 1.194858 }, { "epoch": 2.393762851267992, "grad_norm": 1.5591692924499512, "learning_rate": 5.333506051973409e-05, "loss": 2.339632034301758, "memory(GiB)": 121.15, "step": 6985, "token_acc": 0.4973404255319149, "train_speed(iter/s)": 1.194833 }, { "epoch": 2.3954763536668953, "grad_norm": 1.6587893962860107, "learning_rate": 5.3281347214281375e-05, "loss": 2.252810478210449, "memory(GiB)": 121.15, "step": 6990, "token_acc": 0.49546351084812623, "train_speed(iter/s)": 1.194829 }, { "epoch": 2.3971898560657987, "grad_norm": 1.4374006986618042, "learning_rate": 5.3227630105347094e-05, "loss": 2.2849411010742187, "memory(GiB)": 121.15, "step": 6995, "token_acc": 0.5066059225512528, "train_speed(iter/s)": 1.194879 }, { "epoch": 2.3989033584647017, "grad_norm": 1.4497212171554565, "learning_rate": 5.3173909255195896e-05, "loss": 2.293625831604004, "memory(GiB)": 121.15, "step": 7000, "token_acc": 0.5163343232923208, "train_speed(iter/s)": 1.194918 }, { "epoch": 2.3989033584647017, "eval_loss": 1.9679837226867676, "eval_runtime": 3.7727, "eval_samples_per_second": 26.506, "eval_steps_per_second": 26.506, "eval_token_acc": 0.514367816091954, "step": 7000 }, { "epoch": 2.400616860863605, "grad_norm": 1.588849663734436, "learning_rate": 5.312018472609685e-05, "loss": 2.285763740539551, "memory(GiB)": 121.15, "step": 7005, "token_acc": 0.5142954390742002, "train_speed(iter/s)": 1.19389 }, { "epoch": 2.4023303632625086, "grad_norm": 1.5322246551513672, "learning_rate": 5.306645658032321e-05, "loss": 2.3793697357177734, "memory(GiB)": 121.15, "step": 7010, "token_acc": 0.500814332247557, "train_speed(iter/s)": 1.193815 }, { "epoch": 2.404043865661412, "grad_norm": 1.4540693759918213, "learning_rate": 5.3012724880152465e-05, "loss": 2.3162633895874025, "memory(GiB)": 121.15, "step": 7015, "token_acc": 0.49527239981990095, "train_speed(iter/s)": 1.193902 }, { "epoch": 2.405757368060315, "grad_norm": 1.3981907367706299, "learning_rate": 5.295898968786617e-05, "loss": 2.3433401107788088, "memory(GiB)": 121.15, "step": 7020, "token_acc": 0.4968367777309152, "train_speed(iter/s)": 1.193916 }, { "epoch": 2.4074708704592185, "grad_norm": 1.6243805885314941, "learning_rate": 5.2905251065750013e-05, "loss": 2.3892799377441407, "memory(GiB)": 121.15, "step": 7025, "token_acc": 0.48966756513926324, "train_speed(iter/s)": 1.194031 }, { "epoch": 2.409184372858122, "grad_norm": 1.5162036418914795, "learning_rate": 5.285150907609359e-05, "loss": 2.256376266479492, "memory(GiB)": 121.15, "step": 7030, "token_acc": 0.5205298013245033, "train_speed(iter/s)": 1.194118 }, { "epoch": 2.4108978752570254, "grad_norm": 1.5582544803619385, "learning_rate": 5.27977637811904e-05, "loss": 2.3287559509277345, "memory(GiB)": 121.15, "step": 7035, "token_acc": 0.5140939597315436, "train_speed(iter/s)": 1.194041 }, { "epoch": 2.412611377655929, "grad_norm": 1.4745815992355347, "learning_rate": 5.274401524333783e-05, "loss": 2.315740966796875, "memory(GiB)": 121.15, "step": 7040, "token_acc": 0.5050041701417848, "train_speed(iter/s)": 1.194053 }, { "epoch": 2.4143248800548323, "grad_norm": 1.530961036682129, "learning_rate": 5.269026352483697e-05, "loss": 2.3171138763427734, "memory(GiB)": 121.15, "step": 7045, "token_acc": 0.5018021625951141, "train_speed(iter/s)": 1.194005 }, { "epoch": 2.4160383824537353, "grad_norm": 1.508900761604309, "learning_rate": 5.2636508687992617e-05, "loss": 2.285462760925293, "memory(GiB)": 121.15, "step": 7050, "token_acc": 0.5154905335628227, "train_speed(iter/s)": 1.193971 }, { "epoch": 2.4177518848526387, "grad_norm": 1.5905907154083252, "learning_rate": 5.258275079511318e-05, "loss": 2.3980022430419923, "memory(GiB)": 121.15, "step": 7055, "token_acc": 0.5020593080724877, "train_speed(iter/s)": 1.193888 }, { "epoch": 2.419465387251542, "grad_norm": 1.530753254890442, "learning_rate": 5.252898990851063e-05, "loss": 2.3148807525634765, "memory(GiB)": 121.15, "step": 7060, "token_acc": 0.5160324925181702, "train_speed(iter/s)": 1.193884 }, { "epoch": 2.4211788896504456, "grad_norm": 1.6051411628723145, "learning_rate": 5.2475226090500354e-05, "loss": 2.385605049133301, "memory(GiB)": 121.15, "step": 7065, "token_acc": 0.5059687786960514, "train_speed(iter/s)": 1.193919 }, { "epoch": 2.422892392049349, "grad_norm": 1.5238196849822998, "learning_rate": 5.242145940340122e-05, "loss": 2.338776397705078, "memory(GiB)": 121.15, "step": 7070, "token_acc": 0.4912352221769262, "train_speed(iter/s)": 1.193875 }, { "epoch": 2.424605894448252, "grad_norm": 1.3971974849700928, "learning_rate": 5.236768990953533e-05, "loss": 2.3646350860595704, "memory(GiB)": 121.15, "step": 7075, "token_acc": 0.4954682779456193, "train_speed(iter/s)": 1.193955 }, { "epoch": 2.4263193968471555, "grad_norm": 1.5002321004867554, "learning_rate": 5.231391767122813e-05, "loss": 2.2470575332641602, "memory(GiB)": 121.15, "step": 7080, "token_acc": 0.5148822745446469, "train_speed(iter/s)": 1.19403 }, { "epoch": 2.428032899246059, "grad_norm": 1.509620189666748, "learning_rate": 5.226014275080816e-05, "loss": 2.350153350830078, "memory(GiB)": 121.15, "step": 7085, "token_acc": 0.5039787798408488, "train_speed(iter/s)": 1.19407 }, { "epoch": 2.4297464016449624, "grad_norm": 1.6434178352355957, "learning_rate": 5.220636521060713e-05, "loss": 2.3798999786376953, "memory(GiB)": 121.15, "step": 7090, "token_acc": 0.5158590308370044, "train_speed(iter/s)": 1.194139 }, { "epoch": 2.431459904043866, "grad_norm": 1.368102788925171, "learning_rate": 5.215258511295977e-05, "loss": 2.308359909057617, "memory(GiB)": 121.15, "step": 7095, "token_acc": 0.5040387722132472, "train_speed(iter/s)": 1.194149 }, { "epoch": 2.4331734064427692, "grad_norm": 1.5475633144378662, "learning_rate": 5.209880252020377e-05, "loss": 2.3094945907592774, "memory(GiB)": 121.15, "step": 7100, "token_acc": 0.50169779286927, "train_speed(iter/s)": 1.194198 }, { "epoch": 2.4348869088416722, "grad_norm": 1.6065375804901123, "learning_rate": 5.2045017494679696e-05, "loss": 2.372488784790039, "memory(GiB)": 121.15, "step": 7105, "token_acc": 0.5006858710562414, "train_speed(iter/s)": 1.194239 }, { "epoch": 2.4366004112405757, "grad_norm": 1.8230966329574585, "learning_rate": 5.199123009873098e-05, "loss": 2.447874069213867, "memory(GiB)": 121.15, "step": 7110, "token_acc": 0.4870569494225408, "train_speed(iter/s)": 1.194258 }, { "epoch": 2.438313913639479, "grad_norm": 1.4152179956436157, "learning_rate": 5.193744039470374e-05, "loss": 2.381250762939453, "memory(GiB)": 121.15, "step": 7115, "token_acc": 0.49487617421007685, "train_speed(iter/s)": 1.194351 }, { "epoch": 2.4400274160383826, "grad_norm": 1.5421868562698364, "learning_rate": 5.1883648444946845e-05, "loss": 2.425484275817871, "memory(GiB)": 121.15, "step": 7120, "token_acc": 0.4958609271523179, "train_speed(iter/s)": 1.194364 }, { "epoch": 2.4417409184372856, "grad_norm": 1.43189537525177, "learning_rate": 5.182985431181168e-05, "loss": 2.3792652130126952, "memory(GiB)": 121.15, "step": 7125, "token_acc": 0.4881383192601528, "train_speed(iter/s)": 1.194421 }, { "epoch": 2.443454420836189, "grad_norm": 1.5869653224945068, "learning_rate": 5.177605805765222e-05, "loss": 2.3312496185302733, "memory(GiB)": 121.15, "step": 7130, "token_acc": 0.5035225859925404, "train_speed(iter/s)": 1.194444 }, { "epoch": 2.4451679232350925, "grad_norm": 1.6540780067443848, "learning_rate": 5.172225974482491e-05, "loss": 2.3968124389648438, "memory(GiB)": 121.15, "step": 7135, "token_acc": 0.484612228149364, "train_speed(iter/s)": 1.194476 }, { "epoch": 2.446881425633996, "grad_norm": 1.4597054719924927, "learning_rate": 5.166845943568852e-05, "loss": 2.3483306884765627, "memory(GiB)": 121.15, "step": 7140, "token_acc": 0.49240358395013634, "train_speed(iter/s)": 1.194477 }, { "epoch": 2.4485949280328994, "grad_norm": 1.6916561126708984, "learning_rate": 5.161465719260419e-05, "loss": 2.3412771224975586, "memory(GiB)": 121.15, "step": 7145, "token_acc": 0.5131873044255699, "train_speed(iter/s)": 1.194521 }, { "epoch": 2.450308430431803, "grad_norm": 1.5330169200897217, "learning_rate": 5.156085307793528e-05, "loss": 2.1963468551635743, "memory(GiB)": 121.15, "step": 7150, "token_acc": 0.5191304347826087, "train_speed(iter/s)": 1.194553 }, { "epoch": 2.452021932830706, "grad_norm": 1.5275481939315796, "learning_rate": 5.1507047154047296e-05, "loss": 2.275289535522461, "memory(GiB)": 121.15, "step": 7155, "token_acc": 0.5119469026548673, "train_speed(iter/s)": 1.194615 }, { "epoch": 2.4537354352296092, "grad_norm": 1.63747239112854, "learning_rate": 5.145323948330789e-05, "loss": 2.3521400451660157, "memory(GiB)": 121.15, "step": 7160, "token_acc": 0.4885398981324278, "train_speed(iter/s)": 1.194615 }, { "epoch": 2.4554489376285127, "grad_norm": 1.7024023532867432, "learning_rate": 5.139943012808671e-05, "loss": 2.4025547027587892, "memory(GiB)": 121.15, "step": 7165, "token_acc": 0.4920283436669619, "train_speed(iter/s)": 1.194663 }, { "epoch": 2.457162440027416, "grad_norm": 1.6943440437316895, "learning_rate": 5.1345619150755355e-05, "loss": 2.2998117446899413, "memory(GiB)": 121.15, "step": 7170, "token_acc": 0.5126582278481012, "train_speed(iter/s)": 1.194738 }, { "epoch": 2.4588759424263196, "grad_norm": 1.416205883026123, "learning_rate": 5.129180661368732e-05, "loss": 2.317942810058594, "memory(GiB)": 121.15, "step": 7175, "token_acc": 0.5014887282007656, "train_speed(iter/s)": 1.194794 }, { "epoch": 2.4605894448252226, "grad_norm": 1.5384572744369507, "learning_rate": 5.1237992579257885e-05, "loss": 2.31799201965332, "memory(GiB)": 121.15, "step": 7180, "token_acc": 0.5046968403074296, "train_speed(iter/s)": 1.19484 }, { "epoch": 2.462302947224126, "grad_norm": 1.4993733167648315, "learning_rate": 5.118417710984408e-05, "loss": 2.3325096130371095, "memory(GiB)": 121.15, "step": 7185, "token_acc": 0.49671772428884026, "train_speed(iter/s)": 1.194891 }, { "epoch": 2.4640164496230295, "grad_norm": 1.4674266576766968, "learning_rate": 5.1130360267824606e-05, "loss": 2.357491302490234, "memory(GiB)": 121.15, "step": 7190, "token_acc": 0.49894736842105264, "train_speed(iter/s)": 1.194932 }, { "epoch": 2.465729952021933, "grad_norm": 1.570107102394104, "learning_rate": 5.1076542115579725e-05, "loss": 2.3757081985473634, "memory(GiB)": 121.15, "step": 7195, "token_acc": 0.4949847361535107, "train_speed(iter/s)": 1.194973 }, { "epoch": 2.4674434544208363, "grad_norm": 1.5841985940933228, "learning_rate": 5.102272271549127e-05, "loss": 2.3464832305908203, "memory(GiB)": 121.15, "step": 7200, "token_acc": 0.49094650205761314, "train_speed(iter/s)": 1.19502 }, { "epoch": 2.46915695681974, "grad_norm": 1.6016767024993896, "learning_rate": 5.0968902129942455e-05, "loss": 2.4118614196777344, "memory(GiB)": 121.15, "step": 7205, "token_acc": 0.4899443731279418, "train_speed(iter/s)": 1.19504 }, { "epoch": 2.470870459218643, "grad_norm": 1.60132896900177, "learning_rate": 5.091508042131794e-05, "loss": 2.3916694641113283, "memory(GiB)": 121.15, "step": 7210, "token_acc": 0.49721627408993574, "train_speed(iter/s)": 1.1951 }, { "epoch": 2.4725839616175462, "grad_norm": 1.449406623840332, "learning_rate": 5.08612576520036e-05, "loss": 2.2293352127075194, "memory(GiB)": 121.15, "step": 7215, "token_acc": 0.507908611599297, "train_speed(iter/s)": 1.195136 }, { "epoch": 2.4742974640164497, "grad_norm": 1.5954667329788208, "learning_rate": 5.080743388438663e-05, "loss": 2.4155738830566404, "memory(GiB)": 121.15, "step": 7220, "token_acc": 0.503056768558952, "train_speed(iter/s)": 1.195172 }, { "epoch": 2.476010966415353, "grad_norm": 1.4287229776382446, "learning_rate": 5.075360918085532e-05, "loss": 2.3003339767456055, "memory(GiB)": 121.15, "step": 7225, "token_acc": 0.5190555095277548, "train_speed(iter/s)": 1.195227 }, { "epoch": 2.477724468814256, "grad_norm": 1.9896482229232788, "learning_rate": 5.069978360379908e-05, "loss": 2.353251647949219, "memory(GiB)": 121.15, "step": 7230, "token_acc": 0.4929762949956102, "train_speed(iter/s)": 1.195269 }, { "epoch": 2.4794379712131596, "grad_norm": 1.5004627704620361, "learning_rate": 5.06459572156083e-05, "loss": 2.3458791732788087, "memory(GiB)": 121.15, "step": 7235, "token_acc": 0.4941123188405797, "train_speed(iter/s)": 1.195274 }, { "epoch": 2.481151473612063, "grad_norm": 1.5125700235366821, "learning_rate": 5.059213007867434e-05, "loss": 2.2741182327270506, "memory(GiB)": 121.15, "step": 7240, "token_acc": 0.5153039832285116, "train_speed(iter/s)": 1.195274 }, { "epoch": 2.4828649760109665, "grad_norm": 1.5012954473495483, "learning_rate": 5.05383022553894e-05, "loss": 2.3030181884765626, "memory(GiB)": 121.15, "step": 7245, "token_acc": 0.5101863892501084, "train_speed(iter/s)": 1.195293 }, { "epoch": 2.48457847840987, "grad_norm": 1.4519821405410767, "learning_rate": 5.048447380814651e-05, "loss": 2.3383014678955076, "memory(GiB)": 121.15, "step": 7250, "token_acc": 0.5127061970575123, "train_speed(iter/s)": 1.195328 }, { "epoch": 2.4862919808087733, "grad_norm": 1.3907462358474731, "learning_rate": 5.04306447993394e-05, "loss": 2.4530075073242186, "memory(GiB)": 121.15, "step": 7255, "token_acc": 0.49845440494590415, "train_speed(iter/s)": 1.195316 }, { "epoch": 2.4880054832076763, "grad_norm": 1.8549563884735107, "learning_rate": 5.037681529136246e-05, "loss": 2.2786773681640624, "memory(GiB)": 121.15, "step": 7260, "token_acc": 0.5277777777777778, "train_speed(iter/s)": 1.195157 }, { "epoch": 2.48971898560658, "grad_norm": 1.3897548913955688, "learning_rate": 5.032298534661063e-05, "loss": 2.3921844482421877, "memory(GiB)": 121.15, "step": 7265, "token_acc": 0.4907640638119228, "train_speed(iter/s)": 1.195222 }, { "epoch": 2.4914324880054832, "grad_norm": 1.3759584426879883, "learning_rate": 5.026915502747941e-05, "loss": 2.381857490539551, "memory(GiB)": 121.15, "step": 7270, "token_acc": 0.49347917543121583, "train_speed(iter/s)": 1.195248 }, { "epoch": 2.4931459904043867, "grad_norm": 1.7493664026260376, "learning_rate": 5.021532439636468e-05, "loss": 2.2652313232421877, "memory(GiB)": 121.15, "step": 7275, "token_acc": 0.5109423849933006, "train_speed(iter/s)": 1.195234 }, { "epoch": 2.49485949280329, "grad_norm": 1.4227986335754395, "learning_rate": 5.016149351566272e-05, "loss": 2.3111412048339846, "memory(GiB)": 121.15, "step": 7280, "token_acc": 0.5010670081092616, "train_speed(iter/s)": 1.195088 }, { "epoch": 2.496572995202193, "grad_norm": 1.547659993171692, "learning_rate": 5.0107662447770074e-05, "loss": 2.2986936569213867, "memory(GiB)": 121.15, "step": 7285, "token_acc": 0.5136138613861386, "train_speed(iter/s)": 1.195134 }, { "epoch": 2.4982864976010966, "grad_norm": 1.6413965225219727, "learning_rate": 5.005383125508355e-05, "loss": 2.2554689407348634, "memory(GiB)": 121.15, "step": 7290, "token_acc": 0.5155756207674943, "train_speed(iter/s)": 1.195186 }, { "epoch": 2.5, "grad_norm": 1.635085105895996, "learning_rate": 5e-05, "loss": 2.4163904190063477, "memory(GiB)": 121.15, "step": 7295, "token_acc": 0.4914134742404227, "train_speed(iter/s)": 1.195275 }, { "epoch": 2.5017135023989034, "grad_norm": 1.404325246810913, "learning_rate": 4.994616874491646e-05, "loss": 2.3206178665161135, "memory(GiB)": 121.15, "step": 7300, "token_acc": 0.5046025104602511, "train_speed(iter/s)": 1.195309 }, { "epoch": 2.503427004797807, "grad_norm": 1.5174367427825928, "learning_rate": 4.989233755222993e-05, "loss": 2.3507675170898437, "memory(GiB)": 121.15, "step": 7305, "token_acc": 0.4916317991631799, "train_speed(iter/s)": 1.195297 }, { "epoch": 2.5051405071967103, "grad_norm": 1.381820559501648, "learning_rate": 4.9838506484337285e-05, "loss": 2.263645362854004, "memory(GiB)": 121.15, "step": 7310, "token_acc": 0.5174715351393797, "train_speed(iter/s)": 1.195228 }, { "epoch": 2.5068540095956133, "grad_norm": 1.412103533744812, "learning_rate": 4.9784675603635336e-05, "loss": 2.30224666595459, "memory(GiB)": 121.15, "step": 7315, "token_acc": 0.49919289749798224, "train_speed(iter/s)": 1.195208 }, { "epoch": 2.5085675119945168, "grad_norm": 1.7936816215515137, "learning_rate": 4.973084497252061e-05, "loss": 2.2693193435668944, "memory(GiB)": 121.15, "step": 7320, "token_acc": 0.5031525851197982, "train_speed(iter/s)": 1.195086 }, { "epoch": 2.51028101439342, "grad_norm": 1.6619452238082886, "learning_rate": 4.967701465338939e-05, "loss": 2.418718719482422, "memory(GiB)": 121.15, "step": 7325, "token_acc": 0.5011125945705385, "train_speed(iter/s)": 1.195149 }, { "epoch": 2.5119945167923237, "grad_norm": 2.135293960571289, "learning_rate": 4.9623184708637554e-05, "loss": 2.372015190124512, "memory(GiB)": 121.15, "step": 7330, "token_acc": 0.49264069264069266, "train_speed(iter/s)": 1.19524 }, { "epoch": 2.5137080191912267, "grad_norm": 1.4675841331481934, "learning_rate": 4.9569355200660605e-05, "loss": 2.3690780639648437, "memory(GiB)": 121.15, "step": 7335, "token_acc": 0.49471458773784355, "train_speed(iter/s)": 1.195265 }, { "epoch": 2.51542152159013, "grad_norm": 1.5337051153182983, "learning_rate": 4.951552619185349e-05, "loss": 2.3223138809204102, "memory(GiB)": 121.15, "step": 7340, "token_acc": 0.4976689976689977, "train_speed(iter/s)": 1.195238 }, { "epoch": 2.5171350239890335, "grad_norm": 1.5132410526275635, "learning_rate": 4.946169774461061e-05, "loss": 2.3220474243164064, "memory(GiB)": 121.15, "step": 7345, "token_acc": 0.519603424966201, "train_speed(iter/s)": 1.195312 }, { "epoch": 2.518848526387937, "grad_norm": 1.5150254964828491, "learning_rate": 4.940786992132568e-05, "loss": 2.261554145812988, "memory(GiB)": 121.15, "step": 7350, "token_acc": 0.515085264538697, "train_speed(iter/s)": 1.195332 }, { "epoch": 2.5205620287868404, "grad_norm": 1.5119916200637817, "learning_rate": 4.935404278439172e-05, "loss": 2.3613039016723634, "memory(GiB)": 121.15, "step": 7355, "token_acc": 0.5089869281045751, "train_speed(iter/s)": 1.195354 }, { "epoch": 2.522275531185744, "grad_norm": 1.6254961490631104, "learning_rate": 4.930021639620093e-05, "loss": 2.313271141052246, "memory(GiB)": 121.15, "step": 7360, "token_acc": 0.5090128755364807, "train_speed(iter/s)": 1.195309 }, { "epoch": 2.523989033584647, "grad_norm": 1.6993882656097412, "learning_rate": 4.924639081914469e-05, "loss": 2.307618522644043, "memory(GiB)": 121.15, "step": 7365, "token_acc": 0.5063063063063064, "train_speed(iter/s)": 1.195277 }, { "epoch": 2.5257025359835503, "grad_norm": 1.5780612230300903, "learning_rate": 4.919256611561338e-05, "loss": 2.3083660125732424, "memory(GiB)": 121.15, "step": 7370, "token_acc": 0.5203215721304154, "train_speed(iter/s)": 1.195234 }, { "epoch": 2.5274160383824538, "grad_norm": 1.6634838581085205, "learning_rate": 4.91387423479964e-05, "loss": 2.3030588150024416, "memory(GiB)": 121.15, "step": 7375, "token_acc": 0.5038428693424424, "train_speed(iter/s)": 1.195262 }, { "epoch": 2.529129540781357, "grad_norm": 1.691180944442749, "learning_rate": 4.908491957868209e-05, "loss": 2.313739013671875, "memory(GiB)": 121.15, "step": 7380, "token_acc": 0.5117117117117117, "train_speed(iter/s)": 1.19525 }, { "epoch": 2.53084304318026, "grad_norm": 1.5353107452392578, "learning_rate": 4.903109787005756e-05, "loss": 2.287641143798828, "memory(GiB)": 121.15, "step": 7385, "token_acc": 0.5035778175313059, "train_speed(iter/s)": 1.195279 }, { "epoch": 2.5325565455791637, "grad_norm": 1.3877042531967163, "learning_rate": 4.897727728450875e-05, "loss": 2.2915273666381837, "memory(GiB)": 121.15, "step": 7390, "token_acc": 0.5174765558397272, "train_speed(iter/s)": 1.195261 }, { "epoch": 2.534270047978067, "grad_norm": 1.5948596000671387, "learning_rate": 4.892345788442028e-05, "loss": 2.3240692138671877, "memory(GiB)": 121.15, "step": 7395, "token_acc": 0.49782608695652175, "train_speed(iter/s)": 1.195209 }, { "epoch": 2.5359835503769705, "grad_norm": 1.5354081392288208, "learning_rate": 4.886963973217541e-05, "loss": 2.334819030761719, "memory(GiB)": 121.15, "step": 7400, "token_acc": 0.5082322357019065, "train_speed(iter/s)": 1.19522 }, { "epoch": 2.537697052775874, "grad_norm": 1.774024248123169, "learning_rate": 4.8815822890155924e-05, "loss": 2.3925508499145507, "memory(GiB)": 121.15, "step": 7405, "token_acc": 0.49284511784511786, "train_speed(iter/s)": 1.195249 }, { "epoch": 2.5394105551747774, "grad_norm": 1.4770512580871582, "learning_rate": 4.876200742074213e-05, "loss": 2.26761474609375, "memory(GiB)": 121.15, "step": 7410, "token_acc": 0.5144897102057959, "train_speed(iter/s)": 1.195258 }, { "epoch": 2.541124057573681, "grad_norm": 1.3953367471694946, "learning_rate": 4.87081933863127e-05, "loss": 2.308322525024414, "memory(GiB)": 121.15, "step": 7415, "token_acc": 0.509771986970684, "train_speed(iter/s)": 1.195288 }, { "epoch": 2.542837559972584, "grad_norm": 1.424299955368042, "learning_rate": 4.865438084924466e-05, "loss": 2.340998077392578, "memory(GiB)": 121.15, "step": 7420, "token_acc": 0.5050675675675675, "train_speed(iter/s)": 1.195289 }, { "epoch": 2.5445510623714873, "grad_norm": 1.609476089477539, "learning_rate": 4.860056987191329e-05, "loss": 2.3494537353515623, "memory(GiB)": 121.15, "step": 7425, "token_acc": 0.5105808194506979, "train_speed(iter/s)": 1.195243 }, { "epoch": 2.5462645647703908, "grad_norm": 1.5335489511489868, "learning_rate": 4.854676051669212e-05, "loss": 2.280948257446289, "memory(GiB)": 121.15, "step": 7430, "token_acc": 0.5102125885785744, "train_speed(iter/s)": 1.195285 }, { "epoch": 2.547978067169294, "grad_norm": 1.498374581336975, "learning_rate": 4.84929528459527e-05, "loss": 2.4019479751586914, "memory(GiB)": 121.15, "step": 7435, "token_acc": 0.4997802197802198, "train_speed(iter/s)": 1.195217 }, { "epoch": 2.549691569568197, "grad_norm": 1.2700639963150024, "learning_rate": 4.8439146922064726e-05, "loss": 2.1908967971801756, "memory(GiB)": 121.15, "step": 7440, "token_acc": 0.5057716973065413, "train_speed(iter/s)": 1.195205 }, { "epoch": 2.5514050719671006, "grad_norm": 1.4304579496383667, "learning_rate": 4.8385342807395824e-05, "loss": 2.325701904296875, "memory(GiB)": 121.15, "step": 7445, "token_acc": 0.5036216446527482, "train_speed(iter/s)": 1.195299 }, { "epoch": 2.553118574366004, "grad_norm": 1.6815003156661987, "learning_rate": 4.8331540564311495e-05, "loss": 2.2641437530517576, "memory(GiB)": 121.15, "step": 7450, "token_acc": 0.5040883074407195, "train_speed(iter/s)": 1.195125 }, { "epoch": 2.5548320767649075, "grad_norm": 1.6596577167510986, "learning_rate": 4.82777402551751e-05, "loss": 2.2608016967773437, "memory(GiB)": 121.15, "step": 7455, "token_acc": 0.5050738007380073, "train_speed(iter/s)": 1.19515 }, { "epoch": 2.556545579163811, "grad_norm": 1.5335220098495483, "learning_rate": 4.8223941942347786e-05, "loss": 2.337635612487793, "memory(GiB)": 121.15, "step": 7460, "token_acc": 0.4927762592737212, "train_speed(iter/s)": 1.195153 }, { "epoch": 2.5582590815627144, "grad_norm": 1.4331127405166626, "learning_rate": 4.817014568818833e-05, "loss": 2.278610610961914, "memory(GiB)": 121.15, "step": 7465, "token_acc": 0.5124003542958371, "train_speed(iter/s)": 1.195157 }, { "epoch": 2.5599725839616174, "grad_norm": 1.7014871835708618, "learning_rate": 4.811635155505316e-05, "loss": 2.349204254150391, "memory(GiB)": 121.15, "step": 7470, "token_acc": 0.5030434782608696, "train_speed(iter/s)": 1.195207 }, { "epoch": 2.561686086360521, "grad_norm": 1.5167806148529053, "learning_rate": 4.806255960529627e-05, "loss": 2.279090690612793, "memory(GiB)": 121.15, "step": 7475, "token_acc": 0.5063054647361046, "train_speed(iter/s)": 1.195086 }, { "epoch": 2.5633995887594243, "grad_norm": 1.609417200088501, "learning_rate": 4.8008769901269045e-05, "loss": 2.2417139053344726, "memory(GiB)": 121.15, "step": 7480, "token_acc": 0.5111308993766697, "train_speed(iter/s)": 1.195147 }, { "epoch": 2.5651130911583278, "grad_norm": 1.6447162628173828, "learning_rate": 4.7954982505320315e-05, "loss": 2.3150257110595702, "memory(GiB)": 121.15, "step": 7485, "token_acc": 0.4986922406277245, "train_speed(iter/s)": 1.195031 }, { "epoch": 2.5668265935572308, "grad_norm": 1.511910319328308, "learning_rate": 4.7901197479796236e-05, "loss": 2.3166667938232424, "memory(GiB)": 121.15, "step": 7490, "token_acc": 0.5028150714595063, "train_speed(iter/s)": 1.195068 }, { "epoch": 2.568540095956134, "grad_norm": 1.6304092407226562, "learning_rate": 4.7847414887040235e-05, "loss": 2.318666458129883, "memory(GiB)": 121.15, "step": 7495, "token_acc": 0.5063235935455734, "train_speed(iter/s)": 1.19516 }, { "epoch": 2.5702535983550376, "grad_norm": 1.4574332237243652, "learning_rate": 4.779363478939287e-05, "loss": 2.317428398132324, "memory(GiB)": 121.15, "step": 7500, "token_acc": 0.5220900594732371, "train_speed(iter/s)": 1.195054 }, { "epoch": 2.5702535983550376, "eval_loss": 2.075268030166626, "eval_runtime": 3.701, "eval_samples_per_second": 27.019, "eval_steps_per_second": 27.019, "eval_token_acc": 0.4871099050203528, "step": 7500 }, { "epoch": 2.571967100753941, "grad_norm": 1.541059970855713, "learning_rate": 4.773985724919185e-05, "loss": 2.4888280868530273, "memory(GiB)": 121.15, "step": 7505, "token_acc": 0.4837905236907731, "train_speed(iter/s)": 1.194138 }, { "epoch": 2.5736806031528445, "grad_norm": 1.5735913515090942, "learning_rate": 4.7686082328771896e-05, "loss": 2.394282913208008, "memory(GiB)": 121.15, "step": 7510, "token_acc": 0.4852941176470588, "train_speed(iter/s)": 1.194147 }, { "epoch": 2.575394105551748, "grad_norm": 1.4405877590179443, "learning_rate": 4.763231009046468e-05, "loss": 2.312963104248047, "memory(GiB)": 121.15, "step": 7515, "token_acc": 0.5039175257731959, "train_speed(iter/s)": 1.194129 }, { "epoch": 2.577107607950651, "grad_norm": 1.6420897245407104, "learning_rate": 4.7578540596598794e-05, "loss": 2.371402549743652, "memory(GiB)": 121.15, "step": 7520, "token_acc": 0.4899615548910722, "train_speed(iter/s)": 1.193961 }, { "epoch": 2.5788211103495544, "grad_norm": 1.5806632041931152, "learning_rate": 4.752477390949965e-05, "loss": 2.367742156982422, "memory(GiB)": 121.15, "step": 7525, "token_acc": 0.4876676763305928, "train_speed(iter/s)": 1.194017 }, { "epoch": 2.580534612748458, "grad_norm": 1.3208173513412476, "learning_rate": 4.7471010091489385e-05, "loss": 2.3084186553955077, "memory(GiB)": 121.15, "step": 7530, "token_acc": 0.5083263946711074, "train_speed(iter/s)": 1.19396 }, { "epoch": 2.5822481151473613, "grad_norm": 1.550072431564331, "learning_rate": 4.741724920488682e-05, "loss": 2.444365692138672, "memory(GiB)": 121.15, "step": 7535, "token_acc": 0.4943061999156474, "train_speed(iter/s)": 1.194017 }, { "epoch": 2.5839616175462643, "grad_norm": 1.6925206184387207, "learning_rate": 4.73634913120074e-05, "loss": 2.390179252624512, "memory(GiB)": 121.15, "step": 7540, "token_acc": 0.4877944325481799, "train_speed(iter/s)": 1.19403 }, { "epoch": 2.5856751199451677, "grad_norm": 1.6336580514907837, "learning_rate": 4.730973647516305e-05, "loss": 2.249700736999512, "memory(GiB)": 121.15, "step": 7545, "token_acc": 0.5227272727272727, "train_speed(iter/s)": 1.193999 }, { "epoch": 2.587388622344071, "grad_norm": 1.4343419075012207, "learning_rate": 4.725598475666218e-05, "loss": 2.3861656188964844, "memory(GiB)": 121.15, "step": 7550, "token_acc": 0.5055187637969095, "train_speed(iter/s)": 1.194017 }, { "epoch": 2.5891021247429746, "grad_norm": 1.700991153717041, "learning_rate": 4.720223621880961e-05, "loss": 2.424510955810547, "memory(GiB)": 121.15, "step": 7555, "token_acc": 0.5036590615583297, "train_speed(iter/s)": 1.194066 }, { "epoch": 2.590815627141878, "grad_norm": 1.3990066051483154, "learning_rate": 4.714849092390642e-05, "loss": 2.393366241455078, "memory(GiB)": 121.15, "step": 7560, "token_acc": 0.5076855511638121, "train_speed(iter/s)": 1.194128 }, { "epoch": 2.5925291295407815, "grad_norm": 1.6342215538024902, "learning_rate": 4.709474893424999e-05, "loss": 2.317106246948242, "memory(GiB)": 121.15, "step": 7565, "token_acc": 0.49031095087877424, "train_speed(iter/s)": 1.194152 }, { "epoch": 2.594242631939685, "grad_norm": 1.5739423036575317, "learning_rate": 4.704101031213383e-05, "loss": 2.4369693756103517, "memory(GiB)": 121.15, "step": 7570, "token_acc": 0.4907016060862215, "train_speed(iter/s)": 1.194188 }, { "epoch": 2.595956134338588, "grad_norm": 1.5699642896652222, "learning_rate": 4.698727511984756e-05, "loss": 2.309399223327637, "memory(GiB)": 121.15, "step": 7575, "token_acc": 0.5125256673511294, "train_speed(iter/s)": 1.194159 }, { "epoch": 2.5976696367374914, "grad_norm": 1.5580517053604126, "learning_rate": 4.6933543419676804e-05, "loss": 2.348634147644043, "memory(GiB)": 121.15, "step": 7580, "token_acc": 0.49938398357289526, "train_speed(iter/s)": 1.194216 }, { "epoch": 2.599383139136395, "grad_norm": 1.5406012535095215, "learning_rate": 4.687981527390315e-05, "loss": 2.3261281967163088, "memory(GiB)": 121.15, "step": 7585, "token_acc": 0.4872331519464211, "train_speed(iter/s)": 1.194278 }, { "epoch": 2.6010966415352983, "grad_norm": 1.484816551208496, "learning_rate": 4.682609074480411e-05, "loss": 2.2784019470214845, "memory(GiB)": 121.15, "step": 7590, "token_acc": 0.5192714951291826, "train_speed(iter/s)": 1.194276 }, { "epoch": 2.6028101439342013, "grad_norm": 1.490928053855896, "learning_rate": 4.6772369894652924e-05, "loss": 2.3847660064697265, "memory(GiB)": 121.15, "step": 7595, "token_acc": 0.48001682793437106, "train_speed(iter/s)": 1.194271 }, { "epoch": 2.6045236463331047, "grad_norm": 1.5727627277374268, "learning_rate": 4.671865278571864e-05, "loss": 2.377579116821289, "memory(GiB)": 121.15, "step": 7600, "token_acc": 0.49763481551561023, "train_speed(iter/s)": 1.194316 }, { "epoch": 2.606237148732008, "grad_norm": 1.6534818410873413, "learning_rate": 4.666493948026592e-05, "loss": 2.4059757232666015, "memory(GiB)": 121.15, "step": 7605, "token_acc": 0.4834166288050886, "train_speed(iter/s)": 1.194355 }, { "epoch": 2.6079506511309116, "grad_norm": 1.4290753602981567, "learning_rate": 4.661123004055503e-05, "loss": 2.2393672943115233, "memory(GiB)": 121.15, "step": 7610, "token_acc": 0.5296477931341953, "train_speed(iter/s)": 1.194327 }, { "epoch": 2.609664153529815, "grad_norm": 1.5047563314437866, "learning_rate": 4.655752452884177e-05, "loss": 2.3309404373168947, "memory(GiB)": 121.15, "step": 7615, "token_acc": 0.5099052540913006, "train_speed(iter/s)": 1.194375 }, { "epoch": 2.6113776559287185, "grad_norm": 1.668607234954834, "learning_rate": 4.6503823007377376e-05, "loss": 2.1794424057006836, "memory(GiB)": 121.15, "step": 7620, "token_acc": 0.5168181818181818, "train_speed(iter/s)": 1.194437 }, { "epoch": 2.6130911583276215, "grad_norm": 1.5128235816955566, "learning_rate": 4.645012553840845e-05, "loss": 2.3129377365112305, "memory(GiB)": 121.15, "step": 7625, "token_acc": 0.5075440067057837, "train_speed(iter/s)": 1.194466 }, { "epoch": 2.614804660726525, "grad_norm": 1.4701507091522217, "learning_rate": 4.639643218417693e-05, "loss": 2.463394546508789, "memory(GiB)": 121.15, "step": 7630, "token_acc": 0.49381135296628254, "train_speed(iter/s)": 1.194444 }, { "epoch": 2.6165181631254284, "grad_norm": 1.5262147188186646, "learning_rate": 4.634274300691994e-05, "loss": 2.361238479614258, "memory(GiB)": 121.15, "step": 7635, "token_acc": 0.5094588649362076, "train_speed(iter/s)": 1.194503 }, { "epoch": 2.618231665524332, "grad_norm": 1.5861908197402954, "learning_rate": 4.6289058068869805e-05, "loss": 2.3775842666625975, "memory(GiB)": 121.15, "step": 7640, "token_acc": 0.49322033898305084, "train_speed(iter/s)": 1.194546 }, { "epoch": 2.619945167923235, "grad_norm": 1.4569517374038696, "learning_rate": 4.623537743225388e-05, "loss": 2.372156524658203, "memory(GiB)": 121.15, "step": 7645, "token_acc": 0.5016447368421053, "train_speed(iter/s)": 1.194558 }, { "epoch": 2.6216586703221383, "grad_norm": 1.5843819379806519, "learning_rate": 4.61817011592946e-05, "loss": 2.24230842590332, "memory(GiB)": 121.15, "step": 7650, "token_acc": 0.5115594787725936, "train_speed(iter/s)": 1.194557 }, { "epoch": 2.6233721727210417, "grad_norm": 1.5532374382019043, "learning_rate": 4.61280293122093e-05, "loss": 2.2843559265136717, "memory(GiB)": 121.15, "step": 7655, "token_acc": 0.5047276001800991, "train_speed(iter/s)": 1.194591 }, { "epoch": 2.625085675119945, "grad_norm": 1.5231555700302124, "learning_rate": 4.607436195321018e-05, "loss": 2.296681785583496, "memory(GiB)": 121.15, "step": 7660, "token_acc": 0.5135371179039301, "train_speed(iter/s)": 1.194622 }, { "epoch": 2.6267991775188486, "grad_norm": 1.5731053352355957, "learning_rate": 4.602069914450429e-05, "loss": 2.419519805908203, "memory(GiB)": 121.15, "step": 7665, "token_acc": 0.5039267015706806, "train_speed(iter/s)": 1.194649 }, { "epoch": 2.628512679917752, "grad_norm": 1.5793185234069824, "learning_rate": 4.596704094829336e-05, "loss": 2.3800386428833007, "memory(GiB)": 121.15, "step": 7670, "token_acc": 0.49809402795425667, "train_speed(iter/s)": 1.194665 }, { "epoch": 2.6302261823166555, "grad_norm": 2.2961971759796143, "learning_rate": 4.591338742677375e-05, "loss": 2.289373588562012, "memory(GiB)": 121.15, "step": 7675, "token_acc": 0.5215686274509804, "train_speed(iter/s)": 1.194601 }, { "epoch": 2.6319396847155585, "grad_norm": 1.5880624055862427, "learning_rate": 4.585973864213647e-05, "loss": 2.4122234344482423, "memory(GiB)": 121.15, "step": 7680, "token_acc": 0.48739837398373986, "train_speed(iter/s)": 1.194569 }, { "epoch": 2.633653187114462, "grad_norm": 1.672987937927246, "learning_rate": 4.580609465656697e-05, "loss": 2.354288673400879, "memory(GiB)": 121.15, "step": 7685, "token_acc": 0.5193936691930451, "train_speed(iter/s)": 1.194597 }, { "epoch": 2.6353666895133654, "grad_norm": 1.3561818599700928, "learning_rate": 4.57524555322452e-05, "loss": 2.2361894607543946, "memory(GiB)": 121.15, "step": 7690, "token_acc": 0.5154166666666666, "train_speed(iter/s)": 1.194644 }, { "epoch": 2.637080191912269, "grad_norm": 1.5481877326965332, "learning_rate": 4.5698821331345446e-05, "loss": 2.2494586944580077, "memory(GiB)": 121.15, "step": 7695, "token_acc": 0.5235602094240838, "train_speed(iter/s)": 1.194687 }, { "epoch": 2.638793694311172, "grad_norm": 1.7519561052322388, "learning_rate": 4.564519211603626e-05, "loss": 2.4199432373046874, "memory(GiB)": 121.15, "step": 7700, "token_acc": 0.4903303787268332, "train_speed(iter/s)": 1.194698 }, { "epoch": 2.6405071967100753, "grad_norm": 1.4188923835754395, "learning_rate": 4.559156794848049e-05, "loss": 2.258086013793945, "memory(GiB)": 121.15, "step": 7705, "token_acc": 0.5194570135746607, "train_speed(iter/s)": 1.194597 }, { "epoch": 2.6422206991089787, "grad_norm": 1.4117377996444702, "learning_rate": 4.553794889083503e-05, "loss": 2.248155403137207, "memory(GiB)": 121.15, "step": 7710, "token_acc": 0.524881003894418, "train_speed(iter/s)": 1.194613 }, { "epoch": 2.643934201507882, "grad_norm": 1.6772141456604004, "learning_rate": 4.5484335005250944e-05, "loss": 2.409458541870117, "memory(GiB)": 121.15, "step": 7715, "token_acc": 0.5045417010734929, "train_speed(iter/s)": 1.194587 }, { "epoch": 2.6456477039067856, "grad_norm": 1.5146554708480835, "learning_rate": 4.5430726353873245e-05, "loss": 2.35659236907959, "memory(GiB)": 121.15, "step": 7720, "token_acc": 0.5041072200605274, "train_speed(iter/s)": 1.19465 }, { "epoch": 2.647361206305689, "grad_norm": 1.6051379442214966, "learning_rate": 4.5377122998840906e-05, "loss": 2.4062095642089845, "memory(GiB)": 121.15, "step": 7725, "token_acc": 0.4984828781967924, "train_speed(iter/s)": 1.194664 }, { "epoch": 2.649074708704592, "grad_norm": 1.6774990558624268, "learning_rate": 4.532352500228674e-05, "loss": 2.3464563369750975, "memory(GiB)": 121.15, "step": 7730, "token_acc": 0.5084359325125399, "train_speed(iter/s)": 1.194476 }, { "epoch": 2.6507882111034955, "grad_norm": 1.6093635559082031, "learning_rate": 4.5269932426337404e-05, "loss": 2.3259143829345703, "memory(GiB)": 121.15, "step": 7735, "token_acc": 0.49872340425531914, "train_speed(iter/s)": 1.194504 }, { "epoch": 2.652501713502399, "grad_norm": 1.8166879415512085, "learning_rate": 4.521634533311316e-05, "loss": 2.29683837890625, "memory(GiB)": 121.15, "step": 7740, "token_acc": 0.5096406660823839, "train_speed(iter/s)": 1.194538 }, { "epoch": 2.6542152159013024, "grad_norm": 1.4583966732025146, "learning_rate": 4.516276378472804e-05, "loss": 2.2735269546508787, "memory(GiB)": 121.15, "step": 7745, "token_acc": 0.5079365079365079, "train_speed(iter/s)": 1.194587 }, { "epoch": 2.6559287183002054, "grad_norm": 1.537288784980774, "learning_rate": 4.5109187843289566e-05, "loss": 2.2741714477539063, "memory(GiB)": 121.15, "step": 7750, "token_acc": 0.5069914298601714, "train_speed(iter/s)": 1.194631 }, { "epoch": 2.657642220699109, "grad_norm": 1.6596726179122925, "learning_rate": 4.5055617570898786e-05, "loss": 2.292998695373535, "memory(GiB)": 121.15, "step": 7755, "token_acc": 0.5048625792811839, "train_speed(iter/s)": 1.194614 }, { "epoch": 2.6593557230980123, "grad_norm": 1.656222939491272, "learning_rate": 4.500205302965019e-05, "loss": 2.3317237854003907, "memory(GiB)": 121.15, "step": 7760, "token_acc": 0.5019902697921274, "train_speed(iter/s)": 1.194632 }, { "epoch": 2.6610692254969157, "grad_norm": 1.5245237350463867, "learning_rate": 4.494849428163161e-05, "loss": 2.428821563720703, "memory(GiB)": 121.15, "step": 7765, "token_acc": 0.48128559804719284, "train_speed(iter/s)": 1.194611 }, { "epoch": 2.662782727895819, "grad_norm": 1.506361484527588, "learning_rate": 4.489494138892419e-05, "loss": 2.325776481628418, "memory(GiB)": 121.15, "step": 7770, "token_acc": 0.5074565037282519, "train_speed(iter/s)": 1.194629 }, { "epoch": 2.6644962302947226, "grad_norm": 1.4178224802017212, "learning_rate": 4.4841394413602215e-05, "loss": 2.4216896057128907, "memory(GiB)": 121.15, "step": 7775, "token_acc": 0.4968421052631579, "train_speed(iter/s)": 1.194687 }, { "epoch": 2.666209732693626, "grad_norm": 1.3920022249221802, "learning_rate": 4.478785341773318e-05, "loss": 2.249222755432129, "memory(GiB)": 121.15, "step": 7780, "token_acc": 0.5142250530785563, "train_speed(iter/s)": 1.194677 }, { "epoch": 2.667923235092529, "grad_norm": 1.576805830001831, "learning_rate": 4.473431846337763e-05, "loss": 2.3988792419433596, "memory(GiB)": 121.15, "step": 7785, "token_acc": 0.4945722970039079, "train_speed(iter/s)": 1.194717 }, { "epoch": 2.6696367374914325, "grad_norm": 1.5764384269714355, "learning_rate": 4.4680789612589126e-05, "loss": 2.391432189941406, "memory(GiB)": 121.15, "step": 7790, "token_acc": 0.5069944891903349, "train_speed(iter/s)": 1.194709 }, { "epoch": 2.671350239890336, "grad_norm": 1.6187599897384644, "learning_rate": 4.462726692741409e-05, "loss": 2.3387897491455076, "memory(GiB)": 121.15, "step": 7795, "token_acc": 0.5023295213892418, "train_speed(iter/s)": 1.194751 }, { "epoch": 2.673063742289239, "grad_norm": 1.490241527557373, "learning_rate": 4.457375046989189e-05, "loss": 2.310940170288086, "memory(GiB)": 121.15, "step": 7800, "token_acc": 0.5122736418511067, "train_speed(iter/s)": 1.194719 }, { "epoch": 2.6747772446881424, "grad_norm": 1.4336426258087158, "learning_rate": 4.452024030205459e-05, "loss": 2.2496538162231445, "memory(GiB)": 121.15, "step": 7805, "token_acc": 0.5161818926669398, "train_speed(iter/s)": 1.194751 }, { "epoch": 2.676490747087046, "grad_norm": 1.4850397109985352, "learning_rate": 4.446673648592701e-05, "loss": 2.345110321044922, "memory(GiB)": 121.15, "step": 7810, "token_acc": 0.5027932960893855, "train_speed(iter/s)": 1.194784 }, { "epoch": 2.6782042494859493, "grad_norm": 1.552417516708374, "learning_rate": 4.44132390835266e-05, "loss": 2.3712160110473635, "memory(GiB)": 121.15, "step": 7815, "token_acc": 0.5076252723311547, "train_speed(iter/s)": 1.194857 }, { "epoch": 2.6799177518848527, "grad_norm": 1.5863691568374634, "learning_rate": 4.435974815686338e-05, "loss": 2.387053108215332, "memory(GiB)": 121.15, "step": 7820, "token_acc": 0.4866694879390605, "train_speed(iter/s)": 1.19467 }, { "epoch": 2.681631254283756, "grad_norm": 1.6476383209228516, "learning_rate": 4.4306263767939835e-05, "loss": 2.2598926544189455, "memory(GiB)": 121.15, "step": 7825, "token_acc": 0.5093600348280366, "train_speed(iter/s)": 1.194684 }, { "epoch": 2.6833447566826596, "grad_norm": 1.6482765674591064, "learning_rate": 4.4252785978750904e-05, "loss": 2.327539825439453, "memory(GiB)": 121.15, "step": 7830, "token_acc": 0.5151389464952302, "train_speed(iter/s)": 1.194697 }, { "epoch": 2.6850582590815626, "grad_norm": 1.5128191709518433, "learning_rate": 4.4199314851283894e-05, "loss": 2.418465805053711, "memory(GiB)": 121.15, "step": 7835, "token_acc": 0.48433530906011857, "train_speed(iter/s)": 1.194734 }, { "epoch": 2.686771761480466, "grad_norm": 1.6309679746627808, "learning_rate": 4.4145850447518335e-05, "loss": 2.434351348876953, "memory(GiB)": 121.15, "step": 7840, "token_acc": 0.4847611202635914, "train_speed(iter/s)": 1.19476 }, { "epoch": 2.6884852638793695, "grad_norm": 1.487942099571228, "learning_rate": 4.4092392829425975e-05, "loss": 2.334889030456543, "memory(GiB)": 121.15, "step": 7845, "token_acc": 0.48675914249684743, "train_speed(iter/s)": 1.194666 }, { "epoch": 2.690198766278273, "grad_norm": 1.6186355352401733, "learning_rate": 4.4038942058970736e-05, "loss": 2.210173988342285, "memory(GiB)": 121.15, "step": 7850, "token_acc": 0.523036419482229, "train_speed(iter/s)": 1.194694 }, { "epoch": 2.691912268677176, "grad_norm": 1.4891568422317505, "learning_rate": 4.398549819810858e-05, "loss": 2.2804634094238283, "memory(GiB)": 121.15, "step": 7855, "token_acc": 0.5186308492201039, "train_speed(iter/s)": 1.194634 }, { "epoch": 2.6936257710760794, "grad_norm": 1.835591197013855, "learning_rate": 4.393206130878745e-05, "loss": 2.4341314315795897, "memory(GiB)": 121.15, "step": 7860, "token_acc": 0.4991408934707904, "train_speed(iter/s)": 1.19467 }, { "epoch": 2.695339273474983, "grad_norm": 1.4837253093719482, "learning_rate": 4.387863145294724e-05, "loss": 2.3773256301879884, "memory(GiB)": 121.15, "step": 7865, "token_acc": 0.4991166077738516, "train_speed(iter/s)": 1.194703 }, { "epoch": 2.6970527758738863, "grad_norm": 1.5675833225250244, "learning_rate": 4.382520869251964e-05, "loss": 2.297439193725586, "memory(GiB)": 121.15, "step": 7870, "token_acc": 0.5049668874172185, "train_speed(iter/s)": 1.194573 }, { "epoch": 2.6987662782727897, "grad_norm": 1.8794087171554565, "learning_rate": 4.377179308942814e-05, "loss": 2.339019775390625, "memory(GiB)": 121.15, "step": 7875, "token_acc": 0.5071189279731994, "train_speed(iter/s)": 1.19461 }, { "epoch": 2.700479780671693, "grad_norm": 1.4613685607910156, "learning_rate": 4.3718384705587946e-05, "loss": 2.3379371643066404, "memory(GiB)": 121.15, "step": 7880, "token_acc": 0.4881763527054108, "train_speed(iter/s)": 1.19459 }, { "epoch": 2.702193283070596, "grad_norm": 1.4891860485076904, "learning_rate": 4.3664983602905905e-05, "loss": 2.3434640884399416, "memory(GiB)": 121.15, "step": 7885, "token_acc": 0.5044788273615635, "train_speed(iter/s)": 1.19462 }, { "epoch": 2.7039067854694996, "grad_norm": 1.5342968702316284, "learning_rate": 4.3611589843280386e-05, "loss": 2.2409011840820314, "memory(GiB)": 121.15, "step": 7890, "token_acc": 0.5145175064047822, "train_speed(iter/s)": 1.194594 }, { "epoch": 2.705620287868403, "grad_norm": 1.5139434337615967, "learning_rate": 4.355820348860129e-05, "loss": 2.343944549560547, "memory(GiB)": 121.15, "step": 7895, "token_acc": 0.5149347917543121, "train_speed(iter/s)": 1.194557 }, { "epoch": 2.7073337902673065, "grad_norm": 1.474655270576477, "learning_rate": 4.3504824600749894e-05, "loss": 2.3923971176147463, "memory(GiB)": 121.15, "step": 7900, "token_acc": 0.4905349794238683, "train_speed(iter/s)": 1.194588 }, { "epoch": 2.7090472926662095, "grad_norm": 1.607720971107483, "learning_rate": 4.345145324159884e-05, "loss": 2.286652946472168, "memory(GiB)": 121.15, "step": 7905, "token_acc": 0.5051546391752577, "train_speed(iter/s)": 1.194535 }, { "epoch": 2.710760795065113, "grad_norm": 1.5493136644363403, "learning_rate": 4.339808947301204e-05, "loss": 2.35056095123291, "memory(GiB)": 121.15, "step": 7910, "token_acc": 0.5087950747581355, "train_speed(iter/s)": 1.19459 }, { "epoch": 2.7124742974640164, "grad_norm": 1.4807928800582886, "learning_rate": 4.334473335684462e-05, "loss": 2.4301631927490233, "memory(GiB)": 121.15, "step": 7915, "token_acc": 0.5073007926574885, "train_speed(iter/s)": 1.194628 }, { "epoch": 2.71418779986292, "grad_norm": 1.5576765537261963, "learning_rate": 4.329138495494282e-05, "loss": 2.3195945739746096, "memory(GiB)": 121.15, "step": 7920, "token_acc": 0.5069165551093262, "train_speed(iter/s)": 1.194585 }, { "epoch": 2.7159013022618232, "grad_norm": 1.7450246810913086, "learning_rate": 4.323804432914393e-05, "loss": 2.306224060058594, "memory(GiB)": 121.15, "step": 7925, "token_acc": 0.5015224010439322, "train_speed(iter/s)": 1.194611 }, { "epoch": 2.7176148046607267, "grad_norm": 1.5348069667816162, "learning_rate": 4.318471154127626e-05, "loss": 2.318011665344238, "memory(GiB)": 121.15, "step": 7930, "token_acc": 0.5106014712245781, "train_speed(iter/s)": 1.19457 }, { "epoch": 2.71932830705963, "grad_norm": 1.446719765663147, "learning_rate": 4.3131386653159025e-05, "loss": 2.3717636108398437, "memory(GiB)": 121.15, "step": 7935, "token_acc": 0.49391012179756405, "train_speed(iter/s)": 1.194621 }, { "epoch": 2.721041809458533, "grad_norm": 1.6707332134246826, "learning_rate": 4.3078069726602235e-05, "loss": 2.3105705261230467, "memory(GiB)": 121.15, "step": 7940, "token_acc": 0.5012244897959184, "train_speed(iter/s)": 1.194625 }, { "epoch": 2.7227553118574366, "grad_norm": 1.6985622644424438, "learning_rate": 4.302476082340674e-05, "loss": 2.400469207763672, "memory(GiB)": 121.15, "step": 7945, "token_acc": 0.5004340277777778, "train_speed(iter/s)": 1.194536 }, { "epoch": 2.72446881425634, "grad_norm": 1.6710022687911987, "learning_rate": 4.297146000536403e-05, "loss": 2.3377248764038088, "memory(GiB)": 121.15, "step": 7950, "token_acc": 0.5077051228654728, "train_speed(iter/s)": 1.19451 }, { "epoch": 2.7261823166552435, "grad_norm": 1.6361571550369263, "learning_rate": 4.291816733425628e-05, "loss": 2.2505565643310548, "memory(GiB)": 121.15, "step": 7955, "token_acc": 0.5205655526992288, "train_speed(iter/s)": 1.194583 }, { "epoch": 2.7278958190541465, "grad_norm": 1.4578886032104492, "learning_rate": 4.286488287185619e-05, "loss": 2.3281957626342775, "memory(GiB)": 121.15, "step": 7960, "token_acc": 0.5082608695652174, "train_speed(iter/s)": 1.194617 }, { "epoch": 2.72960932145305, "grad_norm": 1.4619203805923462, "learning_rate": 4.281160667992697e-05, "loss": 2.3070764541625977, "memory(GiB)": 121.15, "step": 7965, "token_acc": 0.5113043478260869, "train_speed(iter/s)": 1.194699 }, { "epoch": 2.7313228238519534, "grad_norm": 1.3941497802734375, "learning_rate": 4.275833882022216e-05, "loss": 2.380165863037109, "memory(GiB)": 121.15, "step": 7970, "token_acc": 0.4993581514762516, "train_speed(iter/s)": 1.194641 }, { "epoch": 2.733036326250857, "grad_norm": 1.4948551654815674, "learning_rate": 4.270507935448576e-05, "loss": 2.292185592651367, "memory(GiB)": 121.15, "step": 7975, "token_acc": 0.511330861145447, "train_speed(iter/s)": 1.194655 }, { "epoch": 2.7347498286497602, "grad_norm": 1.5580098628997803, "learning_rate": 4.2651828344451986e-05, "loss": 2.276301956176758, "memory(GiB)": 121.15, "step": 7980, "token_acc": 0.5175321793164669, "train_speed(iter/s)": 1.194621 }, { "epoch": 2.7364633310486637, "grad_norm": 1.445989966392517, "learning_rate": 4.2598585851845214e-05, "loss": 2.4366626739501953, "memory(GiB)": 121.15, "step": 7985, "token_acc": 0.4822607260726073, "train_speed(iter/s)": 1.194644 }, { "epoch": 2.7381768334475667, "grad_norm": 1.5924949645996094, "learning_rate": 4.2545351938380016e-05, "loss": 2.277659606933594, "memory(GiB)": 121.15, "step": 7990, "token_acc": 0.5180935569285083, "train_speed(iter/s)": 1.194686 }, { "epoch": 2.73989033584647, "grad_norm": 1.5362380743026733, "learning_rate": 4.249212666576098e-05, "loss": 2.3536664962768556, "memory(GiB)": 121.15, "step": 7995, "token_acc": 0.4913480885311871, "train_speed(iter/s)": 1.19476 }, { "epoch": 2.7416038382453736, "grad_norm": 1.5266494750976562, "learning_rate": 4.243891009568271e-05, "loss": 2.289084625244141, "memory(GiB)": 121.15, "step": 8000, "token_acc": 0.5079297042434634, "train_speed(iter/s)": 1.194815 }, { "epoch": 2.7416038382453736, "eval_loss": 2.195467233657837, "eval_runtime": 3.702, "eval_samples_per_second": 27.013, "eval_steps_per_second": 27.013, "eval_token_acc": 0.47368421052631576, "step": 8000 }, { "epoch": 2.743317340644277, "grad_norm": 1.5534812211990356, "learning_rate": 4.238570228982965e-05, "loss": 2.3567989349365233, "memory(GiB)": 121.15, "step": 8005, "token_acc": 0.4882275997383911, "train_speed(iter/s)": 1.193706 }, { "epoch": 2.74503084304318, "grad_norm": 1.3857719898223877, "learning_rate": 4.2332503309876185e-05, "loss": 2.2657005310058596, "memory(GiB)": 121.15, "step": 8010, "token_acc": 0.5112431056427662, "train_speed(iter/s)": 1.193769 }, { "epoch": 2.7467443454420835, "grad_norm": 1.691359043121338, "learning_rate": 4.227931321748639e-05, "loss": 2.3738174438476562, "memory(GiB)": 121.15, "step": 8015, "token_acc": 0.48814563928873833, "train_speed(iter/s)": 1.193824 }, { "epoch": 2.748457847840987, "grad_norm": 1.6939215660095215, "learning_rate": 4.2226132074314076e-05, "loss": 2.2456254959106445, "memory(GiB)": 121.15, "step": 8020, "token_acc": 0.5216101694915254, "train_speed(iter/s)": 1.193878 }, { "epoch": 2.7501713502398903, "grad_norm": 1.4141733646392822, "learning_rate": 4.217295994200268e-05, "loss": 2.279380989074707, "memory(GiB)": 121.15, "step": 8025, "token_acc": 0.5135371179039301, "train_speed(iter/s)": 1.193832 }, { "epoch": 2.751884852638794, "grad_norm": 1.538428783416748, "learning_rate": 4.2119796882185206e-05, "loss": 2.2963930130004884, "memory(GiB)": 121.15, "step": 8030, "token_acc": 0.5111940298507462, "train_speed(iter/s)": 1.193884 }, { "epoch": 2.7535983550376972, "grad_norm": 1.4675339460372925, "learning_rate": 4.206664295648407e-05, "loss": 2.2779502868652344, "memory(GiB)": 121.15, "step": 8035, "token_acc": 0.5190746753246753, "train_speed(iter/s)": 1.193944 }, { "epoch": 2.7553118574366007, "grad_norm": 1.5948301553726196, "learning_rate": 4.2013498226511194e-05, "loss": 2.4128034591674803, "memory(GiB)": 121.15, "step": 8040, "token_acc": 0.4892739273927393, "train_speed(iter/s)": 1.193897 }, { "epoch": 2.7570253598355037, "grad_norm": 1.6910170316696167, "learning_rate": 4.1960362753867775e-05, "loss": 2.3000394821166994, "memory(GiB)": 121.15, "step": 8045, "token_acc": 0.5139905294877314, "train_speed(iter/s)": 1.193924 }, { "epoch": 2.758738862234407, "grad_norm": 1.5060713291168213, "learning_rate": 4.190723660014433e-05, "loss": 2.370680236816406, "memory(GiB)": 121.15, "step": 8050, "token_acc": 0.49521149241819634, "train_speed(iter/s)": 1.193913 }, { "epoch": 2.7604523646333106, "grad_norm": 1.6503150463104248, "learning_rate": 4.185411982692056e-05, "loss": 2.3592708587646483, "memory(GiB)": 121.15, "step": 8055, "token_acc": 0.4995777027027027, "train_speed(iter/s)": 1.19378 }, { "epoch": 2.762165867032214, "grad_norm": 1.5340155363082886, "learning_rate": 4.1801012495765234e-05, "loss": 2.3560302734375, "memory(GiB)": 121.15, "step": 8060, "token_acc": 0.4929275610801543, "train_speed(iter/s)": 1.193814 }, { "epoch": 2.763879369431117, "grad_norm": 1.7234855890274048, "learning_rate": 4.1747914668236285e-05, "loss": 2.3352184295654297, "memory(GiB)": 121.15, "step": 8065, "token_acc": 0.5061349693251533, "train_speed(iter/s)": 1.193807 }, { "epoch": 2.7655928718300204, "grad_norm": 1.6149938106536865, "learning_rate": 4.169482640588052e-05, "loss": 2.403522491455078, "memory(GiB)": 121.15, "step": 8070, "token_acc": 0.499558693733451, "train_speed(iter/s)": 1.193788 }, { "epoch": 2.767306374228924, "grad_norm": 1.6134692430496216, "learning_rate": 4.1641747770233744e-05, "loss": 2.2868879318237303, "memory(GiB)": 121.15, "step": 8075, "token_acc": 0.5110533159947984, "train_speed(iter/s)": 1.193619 }, { "epoch": 2.7690198766278273, "grad_norm": 1.6043286323547363, "learning_rate": 4.1588678822820534e-05, "loss": 2.368855667114258, "memory(GiB)": 121.15, "step": 8080, "token_acc": 0.5004159733777038, "train_speed(iter/s)": 1.193679 }, { "epoch": 2.770733379026731, "grad_norm": 1.6397167444229126, "learning_rate": 4.15356196251543e-05, "loss": 2.305200958251953, "memory(GiB)": 121.15, "step": 8085, "token_acc": 0.4901272487933304, "train_speed(iter/s)": 1.193718 }, { "epoch": 2.7724468814256342, "grad_norm": 1.5862244367599487, "learning_rate": 4.14825702387371e-05, "loss": 2.3542110443115236, "memory(GiB)": 121.15, "step": 8090, "token_acc": 0.509349593495935, "train_speed(iter/s)": 1.193673 }, { "epoch": 2.7741603838245372, "grad_norm": 1.5267062187194824, "learning_rate": 4.14295307250597e-05, "loss": 2.4073612213134767, "memory(GiB)": 121.15, "step": 8095, "token_acc": 0.49206349206349204, "train_speed(iter/s)": 1.193519 }, { "epoch": 2.7758738862234407, "grad_norm": 1.5997434854507446, "learning_rate": 4.137650114560127e-05, "loss": 2.297922134399414, "memory(GiB)": 121.15, "step": 8100, "token_acc": 0.505838198498749, "train_speed(iter/s)": 1.193529 }, { "epoch": 2.777587388622344, "grad_norm": 1.508156418800354, "learning_rate": 4.132348156182963e-05, "loss": 2.390484428405762, "memory(GiB)": 121.15, "step": 8105, "token_acc": 0.4909315746084089, "train_speed(iter/s)": 1.19349 }, { "epoch": 2.7793008910212476, "grad_norm": 1.5065027475357056, "learning_rate": 4.1270472035200915e-05, "loss": 2.275900459289551, "memory(GiB)": 121.15, "step": 8110, "token_acc": 0.49345162653147445, "train_speed(iter/s)": 1.193557 }, { "epoch": 2.7810143934201506, "grad_norm": 1.6494981050491333, "learning_rate": 4.121747262715963e-05, "loss": 2.378416061401367, "memory(GiB)": 121.15, "step": 8115, "token_acc": 0.5008795074758136, "train_speed(iter/s)": 1.193534 }, { "epoch": 2.782727895819054, "grad_norm": 1.4966726303100586, "learning_rate": 4.1164483399138576e-05, "loss": 2.317842483520508, "memory(GiB)": 121.15, "step": 8120, "token_acc": 0.5075398535114175, "train_speed(iter/s)": 1.193562 }, { "epoch": 2.7844413982179574, "grad_norm": 1.5789120197296143, "learning_rate": 4.11115044125587e-05, "loss": 2.3286500930786134, "memory(GiB)": 121.15, "step": 8125, "token_acc": 0.49847892220773576, "train_speed(iter/s)": 1.19359 }, { "epoch": 2.786154900616861, "grad_norm": 1.562319278717041, "learning_rate": 4.105853572882914e-05, "loss": 2.4296287536621093, "memory(GiB)": 121.15, "step": 8130, "token_acc": 0.4817902588854761, "train_speed(iter/s)": 1.193655 }, { "epoch": 2.7878684030157643, "grad_norm": 1.5127248764038086, "learning_rate": 4.100557740934703e-05, "loss": 2.3824228286743163, "memory(GiB)": 121.15, "step": 8135, "token_acc": 0.5067453625632378, "train_speed(iter/s)": 1.193692 }, { "epoch": 2.7895819054146678, "grad_norm": 1.5931386947631836, "learning_rate": 4.095262951549753e-05, "loss": 2.2854990005493163, "memory(GiB)": 121.15, "step": 8140, "token_acc": 0.5011246063877642, "train_speed(iter/s)": 1.193636 }, { "epoch": 2.7912954078135708, "grad_norm": 1.4190576076507568, "learning_rate": 4.0899692108653696e-05, "loss": 2.327110481262207, "memory(GiB)": 121.15, "step": 8145, "token_acc": 0.510239651416122, "train_speed(iter/s)": 1.193706 }, { "epoch": 2.793008910212474, "grad_norm": 1.631678819656372, "learning_rate": 4.084676525017647e-05, "loss": 2.2685739517211916, "memory(GiB)": 121.15, "step": 8150, "token_acc": 0.5036297640653358, "train_speed(iter/s)": 1.193727 }, { "epoch": 2.7947224126113777, "grad_norm": 1.730875015258789, "learning_rate": 4.0793849001414494e-05, "loss": 2.3732883453369142, "memory(GiB)": 121.15, "step": 8155, "token_acc": 0.4981195152528207, "train_speed(iter/s)": 1.193663 }, { "epoch": 2.796435915010281, "grad_norm": 1.6100255250930786, "learning_rate": 4.0740943423704196e-05, "loss": 2.3098613739013674, "memory(GiB)": 121.15, "step": 8160, "token_acc": 0.5122897800776197, "train_speed(iter/s)": 1.19371 }, { "epoch": 2.798149417409184, "grad_norm": 1.518561840057373, "learning_rate": 4.068804857836955e-05, "loss": 2.3199153900146485, "memory(GiB)": 121.15, "step": 8165, "token_acc": 0.5030837004405286, "train_speed(iter/s)": 1.19377 }, { "epoch": 2.7998629198080875, "grad_norm": 1.5999711751937866, "learning_rate": 4.063516452672216e-05, "loss": 2.3154571533203123, "memory(GiB)": 121.15, "step": 8170, "token_acc": 0.5, "train_speed(iter/s)": 1.193837 }, { "epoch": 2.801576422206991, "grad_norm": 1.5593599081039429, "learning_rate": 4.058229133006108e-05, "loss": 2.3897918701171874, "memory(GiB)": 121.15, "step": 8175, "token_acc": 0.4920430107526882, "train_speed(iter/s)": 1.193871 }, { "epoch": 2.8032899246058944, "grad_norm": 1.675803780555725, "learning_rate": 4.05294290496728e-05, "loss": 2.387475776672363, "memory(GiB)": 121.15, "step": 8180, "token_acc": 0.4987363100252738, "train_speed(iter/s)": 1.193866 }, { "epoch": 2.805003427004798, "grad_norm": 1.5203194618225098, "learning_rate": 4.047657774683113e-05, "loss": 2.1906909942626953, "memory(GiB)": 121.15, "step": 8185, "token_acc": 0.5254919499105546, "train_speed(iter/s)": 1.193833 }, { "epoch": 2.8067169294037013, "grad_norm": 1.6084851026535034, "learning_rate": 4.04237374827972e-05, "loss": 2.3288955688476562, "memory(GiB)": 121.15, "step": 8190, "token_acc": 0.5081266039349872, "train_speed(iter/s)": 1.193733 }, { "epoch": 2.8084304318026048, "grad_norm": 1.4567300081253052, "learning_rate": 4.0370908318819324e-05, "loss": 2.3196266174316404, "memory(GiB)": 121.15, "step": 8195, "token_acc": 0.519290465631929, "train_speed(iter/s)": 1.193763 }, { "epoch": 2.8101439342015078, "grad_norm": 1.5536800622940063, "learning_rate": 4.031809031613292e-05, "loss": 2.3541946411132812, "memory(GiB)": 121.15, "step": 8200, "token_acc": 0.49834983498349833, "train_speed(iter/s)": 1.193801 }, { "epoch": 2.811857436600411, "grad_norm": 1.5415433645248413, "learning_rate": 4.026528353596049e-05, "loss": 2.37197265625, "memory(GiB)": 121.15, "step": 8205, "token_acc": 0.509075907590759, "train_speed(iter/s)": 1.193775 }, { "epoch": 2.8135709389993147, "grad_norm": 1.528678297996521, "learning_rate": 4.0212488039511555e-05, "loss": 2.269755744934082, "memory(GiB)": 121.15, "step": 8210, "token_acc": 0.5160875160875161, "train_speed(iter/s)": 1.193799 }, { "epoch": 2.815284441398218, "grad_norm": 1.4955675601959229, "learning_rate": 4.015970388798253e-05, "loss": 2.317470169067383, "memory(GiB)": 121.15, "step": 8215, "token_acc": 0.5066195939982348, "train_speed(iter/s)": 1.193811 }, { "epoch": 2.816997943797121, "grad_norm": 2.1870265007019043, "learning_rate": 4.010693114255667e-05, "loss": 2.292193794250488, "memory(GiB)": 121.15, "step": 8220, "token_acc": 0.5015787099684258, "train_speed(iter/s)": 1.193825 }, { "epoch": 2.8187114461960245, "grad_norm": 1.5041214227676392, "learning_rate": 4.0054169864404037e-05, "loss": 2.21290283203125, "memory(GiB)": 121.15, "step": 8225, "token_acc": 0.5125398996808026, "train_speed(iter/s)": 1.193879 }, { "epoch": 2.820424948594928, "grad_norm": 1.6564319133758545, "learning_rate": 4.0001420114681384e-05, "loss": 2.3638093948364256, "memory(GiB)": 121.15, "step": 8230, "token_acc": 0.49253112033195023, "train_speed(iter/s)": 1.193878 }, { "epoch": 2.8221384509938314, "grad_norm": 1.682062029838562, "learning_rate": 3.994868195453209e-05, "loss": 2.3263677597045898, "memory(GiB)": 121.15, "step": 8235, "token_acc": 0.50736568457539, "train_speed(iter/s)": 1.193931 }, { "epoch": 2.823851953392735, "grad_norm": 1.5947937965393066, "learning_rate": 3.9895955445086116e-05, "loss": 2.224066162109375, "memory(GiB)": 121.15, "step": 8240, "token_acc": 0.5113082039911309, "train_speed(iter/s)": 1.19399 }, { "epoch": 2.8255654557916383, "grad_norm": 1.4926961660385132, "learning_rate": 3.984324064745993e-05, "loss": 2.237146759033203, "memory(GiB)": 121.15, "step": 8245, "token_acc": 0.5119930222416049, "train_speed(iter/s)": 1.19398 }, { "epoch": 2.8272789581905413, "grad_norm": 1.7023284435272217, "learning_rate": 3.979053762275641e-05, "loss": 2.2267377853393553, "memory(GiB)": 121.15, "step": 8250, "token_acc": 0.5138528138528139, "train_speed(iter/s)": 1.194014 }, { "epoch": 2.8289924605894448, "grad_norm": 1.61626398563385, "learning_rate": 3.973784643206478e-05, "loss": 2.371209716796875, "memory(GiB)": 121.15, "step": 8255, "token_acc": 0.49939344925192075, "train_speed(iter/s)": 1.193977 }, { "epoch": 2.830705962988348, "grad_norm": 1.7128705978393555, "learning_rate": 3.968516713646056e-05, "loss": 2.28033390045166, "memory(GiB)": 121.15, "step": 8260, "token_acc": 0.505303351718286, "train_speed(iter/s)": 1.193949 }, { "epoch": 2.8324194653872516, "grad_norm": 1.7926982641220093, "learning_rate": 3.963249979700548e-05, "loss": 2.3882879257202148, "memory(GiB)": 121.15, "step": 8265, "token_acc": 0.4933675652545999, "train_speed(iter/s)": 1.193905 }, { "epoch": 2.8341329677861546, "grad_norm": 1.7257990837097168, "learning_rate": 3.957984447474739e-05, "loss": 2.274164390563965, "memory(GiB)": 121.15, "step": 8270, "token_acc": 0.5072655217965654, "train_speed(iter/s)": 1.193737 }, { "epoch": 2.835846470185058, "grad_norm": 1.7262338399887085, "learning_rate": 3.9527201230720255e-05, "loss": 2.3277303695678713, "memory(GiB)": 121.15, "step": 8275, "token_acc": 0.5108892921960072, "train_speed(iter/s)": 1.193778 }, { "epoch": 2.8375599725839615, "grad_norm": 1.4948838949203491, "learning_rate": 3.9474570125943995e-05, "loss": 2.378707504272461, "memory(GiB)": 121.15, "step": 8280, "token_acc": 0.4940107393638992, "train_speed(iter/s)": 1.193763 }, { "epoch": 2.839273474982865, "grad_norm": 1.5111466646194458, "learning_rate": 3.9421951221424484e-05, "loss": 2.1894765853881837, "memory(GiB)": 121.15, "step": 8285, "token_acc": 0.518699910952805, "train_speed(iter/s)": 1.193828 }, { "epoch": 2.8409869773817684, "grad_norm": 1.5055327415466309, "learning_rate": 3.9369344578153475e-05, "loss": 2.348608207702637, "memory(GiB)": 121.15, "step": 8290, "token_acc": 0.4825345247766044, "train_speed(iter/s)": 1.193916 }, { "epoch": 2.842700479780672, "grad_norm": 1.5417581796646118, "learning_rate": 3.931675025710845e-05, "loss": 2.2956409454345703, "memory(GiB)": 121.15, "step": 8295, "token_acc": 0.512853470437018, "train_speed(iter/s)": 1.193955 }, { "epoch": 2.8444139821795753, "grad_norm": 1.521349310874939, "learning_rate": 3.926416831925264e-05, "loss": 2.2991729736328126, "memory(GiB)": 121.15, "step": 8300, "token_acc": 0.5180722891566265, "train_speed(iter/s)": 1.193978 }, { "epoch": 2.8461274845784783, "grad_norm": 1.4372450113296509, "learning_rate": 3.921159882553494e-05, "loss": 2.305103302001953, "memory(GiB)": 121.15, "step": 8305, "token_acc": 0.5097389142146705, "train_speed(iter/s)": 1.193901 }, { "epoch": 2.8478409869773817, "grad_norm": 1.7641798257827759, "learning_rate": 3.91590418368898e-05, "loss": 2.372893714904785, "memory(GiB)": 121.15, "step": 8310, "token_acc": 0.5040101308569016, "train_speed(iter/s)": 1.19393 }, { "epoch": 2.849554489376285, "grad_norm": 1.6648492813110352, "learning_rate": 3.910649741423717e-05, "loss": 2.291603660583496, "memory(GiB)": 121.15, "step": 8315, "token_acc": 0.5224018475750577, "train_speed(iter/s)": 1.193955 }, { "epoch": 2.8512679917751886, "grad_norm": 1.488171100616455, "learning_rate": 3.9053965618482445e-05, "loss": 2.3863039016723633, "memory(GiB)": 121.15, "step": 8320, "token_acc": 0.494971578487101, "train_speed(iter/s)": 1.193949 }, { "epoch": 2.8529814941740916, "grad_norm": 1.4774889945983887, "learning_rate": 3.9001446510516415e-05, "loss": 2.302927780151367, "memory(GiB)": 121.15, "step": 8325, "token_acc": 0.4979691307879773, "train_speed(iter/s)": 1.193956 }, { "epoch": 2.854694996572995, "grad_norm": 1.6324632167816162, "learning_rate": 3.894894015121507e-05, "loss": 2.2618579864501953, "memory(GiB)": 121.15, "step": 8330, "token_acc": 0.5205104831358249, "train_speed(iter/s)": 1.193963 }, { "epoch": 2.8564084989718985, "grad_norm": 1.467706561088562, "learning_rate": 3.889644660143973e-05, "loss": 2.398915100097656, "memory(GiB)": 121.15, "step": 8335, "token_acc": 0.4954769736842105, "train_speed(iter/s)": 1.19382 }, { "epoch": 2.858122001370802, "grad_norm": 1.6994136571884155, "learning_rate": 3.8843965922036814e-05, "loss": 2.373299980163574, "memory(GiB)": 121.15, "step": 8340, "token_acc": 0.49167733674775926, "train_speed(iter/s)": 1.193861 }, { "epoch": 2.8598355037697054, "grad_norm": 1.6510921716690063, "learning_rate": 3.87914981738378e-05, "loss": 2.2412174224853514, "memory(GiB)": 121.15, "step": 8345, "token_acc": 0.5269745360379802, "train_speed(iter/s)": 1.19386 }, { "epoch": 2.861549006168609, "grad_norm": 1.5934479236602783, "learning_rate": 3.873904341765925e-05, "loss": 2.4190753936767577, "memory(GiB)": 121.15, "step": 8350, "token_acc": 0.4872192099147947, "train_speed(iter/s)": 1.193876 }, { "epoch": 2.863262508567512, "grad_norm": 1.554694414138794, "learning_rate": 3.8686601714302606e-05, "loss": 2.2826189041137694, "memory(GiB)": 121.15, "step": 8355, "token_acc": 0.5154073448712537, "train_speed(iter/s)": 1.193908 }, { "epoch": 2.8649760109664153, "grad_norm": 1.6742122173309326, "learning_rate": 3.863417312455417e-05, "loss": 2.263763999938965, "memory(GiB)": 121.15, "step": 8360, "token_acc": 0.5165938864628821, "train_speed(iter/s)": 1.193952 }, { "epoch": 2.8666895133653187, "grad_norm": 1.5960415601730347, "learning_rate": 3.8581757709185104e-05, "loss": 2.2684038162231444, "memory(GiB)": 121.15, "step": 8365, "token_acc": 0.5043554006968641, "train_speed(iter/s)": 1.19403 }, { "epoch": 2.868403015764222, "grad_norm": 1.5087251663208008, "learning_rate": 3.852935552895126e-05, "loss": 2.389534759521484, "memory(GiB)": 121.15, "step": 8370, "token_acc": 0.4922039612305099, "train_speed(iter/s)": 1.193987 }, { "epoch": 2.870116518163125, "grad_norm": 1.4981257915496826, "learning_rate": 3.8476966644593135e-05, "loss": 2.274373245239258, "memory(GiB)": 121.15, "step": 8375, "token_acc": 0.5122791900043084, "train_speed(iter/s)": 1.194065 }, { "epoch": 2.8718300205620286, "grad_norm": 1.6126751899719238, "learning_rate": 3.8424591116835856e-05, "loss": 2.338445281982422, "memory(GiB)": 121.15, "step": 8380, "token_acc": 0.49706375838926176, "train_speed(iter/s)": 1.194077 }, { "epoch": 2.873543522960932, "grad_norm": 1.4883484840393066, "learning_rate": 3.837222900638905e-05, "loss": 2.331761932373047, "memory(GiB)": 121.15, "step": 8385, "token_acc": 0.4940402794903411, "train_speed(iter/s)": 1.194004 }, { "epoch": 2.8752570253598355, "grad_norm": 1.428598165512085, "learning_rate": 3.8319880373946795e-05, "loss": 2.3549156188964844, "memory(GiB)": 121.15, "step": 8390, "token_acc": 0.5014397367338543, "train_speed(iter/s)": 1.194056 }, { "epoch": 2.876970527758739, "grad_norm": 1.374932050704956, "learning_rate": 3.826754528018749e-05, "loss": 2.306890869140625, "memory(GiB)": 121.15, "step": 8395, "token_acc": 0.5045909849749582, "train_speed(iter/s)": 1.194023 }, { "epoch": 2.8786840301576424, "grad_norm": 1.545599102973938, "learning_rate": 3.821522378577395e-05, "loss": 2.4571741104125975, "memory(GiB)": 121.15, "step": 8400, "token_acc": 0.4760259179265659, "train_speed(iter/s)": 1.193982 }, { "epoch": 2.880397532556546, "grad_norm": 1.650925636291504, "learning_rate": 3.816291595135312e-05, "loss": 2.430350494384766, "memory(GiB)": 121.15, "step": 8405, "token_acc": 0.48231785257775883, "train_speed(iter/s)": 1.194001 }, { "epoch": 2.882111034955449, "grad_norm": 1.4905668497085571, "learning_rate": 3.8110621837556185e-05, "loss": 2.3991750717163085, "memory(GiB)": 121.15, "step": 8410, "token_acc": 0.4884318766066838, "train_speed(iter/s)": 1.193897 }, { "epoch": 2.8838245373543523, "grad_norm": 1.457558512687683, "learning_rate": 3.805834150499841e-05, "loss": 2.2778852462768553, "memory(GiB)": 121.15, "step": 8415, "token_acc": 0.4978883153449085, "train_speed(iter/s)": 1.193925 }, { "epoch": 2.8855380397532557, "grad_norm": 1.6635801792144775, "learning_rate": 3.8006075014279065e-05, "loss": 2.3462146759033202, "memory(GiB)": 121.15, "step": 8420, "token_acc": 0.49313358302122345, "train_speed(iter/s)": 1.193888 }, { "epoch": 2.887251542152159, "grad_norm": 1.4479472637176514, "learning_rate": 3.795382242598136e-05, "loss": 2.4357574462890623, "memory(GiB)": 121.15, "step": 8425, "token_acc": 0.4926184926184926, "train_speed(iter/s)": 1.193887 }, { "epoch": 2.888965044551062, "grad_norm": 1.6438034772872925, "learning_rate": 3.7901583800672445e-05, "loss": 2.372876739501953, "memory(GiB)": 121.15, "step": 8430, "token_acc": 0.4984297891431135, "train_speed(iter/s)": 1.193927 }, { "epoch": 2.8906785469499656, "grad_norm": 1.5155333280563354, "learning_rate": 3.784935919890326e-05, "loss": 2.2738758087158204, "memory(GiB)": 121.15, "step": 8435, "token_acc": 0.5063884156729132, "train_speed(iter/s)": 1.19378 }, { "epoch": 2.892392049348869, "grad_norm": 1.597977876663208, "learning_rate": 3.779714868120847e-05, "loss": 2.3466915130615233, "memory(GiB)": 121.15, "step": 8440, "token_acc": 0.4901330885727398, "train_speed(iter/s)": 1.193774 }, { "epoch": 2.8941055517477725, "grad_norm": 1.5892610549926758, "learning_rate": 3.774495230810645e-05, "loss": 2.334659194946289, "memory(GiB)": 121.15, "step": 8445, "token_acc": 0.5014736842105263, "train_speed(iter/s)": 1.193813 }, { "epoch": 2.895819054146676, "grad_norm": 1.4805757999420166, "learning_rate": 3.7692770140099145e-05, "loss": 2.3323972702026365, "memory(GiB)": 121.15, "step": 8450, "token_acc": 0.5046689303904923, "train_speed(iter/s)": 1.193834 }, { "epoch": 2.8975325565455794, "grad_norm": 1.4544373750686646, "learning_rate": 3.764060223767208e-05, "loss": 2.407394218444824, "memory(GiB)": 121.15, "step": 8455, "token_acc": 0.4841648590021692, "train_speed(iter/s)": 1.19389 }, { "epoch": 2.8992460589444824, "grad_norm": 1.5834975242614746, "learning_rate": 3.7588448661294165e-05, "loss": 2.2694728851318358, "memory(GiB)": 121.15, "step": 8460, "token_acc": 0.5057736720554272, "train_speed(iter/s)": 1.193754 }, { "epoch": 2.900959561343386, "grad_norm": 1.5493239164352417, "learning_rate": 3.75363094714178e-05, "loss": 2.2653213500976563, "memory(GiB)": 121.15, "step": 8465, "token_acc": 0.5130127922364358, "train_speed(iter/s)": 1.193805 }, { "epoch": 2.9026730637422893, "grad_norm": 1.612074613571167, "learning_rate": 3.748418472847864e-05, "loss": 2.318099784851074, "memory(GiB)": 121.15, "step": 8470, "token_acc": 0.5, "train_speed(iter/s)": 1.193793 }, { "epoch": 2.9043865661411927, "grad_norm": 1.5632963180541992, "learning_rate": 3.743207449289562e-05, "loss": 2.2269054412841798, "memory(GiB)": 121.15, "step": 8475, "token_acc": 0.5040983606557377, "train_speed(iter/s)": 1.19381 }, { "epoch": 2.9061000685400957, "grad_norm": 1.5124746561050415, "learning_rate": 3.737997882507088e-05, "loss": 2.3735084533691406, "memory(GiB)": 121.15, "step": 8480, "token_acc": 0.5076595744680851, "train_speed(iter/s)": 1.193817 }, { "epoch": 2.907813570938999, "grad_norm": 1.881354570388794, "learning_rate": 3.732789778538964e-05, "loss": 2.3380817413330077, "memory(GiB)": 121.15, "step": 8485, "token_acc": 0.5011420740063957, "train_speed(iter/s)": 1.193666 }, { "epoch": 2.9095270733379026, "grad_norm": 1.4020105600357056, "learning_rate": 3.7275831434220156e-05, "loss": 2.344683837890625, "memory(GiB)": 121.15, "step": 8490, "token_acc": 0.5062447960033306, "train_speed(iter/s)": 1.193665 }, { "epoch": 2.911240575736806, "grad_norm": 1.5645314455032349, "learning_rate": 3.72237798319137e-05, "loss": 2.2890382766723634, "memory(GiB)": 121.15, "step": 8495, "token_acc": 0.5071877807726864, "train_speed(iter/s)": 1.193713 }, { "epoch": 2.9129540781357095, "grad_norm": 1.5639678239822388, "learning_rate": 3.717174303880441e-05, "loss": 2.394083595275879, "memory(GiB)": 121.15, "step": 8500, "token_acc": 0.4852473899228325, "train_speed(iter/s)": 1.193737 }, { "epoch": 2.9129540781357095, "eval_loss": 2.2310516834259033, "eval_runtime": 3.7302, "eval_samples_per_second": 26.808, "eval_steps_per_second": 26.808, "eval_token_acc": 0.48655569782330343, "step": 8500 }, { "epoch": 2.914667580534613, "grad_norm": 1.6256135702133179, "learning_rate": 3.711972111520929e-05, "loss": 2.4089176177978517, "memory(GiB)": 121.15, "step": 8505, "token_acc": 0.48623853211009177, "train_speed(iter/s)": 1.192659 }, { "epoch": 2.916381082933516, "grad_norm": 1.57343327999115, "learning_rate": 3.70677141214281e-05, "loss": 2.296683692932129, "memory(GiB)": 121.15, "step": 8510, "token_acc": 0.5040408336877924, "train_speed(iter/s)": 1.19269 }, { "epoch": 2.9180945853324194, "grad_norm": 1.6131304502487183, "learning_rate": 3.701572211774326e-05, "loss": 2.393353271484375, "memory(GiB)": 121.15, "step": 8515, "token_acc": 0.4817846909537454, "train_speed(iter/s)": 1.192608 }, { "epoch": 2.919808087731323, "grad_norm": 1.429818868637085, "learning_rate": 3.6963745164419886e-05, "loss": 2.3343982696533203, "memory(GiB)": 121.15, "step": 8520, "token_acc": 0.5120535714285714, "train_speed(iter/s)": 1.192654 }, { "epoch": 2.9215215901302263, "grad_norm": 1.4620585441589355, "learning_rate": 3.6911783321705554e-05, "loss": 2.3193943023681642, "memory(GiB)": 121.15, "step": 8525, "token_acc": 0.5114250103863731, "train_speed(iter/s)": 1.192586 }, { "epoch": 2.9232350925291293, "grad_norm": 1.5154858827590942, "learning_rate": 3.68598366498304e-05, "loss": 2.33276252746582, "memory(GiB)": 121.15, "step": 8530, "token_acc": 0.4969723183391003, "train_speed(iter/s)": 1.192615 }, { "epoch": 2.9249485949280327, "grad_norm": 1.6015123128890991, "learning_rate": 3.680790520900696e-05, "loss": 2.2942514419555664, "memory(GiB)": 121.15, "step": 8535, "token_acc": 0.5040214477211796, "train_speed(iter/s)": 1.192661 }, { "epoch": 2.926662097326936, "grad_norm": 1.578438639640808, "learning_rate": 3.67559890594301e-05, "loss": 2.3375951766967775, "memory(GiB)": 121.15, "step": 8540, "token_acc": 0.5035913806863528, "train_speed(iter/s)": 1.192568 }, { "epoch": 2.9283755997258396, "grad_norm": 1.6107535362243652, "learning_rate": 3.6704088261276956e-05, "loss": 2.4581527709960938, "memory(GiB)": 121.15, "step": 8545, "token_acc": 0.48565310492505354, "train_speed(iter/s)": 1.192593 }, { "epoch": 2.930089102124743, "grad_norm": 1.5684584379196167, "learning_rate": 3.665220287470692e-05, "loss": 2.3295829772949217, "memory(GiB)": 121.15, "step": 8550, "token_acc": 0.5040983606557377, "train_speed(iter/s)": 1.19253 }, { "epoch": 2.9318026045236465, "grad_norm": 1.5336418151855469, "learning_rate": 3.6600332959861426e-05, "loss": 2.3713302612304688, "memory(GiB)": 121.15, "step": 8555, "token_acc": 0.4898867313915858, "train_speed(iter/s)": 1.192586 }, { "epoch": 2.93351610692255, "grad_norm": 1.69491708278656, "learning_rate": 3.6548478576864075e-05, "loss": 2.304068756103516, "memory(GiB)": 121.15, "step": 8560, "token_acc": 0.49730625777041026, "train_speed(iter/s)": 1.192583 }, { "epoch": 2.935229609321453, "grad_norm": 1.9096755981445312, "learning_rate": 3.6496639785820396e-05, "loss": 2.2930318832397463, "memory(GiB)": 121.15, "step": 8565, "token_acc": 0.49840546697038723, "train_speed(iter/s)": 1.1926 }, { "epoch": 2.9369431117203564, "grad_norm": 1.7008496522903442, "learning_rate": 3.644481664681788e-05, "loss": 2.361065483093262, "memory(GiB)": 121.15, "step": 8570, "token_acc": 0.49044309296264116, "train_speed(iter/s)": 1.192579 }, { "epoch": 2.93865661411926, "grad_norm": 1.5452463626861572, "learning_rate": 3.6393009219925864e-05, "loss": 2.3862640380859377, "memory(GiB)": 121.15, "step": 8575, "token_acc": 0.5012355848434926, "train_speed(iter/s)": 1.192578 }, { "epoch": 2.9403701165181633, "grad_norm": 1.402627944946289, "learning_rate": 3.634121756519547e-05, "loss": 2.342632865905762, "memory(GiB)": 121.15, "step": 8580, "token_acc": 0.5041017227235439, "train_speed(iter/s)": 1.192652 }, { "epoch": 2.9420836189170663, "grad_norm": 1.4606794118881226, "learning_rate": 3.6289441742659566e-05, "loss": 2.4799053192138674, "memory(GiB)": 121.15, "step": 8585, "token_acc": 0.4927894519983519, "train_speed(iter/s)": 1.192692 }, { "epoch": 2.9437971213159697, "grad_norm": 1.5299562215805054, "learning_rate": 3.6237681812332604e-05, "loss": 2.387319564819336, "memory(GiB)": 121.15, "step": 8590, "token_acc": 0.5029411764705882, "train_speed(iter/s)": 1.192733 }, { "epoch": 2.945510623714873, "grad_norm": 1.6590948104858398, "learning_rate": 3.618593783421067e-05, "loss": 2.3967815399169923, "memory(GiB)": 121.15, "step": 8595, "token_acc": 0.4838709677419355, "train_speed(iter/s)": 1.192747 }, { "epoch": 2.9472241261137766, "grad_norm": 1.808356523513794, "learning_rate": 3.6134209868271345e-05, "loss": 2.3479183197021483, "memory(GiB)": 121.15, "step": 8600, "token_acc": 0.5035051546391752, "train_speed(iter/s)": 1.192798 }, { "epoch": 2.94893762851268, "grad_norm": 1.4587924480438232, "learning_rate": 3.608249797447366e-05, "loss": 2.3058826446533205, "memory(GiB)": 121.15, "step": 8605, "token_acc": 0.5099009900990099, "train_speed(iter/s)": 1.192772 }, { "epoch": 2.9506511309115835, "grad_norm": 1.6210832595825195, "learning_rate": 3.6030802212758e-05, "loss": 2.352579116821289, "memory(GiB)": 121.15, "step": 8610, "token_acc": 0.5144557823129252, "train_speed(iter/s)": 1.192764 }, { "epoch": 2.9523646333104865, "grad_norm": 1.6602219343185425, "learning_rate": 3.5979122643046064e-05, "loss": 2.3269489288330076, "memory(GiB)": 121.15, "step": 8615, "token_acc": 0.4941964285714286, "train_speed(iter/s)": 1.192812 }, { "epoch": 2.95407813570939, "grad_norm": 1.8677892684936523, "learning_rate": 3.592745932524076e-05, "loss": 2.357849884033203, "memory(GiB)": 121.15, "step": 8620, "token_acc": 0.5091863517060368, "train_speed(iter/s)": 1.192851 }, { "epoch": 2.9557916381082934, "grad_norm": 1.5787839889526367, "learning_rate": 3.587581231922617e-05, "loss": 2.3000736236572266, "memory(GiB)": 121.15, "step": 8625, "token_acc": 0.5078556263269639, "train_speed(iter/s)": 1.192902 }, { "epoch": 2.957505140507197, "grad_norm": 1.9026787281036377, "learning_rate": 3.582418168486748e-05, "loss": 2.2839996337890627, "memory(GiB)": 121.15, "step": 8630, "token_acc": 0.5099703012303776, "train_speed(iter/s)": 1.192861 }, { "epoch": 2.9592186429061, "grad_norm": 1.5190229415893555, "learning_rate": 3.577256748201091e-05, "loss": 2.3706811904907226, "memory(GiB)": 121.15, "step": 8635, "token_acc": 0.4903727980335928, "train_speed(iter/s)": 1.192928 }, { "epoch": 2.9609321453050033, "grad_norm": 1.5569008588790894, "learning_rate": 3.572096977048358e-05, "loss": 2.3434267044067383, "memory(GiB)": 121.15, "step": 8640, "token_acc": 0.5014107214832729, "train_speed(iter/s)": 1.192996 }, { "epoch": 2.9626456477039067, "grad_norm": 1.6903308629989624, "learning_rate": 3.5669388610093554e-05, "loss": 2.2914501190185548, "memory(GiB)": 121.15, "step": 8645, "token_acc": 0.5106575963718821, "train_speed(iter/s)": 1.193039 }, { "epoch": 2.96435915010281, "grad_norm": 1.6186858415603638, "learning_rate": 3.5617824060629706e-05, "loss": 2.2670654296875, "memory(GiB)": 121.15, "step": 8650, "token_acc": 0.517193947730399, "train_speed(iter/s)": 1.193029 }, { "epoch": 2.9660726525017136, "grad_norm": 1.6906172037124634, "learning_rate": 3.55662761818616e-05, "loss": 2.419187545776367, "memory(GiB)": 121.15, "step": 8655, "token_acc": 0.5013723696248856, "train_speed(iter/s)": 1.193033 }, { "epoch": 2.967786154900617, "grad_norm": 1.5824154615402222, "learning_rate": 3.551474503353951e-05, "loss": 2.4431011199951174, "memory(GiB)": 121.15, "step": 8660, "token_acc": 0.4873524451939292, "train_speed(iter/s)": 1.193071 }, { "epoch": 2.9694996572995205, "grad_norm": 1.508203148841858, "learning_rate": 3.5463230675394356e-05, "loss": 2.259535217285156, "memory(GiB)": 121.15, "step": 8665, "token_acc": 0.5145969498910675, "train_speed(iter/s)": 1.193109 }, { "epoch": 2.9712131596984235, "grad_norm": 1.7769821882247925, "learning_rate": 3.5411733167137526e-05, "loss": 2.333037567138672, "memory(GiB)": 121.15, "step": 8670, "token_acc": 0.49936251593710157, "train_speed(iter/s)": 1.193141 }, { "epoch": 2.972926662097327, "grad_norm": 1.5895421504974365, "learning_rate": 3.536025256846093e-05, "loss": 2.489533805847168, "memory(GiB)": 121.15, "step": 8675, "token_acc": 0.486661277283751, "train_speed(iter/s)": 1.193204 }, { "epoch": 2.9746401644962304, "grad_norm": 1.5762379169464111, "learning_rate": 3.530878893903688e-05, "loss": 2.3454471588134767, "memory(GiB)": 121.15, "step": 8680, "token_acc": 0.489787049109083, "train_speed(iter/s)": 1.193223 }, { "epoch": 2.976353666895134, "grad_norm": 1.5479170083999634, "learning_rate": 3.525734233851795e-05, "loss": 2.3672027587890625, "memory(GiB)": 121.15, "step": 8685, "token_acc": 0.4946058091286307, "train_speed(iter/s)": 1.193199 }, { "epoch": 2.978067169294037, "grad_norm": 1.5874828100204468, "learning_rate": 3.5205912826537055e-05, "loss": 2.3584009170532227, "memory(GiB)": 121.15, "step": 8690, "token_acc": 0.5066119471044231, "train_speed(iter/s)": 1.193217 }, { "epoch": 2.9797806716929403, "grad_norm": 1.5923521518707275, "learning_rate": 3.515450046270726e-05, "loss": 2.3333444595336914, "memory(GiB)": 121.15, "step": 8695, "token_acc": 0.5019043588658485, "train_speed(iter/s)": 1.193127 }, { "epoch": 2.9814941740918437, "grad_norm": 1.6195495128631592, "learning_rate": 3.5103105306621786e-05, "loss": 2.2876913070678713, "memory(GiB)": 121.15, "step": 8700, "token_acc": 0.5053856096510125, "train_speed(iter/s)": 1.193185 }, { "epoch": 2.983207676490747, "grad_norm": 1.7005062103271484, "learning_rate": 3.505172741785385e-05, "loss": 2.3382389068603517, "memory(GiB)": 121.15, "step": 8705, "token_acc": 0.5062402496099844, "train_speed(iter/s)": 1.193144 }, { "epoch": 2.9849211788896506, "grad_norm": 1.6922836303710938, "learning_rate": 3.500036685595673e-05, "loss": 2.3141128540039064, "memory(GiB)": 121.15, "step": 8710, "token_acc": 0.5008503401360545, "train_speed(iter/s)": 1.193164 }, { "epoch": 2.986634681288554, "grad_norm": 1.7002676725387573, "learning_rate": 3.494902368046357e-05, "loss": 2.415830612182617, "memory(GiB)": 121.15, "step": 8715, "token_acc": 0.4861995753715499, "train_speed(iter/s)": 1.193249 }, { "epoch": 2.988348183687457, "grad_norm": 1.653050422668457, "learning_rate": 3.489769795088733e-05, "loss": 2.3206449508666993, "memory(GiB)": 121.15, "step": 8720, "token_acc": 0.5027602523659306, "train_speed(iter/s)": 1.193226 }, { "epoch": 2.9900616860863605, "grad_norm": 1.6064703464508057, "learning_rate": 3.484638972672084e-05, "loss": 2.3289939880371096, "memory(GiB)": 121.15, "step": 8725, "token_acc": 0.4952090592334495, "train_speed(iter/s)": 1.193269 }, { "epoch": 2.991775188485264, "grad_norm": 1.536339282989502, "learning_rate": 3.479509906743657e-05, "loss": 2.3234346389770506, "memory(GiB)": 121.15, "step": 8730, "token_acc": 0.4954695222405272, "train_speed(iter/s)": 1.193273 }, { "epoch": 2.9934886908841674, "grad_norm": 2.0391249656677246, "learning_rate": 3.4743826032486636e-05, "loss": 2.2786609649658205, "memory(GiB)": 121.15, "step": 8735, "token_acc": 0.49979991996798717, "train_speed(iter/s)": 1.193259 }, { "epoch": 2.9952021932830704, "grad_norm": 1.5964003801345825, "learning_rate": 3.469257068130276e-05, "loss": 2.375421333312988, "memory(GiB)": 121.15, "step": 8740, "token_acc": 0.5079638398622471, "train_speed(iter/s)": 1.193303 }, { "epoch": 2.996915695681974, "grad_norm": 1.4753762483596802, "learning_rate": 3.4641333073296146e-05, "loss": 2.3293346405029296, "memory(GiB)": 121.15, "step": 8745, "token_acc": 0.5032706459525756, "train_speed(iter/s)": 1.193333 }, { "epoch": 2.9986291980808772, "grad_norm": 1.633676528930664, "learning_rate": 3.4590113267857414e-05, "loss": 2.4083295822143556, "memory(GiB)": 121.15, "step": 8750, "token_acc": 0.4813417190775681, "train_speed(iter/s)": 1.193233 }, { "epoch": 3.0003427004797807, "grad_norm": 1.4545191526412964, "learning_rate": 3.453891132435656e-05, "loss": 2.300695037841797, "memory(GiB)": 121.15, "step": 8755, "token_acc": 0.5002331002331002, "train_speed(iter/s)": 1.193353 }, { "epoch": 3.002056202878684, "grad_norm": 1.5645560026168823, "learning_rate": 3.4487727302142906e-05, "loss": 2.23433895111084, "memory(GiB)": 121.15, "step": 8760, "token_acc": 0.5139689578713968, "train_speed(iter/s)": 1.193344 }, { "epoch": 3.0037697052775876, "grad_norm": 1.507390022277832, "learning_rate": 3.4436561260544944e-05, "loss": 2.233460807800293, "memory(GiB)": 121.15, "step": 8765, "token_acc": 0.518847945785684, "train_speed(iter/s)": 1.19336 }, { "epoch": 3.0054832076764906, "grad_norm": 1.4608941078186035, "learning_rate": 3.438541325887037e-05, "loss": 2.3050167083740236, "memory(GiB)": 121.15, "step": 8770, "token_acc": 0.5132743362831859, "train_speed(iter/s)": 1.193383 }, { "epoch": 3.007196710075394, "grad_norm": 1.5844905376434326, "learning_rate": 3.4334283356405966e-05, "loss": 2.323673629760742, "memory(GiB)": 122.07, "step": 8775, "token_acc": 0.512013256006628, "train_speed(iter/s)": 1.193318 }, { "epoch": 3.0089102124742975, "grad_norm": 1.625449776649475, "learning_rate": 3.428317161241753e-05, "loss": 2.2404956817626953, "memory(GiB)": 122.07, "step": 8780, "token_acc": 0.5162653147444022, "train_speed(iter/s)": 1.193353 }, { "epoch": 3.010623714873201, "grad_norm": 1.8972402811050415, "learning_rate": 3.423207808614976e-05, "loss": 2.190186882019043, "memory(GiB)": 122.07, "step": 8785, "token_acc": 0.5158077089649199, "train_speed(iter/s)": 1.19333 }, { "epoch": 3.0123372172721044, "grad_norm": 1.7302452325820923, "learning_rate": 3.4181002836826325e-05, "loss": 2.279846954345703, "memory(GiB)": 122.07, "step": 8790, "token_acc": 0.5092221331194867, "train_speed(iter/s)": 1.19337 }, { "epoch": 3.0140507196710074, "grad_norm": 1.7043501138687134, "learning_rate": 3.412994592364966e-05, "loss": 2.250117874145508, "memory(GiB)": 122.07, "step": 8795, "token_acc": 0.501917341286749, "train_speed(iter/s)": 1.193434 }, { "epoch": 3.015764222069911, "grad_norm": 1.6830390691757202, "learning_rate": 3.407890740580095e-05, "loss": 2.291457748413086, "memory(GiB)": 122.07, "step": 8800, "token_acc": 0.4964633068081344, "train_speed(iter/s)": 1.19342 }, { "epoch": 3.0174777244688142, "grad_norm": 1.6219419240951538, "learning_rate": 3.402788734244007e-05, "loss": 2.21071834564209, "memory(GiB)": 122.07, "step": 8805, "token_acc": 0.5171353251318102, "train_speed(iter/s)": 1.193488 }, { "epoch": 3.0191912268677177, "grad_norm": 1.709816575050354, "learning_rate": 3.397688579270549e-05, "loss": 2.0702747344970702, "memory(GiB)": 122.07, "step": 8810, "token_acc": 0.549054905490549, "train_speed(iter/s)": 1.193541 }, { "epoch": 3.020904729266621, "grad_norm": 1.746468424797058, "learning_rate": 3.39259028157142e-05, "loss": 2.3346933364868163, "memory(GiB)": 122.07, "step": 8815, "token_acc": 0.5141983398864133, "train_speed(iter/s)": 1.193365 }, { "epoch": 3.022618231665524, "grad_norm": 1.6401045322418213, "learning_rate": 3.3874938470561714e-05, "loss": 2.330914306640625, "memory(GiB)": 122.07, "step": 8820, "token_acc": 0.5150162337662337, "train_speed(iter/s)": 1.193386 }, { "epoch": 3.0243317340644276, "grad_norm": 1.5162307024002075, "learning_rate": 3.382399281632192e-05, "loss": 2.1995338439941405, "memory(GiB)": 122.07, "step": 8825, "token_acc": 0.529192546583851, "train_speed(iter/s)": 1.193319 }, { "epoch": 3.026045236463331, "grad_norm": 1.870711088180542, "learning_rate": 3.3773065912047044e-05, "loss": 2.2284805297851564, "memory(GiB)": 122.07, "step": 8830, "token_acc": 0.5242718446601942, "train_speed(iter/s)": 1.193365 }, { "epoch": 3.0277587388622345, "grad_norm": 1.7264056205749512, "learning_rate": 3.3722157816767575e-05, "loss": 2.293305015563965, "memory(GiB)": 122.07, "step": 8835, "token_acc": 0.5145941623350659, "train_speed(iter/s)": 1.193378 }, { "epoch": 3.029472241261138, "grad_norm": 1.6860394477844238, "learning_rate": 3.367126858949221e-05, "loss": 2.365376853942871, "memory(GiB)": 122.07, "step": 8840, "token_acc": 0.48997772828507796, "train_speed(iter/s)": 1.19341 }, { "epoch": 3.0311857436600413, "grad_norm": 1.8391963243484497, "learning_rate": 3.362039828920778e-05, "loss": 2.2944393157958984, "memory(GiB)": 122.07, "step": 8845, "token_acc": 0.5044052863436124, "train_speed(iter/s)": 1.193367 }, { "epoch": 3.0328992460589443, "grad_norm": 1.597660779953003, "learning_rate": 3.3569546974879117e-05, "loss": 2.263043212890625, "memory(GiB)": 122.07, "step": 8850, "token_acc": 0.5014553014553015, "train_speed(iter/s)": 1.193298 }, { "epoch": 3.034612748457848, "grad_norm": 1.6716846227645874, "learning_rate": 3.351871470544915e-05, "loss": 2.220351791381836, "memory(GiB)": 122.07, "step": 8855, "token_acc": 0.509590235396687, "train_speed(iter/s)": 1.193327 }, { "epoch": 3.0363262508567512, "grad_norm": 1.650913953781128, "learning_rate": 3.346790153983865e-05, "loss": 2.191604804992676, "memory(GiB)": 122.07, "step": 8860, "token_acc": 0.5283261802575108, "train_speed(iter/s)": 1.193331 }, { "epoch": 3.0380397532556547, "grad_norm": 1.838843822479248, "learning_rate": 3.341710753694627e-05, "loss": 2.1831289291381837, "memory(GiB)": 122.07, "step": 8865, "token_acc": 0.5281404772624944, "train_speed(iter/s)": 1.193366 }, { "epoch": 3.039753255654558, "grad_norm": 1.8924020528793335, "learning_rate": 3.336633275564848e-05, "loss": 2.275887298583984, "memory(GiB)": 122.07, "step": 8870, "token_acc": 0.516643225503985, "train_speed(iter/s)": 1.193436 }, { "epoch": 3.041466758053461, "grad_norm": 1.6475555896759033, "learning_rate": 3.331557725479944e-05, "loss": 2.2361019134521483, "memory(GiB)": 122.07, "step": 8875, "token_acc": 0.520764119601329, "train_speed(iter/s)": 1.193386 }, { "epoch": 3.0431802604523646, "grad_norm": 1.9566600322723389, "learning_rate": 3.326484109323092e-05, "loss": 2.185692024230957, "memory(GiB)": 122.07, "step": 8880, "token_acc": 0.5162227602905569, "train_speed(iter/s)": 1.193432 }, { "epoch": 3.044893762851268, "grad_norm": 1.7617939710617065, "learning_rate": 3.321412432975235e-05, "loss": 2.2551647186279298, "memory(GiB)": 122.07, "step": 8885, "token_acc": 0.5098712446351932, "train_speed(iter/s)": 1.193437 }, { "epoch": 3.0466072652501714, "grad_norm": 1.6758893728256226, "learning_rate": 3.316342702315066e-05, "loss": 2.30643367767334, "memory(GiB)": 122.07, "step": 8890, "token_acc": 0.5059744540585085, "train_speed(iter/s)": 1.193338 }, { "epoch": 3.048320767649075, "grad_norm": 1.9909729957580566, "learning_rate": 3.3112749232190175e-05, "loss": 2.218743896484375, "memory(GiB)": 122.07, "step": 8895, "token_acc": 0.5174563591022444, "train_speed(iter/s)": 1.193343 }, { "epoch": 3.050034270047978, "grad_norm": 1.645501971244812, "learning_rate": 3.306209101561267e-05, "loss": 2.19703369140625, "memory(GiB)": 122.07, "step": 8900, "token_acc": 0.5162116040955631, "train_speed(iter/s)": 1.193338 }, { "epoch": 3.0517477724468813, "grad_norm": 1.6399303674697876, "learning_rate": 3.301145243213717e-05, "loss": 2.2985908508300783, "memory(GiB)": 122.07, "step": 8905, "token_acc": 0.517931609674729, "train_speed(iter/s)": 1.193347 }, { "epoch": 3.053461274845785, "grad_norm": 1.624413251876831, "learning_rate": 3.296083354045999e-05, "loss": 2.3368175506591795, "memory(GiB)": 122.07, "step": 8910, "token_acc": 0.4954918032786885, "train_speed(iter/s)": 1.193366 }, { "epoch": 3.0551747772446882, "grad_norm": 1.639344334602356, "learning_rate": 3.291023439925457e-05, "loss": 2.4137229919433594, "memory(GiB)": 122.07, "step": 8915, "token_acc": 0.4699140401146132, "train_speed(iter/s)": 1.193294 }, { "epoch": 3.0568882796435917, "grad_norm": 1.7084764242172241, "learning_rate": 3.2859655067171505e-05, "loss": 2.3155616760253905, "memory(GiB)": 122.07, "step": 8920, "token_acc": 0.4980299448384555, "train_speed(iter/s)": 1.193312 }, { "epoch": 3.0586017820424947, "grad_norm": 1.7823113203048706, "learning_rate": 3.28090956028384e-05, "loss": 2.293366050720215, "memory(GiB)": 122.07, "step": 8925, "token_acc": 0.52, "train_speed(iter/s)": 1.193236 }, { "epoch": 3.060315284441398, "grad_norm": 1.7642625570297241, "learning_rate": 3.2758556064859855e-05, "loss": 2.33115234375, "memory(GiB)": 122.07, "step": 8930, "token_acc": 0.5081699346405228, "train_speed(iter/s)": 1.193228 }, { "epoch": 3.0620287868403016, "grad_norm": 1.7789069414138794, "learning_rate": 3.270803651181733e-05, "loss": 2.3448665618896483, "memory(GiB)": 122.07, "step": 8935, "token_acc": 0.5127630989699955, "train_speed(iter/s)": 1.193239 }, { "epoch": 3.063742289239205, "grad_norm": 1.7393025159835815, "learning_rate": 3.265753700226919e-05, "loss": 2.1954620361328123, "memory(GiB)": 122.07, "step": 8940, "token_acc": 0.5129757785467128, "train_speed(iter/s)": 1.19318 }, { "epoch": 3.0654557916381084, "grad_norm": 1.5473545789718628, "learning_rate": 3.2607057594750476e-05, "loss": 2.2024887084960936, "memory(GiB)": 122.07, "step": 8945, "token_acc": 0.5240118997025074, "train_speed(iter/s)": 1.193219 }, { "epoch": 3.0671692940370114, "grad_norm": 1.9248249530792236, "learning_rate": 3.2556598347773006e-05, "loss": 2.2586841583251953, "memory(GiB)": 122.07, "step": 8950, "token_acc": 0.513070447496677, "train_speed(iter/s)": 1.193181 }, { "epoch": 3.068882796435915, "grad_norm": 1.6810667514801025, "learning_rate": 3.250615931982519e-05, "loss": 2.21276798248291, "memory(GiB)": 122.07, "step": 8955, "token_acc": 0.5256637168141592, "train_speed(iter/s)": 1.193239 }, { "epoch": 3.0705962988348183, "grad_norm": 1.7392075061798096, "learning_rate": 3.2455740569372014e-05, "loss": 2.2368030548095703, "memory(GiB)": 122.07, "step": 8960, "token_acc": 0.5111208024422155, "train_speed(iter/s)": 1.193289 }, { "epoch": 3.0723098012337218, "grad_norm": 1.8429954051971436, "learning_rate": 3.2405342154854964e-05, "loss": 2.1413074493408204, "memory(GiB)": 122.07, "step": 8965, "token_acc": 0.5316001792917974, "train_speed(iter/s)": 1.19334 }, { "epoch": 3.074023303632625, "grad_norm": 1.7588374614715576, "learning_rate": 3.2354964134691946e-05, "loss": 2.3064386367797853, "memory(GiB)": 122.07, "step": 8970, "token_acc": 0.49531914893617024, "train_speed(iter/s)": 1.19339 }, { "epoch": 3.0757368060315287, "grad_norm": 1.693827509880066, "learning_rate": 3.230460656727724e-05, "loss": 2.167089080810547, "memory(GiB)": 122.07, "step": 8975, "token_acc": 0.5232715770447357, "train_speed(iter/s)": 1.193394 }, { "epoch": 3.0774503084304317, "grad_norm": 1.9387112855911255, "learning_rate": 3.225426951098139e-05, "loss": 2.2793413162231446, "memory(GiB)": 122.07, "step": 8980, "token_acc": 0.502991452991453, "train_speed(iter/s)": 1.193442 }, { "epoch": 3.079163810829335, "grad_norm": 1.8048999309539795, "learning_rate": 3.2203953024151176e-05, "loss": 2.368214988708496, "memory(GiB)": 122.07, "step": 8985, "token_acc": 0.5115400755350399, "train_speed(iter/s)": 1.193505 }, { "epoch": 3.0808773132282385, "grad_norm": 1.7254928350448608, "learning_rate": 3.215365716510955e-05, "loss": 2.1496627807617186, "memory(GiB)": 122.07, "step": 8990, "token_acc": 0.5297157622739018, "train_speed(iter/s)": 1.193471 }, { "epoch": 3.082590815627142, "grad_norm": 1.8961069583892822, "learning_rate": 3.210338199215556e-05, "loss": 2.2892387390136717, "memory(GiB)": 122.07, "step": 8995, "token_acc": 0.49507292077256604, "train_speed(iter/s)": 1.193479 }, { "epoch": 3.0843043180260454, "grad_norm": 1.4914902448654175, "learning_rate": 3.2053127563564244e-05, "loss": 2.2701747894287108, "memory(GiB)": 122.07, "step": 9000, "token_acc": 0.5072756669361358, "train_speed(iter/s)": 1.193517 }, { "epoch": 3.0843043180260454, "eval_loss": 2.228647232055664, "eval_runtime": 3.7878, "eval_samples_per_second": 26.4, "eval_steps_per_second": 26.4, "eval_token_acc": 0.49291784702549574, "step": 9000 }, { "epoch": 3.0860178204249484, "grad_norm": 1.7496970891952515, "learning_rate": 3.200289393758664e-05, "loss": 2.216603660583496, "memory(GiB)": 122.07, "step": 9005, "token_acc": 0.5168094924192486, "train_speed(iter/s)": 1.192574 }, { "epoch": 3.087731322823852, "grad_norm": 1.7419557571411133, "learning_rate": 3.195268117244962e-05, "loss": 2.258379364013672, "memory(GiB)": 122.07, "step": 9010, "token_acc": 0.5086655112651647, "train_speed(iter/s)": 1.192626 }, { "epoch": 3.0894448252227553, "grad_norm": 1.7188273668289185, "learning_rate": 3.19024893263559e-05, "loss": 2.1908195495605467, "memory(GiB)": 122.07, "step": 9015, "token_acc": 0.5213822894168466, "train_speed(iter/s)": 1.192659 }, { "epoch": 3.0911583276216588, "grad_norm": 2.312983751296997, "learning_rate": 3.185231845748397e-05, "loss": 2.292974090576172, "memory(GiB)": 122.07, "step": 9020, "token_acc": 0.5043914680050188, "train_speed(iter/s)": 1.192638 }, { "epoch": 3.092871830020562, "grad_norm": 1.982960820198059, "learning_rate": 3.180216862398796e-05, "loss": 2.3198162078857423, "memory(GiB)": 122.07, "step": 9025, "token_acc": 0.5086241476133173, "train_speed(iter/s)": 1.192702 }, { "epoch": 3.094585332419465, "grad_norm": 1.5948541164398193, "learning_rate": 3.1752039883997665e-05, "loss": 2.217629623413086, "memory(GiB)": 122.07, "step": 9030, "token_acc": 0.5120101137800253, "train_speed(iter/s)": 1.192765 }, { "epoch": 3.0962988348183687, "grad_norm": 2.0782973766326904, "learning_rate": 3.17019322956184e-05, "loss": 2.1459657669067385, "memory(GiB)": 122.07, "step": 9035, "token_acc": 0.5388936905790839, "train_speed(iter/s)": 1.192786 }, { "epoch": 3.098012337217272, "grad_norm": 1.693303108215332, "learning_rate": 3.165184591693098e-05, "loss": 2.2658594131469725, "memory(GiB)": 122.07, "step": 9040, "token_acc": 0.5163179916317991, "train_speed(iter/s)": 1.192852 }, { "epoch": 3.0997258396161755, "grad_norm": 1.6988754272460938, "learning_rate": 3.160178080599161e-05, "loss": 2.258569526672363, "memory(GiB)": 122.07, "step": 9045, "token_acc": 0.5203045685279187, "train_speed(iter/s)": 1.192857 }, { "epoch": 3.101439342015079, "grad_norm": 1.8109917640686035, "learning_rate": 3.155173702083185e-05, "loss": 2.250725746154785, "memory(GiB)": 122.07, "step": 9050, "token_acc": 0.5198294243070363, "train_speed(iter/s)": 1.192901 }, { "epoch": 3.103152844413982, "grad_norm": 1.7691233158111572, "learning_rate": 3.1501714619458574e-05, "loss": 2.243021774291992, "memory(GiB)": 122.07, "step": 9055, "token_acc": 0.5230305639259578, "train_speed(iter/s)": 1.192919 }, { "epoch": 3.1048663468128854, "grad_norm": 1.6419785022735596, "learning_rate": 3.145171365985384e-05, "loss": 2.365329933166504, "memory(GiB)": 122.07, "step": 9060, "token_acc": 0.5045417010734929, "train_speed(iter/s)": 1.192902 }, { "epoch": 3.106579849211789, "grad_norm": 1.8286446332931519, "learning_rate": 3.140173419997485e-05, "loss": 2.351803207397461, "memory(GiB)": 122.07, "step": 9065, "token_acc": 0.4993635977938057, "train_speed(iter/s)": 1.192887 }, { "epoch": 3.1082933516106923, "grad_norm": 1.8733779191970825, "learning_rate": 3.135177629775391e-05, "loss": 2.281037139892578, "memory(GiB)": 122.07, "step": 9070, "token_acc": 0.4968632371392723, "train_speed(iter/s)": 1.192931 }, { "epoch": 3.1100068540095958, "grad_norm": 1.688448429107666, "learning_rate": 3.13018400110983e-05, "loss": 2.2656137466430666, "memory(GiB)": 122.07, "step": 9075, "token_acc": 0.5171578522406136, "train_speed(iter/s)": 1.192963 }, { "epoch": 3.111720356408499, "grad_norm": 1.7524629831314087, "learning_rate": 3.125192539789026e-05, "loss": 2.2550739288330077, "memory(GiB)": 122.07, "step": 9080, "token_acc": 0.5235927152317881, "train_speed(iter/s)": 1.192955 }, { "epoch": 3.113433858807402, "grad_norm": 1.815287470817566, "learning_rate": 3.120203251598692e-05, "loss": 2.2247142791748047, "memory(GiB)": 122.07, "step": 9085, "token_acc": 0.5219123505976095, "train_speed(iter/s)": 1.192953 }, { "epoch": 3.1151473612063056, "grad_norm": 1.8016670942306519, "learning_rate": 3.115216142322024e-05, "loss": 2.227750778198242, "memory(GiB)": 122.07, "step": 9090, "token_acc": 0.5249376558603491, "train_speed(iter/s)": 1.192995 }, { "epoch": 3.116860863605209, "grad_norm": 1.8841761350631714, "learning_rate": 3.110231217739686e-05, "loss": 2.227629852294922, "memory(GiB)": 122.07, "step": 9095, "token_acc": 0.5191947565543071, "train_speed(iter/s)": 1.193031 }, { "epoch": 3.1185743660041125, "grad_norm": 1.7159292697906494, "learning_rate": 3.105248483629817e-05, "loss": 2.3194141387939453, "memory(GiB)": 122.07, "step": 9100, "token_acc": 0.5021739130434782, "train_speed(iter/s)": 1.193029 }, { "epoch": 3.120287868403016, "grad_norm": 1.632285714149475, "learning_rate": 3.1002679457680114e-05, "loss": 2.216500663757324, "memory(GiB)": 122.07, "step": 9105, "token_acc": 0.5254691689008043, "train_speed(iter/s)": 1.192917 }, { "epoch": 3.122001370801919, "grad_norm": 1.7120065689086914, "learning_rate": 3.095289609927321e-05, "loss": 2.300215148925781, "memory(GiB)": 122.07, "step": 9110, "token_acc": 0.5306479859894921, "train_speed(iter/s)": 1.192963 }, { "epoch": 3.1237148732008224, "grad_norm": 1.717826247215271, "learning_rate": 3.090313481878242e-05, "loss": 2.2408580780029297, "memory(GiB)": 122.07, "step": 9115, "token_acc": 0.5201543739279588, "train_speed(iter/s)": 1.193017 }, { "epoch": 3.125428375599726, "grad_norm": 1.7707499265670776, "learning_rate": 3.0853395673887166e-05, "loss": 2.277888870239258, "memory(GiB)": 122.07, "step": 9120, "token_acc": 0.5149804602692141, "train_speed(iter/s)": 1.193073 }, { "epoch": 3.1271418779986293, "grad_norm": 1.6590925455093384, "learning_rate": 3.0803678722241146e-05, "loss": 2.208821105957031, "memory(GiB)": 122.07, "step": 9125, "token_acc": 0.5189346650020807, "train_speed(iter/s)": 1.19314 }, { "epoch": 3.1288553803975327, "grad_norm": 1.7138558626174927, "learning_rate": 3.0753984021472394e-05, "loss": 2.2665132522583007, "memory(GiB)": 122.07, "step": 9130, "token_acc": 0.5197889182058048, "train_speed(iter/s)": 1.193189 }, { "epoch": 3.1305688827964357, "grad_norm": 1.8158000707626343, "learning_rate": 3.0704311629183134e-05, "loss": 2.2681648254394533, "memory(GiB)": 122.07, "step": 9135, "token_acc": 0.5163114397564158, "train_speed(iter/s)": 1.193206 }, { "epoch": 3.132282385195339, "grad_norm": 1.8797789812088013, "learning_rate": 3.0654661602949734e-05, "loss": 2.259324645996094, "memory(GiB)": 122.07, "step": 9140, "token_acc": 0.5077503142019271, "train_speed(iter/s)": 1.193229 }, { "epoch": 3.1339958875942426, "grad_norm": 1.6962672472000122, "learning_rate": 3.0605034000322586e-05, "loss": 2.193975830078125, "memory(GiB)": 122.07, "step": 9145, "token_acc": 0.5068438003220612, "train_speed(iter/s)": 1.193122 }, { "epoch": 3.135709389993146, "grad_norm": 1.8874629735946655, "learning_rate": 3.0555428878826165e-05, "loss": 2.227701187133789, "memory(GiB)": 122.07, "step": 9150, "token_acc": 0.5138528138528139, "train_speed(iter/s)": 1.193159 }, { "epoch": 3.1374228923920495, "grad_norm": 2.1094553470611572, "learning_rate": 3.0505846295958856e-05, "loss": 2.2137243270874025, "memory(GiB)": 122.07, "step": 9155, "token_acc": 0.516246953696182, "train_speed(iter/s)": 1.193134 }, { "epoch": 3.1391363947909525, "grad_norm": 1.783555030822754, "learning_rate": 3.0456286309192904e-05, "loss": 2.3172412872314454, "memory(GiB)": 122.07, "step": 9160, "token_acc": 0.5106382978723404, "train_speed(iter/s)": 1.193187 }, { "epoch": 3.140849897189856, "grad_norm": 1.880192518234253, "learning_rate": 3.0406748975974397e-05, "loss": 2.203290557861328, "memory(GiB)": 122.07, "step": 9165, "token_acc": 0.5260924904539669, "train_speed(iter/s)": 1.193196 }, { "epoch": 3.1425633995887594, "grad_norm": 1.7575607299804688, "learning_rate": 3.0357234353723142e-05, "loss": 2.188564682006836, "memory(GiB)": 122.07, "step": 9170, "token_acc": 0.519893899204244, "train_speed(iter/s)": 1.193044 }, { "epoch": 3.144276901987663, "grad_norm": 1.7600300312042236, "learning_rate": 3.0307742499832593e-05, "loss": 2.245474433898926, "memory(GiB)": 122.07, "step": 9175, "token_acc": 0.5156594618438465, "train_speed(iter/s)": 1.193094 }, { "epoch": 3.1459904043865663, "grad_norm": 1.8002816438674927, "learning_rate": 3.0258273471669874e-05, "loss": 2.26007194519043, "memory(GiB)": 122.07, "step": 9180, "token_acc": 0.5122791900043084, "train_speed(iter/s)": 1.193164 }, { "epoch": 3.1477039067854693, "grad_norm": 2.054474115371704, "learning_rate": 3.0208827326575628e-05, "loss": 2.3971385955810547, "memory(GiB)": 122.07, "step": 9185, "token_acc": 0.5036908380373426, "train_speed(iter/s)": 1.19322 }, { "epoch": 3.1494174091843727, "grad_norm": 1.7517485618591309, "learning_rate": 3.0159404121863933e-05, "loss": 2.2508331298828126, "memory(GiB)": 122.07, "step": 9190, "token_acc": 0.5072463768115942, "train_speed(iter/s)": 1.193165 }, { "epoch": 3.151130911583276, "grad_norm": 1.7058545351028442, "learning_rate": 3.011000391482235e-05, "loss": 2.4163639068603517, "memory(GiB)": 122.07, "step": 9195, "token_acc": 0.4920114707087259, "train_speed(iter/s)": 1.19318 }, { "epoch": 3.1528444139821796, "grad_norm": 1.9164550304412842, "learning_rate": 3.0060626762711708e-05, "loss": 2.237630081176758, "memory(GiB)": 122.07, "step": 9200, "token_acc": 0.5038133692238672, "train_speed(iter/s)": 1.193234 }, { "epoch": 3.154557916381083, "grad_norm": 1.8517645597457886, "learning_rate": 3.0011272722766192e-05, "loss": 2.3776105880737304, "memory(GiB)": 122.07, "step": 9205, "token_acc": 0.5018300122000814, "train_speed(iter/s)": 1.193247 }, { "epoch": 3.156271418779986, "grad_norm": 1.758013129234314, "learning_rate": 2.99619418521931e-05, "loss": 2.261616897583008, "memory(GiB)": 122.07, "step": 9210, "token_acc": 0.5223880597014925, "train_speed(iter/s)": 1.193256 }, { "epoch": 3.1579849211788895, "grad_norm": 1.837875247001648, "learning_rate": 2.991263420817296e-05, "loss": 2.3039133071899416, "memory(GiB)": 122.07, "step": 9215, "token_acc": 0.5110629067245119, "train_speed(iter/s)": 1.193292 }, { "epoch": 3.159698423577793, "grad_norm": 1.763244867324829, "learning_rate": 2.9863349847859323e-05, "loss": 2.2979745864868164, "memory(GiB)": 122.07, "step": 9220, "token_acc": 0.4950452391210685, "train_speed(iter/s)": 1.193309 }, { "epoch": 3.1614119259766964, "grad_norm": 1.8881205320358276, "learning_rate": 2.9814088828378785e-05, "loss": 2.304273796081543, "memory(GiB)": 122.07, "step": 9225, "token_acc": 0.504302925989673, "train_speed(iter/s)": 1.193358 }, { "epoch": 3.1631254283756, "grad_norm": 1.8006993532180786, "learning_rate": 2.976485120683089e-05, "loss": 2.168000030517578, "memory(GiB)": 122.07, "step": 9230, "token_acc": 0.5161583978152026, "train_speed(iter/s)": 1.193378 }, { "epoch": 3.1648389307745033, "grad_norm": 2.0978946685791016, "learning_rate": 2.9715637040288036e-05, "loss": 2.2787952423095703, "memory(GiB)": 122.07, "step": 9235, "token_acc": 0.508245243128964, "train_speed(iter/s)": 1.193304 }, { "epoch": 3.1665524331734063, "grad_norm": 1.8385149240493774, "learning_rate": 2.966644638579542e-05, "loss": 2.3131908416748046, "memory(GiB)": 122.07, "step": 9240, "token_acc": 0.510575296108291, "train_speed(iter/s)": 1.193343 }, { "epoch": 3.1682659355723097, "grad_norm": 1.8402957916259766, "learning_rate": 2.9617279300371037e-05, "loss": 2.333654594421387, "memory(GiB)": 122.07, "step": 9245, "token_acc": 0.496, "train_speed(iter/s)": 1.193317 }, { "epoch": 3.169979437971213, "grad_norm": 1.590035080909729, "learning_rate": 2.9568135841005513e-05, "loss": 2.272317314147949, "memory(GiB)": 122.07, "step": 9250, "token_acc": 0.5172113289760348, "train_speed(iter/s)": 1.19335 }, { "epoch": 3.1716929403701166, "grad_norm": 1.625421166419983, "learning_rate": 2.951901606466212e-05, "loss": 2.253430938720703, "memory(GiB)": 122.07, "step": 9255, "token_acc": 0.5180240320427236, "train_speed(iter/s)": 1.193365 }, { "epoch": 3.17340644276902, "grad_norm": 1.8977588415145874, "learning_rate": 2.9469920028276686e-05, "loss": 2.3270416259765625, "memory(GiB)": 122.07, "step": 9260, "token_acc": 0.49325153374233127, "train_speed(iter/s)": 1.19325 }, { "epoch": 3.175119945167923, "grad_norm": 1.868798851966858, "learning_rate": 2.9420847788757477e-05, "loss": 2.199062728881836, "memory(GiB)": 122.07, "step": 9265, "token_acc": 0.5241698192517865, "train_speed(iter/s)": 1.193251 }, { "epoch": 3.1768334475668265, "grad_norm": 1.5675784349441528, "learning_rate": 2.9371799402985235e-05, "loss": 2.269053840637207, "memory(GiB)": 122.07, "step": 9270, "token_acc": 0.5076595744680851, "train_speed(iter/s)": 1.193251 }, { "epoch": 3.17854694996573, "grad_norm": 1.8298683166503906, "learning_rate": 2.932277492781298e-05, "loss": 2.165645408630371, "memory(GiB)": 122.07, "step": 9275, "token_acc": 0.5128529287821323, "train_speed(iter/s)": 1.193267 }, { "epoch": 3.1802604523646334, "grad_norm": 1.6845191717147827, "learning_rate": 2.927377442006608e-05, "loss": 2.285168266296387, "memory(GiB)": 122.07, "step": 9280, "token_acc": 0.5114440282979609, "train_speed(iter/s)": 1.193168 }, { "epoch": 3.181973954763537, "grad_norm": 1.7087335586547852, "learning_rate": 2.9224797936542094e-05, "loss": 2.1202369689941407, "memory(GiB)": 122.07, "step": 9285, "token_acc": 0.5351860152398028, "train_speed(iter/s)": 1.193195 }, { "epoch": 3.18368745716244, "grad_norm": 2.0308897495269775, "learning_rate": 2.917584553401075e-05, "loss": 2.2754568099975585, "memory(GiB)": 122.07, "step": 9290, "token_acc": 0.517988729952319, "train_speed(iter/s)": 1.193243 }, { "epoch": 3.1854009595613433, "grad_norm": 1.8136203289031982, "learning_rate": 2.9126917269213827e-05, "loss": 2.3062488555908205, "memory(GiB)": 122.07, "step": 9295, "token_acc": 0.5058214747736093, "train_speed(iter/s)": 1.193242 }, { "epoch": 3.1871144619602467, "grad_norm": 1.880130648612976, "learning_rate": 2.9078013198865193e-05, "loss": 2.2049304962158205, "memory(GiB)": 122.07, "step": 9300, "token_acc": 0.5245231607629428, "train_speed(iter/s)": 1.193305 }, { "epoch": 3.18882796435915, "grad_norm": 1.6949478387832642, "learning_rate": 2.9029133379650575e-05, "loss": 2.21671142578125, "memory(GiB)": 122.07, "step": 9305, "token_acc": 0.5132167547783651, "train_speed(iter/s)": 1.193286 }, { "epoch": 3.1905414667580536, "grad_norm": 1.6619155406951904, "learning_rate": 2.8980277868227678e-05, "loss": 2.2326120376586913, "memory(GiB)": 122.07, "step": 9310, "token_acc": 0.5132231404958678, "train_speed(iter/s)": 1.193128 }, { "epoch": 3.1922549691569566, "grad_norm": 1.6280362606048584, "learning_rate": 2.8931446721226008e-05, "loss": 2.193625640869141, "memory(GiB)": 122.07, "step": 9315, "token_acc": 0.5296296296296297, "train_speed(iter/s)": 1.193117 }, { "epoch": 3.19396847155586, "grad_norm": 1.7770332098007202, "learning_rate": 2.8882639995246786e-05, "loss": 2.3233917236328123, "memory(GiB)": 122.07, "step": 9320, "token_acc": 0.5018618121638395, "train_speed(iter/s)": 1.19316 }, { "epoch": 3.1956819739547635, "grad_norm": 1.8166587352752686, "learning_rate": 2.8833857746862967e-05, "loss": 2.3222341537475586, "memory(GiB)": 122.07, "step": 9325, "token_acc": 0.50201126307321, "train_speed(iter/s)": 1.193163 }, { "epoch": 3.197395476353667, "grad_norm": 1.7648013830184937, "learning_rate": 2.878510003261914e-05, "loss": 2.2196250915527345, "memory(GiB)": 122.07, "step": 9330, "token_acc": 0.5204081632653061, "train_speed(iter/s)": 1.193099 }, { "epoch": 3.1991089787525704, "grad_norm": 1.8286317586898804, "learning_rate": 2.8736366909031455e-05, "loss": 2.1982696533203123, "memory(GiB)": 122.07, "step": 9335, "token_acc": 0.5179552175749894, "train_speed(iter/s)": 1.193141 }, { "epoch": 3.200822481151474, "grad_norm": 1.8763620853424072, "learning_rate": 2.8687658432587507e-05, "loss": 2.2087884902954102, "memory(GiB)": 122.07, "step": 9340, "token_acc": 0.5201072386058981, "train_speed(iter/s)": 1.193202 }, { "epoch": 3.202535983550377, "grad_norm": 1.7342298030853271, "learning_rate": 2.8638974659746377e-05, "loss": 2.350977325439453, "memory(GiB)": 122.07, "step": 9345, "token_acc": 0.5010734220695577, "train_speed(iter/s)": 1.193226 }, { "epoch": 3.2042494859492803, "grad_norm": 1.68598210811615, "learning_rate": 2.8590315646938514e-05, "loss": 2.24718017578125, "memory(GiB)": 122.07, "step": 9350, "token_acc": 0.5166083916083916, "train_speed(iter/s)": 1.193289 }, { "epoch": 3.2059629883481837, "grad_norm": 1.8049529790878296, "learning_rate": 2.854168145056561e-05, "loss": 2.3661706924438475, "memory(GiB)": 122.07, "step": 9355, "token_acc": 0.5058430717863105, "train_speed(iter/s)": 1.193322 }, { "epoch": 3.207676490747087, "grad_norm": 1.7792081832885742, "learning_rate": 2.8493072127000654e-05, "loss": 2.2781753540039062, "memory(GiB)": 122.07, "step": 9360, "token_acc": 0.5096491228070176, "train_speed(iter/s)": 1.193352 }, { "epoch": 3.2093899931459906, "grad_norm": 1.8771134614944458, "learning_rate": 2.8444487732587788e-05, "loss": 2.370772933959961, "memory(GiB)": 122.07, "step": 9365, "token_acc": 0.4922301553968921, "train_speed(iter/s)": 1.193447 }, { "epoch": 3.2111034955448936, "grad_norm": 1.7796372175216675, "learning_rate": 2.839592832364222e-05, "loss": 2.2711212158203127, "memory(GiB)": 122.07, "step": 9370, "token_acc": 0.5198275862068965, "train_speed(iter/s)": 1.193472 }, { "epoch": 3.212816997943797, "grad_norm": 1.746155023574829, "learning_rate": 2.834739395645024e-05, "loss": 2.256227493286133, "memory(GiB)": 122.07, "step": 9375, "token_acc": 0.5100213219616204, "train_speed(iter/s)": 1.193348 }, { "epoch": 3.2145305003427005, "grad_norm": 1.8220038414001465, "learning_rate": 2.8298884687269124e-05, "loss": 2.1999393463134767, "memory(GiB)": 122.07, "step": 9380, "token_acc": 0.5242593387720051, "train_speed(iter/s)": 1.193358 }, { "epoch": 3.216244002741604, "grad_norm": 1.8121503591537476, "learning_rate": 2.8250400572326986e-05, "loss": 2.27034854888916, "memory(GiB)": 122.07, "step": 9385, "token_acc": 0.5087719298245614, "train_speed(iter/s)": 1.193387 }, { "epoch": 3.2179575051405074, "grad_norm": 1.7742366790771484, "learning_rate": 2.820194166782285e-05, "loss": 2.1400615692138674, "memory(GiB)": 122.07, "step": 9390, "token_acc": 0.5278030993618961, "train_speed(iter/s)": 1.193422 }, { "epoch": 3.2196710075394104, "grad_norm": 1.7631847858428955, "learning_rate": 2.815350802992649e-05, "loss": 2.184143829345703, "memory(GiB)": 122.07, "step": 9395, "token_acc": 0.5323063756953359, "train_speed(iter/s)": 1.193471 }, { "epoch": 3.221384509938314, "grad_norm": 1.8940849304199219, "learning_rate": 2.8105099714778426e-05, "loss": 2.141870307922363, "memory(GiB)": 122.07, "step": 9400, "token_acc": 0.5200984413453651, "train_speed(iter/s)": 1.193354 }, { "epoch": 3.2230980123372173, "grad_norm": 1.9165118932724, "learning_rate": 2.805671677848976e-05, "loss": 2.254206085205078, "memory(GiB)": 122.07, "step": 9405, "token_acc": 0.4930817610062893, "train_speed(iter/s)": 1.193389 }, { "epoch": 3.2248115147361207, "grad_norm": 1.8117706775665283, "learning_rate": 2.8008359277142247e-05, "loss": 2.255998992919922, "memory(GiB)": 122.07, "step": 9410, "token_acc": 0.5173017958826106, "train_speed(iter/s)": 1.193437 }, { "epoch": 3.226525017135024, "grad_norm": 2.154123067855835, "learning_rate": 2.796002726678809e-05, "loss": 2.313081741333008, "memory(GiB)": 122.07, "step": 9415, "token_acc": 0.5104636848584325, "train_speed(iter/s)": 1.193501 }, { "epoch": 3.228238519533927, "grad_norm": 2.0489320755004883, "learning_rate": 2.7911720803450016e-05, "loss": 2.296300506591797, "memory(GiB)": 122.07, "step": 9420, "token_acc": 0.5044814340588989, "train_speed(iter/s)": 1.193518 }, { "epoch": 3.2299520219328306, "grad_norm": 1.866715431213379, "learning_rate": 2.7863439943121085e-05, "loss": 2.255726623535156, "memory(GiB)": 122.07, "step": 9425, "token_acc": 0.5215459795646379, "train_speed(iter/s)": 1.193544 }, { "epoch": 3.231665524331734, "grad_norm": 1.8159843683242798, "learning_rate": 2.7815184741764733e-05, "loss": 2.2506160736083984, "memory(GiB)": 122.07, "step": 9430, "token_acc": 0.5210084033613446, "train_speed(iter/s)": 1.193597 }, { "epoch": 3.2333790267306375, "grad_norm": 1.9645839929580688, "learning_rate": 2.7766955255314586e-05, "loss": 2.264515686035156, "memory(GiB)": 122.07, "step": 9435, "token_acc": 0.5151909722222222, "train_speed(iter/s)": 1.193661 }, { "epoch": 3.235092529129541, "grad_norm": 1.7612663507461548, "learning_rate": 2.7718751539674515e-05, "loss": 2.2184629440307617, "memory(GiB)": 122.07, "step": 9440, "token_acc": 0.5098901098901099, "train_speed(iter/s)": 1.193528 }, { "epoch": 3.2368060315284444, "grad_norm": 1.629757285118103, "learning_rate": 2.767057365071852e-05, "loss": 2.2349727630615233, "memory(GiB)": 122.07, "step": 9445, "token_acc": 0.5082828282828283, "train_speed(iter/s)": 1.193539 }, { "epoch": 3.2385195339273474, "grad_norm": 1.9383448362350464, "learning_rate": 2.762242164429063e-05, "loss": 2.2565065383911134, "memory(GiB)": 122.07, "step": 9450, "token_acc": 0.5183003786285233, "train_speed(iter/s)": 1.193569 }, { "epoch": 3.240233036326251, "grad_norm": 1.6744225025177002, "learning_rate": 2.757429557620489e-05, "loss": 2.1893611907958985, "memory(GiB)": 122.07, "step": 9455, "token_acc": 0.515952980688497, "train_speed(iter/s)": 1.193535 }, { "epoch": 3.2419465387251543, "grad_norm": 1.7343993186950684, "learning_rate": 2.752619550224529e-05, "loss": 2.3004392623901366, "memory(GiB)": 122.07, "step": 9460, "token_acc": 0.529637526652452, "train_speed(iter/s)": 1.193508 }, { "epoch": 3.2436600411240577, "grad_norm": 1.6730659008026123, "learning_rate": 2.7478121478165695e-05, "loss": 2.1803064346313477, "memory(GiB)": 122.07, "step": 9465, "token_acc": 0.5278875713658322, "train_speed(iter/s)": 1.193408 }, { "epoch": 3.2453735435229607, "grad_norm": 1.7645726203918457, "learning_rate": 2.7430073559689735e-05, "loss": 2.204039764404297, "memory(GiB)": 122.07, "step": 9470, "token_acc": 0.531810766721044, "train_speed(iter/s)": 1.193433 }, { "epoch": 3.247087045921864, "grad_norm": 1.709431767463684, "learning_rate": 2.738205180251083e-05, "loss": 2.28184814453125, "memory(GiB)": 122.07, "step": 9475, "token_acc": 0.5302385008517888, "train_speed(iter/s)": 1.19345 }, { "epoch": 3.2488005483207676, "grad_norm": 2.0840389728546143, "learning_rate": 2.733405626229203e-05, "loss": 2.200631523132324, "memory(GiB)": 122.07, "step": 9480, "token_acc": 0.5183547103051747, "train_speed(iter/s)": 1.19352 }, { "epoch": 3.250514050719671, "grad_norm": 1.8102812767028809, "learning_rate": 2.7286086994666027e-05, "loss": 2.141961860656738, "memory(GiB)": 122.07, "step": 9485, "token_acc": 0.519914346895075, "train_speed(iter/s)": 1.193583 }, { "epoch": 3.2522275531185745, "grad_norm": 1.7773722410202026, "learning_rate": 2.723814405523506e-05, "loss": 2.2777629852294923, "memory(GiB)": 122.07, "step": 9490, "token_acc": 0.5101708706265257, "train_speed(iter/s)": 1.193456 }, { "epoch": 3.253941055517478, "grad_norm": 1.8093239068984985, "learning_rate": 2.7190227499570854e-05, "loss": 2.2963558197021485, "memory(GiB)": 122.07, "step": 9495, "token_acc": 0.5050416483998247, "train_speed(iter/s)": 1.193487 }, { "epoch": 3.255654557916381, "grad_norm": 1.6847307682037354, "learning_rate": 2.71423373832145e-05, "loss": 2.3196792602539062, "memory(GiB)": 122.07, "step": 9500, "token_acc": 0.5168195718654435, "train_speed(iter/s)": 1.193519 }, { "epoch": 3.255654557916381, "eval_loss": 1.954684853553772, "eval_runtime": 3.7661, "eval_samples_per_second": 26.553, "eval_steps_per_second": 26.553, "eval_token_acc": 0.5122265122265123, "step": 9500 }, { "epoch": 3.2573680603152844, "grad_norm": 1.8393816947937012, "learning_rate": 2.709447376167653e-05, "loss": 2.2991168975830076, "memory(GiB)": 122.07, "step": 9505, "token_acc": 0.5098225602027884, "train_speed(iter/s)": 1.192644 }, { "epoch": 3.259081562714188, "grad_norm": 1.8086848258972168, "learning_rate": 2.704663669043668e-05, "loss": 2.294486427307129, "memory(GiB)": 122.07, "step": 9510, "token_acc": 0.5021834061135371, "train_speed(iter/s)": 1.192689 }, { "epoch": 3.2607950651130913, "grad_norm": 1.9263103008270264, "learning_rate": 2.6998826224943952e-05, "loss": 2.286343574523926, "memory(GiB)": 122.07, "step": 9515, "token_acc": 0.5082266910420475, "train_speed(iter/s)": 1.192758 }, { "epoch": 3.2625085675119947, "grad_norm": 1.6947686672210693, "learning_rate": 2.6951042420616522e-05, "loss": 2.3699859619140624, "memory(GiB)": 122.07, "step": 9520, "token_acc": 0.4976303317535545, "train_speed(iter/s)": 1.192765 }, { "epoch": 3.2642220699108977, "grad_norm": 1.889997124671936, "learning_rate": 2.6903285332841643e-05, "loss": 2.2644182205200196, "memory(GiB)": 122.07, "step": 9525, "token_acc": 0.5264840182648401, "train_speed(iter/s)": 1.192799 }, { "epoch": 3.265935572309801, "grad_norm": 1.7501091957092285, "learning_rate": 2.6855555016975626e-05, "loss": 2.3532482147216798, "memory(GiB)": 122.07, "step": 9530, "token_acc": 0.5040650406504065, "train_speed(iter/s)": 1.192763 }, { "epoch": 3.2676490747087046, "grad_norm": 1.6559194326400757, "learning_rate": 2.680785152834368e-05, "loss": 2.327367973327637, "memory(GiB)": 122.07, "step": 9535, "token_acc": 0.507996632996633, "train_speed(iter/s)": 1.192855 }, { "epoch": 3.269362577107608, "grad_norm": 1.8876538276672363, "learning_rate": 2.676017492224001e-05, "loss": 2.2094131469726563, "memory(GiB)": 122.07, "step": 9540, "token_acc": 0.5244010647737356, "train_speed(iter/s)": 1.192862 }, { "epoch": 3.2710760795065115, "grad_norm": 1.7802082300186157, "learning_rate": 2.671252525392758e-05, "loss": 2.204467010498047, "memory(GiB)": 122.07, "step": 9545, "token_acc": 0.5219534050179212, "train_speed(iter/s)": 1.192881 }, { "epoch": 3.272789581905415, "grad_norm": 1.8658849000930786, "learning_rate": 2.6664902578638173e-05, "loss": 2.21234130859375, "memory(GiB)": 122.07, "step": 9550, "token_acc": 0.5201931518876207, "train_speed(iter/s)": 1.19292 }, { "epoch": 3.274503084304318, "grad_norm": 1.9015576839447021, "learning_rate": 2.661730695157229e-05, "loss": 2.2351844787597654, "memory(GiB)": 122.07, "step": 9555, "token_acc": 0.5202853321444494, "train_speed(iter/s)": 1.192952 }, { "epoch": 3.2762165867032214, "grad_norm": 1.7749882936477661, "learning_rate": 2.656973842789908e-05, "loss": 2.3164825439453125, "memory(GiB)": 122.07, "step": 9560, "token_acc": 0.5182325182325183, "train_speed(iter/s)": 1.1929 }, { "epoch": 3.277930089102125, "grad_norm": 1.6939657926559448, "learning_rate": 2.6522197062756237e-05, "loss": 2.2246259689331054, "memory(GiB)": 122.07, "step": 9565, "token_acc": 0.5133779264214047, "train_speed(iter/s)": 1.192912 }, { "epoch": 3.2796435915010282, "grad_norm": 1.911936640739441, "learning_rate": 2.6474682911250015e-05, "loss": 2.2676462173461913, "memory(GiB)": 122.07, "step": 9570, "token_acc": 0.5132743362831859, "train_speed(iter/s)": 1.192916 }, { "epoch": 3.2813570938999312, "grad_norm": 1.8385761976242065, "learning_rate": 2.6427196028455092e-05, "loss": 2.328933906555176, "memory(GiB)": 122.07, "step": 9575, "token_acc": 0.5083571137382796, "train_speed(iter/s)": 1.192989 }, { "epoch": 3.2830705962988347, "grad_norm": 1.6944642066955566, "learning_rate": 2.637973646941457e-05, "loss": 2.201358413696289, "memory(GiB)": 122.07, "step": 9580, "token_acc": 0.5304721030042918, "train_speed(iter/s)": 1.193013 }, { "epoch": 3.284784098697738, "grad_norm": 2.0842530727386475, "learning_rate": 2.633230428913986e-05, "loss": 2.3770116806030273, "memory(GiB)": 122.07, "step": 9585, "token_acc": 0.4940017137960583, "train_speed(iter/s)": 1.193013 }, { "epoch": 3.2864976010966416, "grad_norm": 1.7518855333328247, "learning_rate": 2.628489954261064e-05, "loss": 2.336275100708008, "memory(GiB)": 122.07, "step": 9590, "token_acc": 0.512006512006512, "train_speed(iter/s)": 1.193046 }, { "epoch": 3.288211103495545, "grad_norm": 1.875346302986145, "learning_rate": 2.623752228477483e-05, "loss": 2.2839269638061523, "memory(GiB)": 122.07, "step": 9595, "token_acc": 0.5114192495921697, "train_speed(iter/s)": 1.193063 }, { "epoch": 3.2899246058944485, "grad_norm": 1.869600772857666, "learning_rate": 2.619017257054841e-05, "loss": 2.2238895416259767, "memory(GiB)": 122.07, "step": 9600, "token_acc": 0.5283882783882784, "train_speed(iter/s)": 1.193059 }, { "epoch": 3.2916381082933515, "grad_norm": 1.8584152460098267, "learning_rate": 2.614285045481546e-05, "loss": 2.333233451843262, "memory(GiB)": 122.07, "step": 9605, "token_acc": 0.5058069683620344, "train_speed(iter/s)": 1.193094 }, { "epoch": 3.293351610692255, "grad_norm": 2.0154781341552734, "learning_rate": 2.6095555992428113e-05, "loss": 2.2656604766845705, "memory(GiB)": 122.07, "step": 9610, "token_acc": 0.5130397605814451, "train_speed(iter/s)": 1.193035 }, { "epoch": 3.2950651130911583, "grad_norm": 1.7589367628097534, "learning_rate": 2.6048289238206396e-05, "loss": 2.167797088623047, "memory(GiB)": 122.07, "step": 9615, "token_acc": 0.5333930972658001, "train_speed(iter/s)": 1.193072 }, { "epoch": 3.296778615490062, "grad_norm": 1.9211636781692505, "learning_rate": 2.6001050246938264e-05, "loss": 2.2919384002685548, "memory(GiB)": 122.07, "step": 9620, "token_acc": 0.49259415996614475, "train_speed(iter/s)": 1.193091 }, { "epoch": 3.2984921178889652, "grad_norm": 1.9027773141860962, "learning_rate": 2.5953839073379473e-05, "loss": 2.271462249755859, "memory(GiB)": 122.07, "step": 9625, "token_acc": 0.5300225733634312, "train_speed(iter/s)": 1.193046 }, { "epoch": 3.3002056202878682, "grad_norm": 1.7802079916000366, "learning_rate": 2.590665577225351e-05, "loss": 2.289729690551758, "memory(GiB)": 122.07, "step": 9630, "token_acc": 0.5193298969072165, "train_speed(iter/s)": 1.193044 }, { "epoch": 3.3019191226867717, "grad_norm": 1.5545793771743774, "learning_rate": 2.5859500398251567e-05, "loss": 2.2673206329345703, "memory(GiB)": 122.07, "step": 9635, "token_acc": 0.5083682008368201, "train_speed(iter/s)": 1.19305 }, { "epoch": 3.303632625085675, "grad_norm": 1.738567590713501, "learning_rate": 2.5812373006032487e-05, "loss": 2.332536506652832, "memory(GiB)": 122.07, "step": 9640, "token_acc": 0.5204918032786885, "train_speed(iter/s)": 1.193117 }, { "epoch": 3.3053461274845786, "grad_norm": 1.92537260055542, "learning_rate": 2.5765273650222665e-05, "loss": 2.2888681411743166, "memory(GiB)": 122.07, "step": 9645, "token_acc": 0.5158762886597938, "train_speed(iter/s)": 1.193122 }, { "epoch": 3.307059629883482, "grad_norm": 2.0225696563720703, "learning_rate": 2.5718202385415996e-05, "loss": 2.26499080657959, "memory(GiB)": 122.07, "step": 9650, "token_acc": 0.5162006213936973, "train_speed(iter/s)": 1.193107 }, { "epoch": 3.308773132282385, "grad_norm": 1.7276281118392944, "learning_rate": 2.5671159266173816e-05, "loss": 2.1335094451904295, "memory(GiB)": 122.07, "step": 9655, "token_acc": 0.5325168918918919, "train_speed(iter/s)": 1.193097 }, { "epoch": 3.3104866346812885, "grad_norm": 1.6778994798660278, "learning_rate": 2.562414434702486e-05, "loss": 2.2123998641967773, "memory(GiB)": 122.07, "step": 9660, "token_acc": 0.5302768166089965, "train_speed(iter/s)": 1.193168 }, { "epoch": 3.312200137080192, "grad_norm": 1.7577285766601562, "learning_rate": 2.5577157682465124e-05, "loss": 2.248191070556641, "memory(GiB)": 122.07, "step": 9665, "token_acc": 0.5123216601815823, "train_speed(iter/s)": 1.19303 }, { "epoch": 3.3139136394790953, "grad_norm": 1.8301759958267212, "learning_rate": 2.553019932695787e-05, "loss": 2.3122411727905274, "memory(GiB)": 122.07, "step": 9670, "token_acc": 0.5023534445870774, "train_speed(iter/s)": 1.19306 }, { "epoch": 3.315627141877999, "grad_norm": 1.7588082551956177, "learning_rate": 2.548326933493358e-05, "loss": 2.2904829025268554, "memory(GiB)": 122.07, "step": 9675, "token_acc": 0.5066079295154186, "train_speed(iter/s)": 1.193074 }, { "epoch": 3.317340644276902, "grad_norm": 1.7105692625045776, "learning_rate": 2.543636776078983e-05, "loss": 2.251919174194336, "memory(GiB)": 122.07, "step": 9680, "token_acc": 0.5141548709408826, "train_speed(iter/s)": 1.193077 }, { "epoch": 3.3190541466758052, "grad_norm": 2.085932970046997, "learning_rate": 2.538949465889127e-05, "loss": 2.3872589111328124, "memory(GiB)": 122.07, "step": 9685, "token_acc": 0.5139344262295082, "train_speed(iter/s)": 1.192982 }, { "epoch": 3.3207676490747087, "grad_norm": 1.6536376476287842, "learning_rate": 2.5342650083569562e-05, "loss": 2.230014610290527, "memory(GiB)": 122.07, "step": 9690, "token_acc": 0.5209760273972602, "train_speed(iter/s)": 1.192995 }, { "epoch": 3.322481151473612, "grad_norm": 1.7877484560012817, "learning_rate": 2.5295834089123254e-05, "loss": 2.2710670471191405, "memory(GiB)": 122.07, "step": 9695, "token_acc": 0.5192472198460223, "train_speed(iter/s)": 1.193036 }, { "epoch": 3.3241946538725156, "grad_norm": 1.7263041734695435, "learning_rate": 2.524904672981778e-05, "loss": 2.2261188507080076, "memory(GiB)": 122.07, "step": 9700, "token_acc": 0.5241666666666667, "train_speed(iter/s)": 1.193028 }, { "epoch": 3.325908156271419, "grad_norm": 1.9658803939819336, "learning_rate": 2.5202288059885414e-05, "loss": 2.3140241622924806, "memory(GiB)": 122.07, "step": 9705, "token_acc": 0.5039335664335665, "train_speed(iter/s)": 1.193082 }, { "epoch": 3.327621658670322, "grad_norm": 1.6963913440704346, "learning_rate": 2.5155558133525148e-05, "loss": 2.2622798919677733, "memory(GiB)": 122.07, "step": 9710, "token_acc": 0.5181034482758621, "train_speed(iter/s)": 1.193156 }, { "epoch": 3.3293351610692254, "grad_norm": 1.9998455047607422, "learning_rate": 2.5108857004902676e-05, "loss": 2.358232307434082, "memory(GiB)": 122.07, "step": 9715, "token_acc": 0.49469664828171406, "train_speed(iter/s)": 1.193065 }, { "epoch": 3.331048663468129, "grad_norm": 1.7855279445648193, "learning_rate": 2.5062184728150316e-05, "loss": 2.312113571166992, "memory(GiB)": 122.07, "step": 9720, "token_acc": 0.5004226542688082, "train_speed(iter/s)": 1.1931 }, { "epoch": 3.3327621658670323, "grad_norm": 1.6063978672027588, "learning_rate": 2.501554135736691e-05, "loss": 2.171634292602539, "memory(GiB)": 122.07, "step": 9725, "token_acc": 0.5304932735426009, "train_speed(iter/s)": 1.193081 }, { "epoch": 3.3344756682659353, "grad_norm": 1.7458549737930298, "learning_rate": 2.4968926946617786e-05, "loss": 2.254778289794922, "memory(GiB)": 122.07, "step": 9730, "token_acc": 0.5232607767819035, "train_speed(iter/s)": 1.193155 }, { "epoch": 3.3361891706648388, "grad_norm": 1.7779192924499512, "learning_rate": 2.4922341549934764e-05, "loss": 2.287476348876953, "memory(GiB)": 122.07, "step": 9735, "token_acc": 0.4995751911639762, "train_speed(iter/s)": 1.193191 }, { "epoch": 3.337902673063742, "grad_norm": 1.5234028100967407, "learning_rate": 2.487578522131599e-05, "loss": 2.2172008514404298, "memory(GiB)": 122.07, "step": 9740, "token_acc": 0.5059111292295149, "train_speed(iter/s)": 1.193198 }, { "epoch": 3.3396161754626457, "grad_norm": 1.771582007408142, "learning_rate": 2.4829258014725927e-05, "loss": 2.296998405456543, "memory(GiB)": 122.07, "step": 9745, "token_acc": 0.5135135135135135, "train_speed(iter/s)": 1.193248 }, { "epoch": 3.341329677861549, "grad_norm": 1.6710354089736938, "learning_rate": 2.4782759984095276e-05, "loss": 2.2894086837768555, "memory(GiB)": 122.07, "step": 9750, "token_acc": 0.5198237885462555, "train_speed(iter/s)": 1.193294 }, { "epoch": 3.3430431802604526, "grad_norm": 1.9289501905441284, "learning_rate": 2.473629118332096e-05, "loss": 2.144314002990723, "memory(GiB)": 122.07, "step": 9755, "token_acc": 0.5247951703320397, "train_speed(iter/s)": 1.193154 }, { "epoch": 3.3447566826593556, "grad_norm": 2.0259079933166504, "learning_rate": 2.468985166626595e-05, "loss": 2.196807098388672, "memory(GiB)": 122.07, "step": 9760, "token_acc": 0.5110746513535686, "train_speed(iter/s)": 1.19314 }, { "epoch": 3.346470185058259, "grad_norm": 2.1351943016052246, "learning_rate": 2.4643441486759298e-05, "loss": 2.2174003601074217, "memory(GiB)": 122.07, "step": 9765, "token_acc": 0.515673289183223, "train_speed(iter/s)": 1.193179 }, { "epoch": 3.3481836874571624, "grad_norm": 1.6572248935699463, "learning_rate": 2.459706069859608e-05, "loss": 2.268656539916992, "memory(GiB)": 122.07, "step": 9770, "token_acc": 0.5040580948312687, "train_speed(iter/s)": 1.193231 }, { "epoch": 3.349897189856066, "grad_norm": 1.8621501922607422, "learning_rate": 2.4550709355537282e-05, "loss": 2.3003795623779295, "memory(GiB)": 122.07, "step": 9775, "token_acc": 0.5141969831410825, "train_speed(iter/s)": 1.193283 }, { "epoch": 3.3516106922549693, "grad_norm": 1.8723725080490112, "learning_rate": 2.4504387511309775e-05, "loss": 2.368662643432617, "memory(GiB)": 122.07, "step": 9780, "token_acc": 0.5086607520067596, "train_speed(iter/s)": 1.193238 }, { "epoch": 3.3533241946538723, "grad_norm": 1.7103732824325562, "learning_rate": 2.4458095219606226e-05, "loss": 2.259201240539551, "memory(GiB)": 122.07, "step": 9785, "token_acc": 0.5137078651685393, "train_speed(iter/s)": 1.193311 }, { "epoch": 3.3550376970527758, "grad_norm": 2.0507569313049316, "learning_rate": 2.4411832534085037e-05, "loss": 2.3012699127197265, "memory(GiB)": 122.07, "step": 9790, "token_acc": 0.5203775203775204, "train_speed(iter/s)": 1.193322 }, { "epoch": 3.356751199451679, "grad_norm": 1.8965412378311157, "learning_rate": 2.4365599508370286e-05, "loss": 2.335906219482422, "memory(GiB)": 122.07, "step": 9795, "token_acc": 0.5034904013961605, "train_speed(iter/s)": 1.193326 }, { "epoch": 3.3584647018505827, "grad_norm": 1.7463477849960327, "learning_rate": 2.4319396196051704e-05, "loss": 2.3336919784545898, "memory(GiB)": 122.07, "step": 9800, "token_acc": 0.504884720593982, "train_speed(iter/s)": 1.193371 }, { "epoch": 3.360178204249486, "grad_norm": 1.7060256004333496, "learning_rate": 2.4273222650684557e-05, "loss": 2.278851127624512, "memory(GiB)": 122.07, "step": 9805, "token_acc": 0.5137096774193548, "train_speed(iter/s)": 1.193351 }, { "epoch": 3.3618917066483895, "grad_norm": 1.8159931898117065, "learning_rate": 2.4227078925789627e-05, "loss": 2.3220001220703126, "memory(GiB)": 122.07, "step": 9810, "token_acc": 0.4983673469387755, "train_speed(iter/s)": 1.193386 }, { "epoch": 3.3636052090472925, "grad_norm": 1.666495442390442, "learning_rate": 2.418096507485313e-05, "loss": 2.119193267822266, "memory(GiB)": 122.07, "step": 9815, "token_acc": 0.5290208241027913, "train_speed(iter/s)": 1.193396 }, { "epoch": 3.365318711446196, "grad_norm": 1.823530673980713, "learning_rate": 2.413488115132662e-05, "loss": 2.391549301147461, "memory(GiB)": 122.07, "step": 9820, "token_acc": 0.5050590219224284, "train_speed(iter/s)": 1.193369 }, { "epoch": 3.3670322138450994, "grad_norm": 1.952932596206665, "learning_rate": 2.4088827208626976e-05, "loss": 2.218405914306641, "memory(GiB)": 122.07, "step": 9825, "token_acc": 0.5263598326359833, "train_speed(iter/s)": 1.193362 }, { "epoch": 3.368745716244003, "grad_norm": 2.44956111907959, "learning_rate": 2.404280330013634e-05, "loss": 2.1843074798583983, "memory(GiB)": 122.07, "step": 9830, "token_acc": 0.522231543624161, "train_speed(iter/s)": 1.193333 }, { "epoch": 3.370459218642906, "grad_norm": 1.657459020614624, "learning_rate": 2.3996809479202048e-05, "loss": 2.2845943450927733, "memory(GiB)": 122.07, "step": 9835, "token_acc": 0.5067283431455004, "train_speed(iter/s)": 1.193325 }, { "epoch": 3.3721727210418093, "grad_norm": 1.7553902864456177, "learning_rate": 2.3950845799136535e-05, "loss": 2.1353900909423826, "memory(GiB)": 122.07, "step": 9840, "token_acc": 0.5342869162810265, "train_speed(iter/s)": 1.193349 }, { "epoch": 3.3738862234407128, "grad_norm": 1.6238294839859009, "learning_rate": 2.3904912313217336e-05, "loss": 2.273978042602539, "memory(GiB)": 122.07, "step": 9845, "token_acc": 0.5101378751013788, "train_speed(iter/s)": 1.193287 }, { "epoch": 3.375599725839616, "grad_norm": 1.6317285299301147, "learning_rate": 2.3859009074686928e-05, "loss": 2.2021453857421873, "memory(GiB)": 122.07, "step": 9850, "token_acc": 0.5298411335337054, "train_speed(iter/s)": 1.193331 }, { "epoch": 3.3773132282385196, "grad_norm": 1.7587393522262573, "learning_rate": 2.3813136136752794e-05, "loss": 2.228514862060547, "memory(GiB)": 122.07, "step": 9855, "token_acc": 0.5244969378827646, "train_speed(iter/s)": 1.193268 }, { "epoch": 3.379026730637423, "grad_norm": 2.1510117053985596, "learning_rate": 2.376729355258722e-05, "loss": 2.256652069091797, "memory(GiB)": 122.07, "step": 9860, "token_acc": 0.5018152480839048, "train_speed(iter/s)": 1.193258 }, { "epoch": 3.380740233036326, "grad_norm": 2.0894572734832764, "learning_rate": 2.3721481375327388e-05, "loss": 2.248556137084961, "memory(GiB)": 122.07, "step": 9865, "token_acc": 0.515527950310559, "train_speed(iter/s)": 1.193176 }, { "epoch": 3.3824537354352295, "grad_norm": 1.636633038520813, "learning_rate": 2.3675699658075186e-05, "loss": 2.188521957397461, "memory(GiB)": 122.07, "step": 9870, "token_acc": 0.5129598662207357, "train_speed(iter/s)": 1.193175 }, { "epoch": 3.384167237834133, "grad_norm": 1.8968383073806763, "learning_rate": 2.3629948453897204e-05, "loss": 2.2012201309204102, "memory(GiB)": 122.07, "step": 9875, "token_acc": 0.5256030469741854, "train_speed(iter/s)": 1.193185 }, { "epoch": 3.3858807402330364, "grad_norm": 1.7751601934432983, "learning_rate": 2.3584227815824693e-05, "loss": 2.198863410949707, "memory(GiB)": 122.07, "step": 9880, "token_acc": 0.5184729064039408, "train_speed(iter/s)": 1.193131 }, { "epoch": 3.38759424263194, "grad_norm": 1.8402903079986572, "learning_rate": 2.353853779685342e-05, "loss": 2.284459686279297, "memory(GiB)": 122.07, "step": 9885, "token_acc": 0.518365662401981, "train_speed(iter/s)": 1.19314 }, { "epoch": 3.389307745030843, "grad_norm": 1.6719849109649658, "learning_rate": 2.3492878449943685e-05, "loss": 2.244987678527832, "memory(GiB)": 122.07, "step": 9890, "token_acc": 0.5219869706840391, "train_speed(iter/s)": 1.193176 }, { "epoch": 3.3910212474297463, "grad_norm": 1.9230955839157104, "learning_rate": 2.3447249828020245e-05, "loss": 2.36694450378418, "memory(GiB)": 122.07, "step": 9895, "token_acc": 0.49313358302122345, "train_speed(iter/s)": 1.193208 }, { "epoch": 3.3927347498286498, "grad_norm": 1.8671544790267944, "learning_rate": 2.3401651983972245e-05, "loss": 2.3673847198486326, "memory(GiB)": 122.07, "step": 9900, "token_acc": 0.49750415973377704, "train_speed(iter/s)": 1.193242 }, { "epoch": 3.394448252227553, "grad_norm": 1.8117700815200806, "learning_rate": 2.3356084970653135e-05, "loss": 2.2932151794433593, "memory(GiB)": 122.07, "step": 9905, "token_acc": 0.5159620362381363, "train_speed(iter/s)": 1.19323 }, { "epoch": 3.3961617546264566, "grad_norm": 1.6432514190673828, "learning_rate": 2.3310548840880674e-05, "loss": 2.181677055358887, "memory(GiB)": 122.07, "step": 9910, "token_acc": 0.5173858399664851, "train_speed(iter/s)": 1.193263 }, { "epoch": 3.39787525702536, "grad_norm": 1.9921050071716309, "learning_rate": 2.326504364743674e-05, "loss": 2.2637577056884766, "memory(GiB)": 122.07, "step": 9915, "token_acc": 0.5131133671742809, "train_speed(iter/s)": 1.193249 }, { "epoch": 3.399588759424263, "grad_norm": 2.0269384384155273, "learning_rate": 2.3219569443067446e-05, "loss": 2.233092498779297, "memory(GiB)": 122.07, "step": 9920, "token_acc": 0.5202338129496403, "train_speed(iter/s)": 1.193286 }, { "epoch": 3.4013022618231665, "grad_norm": 2.2034876346588135, "learning_rate": 2.31741262804829e-05, "loss": 2.3524139404296873, "memory(GiB)": 122.07, "step": 9925, "token_acc": 0.5085508550855086, "train_speed(iter/s)": 1.1933 }, { "epoch": 3.40301576422207, "grad_norm": 1.7878025770187378, "learning_rate": 2.3128714212357288e-05, "loss": 2.32403678894043, "memory(GiB)": 122.07, "step": 9930, "token_acc": 0.5035971223021583, "train_speed(iter/s)": 1.193309 }, { "epoch": 3.4047292666209734, "grad_norm": 1.8749325275421143, "learning_rate": 2.3083333291328735e-05, "loss": 2.365508460998535, "memory(GiB)": 122.07, "step": 9935, "token_acc": 0.4958368026644463, "train_speed(iter/s)": 1.193292 }, { "epoch": 3.4064427690198764, "grad_norm": 1.9509077072143555, "learning_rate": 2.3037983569999277e-05, "loss": 2.3046028137207033, "memory(GiB)": 122.07, "step": 9940, "token_acc": 0.5071090047393365, "train_speed(iter/s)": 1.19331 }, { "epoch": 3.40815627141878, "grad_norm": 1.8185195922851562, "learning_rate": 2.299266510093474e-05, "loss": 2.2463125228881835, "memory(GiB)": 122.07, "step": 9945, "token_acc": 0.5056034482758621, "train_speed(iter/s)": 1.193126 }, { "epoch": 3.4098697738176833, "grad_norm": 1.72922682762146, "learning_rate": 2.2947377936664795e-05, "loss": 2.233688163757324, "memory(GiB)": 122.07, "step": 9950, "token_acc": 0.5110095554632322, "train_speed(iter/s)": 1.193148 }, { "epoch": 3.4115832762165867, "grad_norm": 1.8313900232315063, "learning_rate": 2.290212212968274e-05, "loss": 2.2468441009521483, "memory(GiB)": 122.07, "step": 9955, "token_acc": 0.514311676510677, "train_speed(iter/s)": 1.193096 }, { "epoch": 3.41329677861549, "grad_norm": 1.8412357568740845, "learning_rate": 2.2856897732445605e-05, "loss": 2.2263725280761717, "memory(GiB)": 122.07, "step": 9960, "token_acc": 0.520605550883095, "train_speed(iter/s)": 1.193153 }, { "epoch": 3.4150102810143936, "grad_norm": 2.002868175506592, "learning_rate": 2.281170479737397e-05, "loss": 2.2106595993041993, "memory(GiB)": 122.07, "step": 9965, "token_acc": 0.5278503046127067, "train_speed(iter/s)": 1.193157 }, { "epoch": 3.4167237834132966, "grad_norm": 1.7946447134017944, "learning_rate": 2.2766543376851962e-05, "loss": 2.376206970214844, "memory(GiB)": 122.07, "step": 9970, "token_acc": 0.49255583126550867, "train_speed(iter/s)": 1.193156 }, { "epoch": 3.4184372858122, "grad_norm": 1.831265926361084, "learning_rate": 2.2721413523227205e-05, "loss": 2.165079879760742, "memory(GiB)": 122.07, "step": 9975, "token_acc": 0.5261875761266748, "train_speed(iter/s)": 1.193147 }, { "epoch": 3.4201507882111035, "grad_norm": 1.7947994470596313, "learning_rate": 2.2676315288810645e-05, "loss": 2.1944183349609374, "memory(GiB)": 122.07, "step": 9980, "token_acc": 0.5292110874200426, "train_speed(iter/s)": 1.193154 }, { "epoch": 3.421864290610007, "grad_norm": 1.9175325632095337, "learning_rate": 2.2631248725876687e-05, "loss": 2.240588569641113, "memory(GiB)": 122.07, "step": 9985, "token_acc": 0.5137533274179237, "train_speed(iter/s)": 1.193194 }, { "epoch": 3.42357779300891, "grad_norm": 1.7382193803787231, "learning_rate": 2.258621388666293e-05, "loss": 2.3054025650024412, "memory(GiB)": 122.07, "step": 9990, "token_acc": 0.5134420449537241, "train_speed(iter/s)": 1.193116 }, { "epoch": 3.4252912954078134, "grad_norm": 1.7441290616989136, "learning_rate": 2.2541210823370266e-05, "loss": 2.2081859588623045, "memory(GiB)": 122.07, "step": 9995, "token_acc": 0.534777176176593, "train_speed(iter/s)": 1.193156 }, { "epoch": 3.427004797806717, "grad_norm": 1.84006929397583, "learning_rate": 2.249623958816273e-05, "loss": 2.2778568267822266, "memory(GiB)": 122.07, "step": 10000, "token_acc": 0.5203145478374837, "train_speed(iter/s)": 1.193184 }, { "epoch": 3.427004797806717, "eval_loss": 1.9562283754348755, "eval_runtime": 3.7728, "eval_samples_per_second": 26.505, "eval_steps_per_second": 26.505, "eval_token_acc": 0.5127175368139224, "step": 10000 }, { "epoch": 3.4287183002056203, "grad_norm": 2.779076337814331, "learning_rate": 2.245130023316749e-05, "loss": 2.1947504043579102, "memory(GiB)": 122.07, "step": 10005, "token_acc": 0.5167432698621143, "train_speed(iter/s)": 1.192441 }, { "epoch": 3.4304318026045237, "grad_norm": 1.7597802877426147, "learning_rate": 2.2406392810474698e-05, "loss": 2.2214942932128907, "memory(GiB)": 122.07, "step": 10010, "token_acc": 0.5130820399113082, "train_speed(iter/s)": 1.192428 }, { "epoch": 3.432145305003427, "grad_norm": 1.6939657926559448, "learning_rate": 2.2361517372137575e-05, "loss": 2.267522621154785, "memory(GiB)": 122.07, "step": 10015, "token_acc": 0.5229318474067723, "train_speed(iter/s)": 1.192433 }, { "epoch": 3.43385880740233, "grad_norm": 1.967018961906433, "learning_rate": 2.231667397017218e-05, "loss": 2.265839385986328, "memory(GiB)": 122.07, "step": 10020, "token_acc": 0.5147874306839186, "train_speed(iter/s)": 1.192456 }, { "epoch": 3.4355723098012336, "grad_norm": 1.7294338941574097, "learning_rate": 2.2271862656557513e-05, "loss": 2.3644847869873047, "memory(GiB)": 122.07, "step": 10025, "token_acc": 0.504704875962361, "train_speed(iter/s)": 1.192361 }, { "epoch": 3.437285812200137, "grad_norm": 2.035295009613037, "learning_rate": 2.2227083483235345e-05, "loss": 2.2614814758300783, "memory(GiB)": 122.07, "step": 10030, "token_acc": 0.5186631944444444, "train_speed(iter/s)": 1.192391 }, { "epoch": 3.4389993145990405, "grad_norm": 1.6661983728408813, "learning_rate": 2.2182336502110223e-05, "loss": 2.2035171508789064, "memory(GiB)": 122.07, "step": 10035, "token_acc": 0.5237096097356274, "train_speed(iter/s)": 1.192396 }, { "epoch": 3.440712816997944, "grad_norm": 1.638975977897644, "learning_rate": 2.213762176504931e-05, "loss": 2.284648895263672, "memory(GiB)": 122.07, "step": 10040, "token_acc": 0.5086348684210527, "train_speed(iter/s)": 1.192304 }, { "epoch": 3.442426319396847, "grad_norm": 1.8652617931365967, "learning_rate": 2.209293932388246e-05, "loss": 2.274873733520508, "memory(GiB)": 122.07, "step": 10045, "token_acc": 0.5118503118503118, "train_speed(iter/s)": 1.19234 }, { "epoch": 3.4441398217957504, "grad_norm": 1.9299073219299316, "learning_rate": 2.2048289230402093e-05, "loss": 2.2451107025146486, "memory(GiB)": 122.07, "step": 10050, "token_acc": 0.5214669051878354, "train_speed(iter/s)": 1.192375 }, { "epoch": 3.445853324194654, "grad_norm": 1.9086673259735107, "learning_rate": 2.2003671536363073e-05, "loss": 2.2252471923828123, "memory(GiB)": 122.07, "step": 10055, "token_acc": 0.5137931034482759, "train_speed(iter/s)": 1.192432 }, { "epoch": 3.4475668265935573, "grad_norm": 1.951088309288025, "learning_rate": 2.1959086293482767e-05, "loss": 2.2767318725585937, "memory(GiB)": 122.07, "step": 10060, "token_acc": 0.5262677021470992, "train_speed(iter/s)": 1.192411 }, { "epoch": 3.4492803289924607, "grad_norm": 2.037562370300293, "learning_rate": 2.1914533553440908e-05, "loss": 2.123685836791992, "memory(GiB)": 122.07, "step": 10065, "token_acc": 0.5189873417721519, "train_speed(iter/s)": 1.192354 }, { "epoch": 3.450993831391364, "grad_norm": 1.8273729085922241, "learning_rate": 2.1870013367879573e-05, "loss": 2.245649528503418, "memory(GiB)": 122.07, "step": 10070, "token_acc": 0.5101249461439035, "train_speed(iter/s)": 1.192389 }, { "epoch": 3.452707333790267, "grad_norm": 1.7246378660202026, "learning_rate": 2.182552578840305e-05, "loss": 2.258312225341797, "memory(GiB)": 122.07, "step": 10075, "token_acc": 0.5064502704952143, "train_speed(iter/s)": 1.192363 }, { "epoch": 3.4544208361891706, "grad_norm": 1.87606942653656, "learning_rate": 2.1781070866577906e-05, "loss": 2.169177436828613, "memory(GiB)": 122.07, "step": 10080, "token_acc": 0.526685393258427, "train_speed(iter/s)": 1.192393 }, { "epoch": 3.456134338588074, "grad_norm": 1.9517576694488525, "learning_rate": 2.173664865393278e-05, "loss": 2.16630859375, "memory(GiB)": 122.07, "step": 10085, "token_acc": 0.514018691588785, "train_speed(iter/s)": 1.192463 }, { "epoch": 3.4578478409869775, "grad_norm": 1.7550653219223022, "learning_rate": 2.169225920195845e-05, "loss": 2.246305465698242, "memory(GiB)": 122.07, "step": 10090, "token_acc": 0.5119899032393773, "train_speed(iter/s)": 1.192473 }, { "epoch": 3.4595613433858805, "grad_norm": 2.1356804370880127, "learning_rate": 2.16479025621077e-05, "loss": 2.2518894195556642, "memory(GiB)": 122.07, "step": 10095, "token_acc": 0.506631299734748, "train_speed(iter/s)": 1.192523 }, { "epoch": 3.461274845784784, "grad_norm": 1.8441284894943237, "learning_rate": 2.1603578785795315e-05, "loss": 2.281854438781738, "memory(GiB)": 122.07, "step": 10100, "token_acc": 0.5096359743040685, "train_speed(iter/s)": 1.192511 }, { "epoch": 3.4629883481836874, "grad_norm": 1.8061394691467285, "learning_rate": 2.1559287924397908e-05, "loss": 2.2678506851196287, "memory(GiB)": 122.07, "step": 10105, "token_acc": 0.514639175257732, "train_speed(iter/s)": 1.19253 }, { "epoch": 3.464701850582591, "grad_norm": 2.0171964168548584, "learning_rate": 2.151503002925402e-05, "loss": 2.3429285049438477, "memory(GiB)": 122.07, "step": 10110, "token_acc": 0.5206140350877193, "train_speed(iter/s)": 1.192603 }, { "epoch": 3.4664153529814943, "grad_norm": 2.083205223083496, "learning_rate": 2.1470805151663948e-05, "loss": 2.24456672668457, "memory(GiB)": 122.07, "step": 10115, "token_acc": 0.5263605442176871, "train_speed(iter/s)": 1.192588 }, { "epoch": 3.4681288553803977, "grad_norm": 1.7336117029190063, "learning_rate": 2.1426613342889696e-05, "loss": 2.385091018676758, "memory(GiB)": 122.07, "step": 10120, "token_acc": 0.4882970137207425, "train_speed(iter/s)": 1.192643 }, { "epoch": 3.4698423577793007, "grad_norm": 1.9709190130233765, "learning_rate": 2.1382454654154975e-05, "loss": 2.28988037109375, "memory(GiB)": 122.07, "step": 10125, "token_acc": 0.5042992261392949, "train_speed(iter/s)": 1.192681 }, { "epoch": 3.471555860178204, "grad_norm": 1.873482584953308, "learning_rate": 2.1338329136645108e-05, "loss": 2.296854591369629, "memory(GiB)": 122.07, "step": 10130, "token_acc": 0.5109207708779443, "train_speed(iter/s)": 1.192752 }, { "epoch": 3.4732693625771076, "grad_norm": 1.7763012647628784, "learning_rate": 2.129423684150691e-05, "loss": 2.167729949951172, "memory(GiB)": 122.07, "step": 10135, "token_acc": 0.5270958777211672, "train_speed(iter/s)": 1.192766 }, { "epoch": 3.474982864976011, "grad_norm": 2.0531392097473145, "learning_rate": 2.1250177819848755e-05, "loss": 2.228240203857422, "memory(GiB)": 122.07, "step": 10140, "token_acc": 0.516157591854803, "train_speed(iter/s)": 1.192812 }, { "epoch": 3.4766963673749145, "grad_norm": 1.9307304620742798, "learning_rate": 2.120615212274043e-05, "loss": 2.3987991333007814, "memory(GiB)": 122.07, "step": 10145, "token_acc": 0.48666375163970266, "train_speed(iter/s)": 1.192738 }, { "epoch": 3.4784098697738175, "grad_norm": 1.977279543876648, "learning_rate": 2.1162159801213056e-05, "loss": 2.3481124877929687, "memory(GiB)": 122.07, "step": 10150, "token_acc": 0.5045831514622435, "train_speed(iter/s)": 1.192786 }, { "epoch": 3.480123372172721, "grad_norm": 1.7516409158706665, "learning_rate": 2.111820090625911e-05, "loss": 2.1643644332885743, "memory(GiB)": 122.07, "step": 10155, "token_acc": 0.5361917934532042, "train_speed(iter/s)": 1.19282 }, { "epoch": 3.4818368745716244, "grad_norm": 1.7820228338241577, "learning_rate": 2.1074275488832347e-05, "loss": 2.2563663482666017, "memory(GiB)": 122.07, "step": 10160, "token_acc": 0.5264538696982947, "train_speed(iter/s)": 1.19282 }, { "epoch": 3.483550376970528, "grad_norm": 1.9238759279251099, "learning_rate": 2.1030383599847626e-05, "loss": 2.2590375900268556, "memory(GiB)": 122.07, "step": 10165, "token_acc": 0.5006284038542103, "train_speed(iter/s)": 1.192871 }, { "epoch": 3.4852638793694313, "grad_norm": 1.9449177980422974, "learning_rate": 2.098652529018103e-05, "loss": 2.2748544692993162, "memory(GiB)": 122.07, "step": 10170, "token_acc": 0.5334218680832227, "train_speed(iter/s)": 1.192751 }, { "epoch": 3.4869773817683347, "grad_norm": 1.845283031463623, "learning_rate": 2.094270061066968e-05, "loss": 2.3211624145507814, "memory(GiB)": 122.07, "step": 10175, "token_acc": 0.5059288537549407, "train_speed(iter/s)": 1.192765 }, { "epoch": 3.4886908841672377, "grad_norm": 1.968048095703125, "learning_rate": 2.0898909612111746e-05, "loss": 2.2774383544921877, "memory(GiB)": 122.07, "step": 10180, "token_acc": 0.5262919310970081, "train_speed(iter/s)": 1.192764 }, { "epoch": 3.490404386566141, "grad_norm": 1.8186397552490234, "learning_rate": 2.0855152345266298e-05, "loss": 2.2911243438720703, "memory(GiB)": 122.07, "step": 10185, "token_acc": 0.509987250318742, "train_speed(iter/s)": 1.192807 }, { "epoch": 3.4921178889650446, "grad_norm": 2.0847506523132324, "learning_rate": 2.0811428860853366e-05, "loss": 2.201257514953613, "memory(GiB)": 122.07, "step": 10190, "token_acc": 0.5392491467576792, "train_speed(iter/s)": 1.19284 }, { "epoch": 3.493831391363948, "grad_norm": 2.0755300521850586, "learning_rate": 2.0767739209553814e-05, "loss": 2.216242027282715, "memory(GiB)": 122.07, "step": 10195, "token_acc": 0.5138306789606035, "train_speed(iter/s)": 1.192756 }, { "epoch": 3.495544893762851, "grad_norm": 1.8884403705596924, "learning_rate": 2.072408344200924e-05, "loss": 2.2187192916870115, "memory(GiB)": 122.07, "step": 10200, "token_acc": 0.5215889464594128, "train_speed(iter/s)": 1.192787 }, { "epoch": 3.4972583961617545, "grad_norm": 1.82899808883667, "learning_rate": 2.068046160882202e-05, "loss": 2.294861602783203, "memory(GiB)": 122.07, "step": 10205, "token_acc": 0.5146996165317427, "train_speed(iter/s)": 1.19277 }, { "epoch": 3.498971898560658, "grad_norm": 1.933526873588562, "learning_rate": 2.06368737605552e-05, "loss": 2.3178218841552733, "memory(GiB)": 122.07, "step": 10210, "token_acc": 0.5002103491796382, "train_speed(iter/s)": 1.192818 }, { "epoch": 3.5006854009595614, "grad_norm": 2.0058982372283936, "learning_rate": 2.0593319947732375e-05, "loss": 2.32427864074707, "memory(GiB)": 122.07, "step": 10215, "token_acc": 0.49459108610990915, "train_speed(iter/s)": 1.192831 }, { "epoch": 3.502398903358465, "grad_norm": 1.9459824562072754, "learning_rate": 2.054980022083774e-05, "loss": 2.3854038238525392, "memory(GiB)": 122.07, "step": 10220, "token_acc": 0.49698275862068964, "train_speed(iter/s)": 1.192814 }, { "epoch": 3.5041124057573683, "grad_norm": 2.0870120525360107, "learning_rate": 2.0506314630315982e-05, "loss": 2.3793027877807615, "memory(GiB)": 122.07, "step": 10225, "token_acc": 0.5043334709038382, "train_speed(iter/s)": 1.192828 }, { "epoch": 3.5058259081562713, "grad_norm": 1.7804927825927734, "learning_rate": 2.046286322657217e-05, "loss": 2.2607919692993166, "memory(GiB)": 122.07, "step": 10230, "token_acc": 0.5165073848827106, "train_speed(iter/s)": 1.192835 }, { "epoch": 3.5075394105551747, "grad_norm": 2.0558183193206787, "learning_rate": 2.04194460599718e-05, "loss": 2.3088937759399415, "memory(GiB)": 122.07, "step": 10235, "token_acc": 0.49936251593710157, "train_speed(iter/s)": 1.192843 }, { "epoch": 3.509252912954078, "grad_norm": 1.8408894538879395, "learning_rate": 2.0376063180840655e-05, "loss": 2.230327606201172, "memory(GiB)": 122.07, "step": 10240, "token_acc": 0.5287452790600083, "train_speed(iter/s)": 1.192871 }, { "epoch": 3.5109664153529816, "grad_norm": 2.058234930038452, "learning_rate": 2.0332714639464807e-05, "loss": 2.2698097229003906, "memory(GiB)": 122.07, "step": 10245, "token_acc": 0.5161870503597122, "train_speed(iter/s)": 1.192908 }, { "epoch": 3.5126799177518846, "grad_norm": 1.8585662841796875, "learning_rate": 2.0289400486090466e-05, "loss": 2.2385717391967774, "memory(GiB)": 122.07, "step": 10250, "token_acc": 0.5185856754306437, "train_speed(iter/s)": 1.192961 }, { "epoch": 3.514393420150788, "grad_norm": 1.777789831161499, "learning_rate": 2.0246120770924053e-05, "loss": 2.249223327636719, "memory(GiB)": 122.07, "step": 10255, "token_acc": 0.5100368701351905, "train_speed(iter/s)": 1.192966 }, { "epoch": 3.5161069225496915, "grad_norm": 2.0346245765686035, "learning_rate": 2.0202875544131994e-05, "loss": 2.41123046875, "memory(GiB)": 122.07, "step": 10260, "token_acc": 0.5104039167686658, "train_speed(iter/s)": 1.192965 }, { "epoch": 3.517820424948595, "grad_norm": 1.761027216911316, "learning_rate": 2.015966485584081e-05, "loss": 2.3078119277954103, "memory(GiB)": 122.07, "step": 10265, "token_acc": 0.5041356492969397, "train_speed(iter/s)": 1.192991 }, { "epoch": 3.5195339273474984, "grad_norm": 1.8433196544647217, "learning_rate": 2.011648875613694e-05, "loss": 2.249660110473633, "memory(GiB)": 122.07, "step": 10270, "token_acc": 0.5216165413533834, "train_speed(iter/s)": 1.192947 }, { "epoch": 3.521247429746402, "grad_norm": 1.75119948387146, "learning_rate": 2.0073347295066785e-05, "loss": 2.3651350021362303, "memory(GiB)": 122.07, "step": 10275, "token_acc": 0.49659284497444633, "train_speed(iter/s)": 1.192969 }, { "epoch": 3.5229609321453053, "grad_norm": 1.9127631187438965, "learning_rate": 2.0030240522636516e-05, "loss": 2.304412841796875, "memory(GiB)": 122.07, "step": 10280, "token_acc": 0.5070679434564523, "train_speed(iter/s)": 1.192985 }, { "epoch": 3.5246744345442083, "grad_norm": 1.8200931549072266, "learning_rate": 1.998716848881216e-05, "loss": 2.346615219116211, "memory(GiB)": 122.07, "step": 10285, "token_acc": 0.49685006299874, "train_speed(iter/s)": 1.193046 }, { "epoch": 3.5263879369431117, "grad_norm": 1.921699047088623, "learning_rate": 1.994413124351946e-05, "loss": 2.3692155838012696, "memory(GiB)": 122.07, "step": 10290, "token_acc": 0.5105091078935077, "train_speed(iter/s)": 1.193076 }, { "epoch": 3.528101439342015, "grad_norm": 1.766645908355713, "learning_rate": 1.990112883664382e-05, "loss": 2.343718910217285, "memory(GiB)": 122.07, "step": 10295, "token_acc": 0.50199203187251, "train_speed(iter/s)": 1.193043 }, { "epoch": 3.5298149417409186, "grad_norm": 1.6526739597320557, "learning_rate": 1.9858161318030266e-05, "loss": 2.252641487121582, "memory(GiB)": 122.07, "step": 10300, "token_acc": 0.5100261551874455, "train_speed(iter/s)": 1.193048 }, { "epoch": 3.5315284441398216, "grad_norm": 1.8028862476348877, "learning_rate": 1.981522873748341e-05, "loss": 2.2147848129272463, "memory(GiB)": 122.07, "step": 10305, "token_acc": 0.5207708779443255, "train_speed(iter/s)": 1.193046 }, { "epoch": 3.533241946538725, "grad_norm": 2.0613787174224854, "learning_rate": 1.9772331144767365e-05, "loss": 2.228532409667969, "memory(GiB)": 122.07, "step": 10310, "token_acc": 0.5259770114942529, "train_speed(iter/s)": 1.193085 }, { "epoch": 3.5349554489376285, "grad_norm": 1.9239959716796875, "learning_rate": 1.9729468589605622e-05, "loss": 2.191473388671875, "memory(GiB)": 122.07, "step": 10315, "token_acc": 0.509779442363712, "train_speed(iter/s)": 1.19313 }, { "epoch": 3.536668951336532, "grad_norm": 1.9036844968795776, "learning_rate": 1.968664112168116e-05, "loss": 2.269992446899414, "memory(GiB)": 122.07, "step": 10320, "token_acc": 0.534556313993174, "train_speed(iter/s)": 1.193146 }, { "epoch": 3.5383824537354354, "grad_norm": 1.9267109632492065, "learning_rate": 1.96438487906362e-05, "loss": 2.2892787933349608, "memory(GiB)": 122.07, "step": 10325, "token_acc": 0.514648010494097, "train_speed(iter/s)": 1.192996 }, { "epoch": 3.540095956134339, "grad_norm": 1.8470982313156128, "learning_rate": 1.960109164607228e-05, "loss": 2.19542236328125, "memory(GiB)": 122.07, "step": 10330, "token_acc": 0.5353621945992285, "train_speed(iter/s)": 1.192935 }, { "epoch": 3.541809458533242, "grad_norm": 1.7965630292892456, "learning_rate": 1.955836973755015e-05, "loss": 2.0951906204223634, "memory(GiB)": 122.07, "step": 10335, "token_acc": 0.5255707762557078, "train_speed(iter/s)": 1.192872 }, { "epoch": 3.5435229609321452, "grad_norm": 1.9307711124420166, "learning_rate": 1.9515683114589718e-05, "loss": 2.151797866821289, "memory(GiB)": 122.07, "step": 10340, "token_acc": 0.5318401405357928, "train_speed(iter/s)": 1.192906 }, { "epoch": 3.5452364633310487, "grad_norm": 1.8081350326538086, "learning_rate": 1.947303182666999e-05, "loss": 2.1883094787597654, "memory(GiB)": 122.07, "step": 10345, "token_acc": 0.5150250417362271, "train_speed(iter/s)": 1.192903 }, { "epoch": 3.546949965729952, "grad_norm": 1.9315623044967651, "learning_rate": 1.9430415923229e-05, "loss": 2.267107391357422, "memory(GiB)": 122.07, "step": 10350, "token_acc": 0.5096442348906987, "train_speed(iter/s)": 1.192934 }, { "epoch": 3.548663468128855, "grad_norm": 1.8229563236236572, "learning_rate": 1.9387835453663762e-05, "loss": 2.2930307388305664, "memory(GiB)": 122.07, "step": 10355, "token_acc": 0.5044682752457551, "train_speed(iter/s)": 1.192971 }, { "epoch": 3.5503769705277586, "grad_norm": 1.863473653793335, "learning_rate": 1.9345290467330247e-05, "loss": 2.21241455078125, "memory(GiB)": 122.07, "step": 10360, "token_acc": 0.5118141770124149, "train_speed(iter/s)": 1.192936 }, { "epoch": 3.552090472926662, "grad_norm": 1.8661507368087769, "learning_rate": 1.930278101354328e-05, "loss": 2.275509834289551, "memory(GiB)": 122.07, "step": 10365, "token_acc": 0.5238715277777778, "train_speed(iter/s)": 1.19294 }, { "epoch": 3.5538039753255655, "grad_norm": 1.858088493347168, "learning_rate": 1.9260307141576507e-05, "loss": 2.127162551879883, "memory(GiB)": 122.07, "step": 10370, "token_acc": 0.5329004329004329, "train_speed(iter/s)": 1.193003 }, { "epoch": 3.555517477724469, "grad_norm": 2.0156891345977783, "learning_rate": 1.921786890066234e-05, "loss": 2.16916561126709, "memory(GiB)": 122.07, "step": 10375, "token_acc": 0.5239102835378756, "train_speed(iter/s)": 1.192902 }, { "epoch": 3.5572309801233724, "grad_norm": 1.7774968147277832, "learning_rate": 1.917546633999184e-05, "loss": 2.1932485580444334, "memory(GiB)": 122.07, "step": 10380, "token_acc": 0.5395369156837047, "train_speed(iter/s)": 1.192931 }, { "epoch": 3.558944482522276, "grad_norm": 1.766064167022705, "learning_rate": 1.913309950871478e-05, "loss": 2.3725833892822266, "memory(GiB)": 122.07, "step": 10385, "token_acc": 0.50201126307321, "train_speed(iter/s)": 1.192895 }, { "epoch": 3.560657984921179, "grad_norm": 1.7495557069778442, "learning_rate": 1.9090768455939444e-05, "loss": 2.231924819946289, "memory(GiB)": 122.07, "step": 10390, "token_acc": 0.517184265010352, "train_speed(iter/s)": 1.192942 }, { "epoch": 3.5623714873200822, "grad_norm": 1.8767681121826172, "learning_rate": 1.9048473230732715e-05, "loss": 2.3275691986083986, "memory(GiB)": 122.07, "step": 10395, "token_acc": 0.50314993700126, "train_speed(iter/s)": 1.192981 }, { "epoch": 3.5640849897189857, "grad_norm": 1.9782066345214844, "learning_rate": 1.9006213882119905e-05, "loss": 2.240755081176758, "memory(GiB)": 122.07, "step": 10400, "token_acc": 0.5136540962288687, "train_speed(iter/s)": 1.192834 }, { "epoch": 3.565798492117889, "grad_norm": 1.7710661888122559, "learning_rate": 1.896399045908476e-05, "loss": 2.334490966796875, "memory(GiB)": 122.07, "step": 10405, "token_acc": 0.484399375975039, "train_speed(iter/s)": 1.192863 }, { "epoch": 3.567511994516792, "grad_norm": 1.7247000932693481, "learning_rate": 1.892180301056939e-05, "loss": 2.304714393615723, "memory(GiB)": 122.07, "step": 10410, "token_acc": 0.5153168275283256, "train_speed(iter/s)": 1.192882 }, { "epoch": 3.5692254969156956, "grad_norm": 1.8988062143325806, "learning_rate": 1.887965158547417e-05, "loss": 2.3655933380126952, "memory(GiB)": 122.07, "step": 10415, "token_acc": 0.5021312872975278, "train_speed(iter/s)": 1.192926 }, { "epoch": 3.570938999314599, "grad_norm": 1.8651701211929321, "learning_rate": 1.8837536232657734e-05, "loss": 2.2726898193359375, "memory(GiB)": 122.07, "step": 10420, "token_acc": 0.5051150895140665, "train_speed(iter/s)": 1.192941 }, { "epoch": 3.5726525017135025, "grad_norm": 1.6635339260101318, "learning_rate": 1.8795457000936922e-05, "loss": 2.2763547897338867, "memory(GiB)": 122.07, "step": 10425, "token_acc": 0.5208065208065208, "train_speed(iter/s)": 1.192823 }, { "epoch": 3.574366004112406, "grad_norm": 2.0580782890319824, "learning_rate": 1.87534139390867e-05, "loss": 2.2088802337646483, "memory(GiB)": 122.07, "step": 10430, "token_acc": 0.5219966159052454, "train_speed(iter/s)": 1.19283 }, { "epoch": 3.5760795065113093, "grad_norm": 2.4171574115753174, "learning_rate": 1.87114070958401e-05, "loss": 2.214941215515137, "memory(GiB)": 122.07, "step": 10435, "token_acc": 0.5062552126772311, "train_speed(iter/s)": 1.192848 }, { "epoch": 3.5777930089102123, "grad_norm": 1.974370002746582, "learning_rate": 1.8669436519888196e-05, "loss": 2.2320127487182617, "memory(GiB)": 122.07, "step": 10440, "token_acc": 0.510492505353319, "train_speed(iter/s)": 1.192876 }, { "epoch": 3.579506511309116, "grad_norm": 1.6883121728897095, "learning_rate": 1.862750225987998e-05, "loss": 2.2445446014404298, "memory(GiB)": 122.07, "step": 10445, "token_acc": 0.5198927134555208, "train_speed(iter/s)": 1.192926 }, { "epoch": 3.5812200137080192, "grad_norm": 1.8514841794967651, "learning_rate": 1.8585604364422364e-05, "loss": 2.2489950180053713, "memory(GiB)": 122.07, "step": 10450, "token_acc": 0.5300687285223368, "train_speed(iter/s)": 1.192886 }, { "epoch": 3.5829335161069227, "grad_norm": 1.760230302810669, "learning_rate": 1.8543742882080133e-05, "loss": 2.257783889770508, "memory(GiB)": 122.07, "step": 10455, "token_acc": 0.5229838709677419, "train_speed(iter/s)": 1.192916 }, { "epoch": 3.5846470185058257, "grad_norm": 1.6413869857788086, "learning_rate": 1.8501917861375844e-05, "loss": 2.1435638427734376, "memory(GiB)": 122.07, "step": 10460, "token_acc": 0.5206999573196757, "train_speed(iter/s)": 1.192933 }, { "epoch": 3.586360520904729, "grad_norm": 1.77054762840271, "learning_rate": 1.8460129350789807e-05, "loss": 2.209748077392578, "memory(GiB)": 122.07, "step": 10465, "token_acc": 0.5291117722056948, "train_speed(iter/s)": 1.192983 }, { "epoch": 3.5880740233036326, "grad_norm": 1.9564300775527954, "learning_rate": 1.841837739876001e-05, "loss": 2.1954938888549806, "memory(GiB)": 122.07, "step": 10470, "token_acc": 0.5238709677419355, "train_speed(iter/s)": 1.193017 }, { "epoch": 3.589787525702536, "grad_norm": 2.1611249446868896, "learning_rate": 1.8376662053682027e-05, "loss": 2.2539243698120117, "memory(GiB)": 122.07, "step": 10475, "token_acc": 0.5209090909090909, "train_speed(iter/s)": 1.19309 }, { "epoch": 3.5915010281014395, "grad_norm": 1.9754059314727783, "learning_rate": 1.8334983363909064e-05, "loss": 2.2494083404541017, "memory(GiB)": 122.07, "step": 10480, "token_acc": 0.5169082125603864, "train_speed(iter/s)": 1.193153 }, { "epoch": 3.593214530500343, "grad_norm": 1.7301807403564453, "learning_rate": 1.8293341377751767e-05, "loss": 2.300371551513672, "memory(GiB)": 122.07, "step": 10485, "token_acc": 0.512615138165799, "train_speed(iter/s)": 1.193094 }, { "epoch": 3.594928032899246, "grad_norm": 2.0621399879455566, "learning_rate": 1.8251736143478293e-05, "loss": 2.2241363525390625, "memory(GiB)": 122.07, "step": 10490, "token_acc": 0.5288003464703335, "train_speed(iter/s)": 1.193045 }, { "epoch": 3.5966415352981493, "grad_norm": 1.8481566905975342, "learning_rate": 1.8210167709314186e-05, "loss": 2.2219942092895506, "memory(GiB)": 122.07, "step": 10495, "token_acc": 0.5143212951432129, "train_speed(iter/s)": 1.193058 }, { "epoch": 3.598355037697053, "grad_norm": 1.729090929031372, "learning_rate": 1.8168636123442336e-05, "loss": 2.2774730682373048, "memory(GiB)": 122.07, "step": 10500, "token_acc": 0.513929313929314, "train_speed(iter/s)": 1.193038 }, { "epoch": 3.598355037697053, "eval_loss": 2.0023508071899414, "eval_runtime": 3.7136, "eval_samples_per_second": 26.928, "eval_steps_per_second": 26.928, "eval_token_acc": 0.4823232323232323, "step": 10500 }, { "epoch": 3.6000685400959562, "grad_norm": 1.9606683254241943, "learning_rate": 1.8127141434002914e-05, "loss": 2.16354923248291, "memory(GiB)": 122.07, "step": 10505, "token_acc": 0.5042016806722689, "train_speed(iter/s)": 1.192238 }, { "epoch": 3.6017820424948592, "grad_norm": 1.8785970211029053, "learning_rate": 1.808568368909333e-05, "loss": 2.1118656158447267, "memory(GiB)": 122.07, "step": 10510, "token_acc": 0.5345744680851063, "train_speed(iter/s)": 1.192235 }, { "epoch": 3.6034955448937627, "grad_norm": 1.8468067646026611, "learning_rate": 1.804426293676813e-05, "loss": 2.207821273803711, "memory(GiB)": 122.07, "step": 10515, "token_acc": 0.5246187363834423, "train_speed(iter/s)": 1.192286 }, { "epoch": 3.605209047292666, "grad_norm": 1.769443392753601, "learning_rate": 1.800287922503905e-05, "loss": 2.207012748718262, "memory(GiB)": 122.07, "step": 10520, "token_acc": 0.531195079086116, "train_speed(iter/s)": 1.19231 }, { "epoch": 3.6069225496915696, "grad_norm": 1.9482148885726929, "learning_rate": 1.7961532601874848e-05, "loss": 2.308085250854492, "memory(GiB)": 122.07, "step": 10525, "token_acc": 0.5067319461444308, "train_speed(iter/s)": 1.192362 }, { "epoch": 3.608636052090473, "grad_norm": 1.9792263507843018, "learning_rate": 1.79202231152013e-05, "loss": 2.33398380279541, "memory(GiB)": 122.07, "step": 10530, "token_acc": 0.5078979343863913, "train_speed(iter/s)": 1.192353 }, { "epoch": 3.6103495544893764, "grad_norm": 2.038337230682373, "learning_rate": 1.7878950812901168e-05, "loss": 2.2519905090332033, "memory(GiB)": 122.07, "step": 10535, "token_acc": 0.49159472966833256, "train_speed(iter/s)": 1.192377 }, { "epoch": 3.61206305688828, "grad_norm": 1.9844849109649658, "learning_rate": 1.783771574281406e-05, "loss": 2.2269372940063477, "memory(GiB)": 122.07, "step": 10540, "token_acc": 0.5141916906622789, "train_speed(iter/s)": 1.192356 }, { "epoch": 3.613776559287183, "grad_norm": 1.7494882345199585, "learning_rate": 1.779651795273643e-05, "loss": 2.3382444381713867, "memory(GiB)": 122.07, "step": 10545, "token_acc": 0.5002021835826931, "train_speed(iter/s)": 1.192376 }, { "epoch": 3.6154900616860863, "grad_norm": 1.8876407146453857, "learning_rate": 1.7755357490421558e-05, "loss": 2.3101829528808593, "memory(GiB)": 122.07, "step": 10550, "token_acc": 0.5093856655290102, "train_speed(iter/s)": 1.192444 }, { "epoch": 3.6172035640849898, "grad_norm": 1.7785651683807373, "learning_rate": 1.771423440357945e-05, "loss": 2.256972312927246, "memory(GiB)": 122.07, "step": 10555, "token_acc": 0.5214968152866242, "train_speed(iter/s)": 1.192449 }, { "epoch": 3.618917066483893, "grad_norm": 2.3091073036193848, "learning_rate": 1.767314873987676e-05, "loss": 2.3399988174438477, "memory(GiB)": 122.07, "step": 10560, "token_acc": 0.502588438308887, "train_speed(iter/s)": 1.192343 }, { "epoch": 3.620630568882796, "grad_norm": 1.8190593719482422, "learning_rate": 1.7632100546936814e-05, "loss": 2.259721374511719, "memory(GiB)": 122.07, "step": 10565, "token_acc": 0.5141811527904849, "train_speed(iter/s)": 1.19236 }, { "epoch": 3.6223440712816997, "grad_norm": 1.8992007970809937, "learning_rate": 1.7591089872339454e-05, "loss": 2.313262367248535, "memory(GiB)": 122.07, "step": 10570, "token_acc": 0.4931682322801025, "train_speed(iter/s)": 1.192379 }, { "epoch": 3.624057573680603, "grad_norm": 1.9505285024642944, "learning_rate": 1.7550116763621033e-05, "loss": 2.2701488494873048, "memory(GiB)": 122.07, "step": 10575, "token_acc": 0.5006380263717567, "train_speed(iter/s)": 1.1924 }, { "epoch": 3.6257710760795065, "grad_norm": 1.7532280683517456, "learning_rate": 1.7509181268274392e-05, "loss": 2.2644626617431642, "memory(GiB)": 122.07, "step": 10580, "token_acc": 0.5139442231075697, "train_speed(iter/s)": 1.192245 }, { "epoch": 3.62748457847841, "grad_norm": 1.8509037494659424, "learning_rate": 1.7468283433748778e-05, "loss": 2.2048114776611327, "memory(GiB)": 122.07, "step": 10585, "token_acc": 0.5256410256410257, "train_speed(iter/s)": 1.192261 }, { "epoch": 3.6291980808773134, "grad_norm": 1.636421799659729, "learning_rate": 1.7427423307449753e-05, "loss": 2.1859577178955076, "memory(GiB)": 122.07, "step": 10590, "token_acc": 0.5135013501350135, "train_speed(iter/s)": 1.192239 }, { "epoch": 3.6309115832762164, "grad_norm": 1.9653350114822388, "learning_rate": 1.7386600936739195e-05, "loss": 2.3013607025146485, "memory(GiB)": 122.07, "step": 10595, "token_acc": 0.506785411365564, "train_speed(iter/s)": 1.192252 }, { "epoch": 3.63262508567512, "grad_norm": 1.9332352876663208, "learning_rate": 1.734581636893522e-05, "loss": 2.2812570571899413, "memory(GiB)": 122.07, "step": 10600, "token_acc": 0.5303938356164384, "train_speed(iter/s)": 1.192275 }, { "epoch": 3.6343385880740233, "grad_norm": 2.0789597034454346, "learning_rate": 1.73050696513121e-05, "loss": 2.296762466430664, "memory(GiB)": 122.07, "step": 10605, "token_acc": 0.5154916928603502, "train_speed(iter/s)": 1.192303 }, { "epoch": 3.6360520904729268, "grad_norm": 1.8358402252197266, "learning_rate": 1.726436083110024e-05, "loss": 2.181193542480469, "memory(GiB)": 122.07, "step": 10610, "token_acc": 0.5299497027892089, "train_speed(iter/s)": 1.192171 }, { "epoch": 3.6377655928718298, "grad_norm": 1.9494704008102417, "learning_rate": 1.722368995548614e-05, "loss": 2.3001331329345702, "memory(GiB)": 122.07, "step": 10615, "token_acc": 0.5, "train_speed(iter/s)": 1.192206 }, { "epoch": 3.639479095270733, "grad_norm": 2.065182685852051, "learning_rate": 1.7183057071612296e-05, "loss": 2.2274154663085937, "memory(GiB)": 122.07, "step": 10620, "token_acc": 0.51375, "train_speed(iter/s)": 1.192228 }, { "epoch": 3.6411925976696367, "grad_norm": 2.017423629760742, "learning_rate": 1.714246222657719e-05, "loss": 2.220354461669922, "memory(GiB)": 122.07, "step": 10625, "token_acc": 0.5254885301614274, "train_speed(iter/s)": 1.19224 }, { "epoch": 3.64290610006854, "grad_norm": 1.7002902030944824, "learning_rate": 1.7101905467435203e-05, "loss": 2.2648441314697267, "memory(GiB)": 122.07, "step": 10630, "token_acc": 0.5144411887819171, "train_speed(iter/s)": 1.192282 }, { "epoch": 3.6446196024674435, "grad_norm": 1.7124277353286743, "learning_rate": 1.706138684119655e-05, "loss": 2.2488124847412108, "memory(GiB)": 122.07, "step": 10635, "token_acc": 0.5098698026039479, "train_speed(iter/s)": 1.192203 }, { "epoch": 3.646333104866347, "grad_norm": 1.8466676473617554, "learning_rate": 1.7020906394827245e-05, "loss": 2.3022844314575197, "memory(GiB)": 122.07, "step": 10640, "token_acc": 0.5124172185430463, "train_speed(iter/s)": 1.192248 }, { "epoch": 3.6480466072652504, "grad_norm": 1.9279844760894775, "learning_rate": 1.6980464175249083e-05, "loss": 2.3731672286987306, "memory(GiB)": 122.07, "step": 10645, "token_acc": 0.5108885017421603, "train_speed(iter/s)": 1.19229 }, { "epoch": 3.6497601096641534, "grad_norm": 1.7899826765060425, "learning_rate": 1.694006022933952e-05, "loss": 2.2507627487182615, "memory(GiB)": 122.07, "step": 10650, "token_acc": 0.5170670037926675, "train_speed(iter/s)": 1.192343 }, { "epoch": 3.651473612063057, "grad_norm": 1.8095563650131226, "learning_rate": 1.689969460393166e-05, "loss": 2.31125545501709, "memory(GiB)": 122.07, "step": 10655, "token_acc": 0.49595916631220754, "train_speed(iter/s)": 1.19233 }, { "epoch": 3.6531871144619603, "grad_norm": 1.7229092121124268, "learning_rate": 1.6859367345814204e-05, "loss": 2.187211608886719, "memory(GiB)": 122.07, "step": 10660, "token_acc": 0.5250326512842839, "train_speed(iter/s)": 1.192353 }, { "epoch": 3.6549006168608638, "grad_norm": 1.7414368391036987, "learning_rate": 1.681907850173133e-05, "loss": 2.2698490142822267, "memory(GiB)": 122.07, "step": 10665, "token_acc": 0.5046610169491526, "train_speed(iter/s)": 1.192394 }, { "epoch": 3.6566141192597668, "grad_norm": 1.9814118146896362, "learning_rate": 1.6778828118382744e-05, "loss": 2.3069427490234373, "memory(GiB)": 122.07, "step": 10670, "token_acc": 0.5252609603340292, "train_speed(iter/s)": 1.192343 }, { "epoch": 3.65832762165867, "grad_norm": 1.7212921380996704, "learning_rate": 1.6738616242423523e-05, "loss": 2.2428916931152343, "memory(GiB)": 122.07, "step": 10675, "token_acc": 0.5079899074852817, "train_speed(iter/s)": 1.192378 }, { "epoch": 3.6600411240575736, "grad_norm": 1.8815187215805054, "learning_rate": 1.6698442920464152e-05, "loss": 2.2139373779296876, "memory(GiB)": 122.07, "step": 10680, "token_acc": 0.5138190954773869, "train_speed(iter/s)": 1.192409 }, { "epoch": 3.661754626456477, "grad_norm": 1.7258700132369995, "learning_rate": 1.665830819907041e-05, "loss": 2.1802513122558596, "memory(GiB)": 122.07, "step": 10685, "token_acc": 0.5223382045929019, "train_speed(iter/s)": 1.192358 }, { "epoch": 3.6634681288553805, "grad_norm": 1.8020578622817993, "learning_rate": 1.6618212124763338e-05, "loss": 2.141399955749512, "memory(GiB)": 122.07, "step": 10690, "token_acc": 0.5363558597091531, "train_speed(iter/s)": 1.192367 }, { "epoch": 3.665181631254284, "grad_norm": 1.9779330492019653, "learning_rate": 1.657815474401918e-05, "loss": 2.179909515380859, "memory(GiB)": 122.07, "step": 10695, "token_acc": 0.517998244073749, "train_speed(iter/s)": 1.192397 }, { "epoch": 3.666895133653187, "grad_norm": 1.700027346611023, "learning_rate": 1.653813610326932e-05, "loss": 2.2885162353515627, "memory(GiB)": 122.07, "step": 10700, "token_acc": 0.49896822121337187, "train_speed(iter/s)": 1.192409 }, { "epoch": 3.6686086360520904, "grad_norm": 1.8151594400405884, "learning_rate": 1.6498156248900215e-05, "loss": 2.2819904327392577, "memory(GiB)": 122.07, "step": 10705, "token_acc": 0.5276254744833404, "train_speed(iter/s)": 1.192437 }, { "epoch": 3.670322138450994, "grad_norm": 1.9584680795669556, "learning_rate": 1.645821522725342e-05, "loss": 2.2931085586547852, "memory(GiB)": 122.07, "step": 10710, "token_acc": 0.511330861145447, "train_speed(iter/s)": 1.19248 }, { "epoch": 3.6720356408498973, "grad_norm": 2.560685157775879, "learning_rate": 1.6418313084625443e-05, "loss": 2.36212272644043, "memory(GiB)": 122.07, "step": 10715, "token_acc": 0.5063446582071224, "train_speed(iter/s)": 1.192442 }, { "epoch": 3.6737491432488003, "grad_norm": 1.6619384288787842, "learning_rate": 1.637844986726773e-05, "loss": 2.1912166595458986, "memory(GiB)": 122.07, "step": 10720, "token_acc": 0.5186455207886841, "train_speed(iter/s)": 1.192508 }, { "epoch": 3.6754626456477038, "grad_norm": 1.8528528213500977, "learning_rate": 1.6338625621386638e-05, "loss": 2.301262855529785, "memory(GiB)": 122.07, "step": 10725, "token_acc": 0.5164377861007075, "train_speed(iter/s)": 1.192484 }, { "epoch": 3.677176148046607, "grad_norm": 1.998936414718628, "learning_rate": 1.629884039314328e-05, "loss": 2.233258056640625, "memory(GiB)": 122.07, "step": 10730, "token_acc": 0.5283340434597358, "train_speed(iter/s)": 1.192508 }, { "epoch": 3.6788896504455106, "grad_norm": 1.677050232887268, "learning_rate": 1.6259094228653632e-05, "loss": 2.2908355712890627, "memory(GiB)": 122.07, "step": 10735, "token_acc": 0.5144578313253012, "train_speed(iter/s)": 1.192514 }, { "epoch": 3.680603152844414, "grad_norm": 1.6820589303970337, "learning_rate": 1.62193871739883e-05, "loss": 2.3046142578125, "memory(GiB)": 122.07, "step": 10740, "token_acc": 0.5104379860826852, "train_speed(iter/s)": 1.192509 }, { "epoch": 3.6823166552433175, "grad_norm": 1.8079637289047241, "learning_rate": 1.6179719275172628e-05, "loss": 2.2381731033325196, "memory(GiB)": 122.07, "step": 10745, "token_acc": 0.5072105480016481, "train_speed(iter/s)": 1.192509 }, { "epoch": 3.684030157642221, "grad_norm": 1.6911959648132324, "learning_rate": 1.6140090578186544e-05, "loss": 2.230304718017578, "memory(GiB)": 122.07, "step": 10750, "token_acc": 0.5157894736842106, "train_speed(iter/s)": 1.192548 }, { "epoch": 3.685743660041124, "grad_norm": 1.825029969215393, "learning_rate": 1.6100501128964556e-05, "loss": 2.278268814086914, "memory(GiB)": 122.07, "step": 10755, "token_acc": 0.5124784853700516, "train_speed(iter/s)": 1.19257 }, { "epoch": 3.6874571624400274, "grad_norm": 1.7708643674850464, "learning_rate": 1.6060950973395637e-05, "loss": 2.177284049987793, "memory(GiB)": 122.07, "step": 10760, "token_acc": 0.524390243902439, "train_speed(iter/s)": 1.192584 }, { "epoch": 3.689170664838931, "grad_norm": 2.069046974182129, "learning_rate": 1.602144015732327e-05, "loss": 2.3952829360961916, "memory(GiB)": 122.07, "step": 10765, "token_acc": 0.5049019607843137, "train_speed(iter/s)": 1.192642 }, { "epoch": 3.690884167237834, "grad_norm": 1.8081889152526855, "learning_rate": 1.598196872654527e-05, "loss": 2.302248001098633, "memory(GiB)": 122.07, "step": 10770, "token_acc": 0.5092326631103816, "train_speed(iter/s)": 1.192708 }, { "epoch": 3.6925976696367373, "grad_norm": 2.0229928493499756, "learning_rate": 1.5942536726813863e-05, "loss": 2.218337821960449, "memory(GiB)": 122.07, "step": 10775, "token_acc": 0.5223427331887202, "train_speed(iter/s)": 1.192739 }, { "epoch": 3.6943111720356407, "grad_norm": 2.1293785572052, "learning_rate": 1.5903144203835535e-05, "loss": 2.3575923919677733, "memory(GiB)": 122.07, "step": 10780, "token_acc": 0.5079764903442485, "train_speed(iter/s)": 1.192749 }, { "epoch": 3.696024674434544, "grad_norm": 1.9964429140090942, "learning_rate": 1.586379120327105e-05, "loss": 2.3517328262329102, "memory(GiB)": 122.07, "step": 10785, "token_acc": 0.509836751778987, "train_speed(iter/s)": 1.192797 }, { "epoch": 3.6977381768334476, "grad_norm": 2.124831438064575, "learning_rate": 1.5824477770735304e-05, "loss": 2.253963279724121, "memory(GiB)": 122.07, "step": 10790, "token_acc": 0.5153746210480727, "train_speed(iter/s)": 1.192831 }, { "epoch": 3.699451679232351, "grad_norm": 1.6826485395431519, "learning_rate": 1.5785203951797368e-05, "loss": 2.1793012619018555, "memory(GiB)": 122.07, "step": 10795, "token_acc": 0.5289429530201343, "train_speed(iter/s)": 1.192839 }, { "epoch": 3.7011651816312545, "grad_norm": 1.8683433532714844, "learning_rate": 1.5745969791980403e-05, "loss": 2.2597347259521485, "memory(GiB)": 122.07, "step": 10800, "token_acc": 0.5164279696714406, "train_speed(iter/s)": 1.192868 }, { "epoch": 3.7028786840301575, "grad_norm": 1.7214229106903076, "learning_rate": 1.570677533676156e-05, "loss": 2.2255001068115234, "memory(GiB)": 122.07, "step": 10805, "token_acc": 0.5141065830721003, "train_speed(iter/s)": 1.192897 }, { "epoch": 3.704592186429061, "grad_norm": 1.775171160697937, "learning_rate": 1.5667620631572e-05, "loss": 2.184585762023926, "memory(GiB)": 122.07, "step": 10810, "token_acc": 0.522311942201445, "train_speed(iter/s)": 1.192901 }, { "epoch": 3.7063056888279644, "grad_norm": 2.3444712162017822, "learning_rate": 1.562850572179681e-05, "loss": 2.2944992065429686, "memory(GiB)": 122.07, "step": 10815, "token_acc": 0.518108223263741, "train_speed(iter/s)": 1.192929 }, { "epoch": 3.708019191226868, "grad_norm": 1.8866379261016846, "learning_rate": 1.558943065277495e-05, "loss": 2.305619239807129, "memory(GiB)": 122.07, "step": 10820, "token_acc": 0.509515570934256, "train_speed(iter/s)": 1.192873 }, { "epoch": 3.709732693625771, "grad_norm": 1.7383856773376465, "learning_rate": 1.5550395469799156e-05, "loss": 2.1604724884033204, "memory(GiB)": 122.07, "step": 10825, "token_acc": 0.5330188679245284, "train_speed(iter/s)": 1.192866 }, { "epoch": 3.7114461960246743, "grad_norm": 1.8745979070663452, "learning_rate": 1.5511400218116e-05, "loss": 2.2869935989379884, "memory(GiB)": 122.07, "step": 10830, "token_acc": 0.5100250626566416, "train_speed(iter/s)": 1.192888 }, { "epoch": 3.7131596984235777, "grad_norm": 1.824724793434143, "learning_rate": 1.5472444942925708e-05, "loss": 2.2995368957519533, "memory(GiB)": 122.07, "step": 10835, "token_acc": 0.516, "train_speed(iter/s)": 1.192922 }, { "epoch": 3.714873200822481, "grad_norm": 1.8385076522827148, "learning_rate": 1.5433529689382203e-05, "loss": 2.284416389465332, "memory(GiB)": 122.07, "step": 10840, "token_acc": 0.5056620209059234, "train_speed(iter/s)": 1.19296 }, { "epoch": 3.7165867032213846, "grad_norm": 1.8002278804779053, "learning_rate": 1.5394654502593007e-05, "loss": 2.2367759704589845, "memory(GiB)": 122.42, "step": 10845, "token_acc": 0.5228070175438596, "train_speed(iter/s)": 1.19296 }, { "epoch": 3.718300205620288, "grad_norm": 1.899446725845337, "learning_rate": 1.535581942761923e-05, "loss": 2.244673156738281, "memory(GiB)": 122.42, "step": 10850, "token_acc": 0.5022366815778772, "train_speed(iter/s)": 1.192872 }, { "epoch": 3.720013708019191, "grad_norm": 1.8789596557617188, "learning_rate": 1.531702450947542e-05, "loss": 2.2390682220458986, "memory(GiB)": 122.42, "step": 10855, "token_acc": 0.5303938356164384, "train_speed(iter/s)": 1.192879 }, { "epoch": 3.7217272104180945, "grad_norm": 1.8040162324905396, "learning_rate": 1.5278269793129628e-05, "loss": 2.1746667861938476, "memory(GiB)": 122.42, "step": 10860, "token_acc": 0.5405764966740576, "train_speed(iter/s)": 1.192926 }, { "epoch": 3.723440712816998, "grad_norm": 1.8894891738891602, "learning_rate": 1.5239555323503318e-05, "loss": 2.3342267990112306, "memory(GiB)": 122.42, "step": 10865, "token_acc": 0.5142371440713982, "train_speed(iter/s)": 1.192968 }, { "epoch": 3.7251542152159014, "grad_norm": 2.061497688293457, "learning_rate": 1.5200881145471258e-05, "loss": 2.2065452575683593, "memory(GiB)": 122.42, "step": 10870, "token_acc": 0.5275010827197921, "train_speed(iter/s)": 1.192999 }, { "epoch": 3.7268677176148044, "grad_norm": 1.7115263938903809, "learning_rate": 1.5162247303861537e-05, "loss": 2.2559062957763674, "memory(GiB)": 122.42, "step": 10875, "token_acc": 0.5119191919191919, "train_speed(iter/s)": 1.192995 }, { "epoch": 3.728581220013708, "grad_norm": 1.8295539617538452, "learning_rate": 1.5123653843455516e-05, "loss": 2.3159961700439453, "memory(GiB)": 122.42, "step": 10880, "token_acc": 0.5048715677590788, "train_speed(iter/s)": 1.193059 }, { "epoch": 3.7302947224126113, "grad_norm": 1.9044498205184937, "learning_rate": 1.5085100808987679e-05, "loss": 2.2611099243164063, "memory(GiB)": 122.42, "step": 10885, "token_acc": 0.5045379537953796, "train_speed(iter/s)": 1.192993 }, { "epoch": 3.7320082248115147, "grad_norm": 1.7502388954162598, "learning_rate": 1.504658824514572e-05, "loss": 2.2800235748291016, "memory(GiB)": 122.42, "step": 10890, "token_acc": 0.5061174551386624, "train_speed(iter/s)": 1.193004 }, { "epoch": 3.733721727210418, "grad_norm": 1.844020128250122, "learning_rate": 1.5008116196570416e-05, "loss": 2.2005685806274413, "memory(GiB)": 122.42, "step": 10895, "token_acc": 0.5393700787401575, "train_speed(iter/s)": 1.193014 }, { "epoch": 3.7354352296093216, "grad_norm": 1.9301904439926147, "learning_rate": 1.496968470785552e-05, "loss": 2.3198829650878907, "memory(GiB)": 122.42, "step": 10900, "token_acc": 0.5124275062137531, "train_speed(iter/s)": 1.19298 }, { "epoch": 3.737148732008225, "grad_norm": 1.7508161067962646, "learning_rate": 1.4931293823547837e-05, "loss": 2.2355764389038084, "memory(GiB)": 122.42, "step": 10905, "token_acc": 0.5269512721000431, "train_speed(iter/s)": 1.193019 }, { "epoch": 3.738862234407128, "grad_norm": 1.9504071474075317, "learning_rate": 1.489294358814708e-05, "loss": 2.273830032348633, "memory(GiB)": 122.42, "step": 10910, "token_acc": 0.5190641247833622, "train_speed(iter/s)": 1.192962 }, { "epoch": 3.7405757368060315, "grad_norm": 1.533994436264038, "learning_rate": 1.4854634046105865e-05, "loss": 2.255419921875, "memory(GiB)": 122.42, "step": 10915, "token_acc": 0.5183224755700325, "train_speed(iter/s)": 1.193024 }, { "epoch": 3.742289239204935, "grad_norm": 1.903236746788025, "learning_rate": 1.4816365241829589e-05, "loss": 2.2691165924072267, "memory(GiB)": 122.42, "step": 10920, "token_acc": 0.5106014712245781, "train_speed(iter/s)": 1.193009 }, { "epoch": 3.7440027416038384, "grad_norm": 1.9181016683578491, "learning_rate": 1.4778137219676485e-05, "loss": 2.206517219543457, "memory(GiB)": 122.42, "step": 10925, "token_acc": 0.5280600972160848, "train_speed(iter/s)": 1.193021 }, { "epoch": 3.7457162440027414, "grad_norm": 2.523625373840332, "learning_rate": 1.4739950023957505e-05, "loss": 2.2401760101318358, "memory(GiB)": 122.42, "step": 10930, "token_acc": 0.5268638067471887, "train_speed(iter/s)": 1.193049 }, { "epoch": 3.747429746401645, "grad_norm": 1.7910364866256714, "learning_rate": 1.4701803698936228e-05, "loss": 2.294392776489258, "memory(GiB)": 122.42, "step": 10935, "token_acc": 0.5008554319931565, "train_speed(iter/s)": 1.192939 }, { "epoch": 3.7491432488005483, "grad_norm": 1.7120251655578613, "learning_rate": 1.4663698288828926e-05, "loss": 2.2656587600708007, "memory(GiB)": 122.42, "step": 10940, "token_acc": 0.5022926219258024, "train_speed(iter/s)": 1.192942 }, { "epoch": 3.7508567511994517, "grad_norm": 1.816076397895813, "learning_rate": 1.462563383780442e-05, "loss": 2.343815231323242, "memory(GiB)": 122.42, "step": 10945, "token_acc": 0.4890756302521008, "train_speed(iter/s)": 1.192971 }, { "epoch": 3.752570253598355, "grad_norm": 1.9487584829330444, "learning_rate": 1.4587610389984025e-05, "loss": 2.1969934463500977, "memory(GiB)": 122.42, "step": 10950, "token_acc": 0.5192733717323881, "train_speed(iter/s)": 1.193019 }, { "epoch": 3.7542837559972586, "grad_norm": 1.7853150367736816, "learning_rate": 1.4549627989441567e-05, "loss": 2.331837844848633, "memory(GiB)": 122.42, "step": 10955, "token_acc": 0.5119097367321354, "train_speed(iter/s)": 1.19305 }, { "epoch": 3.7559972583961616, "grad_norm": 1.8039592504501343, "learning_rate": 1.451168668020329e-05, "loss": 2.290620040893555, "memory(GiB)": 122.42, "step": 10960, "token_acc": 0.5090071219103477, "train_speed(iter/s)": 1.193008 }, { "epoch": 3.757710760795065, "grad_norm": 2.0630900859832764, "learning_rate": 1.4473786506247778e-05, "loss": 2.2503223419189453, "memory(GiB)": 122.42, "step": 10965, "token_acc": 0.5188679245283019, "train_speed(iter/s)": 1.192979 }, { "epoch": 3.7594242631939685, "grad_norm": 1.7634685039520264, "learning_rate": 1.4435927511505964e-05, "loss": 2.2988542556762694, "memory(GiB)": 122.42, "step": 10970, "token_acc": 0.5134315424610052, "train_speed(iter/s)": 1.192923 }, { "epoch": 3.761137765592872, "grad_norm": 1.9511150121688843, "learning_rate": 1.439810973986106e-05, "loss": 2.320774269104004, "memory(GiB)": 122.42, "step": 10975, "token_acc": 0.5033416875522139, "train_speed(iter/s)": 1.192947 }, { "epoch": 3.762851267991775, "grad_norm": 2.076505661010742, "learning_rate": 1.4360333235148437e-05, "loss": 2.3079015731811525, "memory(GiB)": 122.42, "step": 10980, "token_acc": 0.5028135048231511, "train_speed(iter/s)": 1.192959 }, { "epoch": 3.7645647703906784, "grad_norm": 2.0619113445281982, "learning_rate": 1.4322598041155688e-05, "loss": 2.2716150283813477, "memory(GiB)": 122.42, "step": 10985, "token_acc": 0.5255122539172359, "train_speed(iter/s)": 1.192887 }, { "epoch": 3.766278272789582, "grad_norm": 1.8584320545196533, "learning_rate": 1.4284904201622507e-05, "loss": 2.307679557800293, "memory(GiB)": 122.42, "step": 10990, "token_acc": 0.503502266172229, "train_speed(iter/s)": 1.192861 }, { "epoch": 3.7679917751884853, "grad_norm": 1.8156505823135376, "learning_rate": 1.4247251760240665e-05, "loss": 2.2844730377197267, "memory(GiB)": 122.42, "step": 10995, "token_acc": 0.511518771331058, "train_speed(iter/s)": 1.19287 }, { "epoch": 3.7697052775873887, "grad_norm": 1.9618486166000366, "learning_rate": 1.4209640760653903e-05, "loss": 2.308715057373047, "memory(GiB)": 122.42, "step": 11000, "token_acc": 0.513911620294599, "train_speed(iter/s)": 1.192901 }, { "epoch": 3.7697052775873887, "eval_loss": 2.146430730819702, "eval_runtime": 3.7001, "eval_samples_per_second": 27.026, "eval_steps_per_second": 27.026, "eval_token_acc": 0.4780426599749059, "step": 11000 }, { "epoch": 3.771418779986292, "grad_norm": 1.9939738512039185, "learning_rate": 1.4172071246457968e-05, "loss": 2.334667778015137, "memory(GiB)": 122.42, "step": 11005, "token_acc": 0.4946775203506575, "train_speed(iter/s)": 1.192089 }, { "epoch": 3.7731322823851956, "grad_norm": 2.230604648590088, "learning_rate": 1.4134543261200528e-05, "loss": 2.28389835357666, "memory(GiB)": 122.42, "step": 11010, "token_acc": 0.516614975631369, "train_speed(iter/s)": 1.192123 }, { "epoch": 3.7748457847840986, "grad_norm": 1.7321134805679321, "learning_rate": 1.4097056848381057e-05, "loss": 2.3273754119873047, "memory(GiB)": 122.42, "step": 11015, "token_acc": 0.5041250542770299, "train_speed(iter/s)": 1.192013 }, { "epoch": 3.776559287183002, "grad_norm": 1.7834577560424805, "learning_rate": 1.4059612051450905e-05, "loss": 2.321307373046875, "memory(GiB)": 122.42, "step": 11020, "token_acc": 0.5008733624454148, "train_speed(iter/s)": 1.192048 }, { "epoch": 3.7782727895819055, "grad_norm": 1.8647249937057495, "learning_rate": 1.4022208913813162e-05, "loss": 2.269017791748047, "memory(GiB)": 122.42, "step": 11025, "token_acc": 0.5039798910766653, "train_speed(iter/s)": 1.192066 }, { "epoch": 3.779986291980809, "grad_norm": 1.953665852546692, "learning_rate": 1.3984847478822605e-05, "loss": 2.2461288452148436, "memory(GiB)": 122.42, "step": 11030, "token_acc": 0.5138705416116248, "train_speed(iter/s)": 1.192075 }, { "epoch": 3.781699794379712, "grad_norm": 1.8006504774093628, "learning_rate": 1.3947527789785702e-05, "loss": 2.137205123901367, "memory(GiB)": 122.42, "step": 11035, "token_acc": 0.5245249668581529, "train_speed(iter/s)": 1.192113 }, { "epoch": 3.7834132967786154, "grad_norm": 2.1168365478515625, "learning_rate": 1.3910249889960542e-05, "loss": 2.274658966064453, "memory(GiB)": 122.42, "step": 11040, "token_acc": 0.5220649458784347, "train_speed(iter/s)": 1.192138 }, { "epoch": 3.785126799177519, "grad_norm": 1.572640299797058, "learning_rate": 1.3873013822556719e-05, "loss": 2.3433387756347654, "memory(GiB)": 122.42, "step": 11045, "token_acc": 0.49852507374631266, "train_speed(iter/s)": 1.192167 }, { "epoch": 3.7868403015764223, "grad_norm": 2.025104522705078, "learning_rate": 1.3835819630735402e-05, "loss": 2.3015819549560548, "memory(GiB)": 122.42, "step": 11050, "token_acc": 0.5134548611111112, "train_speed(iter/s)": 1.192184 }, { "epoch": 3.7885538039753257, "grad_norm": 2.0781309604644775, "learning_rate": 1.3798667357609186e-05, "loss": 2.243793487548828, "memory(GiB)": 122.42, "step": 11055, "token_acc": 0.5131977498918218, "train_speed(iter/s)": 1.192112 }, { "epoch": 3.790267306374229, "grad_norm": 2.0932536125183105, "learning_rate": 1.3761557046242106e-05, "loss": 2.2343854904174805, "memory(GiB)": 122.42, "step": 11060, "token_acc": 0.5247481384143671, "train_speed(iter/s)": 1.192096 }, { "epoch": 3.791980808773132, "grad_norm": 1.684779405593872, "learning_rate": 1.372448873964951e-05, "loss": 2.235240173339844, "memory(GiB)": 122.42, "step": 11065, "token_acc": 0.510896898575021, "train_speed(iter/s)": 1.192144 }, { "epoch": 3.7936943111720356, "grad_norm": 1.973107933998108, "learning_rate": 1.3687462480798113e-05, "loss": 2.2942081451416017, "memory(GiB)": 122.42, "step": 11070, "token_acc": 0.5109961190168176, "train_speed(iter/s)": 1.192173 }, { "epoch": 3.795407813570939, "grad_norm": 2.0061511993408203, "learning_rate": 1.365047831260583e-05, "loss": 2.1910732269287108, "memory(GiB)": 122.42, "step": 11075, "token_acc": 0.5311258278145695, "train_speed(iter/s)": 1.192197 }, { "epoch": 3.7971213159698425, "grad_norm": 1.82192862033844, "learning_rate": 1.3613536277941847e-05, "loss": 2.3048423767089843, "memory(GiB)": 122.42, "step": 11080, "token_acc": 0.5076988879384089, "train_speed(iter/s)": 1.192115 }, { "epoch": 3.7988348183687455, "grad_norm": 1.9031479358673096, "learning_rate": 1.3576636419626476e-05, "loss": 2.2264240264892576, "memory(GiB)": 122.42, "step": 11085, "token_acc": 0.5235269181856719, "train_speed(iter/s)": 1.192152 }, { "epoch": 3.800548320767649, "grad_norm": 1.9393949508666992, "learning_rate": 1.3539778780431172e-05, "loss": 2.282058525085449, "memory(GiB)": 122.42, "step": 11090, "token_acc": 0.5212452671434581, "train_speed(iter/s)": 1.192201 }, { "epoch": 3.8022618231665524, "grad_norm": 1.9337549209594727, "learning_rate": 1.3502963403078412e-05, "loss": 2.3383342742919924, "memory(GiB)": 122.42, "step": 11095, "token_acc": 0.5004332755632582, "train_speed(iter/s)": 1.192234 }, { "epoch": 3.803975325565456, "grad_norm": 1.936436653137207, "learning_rate": 1.3466190330241706e-05, "loss": 2.246038627624512, "memory(GiB)": 122.42, "step": 11100, "token_acc": 0.5099285146942018, "train_speed(iter/s)": 1.192251 }, { "epoch": 3.8056888279643593, "grad_norm": 2.056795358657837, "learning_rate": 1.3429459604545558e-05, "loss": 2.23275146484375, "memory(GiB)": 122.42, "step": 11105, "token_acc": 0.5094572368421053, "train_speed(iter/s)": 1.192158 }, { "epoch": 3.8074023303632627, "grad_norm": 1.8552989959716797, "learning_rate": 1.3392771268565324e-05, "loss": 2.28122501373291, "memory(GiB)": 122.42, "step": 11110, "token_acc": 0.496542783059637, "train_speed(iter/s)": 1.192179 }, { "epoch": 3.809115832762166, "grad_norm": 1.7016422748565674, "learning_rate": 1.335612536482727e-05, "loss": 2.158311462402344, "memory(GiB)": 122.42, "step": 11115, "token_acc": 0.5268355517389438, "train_speed(iter/s)": 1.192179 }, { "epoch": 3.810829335161069, "grad_norm": 1.8477908372879028, "learning_rate": 1.3319521935808477e-05, "loss": 2.2792278289794923, "memory(GiB)": 122.42, "step": 11120, "token_acc": 0.5109519797809604, "train_speed(iter/s)": 1.192181 }, { "epoch": 3.8125428375599726, "grad_norm": 3.638213634490967, "learning_rate": 1.328296102393679e-05, "loss": 2.2820770263671877, "memory(GiB)": 122.42, "step": 11125, "token_acc": 0.5237687366167023, "train_speed(iter/s)": 1.192218 }, { "epoch": 3.814256339958876, "grad_norm": 2.0561392307281494, "learning_rate": 1.3246442671590737e-05, "loss": 2.3117319107055665, "memory(GiB)": 122.42, "step": 11130, "token_acc": 0.5225752508361204, "train_speed(iter/s)": 1.192201 }, { "epoch": 3.815969842357779, "grad_norm": 1.8660897016525269, "learning_rate": 1.320996692109957e-05, "loss": 2.1424671173095704, "memory(GiB)": 122.42, "step": 11135, "token_acc": 0.5390417036379769, "train_speed(iter/s)": 1.192151 }, { "epoch": 3.8176833447566825, "grad_norm": 2.193160057067871, "learning_rate": 1.3173533814743105e-05, "loss": 2.272065544128418, "memory(GiB)": 122.42, "step": 11140, "token_acc": 0.5093776641091219, "train_speed(iter/s)": 1.192127 }, { "epoch": 3.819396847155586, "grad_norm": 1.7655516862869263, "learning_rate": 1.3137143394751766e-05, "loss": 2.365442085266113, "memory(GiB)": 122.42, "step": 11145, "token_acc": 0.5031982942430704, "train_speed(iter/s)": 1.192147 }, { "epoch": 3.8211103495544894, "grad_norm": 1.8049774169921875, "learning_rate": 1.310079570330649e-05, "loss": 2.3247861862182617, "memory(GiB)": 122.42, "step": 11150, "token_acc": 0.5058585858585859, "train_speed(iter/s)": 1.192196 }, { "epoch": 3.822823851953393, "grad_norm": 1.8103020191192627, "learning_rate": 1.3064490782538703e-05, "loss": 2.376938819885254, "memory(GiB)": 122.42, "step": 11155, "token_acc": 0.4944740073679902, "train_speed(iter/s)": 1.192194 }, { "epoch": 3.8245373543522962, "grad_norm": 1.847532033920288, "learning_rate": 1.3028228674530197e-05, "loss": 2.271404266357422, "memory(GiB)": 122.42, "step": 11160, "token_acc": 0.5193704600484261, "train_speed(iter/s)": 1.19223 }, { "epoch": 3.8262508567511997, "grad_norm": 1.981669306755066, "learning_rate": 1.2992009421313212e-05, "loss": 2.264346694946289, "memory(GiB)": 122.42, "step": 11165, "token_acc": 0.5114174924601465, "train_speed(iter/s)": 1.192223 }, { "epoch": 3.8279643591501027, "grad_norm": 2.022730588912964, "learning_rate": 1.2955833064870248e-05, "loss": 2.214743804931641, "memory(GiB)": 122.42, "step": 11170, "token_acc": 0.5111402359108781, "train_speed(iter/s)": 1.19223 }, { "epoch": 3.829677861549006, "grad_norm": 1.9871336221694946, "learning_rate": 1.2919699647134125e-05, "loss": 2.360897254943848, "memory(GiB)": 122.42, "step": 11175, "token_acc": 0.5023494233233661, "train_speed(iter/s)": 1.192265 }, { "epoch": 3.8313913639479096, "grad_norm": 1.804628849029541, "learning_rate": 1.288360920998789e-05, "loss": 2.28836669921875, "memory(GiB)": 122.42, "step": 11180, "token_acc": 0.5179249261914803, "train_speed(iter/s)": 1.192253 }, { "epoch": 3.833104866346813, "grad_norm": 1.8312549591064453, "learning_rate": 1.2847561795264757e-05, "loss": 2.2207639694213865, "memory(GiB)": 122.42, "step": 11185, "token_acc": 0.5132435953104646, "train_speed(iter/s)": 1.192263 }, { "epoch": 3.834818368745716, "grad_norm": 1.86195707321167, "learning_rate": 1.2811557444748085e-05, "loss": 2.233733367919922, "memory(GiB)": 122.42, "step": 11190, "token_acc": 0.5154295246038365, "train_speed(iter/s)": 1.192267 }, { "epoch": 3.8365318711446195, "grad_norm": 1.9144055843353271, "learning_rate": 1.2775596200171291e-05, "loss": 2.2540752410888674, "memory(GiB)": 122.42, "step": 11195, "token_acc": 0.5008613264427217, "train_speed(iter/s)": 1.192187 }, { "epoch": 3.838245373543523, "grad_norm": 1.7425026893615723, "learning_rate": 1.273967810321784e-05, "loss": 2.3161088943481447, "memory(GiB)": 122.42, "step": 11200, "token_acc": 0.5138946495230194, "train_speed(iter/s)": 1.192209 }, { "epoch": 3.8399588759424264, "grad_norm": 1.9525039196014404, "learning_rate": 1.2703803195521191e-05, "loss": 2.258024215698242, "memory(GiB)": 122.42, "step": 11205, "token_acc": 0.5008210180623974, "train_speed(iter/s)": 1.192248 }, { "epoch": 3.84167237834133, "grad_norm": 1.8944504261016846, "learning_rate": 1.266797151866474e-05, "loss": 2.2163562774658203, "memory(GiB)": 122.42, "step": 11210, "token_acc": 0.5195080576759966, "train_speed(iter/s)": 1.192259 }, { "epoch": 3.8433858807402332, "grad_norm": 2.253711223602295, "learning_rate": 1.2632183114181767e-05, "loss": 2.2646629333496096, "memory(GiB)": 122.42, "step": 11215, "token_acc": 0.5258818140174072, "train_speed(iter/s)": 1.192317 }, { "epoch": 3.8450993831391362, "grad_norm": 1.8283605575561523, "learning_rate": 1.2596438023555413e-05, "loss": 2.2194534301757813, "memory(GiB)": 122.42, "step": 11220, "token_acc": 0.5334763948497854, "train_speed(iter/s)": 1.192341 }, { "epoch": 3.8468128855380397, "grad_norm": 1.7083160877227783, "learning_rate": 1.2560736288218571e-05, "loss": 2.1317108154296873, "memory(GiB)": 122.42, "step": 11225, "token_acc": 0.5341156019122121, "train_speed(iter/s)": 1.192378 }, { "epoch": 3.848526387936943, "grad_norm": 1.8525314331054688, "learning_rate": 1.2525077949553938e-05, "loss": 2.2696468353271486, "memory(GiB)": 122.42, "step": 11230, "token_acc": 0.5092632485997415, "train_speed(iter/s)": 1.192394 }, { "epoch": 3.8502398903358466, "grad_norm": 2.005281925201416, "learning_rate": 1.2489463048893835e-05, "loss": 2.375990104675293, "memory(GiB)": 122.42, "step": 11235, "token_acc": 0.5052764879696074, "train_speed(iter/s)": 1.192446 }, { "epoch": 3.8519533927347496, "grad_norm": 1.6719223260879517, "learning_rate": 1.2453891627520303e-05, "loss": 2.263558197021484, "memory(GiB)": 122.42, "step": 11240, "token_acc": 0.5145385587863464, "train_speed(iter/s)": 1.192508 }, { "epoch": 3.853666895133653, "grad_norm": 1.8149837255477905, "learning_rate": 1.2418363726664944e-05, "loss": 2.362688446044922, "memory(GiB)": 122.42, "step": 11245, "token_acc": 0.5051502145922747, "train_speed(iter/s)": 1.192566 }, { "epoch": 3.8553803975325565, "grad_norm": 1.9497677087783813, "learning_rate": 1.2382879387508944e-05, "loss": 2.36514949798584, "memory(GiB)": 122.42, "step": 11250, "token_acc": 0.49765258215962443, "train_speed(iter/s)": 1.192572 }, { "epoch": 3.85709389993146, "grad_norm": 2.000393867492676, "learning_rate": 1.234743865118298e-05, "loss": 2.2735591888427735, "memory(GiB)": 122.42, "step": 11255, "token_acc": 0.5129621759456013, "train_speed(iter/s)": 1.192604 }, { "epoch": 3.8588074023303633, "grad_norm": 1.976680040359497, "learning_rate": 1.2312041558767178e-05, "loss": 2.3453866958618166, "memory(GiB)": 122.42, "step": 11260, "token_acc": 0.4944110060189166, "train_speed(iter/s)": 1.192615 }, { "epoch": 3.860520904729267, "grad_norm": 1.7852836847305298, "learning_rate": 1.2276688151291066e-05, "loss": 2.1265716552734375, "memory(GiB)": 122.42, "step": 11265, "token_acc": 0.5425065731814198, "train_speed(iter/s)": 1.192618 }, { "epoch": 3.8622344071281702, "grad_norm": 1.8209319114685059, "learning_rate": 1.2241378469733577e-05, "loss": 2.2154394149780274, "memory(GiB)": 122.42, "step": 11270, "token_acc": 0.5230642504118617, "train_speed(iter/s)": 1.192622 }, { "epoch": 3.8639479095270732, "grad_norm": 2.0152587890625, "learning_rate": 1.220611255502293e-05, "loss": 2.2518804550170897, "memory(GiB)": 122.42, "step": 11275, "token_acc": 0.5185504745470233, "train_speed(iter/s)": 1.192644 }, { "epoch": 3.8656614119259767, "grad_norm": 1.8596312999725342, "learning_rate": 1.2170890448036626e-05, "loss": 2.3133146286010744, "memory(GiB)": 122.42, "step": 11280, "token_acc": 0.5032509752925878, "train_speed(iter/s)": 1.192653 }, { "epoch": 3.86737491432488, "grad_norm": 2.134573221206665, "learning_rate": 1.2135712189601394e-05, "loss": 2.343684768676758, "memory(GiB)": 122.42, "step": 11285, "token_acc": 0.4963377854373115, "train_speed(iter/s)": 1.192657 }, { "epoch": 3.8690884167237836, "grad_norm": 1.8508104085922241, "learning_rate": 1.2100577820493114e-05, "loss": 2.301190948486328, "memory(GiB)": 122.42, "step": 11290, "token_acc": 0.5138211382113821, "train_speed(iter/s)": 1.19269 }, { "epoch": 3.8708019191226866, "grad_norm": 1.7782256603240967, "learning_rate": 1.2065487381436786e-05, "loss": 2.2020198822021486, "memory(GiB)": 122.42, "step": 11295, "token_acc": 0.5239697224558453, "train_speed(iter/s)": 1.19273 }, { "epoch": 3.87251542152159, "grad_norm": 1.807504415512085, "learning_rate": 1.2030440913106522e-05, "loss": 2.2514719009399413, "memory(GiB)": 122.42, "step": 11300, "token_acc": 0.5153526970954356, "train_speed(iter/s)": 1.192559 }, { "epoch": 3.8742289239204935, "grad_norm": 2.019465208053589, "learning_rate": 1.1995438456125452e-05, "loss": 2.256011390686035, "memory(GiB)": 125.99, "step": 11305, "token_acc": 0.5082918739635157, "train_speed(iter/s)": 1.192528 }, { "epoch": 3.875942426319397, "grad_norm": 1.8587501049041748, "learning_rate": 1.1960480051065697e-05, "loss": 2.198673439025879, "memory(GiB)": 125.99, "step": 11310, "token_acc": 0.531592249368155, "train_speed(iter/s)": 1.19253 }, { "epoch": 3.8776559287183003, "grad_norm": 1.7777910232543945, "learning_rate": 1.1925565738448308e-05, "loss": 2.133607292175293, "memory(GiB)": 125.99, "step": 11315, "token_acc": 0.5356839422259984, "train_speed(iter/s)": 1.192538 }, { "epoch": 3.879369431117204, "grad_norm": 1.7917118072509766, "learning_rate": 1.1890695558743248e-05, "loss": 2.331939697265625, "memory(GiB)": 125.99, "step": 11320, "token_acc": 0.5006200909466721, "train_speed(iter/s)": 1.192539 }, { "epoch": 3.881082933516107, "grad_norm": 1.8904743194580078, "learning_rate": 1.1855869552369292e-05, "loss": 2.320379066467285, "memory(GiB)": 125.99, "step": 11325, "token_acc": 0.5014836795252225, "train_speed(iter/s)": 1.192546 }, { "epoch": 3.8827964359150102, "grad_norm": 1.9468624591827393, "learning_rate": 1.182108775969401e-05, "loss": 2.29162483215332, "memory(GiB)": 125.99, "step": 11330, "token_acc": 0.5082922456297624, "train_speed(iter/s)": 1.192546 }, { "epoch": 3.8845099383139137, "grad_norm": 1.7736941576004028, "learning_rate": 1.178635022103376e-05, "loss": 2.290657806396484, "memory(GiB)": 130.55, "step": 11335, "token_acc": 0.5060999606454152, "train_speed(iter/s)": 1.192503 }, { "epoch": 3.886223440712817, "grad_norm": 1.8391001224517822, "learning_rate": 1.1751656976653585e-05, "loss": 2.3281013488769533, "memory(GiB)": 130.55, "step": 11340, "token_acc": 0.5064235391628678, "train_speed(iter/s)": 1.192506 }, { "epoch": 3.88793694311172, "grad_norm": 1.8927708864212036, "learning_rate": 1.1717008066767182e-05, "loss": 2.2621702194213866, "memory(GiB)": 130.55, "step": 11345, "token_acc": 0.5088400172488141, "train_speed(iter/s)": 1.192522 }, { "epoch": 3.8896504455106236, "grad_norm": 1.8561190366744995, "learning_rate": 1.1682403531536885e-05, "loss": 2.2941518783569337, "memory(GiB)": 130.55, "step": 11350, "token_acc": 0.5181737588652482, "train_speed(iter/s)": 1.192533 }, { "epoch": 3.891363947909527, "grad_norm": 1.7354230880737305, "learning_rate": 1.1647843411073551e-05, "loss": 2.1804782867431642, "memory(GiB)": 130.55, "step": 11355, "token_acc": 0.5236105307145842, "train_speed(iter/s)": 1.19256 }, { "epoch": 3.8930774503084304, "grad_norm": 2.04097318649292, "learning_rate": 1.1613327745436553e-05, "loss": 2.2233293533325194, "memory(GiB)": 130.55, "step": 11360, "token_acc": 0.5297777777777778, "train_speed(iter/s)": 1.192583 }, { "epoch": 3.894790952707334, "grad_norm": 1.936183214187622, "learning_rate": 1.1578856574633784e-05, "loss": 2.1804141998291016, "memory(GiB)": 130.55, "step": 11365, "token_acc": 0.5217013888888888, "train_speed(iter/s)": 1.192605 }, { "epoch": 3.8965044551062373, "grad_norm": 1.8398135900497437, "learning_rate": 1.1544429938621526e-05, "loss": 2.239546203613281, "memory(GiB)": 130.55, "step": 11370, "token_acc": 0.5246557085739671, "train_speed(iter/s)": 1.192645 }, { "epoch": 3.8982179575051408, "grad_norm": 1.9969321489334106, "learning_rate": 1.1510047877304448e-05, "loss": 2.2834798812866213, "memory(GiB)": 130.55, "step": 11375, "token_acc": 0.508100810081008, "train_speed(iter/s)": 1.192689 }, { "epoch": 3.8999314599040438, "grad_norm": 1.9128665924072266, "learning_rate": 1.147571043053557e-05, "loss": 2.344571113586426, "memory(GiB)": 130.55, "step": 11380, "token_acc": 0.5055889939810834, "train_speed(iter/s)": 1.192702 }, { "epoch": 3.901644962302947, "grad_norm": 2.0871903896331787, "learning_rate": 1.1441417638116164e-05, "loss": 2.2336704254150392, "memory(GiB)": 130.55, "step": 11385, "token_acc": 0.5080610021786492, "train_speed(iter/s)": 1.192747 }, { "epoch": 3.9033584647018507, "grad_norm": 2.0218071937561035, "learning_rate": 1.1407169539795747e-05, "loss": 2.2256595611572267, "memory(GiB)": 130.55, "step": 11390, "token_acc": 0.5166036149642707, "train_speed(iter/s)": 1.192649 }, { "epoch": 3.905071967100754, "grad_norm": 1.820892095565796, "learning_rate": 1.1372966175272053e-05, "loss": 2.1833568572998048, "memory(GiB)": 130.55, "step": 11395, "token_acc": 0.5224506924045321, "train_speed(iter/s)": 1.19266 }, { "epoch": 3.906785469499657, "grad_norm": 2.0126757621765137, "learning_rate": 1.1338807584190953e-05, "loss": 2.2479347229003905, "memory(GiB)": 130.55, "step": 11400, "token_acc": 0.5102307357422725, "train_speed(iter/s)": 1.192712 }, { "epoch": 3.9084989718985605, "grad_norm": 1.8271492719650269, "learning_rate": 1.1304693806146416e-05, "loss": 2.24362850189209, "memory(GiB)": 130.55, "step": 11405, "token_acc": 0.5103734439834025, "train_speed(iter/s)": 1.192736 }, { "epoch": 3.910212474297464, "grad_norm": 1.6903700828552246, "learning_rate": 1.1270624880680487e-05, "loss": 2.3612241744995117, "memory(GiB)": 130.55, "step": 11410, "token_acc": 0.4859324758842444, "train_speed(iter/s)": 1.192787 }, { "epoch": 3.9119259766963674, "grad_norm": 1.9062747955322266, "learning_rate": 1.123660084728318e-05, "loss": 2.2348983764648436, "memory(GiB)": 130.55, "step": 11415, "token_acc": 0.5304386750223814, "train_speed(iter/s)": 1.192751 }, { "epoch": 3.913639479095271, "grad_norm": 1.9170383214950562, "learning_rate": 1.1202621745392528e-05, "loss": 2.2593704223632813, "memory(GiB)": 130.55, "step": 11420, "token_acc": 0.5164533820840951, "train_speed(iter/s)": 1.192805 }, { "epoch": 3.9153529814941743, "grad_norm": 2.084787130355835, "learning_rate": 1.1168687614394418e-05, "loss": 2.3369237899780275, "memory(GiB)": 130.55, "step": 11425, "token_acc": 0.5092674315975286, "train_speed(iter/s)": 1.192861 }, { "epoch": 3.9170664838930773, "grad_norm": 1.6739064455032349, "learning_rate": 1.1134798493622667e-05, "loss": 2.299082946777344, "memory(GiB)": 130.55, "step": 11430, "token_acc": 0.5120303925707049, "train_speed(iter/s)": 1.1928 }, { "epoch": 3.9187799862919808, "grad_norm": 1.9540839195251465, "learning_rate": 1.110095442235889e-05, "loss": 2.2474342346191407, "memory(GiB)": 130.55, "step": 11435, "token_acc": 0.5186631944444444, "train_speed(iter/s)": 1.192791 }, { "epoch": 3.920493488690884, "grad_norm": 1.826161503791809, "learning_rate": 1.1067155439832505e-05, "loss": 2.288950729370117, "memory(GiB)": 130.55, "step": 11440, "token_acc": 0.5224416517055656, "train_speed(iter/s)": 1.192825 }, { "epoch": 3.9222069910897877, "grad_norm": 1.730695128440857, "learning_rate": 1.103340158522066e-05, "loss": 2.2664216995239257, "memory(GiB)": 130.55, "step": 11445, "token_acc": 0.5190839694656488, "train_speed(iter/s)": 1.192846 }, { "epoch": 3.9239204934886907, "grad_norm": 1.9056298732757568, "learning_rate": 1.0999692897648172e-05, "loss": 2.2105712890625, "memory(GiB)": 130.55, "step": 11450, "token_acc": 0.5238301559792028, "train_speed(iter/s)": 1.192868 }, { "epoch": 3.925633995887594, "grad_norm": 1.9477242231369019, "learning_rate": 1.0966029416187518e-05, "loss": 2.1811485290527344, "memory(GiB)": 130.55, "step": 11455, "token_acc": 0.5287206266318538, "train_speed(iter/s)": 1.192775 }, { "epoch": 3.9273474982864975, "grad_norm": 1.9991012811660767, "learning_rate": 1.0932411179858782e-05, "loss": 2.3462993621826174, "memory(GiB)": 130.55, "step": 11460, "token_acc": 0.4915463917525773, "train_speed(iter/s)": 1.192809 }, { "epoch": 3.929061000685401, "grad_norm": 1.894282341003418, "learning_rate": 1.0898838227629603e-05, "loss": 2.2479536056518556, "memory(GiB)": 130.55, "step": 11465, "token_acc": 0.5067186822713481, "train_speed(iter/s)": 1.192854 }, { "epoch": 3.9307745030843044, "grad_norm": 1.9628050327301025, "learning_rate": 1.086531059841513e-05, "loss": 2.243031311035156, "memory(GiB)": 130.55, "step": 11470, "token_acc": 0.5158946412352406, "train_speed(iter/s)": 1.192836 }, { "epoch": 3.932488005483208, "grad_norm": 1.7238304615020752, "learning_rate": 1.0831828331077981e-05, "loss": 2.2170360565185545, "memory(GiB)": 130.55, "step": 11475, "token_acc": 0.5036717062634989, "train_speed(iter/s)": 1.192845 }, { "epoch": 3.934201507882111, "grad_norm": 1.7980529069900513, "learning_rate": 1.0798391464428175e-05, "loss": 2.3099761962890626, "memory(GiB)": 130.55, "step": 11480, "token_acc": 0.516828031350853, "train_speed(iter/s)": 1.192866 }, { "epoch": 3.9359150102810143, "grad_norm": 2.0233840942382812, "learning_rate": 1.0765000037223111e-05, "loss": 2.2515941619873048, "memory(GiB)": 130.55, "step": 11485, "token_acc": 0.5254951538137379, "train_speed(iter/s)": 1.192869 }, { "epoch": 3.9376285126799178, "grad_norm": 2.00180983543396, "learning_rate": 1.0731654088167526e-05, "loss": 2.332698440551758, "memory(GiB)": 130.55, "step": 11490, "token_acc": 0.4926289926289926, "train_speed(iter/s)": 1.192885 }, { "epoch": 3.939342015078821, "grad_norm": 1.8675472736358643, "learning_rate": 1.0698353655913451e-05, "loss": 2.195104789733887, "memory(GiB)": 130.55, "step": 11495, "token_acc": 0.5210369740756481, "train_speed(iter/s)": 1.192895 }, { "epoch": 3.941055517477724, "grad_norm": 1.8866549730300903, "learning_rate": 1.0665098779060146e-05, "loss": 2.3588336944580077, "memory(GiB)": 130.55, "step": 11500, "token_acc": 0.4877734877734878, "train_speed(iter/s)": 1.192919 }, { "epoch": 3.941055517477724, "eval_loss": 1.963692307472229, "eval_runtime": 3.7051, "eval_samples_per_second": 26.99, "eval_steps_per_second": 26.99, "eval_token_acc": 0.5052631578947369, "step": 11500 }, { "epoch": 3.9427690198766276, "grad_norm": 1.8152549266815186, "learning_rate": 1.063188949615408e-05, "loss": 2.235198974609375, "memory(GiB)": 130.55, "step": 11505, "token_acc": 0.5138632162661737, "train_speed(iter/s)": 1.192255 }, { "epoch": 3.944482522275531, "grad_norm": 1.8655108213424683, "learning_rate": 1.0598725845688839e-05, "loss": 2.3246223449707033, "memory(GiB)": 130.55, "step": 11510, "token_acc": 0.503209242618742, "train_speed(iter/s)": 1.192267 }, { "epoch": 3.9461960246744345, "grad_norm": 1.8385215997695923, "learning_rate": 1.056560786610516e-05, "loss": 2.2945863723754885, "memory(GiB)": 130.55, "step": 11515, "token_acc": 0.5145592341443956, "train_speed(iter/s)": 1.192311 }, { "epoch": 3.947909527073338, "grad_norm": 1.9104149341583252, "learning_rate": 1.0532535595790793e-05, "loss": 2.267918586730957, "memory(GiB)": 130.55, "step": 11520, "token_acc": 0.5122156697556866, "train_speed(iter/s)": 1.192317 }, { "epoch": 3.9496230294722414, "grad_norm": 1.979591965675354, "learning_rate": 1.0499509073080549e-05, "loss": 2.3001773834228514, "memory(GiB)": 130.55, "step": 11525, "token_acc": 0.5208936698386429, "train_speed(iter/s)": 1.192318 }, { "epoch": 3.951336531871145, "grad_norm": 2.060901641845703, "learning_rate": 1.0466528336256199e-05, "loss": 2.2584865570068358, "memory(GiB)": 130.55, "step": 11530, "token_acc": 0.5183760683760684, "train_speed(iter/s)": 1.192321 }, { "epoch": 3.953050034270048, "grad_norm": 1.9038070440292358, "learning_rate": 1.0433593423546428e-05, "loss": 2.281483840942383, "memory(GiB)": 130.55, "step": 11535, "token_acc": 0.5133799917661589, "train_speed(iter/s)": 1.192329 }, { "epoch": 3.9547635366689513, "grad_norm": 1.9190291166305542, "learning_rate": 1.0400704373126835e-05, "loss": 2.2902956008911133, "memory(GiB)": 130.55, "step": 11540, "token_acc": 0.5031030202730657, "train_speed(iter/s)": 1.192182 }, { "epoch": 3.9564770390678548, "grad_norm": 1.7825120687484741, "learning_rate": 1.036786122311983e-05, "loss": 2.3507747650146484, "memory(GiB)": 130.55, "step": 11545, "token_acc": 0.4931623931623932, "train_speed(iter/s)": 1.192225 }, { "epoch": 3.958190541466758, "grad_norm": 1.7777223587036133, "learning_rate": 1.0335064011594614e-05, "loss": 2.3599311828613283, "memory(GiB)": 130.55, "step": 11550, "token_acc": 0.507383100902379, "train_speed(iter/s)": 1.192228 }, { "epoch": 3.959904043865661, "grad_norm": 1.6597204208374023, "learning_rate": 1.0302312776567164e-05, "loss": 2.2076492309570312, "memory(GiB)": 130.55, "step": 11555, "token_acc": 0.5078397212543554, "train_speed(iter/s)": 1.192248 }, { "epoch": 3.9616175462645646, "grad_norm": 1.6959490776062012, "learning_rate": 1.0269607556000155e-05, "loss": 2.187691307067871, "memory(GiB)": 130.55, "step": 11560, "token_acc": 0.5214203010420687, "train_speed(iter/s)": 1.192268 }, { "epoch": 3.963331048663468, "grad_norm": 1.8786017894744873, "learning_rate": 1.023694838780293e-05, "loss": 2.272084045410156, "memory(GiB)": 130.55, "step": 11565, "token_acc": 0.5105051408135897, "train_speed(iter/s)": 1.192295 }, { "epoch": 3.9650445510623715, "grad_norm": 1.799452543258667, "learning_rate": 1.0204335309831464e-05, "loss": 2.1702707290649412, "memory(GiB)": 130.55, "step": 11570, "token_acc": 0.5391891891891892, "train_speed(iter/s)": 1.19234 }, { "epoch": 3.966758053461275, "grad_norm": 1.7914162874221802, "learning_rate": 1.0171768359888257e-05, "loss": 2.2744373321533202, "memory(GiB)": 130.55, "step": 11575, "token_acc": 0.5037910699241785, "train_speed(iter/s)": 1.192335 }, { "epoch": 3.9684715558601784, "grad_norm": 1.947568655014038, "learning_rate": 1.0139247575722405e-05, "loss": 2.203424072265625, "memory(GiB)": 130.55, "step": 11580, "token_acc": 0.522077922077922, "train_speed(iter/s)": 1.192317 }, { "epoch": 3.9701850582590814, "grad_norm": 2.107259511947632, "learning_rate": 1.0106772995029433e-05, "loss": 2.176357460021973, "memory(GiB)": 130.55, "step": 11585, "token_acc": 0.5295964125560538, "train_speed(iter/s)": 1.192319 }, { "epoch": 3.971898560657985, "grad_norm": 1.9361604452133179, "learning_rate": 1.0074344655451362e-05, "loss": 2.3353137969970703, "memory(GiB)": 130.55, "step": 11590, "token_acc": 0.5049848287819679, "train_speed(iter/s)": 1.192292 }, { "epoch": 3.9736120630568883, "grad_norm": 1.9196524620056152, "learning_rate": 1.0041962594576587e-05, "loss": 2.278961181640625, "memory(GiB)": 130.55, "step": 11595, "token_acc": 0.504730563554093, "train_speed(iter/s)": 1.192317 }, { "epoch": 3.9753255654557917, "grad_norm": 1.6727811098098755, "learning_rate": 1.0009626849939879e-05, "loss": 2.276181221008301, "memory(GiB)": 130.55, "step": 11600, "token_acc": 0.5033840947546532, "train_speed(iter/s)": 1.192344 }, { "epoch": 3.9770390678546947, "grad_norm": 1.811861276626587, "learning_rate": 9.977337459022278e-06, "loss": 2.202879524230957, "memory(GiB)": 130.55, "step": 11605, "token_acc": 0.519404332129964, "train_speed(iter/s)": 1.192269 }, { "epoch": 3.978752570253598, "grad_norm": 2.1634395122528076, "learning_rate": 9.945094459251148e-06, "loss": 2.2504241943359373, "memory(GiB)": 130.55, "step": 11610, "token_acc": 0.5150697255960414, "train_speed(iter/s)": 1.19229 }, { "epoch": 3.9804660726525016, "grad_norm": 1.8555351495742798, "learning_rate": 9.912897888000066e-06, "loss": 2.4097042083740234, "memory(GiB)": 130.55, "step": 11615, "token_acc": 0.49958088851634536, "train_speed(iter/s)": 1.192336 }, { "epoch": 3.982179575051405, "grad_norm": 1.7590733766555786, "learning_rate": 9.880747782588757e-06, "loss": 2.203400802612305, "memory(GiB)": 130.55, "step": 11620, "token_acc": 0.5295640914976263, "train_speed(iter/s)": 1.192372 }, { "epoch": 3.9838930774503085, "grad_norm": 2.005702018737793, "learning_rate": 9.848644180283122e-06, "loss": 2.313191032409668, "memory(GiB)": 130.55, "step": 11625, "token_acc": 0.5174887892376682, "train_speed(iter/s)": 1.192391 }, { "epoch": 3.985606579849212, "grad_norm": 2.1286630630493164, "learning_rate": 9.816587118295162e-06, "loss": 2.3184911727905275, "memory(GiB)": 130.55, "step": 11630, "token_acc": 0.5075139544869043, "train_speed(iter/s)": 1.192283 }, { "epoch": 3.9873200822481154, "grad_norm": 1.9435898065567017, "learning_rate": 9.784576633782927e-06, "loss": 2.322968292236328, "memory(GiB)": 130.55, "step": 11635, "token_acc": 0.5065520065520066, "train_speed(iter/s)": 1.192288 }, { "epoch": 3.9890335846470184, "grad_norm": 1.809009313583374, "learning_rate": 9.75261276385045e-06, "loss": 2.1621219635009767, "memory(GiB)": 130.55, "step": 11640, "token_acc": 0.5248830284985113, "train_speed(iter/s)": 1.192274 }, { "epoch": 3.990747087045922, "grad_norm": 2.1495659351348877, "learning_rate": 9.720695545547775e-06, "loss": 2.3847883224487303, "memory(GiB)": 130.55, "step": 11645, "token_acc": 0.5079026057240495, "train_speed(iter/s)": 1.192281 }, { "epoch": 3.9924605894448253, "grad_norm": 1.9998271465301514, "learning_rate": 9.688825015870829e-06, "loss": 2.331809234619141, "memory(GiB)": 130.55, "step": 11650, "token_acc": 0.5186631944444444, "train_speed(iter/s)": 1.192322 }, { "epoch": 3.9941740918437287, "grad_norm": 1.7810020446777344, "learning_rate": 9.657001211761452e-06, "loss": 2.211961364746094, "memory(GiB)": 130.55, "step": 11655, "token_acc": 0.5174537987679672, "train_speed(iter/s)": 1.192214 }, { "epoch": 3.9958875942426317, "grad_norm": 1.9162592887878418, "learning_rate": 9.625224170107316e-06, "loss": 2.1939470291137697, "memory(GiB)": 130.55, "step": 11660, "token_acc": 0.5225859925404062, "train_speed(iter/s)": 1.192225 }, { "epoch": 3.997601096641535, "grad_norm": 1.7719178199768066, "learning_rate": 9.593493927741904e-06, "loss": 2.2833303451538085, "memory(GiB)": 130.55, "step": 11665, "token_acc": 0.5051208521097911, "train_speed(iter/s)": 1.192219 }, { "epoch": 3.9993145990404386, "grad_norm": 1.9131675958633423, "learning_rate": 9.561810521444403e-06, "loss": 2.213916015625, "memory(GiB)": 130.55, "step": 11670, "token_acc": 0.5295131408875484, "train_speed(iter/s)": 1.192241 }, { "epoch": 4.001028101439342, "grad_norm": 1.697918176651001, "learning_rate": 9.530173987939756e-06, "loss": 2.2518632888793944, "memory(GiB)": 130.55, "step": 11675, "token_acc": 0.5173086299366163, "train_speed(iter/s)": 1.192336 }, { "epoch": 4.0027416038382455, "grad_norm": 1.8584626913070679, "learning_rate": 9.498584363898584e-06, "loss": 2.2780994415283202, "memory(GiB)": 130.55, "step": 11680, "token_acc": 0.5255503144654088, "train_speed(iter/s)": 1.192232 }, { "epoch": 4.004455106237149, "grad_norm": 1.7797253131866455, "learning_rate": 9.467041685937084e-06, "loss": 2.296829605102539, "memory(GiB)": 130.55, "step": 11685, "token_acc": 0.5136986301369864, "train_speed(iter/s)": 1.192246 }, { "epoch": 4.006168608636052, "grad_norm": 1.8756027221679688, "learning_rate": 9.435545990617078e-06, "loss": 2.100477409362793, "memory(GiB)": 130.55, "step": 11690, "token_acc": 0.5377643504531722, "train_speed(iter/s)": 1.192236 }, { "epoch": 4.007882111034956, "grad_norm": 2.078951120376587, "learning_rate": 9.404097314445937e-06, "loss": 2.1901355743408204, "memory(GiB)": 130.55, "step": 11695, "token_acc": 0.5227848101265823, "train_speed(iter/s)": 1.192275 }, { "epoch": 4.009595613433858, "grad_norm": 1.9168057441711426, "learning_rate": 9.372695693876477e-06, "loss": 2.3248809814453124, "memory(GiB)": 130.55, "step": 11700, "token_acc": 0.519453642384106, "train_speed(iter/s)": 1.192262 }, { "epoch": 4.011309115832762, "grad_norm": 1.9914450645446777, "learning_rate": 9.341341165307021e-06, "loss": 2.230436325073242, "memory(GiB)": 130.55, "step": 11705, "token_acc": 0.5239965095986039, "train_speed(iter/s)": 1.192221 }, { "epoch": 4.013022618231665, "grad_norm": 1.9266914129257202, "learning_rate": 9.310033765081311e-06, "loss": 2.2615240097045897, "memory(GiB)": 130.55, "step": 11710, "token_acc": 0.5194238323876037, "train_speed(iter/s)": 1.192206 }, { "epoch": 4.014736120630569, "grad_norm": 1.8448268175125122, "learning_rate": 9.278773529488404e-06, "loss": 2.18719596862793, "memory(GiB)": 130.55, "step": 11715, "token_acc": 0.5229591836734694, "train_speed(iter/s)": 1.192246 }, { "epoch": 4.016449623029472, "grad_norm": 1.9633625745773315, "learning_rate": 9.247560494762747e-06, "loss": 2.1463369369506835, "memory(GiB)": 130.55, "step": 11720, "token_acc": 0.5233892321270962, "train_speed(iter/s)": 1.192286 }, { "epoch": 4.018163125428376, "grad_norm": 2.0266950130462646, "learning_rate": 9.216394697084047e-06, "loss": 2.229098892211914, "memory(GiB)": 130.55, "step": 11725, "token_acc": 0.5252525252525253, "train_speed(iter/s)": 1.192322 }, { "epoch": 4.019876627827279, "grad_norm": 1.851224660873413, "learning_rate": 9.185276172577284e-06, "loss": 2.2778388977050783, "memory(GiB)": 130.55, "step": 11730, "token_acc": 0.5154593639575972, "train_speed(iter/s)": 1.192367 }, { "epoch": 4.0215901302261825, "grad_norm": 2.108271598815918, "learning_rate": 9.154204957312595e-06, "loss": 2.180006408691406, "memory(GiB)": 130.55, "step": 11735, "token_acc": 0.5385312783318223, "train_speed(iter/s)": 1.192421 }, { "epoch": 4.023303632625086, "grad_norm": 1.9687108993530273, "learning_rate": 9.123181087305316e-06, "loss": 2.2575008392333986, "memory(GiB)": 130.55, "step": 11740, "token_acc": 0.5088235294117647, "train_speed(iter/s)": 1.192455 }, { "epoch": 4.025017135023989, "grad_norm": 2.992823600769043, "learning_rate": 9.092204598515908e-06, "loss": 2.2490610122680663, "memory(GiB)": 130.55, "step": 11745, "token_acc": 0.5235668789808917, "train_speed(iter/s)": 1.192428 }, { "epoch": 4.026730637422893, "grad_norm": 2.025089740753174, "learning_rate": 9.061275526849883e-06, "loss": 2.267682647705078, "memory(GiB)": 130.55, "step": 11750, "token_acc": 0.5225187656380317, "train_speed(iter/s)": 1.192451 }, { "epoch": 4.028444139821795, "grad_norm": 2.1361939907073975, "learning_rate": 9.030393908157808e-06, "loss": 2.393527793884277, "memory(GiB)": 130.55, "step": 11755, "token_acc": 0.49599358974358976, "train_speed(iter/s)": 1.192474 }, { "epoch": 4.030157642220699, "grad_norm": 2.1963562965393066, "learning_rate": 8.999559778235267e-06, "loss": 2.154241180419922, "memory(GiB)": 130.55, "step": 11760, "token_acc": 0.5305343511450382, "train_speed(iter/s)": 1.192521 }, { "epoch": 4.031871144619602, "grad_norm": 1.925977110862732, "learning_rate": 8.968773172822747e-06, "loss": 2.218596267700195, "memory(GiB)": 130.55, "step": 11765, "token_acc": 0.5213959285417532, "train_speed(iter/s)": 1.192559 }, { "epoch": 4.033584647018506, "grad_norm": 1.9511079788208008, "learning_rate": 8.938034127605687e-06, "loss": 2.187073516845703, "memory(GiB)": 130.55, "step": 11770, "token_acc": 0.5217573221757322, "train_speed(iter/s)": 1.192593 }, { "epoch": 4.035298149417409, "grad_norm": 2.0550224781036377, "learning_rate": 8.907342678214409e-06, "loss": 2.222880554199219, "memory(GiB)": 130.55, "step": 11775, "token_acc": 0.5153846153846153, "train_speed(iter/s)": 1.192573 }, { "epoch": 4.037011651816313, "grad_norm": 1.9903380870819092, "learning_rate": 8.876698860224014e-06, "loss": 2.2170047760009766, "memory(GiB)": 130.55, "step": 11780, "token_acc": 0.5239837398373983, "train_speed(iter/s)": 1.192582 }, { "epoch": 4.038725154215216, "grad_norm": 1.998765230178833, "learning_rate": 8.846102709154436e-06, "loss": 2.161180305480957, "memory(GiB)": 130.55, "step": 11785, "token_acc": 0.5294378043382028, "train_speed(iter/s)": 1.192619 }, { "epoch": 4.0404386566141195, "grad_norm": 2.0020272731781006, "learning_rate": 8.815554260470366e-06, "loss": 2.1807361602783204, "memory(GiB)": 130.55, "step": 11790, "token_acc": 0.5222272114952852, "train_speed(iter/s)": 1.192671 }, { "epoch": 4.042152159013023, "grad_norm": 1.976608157157898, "learning_rate": 8.785053549581151e-06, "loss": 2.266287612915039, "memory(GiB)": 130.55, "step": 11795, "token_acc": 0.5170429393536963, "train_speed(iter/s)": 1.192613 }, { "epoch": 4.043865661411926, "grad_norm": 1.807065725326538, "learning_rate": 8.75460061184084e-06, "loss": 2.2195302963256838, "memory(GiB)": 130.55, "step": 11800, "token_acc": 0.5252321356479612, "train_speed(iter/s)": 1.19262 }, { "epoch": 4.045579163810829, "grad_norm": 1.9842931032180786, "learning_rate": 8.724195482548114e-06, "loss": 2.1541397094726564, "memory(GiB)": 130.55, "step": 11805, "token_acc": 0.5421393841166937, "train_speed(iter/s)": 1.192663 }, { "epoch": 4.047292666209732, "grad_norm": 1.8437072038650513, "learning_rate": 8.693838196946236e-06, "loss": 2.1598651885986326, "memory(GiB)": 130.55, "step": 11810, "token_acc": 0.5315904139433552, "train_speed(iter/s)": 1.192562 }, { "epoch": 4.049006168608636, "grad_norm": 1.9644986391067505, "learning_rate": 8.663528790222963e-06, "loss": 2.1377235412597657, "memory(GiB)": 130.55, "step": 11815, "token_acc": 0.5263793797825211, "train_speed(iter/s)": 1.192537 }, { "epoch": 4.050719671007539, "grad_norm": 1.9463001489639282, "learning_rate": 8.633267297510638e-06, "loss": 2.283214569091797, "memory(GiB)": 130.55, "step": 11820, "token_acc": 0.509719222462203, "train_speed(iter/s)": 1.192583 }, { "epoch": 4.052433173406443, "grad_norm": 1.9830293655395508, "learning_rate": 8.603053753885981e-06, "loss": 2.0544178009033205, "memory(GiB)": 130.55, "step": 11825, "token_acc": 0.5393858477970628, "train_speed(iter/s)": 1.192602 }, { "epoch": 4.054146675805346, "grad_norm": 2.097766160964966, "learning_rate": 8.572888194370193e-06, "loss": 2.1332584381103517, "memory(GiB)": 130.55, "step": 11830, "token_acc": 0.5345372460496614, "train_speed(iter/s)": 1.192532 }, { "epoch": 4.05586017820425, "grad_norm": 2.045501947402954, "learning_rate": 8.54277065392884e-06, "loss": 2.26431884765625, "memory(GiB)": 130.55, "step": 11835, "token_acc": 0.507796881247501, "train_speed(iter/s)": 1.192542 }, { "epoch": 4.057573680603153, "grad_norm": 1.954146385192871, "learning_rate": 8.512701167471826e-06, "loss": 2.2499883651733397, "memory(GiB)": 130.55, "step": 11840, "token_acc": 0.5213926174496645, "train_speed(iter/s)": 1.192556 }, { "epoch": 4.0592871830020565, "grad_norm": 2.1005406379699707, "learning_rate": 8.482679769853335e-06, "loss": 2.162393569946289, "memory(GiB)": 130.55, "step": 11845, "token_acc": 0.5197309417040359, "train_speed(iter/s)": 1.192577 }, { "epoch": 4.06100068540096, "grad_norm": 1.9685190916061401, "learning_rate": 8.452706495871837e-06, "loss": 2.261433410644531, "memory(GiB)": 130.55, "step": 11850, "token_acc": 0.5084967320261438, "train_speed(iter/s)": 1.192596 }, { "epoch": 4.0627141877998625, "grad_norm": 1.9917489290237427, "learning_rate": 8.422781380270029e-06, "loss": 2.3125595092773437, "memory(GiB)": 130.55, "step": 11855, "token_acc": 0.49938549774682506, "train_speed(iter/s)": 1.192601 }, { "epoch": 4.064427690198766, "grad_norm": 1.9510730504989624, "learning_rate": 8.392904457734741e-06, "loss": 2.1992931365966797, "memory(GiB)": 130.55, "step": 11860, "token_acc": 0.5282791817087846, "train_speed(iter/s)": 1.192517 }, { "epoch": 4.066141192597669, "grad_norm": 2.1001195907592773, "learning_rate": 8.363075762896976e-06, "loss": 2.2342580795288085, "memory(GiB)": 130.55, "step": 11865, "token_acc": 0.5118997912317328, "train_speed(iter/s)": 1.192559 }, { "epoch": 4.067854694996573, "grad_norm": 2.0277042388916016, "learning_rate": 8.333295330331841e-06, "loss": 2.2357444763183594, "memory(GiB)": 130.55, "step": 11870, "token_acc": 0.5143581081081081, "train_speed(iter/s)": 1.19252 }, { "epoch": 4.069568197395476, "grad_norm": 2.098578929901123, "learning_rate": 8.303563194558494e-06, "loss": 2.3073654174804688, "memory(GiB)": 130.55, "step": 11875, "token_acc": 0.5051020408163265, "train_speed(iter/s)": 1.192552 }, { "epoch": 4.07128169979438, "grad_norm": 1.7829177379608154, "learning_rate": 8.27387939004008e-06, "loss": 2.19262809753418, "memory(GiB)": 130.55, "step": 11880, "token_acc": 0.5263616557734205, "train_speed(iter/s)": 1.192576 }, { "epoch": 4.072995202193283, "grad_norm": 1.9505503177642822, "learning_rate": 8.244243951183777e-06, "loss": 2.3132606506347657, "memory(GiB)": 130.55, "step": 11885, "token_acc": 0.518467852257182, "train_speed(iter/s)": 1.192489 }, { "epoch": 4.074708704592187, "grad_norm": 1.9534354209899902, "learning_rate": 8.214656912340645e-06, "loss": 2.2091606140136717, "memory(GiB)": 130.55, "step": 11890, "token_acc": 0.5153019023986766, "train_speed(iter/s)": 1.19252 }, { "epoch": 4.07642220699109, "grad_norm": 2.023059844970703, "learning_rate": 8.18511830780569e-06, "loss": 2.2305870056152344, "memory(GiB)": 130.55, "step": 11895, "token_acc": 0.5168156662409535, "train_speed(iter/s)": 1.192498 }, { "epoch": 4.0781357093899935, "grad_norm": 1.895702838897705, "learning_rate": 8.155628171817742e-06, "loss": 2.2750526428222657, "memory(GiB)": 130.55, "step": 11900, "token_acc": 0.5201716738197425, "train_speed(iter/s)": 1.192535 }, { "epoch": 4.079849211788897, "grad_norm": 2.1837918758392334, "learning_rate": 8.126186538559488e-06, "loss": 2.208523750305176, "memory(GiB)": 130.55, "step": 11905, "token_acc": 0.5271111111111111, "train_speed(iter/s)": 1.192591 }, { "epoch": 4.0815627141877995, "grad_norm": 1.9566824436187744, "learning_rate": 8.096793442157347e-06, "loss": 2.2504337310791014, "memory(GiB)": 130.55, "step": 11910, "token_acc": 0.5177685950413223, "train_speed(iter/s)": 1.192613 }, { "epoch": 4.083276216586703, "grad_norm": 2.185349225997925, "learning_rate": 8.067448916681519e-06, "loss": 2.219514083862305, "memory(GiB)": 130.55, "step": 11915, "token_acc": 0.5223628691983122, "train_speed(iter/s)": 1.192642 }, { "epoch": 4.084989718985606, "grad_norm": 2.136859178543091, "learning_rate": 8.03815299614587e-06, "loss": 2.146485710144043, "memory(GiB)": 130.55, "step": 11920, "token_acc": 0.5414827890556045, "train_speed(iter/s)": 1.192632 }, { "epoch": 4.08670322138451, "grad_norm": 1.9407206773757935, "learning_rate": 8.008905714507952e-06, "loss": 2.2074874877929687, "memory(GiB)": 130.55, "step": 11925, "token_acc": 0.5204415372035978, "train_speed(iter/s)": 1.19268 }, { "epoch": 4.088416723783413, "grad_norm": 1.9624541997909546, "learning_rate": 7.979707105668937e-06, "loss": 2.2058963775634766, "memory(GiB)": 130.55, "step": 11930, "token_acc": 0.5216150081566069, "train_speed(iter/s)": 1.192704 }, { "epoch": 4.090130226182317, "grad_norm": 1.9502159357070923, "learning_rate": 7.950557203473569e-06, "loss": 2.1544456481933594, "memory(GiB)": 130.55, "step": 11935, "token_acc": 0.5435380384967919, "train_speed(iter/s)": 1.1927 }, { "epoch": 4.09184372858122, "grad_norm": 1.9912784099578857, "learning_rate": 7.921456041710152e-06, "loss": 2.1967185974121093, "memory(GiB)": 130.55, "step": 11940, "token_acc": 0.5154989384288747, "train_speed(iter/s)": 1.19269 }, { "epoch": 4.093557230980124, "grad_norm": 2.081197500228882, "learning_rate": 7.892403654110452e-06, "loss": 2.2470287322998046, "memory(GiB)": 130.55, "step": 11945, "token_acc": 0.5163768701981399, "train_speed(iter/s)": 1.192716 }, { "epoch": 4.095270733379027, "grad_norm": 2.2528300285339355, "learning_rate": 7.863400074349764e-06, "loss": 2.2479167938232423, "memory(GiB)": 130.55, "step": 11950, "token_acc": 0.519603424966201, "train_speed(iter/s)": 1.192625 }, { "epoch": 4.0969842357779305, "grad_norm": 2.1298773288726807, "learning_rate": 7.834445336046741e-06, "loss": 2.199445915222168, "memory(GiB)": 130.55, "step": 11955, "token_acc": 0.5184866723989682, "train_speed(iter/s)": 1.192636 }, { "epoch": 4.098697738176833, "grad_norm": 1.9421640634536743, "learning_rate": 7.805539472763474e-06, "loss": 2.202961730957031, "memory(GiB)": 130.55, "step": 11960, "token_acc": 0.5295833333333333, "train_speed(iter/s)": 1.192646 }, { "epoch": 4.1004112405757365, "grad_norm": 2.2266104221343994, "learning_rate": 7.77668251800538e-06, "loss": 2.1744157791137697, "memory(GiB)": 130.55, "step": 11965, "token_acc": 0.532293986636971, "train_speed(iter/s)": 1.192674 }, { "epoch": 4.10212474297464, "grad_norm": 2.123335838317871, "learning_rate": 7.747874505221197e-06, "loss": 2.119965744018555, "memory(GiB)": 130.55, "step": 11970, "token_acc": 0.5318690783807063, "train_speed(iter/s)": 1.192588 }, { "epoch": 4.103838245373543, "grad_norm": 2.103393316268921, "learning_rate": 7.719115467802907e-06, "loss": 2.2870216369628906, "memory(GiB)": 130.55, "step": 11975, "token_acc": 0.5143101238786844, "train_speed(iter/s)": 1.19262 }, { "epoch": 4.105551747772447, "grad_norm": 2.034238338470459, "learning_rate": 7.690405439085758e-06, "loss": 2.1192535400390624, "memory(GiB)": 130.55, "step": 11980, "token_acc": 0.5194068905364152, "train_speed(iter/s)": 1.192625 }, { "epoch": 4.10726525017135, "grad_norm": 2.1074886322021484, "learning_rate": 7.661744452348157e-06, "loss": 2.1772510528564455, "memory(GiB)": 130.55, "step": 11985, "token_acc": 0.5235164835164835, "train_speed(iter/s)": 1.192626 }, { "epoch": 4.108978752570254, "grad_norm": 2.2622292041778564, "learning_rate": 7.6331325408117e-06, "loss": 2.258576583862305, "memory(GiB)": 130.55, "step": 11990, "token_acc": 0.5152951314088755, "train_speed(iter/s)": 1.192689 }, { "epoch": 4.110692254969157, "grad_norm": 2.05320405960083, "learning_rate": 7.604569737641065e-06, "loss": 2.2694459915161134, "memory(GiB)": 130.55, "step": 11995, "token_acc": 0.5063291139240507, "train_speed(iter/s)": 1.192669 }, { "epoch": 4.112405757368061, "grad_norm": 1.961431622505188, "learning_rate": 7.576056075944038e-06, "loss": 2.241751861572266, "memory(GiB)": 130.55, "step": 12000, "token_acc": 0.5197013687266694, "train_speed(iter/s)": 1.192579 }, { "epoch": 4.112405757368061, "eval_loss": 2.071058988571167, "eval_runtime": 3.6985, "eval_samples_per_second": 27.038, "eval_steps_per_second": 27.038, "eval_token_acc": 0.48476454293628807, "step": 12000 }, { "epoch": 4.114119259766964, "grad_norm": 2.118164300918579, "learning_rate": 7.547591588771435e-06, "loss": 2.3018321990966797, "memory(GiB)": 130.55, "step": 12005, "token_acc": 0.5019047619047619, "train_speed(iter/s)": 1.192002 }, { "epoch": 4.1158327621658675, "grad_norm": 2.1450955867767334, "learning_rate": 7.519176309117065e-06, "loss": 2.1216854095458983, "memory(GiB)": 130.55, "step": 12010, "token_acc": 0.5235492010092515, "train_speed(iter/s)": 1.192025 }, { "epoch": 4.11754626456477, "grad_norm": 2.16978120803833, "learning_rate": 7.490810269917675e-06, "loss": 2.1879077911376954, "memory(GiB)": 130.55, "step": 12015, "token_acc": 0.5149220489977728, "train_speed(iter/s)": 1.19207 }, { "epoch": 4.1192597669636735, "grad_norm": 1.8438684940338135, "learning_rate": 7.4624935040529864e-06, "loss": 2.172977638244629, "memory(GiB)": 130.55, "step": 12020, "token_acc": 0.5123720136518771, "train_speed(iter/s)": 1.192049 }, { "epoch": 4.120973269362577, "grad_norm": 2.0542585849761963, "learning_rate": 7.434226044345583e-06, "loss": 2.222331428527832, "memory(GiB)": 130.55, "step": 12025, "token_acc": 0.5176876617773943, "train_speed(iter/s)": 1.192022 }, { "epoch": 4.12268677176148, "grad_norm": 1.984217643737793, "learning_rate": 7.406007923560898e-06, "loss": 2.196524810791016, "memory(GiB)": 130.55, "step": 12030, "token_acc": 0.5192150449713818, "train_speed(iter/s)": 1.192061 }, { "epoch": 4.124400274160384, "grad_norm": 2.0155858993530273, "learning_rate": 7.3778391744071805e-06, "loss": 2.294706344604492, "memory(GiB)": 130.55, "step": 12035, "token_acc": 0.5176624509376363, "train_speed(iter/s)": 1.192021 }, { "epoch": 4.126113776559287, "grad_norm": 2.077742576599121, "learning_rate": 7.349719829535429e-06, "loss": 2.1816474914550783, "memory(GiB)": 130.55, "step": 12040, "token_acc": 0.52443857331572, "train_speed(iter/s)": 1.191997 }, { "epoch": 4.127827278958191, "grad_norm": 2.1180179119110107, "learning_rate": 7.321649921539414e-06, "loss": 2.321111297607422, "memory(GiB)": 130.55, "step": 12045, "token_acc": 0.5120507399577167, "train_speed(iter/s)": 1.191994 }, { "epoch": 4.129540781357094, "grad_norm": 1.8950824737548828, "learning_rate": 7.293629482955555e-06, "loss": 2.127734375, "memory(GiB)": 130.55, "step": 12050, "token_acc": 0.5344202898550725, "train_speed(iter/s)": 1.192043 }, { "epoch": 4.131254283755998, "grad_norm": 2.20332670211792, "learning_rate": 7.2656585462629766e-06, "loss": 2.2229515075683595, "memory(GiB)": 130.55, "step": 12055, "token_acc": 0.5230566534914362, "train_speed(iter/s)": 1.192027 }, { "epoch": 4.132967786154901, "grad_norm": 1.843179702758789, "learning_rate": 7.237737143883399e-06, "loss": 2.202108955383301, "memory(GiB)": 130.55, "step": 12060, "token_acc": 0.5218778486782133, "train_speed(iter/s)": 1.192083 }, { "epoch": 4.134681288553804, "grad_norm": 2.127674102783203, "learning_rate": 7.20986530818113e-06, "loss": 2.1402423858642576, "memory(GiB)": 130.55, "step": 12065, "token_acc": 0.524904214559387, "train_speed(iter/s)": 1.192075 }, { "epoch": 4.136394790952707, "grad_norm": 2.524840831756592, "learning_rate": 7.182043071463046e-06, "loss": 2.205684852600098, "memory(GiB)": 130.55, "step": 12070, "token_acc": 0.5243055555555556, "train_speed(iter/s)": 1.191929 }, { "epoch": 4.1381082933516105, "grad_norm": 1.9000338315963745, "learning_rate": 7.154270465978502e-06, "loss": 2.1777572631835938, "memory(GiB)": 130.55, "step": 12075, "token_acc": 0.5394794883105426, "train_speed(iter/s)": 1.191961 }, { "epoch": 4.139821795750514, "grad_norm": 2.1805286407470703, "learning_rate": 7.126547523919308e-06, "loss": 2.3103193283081054, "memory(GiB)": 130.55, "step": 12080, "token_acc": 0.5124777183600713, "train_speed(iter/s)": 1.191969 }, { "epoch": 4.141535298149417, "grad_norm": 1.9635602235794067, "learning_rate": 7.098874277419765e-06, "loss": 2.1522274017333984, "memory(GiB)": 130.55, "step": 12085, "token_acc": 0.5276967930029155, "train_speed(iter/s)": 1.191992 }, { "epoch": 4.143248800548321, "grad_norm": 2.0548675060272217, "learning_rate": 7.071250758556525e-06, "loss": 2.1326690673828126, "memory(GiB)": 130.55, "step": 12090, "token_acc": 0.5276422764227642, "train_speed(iter/s)": 1.19194 }, { "epoch": 4.144962302947224, "grad_norm": 1.9730554819107056, "learning_rate": 7.043676999348619e-06, "loss": 2.2134532928466797, "memory(GiB)": 130.55, "step": 12095, "token_acc": 0.5266666666666666, "train_speed(iter/s)": 1.19199 }, { "epoch": 4.146675805346128, "grad_norm": 2.298478603363037, "learning_rate": 7.016153031757416e-06, "loss": 2.196426010131836, "memory(GiB)": 130.55, "step": 12100, "token_acc": 0.516460025651988, "train_speed(iter/s)": 1.192031 }, { "epoch": 4.148389307745031, "grad_norm": 2.156153678894043, "learning_rate": 6.9886788876865285e-06, "loss": 2.1631765365600586, "memory(GiB)": 130.55, "step": 12105, "token_acc": 0.5433106575963719, "train_speed(iter/s)": 1.192054 }, { "epoch": 4.150102810143935, "grad_norm": 1.8653541803359985, "learning_rate": 6.961254598981837e-06, "loss": 2.2116641998291016, "memory(GiB)": 130.55, "step": 12110, "token_acc": 0.5224006762468301, "train_speed(iter/s)": 1.192106 }, { "epoch": 4.151816312542838, "grad_norm": 1.9882786273956299, "learning_rate": 6.933880197431441e-06, "loss": 2.1698883056640623, "memory(GiB)": 130.55, "step": 12115, "token_acc": 0.5254316378908073, "train_speed(iter/s)": 1.192125 }, { "epoch": 4.153529814941741, "grad_norm": 1.9699066877365112, "learning_rate": 6.906555714765617e-06, "loss": 2.3413787841796876, "memory(GiB)": 130.55, "step": 12120, "token_acc": 0.5051724137931034, "train_speed(iter/s)": 1.192137 }, { "epoch": 4.155243317340644, "grad_norm": 1.7928673028945923, "learning_rate": 6.879281182656766e-06, "loss": 2.156873321533203, "memory(GiB)": 130.55, "step": 12125, "token_acc": 0.5211144971702221, "train_speed(iter/s)": 1.192132 }, { "epoch": 4.1569568197395474, "grad_norm": 1.9489061832427979, "learning_rate": 6.852056632719412e-06, "loss": 2.240772247314453, "memory(GiB)": 130.55, "step": 12130, "token_acc": 0.5202761000862812, "train_speed(iter/s)": 1.192133 }, { "epoch": 4.158670322138451, "grad_norm": 2.0624327659606934, "learning_rate": 6.82488209651011e-06, "loss": 2.1611701965332033, "memory(GiB)": 130.55, "step": 12135, "token_acc": 0.5311111111111111, "train_speed(iter/s)": 1.192121 }, { "epoch": 4.160383824537354, "grad_norm": 2.0609402656555176, "learning_rate": 6.797757605527461e-06, "loss": 2.3023313522338866, "memory(GiB)": 130.55, "step": 12140, "token_acc": 0.5117204776647502, "train_speed(iter/s)": 1.192178 }, { "epoch": 4.162097326936258, "grad_norm": 2.1863691806793213, "learning_rate": 6.770683191212063e-06, "loss": 2.119672966003418, "memory(GiB)": 130.55, "step": 12145, "token_acc": 0.5297670405522001, "train_speed(iter/s)": 1.192185 }, { "epoch": 4.163810829335161, "grad_norm": 1.9849661588668823, "learning_rate": 6.7436588849464634e-06, "loss": 2.15012149810791, "memory(GiB)": 130.55, "step": 12150, "token_acc": 0.5376532399299475, "train_speed(iter/s)": 1.19223 }, { "epoch": 4.165524331734065, "grad_norm": 1.8201124668121338, "learning_rate": 6.716684718055128e-06, "loss": 2.212264060974121, "memory(GiB)": 130.55, "step": 12155, "token_acc": 0.5161557580778791, "train_speed(iter/s)": 1.192251 }, { "epoch": 4.167237834132968, "grad_norm": 1.895157814025879, "learning_rate": 6.689760721804411e-06, "loss": 2.173037528991699, "memory(GiB)": 130.55, "step": 12160, "token_acc": 0.5245689655172414, "train_speed(iter/s)": 1.192308 }, { "epoch": 4.1689513365318716, "grad_norm": 2.0762224197387695, "learning_rate": 6.662886927402512e-06, "loss": 2.3014598846435548, "memory(GiB)": 130.55, "step": 12165, "token_acc": 0.5057377049180328, "train_speed(iter/s)": 1.192302 }, { "epoch": 4.170664838930774, "grad_norm": 2.1235079765319824, "learning_rate": 6.6360633659994284e-06, "loss": 2.2525314331054687, "memory(GiB)": 130.55, "step": 12170, "token_acc": 0.5230278157774738, "train_speed(iter/s)": 1.19233 }, { "epoch": 4.1723783413296776, "grad_norm": 2.044233560562134, "learning_rate": 6.609290068686924e-06, "loss": 2.1828617095947265, "memory(GiB)": 130.55, "step": 12175, "token_acc": 0.5246675246675246, "train_speed(iter/s)": 1.192351 }, { "epoch": 4.174091843728581, "grad_norm": 2.0064263343811035, "learning_rate": 6.58256706649853e-06, "loss": 2.2326684951782227, "memory(GiB)": 130.55, "step": 12180, "token_acc": 0.5119675456389452, "train_speed(iter/s)": 1.192371 }, { "epoch": 4.175805346127484, "grad_norm": 1.9275426864624023, "learning_rate": 6.555894390409467e-06, "loss": 2.3071100234985353, "memory(GiB)": 130.55, "step": 12185, "token_acc": 0.517901748542881, "train_speed(iter/s)": 1.192373 }, { "epoch": 4.177518848526388, "grad_norm": 2.1160662174224854, "learning_rate": 6.529272071336617e-06, "loss": 2.271114540100098, "memory(GiB)": 130.55, "step": 12190, "token_acc": 0.5226611226611226, "train_speed(iter/s)": 1.192378 }, { "epoch": 4.179232350925291, "grad_norm": 1.9548041820526123, "learning_rate": 6.502700140138501e-06, "loss": 2.2611547470092774, "memory(GiB)": 130.55, "step": 12195, "token_acc": 0.5186104218362283, "train_speed(iter/s)": 1.192415 }, { "epoch": 4.180945853324195, "grad_norm": 2.1665165424346924, "learning_rate": 6.47617862761522e-06, "loss": 2.2540369033813477, "memory(GiB)": 130.55, "step": 12200, "token_acc": 0.5202814569536424, "train_speed(iter/s)": 1.192302 }, { "epoch": 4.182659355723098, "grad_norm": 2.2007110118865967, "learning_rate": 6.449707564508428e-06, "loss": 2.148825454711914, "memory(GiB)": 130.55, "step": 12205, "token_acc": 0.5335392762577229, "train_speed(iter/s)": 1.192334 }, { "epoch": 4.184372858122002, "grad_norm": 2.0414958000183105, "learning_rate": 6.423286981501331e-06, "loss": 2.10982666015625, "memory(GiB)": 130.55, "step": 12210, "token_acc": 0.5222988505747126, "train_speed(iter/s)": 1.192334 }, { "epoch": 4.186086360520905, "grad_norm": 1.9999845027923584, "learning_rate": 6.396916909218603e-06, "loss": 2.193212890625, "memory(GiB)": 130.55, "step": 12215, "token_acc": 0.5433829973707275, "train_speed(iter/s)": 1.192331 }, { "epoch": 4.187799862919808, "grad_norm": 1.8513684272766113, "learning_rate": 6.370597378226378e-06, "loss": 2.2068592071533204, "memory(GiB)": 130.55, "step": 12220, "token_acc": 0.5159371015724606, "train_speed(iter/s)": 1.19238 }, { "epoch": 4.189513365318711, "grad_norm": 2.2146847248077393, "learning_rate": 6.344328419032203e-06, "loss": 2.2399253845214844, "memory(GiB)": 130.55, "step": 12225, "token_acc": 0.5086397873283119, "train_speed(iter/s)": 1.192315 }, { "epoch": 4.1912268677176145, "grad_norm": 2.0924079418182373, "learning_rate": 6.318110062085003e-06, "loss": 2.2517864227294924, "memory(GiB)": 130.55, "step": 12230, "token_acc": 0.5080321285140562, "train_speed(iter/s)": 1.192313 }, { "epoch": 4.192940370116518, "grad_norm": 2.0255496501922607, "learning_rate": 6.291942337775036e-06, "loss": 2.1712068557739257, "memory(GiB)": 130.55, "step": 12235, "token_acc": 0.5322376738305942, "train_speed(iter/s)": 1.192276 }, { "epoch": 4.194653872515421, "grad_norm": 2.1257126331329346, "learning_rate": 6.265825276433901e-06, "loss": 2.268494415283203, "memory(GiB)": 130.55, "step": 12240, "token_acc": 0.5095683133066311, "train_speed(iter/s)": 1.192294 }, { "epoch": 4.196367374914325, "grad_norm": 2.161176919937134, "learning_rate": 6.239758908334453e-06, "loss": 2.1323089599609375, "memory(GiB)": 130.55, "step": 12245, "token_acc": 0.5300954753009548, "train_speed(iter/s)": 1.192327 }, { "epoch": 4.198080877313228, "grad_norm": 2.012115478515625, "learning_rate": 6.213743263690791e-06, "loss": 2.3039373397827148, "memory(GiB)": 130.55, "step": 12250, "token_acc": 0.49787955894826125, "train_speed(iter/s)": 1.192268 }, { "epoch": 4.199794379712132, "grad_norm": 2.3072197437286377, "learning_rate": 6.18777837265822e-06, "loss": 2.2442893981933594, "memory(GiB)": 130.55, "step": 12255, "token_acc": 0.5296152183311716, "train_speed(iter/s)": 1.192265 }, { "epoch": 4.201507882111035, "grad_norm": 1.8891538381576538, "learning_rate": 6.161864265333228e-06, "loss": 2.265629005432129, "memory(GiB)": 130.55, "step": 12260, "token_acc": 0.5067624683009299, "train_speed(iter/s)": 1.192259 }, { "epoch": 4.203221384509939, "grad_norm": 1.9867429733276367, "learning_rate": 6.136000971753414e-06, "loss": 2.2432277679443358, "memory(GiB)": 130.55, "step": 12265, "token_acc": 0.5151777970511708, "train_speed(iter/s)": 1.192303 }, { "epoch": 4.204934886908842, "grad_norm": 2.3591179847717285, "learning_rate": 6.110188521897475e-06, "loss": 2.2023120880126954, "memory(GiB)": 130.55, "step": 12270, "token_acc": 0.5186567164179104, "train_speed(iter/s)": 1.192324 }, { "epoch": 4.206648389307745, "grad_norm": 2.1886236667633057, "learning_rate": 6.084426945685201e-06, "loss": 2.2456722259521484, "memory(GiB)": 130.55, "step": 12275, "token_acc": 0.5163191296464189, "train_speed(iter/s)": 1.192252 }, { "epoch": 4.208361891706648, "grad_norm": 2.3034212589263916, "learning_rate": 6.058716272977405e-06, "loss": 2.1834743499755858, "memory(GiB)": 130.55, "step": 12280, "token_acc": 0.538989898989899, "train_speed(iter/s)": 1.19226 }, { "epoch": 4.2100753941055515, "grad_norm": 1.9843084812164307, "learning_rate": 6.033056533575887e-06, "loss": 2.2483638763427733, "memory(GiB)": 130.55, "step": 12285, "token_acc": 0.504420866489832, "train_speed(iter/s)": 1.192265 }, { "epoch": 4.211788896504455, "grad_norm": 2.0442423820495605, "learning_rate": 6.0074477572234225e-06, "loss": 2.1482433319091796, "memory(GiB)": 130.55, "step": 12290, "token_acc": 0.520073834794647, "train_speed(iter/s)": 1.192309 }, { "epoch": 4.213502398903358, "grad_norm": 2.0862231254577637, "learning_rate": 5.981889973603699e-06, "loss": 2.263450050354004, "memory(GiB)": 130.55, "step": 12295, "token_acc": 0.5200348432055749, "train_speed(iter/s)": 1.192352 }, { "epoch": 4.215215901302262, "grad_norm": 1.9439566135406494, "learning_rate": 5.956383212341293e-06, "loss": 2.2356330871582033, "memory(GiB)": 130.55, "step": 12300, "token_acc": 0.5231702035513209, "train_speed(iter/s)": 1.192283 }, { "epoch": 4.216929403701165, "grad_norm": 2.194953680038452, "learning_rate": 5.93092750300166e-06, "loss": 2.224166679382324, "memory(GiB)": 130.55, "step": 12305, "token_acc": 0.5323955669224212, "train_speed(iter/s)": 1.192296 }, { "epoch": 4.218642906100069, "grad_norm": 2.0231101512908936, "learning_rate": 5.9055228750910795e-06, "loss": 2.229655075073242, "memory(GiB)": 130.55, "step": 12310, "token_acc": 0.5037475345167652, "train_speed(iter/s)": 1.192321 }, { "epoch": 4.220356408498972, "grad_norm": 2.083087682723999, "learning_rate": 5.880169358056603e-06, "loss": 2.2806243896484375, "memory(GiB)": 130.55, "step": 12315, "token_acc": 0.5116379310344827, "train_speed(iter/s)": 1.192323 }, { "epoch": 4.222069910897876, "grad_norm": 2.009037733078003, "learning_rate": 5.854866981286061e-06, "loss": 2.1510141372680662, "memory(GiB)": 130.55, "step": 12320, "token_acc": 0.5332152347209921, "train_speed(iter/s)": 1.192371 }, { "epoch": 4.223783413296778, "grad_norm": 1.8285331726074219, "learning_rate": 5.829615774107977e-06, "loss": 2.2581483840942385, "memory(GiB)": 130.55, "step": 12325, "token_acc": 0.4969574036511156, "train_speed(iter/s)": 1.192373 }, { "epoch": 4.225496915695682, "grad_norm": 2.000898599624634, "learning_rate": 5.804415765791599e-06, "loss": 2.228652763366699, "memory(GiB)": 130.55, "step": 12330, "token_acc": 0.508130081300813, "train_speed(iter/s)": 1.192355 }, { "epoch": 4.227210418094585, "grad_norm": 1.947124719619751, "learning_rate": 5.7792669855467884e-06, "loss": 2.226536178588867, "memory(GiB)": 130.55, "step": 12335, "token_acc": 0.5158730158730159, "train_speed(iter/s)": 1.192341 }, { "epoch": 4.2289239204934885, "grad_norm": 2.242936611175537, "learning_rate": 5.754169462524056e-06, "loss": 2.1623512268066407, "memory(GiB)": 130.55, "step": 12340, "token_acc": 0.5163043478260869, "train_speed(iter/s)": 1.19234 }, { "epoch": 4.230637422892392, "grad_norm": 2.1836602687835693, "learning_rate": 5.729123225814498e-06, "loss": 2.154575157165527, "memory(GiB)": 130.55, "step": 12345, "token_acc": 0.5413145539906103, "train_speed(iter/s)": 1.192353 }, { "epoch": 4.232350925291295, "grad_norm": 2.0317842960357666, "learning_rate": 5.704128304449757e-06, "loss": 2.2617382049560546, "memory(GiB)": 130.55, "step": 12350, "token_acc": 0.5228233305156382, "train_speed(iter/s)": 1.192314 }, { "epoch": 4.234064427690199, "grad_norm": 1.8252770900726318, "learning_rate": 5.679184727401987e-06, "loss": 2.2499195098876954, "memory(GiB)": 130.55, "step": 12355, "token_acc": 0.5158662092624356, "train_speed(iter/s)": 1.192363 }, { "epoch": 4.235777930089102, "grad_norm": 1.8787213563919067, "learning_rate": 5.654292523583843e-06, "loss": 2.2549644470214845, "memory(GiB)": 130.55, "step": 12360, "token_acc": 0.5091130012150669, "train_speed(iter/s)": 1.192345 }, { "epoch": 4.237491432488006, "grad_norm": 1.9056172370910645, "learning_rate": 5.629451721848417e-06, "loss": 2.184517478942871, "memory(GiB)": 130.55, "step": 12365, "token_acc": 0.5318275154004107, "train_speed(iter/s)": 1.192363 }, { "epoch": 4.239204934886909, "grad_norm": 1.8541340827941895, "learning_rate": 5.604662350989226e-06, "loss": 2.2663108825683596, "memory(GiB)": 130.55, "step": 12370, "token_acc": 0.5195911413969335, "train_speed(iter/s)": 1.192376 }, { "epoch": 4.240918437285812, "grad_norm": 2.14340877532959, "learning_rate": 5.5799244397401806e-06, "loss": 2.220196533203125, "memory(GiB)": 130.55, "step": 12375, "token_acc": 0.5208333333333334, "train_speed(iter/s)": 1.192398 }, { "epoch": 4.242631939684715, "grad_norm": 2.005354404449463, "learning_rate": 5.555238016775538e-06, "loss": 2.2287391662597655, "memory(GiB)": 130.55, "step": 12380, "token_acc": 0.5046179680940386, "train_speed(iter/s)": 1.19242 }, { "epoch": 4.244345442083619, "grad_norm": 2.2996292114257812, "learning_rate": 5.530603110709875e-06, "loss": 2.158736801147461, "memory(GiB)": 130.55, "step": 12385, "token_acc": 0.5201015658061786, "train_speed(iter/s)": 1.192426 }, { "epoch": 4.246058944482522, "grad_norm": 2.1307616233825684, "learning_rate": 5.50601975009804e-06, "loss": 2.173929977416992, "memory(GiB)": 130.55, "step": 12390, "token_acc": 0.5328403653762506, "train_speed(iter/s)": 1.192361 }, { "epoch": 4.2477724468814255, "grad_norm": 2.2708873748779297, "learning_rate": 5.481487963435161e-06, "loss": 2.185109329223633, "memory(GiB)": 130.55, "step": 12395, "token_acc": 0.5195454545454545, "train_speed(iter/s)": 1.192394 }, { "epoch": 4.249485949280329, "grad_norm": 2.0611674785614014, "learning_rate": 5.457007779156554e-06, "loss": 2.2271492004394533, "memory(GiB)": 130.55, "step": 12400, "token_acc": 0.527534965034965, "train_speed(iter/s)": 1.192387 }, { "epoch": 4.251199451679232, "grad_norm": 1.9366153478622437, "learning_rate": 5.43257922563774e-06, "loss": 2.1791383743286135, "memory(GiB)": 130.55, "step": 12405, "token_acc": 0.5232751454696591, "train_speed(iter/s)": 1.192393 }, { "epoch": 4.252912954078136, "grad_norm": 2.0851259231567383, "learning_rate": 5.408202331194406e-06, "loss": 2.1831789016723633, "memory(GiB)": 130.55, "step": 12410, "token_acc": 0.5214961306964746, "train_speed(iter/s)": 1.192378 }, { "epoch": 4.254626456477039, "grad_norm": 2.20467472076416, "learning_rate": 5.3838771240823425e-06, "loss": 2.2786514282226564, "memory(GiB)": 130.55, "step": 12415, "token_acc": 0.5120967741935484, "train_speed(iter/s)": 1.192292 }, { "epoch": 4.256339958875943, "grad_norm": 1.8176981210708618, "learning_rate": 5.3596036324974116e-06, "loss": 2.2765214920043944, "memory(GiB)": 130.55, "step": 12420, "token_acc": 0.5164319248826291, "train_speed(iter/s)": 1.192322 }, { "epoch": 4.258053461274846, "grad_norm": 2.0272583961486816, "learning_rate": 5.335381884575575e-06, "loss": 2.260724639892578, "memory(GiB)": 130.55, "step": 12425, "token_acc": 0.5166051660516605, "train_speed(iter/s)": 1.192305 }, { "epoch": 4.259766963673749, "grad_norm": 2.128502368927002, "learning_rate": 5.311211908392771e-06, "loss": 2.2538402557373045, "memory(GiB)": 130.55, "step": 12430, "token_acc": 0.5148054145516074, "train_speed(iter/s)": 1.192306 }, { "epoch": 4.261480466072652, "grad_norm": 2.3304150104522705, "learning_rate": 5.287093731964965e-06, "loss": 2.2982044219970703, "memory(GiB)": 130.55, "step": 12435, "token_acc": 0.5182421227197347, "train_speed(iter/s)": 1.192346 }, { "epoch": 4.263193968471556, "grad_norm": 1.861554503440857, "learning_rate": 5.263027383248048e-06, "loss": 2.266160583496094, "memory(GiB)": 130.55, "step": 12440, "token_acc": 0.5182237117720989, "train_speed(iter/s)": 1.192223 }, { "epoch": 4.264907470870459, "grad_norm": 2.1336569786071777, "learning_rate": 5.2390128901378784e-06, "loss": 2.2324752807617188, "memory(GiB)": 130.55, "step": 12445, "token_acc": 0.5284015852047557, "train_speed(iter/s)": 1.19225 }, { "epoch": 4.2666209732693625, "grad_norm": 1.9853386878967285, "learning_rate": 5.215050280470163e-06, "loss": 2.222945785522461, "memory(GiB)": 130.55, "step": 12450, "token_acc": 0.5213327915481512, "train_speed(iter/s)": 1.192262 }, { "epoch": 4.268334475668266, "grad_norm": 2.0397493839263916, "learning_rate": 5.191139582020488e-06, "loss": 2.2349491119384766, "memory(GiB)": 130.55, "step": 12455, "token_acc": 0.52652697280428, "train_speed(iter/s)": 1.192258 }, { "epoch": 4.270047978067169, "grad_norm": 2.043833017349243, "learning_rate": 5.167280822504278e-06, "loss": 2.2753772735595703, "memory(GiB)": 130.55, "step": 12460, "token_acc": 0.5218978102189781, "train_speed(iter/s)": 1.192308 }, { "epoch": 4.271761480466073, "grad_norm": 2.0927646160125732, "learning_rate": 5.143474029576739e-06, "loss": 2.204433250427246, "memory(GiB)": 130.55, "step": 12465, "token_acc": 0.5166098807495741, "train_speed(iter/s)": 1.192292 }, { "epoch": 4.273474982864976, "grad_norm": 2.0224523544311523, "learning_rate": 5.1197192308328426e-06, "loss": 2.1961320877075194, "memory(GiB)": 130.55, "step": 12470, "token_acc": 0.5270894582108357, "train_speed(iter/s)": 1.192328 }, { "epoch": 4.27518848526388, "grad_norm": 2.1666219234466553, "learning_rate": 5.0960164538073065e-06, "loss": 2.2524124145507813, "memory(GiB)": 130.55, "step": 12475, "token_acc": 0.5261744966442953, "train_speed(iter/s)": 1.192293 }, { "epoch": 4.276901987662782, "grad_norm": 2.080268621444702, "learning_rate": 5.072365725974543e-06, "loss": 2.1195648193359373, "memory(GiB)": 130.55, "step": 12480, "token_acc": 0.5303791685701233, "train_speed(iter/s)": 1.192332 }, { "epoch": 4.278615490061686, "grad_norm": 2.141850233078003, "learning_rate": 5.0487670747486175e-06, "loss": 2.2539909362792967, "memory(GiB)": 130.55, "step": 12485, "token_acc": 0.5031520882584712, "train_speed(iter/s)": 1.192346 }, { "epoch": 4.280328992460589, "grad_norm": 1.9919016361236572, "learning_rate": 5.02522052748326e-06, "loss": 2.2427915573120116, "memory(GiB)": 130.55, "step": 12490, "token_acc": 0.5145432199918066, "train_speed(iter/s)": 1.192342 }, { "epoch": 4.282042494859493, "grad_norm": 2.005596160888672, "learning_rate": 5.001726111471766e-06, "loss": 2.1837947845458983, "memory(GiB)": 130.55, "step": 12495, "token_acc": 0.5266830870279147, "train_speed(iter/s)": 1.192296 }, { "epoch": 4.283755997258396, "grad_norm": 2.1508102416992188, "learning_rate": 4.978283853947046e-06, "loss": 2.129721832275391, "memory(GiB)": 130.55, "step": 12500, "token_acc": 0.527681660899654, "train_speed(iter/s)": 1.192273 }, { "epoch": 4.283755997258396, "eval_loss": 2.2630176544189453, "eval_runtime": 3.7136, "eval_samples_per_second": 26.928, "eval_steps_per_second": 26.928, "eval_token_acc": 0.48318804483188044, "step": 12500 }, { "epoch": 4.2854694996572995, "grad_norm": 1.9382619857788086, "learning_rate": 4.954893782081532e-06, "loss": 2.2208580017089843, "memory(GiB)": 130.55, "step": 12505, "token_acc": 0.5108091414453366, "train_speed(iter/s)": 1.191698 }, { "epoch": 4.287183002056203, "grad_norm": 2.042462110519409, "learning_rate": 4.93155592298718e-06, "loss": 2.2445606231689452, "memory(GiB)": 130.55, "step": 12510, "token_acc": 0.5215751989945538, "train_speed(iter/s)": 1.191603 }, { "epoch": 4.288896504455106, "grad_norm": 2.187875747680664, "learning_rate": 4.908270303715395e-06, "loss": 2.2389104843139647, "memory(GiB)": 130.55, "step": 12515, "token_acc": 0.5227765726681128, "train_speed(iter/s)": 1.191622 }, { "epoch": 4.29061000685401, "grad_norm": 1.96361243724823, "learning_rate": 4.885036951257055e-06, "loss": 2.15268611907959, "memory(GiB)": 130.55, "step": 12520, "token_acc": 0.5365205843293492, "train_speed(iter/s)": 1.191636 }, { "epoch": 4.292323509252913, "grad_norm": 2.157663106918335, "learning_rate": 4.861855892542466e-06, "loss": 2.19425106048584, "memory(GiB)": 130.55, "step": 12525, "token_acc": 0.5234406848756624, "train_speed(iter/s)": 1.191629 }, { "epoch": 4.294037011651817, "grad_norm": 2.066474437713623, "learning_rate": 4.83872715444128e-06, "loss": 2.202699661254883, "memory(GiB)": 130.55, "step": 12530, "token_acc": 0.5294380017841214, "train_speed(iter/s)": 1.191666 }, { "epoch": 4.295750514050719, "grad_norm": 2.242098569869995, "learning_rate": 4.81565076376253e-06, "loss": 2.2596067428588866, "memory(GiB)": 130.55, "step": 12535, "token_acc": 0.5227562739259889, "train_speed(iter/s)": 1.1916 }, { "epoch": 4.297464016449623, "grad_norm": 1.8892275094985962, "learning_rate": 4.79262674725458e-06, "loss": 2.324428176879883, "memory(GiB)": 130.55, "step": 12540, "token_acc": 0.500203832042397, "train_speed(iter/s)": 1.191637 }, { "epoch": 4.299177518848526, "grad_norm": 1.9863179922103882, "learning_rate": 4.769655131605055e-06, "loss": 2.192649269104004, "memory(GiB)": 130.55, "step": 12545, "token_acc": 0.519134775374376, "train_speed(iter/s)": 1.191612 }, { "epoch": 4.30089102124743, "grad_norm": 2.132850170135498, "learning_rate": 4.746735943440861e-06, "loss": 2.158019256591797, "memory(GiB)": 130.55, "step": 12550, "token_acc": 0.5224331320103538, "train_speed(iter/s)": 1.191635 }, { "epoch": 4.302604523646333, "grad_norm": 2.317235231399536, "learning_rate": 4.723869209328153e-06, "loss": 2.2390583038330076, "memory(GiB)": 130.55, "step": 12555, "token_acc": 0.518956379942927, "train_speed(iter/s)": 1.191681 }, { "epoch": 4.3043180260452365, "grad_norm": 1.9570469856262207, "learning_rate": 4.701054955772238e-06, "loss": 2.194683647155762, "memory(GiB)": 130.55, "step": 12560, "token_acc": 0.5250763192324466, "train_speed(iter/s)": 1.191608 }, { "epoch": 4.30603152844414, "grad_norm": 1.8804813623428345, "learning_rate": 4.678293209217632e-06, "loss": 2.1629547119140624, "memory(GiB)": 130.55, "step": 12565, "token_acc": 0.5216062854648625, "train_speed(iter/s)": 1.191667 }, { "epoch": 4.307745030843043, "grad_norm": 2.263571262359619, "learning_rate": 4.655583996047968e-06, "loss": 2.196034812927246, "memory(GiB)": 130.55, "step": 12570, "token_acc": 0.5212224108658744, "train_speed(iter/s)": 1.191698 }, { "epoch": 4.309458533241947, "grad_norm": 2.015333414077759, "learning_rate": 4.6329273425860075e-06, "loss": 2.2859607696533204, "memory(GiB)": 130.55, "step": 12575, "token_acc": 0.518695652173913, "train_speed(iter/s)": 1.191707 }, { "epoch": 4.31117203564085, "grad_norm": 1.9805129766464233, "learning_rate": 4.6103232750935535e-06, "loss": 2.2519384384155274, "memory(GiB)": 130.55, "step": 12580, "token_acc": 0.5236500627877773, "train_speed(iter/s)": 1.19173 }, { "epoch": 4.312885538039753, "grad_norm": 2.332209825515747, "learning_rate": 4.587771819771491e-06, "loss": 2.135664176940918, "memory(GiB)": 130.55, "step": 12585, "token_acc": 0.5459965928449745, "train_speed(iter/s)": 1.19165 }, { "epoch": 4.314599040438656, "grad_norm": 2.1223981380462646, "learning_rate": 4.565273002759713e-06, "loss": 2.21484489440918, "memory(GiB)": 130.55, "step": 12590, "token_acc": 0.5245700245700246, "train_speed(iter/s)": 1.191681 }, { "epoch": 4.31631254283756, "grad_norm": 1.8927420377731323, "learning_rate": 4.542826850137067e-06, "loss": 2.3592193603515623, "memory(GiB)": 130.55, "step": 12595, "token_acc": 0.499798630688683, "train_speed(iter/s)": 1.19165 }, { "epoch": 4.318026045236463, "grad_norm": 2.0239510536193848, "learning_rate": 4.5204333879214025e-06, "loss": 2.327073669433594, "memory(GiB)": 130.55, "step": 12600, "token_acc": 0.49735018344883813, "train_speed(iter/s)": 1.191689 }, { "epoch": 4.319739547635367, "grad_norm": 2.0677456855773926, "learning_rate": 4.498092642069473e-06, "loss": 2.2542465209960936, "memory(GiB)": 130.55, "step": 12605, "token_acc": 0.5132042253521126, "train_speed(iter/s)": 1.191763 }, { "epoch": 4.32145305003427, "grad_norm": 2.000335216522217, "learning_rate": 4.475804638476916e-06, "loss": 2.0846988677978517, "memory(GiB)": 130.55, "step": 12610, "token_acc": 0.5402247191011236, "train_speed(iter/s)": 1.191779 }, { "epoch": 4.3231665524331735, "grad_norm": 2.259676933288574, "learning_rate": 4.4535694029782474e-06, "loss": 2.0522132873535157, "memory(GiB)": 130.55, "step": 12615, "token_acc": 0.5399538106235566, "train_speed(iter/s)": 1.191838 }, { "epoch": 4.324880054832077, "grad_norm": 2.524949312210083, "learning_rate": 4.431386961346834e-06, "loss": 2.297074890136719, "memory(GiB)": 130.55, "step": 12620, "token_acc": 0.51586655817738, "train_speed(iter/s)": 1.191832 }, { "epoch": 4.32659355723098, "grad_norm": 2.196152687072754, "learning_rate": 4.409257339294804e-06, "loss": 2.2690858840942383, "memory(GiB)": 130.55, "step": 12625, "token_acc": 0.5074937552039966, "train_speed(iter/s)": 1.191838 }, { "epoch": 4.328307059629884, "grad_norm": 1.8612374067306519, "learning_rate": 4.387180562473103e-06, "loss": 2.150121307373047, "memory(GiB)": 130.55, "step": 12630, "token_acc": 0.527054935976869, "train_speed(iter/s)": 1.191844 }, { "epoch": 4.330020562028787, "grad_norm": 1.906197428703308, "learning_rate": 4.365156656471408e-06, "loss": 2.2024112701416017, "memory(GiB)": 130.55, "step": 12635, "token_acc": 0.53438701409654, "train_speed(iter/s)": 1.191869 }, { "epoch": 4.33173406442769, "grad_norm": 2.100214958190918, "learning_rate": 4.34318564681811e-06, "loss": 2.2782970428466798, "memory(GiB)": 130.55, "step": 12640, "token_acc": 0.510586319218241, "train_speed(iter/s)": 1.191882 }, { "epoch": 4.333447566826593, "grad_norm": 1.968786358833313, "learning_rate": 4.3212675589802796e-06, "loss": 2.199926567077637, "memory(GiB)": 130.55, "step": 12645, "token_acc": 0.5266327003242242, "train_speed(iter/s)": 1.191919 }, { "epoch": 4.335161069225497, "grad_norm": 2.193592071533203, "learning_rate": 4.2994024183636625e-06, "loss": 2.2745454788208006, "memory(GiB)": 130.55, "step": 12650, "token_acc": 0.5012315270935961, "train_speed(iter/s)": 1.191944 }, { "epoch": 4.3368745716244, "grad_norm": 2.1425840854644775, "learning_rate": 4.277590250312635e-06, "loss": 2.1874752044677734, "memory(GiB)": 130.55, "step": 12655, "token_acc": 0.5359342915811088, "train_speed(iter/s)": 1.191929 }, { "epoch": 4.338588074023304, "grad_norm": 2.1665496826171875, "learning_rate": 4.255831080110134e-06, "loss": 2.151189613342285, "memory(GiB)": 130.55, "step": 12660, "token_acc": 0.5314979001399907, "train_speed(iter/s)": 1.191913 }, { "epoch": 4.340301576422207, "grad_norm": 1.9089970588684082, "learning_rate": 4.2341249329777065e-06, "loss": 2.1691974639892577, "memory(GiB)": 130.55, "step": 12665, "token_acc": 0.5372182050191407, "train_speed(iter/s)": 1.191954 }, { "epoch": 4.3420150788211105, "grad_norm": 2.0866243839263916, "learning_rate": 4.2124718340754325e-06, "loss": 2.161419486999512, "memory(GiB)": 130.55, "step": 12670, "token_acc": 0.5396193005754759, "train_speed(iter/s)": 1.191974 }, { "epoch": 4.343728581220014, "grad_norm": 2.4157536029815674, "learning_rate": 4.190871808501867e-06, "loss": 2.185748481750488, "memory(GiB)": 130.55, "step": 12675, "token_acc": 0.5322217214961694, "train_speed(iter/s)": 1.19199 }, { "epoch": 4.345442083618917, "grad_norm": 1.9213780164718628, "learning_rate": 4.169324881294096e-06, "loss": 2.1766191482543946, "memory(GiB)": 130.55, "step": 12680, "token_acc": 0.5188284518828452, "train_speed(iter/s)": 1.191955 }, { "epoch": 4.347155586017821, "grad_norm": 1.8177317380905151, "learning_rate": 4.147831077427633e-06, "loss": 2.1539588928222657, "memory(GiB)": 130.55, "step": 12685, "token_acc": 0.5383580080753702, "train_speed(iter/s)": 1.191934 }, { "epoch": 4.348869088416723, "grad_norm": 2.0742111206054688, "learning_rate": 4.126390421816407e-06, "loss": 2.301266098022461, "memory(GiB)": 130.55, "step": 12690, "token_acc": 0.5223559759243336, "train_speed(iter/s)": 1.191993 }, { "epoch": 4.350582590815627, "grad_norm": 2.1382462978363037, "learning_rate": 4.105002939312763e-06, "loss": 2.1952188491821287, "memory(GiB)": 130.55, "step": 12695, "token_acc": 0.521497919556172, "train_speed(iter/s)": 1.192011 }, { "epoch": 4.35229609321453, "grad_norm": 2.0392839908599854, "learning_rate": 4.083668654707401e-06, "loss": 2.2726814270019533, "memory(GiB)": 130.55, "step": 12700, "token_acc": 0.5061677631578947, "train_speed(iter/s)": 1.192029 }, { "epoch": 4.354009595613434, "grad_norm": 2.078780174255371, "learning_rate": 4.06238759272935e-06, "loss": 2.1868108749389648, "memory(GiB)": 130.55, "step": 12705, "token_acc": 0.5161001788908766, "train_speed(iter/s)": 1.192046 }, { "epoch": 4.355723098012337, "grad_norm": 2.0698819160461426, "learning_rate": 4.041159778045961e-06, "loss": 2.1299932479858397, "memory(GiB)": 130.55, "step": 12710, "token_acc": 0.5358811040339703, "train_speed(iter/s)": 1.192038 }, { "epoch": 4.357436600411241, "grad_norm": 2.9475255012512207, "learning_rate": 4.019985235262863e-06, "loss": 2.236552429199219, "memory(GiB)": 130.55, "step": 12715, "token_acc": 0.5195296094078119, "train_speed(iter/s)": 1.192051 }, { "epoch": 4.359150102810144, "grad_norm": 1.9065223932266235, "learning_rate": 3.998863988923934e-06, "loss": 2.140062713623047, "memory(GiB)": 130.55, "step": 12720, "token_acc": 0.5309389874513197, "train_speed(iter/s)": 1.192053 }, { "epoch": 4.3608636052090475, "grad_norm": 2.1302590370178223, "learning_rate": 3.977796063511263e-06, "loss": 2.3513031005859375, "memory(GiB)": 130.55, "step": 12725, "token_acc": 0.5049180327868853, "train_speed(iter/s)": 1.192063 }, { "epoch": 4.362577107607951, "grad_norm": 2.3037071228027344, "learning_rate": 3.956781483445166e-06, "loss": 2.241806411743164, "memory(GiB)": 130.55, "step": 12730, "token_acc": 0.5161290322580645, "train_speed(iter/s)": 1.192049 }, { "epoch": 4.364290610006854, "grad_norm": 1.9048258066177368, "learning_rate": 3.935820273084073e-06, "loss": 2.2055727005004884, "memory(GiB)": 130.55, "step": 12735, "token_acc": 0.5090609555189456, "train_speed(iter/s)": 1.192056 }, { "epoch": 4.366004112405758, "grad_norm": 2.1472604274749756, "learning_rate": 3.914912456724607e-06, "loss": 2.2388031005859377, "memory(GiB)": 130.55, "step": 12740, "token_acc": 0.5230263157894737, "train_speed(iter/s)": 1.192094 }, { "epoch": 4.36771761480466, "grad_norm": 2.0233285427093506, "learning_rate": 3.894058058601469e-06, "loss": 2.189749526977539, "memory(GiB)": 130.55, "step": 12745, "token_acc": 0.5241171403962102, "train_speed(iter/s)": 1.192064 }, { "epoch": 4.369431117203564, "grad_norm": 2.4155516624450684, "learning_rate": 3.873257102887456e-06, "loss": 2.255877876281738, "memory(GiB)": 130.55, "step": 12750, "token_acc": 0.52007136485281, "train_speed(iter/s)": 1.192071 }, { "epoch": 4.371144619602467, "grad_norm": 2.132559061050415, "learning_rate": 3.852509613693401e-06, "loss": 2.279679870605469, "memory(GiB)": 130.55, "step": 12755, "token_acc": 0.524538496172895, "train_speed(iter/s)": 1.19211 }, { "epoch": 4.372858122001371, "grad_norm": 1.8754618167877197, "learning_rate": 3.831815615068185e-06, "loss": 2.3353082656860353, "memory(GiB)": 130.55, "step": 12760, "token_acc": 0.5066666666666667, "train_speed(iter/s)": 1.192048 }, { "epoch": 4.374571624400274, "grad_norm": 2.103724479675293, "learning_rate": 3.8111751309986622e-06, "loss": 2.3774368286132814, "memory(GiB)": 130.55, "step": 12765, "token_acc": 0.4849802371541502, "train_speed(iter/s)": 1.192048 }, { "epoch": 4.376285126799178, "grad_norm": 2.1614725589752197, "learning_rate": 3.7905881854096827e-06, "loss": 2.257037353515625, "memory(GiB)": 130.55, "step": 12770, "token_acc": 0.5285257873117298, "train_speed(iter/s)": 1.19206 }, { "epoch": 4.377998629198081, "grad_norm": 2.1127727031707764, "learning_rate": 3.7700548021640193e-06, "loss": 2.242336654663086, "memory(GiB)": 130.55, "step": 12775, "token_acc": 0.5238715277777778, "train_speed(iter/s)": 1.192076 }, { "epoch": 4.3797121315969845, "grad_norm": 1.953426480293274, "learning_rate": 3.7495750050623725e-06, "loss": 2.147279739379883, "memory(GiB)": 130.55, "step": 12780, "token_acc": 0.5289330922242315, "train_speed(iter/s)": 1.192116 }, { "epoch": 4.381425633995888, "grad_norm": 1.841266393661499, "learning_rate": 3.729148817843331e-06, "loss": 2.0980653762817383, "memory(GiB)": 130.55, "step": 12785, "token_acc": 0.5544763513513513, "train_speed(iter/s)": 1.192155 }, { "epoch": 4.383139136394791, "grad_norm": 1.8823156356811523, "learning_rate": 3.7087762641833223e-06, "loss": 2.2820159912109377, "memory(GiB)": 130.55, "step": 12790, "token_acc": 0.5222083852220839, "train_speed(iter/s)": 1.19217 }, { "epoch": 4.384852638793694, "grad_norm": 2.185560464859009, "learning_rate": 3.688457367696635e-06, "loss": 2.1605710983276367, "memory(GiB)": 130.55, "step": 12795, "token_acc": 0.5272349272349273, "train_speed(iter/s)": 1.192139 }, { "epoch": 4.386566141192597, "grad_norm": 2.0944507122039795, "learning_rate": 3.6681921519353347e-06, "loss": 2.2350442886352537, "memory(GiB)": 130.55, "step": 12800, "token_acc": 0.5291173794358508, "train_speed(iter/s)": 1.192145 }, { "epoch": 4.388279643591501, "grad_norm": 1.8316121101379395, "learning_rate": 3.6479806403892757e-06, "loss": 2.1359119415283203, "memory(GiB)": 130.55, "step": 12805, "token_acc": 0.5357142857142857, "train_speed(iter/s)": 1.192178 }, { "epoch": 4.389993145990404, "grad_norm": 1.937981367111206, "learning_rate": 3.627822856486074e-06, "loss": 2.1874156951904298, "memory(GiB)": 130.55, "step": 12810, "token_acc": 0.5309090909090909, "train_speed(iter/s)": 1.192229 }, { "epoch": 4.391706648389308, "grad_norm": 2.137779951095581, "learning_rate": 3.607718823591061e-06, "loss": 2.2154878616333007, "memory(GiB)": 130.55, "step": 12815, "token_acc": 0.5321691176470589, "train_speed(iter/s)": 1.19226 }, { "epoch": 4.393420150788211, "grad_norm": 1.8185510635375977, "learning_rate": 3.5876685650072626e-06, "loss": 2.286126708984375, "memory(GiB)": 130.55, "step": 12820, "token_acc": 0.5204630012401819, "train_speed(iter/s)": 1.192217 }, { "epoch": 4.395133653187115, "grad_norm": 1.8371931314468384, "learning_rate": 3.5676721039753723e-06, "loss": 2.3029455184936523, "memory(GiB)": 130.55, "step": 12825, "token_acc": 0.5161153519932146, "train_speed(iter/s)": 1.192244 }, { "epoch": 4.396847155586018, "grad_norm": 2.1442084312438965, "learning_rate": 3.547729463673716e-06, "loss": 2.258632469177246, "memory(GiB)": 130.55, "step": 12830, "token_acc": 0.5044776119402985, "train_speed(iter/s)": 1.192284 }, { "epoch": 4.3985606579849215, "grad_norm": 2.200639247894287, "learning_rate": 3.5278406672182518e-06, "loss": 2.1374837875366213, "memory(GiB)": 130.55, "step": 12835, "token_acc": 0.5400996828273674, "train_speed(iter/s)": 1.192188 }, { "epoch": 4.400274160383825, "grad_norm": 2.0080394744873047, "learning_rate": 3.508005737662523e-06, "loss": 2.2512025833129883, "memory(GiB)": 130.55, "step": 12840, "token_acc": 0.5166466105094264, "train_speed(iter/s)": 1.192202 }, { "epoch": 4.401987662782728, "grad_norm": 2.322605609893799, "learning_rate": 3.488224697997633e-06, "loss": 2.430258369445801, "memory(GiB)": 130.55, "step": 12845, "token_acc": 0.492089925062448, "train_speed(iter/s)": 1.192203 }, { "epoch": 4.403701165181631, "grad_norm": 2.175231695175171, "learning_rate": 3.4684975711522183e-06, "loss": 2.2955774307250976, "memory(GiB)": 130.55, "step": 12850, "token_acc": 0.5024019215372297, "train_speed(iter/s)": 1.192239 }, { "epoch": 4.405414667580534, "grad_norm": 1.9625951051712036, "learning_rate": 3.448824379992427e-06, "loss": 2.325702095031738, "memory(GiB)": 130.55, "step": 12855, "token_acc": 0.5100349040139616, "train_speed(iter/s)": 1.192267 }, { "epoch": 4.407128169979438, "grad_norm": 1.8548829555511475, "learning_rate": 3.4292051473218787e-06, "loss": 2.2490001678466798, "memory(GiB)": 130.55, "step": 12860, "token_acc": 0.5179640718562875, "train_speed(iter/s)": 1.19225 }, { "epoch": 4.408841672378341, "grad_norm": 2.4249424934387207, "learning_rate": 3.4096398958816558e-06, "loss": 2.311792755126953, "memory(GiB)": 130.55, "step": 12865, "token_acc": 0.4960767218831735, "train_speed(iter/s)": 1.192279 }, { "epoch": 4.410555174777245, "grad_norm": 1.7863249778747559, "learning_rate": 3.390128648350277e-06, "loss": 2.190296745300293, "memory(GiB)": 130.55, "step": 12870, "token_acc": 0.5141488898563343, "train_speed(iter/s)": 1.192245 }, { "epoch": 4.412268677176148, "grad_norm": 2.033595561981201, "learning_rate": 3.3706714273436578e-06, "loss": 2.2069585800170897, "memory(GiB)": 130.55, "step": 12875, "token_acc": 0.5146147032772365, "train_speed(iter/s)": 1.192247 }, { "epoch": 4.413982179575052, "grad_norm": 1.9131689071655273, "learning_rate": 3.3512682554150853e-06, "loss": 2.1586496353149416, "memory(GiB)": 130.55, "step": 12880, "token_acc": 0.5357917570498916, "train_speed(iter/s)": 1.192261 }, { "epoch": 4.415695681973955, "grad_norm": 1.851587176322937, "learning_rate": 3.331919155055213e-06, "loss": 2.161033821105957, "memory(GiB)": 130.55, "step": 12885, "token_acc": 0.5180199739470256, "train_speed(iter/s)": 1.192294 }, { "epoch": 4.4174091843728585, "grad_norm": 1.9654195308685303, "learning_rate": 3.3126241486920007e-06, "loss": 2.208755683898926, "memory(GiB)": 130.55, "step": 12890, "token_acc": 0.5139802631578947, "train_speed(iter/s)": 1.192228 }, { "epoch": 4.419122686771761, "grad_norm": 2.100290298461914, "learning_rate": 3.293383258690702e-06, "loss": 2.1888168334960936, "memory(GiB)": 130.55, "step": 12895, "token_acc": 0.5092281879194631, "train_speed(iter/s)": 1.19223 }, { "epoch": 4.4208361891706645, "grad_norm": 2.0742931365966797, "learning_rate": 3.274196507353866e-06, "loss": 2.103236770629883, "memory(GiB)": 130.55, "step": 12900, "token_acc": 0.539974348011971, "train_speed(iter/s)": 1.19227 }, { "epoch": 4.422549691569568, "grad_norm": 2.289883613586426, "learning_rate": 3.2550639169212804e-06, "loss": 2.2165441513061523, "memory(GiB)": 130.55, "step": 12905, "token_acc": 0.5223040277176266, "train_speed(iter/s)": 1.192301 }, { "epoch": 4.424263193968471, "grad_norm": 2.2695419788360596, "learning_rate": 3.235985509569944e-06, "loss": 2.100112533569336, "memory(GiB)": 130.55, "step": 12910, "token_acc": 0.5295385942216473, "train_speed(iter/s)": 1.192299 }, { "epoch": 4.425976696367375, "grad_norm": 1.9740076065063477, "learning_rate": 3.216961307414068e-06, "loss": 2.260842704772949, "memory(GiB)": 130.55, "step": 12915, "token_acc": 0.5037437603993344, "train_speed(iter/s)": 1.192313 }, { "epoch": 4.427690198766278, "grad_norm": 1.9310466051101685, "learning_rate": 3.197991332505018e-06, "loss": 2.2285717010498045, "memory(GiB)": 130.55, "step": 12920, "token_acc": 0.5263157894736842, "train_speed(iter/s)": 1.192296 }, { "epoch": 4.429403701165182, "grad_norm": 2.179133653640747, "learning_rate": 3.1790756068312942e-06, "loss": 2.2317279815673827, "memory(GiB)": 130.55, "step": 12925, "token_acc": 0.526039563988696, "train_speed(iter/s)": 1.192327 }, { "epoch": 4.431117203564085, "grad_norm": 2.014047622680664, "learning_rate": 3.1602141523185415e-06, "loss": 2.1434293746948243, "memory(GiB)": 130.55, "step": 12930, "token_acc": 0.5408664582402859, "train_speed(iter/s)": 1.192338 }, { "epoch": 4.432830705962989, "grad_norm": 2.1974940299987793, "learning_rate": 3.141406990829482e-06, "loss": 2.252669334411621, "memory(GiB)": 130.55, "step": 12935, "token_acc": 0.5238303454306953, "train_speed(iter/s)": 1.192335 }, { "epoch": 4.434544208361892, "grad_norm": 1.989583134651184, "learning_rate": 3.1226541441639113e-06, "loss": 2.2739336013793947, "memory(GiB)": 130.55, "step": 12940, "token_acc": 0.5240527884206045, "train_speed(iter/s)": 1.192344 }, { "epoch": 4.4362577107607954, "grad_norm": 2.0979769229888916, "learning_rate": 3.103955634058675e-06, "loss": 2.2493732452392576, "memory(GiB)": 130.55, "step": 12945, "token_acc": 0.5258932155760738, "train_speed(iter/s)": 1.192262 }, { "epoch": 4.437971213159699, "grad_norm": 1.9209474325180054, "learning_rate": 3.0853114821876193e-06, "loss": 2.245421600341797, "memory(GiB)": 130.55, "step": 12950, "token_acc": 0.5115618661257606, "train_speed(iter/s)": 1.192279 }, { "epoch": 4.4396847155586014, "grad_norm": 1.987107515335083, "learning_rate": 3.0667217101615796e-06, "loss": 2.1953403472900392, "memory(GiB)": 130.55, "step": 12955, "token_acc": 0.5315277190012696, "train_speed(iter/s)": 1.192311 }, { "epoch": 4.441398217957505, "grad_norm": 1.977878451347351, "learning_rate": 3.0481863395283804e-06, "loss": 2.356878662109375, "memory(GiB)": 130.55, "step": 12960, "token_acc": 0.5076335877862596, "train_speed(iter/s)": 1.192293 }, { "epoch": 4.443111720356408, "grad_norm": 1.915973424911499, "learning_rate": 3.029705391772769e-06, "loss": 2.2342628479003905, "memory(GiB)": 130.55, "step": 12965, "token_acc": 0.516142735768904, "train_speed(iter/s)": 1.192341 }, { "epoch": 4.444825222755312, "grad_norm": 2.115351438522339, "learning_rate": 3.011278888316421e-06, "loss": 2.13980770111084, "memory(GiB)": 130.55, "step": 12970, "token_acc": 0.528932704672096, "train_speed(iter/s)": 1.192382 }, { "epoch": 4.446538725154215, "grad_norm": 2.1975457668304443, "learning_rate": 2.992906850517907e-06, "loss": 2.2266729354858397, "memory(GiB)": 130.55, "step": 12975, "token_acc": 0.5326906957250629, "train_speed(iter/s)": 1.192376 }, { "epoch": 4.448252227553119, "grad_norm": 1.8491641283035278, "learning_rate": 2.974589299672653e-06, "loss": 2.2508182525634766, "memory(GiB)": 130.55, "step": 12980, "token_acc": 0.5210594876248371, "train_speed(iter/s)": 1.19241 }, { "epoch": 4.449965729952022, "grad_norm": 2.001997709274292, "learning_rate": 2.956326257012937e-06, "loss": 2.197398376464844, "memory(GiB)": 130.55, "step": 12985, "token_acc": 0.5284220100045475, "train_speed(iter/s)": 1.192364 }, { "epoch": 4.4516792323509256, "grad_norm": 2.0495705604553223, "learning_rate": 2.9381177437078468e-06, "loss": 2.3276044845581056, "memory(GiB)": 130.55, "step": 12990, "token_acc": 0.50066401062417, "train_speed(iter/s)": 1.192376 }, { "epoch": 4.453392734749829, "grad_norm": 2.238931655883789, "learning_rate": 2.9199637808632717e-06, "loss": 2.2004266738891602, "memory(GiB)": 130.55, "step": 12995, "token_acc": 0.5181619256017506, "train_speed(iter/s)": 1.192409 }, { "epoch": 4.4551062371487316, "grad_norm": 1.969020128250122, "learning_rate": 2.901864389521869e-06, "loss": 2.236370849609375, "memory(GiB)": 130.55, "step": 13000, "token_acc": 0.509449135504624, "train_speed(iter/s)": 1.192381 }, { "epoch": 4.4551062371487316, "eval_loss": 2.140795946121216, "eval_runtime": 3.7132, "eval_samples_per_second": 26.931, "eval_steps_per_second": 26.931, "eval_token_acc": 0.5079136690647482, "step": 13000 }, { "epoch": 4.456819739547635, "grad_norm": 1.957226037979126, "learning_rate": 2.883819590663045e-06, "loss": 2.2233627319335936, "memory(GiB)": 130.55, "step": 13005, "token_acc": 0.5205659756498848, "train_speed(iter/s)": 1.191813 }, { "epoch": 4.458533241946538, "grad_norm": 1.8799093961715698, "learning_rate": 2.8658294052029245e-06, "loss": 2.2253349304199217, "memory(GiB)": 130.55, "step": 13010, "token_acc": 0.5142284569138277, "train_speed(iter/s)": 1.19184 }, { "epoch": 4.460246744345442, "grad_norm": 2.0858514308929443, "learning_rate": 2.8478938539943213e-06, "loss": 2.223971939086914, "memory(GiB)": 130.55, "step": 13015, "token_acc": 0.5268313458262351, "train_speed(iter/s)": 1.191848 }, { "epoch": 4.461960246744345, "grad_norm": 2.087592363357544, "learning_rate": 2.8300129578267163e-06, "loss": 2.2612020492553713, "memory(GiB)": 130.55, "step": 13020, "token_acc": 0.5193832599118943, "train_speed(iter/s)": 1.1918 }, { "epoch": 4.463673749143249, "grad_norm": 2.10722017288208, "learning_rate": 2.8121867374262587e-06, "loss": 2.364531898498535, "memory(GiB)": 130.55, "step": 13025, "token_acc": 0.5083125519534497, "train_speed(iter/s)": 1.191834 }, { "epoch": 4.465387251542152, "grad_norm": 1.9532843828201294, "learning_rate": 2.7944152134557087e-06, "loss": 2.2811466217041017, "memory(GiB)": 130.55, "step": 13030, "token_acc": 0.510221465076661, "train_speed(iter/s)": 1.191861 }, { "epoch": 4.467100753941056, "grad_norm": 2.0534868240356445, "learning_rate": 2.7766984065144386e-06, "loss": 2.2762523651123048, "memory(GiB)": 130.55, "step": 13035, "token_acc": 0.5175031632222691, "train_speed(iter/s)": 1.191808 }, { "epoch": 4.468814256339959, "grad_norm": 2.378864049911499, "learning_rate": 2.759036337138382e-06, "loss": 2.151997184753418, "memory(GiB)": 130.55, "step": 13040, "token_acc": 0.5403726708074534, "train_speed(iter/s)": 1.191866 }, { "epoch": 4.4705277587388625, "grad_norm": 2.0602123737335205, "learning_rate": 2.741429025800035e-06, "loss": 2.266587257385254, "memory(GiB)": 130.55, "step": 13045, "token_acc": 0.5150501672240803, "train_speed(iter/s)": 1.191877 }, { "epoch": 4.472241261137766, "grad_norm": 1.9987930059432983, "learning_rate": 2.723876492908406e-06, "loss": 2.2462202072143556, "memory(GiB)": 130.55, "step": 13050, "token_acc": 0.5323617659665667, "train_speed(iter/s)": 1.191906 }, { "epoch": 4.4739547635366685, "grad_norm": 2.2764523029327393, "learning_rate": 2.706378758809025e-06, "loss": 2.1741737365722655, "memory(GiB)": 130.55, "step": 13055, "token_acc": 0.5249199817101051, "train_speed(iter/s)": 1.191927 }, { "epoch": 4.475668265935572, "grad_norm": 1.9932414293289185, "learning_rate": 2.6889358437839073e-06, "loss": 2.1476173400878906, "memory(GiB)": 130.55, "step": 13060, "token_acc": 0.5279187817258884, "train_speed(iter/s)": 1.191926 }, { "epoch": 4.477381768334475, "grad_norm": 1.9166350364685059, "learning_rate": 2.671547768051519e-06, "loss": 2.241679000854492, "memory(GiB)": 130.55, "step": 13065, "token_acc": 0.5211608222490931, "train_speed(iter/s)": 1.191934 }, { "epoch": 4.479095270733379, "grad_norm": 1.9657400846481323, "learning_rate": 2.6542145517667595e-06, "loss": 2.167758560180664, "memory(GiB)": 130.55, "step": 13070, "token_acc": 0.5309814563545907, "train_speed(iter/s)": 1.191987 }, { "epoch": 4.480808773132282, "grad_norm": 2.0002527236938477, "learning_rate": 2.6369362150209296e-06, "loss": 2.133269500732422, "memory(GiB)": 130.55, "step": 13075, "token_acc": 0.5262237762237763, "train_speed(iter/s)": 1.191991 }, { "epoch": 4.482522275531186, "grad_norm": 2.041893243789673, "learning_rate": 2.619712777841743e-06, "loss": 2.2124698638916014, "memory(GiB)": 130.55, "step": 13080, "token_acc": 0.5109797297297297, "train_speed(iter/s)": 1.191976 }, { "epoch": 4.484235777930089, "grad_norm": 2.1387083530426025, "learning_rate": 2.6025442601932515e-06, "loss": 2.1286188125610352, "memory(GiB)": 130.55, "step": 13085, "token_acc": 0.5268336314847942, "train_speed(iter/s)": 1.192009 }, { "epoch": 4.485949280328993, "grad_norm": 2.223896026611328, "learning_rate": 2.5854306819758646e-06, "loss": 2.2582538604736326, "memory(GiB)": 130.55, "step": 13090, "token_acc": 0.5116883116883116, "train_speed(iter/s)": 1.19202 }, { "epoch": 4.487662782727896, "grad_norm": 1.9430543184280396, "learning_rate": 2.5683720630263087e-06, "loss": 2.2102542877197267, "memory(GiB)": 130.55, "step": 13095, "token_acc": 0.5185185185185185, "train_speed(iter/s)": 1.192062 }, { "epoch": 4.4893762851267995, "grad_norm": 1.9938486814498901, "learning_rate": 2.5513684231176005e-06, "loss": 2.239934730529785, "memory(GiB)": 130.55, "step": 13100, "token_acc": 0.5089058524173028, "train_speed(iter/s)": 1.192087 }, { "epoch": 4.491089787525702, "grad_norm": 2.1035709381103516, "learning_rate": 2.534419781959041e-06, "loss": 2.1769598007202147, "memory(GiB)": 130.55, "step": 13105, "token_acc": 0.5262214253697893, "train_speed(iter/s)": 1.192136 }, { "epoch": 4.4928032899246055, "grad_norm": 1.9824103116989136, "learning_rate": 2.517526159196171e-06, "loss": 2.183398628234863, "memory(GiB)": 130.55, "step": 13110, "token_acc": 0.5318866253321524, "train_speed(iter/s)": 1.19215 }, { "epoch": 4.494516792323509, "grad_norm": 2.228956699371338, "learning_rate": 2.5006875744107374e-06, "loss": 2.1679960250854493, "memory(GiB)": 130.55, "step": 13115, "token_acc": 0.5269109000452284, "train_speed(iter/s)": 1.192165 }, { "epoch": 4.496230294722412, "grad_norm": 2.085447311401367, "learning_rate": 2.4839040471207386e-06, "loss": 2.1284339904785154, "memory(GiB)": 130.55, "step": 13120, "token_acc": 0.5177882554650665, "train_speed(iter/s)": 1.192186 }, { "epoch": 4.497943797121316, "grad_norm": 1.9792338609695435, "learning_rate": 2.4671755967803133e-06, "loss": 2.138764762878418, "memory(GiB)": 130.55, "step": 13125, "token_acc": 0.5193562418442801, "train_speed(iter/s)": 1.192194 }, { "epoch": 4.499657299520219, "grad_norm": 2.5649759769439697, "learning_rate": 2.450502242779784e-06, "loss": 2.264206314086914, "memory(GiB)": 130.55, "step": 13130, "token_acc": 0.5167902311382468, "train_speed(iter/s)": 1.192238 }, { "epoch": 4.501370801919123, "grad_norm": 2.023648738861084, "learning_rate": 2.433884004445608e-06, "loss": 2.2826164245605467, "memory(GiB)": 130.55, "step": 13135, "token_acc": 0.5121746431570109, "train_speed(iter/s)": 1.19225 }, { "epoch": 4.503084304318026, "grad_norm": 2.9367003440856934, "learning_rate": 2.4173209010403374e-06, "loss": 2.190526580810547, "memory(GiB)": 130.55, "step": 13140, "token_acc": 0.5267778753292361, "train_speed(iter/s)": 1.192278 }, { "epoch": 4.50479780671693, "grad_norm": 2.209721088409424, "learning_rate": 2.4008129517626376e-06, "loss": 2.1333650588989257, "memory(GiB)": 130.55, "step": 13145, "token_acc": 0.5206866197183099, "train_speed(iter/s)": 1.192316 }, { "epoch": 4.506511309115833, "grad_norm": 1.9125996828079224, "learning_rate": 2.384360175747219e-06, "loss": 2.319834327697754, "memory(GiB)": 130.55, "step": 13150, "token_acc": 0.52140549273021, "train_speed(iter/s)": 1.192356 }, { "epoch": 4.5082248115147365, "grad_norm": 1.9971463680267334, "learning_rate": 2.3679625920648707e-06, "loss": 2.1484798431396483, "memory(GiB)": 130.55, "step": 13155, "token_acc": 0.5250223413762288, "train_speed(iter/s)": 1.192355 }, { "epoch": 4.50993831391364, "grad_norm": 2.2031445503234863, "learning_rate": 2.351620219722389e-06, "loss": 2.154213523864746, "memory(GiB)": 130.55, "step": 13160, "token_acc": 0.5277401894451962, "train_speed(iter/s)": 1.192385 }, { "epoch": 4.5116518163125425, "grad_norm": 2.102325439453125, "learning_rate": 2.3353330776625826e-06, "loss": 2.1999759674072266, "memory(GiB)": 130.55, "step": 13165, "token_acc": 0.5172872340425532, "train_speed(iter/s)": 1.192425 }, { "epoch": 4.513365318711446, "grad_norm": 2.1456406116485596, "learning_rate": 2.319101184764222e-06, "loss": 2.1586435317993162, "memory(GiB)": 130.55, "step": 13170, "token_acc": 0.5320018157058557, "train_speed(iter/s)": 1.192453 }, { "epoch": 4.515078821110349, "grad_norm": 2.0400447845458984, "learning_rate": 2.302924559842057e-06, "loss": 2.284897041320801, "memory(GiB)": 130.55, "step": 13175, "token_acc": 0.5220768601798855, "train_speed(iter/s)": 1.192465 }, { "epoch": 4.516792323509253, "grad_norm": 1.8985552787780762, "learning_rate": 2.286803221646766e-06, "loss": 2.2878055572509766, "memory(GiB)": 130.55, "step": 13180, "token_acc": 0.5159362549800797, "train_speed(iter/s)": 1.192455 }, { "epoch": 4.518505825908156, "grad_norm": 2.105621814727783, "learning_rate": 2.2707371888649464e-06, "loss": 2.2129741668701173, "memory(GiB)": 130.55, "step": 13185, "token_acc": 0.5195412064570943, "train_speed(iter/s)": 1.192464 }, { "epoch": 4.52021932830706, "grad_norm": 2.130758047103882, "learning_rate": 2.2547264801190904e-06, "loss": 2.1573225021362306, "memory(GiB)": 130.55, "step": 13190, "token_acc": 0.5279734769995856, "train_speed(iter/s)": 1.192414 }, { "epoch": 4.521932830705963, "grad_norm": 1.892917275428772, "learning_rate": 2.238771113967564e-06, "loss": 2.2255414962768554, "memory(GiB)": 130.55, "step": 13195, "token_acc": 0.5134687111479486, "train_speed(iter/s)": 1.192429 }, { "epoch": 4.523646333104867, "grad_norm": 2.093857526779175, "learning_rate": 2.222871108904584e-06, "loss": 2.163463592529297, "memory(GiB)": 130.55, "step": 13200, "token_acc": 0.5293569431500466, "train_speed(iter/s)": 1.192484 }, { "epoch": 4.52535983550377, "grad_norm": 1.8592720031738281, "learning_rate": 2.2070264833601917e-06, "loss": 2.218959999084473, "memory(GiB)": 130.55, "step": 13205, "token_acc": 0.5290269828291088, "train_speed(iter/s)": 1.19253 }, { "epoch": 4.527073337902673, "grad_norm": 2.3239498138427734, "learning_rate": 2.1912372557002405e-06, "loss": 2.3249126434326173, "memory(GiB)": 130.55, "step": 13210, "token_acc": 0.4989535370447886, "train_speed(iter/s)": 1.192543 }, { "epoch": 4.528786840301576, "grad_norm": 1.9627690315246582, "learning_rate": 2.175503444226368e-06, "loss": 2.1890602111816406, "memory(GiB)": 130.55, "step": 13215, "token_acc": 0.5191212367778681, "train_speed(iter/s)": 1.192543 }, { "epoch": 4.5305003427004795, "grad_norm": 2.160715341567993, "learning_rate": 2.15982506717598e-06, "loss": 2.211734580993652, "memory(GiB)": 130.55, "step": 13220, "token_acc": 0.5229485396383866, "train_speed(iter/s)": 1.192552 }, { "epoch": 4.532213845099383, "grad_norm": 1.6879234313964844, "learning_rate": 2.144202142722229e-06, "loss": 2.2732002258300783, "memory(GiB)": 130.55, "step": 13225, "token_acc": 0.5157415078707539, "train_speed(iter/s)": 1.192514 }, { "epoch": 4.533927347498286, "grad_norm": 1.9406267404556274, "learning_rate": 2.1286346889739963e-06, "loss": 2.161721420288086, "memory(GiB)": 130.55, "step": 13230, "token_acc": 0.5146096816397733, "train_speed(iter/s)": 1.192465 }, { "epoch": 4.53564084989719, "grad_norm": 2.012801170349121, "learning_rate": 2.1131227239758524e-06, "loss": 2.302885055541992, "memory(GiB)": 130.55, "step": 13235, "token_acc": 0.5109518120270808, "train_speed(iter/s)": 1.19249 }, { "epoch": 4.537354352296093, "grad_norm": 2.240229845046997, "learning_rate": 2.097666265708059e-06, "loss": 2.0561485290527344, "memory(GiB)": 130.55, "step": 13240, "token_acc": 0.5385934819897084, "train_speed(iter/s)": 1.192494 }, { "epoch": 4.539067854694997, "grad_norm": 1.8579535484313965, "learning_rate": 2.082265332086536e-06, "loss": 2.239406967163086, "memory(GiB)": 130.55, "step": 13245, "token_acc": 0.5341282894736842, "train_speed(iter/s)": 1.192494 }, { "epoch": 4.5407813570939, "grad_norm": 1.9942742586135864, "learning_rate": 2.066919940962836e-06, "loss": 2.3036733627319337, "memory(GiB)": 130.55, "step": 13250, "token_acc": 0.5213232363491431, "train_speed(iter/s)": 1.192395 }, { "epoch": 4.542494859492804, "grad_norm": 2.1158878803253174, "learning_rate": 2.0516301101241476e-06, "loss": 2.2368568420410155, "memory(GiB)": 130.55, "step": 13255, "token_acc": 0.5118521494576135, "train_speed(iter/s)": 1.192431 }, { "epoch": 4.544208361891707, "grad_norm": 1.9653867483139038, "learning_rate": 2.0363958572932494e-06, "loss": 2.2116439819335936, "memory(GiB)": 130.55, "step": 13260, "token_acc": 0.5223813786929274, "train_speed(iter/s)": 1.19246 }, { "epoch": 4.54592186429061, "grad_norm": 1.8769257068634033, "learning_rate": 2.021217200128489e-06, "loss": 2.1281652450561523, "memory(GiB)": 130.55, "step": 13265, "token_acc": 0.5417212756662297, "train_speed(iter/s)": 1.192467 }, { "epoch": 4.547635366689513, "grad_norm": 2.287095069885254, "learning_rate": 2.006094156223792e-06, "loss": 2.1768863677978514, "memory(GiB)": 130.55, "step": 13270, "token_acc": 0.5323805270209915, "train_speed(iter/s)": 1.192458 }, { "epoch": 4.5493488690884165, "grad_norm": 1.914363980293274, "learning_rate": 1.9910267431086093e-06, "loss": 2.2093425750732423, "memory(GiB)": 130.55, "step": 13275, "token_acc": 0.531578947368421, "train_speed(iter/s)": 1.192476 }, { "epoch": 4.55106237148732, "grad_norm": 1.9819812774658203, "learning_rate": 1.9760149782478975e-06, "loss": 2.1602123260498045, "memory(GiB)": 130.55, "step": 13280, "token_acc": 0.5310287380258226, "train_speed(iter/s)": 1.192485 }, { "epoch": 4.552775873886223, "grad_norm": 2.132054567337036, "learning_rate": 1.961058879042138e-06, "loss": 2.3337148666381835, "memory(GiB)": 130.55, "step": 13285, "token_acc": 0.5112658746415404, "train_speed(iter/s)": 1.192498 }, { "epoch": 4.554489376285127, "grad_norm": 2.1721301078796387, "learning_rate": 1.946158462827263e-06, "loss": 2.113842010498047, "memory(GiB)": 130.55, "step": 13290, "token_acc": 0.5392028033289531, "train_speed(iter/s)": 1.192526 }, { "epoch": 4.55620287868403, "grad_norm": 2.196781635284424, "learning_rate": 1.9313137468746858e-06, "loss": 2.270059013366699, "memory(GiB)": 130.55, "step": 13295, "token_acc": 0.50375, "train_speed(iter/s)": 1.192515 }, { "epoch": 4.557916381082934, "grad_norm": 2.422428846359253, "learning_rate": 1.916524748391224e-06, "loss": 2.1157073974609375, "memory(GiB)": 130.55, "step": 13300, "token_acc": 0.5205770046669496, "train_speed(iter/s)": 1.19252 }, { "epoch": 4.559629883481837, "grad_norm": 1.977500319480896, "learning_rate": 1.9017914845191443e-06, "loss": 2.2093929290771483, "memory(GiB)": 130.55, "step": 13305, "token_acc": 0.5234812580784145, "train_speed(iter/s)": 1.19255 }, { "epoch": 4.561343385880741, "grad_norm": 1.9962948560714722, "learning_rate": 1.8871139723360908e-06, "loss": 2.2759498596191405, "memory(GiB)": 130.55, "step": 13310, "token_acc": 0.5185185185185185, "train_speed(iter/s)": 1.192461 }, { "epoch": 4.563056888279643, "grad_norm": 2.0633583068847656, "learning_rate": 1.8724922288550828e-06, "loss": 2.2301700592041014, "memory(GiB)": 130.55, "step": 13315, "token_acc": 0.5261162594776748, "train_speed(iter/s)": 1.19247 }, { "epoch": 4.564770390678547, "grad_norm": 1.8568263053894043, "learning_rate": 1.8579262710245181e-06, "loss": 2.29565544128418, "memory(GiB)": 130.55, "step": 13320, "token_acc": 0.5111018014243821, "train_speed(iter/s)": 1.192401 }, { "epoch": 4.56648389307745, "grad_norm": 2.1522772312164307, "learning_rate": 1.8434161157281139e-06, "loss": 2.154096794128418, "memory(GiB)": 130.55, "step": 13325, "token_acc": 0.5319614711033275, "train_speed(iter/s)": 1.192407 }, { "epoch": 4.5681973954763535, "grad_norm": 2.187654733657837, "learning_rate": 1.8289617797849045e-06, "loss": 2.222547149658203, "memory(GiB)": 130.55, "step": 13330, "token_acc": 0.5264516129032258, "train_speed(iter/s)": 1.192428 }, { "epoch": 4.569910897875257, "grad_norm": 2.1167640686035156, "learning_rate": 1.8145632799492273e-06, "loss": 2.2002214431762694, "memory(GiB)": 130.55, "step": 13335, "token_acc": 0.5125980999586948, "train_speed(iter/s)": 1.192468 }, { "epoch": 4.57162440027416, "grad_norm": 2.104156255722046, "learning_rate": 1.8002206329107097e-06, "loss": 2.1960565567016603, "memory(GiB)": 130.55, "step": 13340, "token_acc": 0.5188127090301003, "train_speed(iter/s)": 1.192506 }, { "epoch": 4.573337902673064, "grad_norm": 2.042083501815796, "learning_rate": 1.785933855294214e-06, "loss": 2.3135570526123046, "memory(GiB)": 130.55, "step": 13345, "token_acc": 0.5111016225448335, "train_speed(iter/s)": 1.192479 }, { "epoch": 4.575051405071967, "grad_norm": 2.096120834350586, "learning_rate": 1.7717029636598715e-06, "loss": 2.1698665618896484, "memory(GiB)": 130.55, "step": 13350, "token_acc": 0.5230566534914362, "train_speed(iter/s)": 1.192496 }, { "epoch": 4.576764907470871, "grad_norm": 1.924142599105835, "learning_rate": 1.757527974503015e-06, "loss": 2.2049127578735352, "memory(GiB)": 130.55, "step": 13355, "token_acc": 0.5216326530612245, "train_speed(iter/s)": 1.192511 }, { "epoch": 4.578478409869774, "grad_norm": 1.8972200155258179, "learning_rate": 1.743408904254179e-06, "loss": 2.290947151184082, "memory(GiB)": 130.55, "step": 13360, "token_acc": 0.5089361702127659, "train_speed(iter/s)": 1.192418 }, { "epoch": 4.580191912268678, "grad_norm": 2.047126293182373, "learning_rate": 1.7293457692791003e-06, "loss": 2.257863998413086, "memory(GiB)": 130.55, "step": 13365, "token_acc": 0.5188517566409597, "train_speed(iter/s)": 1.192424 }, { "epoch": 4.58190541466758, "grad_norm": 2.2870383262634277, "learning_rate": 1.715338585878662e-06, "loss": 2.2249841690063477, "memory(GiB)": 130.55, "step": 13370, "token_acc": 0.5207373271889401, "train_speed(iter/s)": 1.192432 }, { "epoch": 4.583618917066484, "grad_norm": 1.9063935279846191, "learning_rate": 1.7013873702888927e-06, "loss": 2.217995452880859, "memory(GiB)": 130.55, "step": 13375, "token_acc": 0.526829268292683, "train_speed(iter/s)": 1.192459 }, { "epoch": 4.585332419465387, "grad_norm": 1.8834792375564575, "learning_rate": 1.6874921386809573e-06, "loss": 2.2113561630249023, "memory(GiB)": 130.55, "step": 13380, "token_acc": 0.5068172964550058, "train_speed(iter/s)": 1.192481 }, { "epoch": 4.5870459218642905, "grad_norm": 2.150836229324341, "learning_rate": 1.6736529071611274e-06, "loss": 2.2613874435424806, "memory(GiB)": 130.55, "step": 13385, "token_acc": 0.5146147032772365, "train_speed(iter/s)": 1.192401 }, { "epoch": 4.588759424263194, "grad_norm": 2.106550455093384, "learning_rate": 1.659869691770749e-06, "loss": 2.184904098510742, "memory(GiB)": 130.55, "step": 13390, "token_acc": 0.5236842105263158, "train_speed(iter/s)": 1.192427 }, { "epoch": 4.590472926662097, "grad_norm": 2.446132183074951, "learning_rate": 1.6461425084862592e-06, "loss": 2.2907896041870117, "memory(GiB)": 130.55, "step": 13395, "token_acc": 0.5127288968289415, "train_speed(iter/s)": 1.192422 }, { "epoch": 4.592186429061001, "grad_norm": 2.295562267303467, "learning_rate": 1.6324713732191298e-06, "loss": 2.23663215637207, "memory(GiB)": 130.55, "step": 13400, "token_acc": 0.5176724137931035, "train_speed(iter/s)": 1.192473 }, { "epoch": 4.593899931459904, "grad_norm": 1.7455228567123413, "learning_rate": 1.6188563018158853e-06, "loss": 2.223371887207031, "memory(GiB)": 130.55, "step": 13405, "token_acc": 0.507711038961039, "train_speed(iter/s)": 1.192487 }, { "epoch": 4.595613433858808, "grad_norm": 2.0891318321228027, "learning_rate": 1.605297310058046e-06, "loss": 2.1019218444824217, "memory(GiB)": 130.55, "step": 13410, "token_acc": 0.5375312760633861, "train_speed(iter/s)": 1.192452 }, { "epoch": 4.59732693625771, "grad_norm": 2.1735732555389404, "learning_rate": 1.5917944136621342e-06, "loss": 2.2766643524169923, "memory(GiB)": 130.55, "step": 13415, "token_acc": 0.5242932862190812, "train_speed(iter/s)": 1.192476 }, { "epoch": 4.599040438656614, "grad_norm": 2.0514185428619385, "learning_rate": 1.5783476282796638e-06, "loss": 2.1544002532958983, "memory(GiB)": 130.55, "step": 13420, "token_acc": 0.5154549412276883, "train_speed(iter/s)": 1.192474 }, { "epoch": 4.600753941055517, "grad_norm": 1.8379744291305542, "learning_rate": 1.5649569694970833e-06, "loss": 2.2629409790039063, "memory(GiB)": 130.55, "step": 13425, "token_acc": 0.5075098814229249, "train_speed(iter/s)": 1.192481 }, { "epoch": 4.602467443454421, "grad_norm": 1.8790940046310425, "learning_rate": 1.5516224528358104e-06, "loss": 2.138291931152344, "memory(GiB)": 130.55, "step": 13430, "token_acc": 0.532618025751073, "train_speed(iter/s)": 1.192489 }, { "epoch": 4.604180945853324, "grad_norm": 2.028090715408325, "learning_rate": 1.5383440937521753e-06, "loss": 2.1858856201171877, "memory(GiB)": 130.55, "step": 13435, "token_acc": 0.5259353741496599, "train_speed(iter/s)": 1.19251 }, { "epoch": 4.6058944482522275, "grad_norm": 2.0729176998138428, "learning_rate": 1.5251219076374113e-06, "loss": 2.189767837524414, "memory(GiB)": 130.55, "step": 13440, "token_acc": 0.5232751454696591, "train_speed(iter/s)": 1.19252 }, { "epoch": 4.607607950651131, "grad_norm": 2.052361249923706, "learning_rate": 1.5119559098176472e-06, "loss": 2.102676010131836, "memory(GiB)": 130.55, "step": 13445, "token_acc": 0.5391551969625059, "train_speed(iter/s)": 1.19253 }, { "epoch": 4.609321453050034, "grad_norm": 1.9775774478912354, "learning_rate": 1.4988461155538812e-06, "loss": 2.1650867462158203, "memory(GiB)": 130.55, "step": 13450, "token_acc": 0.5293333333333333, "train_speed(iter/s)": 1.192552 }, { "epoch": 4.611034955448938, "grad_norm": 2.4063682556152344, "learning_rate": 1.485792540041958e-06, "loss": 2.2993867874145506, "memory(GiB)": 130.55, "step": 13455, "token_acc": 0.5057034220532319, "train_speed(iter/s)": 1.192548 }, { "epoch": 4.612748457847841, "grad_norm": 2.0857880115509033, "learning_rate": 1.4727951984125688e-06, "loss": 2.3336536407470705, "memory(GiB)": 130.55, "step": 13460, "token_acc": 0.510119785212722, "train_speed(iter/s)": 1.192457 }, { "epoch": 4.614461960246745, "grad_norm": 2.1080079078674316, "learning_rate": 1.4598541057312176e-06, "loss": 2.1764995574951174, "memory(GiB)": 130.55, "step": 13465, "token_acc": 0.5294924554183813, "train_speed(iter/s)": 1.192463 }, { "epoch": 4.616175462645648, "grad_norm": 2.13507080078125, "learning_rate": 1.4469692769982057e-06, "loss": 2.202072525024414, "memory(GiB)": 130.55, "step": 13470, "token_acc": 0.5157667386609072, "train_speed(iter/s)": 1.192494 }, { "epoch": 4.617888965044551, "grad_norm": 2.1733438968658447, "learning_rate": 1.434140727148625e-06, "loss": 2.0586471557617188, "memory(GiB)": 130.55, "step": 13475, "token_acc": 0.55420054200542, "train_speed(iter/s)": 1.192524 }, { "epoch": 4.619602467443454, "grad_norm": 2.243208408355713, "learning_rate": 1.4213684710523256e-06, "loss": 2.142982292175293, "memory(GiB)": 130.55, "step": 13480, "token_acc": 0.541095890410959, "train_speed(iter/s)": 1.192549 }, { "epoch": 4.621315969842358, "grad_norm": 1.946907639503479, "learning_rate": 1.4086525235139093e-06, "loss": 2.183429718017578, "memory(GiB)": 130.55, "step": 13485, "token_acc": 0.5244993608862377, "train_speed(iter/s)": 1.19252 }, { "epoch": 4.623029472241261, "grad_norm": 2.301032304763794, "learning_rate": 1.3959928992727078e-06, "loss": 2.2633365631103515, "memory(GiB)": 130.55, "step": 13490, "token_acc": 0.5106382978723404, "train_speed(iter/s)": 1.192568 }, { "epoch": 4.6247429746401645, "grad_norm": 2.449939012527466, "learning_rate": 1.383389613002778e-06, "loss": 2.2703926086425783, "memory(GiB)": 130.55, "step": 13495, "token_acc": 0.5095379398050022, "train_speed(iter/s)": 1.192577 }, { "epoch": 4.626456477039068, "grad_norm": 2.215045213699341, "learning_rate": 1.3708426793128616e-06, "loss": 2.206114387512207, "memory(GiB)": 130.55, "step": 13500, "token_acc": 0.5072783414203793, "train_speed(iter/s)": 1.1925 }, { "epoch": 4.626456477039068, "eval_loss": 2.173274517059326, "eval_runtime": 3.6772, "eval_samples_per_second": 27.195, "eval_steps_per_second": 27.195, "eval_token_acc": 0.47307692307692306, "step": 13500 }, { "epoch": 4.628169979437971, "grad_norm": 2.126387357711792, "learning_rate": 1.3583521127463806e-06, "loss": 2.158163070678711, "memory(GiB)": 130.55, "step": 13505, "token_acc": 0.5153262518968134, "train_speed(iter/s)": 1.191956 }, { "epoch": 4.629883481836875, "grad_norm": 2.262600898742676, "learning_rate": 1.345917927781426e-06, "loss": 2.266499710083008, "memory(GiB)": 130.55, "step": 13510, "token_acc": 0.5071669477234402, "train_speed(iter/s)": 1.191959 }, { "epoch": 4.631596984235778, "grad_norm": 1.8360989093780518, "learning_rate": 1.333540138830741e-06, "loss": 2.255025291442871, "memory(GiB)": 130.55, "step": 13515, "token_acc": 0.5041017227235439, "train_speed(iter/s)": 1.191949 }, { "epoch": 4.633310486634681, "grad_norm": 2.2607622146606445, "learning_rate": 1.3212187602416882e-06, "loss": 2.1664915084838867, "memory(GiB)": 130.55, "step": 13520, "token_acc": 0.5236528177704648, "train_speed(iter/s)": 1.191982 }, { "epoch": 4.635023989033584, "grad_norm": 1.9184722900390625, "learning_rate": 1.3089538062962426e-06, "loss": 2.1974374771118166, "memory(GiB)": 130.55, "step": 13525, "token_acc": 0.5307429547395388, "train_speed(iter/s)": 1.191983 }, { "epoch": 4.636737491432488, "grad_norm": 2.071254253387451, "learning_rate": 1.296745291210988e-06, "loss": 2.1907175064086912, "memory(GiB)": 130.55, "step": 13530, "token_acc": 0.5302820157189089, "train_speed(iter/s)": 1.191991 }, { "epoch": 4.638450993831391, "grad_norm": 2.0381035804748535, "learning_rate": 1.2845932291370877e-06, "loss": 2.371373748779297, "memory(GiB)": 130.55, "step": 13535, "token_acc": 0.4993829699712053, "train_speed(iter/s)": 1.191964 }, { "epoch": 4.640164496230295, "grad_norm": 2.224977731704712, "learning_rate": 1.2724976341602468e-06, "loss": 2.229353141784668, "memory(GiB)": 130.55, "step": 13540, "token_acc": 0.5167400881057269, "train_speed(iter/s)": 1.191986 }, { "epoch": 4.641877998629198, "grad_norm": 2.014080047607422, "learning_rate": 1.2604585203007502e-06, "loss": 2.1682254791259767, "memory(GiB)": 130.55, "step": 13545, "token_acc": 0.5275974025974026, "train_speed(iter/s)": 1.191985 }, { "epoch": 4.6435915010281015, "grad_norm": 1.9329839944839478, "learning_rate": 1.2484759015133906e-06, "loss": 2.1730972290039063, "memory(GiB)": 130.55, "step": 13550, "token_acc": 0.5197594501718213, "train_speed(iter/s)": 1.192035 }, { "epoch": 4.645305003427005, "grad_norm": 2.046900510787964, "learning_rate": 1.2365497916874858e-06, "loss": 2.264597702026367, "memory(GiB)": 130.55, "step": 13555, "token_acc": 0.5234270414993306, "train_speed(iter/s)": 1.192058 }, { "epoch": 4.647018505825908, "grad_norm": 1.9280399084091187, "learning_rate": 1.2246802046468552e-06, "loss": 2.1614526748657226, "memory(GiB)": 130.55, "step": 13560, "token_acc": 0.5390161153519932, "train_speed(iter/s)": 1.192059 }, { "epoch": 4.648732008224812, "grad_norm": 2.1580512523651123, "learning_rate": 1.2128671541497994e-06, "loss": 2.1908918380737306, "memory(GiB)": 130.55, "step": 13565, "token_acc": 0.5355094838994265, "train_speed(iter/s)": 1.192057 }, { "epoch": 4.650445510623715, "grad_norm": 2.052574872970581, "learning_rate": 1.2011106538890759e-06, "loss": 2.2844127655029296, "memory(GiB)": 130.55, "step": 13570, "token_acc": 0.5230898404701931, "train_speed(iter/s)": 1.192053 }, { "epoch": 4.652159013022619, "grad_norm": 2.38458251953125, "learning_rate": 1.1894107174919068e-06, "loss": 2.17431583404541, "memory(GiB)": 130.55, "step": 13575, "token_acc": 0.5252699784017278, "train_speed(iter/s)": 1.192068 }, { "epoch": 4.653872515421521, "grad_norm": 1.8641430139541626, "learning_rate": 1.1777673585199433e-06, "loss": 2.2208684921264648, "memory(GiB)": 130.55, "step": 13580, "token_acc": 0.518818292189397, "train_speed(iter/s)": 1.192054 }, { "epoch": 4.655586017820425, "grad_norm": 1.8199634552001953, "learning_rate": 1.1661805904692624e-06, "loss": 2.3090709686279296, "memory(GiB)": 130.55, "step": 13585, "token_acc": 0.5157415078707539, "train_speed(iter/s)": 1.192055 }, { "epoch": 4.657299520219328, "grad_norm": 2.20792555809021, "learning_rate": 1.1546504267703374e-06, "loss": 2.193890380859375, "memory(GiB)": 130.55, "step": 13590, "token_acc": 0.5195989061075661, "train_speed(iter/s)": 1.192034 }, { "epoch": 4.659013022618232, "grad_norm": 2.093717575073242, "learning_rate": 1.1431768807880328e-06, "loss": 2.2958629608154295, "memory(GiB)": 130.55, "step": 13595, "token_acc": 0.5129670329670329, "train_speed(iter/s)": 1.192045 }, { "epoch": 4.660726525017135, "grad_norm": 2.19246768951416, "learning_rate": 1.1317599658215938e-06, "loss": 2.275385284423828, "memory(GiB)": 130.55, "step": 13600, "token_acc": 0.4975288303130148, "train_speed(iter/s)": 1.192001 }, { "epoch": 4.6624400274160385, "grad_norm": 2.11181902885437, "learning_rate": 1.1203996951046125e-06, "loss": 2.1707521438598634, "memory(GiB)": 130.55, "step": 13605, "token_acc": 0.5165984538426558, "train_speed(iter/s)": 1.192031 }, { "epoch": 4.664153529814942, "grad_norm": 2.2920963764190674, "learning_rate": 1.1090960818050334e-06, "loss": 2.3923606872558594, "memory(GiB)": 130.55, "step": 13610, "token_acc": 0.4866609294320138, "train_speed(iter/s)": 1.191978 }, { "epoch": 4.665867032213845, "grad_norm": 2.046898603439331, "learning_rate": 1.097849139025109e-06, "loss": 2.206351089477539, "memory(GiB)": 130.55, "step": 13615, "token_acc": 0.5281478298238075, "train_speed(iter/s)": 1.191975 }, { "epoch": 4.667580534612749, "grad_norm": 1.9694304466247559, "learning_rate": 1.0866588798014278e-06, "loss": 2.2214630126953123, "memory(GiB)": 130.55, "step": 13620, "token_acc": 0.5049710024855012, "train_speed(iter/s)": 1.191916 }, { "epoch": 4.669294037011651, "grad_norm": 2.488548517227173, "learning_rate": 1.0755253171048696e-06, "loss": 2.2508529663085937, "memory(GiB)": 130.55, "step": 13625, "token_acc": 0.5163109142166734, "train_speed(iter/s)": 1.191924 }, { "epoch": 4.671007539410555, "grad_norm": 2.024367094039917, "learning_rate": 1.064448463840584e-06, "loss": 2.2043668746948244, "memory(GiB)": 130.55, "step": 13630, "token_acc": 0.526431718061674, "train_speed(iter/s)": 1.191947 }, { "epoch": 4.672721041809458, "grad_norm": 2.0270533561706543, "learning_rate": 1.0534283328479943e-06, "loss": 2.184493827819824, "memory(GiB)": 130.55, "step": 13635, "token_acc": 0.5290269828291088, "train_speed(iter/s)": 1.191966 }, { "epoch": 4.674434544208362, "grad_norm": 1.9968575239181519, "learning_rate": 1.0424649369007777e-06, "loss": 2.200638198852539, "memory(GiB)": 130.55, "step": 13640, "token_acc": 0.5155475439387112, "train_speed(iter/s)": 1.191958 }, { "epoch": 4.676148046607265, "grad_norm": 1.8713067770004272, "learning_rate": 1.031558288706852e-06, "loss": 2.166433906555176, "memory(GiB)": 130.55, "step": 13645, "token_acc": 0.5199211045364891, "train_speed(iter/s)": 1.191994 }, { "epoch": 4.677861549006169, "grad_norm": 2.125439167022705, "learning_rate": 1.0207084009083378e-06, "loss": 2.238144302368164, "memory(GiB)": 130.55, "step": 13650, "token_acc": 0.5270154373927959, "train_speed(iter/s)": 1.192023 }, { "epoch": 4.679575051405072, "grad_norm": 2.0425162315368652, "learning_rate": 1.0099152860815975e-06, "loss": 2.1724729537963867, "memory(GiB)": 130.55, "step": 13655, "token_acc": 0.5220154334997731, "train_speed(iter/s)": 1.192059 }, { "epoch": 4.6812885538039755, "grad_norm": 1.9764543771743774, "learning_rate": 9.991789567371512e-07, "loss": 2.2047882080078125, "memory(GiB)": 130.55, "step": 13660, "token_acc": 0.5187813021702838, "train_speed(iter/s)": 1.19206 }, { "epoch": 4.683002056202879, "grad_norm": 2.0450191497802734, "learning_rate": 9.884994253197277e-07, "loss": 2.2195735931396485, "memory(GiB)": 130.55, "step": 13665, "token_acc": 0.5175131348511384, "train_speed(iter/s)": 1.192085 }, { "epoch": 4.684715558601782, "grad_norm": 2.057476043701172, "learning_rate": 9.77876704208197e-07, "loss": 2.145445442199707, "memory(GiB)": 130.55, "step": 13670, "token_acc": 0.5275053304904052, "train_speed(iter/s)": 1.192116 }, { "epoch": 4.686429061000686, "grad_norm": 1.9729465246200562, "learning_rate": 9.673108057155878e-07, "loss": 2.2226978302001954, "memory(GiB)": 130.55, "step": 13675, "token_acc": 0.5207531022678648, "train_speed(iter/s)": 1.19214 }, { "epoch": 4.688142563399589, "grad_norm": 1.9729201793670654, "learning_rate": 9.568017420890696e-07, "loss": 2.2602727890014647, "memory(GiB)": 130.55, "step": 13680, "token_acc": 0.513840830449827, "train_speed(iter/s)": 1.192144 }, { "epoch": 4.689856065798492, "grad_norm": 2.088701009750366, "learning_rate": 9.463495255099208e-07, "loss": 2.224331283569336, "memory(GiB)": 130.55, "step": 13685, "token_acc": 0.5175879396984925, "train_speed(iter/s)": 1.192178 }, { "epoch": 4.691569568197395, "grad_norm": 1.93387770652771, "learning_rate": 9.359541680935446e-07, "loss": 2.053376388549805, "memory(GiB)": 130.55, "step": 13690, "token_acc": 0.5460992907801419, "train_speed(iter/s)": 1.192207 }, { "epoch": 4.693283070596299, "grad_norm": 2.098451614379883, "learning_rate": 9.256156818894301e-07, "loss": 2.2318656921386717, "memory(GiB)": 130.55, "step": 13695, "token_acc": 0.5221006564551423, "train_speed(iter/s)": 1.192225 }, { "epoch": 4.694996572995202, "grad_norm": 1.9968196153640747, "learning_rate": 9.15334078881136e-07, "loss": 2.208691215515137, "memory(GiB)": 130.55, "step": 13700, "token_acc": 0.5194238323876037, "train_speed(iter/s)": 1.192274 }, { "epoch": 4.696710075394106, "grad_norm": 2.0885965824127197, "learning_rate": 9.051093709862902e-07, "loss": 2.204213333129883, "memory(GiB)": 130.55, "step": 13705, "token_acc": 0.5368768026370004, "train_speed(iter/s)": 1.192302 }, { "epoch": 4.698423577793009, "grad_norm": 2.240149974822998, "learning_rate": 8.949415700565844e-07, "loss": 2.1840877532958984, "memory(GiB)": 130.55, "step": 13710, "token_acc": 0.5264298093587522, "train_speed(iter/s)": 1.192355 }, { "epoch": 4.7001370801919125, "grad_norm": 2.0487074851989746, "learning_rate": 8.848306878777357e-07, "loss": 2.2181465148925783, "memory(GiB)": 130.55, "step": 13715, "token_acc": 0.5339403973509934, "train_speed(iter/s)": 1.192306 }, { "epoch": 4.701850582590816, "grad_norm": 2.8321144580841064, "learning_rate": 8.747767361694859e-07, "loss": 2.1584804534912108, "memory(GiB)": 130.55, "step": 13720, "token_acc": 0.5293139293139293, "train_speed(iter/s)": 1.19229 }, { "epoch": 4.703564084989719, "grad_norm": 2.102921485900879, "learning_rate": 8.647797265856017e-07, "loss": 2.280454635620117, "memory(GiB)": 130.55, "step": 13725, "token_acc": 0.5134680134680135, "train_speed(iter/s)": 1.192279 }, { "epoch": 4.705277587388622, "grad_norm": 2.2720956802368164, "learning_rate": 8.548396707138306e-07, "loss": 2.2919519424438475, "memory(GiB)": 130.55, "step": 13730, "token_acc": 0.5148861646234676, "train_speed(iter/s)": 1.192298 }, { "epoch": 4.706991089787525, "grad_norm": 1.8402620553970337, "learning_rate": 8.449565800759118e-07, "loss": 2.296548080444336, "memory(GiB)": 130.55, "step": 13735, "token_acc": 0.5108384458077709, "train_speed(iter/s)": 1.192304 }, { "epoch": 4.708704592186429, "grad_norm": 2.031148672103882, "learning_rate": 8.351304661275428e-07, "loss": 2.127004051208496, "memory(GiB)": 130.55, "step": 13740, "token_acc": 0.5368421052631579, "train_speed(iter/s)": 1.192322 }, { "epoch": 4.710418094585332, "grad_norm": 2.031986713409424, "learning_rate": 8.253613402584015e-07, "loss": 2.3313224792480467, "memory(GiB)": 130.55, "step": 13745, "token_acc": 0.5050462573591253, "train_speed(iter/s)": 1.192338 }, { "epoch": 4.712131596984236, "grad_norm": 1.9358974695205688, "learning_rate": 8.156492137920857e-07, "loss": 2.1652076721191404, "memory(GiB)": 130.55, "step": 13750, "token_acc": 0.5280065897858319, "train_speed(iter/s)": 1.192357 }, { "epoch": 4.713845099383139, "grad_norm": 2.1929144859313965, "learning_rate": 8.059940979861347e-07, "loss": 2.1542510986328125, "memory(GiB)": 130.55, "step": 13755, "token_acc": 0.5256353098528757, "train_speed(iter/s)": 1.192386 }, { "epoch": 4.715558601782043, "grad_norm": 1.9610168933868408, "learning_rate": 7.963960040320184e-07, "loss": 2.127086639404297, "memory(GiB)": 130.55, "step": 13760, "token_acc": 0.5298165137614679, "train_speed(iter/s)": 1.192403 }, { "epoch": 4.717272104180946, "grad_norm": 2.1687140464782715, "learning_rate": 7.86854943055082e-07, "loss": 2.203241729736328, "memory(GiB)": 130.55, "step": 13765, "token_acc": 0.531887201735358, "train_speed(iter/s)": 1.192446 }, { "epoch": 4.7189856065798494, "grad_norm": 1.874226450920105, "learning_rate": 7.773709261145901e-07, "loss": 2.2153646469116213, "memory(GiB)": 130.55, "step": 13770, "token_acc": 0.5220436753193243, "train_speed(iter/s)": 1.192471 }, { "epoch": 4.720699108978753, "grad_norm": 2.080380439758301, "learning_rate": 7.679439642036657e-07, "loss": 2.255084228515625, "memory(GiB)": 130.55, "step": 13775, "token_acc": 0.5188639253921153, "train_speed(iter/s)": 1.192462 }, { "epoch": 4.722412611377656, "grad_norm": 2.041910171508789, "learning_rate": 7.58574068249307e-07, "loss": 2.2906558990478514, "memory(GiB)": 130.55, "step": 13780, "token_acc": 0.5230706410779911, "train_speed(iter/s)": 1.192502 }, { "epoch": 4.72412611377656, "grad_norm": 1.8459053039550781, "learning_rate": 7.492612491123763e-07, "loss": 2.282193183898926, "memory(GiB)": 130.55, "step": 13785, "token_acc": 0.5144708423326134, "train_speed(iter/s)": 1.192494 }, { "epoch": 4.725839616175462, "grad_norm": 1.7560193538665771, "learning_rate": 7.400055175875608e-07, "loss": 2.1304061889648436, "memory(GiB)": 130.55, "step": 13790, "token_acc": 0.5295870583226905, "train_speed(iter/s)": 1.192485 }, { "epoch": 4.727553118574366, "grad_norm": 2.3394546508789062, "learning_rate": 7.308068844033844e-07, "loss": 2.1923717498779296, "memory(GiB)": 130.55, "step": 13795, "token_acc": 0.5164556962025316, "train_speed(iter/s)": 1.192504 }, { "epoch": 4.729266620973269, "grad_norm": 2.0563313961029053, "learning_rate": 7.21665360222179e-07, "loss": 2.176904487609863, "memory(GiB)": 130.55, "step": 13800, "token_acc": 0.5325102880658437, "train_speed(iter/s)": 1.1925 }, { "epoch": 4.730980123372173, "grad_norm": 1.806680679321289, "learning_rate": 7.125809556400908e-07, "loss": 2.296641540527344, "memory(GiB)": 130.55, "step": 13805, "token_acc": 0.5187332738626227, "train_speed(iter/s)": 1.192454 }, { "epoch": 4.732693625771076, "grad_norm": 2.115635871887207, "learning_rate": 7.035536811870469e-07, "loss": 2.215250587463379, "memory(GiB)": 130.55, "step": 13810, "token_acc": 0.516629711751663, "train_speed(iter/s)": 1.192498 }, { "epoch": 4.7344071281699796, "grad_norm": 2.281672239303589, "learning_rate": 6.945835473267658e-07, "loss": 2.1740467071533205, "memory(GiB)": 130.55, "step": 13815, "token_acc": 0.5254817987152034, "train_speed(iter/s)": 1.192481 }, { "epoch": 4.736120630568883, "grad_norm": 2.018659830093384, "learning_rate": 6.856705644567196e-07, "loss": 2.184481620788574, "memory(GiB)": 130.55, "step": 13820, "token_acc": 0.5410821643286573, "train_speed(iter/s)": 1.192466 }, { "epoch": 4.737834132967786, "grad_norm": 2.0954761505126953, "learning_rate": 6.768147429081551e-07, "loss": 2.344972610473633, "memory(GiB)": 130.55, "step": 13825, "token_acc": 0.5132519983172066, "train_speed(iter/s)": 1.192495 }, { "epoch": 4.73954763536669, "grad_norm": 2.266660690307617, "learning_rate": 6.680160929460389e-07, "loss": 2.2789838790893553, "memory(GiB)": 130.55, "step": 13830, "token_acc": 0.5252082419991232, "train_speed(iter/s)": 1.192434 }, { "epoch": 4.741261137765592, "grad_norm": 2.0755105018615723, "learning_rate": 6.592746247690795e-07, "loss": 2.159549331665039, "memory(GiB)": 130.55, "step": 13835, "token_acc": 0.5446390431439556, "train_speed(iter/s)": 1.192453 }, { "epoch": 4.742974640164496, "grad_norm": 1.9544200897216797, "learning_rate": 6.505903485097054e-07, "loss": 2.2135833740234374, "memory(GiB)": 130.55, "step": 13840, "token_acc": 0.5143353605560382, "train_speed(iter/s)": 1.192444 }, { "epoch": 4.744688142563399, "grad_norm": 1.9544848203659058, "learning_rate": 6.419632742340531e-07, "loss": 2.1455356597900392, "memory(GiB)": 130.55, "step": 13845, "token_acc": 0.5355704697986577, "train_speed(iter/s)": 1.19243 }, { "epoch": 4.746401644962303, "grad_norm": 2.0023722648620605, "learning_rate": 6.333934119419515e-07, "loss": 2.255919647216797, "memory(GiB)": 130.55, "step": 13850, "token_acc": 0.5216336433481601, "train_speed(iter/s)": 1.192442 }, { "epoch": 4.748115147361206, "grad_norm": 2.174002170562744, "learning_rate": 6.248807715669269e-07, "loss": 2.0912023544311524, "memory(GiB)": 130.55, "step": 13855, "token_acc": 0.5496315561335067, "train_speed(iter/s)": 1.192376 }, { "epoch": 4.74982864976011, "grad_norm": 1.9470579624176025, "learning_rate": 6.16425362976153e-07, "loss": 2.2601566314697266, "memory(GiB)": 130.55, "step": 13860, "token_acc": 0.513424204874019, "train_speed(iter/s)": 1.192328 }, { "epoch": 4.751542152159013, "grad_norm": 1.9416953325271606, "learning_rate": 6.080271959704842e-07, "loss": 2.1883201599121094, "memory(GiB)": 130.55, "step": 13865, "token_acc": 0.5266323024054983, "train_speed(iter/s)": 1.192345 }, { "epoch": 4.7532556545579165, "grad_norm": 2.0993547439575195, "learning_rate": 5.996862802844172e-07, "loss": 2.1660799026489257, "memory(GiB)": 130.55, "step": 13870, "token_acc": 0.5324508966695133, "train_speed(iter/s)": 1.192355 }, { "epoch": 4.75496915695682, "grad_norm": 2.138814687728882, "learning_rate": 5.914026255861017e-07, "loss": 2.1802433013916014, "memory(GiB)": 130.55, "step": 13875, "token_acc": 0.5252274607113316, "train_speed(iter/s)": 1.192348 }, { "epoch": 4.756682659355723, "grad_norm": 2.2299325466156006, "learning_rate": 5.831762414772901e-07, "loss": 2.1746734619140624, "memory(GiB)": 130.55, "step": 13880, "token_acc": 0.5345721694036301, "train_speed(iter/s)": 1.192391 }, { "epoch": 4.758396161754627, "grad_norm": 2.0564193725585938, "learning_rate": 5.750071374933774e-07, "loss": 2.2243415832519533, "memory(GiB)": 130.55, "step": 13885, "token_acc": 0.5139813581890812, "train_speed(iter/s)": 1.192417 }, { "epoch": 4.760109664153529, "grad_norm": 2.1092584133148193, "learning_rate": 5.668953231033392e-07, "loss": 2.2442499160766602, "memory(GiB)": 130.55, "step": 13890, "token_acc": 0.5114116652578191, "train_speed(iter/s)": 1.192447 }, { "epoch": 4.761823166552433, "grad_norm": 1.9757190942764282, "learning_rate": 5.588408077097651e-07, "loss": 2.2698284149169923, "memory(GiB)": 130.55, "step": 13895, "token_acc": 0.5131837859110586, "train_speed(iter/s)": 1.192457 }, { "epoch": 4.763536668951336, "grad_norm": 2.2477900981903076, "learning_rate": 5.508436006488204e-07, "loss": 2.2155841827392577, "memory(GiB)": 130.55, "step": 13900, "token_acc": 0.5257601351351351, "train_speed(iter/s)": 1.192486 }, { "epoch": 4.76525017135024, "grad_norm": 1.8514361381530762, "learning_rate": 5.429037111902346e-07, "loss": 2.0877252578735352, "memory(GiB)": 130.55, "step": 13905, "token_acc": 0.5444778685002148, "train_speed(iter/s)": 1.192505 }, { "epoch": 4.766963673749143, "grad_norm": 1.980224609375, "learning_rate": 5.35021148537318e-07, "loss": 2.1872770309448244, "memory(GiB)": 130.55, "step": 13910, "token_acc": 0.5289462723865056, "train_speed(iter/s)": 1.192515 }, { "epoch": 4.768677176148047, "grad_norm": 2.2554759979248047, "learning_rate": 5.271959218269229e-07, "loss": 2.2479269027709963, "memory(GiB)": 130.55, "step": 13915, "token_acc": 0.5161161957819339, "train_speed(iter/s)": 1.192527 }, { "epoch": 4.77039067854695, "grad_norm": 2.047968626022339, "learning_rate": 5.194280401294382e-07, "loss": 2.28411922454834, "memory(GiB)": 130.55, "step": 13920, "token_acc": 0.5071574642126789, "train_speed(iter/s)": 1.192514 }, { "epoch": 4.7721041809458535, "grad_norm": 2.070277452468872, "learning_rate": 5.117175124487839e-07, "loss": 2.2346435546875, "memory(GiB)": 130.55, "step": 13925, "token_acc": 0.5231101511879049, "train_speed(iter/s)": 1.192515 }, { "epoch": 4.773817683344757, "grad_norm": 2.100255250930786, "learning_rate": 5.040643477223994e-07, "loss": 2.265007972717285, "memory(GiB)": 130.55, "step": 13930, "token_acc": 0.5117914770376499, "train_speed(iter/s)": 1.192532 }, { "epoch": 4.77553118574366, "grad_norm": 2.014009475708008, "learning_rate": 4.964685548212389e-07, "loss": 2.2311553955078125, "memory(GiB)": 130.55, "step": 13935, "token_acc": 0.5189003436426117, "train_speed(iter/s)": 1.192527 }, { "epoch": 4.777244688142563, "grad_norm": 2.056262731552124, "learning_rate": 4.889301425497539e-07, "loss": 2.1862113952636717, "memory(GiB)": 130.55, "step": 13940, "token_acc": 0.541507024265645, "train_speed(iter/s)": 1.192502 }, { "epoch": 4.778958190541466, "grad_norm": 2.0826165676116943, "learning_rate": 4.814491196458826e-07, "loss": 2.218357276916504, "memory(GiB)": 130.55, "step": 13945, "token_acc": 0.5046153846153846, "train_speed(iter/s)": 1.192483 }, { "epoch": 4.78067169294037, "grad_norm": 2.167203903198242, "learning_rate": 4.740254947810441e-07, "loss": 2.288166046142578, "memory(GiB)": 130.55, "step": 13950, "token_acc": 0.5170157068062827, "train_speed(iter/s)": 1.192518 }, { "epoch": 4.782385195339273, "grad_norm": 2.225837230682373, "learning_rate": 4.666592765601274e-07, "loss": 2.1710533142089843, "memory(GiB)": 130.55, "step": 13955, "token_acc": 0.5158150851581509, "train_speed(iter/s)": 1.192541 }, { "epoch": 4.784098697738177, "grad_norm": 2.1717846393585205, "learning_rate": 4.5935047352146934e-07, "loss": 2.209270477294922, "memory(GiB)": 130.55, "step": 13960, "token_acc": 0.5206645056726094, "train_speed(iter/s)": 1.192594 }, { "epoch": 4.78581220013708, "grad_norm": 1.8744474649429321, "learning_rate": 4.520990941368708e-07, "loss": 2.2001022338867187, "memory(GiB)": 130.55, "step": 13965, "token_acc": 0.5236427320490368, "train_speed(iter/s)": 1.192586 }, { "epoch": 4.787525702535984, "grad_norm": 2.0045695304870605, "learning_rate": 4.449051468115639e-07, "loss": 2.2776039123535154, "memory(GiB)": 130.55, "step": 13970, "token_acc": 0.5049668874172185, "train_speed(iter/s)": 1.192484 }, { "epoch": 4.789239204934887, "grad_norm": 2.0308001041412354, "learning_rate": 4.3776863988420623e-07, "loss": 2.120580291748047, "memory(GiB)": 130.55, "step": 13975, "token_acc": 0.5418556701030928, "train_speed(iter/s)": 1.192509 }, { "epoch": 4.7909527073337905, "grad_norm": 2.065638542175293, "learning_rate": 4.3068958162688635e-07, "loss": 2.2245065689086916, "memory(GiB)": 130.55, "step": 13980, "token_acc": 0.5067796610169492, "train_speed(iter/s)": 1.192503 }, { "epoch": 4.792666209732694, "grad_norm": 1.9671863317489624, "learning_rate": 4.236679802450905e-07, "loss": 2.273542022705078, "memory(GiB)": 130.55, "step": 13985, "token_acc": 0.5119196988707654, "train_speed(iter/s)": 1.19247 }, { "epoch": 4.794379712131597, "grad_norm": 2.1110644340515137, "learning_rate": 4.167038438777138e-07, "loss": 2.2350975036621095, "memory(GiB)": 130.55, "step": 13990, "token_acc": 0.5123323236694072, "train_speed(iter/s)": 1.192459 }, { "epoch": 4.7960932145305, "grad_norm": 2.1051502227783203, "learning_rate": 4.0979718059703797e-07, "loss": 2.1570316314697267, "memory(GiB)": 130.55, "step": 13995, "token_acc": 0.520116134384073, "train_speed(iter/s)": 1.192459 }, { "epoch": 4.797806716929403, "grad_norm": 2.11232328414917, "learning_rate": 4.029479984087259e-07, "loss": 2.350233268737793, "memory(GiB)": 130.55, "step": 14000, "token_acc": 0.5110671936758894, "train_speed(iter/s)": 1.192469 }, { "epoch": 4.797806716929403, "eval_loss": 1.9200912714004517, "eval_runtime": 3.6832, "eval_samples_per_second": 27.15, "eval_steps_per_second": 27.15, "eval_token_acc": 0.5177304964539007, "step": 14000 }, { "epoch": 4.799520219328307, "grad_norm": 2.1008856296539307, "learning_rate": 3.9615630525182137e-07, "loss": 2.186952018737793, "memory(GiB)": 130.55, "step": 14005, "token_acc": 0.5298726738491675, "train_speed(iter/s)": 1.191956 }, { "epoch": 4.80123372172721, "grad_norm": 2.019843578338623, "learning_rate": 3.8942210899872154e-07, "loss": 2.084529685974121, "memory(GiB)": 130.55, "step": 14010, "token_acc": 0.5361842105263158, "train_speed(iter/s)": 1.191954 }, { "epoch": 4.802947224126114, "grad_norm": 2.041665554046631, "learning_rate": 3.8274541745518254e-07, "loss": 2.217309761047363, "memory(GiB)": 130.55, "step": 14015, "token_acc": 0.5195132186319765, "train_speed(iter/s)": 1.191971 }, { "epoch": 4.804660726525017, "grad_norm": 2.39040470123291, "learning_rate": 3.761262383603026e-07, "loss": 2.2068206787109377, "memory(GiB)": 130.55, "step": 14020, "token_acc": 0.5156106519742883, "train_speed(iter/s)": 1.192001 }, { "epoch": 4.806374228923921, "grad_norm": 2.1500377655029297, "learning_rate": 3.6956457938651656e-07, "loss": 2.263751411437988, "memory(GiB)": 130.55, "step": 14025, "token_acc": 0.5223550243470563, "train_speed(iter/s)": 1.192023 }, { "epoch": 4.808087731322824, "grad_norm": 2.008535385131836, "learning_rate": 3.6306044813958495e-07, "loss": 2.2525087356567384, "memory(GiB)": 130.55, "step": 14030, "token_acc": 0.5159684778100373, "train_speed(iter/s)": 1.192033 }, { "epoch": 4.8098012337217275, "grad_norm": 2.2728431224823, "learning_rate": 3.5661385215859375e-07, "loss": 2.2598957061767577, "memory(GiB)": 130.55, "step": 14035, "token_acc": 0.514766201804758, "train_speed(iter/s)": 1.192026 }, { "epoch": 4.81151473612063, "grad_norm": 1.8149888515472412, "learning_rate": 3.502247989159324e-07, "loss": 2.2795738220214843, "memory(GiB)": 130.55, "step": 14040, "token_acc": 0.5104253544620517, "train_speed(iter/s)": 1.191997 }, { "epoch": 4.8132282385195335, "grad_norm": 2.2835614681243896, "learning_rate": 3.438932958172991e-07, "loss": 2.193770980834961, "memory(GiB)": 130.55, "step": 14045, "token_acc": 0.5234082397003745, "train_speed(iter/s)": 1.19206 }, { "epoch": 4.814941740918437, "grad_norm": 1.9650501012802124, "learning_rate": 3.3761935020166224e-07, "loss": 2.2321950912475588, "memory(GiB)": 130.55, "step": 14050, "token_acc": 0.5049958368026645, "train_speed(iter/s)": 1.191998 }, { "epoch": 4.81665524331734, "grad_norm": 2.533489465713501, "learning_rate": 3.3140296934130455e-07, "loss": 2.333421325683594, "memory(GiB)": 130.55, "step": 14055, "token_acc": 0.5044359949302915, "train_speed(iter/s)": 1.192026 }, { "epoch": 4.818368745716244, "grad_norm": 2.1107213497161865, "learning_rate": 3.252441604417622e-07, "loss": 2.2639755249023437, "memory(GiB)": 130.55, "step": 14060, "token_acc": 0.5140108741112506, "train_speed(iter/s)": 1.192042 }, { "epoch": 4.820082248115147, "grad_norm": 1.966711163520813, "learning_rate": 3.1914293064184697e-07, "loss": 2.1163991928100585, "memory(GiB)": 130.55, "step": 14065, "token_acc": 0.5436046511627907, "train_speed(iter/s)": 1.19196 }, { "epoch": 4.821795750514051, "grad_norm": 1.8916853666305542, "learning_rate": 3.130992870136296e-07, "loss": 2.245888328552246, "memory(GiB)": 130.55, "step": 14070, "token_acc": 0.5182481751824818, "train_speed(iter/s)": 1.191981 }, { "epoch": 4.823509252912954, "grad_norm": 2.329317808151245, "learning_rate": 3.0711323656243405e-07, "loss": 2.1614913940429688, "memory(GiB)": 130.55, "step": 14075, "token_acc": 0.5242553191489362, "train_speed(iter/s)": 1.191922 }, { "epoch": 4.825222755311858, "grad_norm": 2.165745496749878, "learning_rate": 3.011847862268158e-07, "loss": 2.2863515853881835, "memory(GiB)": 130.55, "step": 14080, "token_acc": 0.5200690548122572, "train_speed(iter/s)": 1.191976 }, { "epoch": 4.826936257710761, "grad_norm": 2.2019283771514893, "learning_rate": 2.953139428785723e-07, "loss": 2.3372451782226564, "memory(GiB)": 130.55, "step": 14085, "token_acc": 0.5046649703138253, "train_speed(iter/s)": 1.192008 }, { "epoch": 4.8286497601096645, "grad_norm": 2.0600759983062744, "learning_rate": 2.895007133227268e-07, "loss": 2.220657157897949, "memory(GiB)": 130.55, "step": 14090, "token_acc": 0.5281276238455079, "train_speed(iter/s)": 1.191953 }, { "epoch": 4.830363262508568, "grad_norm": 1.9710392951965332, "learning_rate": 2.837451042975281e-07, "loss": 2.273898124694824, "memory(GiB)": 130.55, "step": 14095, "token_acc": 0.505761316872428, "train_speed(iter/s)": 1.191964 }, { "epoch": 4.8320767649074705, "grad_norm": 1.9951810836791992, "learning_rate": 2.7804712247441746e-07, "loss": 2.232831573486328, "memory(GiB)": 130.55, "step": 14100, "token_acc": 0.5230425055928412, "train_speed(iter/s)": 1.191981 }, { "epoch": 4.833790267306374, "grad_norm": 2.1336541175842285, "learning_rate": 2.7240677445806715e-07, "loss": 2.2262737274169924, "memory(GiB)": 130.55, "step": 14105, "token_acc": 0.497119341563786, "train_speed(iter/s)": 1.191991 }, { "epoch": 4.835503769705277, "grad_norm": 2.1237008571624756, "learning_rate": 2.6682406678630866e-07, "loss": 2.255015754699707, "memory(GiB)": 130.55, "step": 14110, "token_acc": 0.5229508196721312, "train_speed(iter/s)": 1.192038 }, { "epoch": 4.837217272104181, "grad_norm": 1.8884668350219727, "learning_rate": 2.612990059301934e-07, "loss": 2.265828323364258, "memory(GiB)": 130.55, "step": 14115, "token_acc": 0.52, "train_speed(iter/s)": 1.192036 }, { "epoch": 4.838930774503084, "grad_norm": 1.9628678560256958, "learning_rate": 2.55831598293943e-07, "loss": 2.1209743499755858, "memory(GiB)": 130.55, "step": 14120, "token_acc": 0.5355739851593191, "train_speed(iter/s)": 1.192053 }, { "epoch": 4.840644276901988, "grad_norm": 2.5155882835388184, "learning_rate": 2.5042185021493803e-07, "loss": 2.2484176635742186, "memory(GiB)": 130.55, "step": 14125, "token_acc": 0.5189234650967199, "train_speed(iter/s)": 1.192044 }, { "epoch": 4.842357779300891, "grad_norm": 2.0214743614196777, "learning_rate": 2.4506976796374595e-07, "loss": 2.1873956680297852, "memory(GiB)": 130.55, "step": 14130, "token_acc": 0.5333333333333333, "train_speed(iter/s)": 1.192076 }, { "epoch": 4.844071281699795, "grad_norm": 1.8553332090377808, "learning_rate": 2.3977535774407645e-07, "loss": 2.1994958877563477, "memory(GiB)": 130.55, "step": 14135, "token_acc": 0.5183333333333333, "train_speed(iter/s)": 1.192101 }, { "epoch": 4.845784784098698, "grad_norm": 2.2229483127593994, "learning_rate": 2.3453862569280394e-07, "loss": 2.1327796936035157, "memory(GiB)": 130.55, "step": 14140, "token_acc": 0.5293333333333333, "train_speed(iter/s)": 1.192096 }, { "epoch": 4.847498286497601, "grad_norm": 1.9346532821655273, "learning_rate": 2.29359577879934e-07, "loss": 2.0924720764160156, "memory(GiB)": 130.55, "step": 14145, "token_acc": 0.5304198800342759, "train_speed(iter/s)": 1.192134 }, { "epoch": 4.849211788896504, "grad_norm": 1.968376874923706, "learning_rate": 2.2423822030861463e-07, "loss": 2.2481334686279295, "memory(GiB)": 130.55, "step": 14150, "token_acc": 0.498390989541432, "train_speed(iter/s)": 1.19216 }, { "epoch": 4.8509252912954075, "grad_norm": 1.9980233907699585, "learning_rate": 2.1917455891513062e-07, "loss": 2.271377944946289, "memory(GiB)": 130.55, "step": 14155, "token_acc": 0.49893662271373884, "train_speed(iter/s)": 1.192136 }, { "epoch": 4.852638793694311, "grad_norm": 2.265482187271118, "learning_rate": 2.1416859956887026e-07, "loss": 2.2250139236450197, "memory(GiB)": 130.55, "step": 14160, "token_acc": 0.5286903197547087, "train_speed(iter/s)": 1.192156 }, { "epoch": 4.854352296093214, "grad_norm": 1.9751136302947998, "learning_rate": 2.092203480723587e-07, "loss": 2.198324203491211, "memory(GiB)": 130.55, "step": 14165, "token_acc": 0.5255642561031783, "train_speed(iter/s)": 1.192138 }, { "epoch": 4.856065798492118, "grad_norm": 2.413687229156494, "learning_rate": 2.0432981016122455e-07, "loss": 2.1384220123291016, "memory(GiB)": 130.55, "step": 14170, "token_acc": 0.5431494661921709, "train_speed(iter/s)": 1.192155 }, { "epoch": 4.857779300891021, "grad_norm": 1.9041825532913208, "learning_rate": 1.9949699150419443e-07, "loss": 2.170484924316406, "memory(GiB)": 130.55, "step": 14175, "token_acc": 0.5291242362525458, "train_speed(iter/s)": 1.192172 }, { "epoch": 4.859492803289925, "grad_norm": 2.102792501449585, "learning_rate": 1.9472189770309845e-07, "loss": 2.269325828552246, "memory(GiB)": 130.55, "step": 14180, "token_acc": 0.5211000474158369, "train_speed(iter/s)": 1.192177 }, { "epoch": 4.861206305688828, "grad_norm": 1.8328721523284912, "learning_rate": 1.9000453429284803e-07, "loss": 2.2199756622314455, "memory(GiB)": 130.55, "step": 14185, "token_acc": 0.5201371036846615, "train_speed(iter/s)": 1.192208 }, { "epoch": 4.862919808087732, "grad_norm": 2.1829142570495605, "learning_rate": 1.8534490674144145e-07, "loss": 2.221316909790039, "memory(GiB)": 130.55, "step": 14190, "token_acc": 0.5276381909547738, "train_speed(iter/s)": 1.192199 }, { "epoch": 4.864633310486635, "grad_norm": 2.048112630844116, "learning_rate": 1.807430204499583e-07, "loss": 2.321460723876953, "memory(GiB)": 130.55, "step": 14195, "token_acc": 0.5127768313458262, "train_speed(iter/s)": 1.192221 }, { "epoch": 4.8663468128855385, "grad_norm": 2.020235776901245, "learning_rate": 1.7619888075254832e-07, "loss": 2.1912817001342773, "memory(GiB)": 130.55, "step": 14200, "token_acc": 0.5217773867135944, "train_speed(iter/s)": 1.192267 }, { "epoch": 4.868060315284441, "grad_norm": 1.9895570278167725, "learning_rate": 1.7171249291641488e-07, "loss": 2.0808130264282227, "memory(GiB)": 130.55, "step": 14205, "token_acc": 0.543236301369863, "train_speed(iter/s)": 1.192212 }, { "epoch": 4.8697738176833445, "grad_norm": 1.9310506582260132, "learning_rate": 1.6728386214184266e-07, "loss": 2.1447816848754884, "memory(GiB)": 130.55, "step": 14210, "token_acc": 0.5381929288520296, "train_speed(iter/s)": 1.192239 }, { "epoch": 4.871487320082248, "grad_norm": 2.2810404300689697, "learning_rate": 1.6291299356214207e-07, "loss": 2.1828681945800783, "memory(GiB)": 130.55, "step": 14215, "token_acc": 0.5160337552742617, "train_speed(iter/s)": 1.192257 }, { "epoch": 4.873200822481151, "grad_norm": 2.212198257446289, "learning_rate": 1.585998922436882e-07, "loss": 2.288247299194336, "memory(GiB)": 130.55, "step": 14220, "token_acc": 0.5165618448637317, "train_speed(iter/s)": 1.192172 }, { "epoch": 4.874914324880055, "grad_norm": 1.9337730407714844, "learning_rate": 1.5434456318588752e-07, "loss": 2.1800827026367187, "memory(GiB)": 130.55, "step": 14225, "token_acc": 0.5172265288544359, "train_speed(iter/s)": 1.192193 }, { "epoch": 4.876627827278958, "grad_norm": 2.113398313522339, "learning_rate": 1.5014701132118893e-07, "loss": 2.1661081314086914, "memory(GiB)": 130.55, "step": 14230, "token_acc": 0.5165178571428571, "train_speed(iter/s)": 1.192207 }, { "epoch": 4.878341329677862, "grad_norm": 2.0898277759552, "learning_rate": 1.460072415150726e-07, "loss": 2.1655485153198244, "memory(GiB)": 130.55, "step": 14235, "token_acc": 0.533068783068783, "train_speed(iter/s)": 1.192248 }, { "epoch": 4.880054832076765, "grad_norm": 2.2278850078582764, "learning_rate": 1.4192525856602247e-07, "loss": 2.243768501281738, "memory(GiB)": 130.55, "step": 14240, "token_acc": 0.5146621334466638, "train_speed(iter/s)": 1.192287 }, { "epoch": 4.881768334475669, "grad_norm": 2.4969170093536377, "learning_rate": 1.3790106720557028e-07, "loss": 2.2279930114746094, "memory(GiB)": 130.55, "step": 14245, "token_acc": 0.511402027027027, "train_speed(iter/s)": 1.192328 }, { "epoch": 4.883481836874571, "grad_norm": 1.9701027870178223, "learning_rate": 1.339346720982293e-07, "loss": 2.1576820373535157, "memory(GiB)": 130.55, "step": 14250, "token_acc": 0.5368979231109147, "train_speed(iter/s)": 1.192336 }, { "epoch": 4.885195339273475, "grad_norm": 2.149050235748291, "learning_rate": 1.3002607784154964e-07, "loss": 2.2002704620361326, "memory(GiB)": 130.55, "step": 14255, "token_acc": 0.5262018189692508, "train_speed(iter/s)": 1.192253 }, { "epoch": 4.886908841672378, "grad_norm": 2.696915626525879, "learning_rate": 1.2617528896605724e-07, "loss": 2.1398157119750976, "memory(GiB)": 130.55, "step": 14260, "token_acc": 0.5249073692877727, "train_speed(iter/s)": 1.19227 }, { "epoch": 4.8886223440712815, "grad_norm": 2.150233745574951, "learning_rate": 1.2238230993529275e-07, "loss": 2.2925085067749023, "memory(GiB)": 130.55, "step": 14265, "token_acc": 0.5023216547066273, "train_speed(iter/s)": 1.192271 }, { "epoch": 4.890335846470185, "grad_norm": 2.0403544902801514, "learning_rate": 1.1864714514577269e-07, "loss": 2.1724292755126955, "memory(GiB)": 130.55, "step": 14270, "token_acc": 0.5364469663902226, "train_speed(iter/s)": 1.192309 }, { "epoch": 4.892049348869088, "grad_norm": 1.9944607019424438, "learning_rate": 1.1496979892702265e-07, "loss": 2.254658508300781, "memory(GiB)": 130.55, "step": 14275, "token_acc": 0.5302461410095953, "train_speed(iter/s)": 1.192313 }, { "epoch": 4.893762851267992, "grad_norm": 2.2753989696502686, "learning_rate": 1.1135027554152188e-07, "loss": 2.288718605041504, "memory(GiB)": 130.55, "step": 14280, "token_acc": 0.5171558495245969, "train_speed(iter/s)": 1.192315 }, { "epoch": 4.895476353666895, "grad_norm": 2.0635955333709717, "learning_rate": 1.0778857918474771e-07, "loss": 2.196341323852539, "memory(GiB)": 130.55, "step": 14285, "token_acc": 0.5214128035320088, "train_speed(iter/s)": 1.192331 }, { "epoch": 4.897189856065799, "grad_norm": 1.8750108480453491, "learning_rate": 1.0428471398513662e-07, "loss": 2.2255151748657225, "memory(GiB)": 130.55, "step": 14290, "token_acc": 0.5196850393700787, "train_speed(iter/s)": 1.192361 }, { "epoch": 4.898903358464702, "grad_norm": 2.0000741481781006, "learning_rate": 1.0083868400410091e-07, "loss": 2.2751232147216798, "memory(GiB)": 130.55, "step": 14295, "token_acc": 0.5205831903945112, "train_speed(iter/s)": 1.192342 }, { "epoch": 4.900616860863606, "grad_norm": 2.3275270462036133, "learning_rate": 9.745049323600097e-08, "loss": 2.2683815002441405, "memory(GiB)": 130.55, "step": 14300, "token_acc": 0.5293859649122807, "train_speed(iter/s)": 1.192387 }, { "epoch": 4.902330363262509, "grad_norm": 1.9109596014022827, "learning_rate": 9.412014560816751e-08, "loss": 2.2257835388183596, "memory(GiB)": 130.55, "step": 14305, "token_acc": 0.51919795221843, "train_speed(iter/s)": 1.192375 }, { "epoch": 4.904043865661412, "grad_norm": 2.433055877685547, "learning_rate": 9.084764498087928e-08, "loss": 2.222443771362305, "memory(GiB)": 130.55, "step": 14310, "token_acc": 0.5209606986899563, "train_speed(iter/s)": 1.192281 }, { "epoch": 4.905757368060315, "grad_norm": 2.1567060947418213, "learning_rate": 8.763299514736867e-08, "loss": 2.180527114868164, "memory(GiB)": 130.55, "step": 14315, "token_acc": 0.5203720106288751, "train_speed(iter/s)": 1.19227 }, { "epoch": 4.9074708704592185, "grad_norm": 2.014284610748291, "learning_rate": 8.44761998337995e-08, "loss": 2.1570661544799803, "memory(GiB)": 130.55, "step": 14320, "token_acc": 0.5382631126397248, "train_speed(iter/s)": 1.192232 }, { "epoch": 4.909184372858122, "grad_norm": 2.077064275741577, "learning_rate": 8.137726269928369e-08, "loss": 2.192942428588867, "memory(GiB)": 130.55, "step": 14325, "token_acc": 0.526704298740773, "train_speed(iter/s)": 1.192227 }, { "epoch": 4.910897875257025, "grad_norm": 2.6485390663146973, "learning_rate": 7.833618733587012e-08, "loss": 2.1355133056640625, "memory(GiB)": 130.55, "step": 14330, "token_acc": 0.5329315540249677, "train_speed(iter/s)": 1.19219 }, { "epoch": 4.912611377655929, "grad_norm": 1.9350426197052002, "learning_rate": 7.535297726853351e-08, "loss": 2.2384035110473635, "memory(GiB)": 130.55, "step": 14335, "token_acc": 0.5176211453744494, "train_speed(iter/s)": 1.19221 }, { "epoch": 4.914324880054832, "grad_norm": 2.309631824493408, "learning_rate": 7.24276359551801e-08, "loss": 2.21250057220459, "memory(GiB)": 130.55, "step": 14340, "token_acc": 0.537261698440208, "train_speed(iter/s)": 1.192213 }, { "epoch": 4.916038382453736, "grad_norm": 1.7430955171585083, "learning_rate": 6.956016678663635e-08, "loss": 2.2300556182861326, "memory(GiB)": 130.55, "step": 14345, "token_acc": 0.5328778821520068, "train_speed(iter/s)": 1.192224 }, { "epoch": 4.917751884852639, "grad_norm": 2.0153298377990723, "learning_rate": 6.675057308664911e-08, "loss": 2.2111814498901365, "memory(GiB)": 130.55, "step": 14350, "token_acc": 0.5170431211498974, "train_speed(iter/s)": 1.192218 }, { "epoch": 4.919465387251542, "grad_norm": 2.349559783935547, "learning_rate": 6.399885811188001e-08, "loss": 2.1624032974243166, "memory(GiB)": 130.55, "step": 14355, "token_acc": 0.5207895842083158, "train_speed(iter/s)": 1.192245 }, { "epoch": 4.921178889650445, "grad_norm": 2.1747775077819824, "learning_rate": 6.130502505190539e-08, "loss": 2.199728012084961, "memory(GiB)": 130.55, "step": 14360, "token_acc": 0.5259259259259259, "train_speed(iter/s)": 1.192245 }, { "epoch": 4.922892392049349, "grad_norm": 2.275698184967041, "learning_rate": 5.866907702919422e-08, "loss": 2.1868719100952148, "memory(GiB)": 130.55, "step": 14365, "token_acc": 0.5288673424416043, "train_speed(iter/s)": 1.192269 }, { "epoch": 4.924605894448252, "grad_norm": 2.0053064823150635, "learning_rate": 5.609101709914688e-08, "loss": 2.1921106338500977, "memory(GiB)": 130.55, "step": 14370, "token_acc": 0.5229400749063671, "train_speed(iter/s)": 1.192277 }, { "epoch": 4.9263193968471555, "grad_norm": 2.262312173843384, "learning_rate": 5.357084825003966e-08, "loss": 2.2385990142822267, "memory(GiB)": 130.55, "step": 14375, "token_acc": 0.5174150230801511, "train_speed(iter/s)": 1.192287 }, { "epoch": 4.928032899246059, "grad_norm": 2.1642446517944336, "learning_rate": 5.110857340305808e-08, "loss": 2.259749984741211, "memory(GiB)": 130.55, "step": 14380, "token_acc": 0.520052310374891, "train_speed(iter/s)": 1.192309 }, { "epoch": 4.929746401644962, "grad_norm": 2.26530385017395, "learning_rate": 4.870419541228022e-08, "loss": 2.3021326065063477, "memory(GiB)": 130.55, "step": 14385, "token_acc": 0.5120481927710844, "train_speed(iter/s)": 1.192226 }, { "epoch": 4.931459904043866, "grad_norm": 2.557229518890381, "learning_rate": 4.635771706467673e-08, "loss": 2.2701778411865234, "memory(GiB)": 130.55, "step": 14390, "token_acc": 0.5082037996545768, "train_speed(iter/s)": 1.192237 }, { "epoch": 4.933173406442769, "grad_norm": 2.22898268699646, "learning_rate": 4.406914108009419e-08, "loss": 2.149393653869629, "memory(GiB)": 130.55, "step": 14395, "token_acc": 0.5276558384547849, "train_speed(iter/s)": 1.192229 }, { "epoch": 4.934886908841673, "grad_norm": 2.1356201171875, "learning_rate": 4.183847011127173e-08, "loss": 2.2546186447143555, "memory(GiB)": 130.55, "step": 14400, "token_acc": 0.5193929173693086, "train_speed(iter/s)": 1.192267 }, { "epoch": 4.936600411240576, "grad_norm": 1.9675670862197876, "learning_rate": 3.966570674383552e-08, "loss": 2.220624542236328, "memory(GiB)": 130.55, "step": 14405, "token_acc": 0.5207496653279786, "train_speed(iter/s)": 1.192307 }, { "epoch": 4.93831391363948, "grad_norm": 2.1762588024139404, "learning_rate": 3.755085349628207e-08, "loss": 2.293637275695801, "memory(GiB)": 130.55, "step": 14410, "token_acc": 0.5156316916488223, "train_speed(iter/s)": 1.192288 }, { "epoch": 4.940027416038382, "grad_norm": 1.9647468328475952, "learning_rate": 3.549391281997827e-08, "loss": 2.2102991104125977, "memory(GiB)": 130.55, "step": 14415, "token_acc": 0.5324508966695133, "train_speed(iter/s)": 1.192315 }, { "epoch": 4.941740918437286, "grad_norm": 2.0399062633514404, "learning_rate": 3.349488709917803e-08, "loss": 2.1865060806274412, "memory(GiB)": 130.55, "step": 14420, "token_acc": 0.5377682403433477, "train_speed(iter/s)": 1.192302 }, { "epoch": 4.943454420836189, "grad_norm": 2.1297311782836914, "learning_rate": 3.1553778650983414e-08, "loss": 2.15445499420166, "memory(GiB)": 130.55, "step": 14425, "token_acc": 0.540720961281709, "train_speed(iter/s)": 1.192334 }, { "epoch": 4.9451679232350925, "grad_norm": 2.1576132774353027, "learning_rate": 2.9670589725389053e-08, "loss": 2.1445831298828124, "memory(GiB)": 130.55, "step": 14430, "token_acc": 0.5382087099424815, "train_speed(iter/s)": 1.192364 }, { "epoch": 4.946881425633996, "grad_norm": 2.150397300720215, "learning_rate": 2.7845322505232196e-08, "loss": 2.2206657409667967, "memory(GiB)": 130.55, "step": 14435, "token_acc": 0.5221978021978022, "train_speed(iter/s)": 1.192338 }, { "epoch": 4.948594928032899, "grad_norm": 1.9687632322311401, "learning_rate": 2.6077979106226002e-08, "loss": 2.2677852630615236, "memory(GiB)": 130.55, "step": 14440, "token_acc": 0.5221571906354515, "train_speed(iter/s)": 1.192371 }, { "epoch": 4.950308430431803, "grad_norm": 1.8274730443954468, "learning_rate": 2.4368561576931793e-08, "loss": 2.3464849472045897, "memory(GiB)": 130.55, "step": 14445, "token_acc": 0.4972608512431521, "train_speed(iter/s)": 1.192355 }, { "epoch": 4.952021932830706, "grad_norm": 1.940651774406433, "learning_rate": 2.27170718987757e-08, "loss": 2.1730390548706056, "memory(GiB)": 130.55, "step": 14450, "token_acc": 0.5270607826810991, "train_speed(iter/s)": 1.192382 }, { "epoch": 4.95373543522961, "grad_norm": 1.8532999753952026, "learning_rate": 2.112351198603757e-08, "loss": 2.1792078018188477, "memory(GiB)": 130.55, "step": 14455, "token_acc": 0.5352606219886115, "train_speed(iter/s)": 1.192376 }, { "epoch": 4.955448937628512, "grad_norm": 2.0167243480682373, "learning_rate": 1.958788368583986e-08, "loss": 2.242079162597656, "memory(GiB)": 130.55, "step": 14460, "token_acc": 0.5163339382940109, "train_speed(iter/s)": 1.19239 }, { "epoch": 4.957162440027416, "grad_norm": 2.881985664367676, "learning_rate": 1.8110188778169835e-08, "loss": 2.240383338928223, "memory(GiB)": 130.55, "step": 14465, "token_acc": 0.519134775374376, "train_speed(iter/s)": 1.192419 }, { "epoch": 4.958875942426319, "grad_norm": 2.028731346130371, "learning_rate": 1.6690428975857374e-08, "loss": 2.2353944778442383, "memory(GiB)": 130.55, "step": 14470, "token_acc": 0.5341772151898734, "train_speed(iter/s)": 1.192397 }, { "epoch": 4.960589444825223, "grad_norm": 2.0534753799438477, "learning_rate": 1.5328605924569418e-08, "loss": 2.3037452697753906, "memory(GiB)": 130.55, "step": 14475, "token_acc": 0.511437908496732, "train_speed(iter/s)": 1.192358 }, { "epoch": 4.962302947224126, "grad_norm": 2.1362569332122803, "learning_rate": 1.402472120283216e-08, "loss": 2.2141571044921875, "memory(GiB)": 130.55, "step": 14480, "token_acc": 0.5321585903083701, "train_speed(iter/s)": 1.19237 }, { "epoch": 4.9640164496230295, "grad_norm": 2.2847423553466797, "learning_rate": 1.2778776322008857e-08, "loss": 2.0940788269042967, "memory(GiB)": 130.55, "step": 14485, "token_acc": 0.5260089686098655, "train_speed(iter/s)": 1.192367 }, { "epoch": 4.965729952021933, "grad_norm": 2.1345536708831787, "learning_rate": 1.1590772726294275e-08, "loss": 2.279332160949707, "memory(GiB)": 130.55, "step": 14490, "token_acc": 0.5227464195450716, "train_speed(iter/s)": 1.192393 }, { "epoch": 4.967443454420836, "grad_norm": 2.2825701236724854, "learning_rate": 1.0460711792731337e-08, "loss": 2.218735122680664, "memory(GiB)": 130.55, "step": 14495, "token_acc": 0.5194238323876037, "train_speed(iter/s)": 1.192424 }, { "epoch": 4.96915695681974, "grad_norm": 1.8039098978042603, "learning_rate": 9.388594831200026e-09, "loss": 2.229032516479492, "memory(GiB)": 130.55, "step": 14500, "token_acc": 0.5234171725932351, "train_speed(iter/s)": 1.192391 }, { "epoch": 4.96915695681974, "eval_loss": 2.0703091621398926, "eval_runtime": 3.7109, "eval_samples_per_second": 26.947, "eval_steps_per_second": 26.947, "eval_token_acc": 0.49575070821529743, "step": 14500 } ], "logging_steps": 5, "max_steps": 14590, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.95089234805563e+18, "train_batch_size": 64, "trial_name": null, "trial_params": null }