diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.7504540701194342, + "epoch": 0.8004843414607298, "eval_steps": 500, - "global_step": 13635, + "global_step": 14544, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -95452,6 +95452,6369 @@ "learning_rate": 6.935121468327907e-06, "loss": 0.7858, "step": 13635 + }, + { + "epoch": 0.7505091089217899, + "grad_norm": 0.7102827429771423, + "learning_rate": 6.934721774701824e-06, + "loss": 0.7485, + "step": 13636 + }, + { + "epoch": 0.7505641477241455, + "grad_norm": 0.7031061053276062, + "learning_rate": 6.934322066534891e-06, + "loss": 0.7154, + "step": 13637 + }, + { + "epoch": 0.7506191865265012, + "grad_norm": 0.6468148231506348, + "learning_rate": 6.933922343830112e-06, + "loss": 0.729, + "step": 13638 + }, + { + "epoch": 0.7506742253288569, + "grad_norm": 0.8570408225059509, + "learning_rate": 6.933522606590489e-06, + "loss": 0.6922, + "step": 13639 + }, + { + "epoch": 0.7507292641312125, + "grad_norm": 0.6836286783218384, + "learning_rate": 6.933122854819027e-06, + "loss": 0.7982, + "step": 13640 + }, + { + "epoch": 0.7507843029335681, + "grad_norm": 1.052017092704773, + "learning_rate": 6.9327230885187344e-06, + "loss": 0.7522, + "step": 13641 + }, + { + "epoch": 0.7508393417359238, + "grad_norm": 0.6352099180221558, + "learning_rate": 6.932323307692611e-06, + "loss": 0.6724, + "step": 13642 + }, + { + "epoch": 0.7508943805382795, + "grad_norm": 0.7046655416488647, + "learning_rate": 6.931923512343663e-06, + "loss": 0.7732, + "step": 13643 + }, + { + "epoch": 0.7509494193406352, + "grad_norm": 0.7600587010383606, + "learning_rate": 6.931523702474893e-06, + "loss": 0.7013, + "step": 13644 + }, + { + "epoch": 0.7510044581429908, + "grad_norm": 0.674828052520752, + "learning_rate": 6.9311238780893095e-06, + "loss": 0.7022, + "step": 13645 + }, + { + "epoch": 0.7510594969453465, + "grad_norm": 0.7517798542976379, + "learning_rate": 6.930724039189916e-06, + "loss": 0.7248, + "step": 13646 + }, + { + "epoch": 0.7511145357477022, + "grad_norm": 0.7851112484931946, + "learning_rate": 6.930324185779716e-06, + "loss": 0.8025, + "step": 13647 + }, + { + "epoch": 0.7511695745500578, + "grad_norm": 0.6545413732528687, + "learning_rate": 6.929924317861717e-06, + "loss": 0.781, + "step": 13648 + }, + { + "epoch": 0.7512246133524134, + "grad_norm": 0.7079984545707703, + "learning_rate": 6.929524435438923e-06, + "loss": 0.8033, + "step": 13649 + }, + { + "epoch": 0.7512796521547691, + "grad_norm": 0.6501914262771606, + "learning_rate": 6.929124538514341e-06, + "loss": 0.7525, + "step": 13650 + }, + { + "epoch": 0.7513346909571248, + "grad_norm": 0.7697597742080688, + "learning_rate": 6.928724627090975e-06, + "loss": 0.7358, + "step": 13651 + }, + { + "epoch": 0.7513897297594805, + "grad_norm": 0.8155171275138855, + "learning_rate": 6.928324701171832e-06, + "loss": 0.7389, + "step": 13652 + }, + { + "epoch": 0.7514447685618361, + "grad_norm": 0.6969262361526489, + "learning_rate": 6.927924760759914e-06, + "loss": 0.8349, + "step": 13653 + }, + { + "epoch": 0.7514998073641918, + "grad_norm": 0.6736776828765869, + "learning_rate": 6.927524805858233e-06, + "loss": 0.7379, + "step": 13654 + }, + { + "epoch": 0.7515548461665474, + "grad_norm": 0.6362389922142029, + "learning_rate": 6.927124836469788e-06, + "loss": 0.7479, + "step": 13655 + }, + { + "epoch": 0.7516098849689031, + "grad_norm": 0.688922643661499, + "learning_rate": 6.92672485259759e-06, + "loss": 0.7828, + "step": 13656 + }, + { + "epoch": 0.7516649237712587, + "grad_norm": 0.7098214030265808, + "learning_rate": 6.926324854244644e-06, + "loss": 0.6084, + "step": 13657 + }, + { + "epoch": 0.7517199625736144, + "grad_norm": 0.6436209678649902, + "learning_rate": 6.925924841413956e-06, + "loss": 0.687, + "step": 13658 + }, + { + "epoch": 0.7517750013759701, + "grad_norm": 0.6051730513572693, + "learning_rate": 6.925524814108533e-06, + "loss": 0.6884, + "step": 13659 + }, + { + "epoch": 0.7518300401783257, + "grad_norm": 0.6347759962081909, + "learning_rate": 6.92512477233138e-06, + "loss": 0.7057, + "step": 13660 + }, + { + "epoch": 0.7518850789806814, + "grad_norm": 0.6917054653167725, + "learning_rate": 6.924724716085505e-06, + "loss": 0.8374, + "step": 13661 + }, + { + "epoch": 0.751940117783037, + "grad_norm": 0.7676698565483093, + "learning_rate": 6.924324645373914e-06, + "loss": 0.7435, + "step": 13662 + }, + { + "epoch": 0.7519951565853927, + "grad_norm": 0.6601388454437256, + "learning_rate": 6.923924560199613e-06, + "loss": 0.7168, + "step": 13663 + }, + { + "epoch": 0.7520501953877483, + "grad_norm": 0.6342683434486389, + "learning_rate": 6.923524460565611e-06, + "loss": 0.7382, + "step": 13664 + }, + { + "epoch": 0.752105234190104, + "grad_norm": 0.6703974604606628, + "learning_rate": 6.923124346474915e-06, + "loss": 0.7687, + "step": 13665 + }, + { + "epoch": 0.7521602729924597, + "grad_norm": 0.6937074661254883, + "learning_rate": 6.922724217930531e-06, + "loss": 0.7687, + "step": 13666 + }, + { + "epoch": 0.7522153117948154, + "grad_norm": 0.7919568419456482, + "learning_rate": 6.922324074935466e-06, + "loss": 0.7328, + "step": 13667 + }, + { + "epoch": 0.752270350597171, + "grad_norm": 0.668331503868103, + "learning_rate": 6.9219239174927275e-06, + "loss": 0.7654, + "step": 13668 + }, + { + "epoch": 0.7523253893995266, + "grad_norm": 0.6298941969871521, + "learning_rate": 6.921523745605323e-06, + "loss": 0.719, + "step": 13669 + }, + { + "epoch": 0.7523804282018823, + "grad_norm": 0.6539381146430969, + "learning_rate": 6.921123559276262e-06, + "loss": 0.6681, + "step": 13670 + }, + { + "epoch": 0.752435467004238, + "grad_norm": 1.0692330598831177, + "learning_rate": 6.920723358508548e-06, + "loss": 0.7914, + "step": 13671 + }, + { + "epoch": 0.7524905058065936, + "grad_norm": 0.7410482168197632, + "learning_rate": 6.920323143305193e-06, + "loss": 0.8331, + "step": 13672 + }, + { + "epoch": 0.7525455446089493, + "grad_norm": 0.6976327300071716, + "learning_rate": 6.919922913669203e-06, + "loss": 0.8131, + "step": 13673 + }, + { + "epoch": 0.752600583411305, + "grad_norm": 0.646442174911499, + "learning_rate": 6.919522669603587e-06, + "loss": 0.7658, + "step": 13674 + }, + { + "epoch": 0.7526556222136607, + "grad_norm": 0.6257727146148682, + "learning_rate": 6.919122411111352e-06, + "loss": 0.666, + "step": 13675 + }, + { + "epoch": 0.7527106610160162, + "grad_norm": 0.6913230419158936, + "learning_rate": 6.918722138195506e-06, + "loss": 0.6935, + "step": 13676 + }, + { + "epoch": 0.7527656998183719, + "grad_norm": 0.6282557249069214, + "learning_rate": 6.918321850859059e-06, + "loss": 0.7042, + "step": 13677 + }, + { + "epoch": 0.7528207386207276, + "grad_norm": 0.6980175971984863, + "learning_rate": 6.917921549105018e-06, + "loss": 0.6757, + "step": 13678 + }, + { + "epoch": 0.7528757774230833, + "grad_norm": 0.6954337954521179, + "learning_rate": 6.917521232936393e-06, + "loss": 0.729, + "step": 13679 + }, + { + "epoch": 0.7529308162254389, + "grad_norm": 0.6813758015632629, + "learning_rate": 6.91712090235619e-06, + "loss": 0.6964, + "step": 13680 + }, + { + "epoch": 0.7529858550277946, + "grad_norm": 1.0940780639648438, + "learning_rate": 6.916720557367419e-06, + "loss": 0.7853, + "step": 13681 + }, + { + "epoch": 0.7530408938301503, + "grad_norm": 0.6899382472038269, + "learning_rate": 6.9163201979730906e-06, + "loss": 0.7639, + "step": 13682 + }, + { + "epoch": 0.753095932632506, + "grad_norm": 0.660252034664154, + "learning_rate": 6.915919824176213e-06, + "loss": 0.7068, + "step": 13683 + }, + { + "epoch": 0.7531509714348615, + "grad_norm": 0.6454583406448364, + "learning_rate": 6.915519435979795e-06, + "loss": 0.7268, + "step": 13684 + }, + { + "epoch": 0.7532060102372172, + "grad_norm": 0.7292754650115967, + "learning_rate": 6.915119033386843e-06, + "loss": 0.8131, + "step": 13685 + }, + { + "epoch": 0.7532610490395729, + "grad_norm": 0.6312932372093201, + "learning_rate": 6.914718616400372e-06, + "loss": 0.6977, + "step": 13686 + }, + { + "epoch": 0.7533160878419286, + "grad_norm": 0.8528029322624207, + "learning_rate": 6.914318185023388e-06, + "loss": 0.8403, + "step": 13687 + }, + { + "epoch": 0.7533711266442842, + "grad_norm": 0.758721649646759, + "learning_rate": 6.9139177392589e-06, + "loss": 0.7, + "step": 13688 + }, + { + "epoch": 0.7534261654466399, + "grad_norm": 0.6678142547607422, + "learning_rate": 6.913517279109919e-06, + "loss": 0.6251, + "step": 13689 + }, + { + "epoch": 0.7534812042489956, + "grad_norm": 0.6136146783828735, + "learning_rate": 6.913116804579455e-06, + "loss": 0.653, + "step": 13690 + }, + { + "epoch": 0.7535362430513513, + "grad_norm": 0.7546648383140564, + "learning_rate": 6.912716315670517e-06, + "loss": 0.8202, + "step": 13691 + }, + { + "epoch": 0.7535912818537068, + "grad_norm": 0.7232012152671814, + "learning_rate": 6.912315812386114e-06, + "loss": 0.7993, + "step": 13692 + }, + { + "epoch": 0.7536463206560625, + "grad_norm": 0.7288710474967957, + "learning_rate": 6.911915294729258e-06, + "loss": 0.7702, + "step": 13693 + }, + { + "epoch": 0.7537013594584182, + "grad_norm": 0.6847403049468994, + "learning_rate": 6.9115147627029575e-06, + "loss": 0.8141, + "step": 13694 + }, + { + "epoch": 0.7537563982607739, + "grad_norm": 0.62345951795578, + "learning_rate": 6.9111142163102255e-06, + "loss": 0.6832, + "step": 13695 + }, + { + "epoch": 0.7538114370631295, + "grad_norm": 0.7275232672691345, + "learning_rate": 6.9107136555540695e-06, + "loss": 0.7548, + "step": 13696 + }, + { + "epoch": 0.7538664758654852, + "grad_norm": 0.6724695563316345, + "learning_rate": 6.910313080437501e-06, + "loss": 0.7755, + "step": 13697 + }, + { + "epoch": 0.7539215146678409, + "grad_norm": 0.8446974754333496, + "learning_rate": 6.90991249096353e-06, + "loss": 0.827, + "step": 13698 + }, + { + "epoch": 0.7539765534701965, + "grad_norm": 0.7124913930892944, + "learning_rate": 6.9095118871351705e-06, + "loss": 0.7463, + "step": 13699 + }, + { + "epoch": 0.7540315922725521, + "grad_norm": 0.6916043162345886, + "learning_rate": 6.90911126895543e-06, + "loss": 0.714, + "step": 13700 + }, + { + "epoch": 0.7540866310749078, + "grad_norm": 0.7585330009460449, + "learning_rate": 6.908710636427319e-06, + "loss": 0.6731, + "step": 13701 + }, + { + "epoch": 0.7541416698772635, + "grad_norm": 0.6905520558357239, + "learning_rate": 6.90830998955385e-06, + "loss": 0.726, + "step": 13702 + }, + { + "epoch": 0.7541967086796191, + "grad_norm": 0.7482494115829468, + "learning_rate": 6.907909328338035e-06, + "loss": 0.7269, + "step": 13703 + }, + { + "epoch": 0.7542517474819748, + "grad_norm": 0.7565957307815552, + "learning_rate": 6.907508652782884e-06, + "loss": 0.6959, + "step": 13704 + }, + { + "epoch": 0.7543067862843305, + "grad_norm": 0.7458370923995972, + "learning_rate": 6.9071079628914075e-06, + "loss": 0.7448, + "step": 13705 + }, + { + "epoch": 0.7543618250866861, + "grad_norm": 1.3538293838500977, + "learning_rate": 6.9067072586666185e-06, + "loss": 0.8164, + "step": 13706 + }, + { + "epoch": 0.7544168638890417, + "grad_norm": 0.6217493414878845, + "learning_rate": 6.906306540111528e-06, + "loss": 0.7001, + "step": 13707 + }, + { + "epoch": 0.7544719026913974, + "grad_norm": 0.6862730383872986, + "learning_rate": 6.9059058072291485e-06, + "loss": 0.7921, + "step": 13708 + }, + { + "epoch": 0.7545269414937531, + "grad_norm": 0.6684688925743103, + "learning_rate": 6.905505060022491e-06, + "loss": 0.6736, + "step": 13709 + }, + { + "epoch": 0.7545819802961088, + "grad_norm": 0.6581160426139832, + "learning_rate": 6.905104298494567e-06, + "loss": 0.7581, + "step": 13710 + }, + { + "epoch": 0.7546370190984644, + "grad_norm": 0.7772610783576965, + "learning_rate": 6.9047035226483885e-06, + "loss": 0.7984, + "step": 13711 + }, + { + "epoch": 0.7546920579008201, + "grad_norm": 0.6856822371482849, + "learning_rate": 6.90430273248697e-06, + "loss": 0.8232, + "step": 13712 + }, + { + "epoch": 0.7547470967031757, + "grad_norm": 0.7250725626945496, + "learning_rate": 6.903901928013322e-06, + "loss": 0.7844, + "step": 13713 + }, + { + "epoch": 0.7548021355055314, + "grad_norm": 0.7034164667129517, + "learning_rate": 6.9035011092304545e-06, + "loss": 0.8293, + "step": 13714 + }, + { + "epoch": 0.754857174307887, + "grad_norm": 0.6783095002174377, + "learning_rate": 6.903100276141383e-06, + "loss": 0.6841, + "step": 13715 + }, + { + "epoch": 0.7549122131102427, + "grad_norm": 0.6180121302604675, + "learning_rate": 6.90269942874912e-06, + "loss": 0.7111, + "step": 13716 + }, + { + "epoch": 0.7549672519125984, + "grad_norm": 0.70428466796875, + "learning_rate": 6.902298567056677e-06, + "loss": 0.8758, + "step": 13717 + }, + { + "epoch": 0.7550222907149541, + "grad_norm": 0.8130238652229309, + "learning_rate": 6.9018976910670665e-06, + "loss": 0.6443, + "step": 13718 + }, + { + "epoch": 0.7550773295173097, + "grad_norm": 0.6910800933837891, + "learning_rate": 6.901496800783302e-06, + "loss": 0.7231, + "step": 13719 + }, + { + "epoch": 0.7551323683196653, + "grad_norm": 0.700933575630188, + "learning_rate": 6.901095896208398e-06, + "loss": 0.6785, + "step": 13720 + }, + { + "epoch": 0.755187407122021, + "grad_norm": 0.7407829761505127, + "learning_rate": 6.9006949773453656e-06, + "loss": 0.694, + "step": 13721 + }, + { + "epoch": 0.7552424459243767, + "grad_norm": 0.7907935380935669, + "learning_rate": 6.900294044197218e-06, + "loss": 0.7674, + "step": 13722 + }, + { + "epoch": 0.7552974847267323, + "grad_norm": 0.6585111021995544, + "learning_rate": 6.89989309676697e-06, + "loss": 0.6785, + "step": 13723 + }, + { + "epoch": 0.755352523529088, + "grad_norm": 0.7611724138259888, + "learning_rate": 6.899492135057633e-06, + "loss": 0.8028, + "step": 13724 + }, + { + "epoch": 0.7554075623314437, + "grad_norm": 0.6412070989608765, + "learning_rate": 6.899091159072222e-06, + "loss": 0.7634, + "step": 13725 + }, + { + "epoch": 0.7554626011337994, + "grad_norm": 0.7712366580963135, + "learning_rate": 6.898690168813751e-06, + "loss": 0.8275, + "step": 13726 + }, + { + "epoch": 0.755517639936155, + "grad_norm": 0.6826579570770264, + "learning_rate": 6.898289164285232e-06, + "loss": 0.7949, + "step": 13727 + }, + { + "epoch": 0.7555726787385106, + "grad_norm": 0.7501955628395081, + "learning_rate": 6.897888145489681e-06, + "loss": 0.7846, + "step": 13728 + }, + { + "epoch": 0.7556277175408663, + "grad_norm": 0.6493077874183655, + "learning_rate": 6.8974871124301075e-06, + "loss": 0.7294, + "step": 13729 + }, + { + "epoch": 0.755682756343222, + "grad_norm": 0.6854347586631775, + "learning_rate": 6.897086065109532e-06, + "loss": 0.7121, + "step": 13730 + }, + { + "epoch": 0.7557377951455776, + "grad_norm": 0.7376317977905273, + "learning_rate": 6.896685003530964e-06, + "loss": 0.7719, + "step": 13731 + }, + { + "epoch": 0.7557928339479333, + "grad_norm": 0.8477175235748291, + "learning_rate": 6.89628392769742e-06, + "loss": 0.7981, + "step": 13732 + }, + { + "epoch": 0.755847872750289, + "grad_norm": 0.6611722111701965, + "learning_rate": 6.8958828376119125e-06, + "loss": 0.7628, + "step": 13733 + }, + { + "epoch": 0.7559029115526447, + "grad_norm": 0.6898290514945984, + "learning_rate": 6.895481733277458e-06, + "loss": 0.7578, + "step": 13734 + }, + { + "epoch": 0.7559579503550002, + "grad_norm": 0.6566810607910156, + "learning_rate": 6.89508061469707e-06, + "loss": 0.6919, + "step": 13735 + }, + { + "epoch": 0.7560129891573559, + "grad_norm": 0.6395933032035828, + "learning_rate": 6.894679481873763e-06, + "loss": 0.7334, + "step": 13736 + }, + { + "epoch": 0.7560680279597116, + "grad_norm": 0.7060876488685608, + "learning_rate": 6.8942783348105535e-06, + "loss": 0.7405, + "step": 13737 + }, + { + "epoch": 0.7561230667620673, + "grad_norm": 0.7303228974342346, + "learning_rate": 6.893877173510454e-06, + "loss": 0.8563, + "step": 13738 + }, + { + "epoch": 0.7561781055644229, + "grad_norm": 0.663474977016449, + "learning_rate": 6.893475997976481e-06, + "loss": 0.703, + "step": 13739 + }, + { + "epoch": 0.7562331443667786, + "grad_norm": 0.8005428910255432, + "learning_rate": 6.893074808211649e-06, + "loss": 0.7219, + "step": 13740 + }, + { + "epoch": 0.7562881831691343, + "grad_norm": 1.3285688161849976, + "learning_rate": 6.892673604218972e-06, + "loss": 0.672, + "step": 13741 + }, + { + "epoch": 0.75634322197149, + "grad_norm": 0.6958948373794556, + "learning_rate": 6.892272386001469e-06, + "loss": 0.7728, + "step": 13742 + }, + { + "epoch": 0.7563982607738455, + "grad_norm": 0.6840598583221436, + "learning_rate": 6.891871153562153e-06, + "loss": 0.7881, + "step": 13743 + }, + { + "epoch": 0.7564532995762012, + "grad_norm": 0.7184257507324219, + "learning_rate": 6.891469906904039e-06, + "loss": 0.736, + "step": 13744 + }, + { + "epoch": 0.7565083383785569, + "grad_norm": 0.6611571311950684, + "learning_rate": 6.891068646030143e-06, + "loss": 0.7171, + "step": 13745 + }, + { + "epoch": 0.7565633771809125, + "grad_norm": 0.8237559795379639, + "learning_rate": 6.890667370943482e-06, + "loss": 0.8669, + "step": 13746 + }, + { + "epoch": 0.7566184159832682, + "grad_norm": 0.6898388266563416, + "learning_rate": 6.890266081647072e-06, + "loss": 0.6654, + "step": 13747 + }, + { + "epoch": 0.7566734547856239, + "grad_norm": 0.6541711688041687, + "learning_rate": 6.889864778143928e-06, + "loss": 0.7455, + "step": 13748 + }, + { + "epoch": 0.7567284935879796, + "grad_norm": 0.6518157124519348, + "learning_rate": 6.8894634604370655e-06, + "loss": 0.7174, + "step": 13749 + }, + { + "epoch": 0.7567835323903351, + "grad_norm": 0.7992080450057983, + "learning_rate": 6.889062128529502e-06, + "loss": 0.7349, + "step": 13750 + }, + { + "epoch": 0.7568385711926908, + "grad_norm": 0.5748338103294373, + "learning_rate": 6.888660782424253e-06, + "loss": 0.5398, + "step": 13751 + }, + { + "epoch": 0.7568936099950465, + "grad_norm": 0.6507781744003296, + "learning_rate": 6.8882594221243344e-06, + "loss": 0.6762, + "step": 13752 + }, + { + "epoch": 0.7569486487974022, + "grad_norm": 0.6908432841300964, + "learning_rate": 6.887858047632764e-06, + "loss": 0.8034, + "step": 13753 + }, + { + "epoch": 0.7570036875997578, + "grad_norm": 0.6497751474380493, + "learning_rate": 6.887456658952557e-06, + "loss": 0.6351, + "step": 13754 + }, + { + "epoch": 0.7570587264021135, + "grad_norm": 0.7233273386955261, + "learning_rate": 6.887055256086732e-06, + "loss": 0.7096, + "step": 13755 + }, + { + "epoch": 0.7571137652044692, + "grad_norm": 0.6587454676628113, + "learning_rate": 6.886653839038305e-06, + "loss": 0.7354, + "step": 13756 + }, + { + "epoch": 0.7571688040068248, + "grad_norm": 0.6654310822486877, + "learning_rate": 6.886252407810292e-06, + "loss": 0.7776, + "step": 13757 + }, + { + "epoch": 0.7572238428091804, + "grad_norm": 0.796604573726654, + "learning_rate": 6.885850962405711e-06, + "loss": 0.7925, + "step": 13758 + }, + { + "epoch": 0.7572788816115361, + "grad_norm": 0.7053457498550415, + "learning_rate": 6.8854495028275795e-06, + "loss": 0.7893, + "step": 13759 + }, + { + "epoch": 0.7573339204138918, + "grad_norm": 0.7201200127601624, + "learning_rate": 6.885048029078914e-06, + "loss": 0.8346, + "step": 13760 + }, + { + "epoch": 0.7573889592162475, + "grad_norm": 0.8437653183937073, + "learning_rate": 6.884646541162731e-06, + "loss": 0.7468, + "step": 13761 + }, + { + "epoch": 0.7574439980186031, + "grad_norm": 0.6910028457641602, + "learning_rate": 6.884245039082052e-06, + "loss": 0.7362, + "step": 13762 + }, + { + "epoch": 0.7574990368209588, + "grad_norm": 0.6896274089813232, + "learning_rate": 6.883843522839889e-06, + "loss": 0.6515, + "step": 13763 + }, + { + "epoch": 0.7575540756233144, + "grad_norm": 0.9833560585975647, + "learning_rate": 6.8834419924392636e-06, + "loss": 0.8764, + "step": 13764 + }, + { + "epoch": 0.7576091144256701, + "grad_norm": 0.7130032181739807, + "learning_rate": 6.88304044788319e-06, + "loss": 0.7631, + "step": 13765 + }, + { + "epoch": 0.7576641532280257, + "grad_norm": 0.7059195041656494, + "learning_rate": 6.882638889174691e-06, + "loss": 0.8147, + "step": 13766 + }, + { + "epoch": 0.7577191920303814, + "grad_norm": 0.6451989412307739, + "learning_rate": 6.882237316316781e-06, + "loss": 0.6638, + "step": 13767 + }, + { + "epoch": 0.7577742308327371, + "grad_norm": 0.7541074752807617, + "learning_rate": 6.881835729312481e-06, + "loss": 0.6918, + "step": 13768 + }, + { + "epoch": 0.7578292696350928, + "grad_norm": 0.7227535843849182, + "learning_rate": 6.881434128164805e-06, + "loss": 0.7759, + "step": 13769 + }, + { + "epoch": 0.7578843084374484, + "grad_norm": 0.673112154006958, + "learning_rate": 6.881032512876774e-06, + "loss": 0.7328, + "step": 13770 + }, + { + "epoch": 0.757939347239804, + "grad_norm": 0.6536681056022644, + "learning_rate": 6.880630883451407e-06, + "loss": 0.7677, + "step": 13771 + }, + { + "epoch": 0.7579943860421597, + "grad_norm": 0.8517894148826599, + "learning_rate": 6.880229239891721e-06, + "loss": 0.8566, + "step": 13772 + }, + { + "epoch": 0.7580494248445154, + "grad_norm": 0.8260573148727417, + "learning_rate": 6.879827582200737e-06, + "loss": 0.8228, + "step": 13773 + }, + { + "epoch": 0.758104463646871, + "grad_norm": 0.7460072040557861, + "learning_rate": 6.87942591038147e-06, + "loss": 0.8047, + "step": 13774 + }, + { + "epoch": 0.7581595024492267, + "grad_norm": 0.7648436427116394, + "learning_rate": 6.879024224436942e-06, + "loss": 0.852, + "step": 13775 + }, + { + "epoch": 0.7582145412515824, + "grad_norm": 0.7161253094673157, + "learning_rate": 6.878622524370171e-06, + "loss": 0.7638, + "step": 13776 + }, + { + "epoch": 0.7582695800539381, + "grad_norm": 0.6559579372406006, + "learning_rate": 6.878220810184175e-06, + "loss": 0.6932, + "step": 13777 + }, + { + "epoch": 0.7583246188562937, + "grad_norm": 0.6846898198127747, + "learning_rate": 6.877819081881975e-06, + "loss": 0.7098, + "step": 13778 + }, + { + "epoch": 0.7583796576586493, + "grad_norm": 0.7569675445556641, + "learning_rate": 6.87741733946659e-06, + "loss": 0.687, + "step": 13779 + }, + { + "epoch": 0.758434696461005, + "grad_norm": 0.7513766288757324, + "learning_rate": 6.877015582941038e-06, + "loss": 0.8673, + "step": 13780 + }, + { + "epoch": 0.7584897352633607, + "grad_norm": 0.7158082127571106, + "learning_rate": 6.876613812308338e-06, + "loss": 0.7563, + "step": 13781 + }, + { + "epoch": 0.7585447740657163, + "grad_norm": 0.6307277083396912, + "learning_rate": 6.876212027571513e-06, + "loss": 0.6725, + "step": 13782 + }, + { + "epoch": 0.758599812868072, + "grad_norm": 0.735090434551239, + "learning_rate": 6.87581022873358e-06, + "loss": 0.763, + "step": 13783 + }, + { + "epoch": 0.7586548516704277, + "grad_norm": 0.6412403583526611, + "learning_rate": 6.8754084157975594e-06, + "loss": 0.5992, + "step": 13784 + }, + { + "epoch": 0.7587098904727834, + "grad_norm": 0.639854907989502, + "learning_rate": 6.875006588766472e-06, + "loss": 0.7372, + "step": 13785 + }, + { + "epoch": 0.7587649292751389, + "grad_norm": 0.6855082511901855, + "learning_rate": 6.8746047476433365e-06, + "loss": 0.7709, + "step": 13786 + }, + { + "epoch": 0.7588199680774946, + "grad_norm": 0.6838769912719727, + "learning_rate": 6.874202892431173e-06, + "loss": 0.7545, + "step": 13787 + }, + { + "epoch": 0.7588750068798503, + "grad_norm": 1.1560181379318237, + "learning_rate": 6.873801023133002e-06, + "loss": 0.7291, + "step": 13788 + }, + { + "epoch": 0.7589300456822059, + "grad_norm": 0.7140469551086426, + "learning_rate": 6.873399139751844e-06, + "loss": 0.7214, + "step": 13789 + }, + { + "epoch": 0.7589850844845616, + "grad_norm": 0.6856355667114258, + "learning_rate": 6.8729972422907195e-06, + "loss": 0.7417, + "step": 13790 + }, + { + "epoch": 0.7590401232869173, + "grad_norm": 0.7856155633926392, + "learning_rate": 6.8725953307526505e-06, + "loss": 0.7484, + "step": 13791 + }, + { + "epoch": 0.759095162089273, + "grad_norm": 0.8107255697250366, + "learning_rate": 6.8721934051406555e-06, + "loss": 0.7568, + "step": 13792 + }, + { + "epoch": 0.7591502008916285, + "grad_norm": 0.6590837240219116, + "learning_rate": 6.871791465457757e-06, + "loss": 0.7495, + "step": 13793 + }, + { + "epoch": 0.7592052396939842, + "grad_norm": 0.7531588077545166, + "learning_rate": 6.8713895117069715e-06, + "loss": 0.7434, + "step": 13794 + }, + { + "epoch": 0.7592602784963399, + "grad_norm": 0.6818329095840454, + "learning_rate": 6.870987543891326e-06, + "loss": 0.7128, + "step": 13795 + }, + { + "epoch": 0.7593153172986956, + "grad_norm": 0.6082884669303894, + "learning_rate": 6.8705855620138395e-06, + "loss": 0.7437, + "step": 13796 + }, + { + "epoch": 0.7593703561010512, + "grad_norm": 0.9583787322044373, + "learning_rate": 6.870183566077532e-06, + "loss": 0.7779, + "step": 13797 + }, + { + "epoch": 0.7594253949034069, + "grad_norm": 0.6684621572494507, + "learning_rate": 6.869781556085425e-06, + "loss": 0.5856, + "step": 13798 + }, + { + "epoch": 0.7594804337057626, + "grad_norm": 0.6225603222846985, + "learning_rate": 6.869379532040541e-06, + "loss": 0.7407, + "step": 13799 + }, + { + "epoch": 0.7595354725081183, + "grad_norm": 0.6973103284835815, + "learning_rate": 6.8689774939459005e-06, + "loss": 0.7789, + "step": 13800 + }, + { + "epoch": 0.7595905113104738, + "grad_norm": 0.6655399203300476, + "learning_rate": 6.868575441804526e-06, + "loss": 0.7489, + "step": 13801 + }, + { + "epoch": 0.7596455501128295, + "grad_norm": 0.7066664695739746, + "learning_rate": 6.868173375619437e-06, + "loss": 0.7035, + "step": 13802 + }, + { + "epoch": 0.7597005889151852, + "grad_norm": 1.0646852254867554, + "learning_rate": 6.867771295393658e-06, + "loss": 0.8488, + "step": 13803 + }, + { + "epoch": 0.7597556277175409, + "grad_norm": 0.6551353335380554, + "learning_rate": 6.867369201130209e-06, + "loss": 0.7147, + "step": 13804 + }, + { + "epoch": 0.7598106665198965, + "grad_norm": 0.6749850511550903, + "learning_rate": 6.866967092832115e-06, + "loss": 0.7963, + "step": 13805 + }, + { + "epoch": 0.7598657053222522, + "grad_norm": 0.6704042553901672, + "learning_rate": 6.866564970502394e-06, + "loss": 0.7992, + "step": 13806 + }, + { + "epoch": 0.7599207441246079, + "grad_norm": 0.7027791142463684, + "learning_rate": 6.866162834144071e-06, + "loss": 0.7931, + "step": 13807 + }, + { + "epoch": 0.7599757829269636, + "grad_norm": 0.7925322651863098, + "learning_rate": 6.865760683760169e-06, + "loss": 0.7826, + "step": 13808 + }, + { + "epoch": 0.7600308217293191, + "grad_norm": 0.7152161002159119, + "learning_rate": 6.865358519353708e-06, + "loss": 0.7481, + "step": 13809 + }, + { + "epoch": 0.7600858605316748, + "grad_norm": 0.6572757959365845, + "learning_rate": 6.864956340927711e-06, + "loss": 0.785, + "step": 13810 + }, + { + "epoch": 0.7601408993340305, + "grad_norm": 0.6848406791687012, + "learning_rate": 6.864554148485203e-06, + "loss": 0.6423, + "step": 13811 + }, + { + "epoch": 0.7601959381363862, + "grad_norm": 0.747597873210907, + "learning_rate": 6.864151942029205e-06, + "loss": 0.7901, + "step": 13812 + }, + { + "epoch": 0.7602509769387418, + "grad_norm": 0.7106720805168152, + "learning_rate": 6.863749721562738e-06, + "loss": 0.7488, + "step": 13813 + }, + { + "epoch": 0.7603060157410975, + "grad_norm": 0.6864057779312134, + "learning_rate": 6.8633474870888275e-06, + "loss": 0.7066, + "step": 13814 + }, + { + "epoch": 0.7603610545434532, + "grad_norm": 0.7022056579589844, + "learning_rate": 6.862945238610496e-06, + "loss": 0.6851, + "step": 13815 + }, + { + "epoch": 0.7604160933458088, + "grad_norm": 0.7361913919448853, + "learning_rate": 6.862542976130769e-06, + "loss": 0.7425, + "step": 13816 + }, + { + "epoch": 0.7604711321481644, + "grad_norm": 0.6723676323890686, + "learning_rate": 6.862140699652666e-06, + "loss": 0.7937, + "step": 13817 + }, + { + "epoch": 0.7605261709505201, + "grad_norm": 0.7491924166679382, + "learning_rate": 6.861738409179212e-06, + "loss": 0.7585, + "step": 13818 + }, + { + "epoch": 0.7605812097528758, + "grad_norm": 0.6772211790084839, + "learning_rate": 6.86133610471343e-06, + "loss": 0.7617, + "step": 13819 + }, + { + "epoch": 0.7606362485552315, + "grad_norm": 0.7819864153862, + "learning_rate": 6.860933786258344e-06, + "loss": 0.7924, + "step": 13820 + }, + { + "epoch": 0.7606912873575871, + "grad_norm": 0.6992526650428772, + "learning_rate": 6.86053145381698e-06, + "loss": 0.7054, + "step": 13821 + }, + { + "epoch": 0.7607463261599428, + "grad_norm": 0.7189231514930725, + "learning_rate": 6.860129107392357e-06, + "loss": 0.7603, + "step": 13822 + }, + { + "epoch": 0.7608013649622984, + "grad_norm": 0.7165294885635376, + "learning_rate": 6.859726746987503e-06, + "loss": 0.8118, + "step": 13823 + }, + { + "epoch": 0.7608564037646541, + "grad_norm": 0.6510334014892578, + "learning_rate": 6.85932437260544e-06, + "loss": 0.7584, + "step": 13824 + }, + { + "epoch": 0.7609114425670097, + "grad_norm": 0.7113379836082458, + "learning_rate": 6.8589219842491935e-06, + "loss": 0.7799, + "step": 13825 + }, + { + "epoch": 0.7609664813693654, + "grad_norm": 0.7441100478172302, + "learning_rate": 6.8585195819217856e-06, + "loss": 0.6468, + "step": 13826 + }, + { + "epoch": 0.7610215201717211, + "grad_norm": 1.0703508853912354, + "learning_rate": 6.858117165626244e-06, + "loss": 0.7922, + "step": 13827 + }, + { + "epoch": 0.7610765589740768, + "grad_norm": 0.7097275853157043, + "learning_rate": 6.857714735365589e-06, + "loss": 0.7594, + "step": 13828 + }, + { + "epoch": 0.7611315977764324, + "grad_norm": 0.7001124620437622, + "learning_rate": 6.857312291142848e-06, + "loss": 0.7679, + "step": 13829 + }, + { + "epoch": 0.761186636578788, + "grad_norm": 0.6898123621940613, + "learning_rate": 6.856909832961045e-06, + "loss": 0.7684, + "step": 13830 + }, + { + "epoch": 0.7612416753811437, + "grad_norm": 0.6535243391990662, + "learning_rate": 6.856507360823206e-06, + "loss": 0.6143, + "step": 13831 + }, + { + "epoch": 0.7612967141834993, + "grad_norm": 0.6726056933403015, + "learning_rate": 6.856104874732353e-06, + "loss": 0.7566, + "step": 13832 + }, + { + "epoch": 0.761351752985855, + "grad_norm": 0.8741437196731567, + "learning_rate": 6.855702374691513e-06, + "loss": 0.723, + "step": 13833 + }, + { + "epoch": 0.7614067917882107, + "grad_norm": 0.7025718092918396, + "learning_rate": 6.855299860703712e-06, + "loss": 0.8035, + "step": 13834 + }, + { + "epoch": 0.7614618305905664, + "grad_norm": 1.08286452293396, + "learning_rate": 6.8548973327719726e-06, + "loss": 0.7347, + "step": 13835 + }, + { + "epoch": 0.761516869392922, + "grad_norm": 0.6483243107795715, + "learning_rate": 6.854494790899322e-06, + "loss": 0.7326, + "step": 13836 + }, + { + "epoch": 0.7615719081952776, + "grad_norm": 0.6611089110374451, + "learning_rate": 6.854092235088784e-06, + "loss": 0.7619, + "step": 13837 + }, + { + "epoch": 0.7616269469976333, + "grad_norm": 0.8394322991371155, + "learning_rate": 6.853689665343385e-06, + "loss": 0.7017, + "step": 13838 + }, + { + "epoch": 0.761681985799989, + "grad_norm": 0.7131583094596863, + "learning_rate": 6.853287081666151e-06, + "loss": 0.7367, + "step": 13839 + }, + { + "epoch": 0.7617370246023446, + "grad_norm": 0.7316367626190186, + "learning_rate": 6.852884484060108e-06, + "loss": 0.7323, + "step": 13840 + }, + { + "epoch": 0.7617920634047003, + "grad_norm": 0.7639010548591614, + "learning_rate": 6.852481872528281e-06, + "loss": 0.819, + "step": 13841 + }, + { + "epoch": 0.761847102207056, + "grad_norm": 0.7118390202522278, + "learning_rate": 6.852079247073695e-06, + "loss": 0.7645, + "step": 13842 + }, + { + "epoch": 0.7619021410094117, + "grad_norm": 0.6885393857955933, + "learning_rate": 6.851676607699379e-06, + "loss": 0.8052, + "step": 13843 + }, + { + "epoch": 0.7619571798117672, + "grad_norm": 0.7034374475479126, + "learning_rate": 6.851273954408356e-06, + "loss": 0.8464, + "step": 13844 + }, + { + "epoch": 0.7620122186141229, + "grad_norm": 0.6531803607940674, + "learning_rate": 6.850871287203654e-06, + "loss": 0.7871, + "step": 13845 + }, + { + "epoch": 0.7620672574164786, + "grad_norm": 0.6637283563613892, + "learning_rate": 6.8504686060882995e-06, + "loss": 0.7326, + "step": 13846 + }, + { + "epoch": 0.7621222962188343, + "grad_norm": 0.6467694640159607, + "learning_rate": 6.850065911065318e-06, + "loss": 0.7936, + "step": 13847 + }, + { + "epoch": 0.7621773350211899, + "grad_norm": 0.6829109191894531, + "learning_rate": 6.849663202137735e-06, + "loss": 0.7003, + "step": 13848 + }, + { + "epoch": 0.7622323738235456, + "grad_norm": 0.7321386933326721, + "learning_rate": 6.84926047930858e-06, + "loss": 0.6921, + "step": 13849 + }, + { + "epoch": 0.7622874126259013, + "grad_norm": 0.6900202631950378, + "learning_rate": 6.8488577425808766e-06, + "loss": 0.7496, + "step": 13850 + }, + { + "epoch": 0.762342451428257, + "grad_norm": 0.6304247975349426, + "learning_rate": 6.848454991957655e-06, + "loss": 0.7135, + "step": 13851 + }, + { + "epoch": 0.7623974902306125, + "grad_norm": 0.7087798118591309, + "learning_rate": 6.8480522274419404e-06, + "loss": 0.7032, + "step": 13852 + }, + { + "epoch": 0.7624525290329682, + "grad_norm": 0.7777289152145386, + "learning_rate": 6.84764944903676e-06, + "loss": 0.7345, + "step": 13853 + }, + { + "epoch": 0.7625075678353239, + "grad_norm": 0.7282242774963379, + "learning_rate": 6.847246656745139e-06, + "loss": 0.6408, + "step": 13854 + }, + { + "epoch": 0.7625626066376796, + "grad_norm": 0.7798221707344055, + "learning_rate": 6.846843850570107e-06, + "loss": 0.9058, + "step": 13855 + }, + { + "epoch": 0.7626176454400352, + "grad_norm": 0.6145210266113281, + "learning_rate": 6.846441030514692e-06, + "loss": 0.6331, + "step": 13856 + }, + { + "epoch": 0.7626726842423909, + "grad_norm": 0.7079364061355591, + "learning_rate": 6.846038196581921e-06, + "loss": 0.7511, + "step": 13857 + }, + { + "epoch": 0.7627277230447466, + "grad_norm": 0.733635425567627, + "learning_rate": 6.845635348774821e-06, + "loss": 0.6957, + "step": 13858 + }, + { + "epoch": 0.7627827618471023, + "grad_norm": 0.8099489808082581, + "learning_rate": 6.845232487096419e-06, + "loss": 0.8068, + "step": 13859 + }, + { + "epoch": 0.7628378006494578, + "grad_norm": 0.6241937875747681, + "learning_rate": 6.844829611549744e-06, + "loss": 0.7102, + "step": 13860 + }, + { + "epoch": 0.7628928394518135, + "grad_norm": 0.8009611368179321, + "learning_rate": 6.8444267221378235e-06, + "loss": 0.8369, + "step": 13861 + }, + { + "epoch": 0.7629478782541692, + "grad_norm": 0.6700903177261353, + "learning_rate": 6.844023818863685e-06, + "loss": 0.8075, + "step": 13862 + }, + { + "epoch": 0.7630029170565249, + "grad_norm": 0.9378371834754944, + "learning_rate": 6.843620901730357e-06, + "loss": 0.7539, + "step": 13863 + }, + { + "epoch": 0.7630579558588805, + "grad_norm": 0.6704423427581787, + "learning_rate": 6.843217970740867e-06, + "loss": 0.7285, + "step": 13864 + }, + { + "epoch": 0.7631129946612362, + "grad_norm": 0.7236818075180054, + "learning_rate": 6.842815025898246e-06, + "loss": 0.7223, + "step": 13865 + }, + { + "epoch": 0.7631680334635919, + "grad_norm": 0.676184356212616, + "learning_rate": 6.84241206720552e-06, + "loss": 0.7286, + "step": 13866 + }, + { + "epoch": 0.7632230722659475, + "grad_norm": 0.6443304419517517, + "learning_rate": 6.842009094665717e-06, + "loss": 0.6806, + "step": 13867 + }, + { + "epoch": 0.7632781110683031, + "grad_norm": 0.7931790947914124, + "learning_rate": 6.841606108281868e-06, + "loss": 0.7801, + "step": 13868 + }, + { + "epoch": 0.7633331498706588, + "grad_norm": 0.7440798878669739, + "learning_rate": 6.841203108057e-06, + "loss": 0.8044, + "step": 13869 + }, + { + "epoch": 0.7633881886730145, + "grad_norm": 0.7226675748825073, + "learning_rate": 6.840800093994142e-06, + "loss": 0.718, + "step": 13870 + }, + { + "epoch": 0.7634432274753702, + "grad_norm": 0.7351265549659729, + "learning_rate": 6.8403970660963245e-06, + "loss": 0.8389, + "step": 13871 + }, + { + "epoch": 0.7634982662777258, + "grad_norm": 0.8326215744018555, + "learning_rate": 6.839994024366574e-06, + "loss": 0.8583, + "step": 13872 + }, + { + "epoch": 0.7635533050800815, + "grad_norm": 0.6841259002685547, + "learning_rate": 6.839590968807922e-06, + "loss": 0.7553, + "step": 13873 + }, + { + "epoch": 0.7636083438824371, + "grad_norm": 0.7305078506469727, + "learning_rate": 6.839187899423395e-06, + "loss": 0.7825, + "step": 13874 + }, + { + "epoch": 0.7636633826847927, + "grad_norm": 0.7235193252563477, + "learning_rate": 6.838784816216025e-06, + "loss": 0.7653, + "step": 13875 + }, + { + "epoch": 0.7637184214871484, + "grad_norm": 0.6468761563301086, + "learning_rate": 6.838381719188842e-06, + "loss": 0.6901, + "step": 13876 + }, + { + "epoch": 0.7637734602895041, + "grad_norm": 0.6806310415267944, + "learning_rate": 6.837978608344872e-06, + "loss": 0.6876, + "step": 13877 + }, + { + "epoch": 0.7638284990918598, + "grad_norm": 0.692081093788147, + "learning_rate": 6.837575483687147e-06, + "loss": 0.7506, + "step": 13878 + }, + { + "epoch": 0.7638835378942154, + "grad_norm": 0.6447135806083679, + "learning_rate": 6.837172345218697e-06, + "loss": 0.6841, + "step": 13879 + }, + { + "epoch": 0.7639385766965711, + "grad_norm": 0.7352014183998108, + "learning_rate": 6.8367691929425516e-06, + "loss": 0.8066, + "step": 13880 + }, + { + "epoch": 0.7639936154989267, + "grad_norm": 0.7305072546005249, + "learning_rate": 6.8363660268617405e-06, + "loss": 0.717, + "step": 13881 + }, + { + "epoch": 0.7640486543012824, + "grad_norm": 0.6580411195755005, + "learning_rate": 6.835962846979294e-06, + "loss": 0.7585, + "step": 13882 + }, + { + "epoch": 0.764103693103638, + "grad_norm": 0.7568425536155701, + "learning_rate": 6.835559653298242e-06, + "loss": 0.8273, + "step": 13883 + }, + { + "epoch": 0.7641587319059937, + "grad_norm": 0.8121107816696167, + "learning_rate": 6.835156445821616e-06, + "loss": 0.9064, + "step": 13884 + }, + { + "epoch": 0.7642137707083494, + "grad_norm": 0.6522091031074524, + "learning_rate": 6.834753224552444e-06, + "loss": 0.767, + "step": 13885 + }, + { + "epoch": 0.7642688095107051, + "grad_norm": 1.0779389142990112, + "learning_rate": 6.8343499894937574e-06, + "loss": 0.7702, + "step": 13886 + }, + { + "epoch": 0.7643238483130607, + "grad_norm": 0.6902838349342346, + "learning_rate": 6.833946740648588e-06, + "loss": 0.6529, + "step": 13887 + }, + { + "epoch": 0.7643788871154164, + "grad_norm": 0.692480742931366, + "learning_rate": 6.833543478019966e-06, + "loss": 0.7404, + "step": 13888 + }, + { + "epoch": 0.764433925917772, + "grad_norm": 0.633627712726593, + "learning_rate": 6.833140201610923e-06, + "loss": 0.711, + "step": 13889 + }, + { + "epoch": 0.7644889647201277, + "grad_norm": 0.8653294444084167, + "learning_rate": 6.832736911424487e-06, + "loss": 0.8102, + "step": 13890 + }, + { + "epoch": 0.7645440035224833, + "grad_norm": 0.7864197492599487, + "learning_rate": 6.832333607463692e-06, + "loss": 0.7064, + "step": 13891 + }, + { + "epoch": 0.764599042324839, + "grad_norm": 0.6703711748123169, + "learning_rate": 6.831930289731569e-06, + "loss": 0.7653, + "step": 13892 + }, + { + "epoch": 0.7646540811271947, + "grad_norm": 0.7420178651809692, + "learning_rate": 6.831526958231147e-06, + "loss": 0.8137, + "step": 13893 + }, + { + "epoch": 0.7647091199295504, + "grad_norm": 0.7372543215751648, + "learning_rate": 6.831123612965459e-06, + "loss": 0.6871, + "step": 13894 + }, + { + "epoch": 0.764764158731906, + "grad_norm": 0.77486652135849, + "learning_rate": 6.830720253937536e-06, + "loss": 0.727, + "step": 13895 + }, + { + "epoch": 0.7648191975342616, + "grad_norm": 0.7087406516075134, + "learning_rate": 6.83031688115041e-06, + "loss": 0.7743, + "step": 13896 + }, + { + "epoch": 0.7648742363366173, + "grad_norm": 0.8415336608886719, + "learning_rate": 6.829913494607112e-06, + "loss": 0.774, + "step": 13897 + }, + { + "epoch": 0.764929275138973, + "grad_norm": 0.7736749053001404, + "learning_rate": 6.829510094310674e-06, + "loss": 0.7541, + "step": 13898 + }, + { + "epoch": 0.7649843139413286, + "grad_norm": 0.6749987602233887, + "learning_rate": 6.829106680264128e-06, + "loss": 0.7139, + "step": 13899 + }, + { + "epoch": 0.7650393527436843, + "grad_norm": 0.7079635262489319, + "learning_rate": 6.8287032524705055e-06, + "loss": 0.75, + "step": 13900 + }, + { + "epoch": 0.76509439154604, + "grad_norm": 0.6906388401985168, + "learning_rate": 6.828299810932839e-06, + "loss": 0.6895, + "step": 13901 + }, + { + "epoch": 0.7651494303483957, + "grad_norm": 0.7045881152153015, + "learning_rate": 6.82789635565416e-06, + "loss": 0.8728, + "step": 13902 + }, + { + "epoch": 0.7652044691507512, + "grad_norm": 0.6836426258087158, + "learning_rate": 6.827492886637501e-06, + "loss": 0.7315, + "step": 13903 + }, + { + "epoch": 0.7652595079531069, + "grad_norm": 0.6467520594596863, + "learning_rate": 6.827089403885896e-06, + "loss": 0.7556, + "step": 13904 + }, + { + "epoch": 0.7653145467554626, + "grad_norm": 0.7118285894393921, + "learning_rate": 6.826685907402376e-06, + "loss": 0.8686, + "step": 13905 + }, + { + "epoch": 0.7653695855578183, + "grad_norm": 0.6093236207962036, + "learning_rate": 6.826282397189974e-06, + "loss": 0.7066, + "step": 13906 + }, + { + "epoch": 0.7654246243601739, + "grad_norm": 0.6839649677276611, + "learning_rate": 6.825878873251721e-06, + "loss": 0.7025, + "step": 13907 + }, + { + "epoch": 0.7654796631625296, + "grad_norm": 0.7582715153694153, + "learning_rate": 6.825475335590652e-06, + "loss": 0.7301, + "step": 13908 + }, + { + "epoch": 0.7655347019648853, + "grad_norm": 0.6580978631973267, + "learning_rate": 6.8250717842098e-06, + "loss": 0.6771, + "step": 13909 + }, + { + "epoch": 0.765589740767241, + "grad_norm": 0.6754937171936035, + "learning_rate": 6.824668219112195e-06, + "loss": 0.7446, + "step": 13910 + }, + { + "epoch": 0.7656447795695965, + "grad_norm": 0.7541018724441528, + "learning_rate": 6.8242646403008725e-06, + "loss": 0.802, + "step": 13911 + }, + { + "epoch": 0.7656998183719522, + "grad_norm": 0.6714808344841003, + "learning_rate": 6.823861047778866e-06, + "loss": 0.7334, + "step": 13912 + }, + { + "epoch": 0.7657548571743079, + "grad_norm": 0.6972425580024719, + "learning_rate": 6.823457441549209e-06, + "loss": 0.7859, + "step": 13913 + }, + { + "epoch": 0.7658098959766636, + "grad_norm": 0.6660878658294678, + "learning_rate": 6.823053821614931e-06, + "loss": 0.6594, + "step": 13914 + }, + { + "epoch": 0.7658649347790192, + "grad_norm": 0.7392181158065796, + "learning_rate": 6.82265018797907e-06, + "loss": 0.6667, + "step": 13915 + }, + { + "epoch": 0.7659199735813749, + "grad_norm": 0.7601449489593506, + "learning_rate": 6.822246540644659e-06, + "loss": 0.7349, + "step": 13916 + }, + { + "epoch": 0.7659750123837306, + "grad_norm": 0.6648421287536621, + "learning_rate": 6.821842879614731e-06, + "loss": 0.7597, + "step": 13917 + }, + { + "epoch": 0.7660300511860861, + "grad_norm": 0.6369950175285339, + "learning_rate": 6.821439204892317e-06, + "loss": 0.7452, + "step": 13918 + }, + { + "epoch": 0.7660850899884418, + "grad_norm": 0.747653603553772, + "learning_rate": 6.821035516480457e-06, + "loss": 0.693, + "step": 13919 + }, + { + "epoch": 0.7661401287907975, + "grad_norm": 0.6450137495994568, + "learning_rate": 6.8206318143821795e-06, + "loss": 0.6492, + "step": 13920 + }, + { + "epoch": 0.7661951675931532, + "grad_norm": 0.707801878452301, + "learning_rate": 6.8202280986005205e-06, + "loss": 0.7284, + "step": 13921 + }, + { + "epoch": 0.7662502063955088, + "grad_norm": 0.7191962003707886, + "learning_rate": 6.8198243691385146e-06, + "loss": 0.7714, + "step": 13922 + }, + { + "epoch": 0.7663052451978645, + "grad_norm": 0.7477172613143921, + "learning_rate": 6.819420625999196e-06, + "loss": 0.7076, + "step": 13923 + }, + { + "epoch": 0.7663602840002202, + "grad_norm": 0.6221175193786621, + "learning_rate": 6.819016869185599e-06, + "loss": 0.6848, + "step": 13924 + }, + { + "epoch": 0.7664153228025758, + "grad_norm": 0.7840436697006226, + "learning_rate": 6.818613098700758e-06, + "loss": 0.7028, + "step": 13925 + }, + { + "epoch": 0.7664703616049314, + "grad_norm": 0.7147907018661499, + "learning_rate": 6.818209314547707e-06, + "loss": 0.7242, + "step": 13926 + }, + { + "epoch": 0.7665254004072871, + "grad_norm": 0.6627985835075378, + "learning_rate": 6.817805516729482e-06, + "loss": 0.7177, + "step": 13927 + }, + { + "epoch": 0.7665804392096428, + "grad_norm": 0.8019070625305176, + "learning_rate": 6.817401705249118e-06, + "loss": 0.6594, + "step": 13928 + }, + { + "epoch": 0.7666354780119985, + "grad_norm": 0.7127207517623901, + "learning_rate": 6.816997880109649e-06, + "loss": 0.8282, + "step": 13929 + }, + { + "epoch": 0.7666905168143541, + "grad_norm": 0.7335825562477112, + "learning_rate": 6.816594041314111e-06, + "loss": 0.7593, + "step": 13930 + }, + { + "epoch": 0.7667455556167098, + "grad_norm": 0.6878668069839478, + "learning_rate": 6.816190188865538e-06, + "loss": 0.7898, + "step": 13931 + }, + { + "epoch": 0.7668005944190655, + "grad_norm": 0.6441968679428101, + "learning_rate": 6.815786322766965e-06, + "loss": 0.6795, + "step": 13932 + }, + { + "epoch": 0.7668556332214211, + "grad_norm": 0.6503410339355469, + "learning_rate": 6.815382443021429e-06, + "loss": 0.753, + "step": 13933 + }, + { + "epoch": 0.7669106720237767, + "grad_norm": 0.6734908223152161, + "learning_rate": 6.8149785496319645e-06, + "loss": 0.7145, + "step": 13934 + }, + { + "epoch": 0.7669657108261324, + "grad_norm": 0.8363823890686035, + "learning_rate": 6.814574642601606e-06, + "loss": 0.8499, + "step": 13935 + }, + { + "epoch": 0.7670207496284881, + "grad_norm": 0.6986021995544434, + "learning_rate": 6.81417072193339e-06, + "loss": 0.7101, + "step": 13936 + }, + { + "epoch": 0.7670757884308438, + "grad_norm": 0.9656592011451721, + "learning_rate": 6.813766787630354e-06, + "loss": 0.7841, + "step": 13937 + }, + { + "epoch": 0.7671308272331994, + "grad_norm": 0.6830777525901794, + "learning_rate": 6.813362839695532e-06, + "loss": 0.7443, + "step": 13938 + }, + { + "epoch": 0.767185866035555, + "grad_norm": 0.6358513236045837, + "learning_rate": 6.812958878131959e-06, + "loss": 0.7017, + "step": 13939 + }, + { + "epoch": 0.7672409048379107, + "grad_norm": 0.9075862169265747, + "learning_rate": 6.812554902942673e-06, + "loss": 0.6991, + "step": 13940 + }, + { + "epoch": 0.7672959436402664, + "grad_norm": 0.7004347443580627, + "learning_rate": 6.812150914130709e-06, + "loss": 0.6519, + "step": 13941 + }, + { + "epoch": 0.767350982442622, + "grad_norm": 0.6648300886154175, + "learning_rate": 6.811746911699105e-06, + "loss": 0.7044, + "step": 13942 + }, + { + "epoch": 0.7674060212449777, + "grad_norm": 0.7050208449363708, + "learning_rate": 6.811342895650896e-06, + "loss": 0.78, + "step": 13943 + }, + { + "epoch": 0.7674610600473334, + "grad_norm": 0.6387132406234741, + "learning_rate": 6.810938865989119e-06, + "loss": 0.6062, + "step": 13944 + }, + { + "epoch": 0.7675160988496891, + "grad_norm": 0.6441114544868469, + "learning_rate": 6.81053482271681e-06, + "loss": 0.7252, + "step": 13945 + }, + { + "epoch": 0.7675711376520447, + "grad_norm": 0.7309751510620117, + "learning_rate": 6.810130765837006e-06, + "loss": 0.6407, + "step": 13946 + }, + { + "epoch": 0.7676261764544003, + "grad_norm": 0.7132161259651184, + "learning_rate": 6.809726695352742e-06, + "loss": 0.8341, + "step": 13947 + }, + { + "epoch": 0.767681215256756, + "grad_norm": 0.7214738726615906, + "learning_rate": 6.809322611267058e-06, + "loss": 0.8357, + "step": 13948 + }, + { + "epoch": 0.7677362540591117, + "grad_norm": 0.6410175561904907, + "learning_rate": 6.80891851358299e-06, + "loss": 0.6718, + "step": 13949 + }, + { + "epoch": 0.7677912928614673, + "grad_norm": 0.8888845443725586, + "learning_rate": 6.8085144023035745e-06, + "loss": 0.7823, + "step": 13950 + }, + { + "epoch": 0.767846331663823, + "grad_norm": 0.7327878475189209, + "learning_rate": 6.808110277431848e-06, + "loss": 0.7083, + "step": 13951 + }, + { + "epoch": 0.7679013704661787, + "grad_norm": 0.6871985793113708, + "learning_rate": 6.807706138970849e-06, + "loss": 0.7808, + "step": 13952 + }, + { + "epoch": 0.7679564092685344, + "grad_norm": 0.6939501762390137, + "learning_rate": 6.8073019869236134e-06, + "loss": 0.693, + "step": 13953 + }, + { + "epoch": 0.76801144807089, + "grad_norm": 0.7377064824104309, + "learning_rate": 6.8068978212931814e-06, + "loss": 0.9322, + "step": 13954 + }, + { + "epoch": 0.7680664868732456, + "grad_norm": 0.8165044188499451, + "learning_rate": 6.80649364208259e-06, + "loss": 0.6846, + "step": 13955 + }, + { + "epoch": 0.7681215256756013, + "grad_norm": 0.6774152517318726, + "learning_rate": 6.806089449294875e-06, + "loss": 0.8503, + "step": 13956 + }, + { + "epoch": 0.768176564477957, + "grad_norm": 0.7773441076278687, + "learning_rate": 6.805685242933074e-06, + "loss": 0.8775, + "step": 13957 + }, + { + "epoch": 0.7682316032803126, + "grad_norm": 0.6710473895072937, + "learning_rate": 6.805281023000227e-06, + "loss": 0.7831, + "step": 13958 + }, + { + "epoch": 0.7682866420826683, + "grad_norm": 0.6163424849510193, + "learning_rate": 6.80487678949937e-06, + "loss": 0.7309, + "step": 13959 + }, + { + "epoch": 0.768341680885024, + "grad_norm": 0.6851963400840759, + "learning_rate": 6.804472542433543e-06, + "loss": 0.6556, + "step": 13960 + }, + { + "epoch": 0.7683967196873795, + "grad_norm": 0.6881004571914673, + "learning_rate": 6.804068281805784e-06, + "loss": 0.7115, + "step": 13961 + }, + { + "epoch": 0.7684517584897352, + "grad_norm": 0.7372351884841919, + "learning_rate": 6.8036640076191304e-06, + "loss": 0.7869, + "step": 13962 + }, + { + "epoch": 0.7685067972920909, + "grad_norm": 0.7900989055633545, + "learning_rate": 6.8032597198766205e-06, + "loss": 0.7419, + "step": 13963 + }, + { + "epoch": 0.7685618360944466, + "grad_norm": 0.7245132327079773, + "learning_rate": 6.802855418581294e-06, + "loss": 0.8175, + "step": 13964 + }, + { + "epoch": 0.7686168748968022, + "grad_norm": 0.6681550741195679, + "learning_rate": 6.802451103736188e-06, + "loss": 0.773, + "step": 13965 + }, + { + "epoch": 0.7686719136991579, + "grad_norm": 0.6316970586776733, + "learning_rate": 6.802046775344343e-06, + "loss": 0.6597, + "step": 13966 + }, + { + "epoch": 0.7687269525015136, + "grad_norm": 0.7201604843139648, + "learning_rate": 6.801642433408796e-06, + "loss": 0.7205, + "step": 13967 + }, + { + "epoch": 0.7687819913038693, + "grad_norm": 0.6226171851158142, + "learning_rate": 6.801238077932587e-06, + "loss": 0.7271, + "step": 13968 + }, + { + "epoch": 0.7688370301062248, + "grad_norm": 0.833369255065918, + "learning_rate": 6.800833708918755e-06, + "loss": 0.7731, + "step": 13969 + }, + { + "epoch": 0.7688920689085805, + "grad_norm": 0.7280329465866089, + "learning_rate": 6.800429326370339e-06, + "loss": 0.7833, + "step": 13970 + }, + { + "epoch": 0.7689471077109362, + "grad_norm": 0.7581672072410583, + "learning_rate": 6.800024930290376e-06, + "loss": 0.8008, + "step": 13971 + }, + { + "epoch": 0.7690021465132919, + "grad_norm": 0.7931516170501709, + "learning_rate": 6.79962052068191e-06, + "loss": 0.8884, + "step": 13972 + }, + { + "epoch": 0.7690571853156475, + "grad_norm": 0.8455879092216492, + "learning_rate": 6.799216097547977e-06, + "loss": 0.8109, + "step": 13973 + }, + { + "epoch": 0.7691122241180032, + "grad_norm": 0.687336266040802, + "learning_rate": 6.798811660891618e-06, + "loss": 0.783, + "step": 13974 + }, + { + "epoch": 0.7691672629203589, + "grad_norm": 0.7661089897155762, + "learning_rate": 6.7984072107158696e-06, + "loss": 0.8448, + "step": 13975 + }, + { + "epoch": 0.7692223017227146, + "grad_norm": 0.6965043544769287, + "learning_rate": 6.798002747023776e-06, + "loss": 0.7421, + "step": 13976 + }, + { + "epoch": 0.7692773405250701, + "grad_norm": 0.7373656630516052, + "learning_rate": 6.797598269818375e-06, + "loss": 0.7093, + "step": 13977 + }, + { + "epoch": 0.7693323793274258, + "grad_norm": 0.6387331485748291, + "learning_rate": 6.7971937791027064e-06, + "loss": 0.7811, + "step": 13978 + }, + { + "epoch": 0.7693874181297815, + "grad_norm": 0.7566075325012207, + "learning_rate": 6.796789274879811e-06, + "loss": 0.8245, + "step": 13979 + }, + { + "epoch": 0.7694424569321372, + "grad_norm": 0.7035738229751587, + "learning_rate": 6.796384757152729e-06, + "loss": 0.7674, + "step": 13980 + }, + { + "epoch": 0.7694974957344928, + "grad_norm": 0.8265605568885803, + "learning_rate": 6.795980225924499e-06, + "loss": 0.7755, + "step": 13981 + }, + { + "epoch": 0.7695525345368485, + "grad_norm": 0.709454357624054, + "learning_rate": 6.7955756811981625e-06, + "loss": 0.8651, + "step": 13982 + }, + { + "epoch": 0.7696075733392042, + "grad_norm": 0.7075764536857605, + "learning_rate": 6.795171122976758e-06, + "loss": 0.7371, + "step": 13983 + }, + { + "epoch": 0.7696626121415598, + "grad_norm": 0.7027561664581299, + "learning_rate": 6.79476655126333e-06, + "loss": 0.7763, + "step": 13984 + }, + { + "epoch": 0.7697176509439154, + "grad_norm": 0.7922375202178955, + "learning_rate": 6.794361966060916e-06, + "loss": 0.7677, + "step": 13985 + }, + { + "epoch": 0.7697726897462711, + "grad_norm": 0.7185537219047546, + "learning_rate": 6.793957367372559e-06, + "loss": 0.7229, + "step": 13986 + }, + { + "epoch": 0.7698277285486268, + "grad_norm": 0.7173545956611633, + "learning_rate": 6.793552755201297e-06, + "loss": 0.7508, + "step": 13987 + }, + { + "epoch": 0.7698827673509825, + "grad_norm": 0.7743139863014221, + "learning_rate": 6.793148129550175e-06, + "loss": 0.7305, + "step": 13988 + }, + { + "epoch": 0.7699378061533381, + "grad_norm": 0.7992164492607117, + "learning_rate": 6.792743490422229e-06, + "loss": 0.7212, + "step": 13989 + }, + { + "epoch": 0.7699928449556938, + "grad_norm": 0.7437503337860107, + "learning_rate": 6.792338837820504e-06, + "loss": 0.6396, + "step": 13990 + }, + { + "epoch": 0.7700478837580494, + "grad_norm": 0.6908634305000305, + "learning_rate": 6.79193417174804e-06, + "loss": 0.7279, + "step": 13991 + }, + { + "epoch": 0.7701029225604051, + "grad_norm": 0.6894391775131226, + "learning_rate": 6.7915294922078805e-06, + "loss": 0.7615, + "step": 13992 + }, + { + "epoch": 0.7701579613627607, + "grad_norm": 0.7162172794342041, + "learning_rate": 6.791124799203062e-06, + "loss": 0.7404, + "step": 13993 + }, + { + "epoch": 0.7702130001651164, + "grad_norm": 0.6469258069992065, + "learning_rate": 6.79072009273663e-06, + "loss": 0.7035, + "step": 13994 + }, + { + "epoch": 0.7702680389674721, + "grad_norm": 0.6456457376480103, + "learning_rate": 6.790315372811625e-06, + "loss": 0.708, + "step": 13995 + }, + { + "epoch": 0.7703230777698278, + "grad_norm": 0.7880644798278809, + "learning_rate": 6.789910639431089e-06, + "loss": 0.7723, + "step": 13996 + }, + { + "epoch": 0.7703781165721834, + "grad_norm": 0.7847834229469299, + "learning_rate": 6.789505892598063e-06, + "loss": 0.8585, + "step": 13997 + }, + { + "epoch": 0.770433155374539, + "grad_norm": 0.6909215450286865, + "learning_rate": 6.789101132315591e-06, + "loss": 0.7107, + "step": 13998 + }, + { + "epoch": 0.7704881941768947, + "grad_norm": 0.7883939146995544, + "learning_rate": 6.788696358586713e-06, + "loss": 0.7575, + "step": 13999 + }, + { + "epoch": 0.7705432329792504, + "grad_norm": 0.6629998087882996, + "learning_rate": 6.788291571414472e-06, + "loss": 0.7273, + "step": 14000 + }, + { + "epoch": 0.770598271781606, + "grad_norm": 0.7548647522926331, + "learning_rate": 6.7878867708019106e-06, + "loss": 0.8214, + "step": 14001 + }, + { + "epoch": 0.7706533105839617, + "grad_norm": 0.6721330881118774, + "learning_rate": 6.78748195675207e-06, + "loss": 0.7153, + "step": 14002 + }, + { + "epoch": 0.7707083493863174, + "grad_norm": 0.6921262145042419, + "learning_rate": 6.787077129267994e-06, + "loss": 0.7099, + "step": 14003 + }, + { + "epoch": 0.770763388188673, + "grad_norm": 0.956937849521637, + "learning_rate": 6.786672288352725e-06, + "loss": 0.6765, + "step": 14004 + }, + { + "epoch": 0.7708184269910286, + "grad_norm": 0.7265778183937073, + "learning_rate": 6.786267434009306e-06, + "loss": 0.7653, + "step": 14005 + }, + { + "epoch": 0.7708734657933843, + "grad_norm": 0.7429845929145813, + "learning_rate": 6.785862566240778e-06, + "loss": 0.8064, + "step": 14006 + }, + { + "epoch": 0.77092850459574, + "grad_norm": 0.7437632083892822, + "learning_rate": 6.785457685050184e-06, + "loss": 0.7138, + "step": 14007 + }, + { + "epoch": 0.7709835433980956, + "grad_norm": 0.7218232750892639, + "learning_rate": 6.7850527904405695e-06, + "loss": 0.7785, + "step": 14008 + }, + { + "epoch": 0.7710385822004513, + "grad_norm": 0.7131973505020142, + "learning_rate": 6.784647882414977e-06, + "loss": 0.7651, + "step": 14009 + }, + { + "epoch": 0.771093621002807, + "grad_norm": 0.739919126033783, + "learning_rate": 6.784242960976447e-06, + "loss": 0.7993, + "step": 14010 + }, + { + "epoch": 0.7711486598051627, + "grad_norm": 0.6655608415603638, + "learning_rate": 6.783838026128025e-06, + "loss": 0.7394, + "step": 14011 + }, + { + "epoch": 0.7712036986075183, + "grad_norm": 0.9327310919761658, + "learning_rate": 6.783433077872753e-06, + "loss": 0.8737, + "step": 14012 + }, + { + "epoch": 0.7712587374098739, + "grad_norm": 0.5928294062614441, + "learning_rate": 6.783028116213677e-06, + "loss": 0.5819, + "step": 14013 + }, + { + "epoch": 0.7713137762122296, + "grad_norm": 0.6752136945724487, + "learning_rate": 6.782623141153838e-06, + "loss": 0.8021, + "step": 14014 + }, + { + "epoch": 0.7713688150145853, + "grad_norm": 0.6452222466468811, + "learning_rate": 6.78221815269628e-06, + "loss": 0.7806, + "step": 14015 + }, + { + "epoch": 0.7714238538169409, + "grad_norm": 0.7725237607955933, + "learning_rate": 6.78181315084405e-06, + "loss": 0.7679, + "step": 14016 + }, + { + "epoch": 0.7714788926192966, + "grad_norm": 0.6594743728637695, + "learning_rate": 6.781408135600187e-06, + "loss": 0.7254, + "step": 14017 + }, + { + "epoch": 0.7715339314216523, + "grad_norm": 0.7008917927742004, + "learning_rate": 6.7810031069677385e-06, + "loss": 0.705, + "step": 14018 + }, + { + "epoch": 0.771588970224008, + "grad_norm": 0.9435684084892273, + "learning_rate": 6.780598064949746e-06, + "loss": 0.7787, + "step": 14019 + }, + { + "epoch": 0.7716440090263635, + "grad_norm": 0.6615981459617615, + "learning_rate": 6.780193009549256e-06, + "loss": 0.7592, + "step": 14020 + }, + { + "epoch": 0.7716990478287192, + "grad_norm": 0.7042600512504578, + "learning_rate": 6.7797879407693115e-06, + "loss": 0.719, + "step": 14021 + }, + { + "epoch": 0.7717540866310749, + "grad_norm": 0.7135425209999084, + "learning_rate": 6.779382858612957e-06, + "loss": 0.739, + "step": 14022 + }, + { + "epoch": 0.7718091254334306, + "grad_norm": 0.6546016931533813, + "learning_rate": 6.778977763083238e-06, + "loss": 0.7039, + "step": 14023 + }, + { + "epoch": 0.7718641642357862, + "grad_norm": 0.8549250960350037, + "learning_rate": 6.778572654183198e-06, + "loss": 0.8384, + "step": 14024 + }, + { + "epoch": 0.7719192030381419, + "grad_norm": 0.7008731365203857, + "learning_rate": 6.778167531915882e-06, + "loss": 0.776, + "step": 14025 + }, + { + "epoch": 0.7719742418404976, + "grad_norm": 0.7047393321990967, + "learning_rate": 6.7777623962843355e-06, + "loss": 0.819, + "step": 14026 + }, + { + "epoch": 0.7720292806428533, + "grad_norm": 0.7015580534934998, + "learning_rate": 6.777357247291601e-06, + "loss": 0.8339, + "step": 14027 + }, + { + "epoch": 0.7720843194452088, + "grad_norm": 0.7008551955223083, + "learning_rate": 6.776952084940727e-06, + "loss": 0.783, + "step": 14028 + }, + { + "epoch": 0.7721393582475645, + "grad_norm": 1.0310637950897217, + "learning_rate": 6.776546909234757e-06, + "loss": 0.7447, + "step": 14029 + }, + { + "epoch": 0.7721943970499202, + "grad_norm": 0.6264338493347168, + "learning_rate": 6.776141720176734e-06, + "loss": 0.5542, + "step": 14030 + }, + { + "epoch": 0.7722494358522759, + "grad_norm": 0.6249508261680603, + "learning_rate": 6.775736517769707e-06, + "loss": 0.6514, + "step": 14031 + }, + { + "epoch": 0.7723044746546315, + "grad_norm": 0.6741732954978943, + "learning_rate": 6.775331302016719e-06, + "loss": 0.6967, + "step": 14032 + }, + { + "epoch": 0.7723595134569872, + "grad_norm": 0.7342913746833801, + "learning_rate": 6.774926072920815e-06, + "loss": 0.8279, + "step": 14033 + }, + { + "epoch": 0.7724145522593429, + "grad_norm": 0.7702916264533997, + "learning_rate": 6.774520830485044e-06, + "loss": 0.8539, + "step": 14034 + }, + { + "epoch": 0.7724695910616985, + "grad_norm": 0.7873550057411194, + "learning_rate": 6.774115574712448e-06, + "loss": 0.6999, + "step": 14035 + }, + { + "epoch": 0.7725246298640541, + "grad_norm": 0.6832353472709656, + "learning_rate": 6.773710305606074e-06, + "loss": 0.7246, + "step": 14036 + }, + { + "epoch": 0.7725796686664098, + "grad_norm": 0.7547367215156555, + "learning_rate": 6.773305023168969e-06, + "loss": 0.7357, + "step": 14037 + }, + { + "epoch": 0.7726347074687655, + "grad_norm": 0.7146826386451721, + "learning_rate": 6.772899727404178e-06, + "loss": 0.6742, + "step": 14038 + }, + { + "epoch": 0.7726897462711212, + "grad_norm": 0.7623558640480042, + "learning_rate": 6.772494418314748e-06, + "loss": 0.7729, + "step": 14039 + }, + { + "epoch": 0.7727447850734768, + "grad_norm": 0.637706458568573, + "learning_rate": 6.772089095903723e-06, + "loss": 0.6662, + "step": 14040 + }, + { + "epoch": 0.7727998238758325, + "grad_norm": 0.7293589115142822, + "learning_rate": 6.771683760174151e-06, + "loss": 0.7899, + "step": 14041 + }, + { + "epoch": 0.7728548626781881, + "grad_norm": 0.7191390991210938, + "learning_rate": 6.771278411129079e-06, + "loss": 0.6912, + "step": 14042 + }, + { + "epoch": 0.7729099014805438, + "grad_norm": 0.8264575004577637, + "learning_rate": 6.770873048771552e-06, + "loss": 0.7027, + "step": 14043 + }, + { + "epoch": 0.7729649402828994, + "grad_norm": 0.7490931749343872, + "learning_rate": 6.770467673104617e-06, + "loss": 0.6917, + "step": 14044 + }, + { + "epoch": 0.7730199790852551, + "grad_norm": 0.6901552081108093, + "learning_rate": 6.77006228413132e-06, + "loss": 0.8097, + "step": 14045 + }, + { + "epoch": 0.7730750178876108, + "grad_norm": 0.6340280175209045, + "learning_rate": 6.76965688185471e-06, + "loss": 0.6309, + "step": 14046 + }, + { + "epoch": 0.7731300566899664, + "grad_norm": 0.6807279586791992, + "learning_rate": 6.7692514662778315e-06, + "loss": 0.7744, + "step": 14047 + }, + { + "epoch": 0.7731850954923221, + "grad_norm": 1.2796865701675415, + "learning_rate": 6.7688460374037335e-06, + "loss": 0.7499, + "step": 14048 + }, + { + "epoch": 0.7732401342946778, + "grad_norm": 0.7059674263000488, + "learning_rate": 6.768440595235463e-06, + "loss": 0.8705, + "step": 14049 + }, + { + "epoch": 0.7732951730970334, + "grad_norm": 0.7626641392707825, + "learning_rate": 6.768035139776066e-06, + "loss": 0.8448, + "step": 14050 + }, + { + "epoch": 0.773350211899389, + "grad_norm": 0.6590229868888855, + "learning_rate": 6.767629671028588e-06, + "loss": 0.6796, + "step": 14051 + }, + { + "epoch": 0.7734052507017447, + "grad_norm": 0.6702030301094055, + "learning_rate": 6.767224188996081e-06, + "loss": 0.7087, + "step": 14052 + }, + { + "epoch": 0.7734602895041004, + "grad_norm": 0.670612096786499, + "learning_rate": 6.76681869368159e-06, + "loss": 0.7203, + "step": 14053 + }, + { + "epoch": 0.7735153283064561, + "grad_norm": 0.6892215013504028, + "learning_rate": 6.766413185088161e-06, + "loss": 0.6891, + "step": 14054 + }, + { + "epoch": 0.7735703671088117, + "grad_norm": 0.8354474902153015, + "learning_rate": 6.766007663218843e-06, + "loss": 0.7378, + "step": 14055 + }, + { + "epoch": 0.7736254059111674, + "grad_norm": 0.7633876204490662, + "learning_rate": 6.765602128076686e-06, + "loss": 0.6916, + "step": 14056 + }, + { + "epoch": 0.773680444713523, + "grad_norm": 0.7249060869216919, + "learning_rate": 6.765196579664736e-06, + "loss": 0.791, + "step": 14057 + }, + { + "epoch": 0.7737354835158787, + "grad_norm": 0.7033042311668396, + "learning_rate": 6.7647910179860395e-06, + "loss": 0.6799, + "step": 14058 + }, + { + "epoch": 0.7737905223182343, + "grad_norm": 0.7087684273719788, + "learning_rate": 6.7643854430436466e-06, + "loss": 0.6389, + "step": 14059 + }, + { + "epoch": 0.77384556112059, + "grad_norm": 0.6433978080749512, + "learning_rate": 6.763979854840606e-06, + "loss": 0.7214, + "step": 14060 + }, + { + "epoch": 0.7739005999229457, + "grad_norm": 0.7777101993560791, + "learning_rate": 6.763574253379964e-06, + "loss": 0.7458, + "step": 14061 + }, + { + "epoch": 0.7739556387253014, + "grad_norm": 0.7065346240997314, + "learning_rate": 6.763168638664771e-06, + "loss": 0.7663, + "step": 14062 + }, + { + "epoch": 0.774010677527657, + "grad_norm": 0.7136278748512268, + "learning_rate": 6.762763010698074e-06, + "loss": 0.667, + "step": 14063 + }, + { + "epoch": 0.7740657163300126, + "grad_norm": 0.6670508980751038, + "learning_rate": 6.762357369482921e-06, + "loss": 0.7462, + "step": 14064 + }, + { + "epoch": 0.7741207551323683, + "grad_norm": 0.6366799473762512, + "learning_rate": 6.7619517150223635e-06, + "loss": 0.7147, + "step": 14065 + }, + { + "epoch": 0.774175793934724, + "grad_norm": 0.5999431610107422, + "learning_rate": 6.761546047319447e-06, + "loss": 0.667, + "step": 14066 + }, + { + "epoch": 0.7742308327370796, + "grad_norm": 0.6751196980476379, + "learning_rate": 6.761140366377222e-06, + "loss": 0.7255, + "step": 14067 + }, + { + "epoch": 0.7742858715394353, + "grad_norm": 0.6786272525787354, + "learning_rate": 6.760734672198738e-06, + "loss": 0.7694, + "step": 14068 + }, + { + "epoch": 0.774340910341791, + "grad_norm": 0.6915947794914246, + "learning_rate": 6.760328964787044e-06, + "loss": 0.7955, + "step": 14069 + }, + { + "epoch": 0.7743959491441467, + "grad_norm": 0.7041972279548645, + "learning_rate": 6.759923244145188e-06, + "loss": 0.6542, + "step": 14070 + }, + { + "epoch": 0.7744509879465022, + "grad_norm": 0.6384761333465576, + "learning_rate": 6.759517510276221e-06, + "loss": 0.7384, + "step": 14071 + }, + { + "epoch": 0.7745060267488579, + "grad_norm": 0.7430800199508667, + "learning_rate": 6.759111763183189e-06, + "loss": 0.7587, + "step": 14072 + }, + { + "epoch": 0.7745610655512136, + "grad_norm": 0.6568213701248169, + "learning_rate": 6.758706002869146e-06, + "loss": 0.7118, + "step": 14073 + }, + { + "epoch": 0.7746161043535693, + "grad_norm": 0.8791618943214417, + "learning_rate": 6.75830022933714e-06, + "loss": 0.8049, + "step": 14074 + }, + { + "epoch": 0.7746711431559249, + "grad_norm": 0.6377304792404175, + "learning_rate": 6.75789444259022e-06, + "loss": 0.737, + "step": 14075 + }, + { + "epoch": 0.7747261819582806, + "grad_norm": 0.7253721356391907, + "learning_rate": 6.757488642631434e-06, + "loss": 0.8432, + "step": 14076 + }, + { + "epoch": 0.7747812207606363, + "grad_norm": 0.684626042842865, + "learning_rate": 6.757082829463835e-06, + "loss": 0.7845, + "step": 14077 + }, + { + "epoch": 0.774836259562992, + "grad_norm": 0.7737520337104797, + "learning_rate": 6.756677003090471e-06, + "loss": 0.8055, + "step": 14078 + }, + { + "epoch": 0.7748912983653475, + "grad_norm": 0.7294824719429016, + "learning_rate": 6.756271163514394e-06, + "loss": 0.7666, + "step": 14079 + }, + { + "epoch": 0.7749463371677032, + "grad_norm": 0.7728607654571533, + "learning_rate": 6.755865310738651e-06, + "loss": 0.7748, + "step": 14080 + }, + { + "epoch": 0.7750013759700589, + "grad_norm": 0.6738442778587341, + "learning_rate": 6.755459444766297e-06, + "loss": 0.6711, + "step": 14081 + }, + { + "epoch": 0.7750564147724146, + "grad_norm": 0.7041414976119995, + "learning_rate": 6.7550535656003794e-06, + "loss": 0.7126, + "step": 14082 + }, + { + "epoch": 0.7751114535747702, + "grad_norm": 1.0205422639846802, + "learning_rate": 6.754647673243948e-06, + "loss": 0.7394, + "step": 14083 + }, + { + "epoch": 0.7751664923771259, + "grad_norm": 0.6594380736351013, + "learning_rate": 6.754241767700054e-06, + "loss": 0.7599, + "step": 14084 + }, + { + "epoch": 0.7752215311794816, + "grad_norm": 0.6800520420074463, + "learning_rate": 6.753835848971749e-06, + "loss": 0.7579, + "step": 14085 + }, + { + "epoch": 0.7752765699818372, + "grad_norm": 0.7658087611198425, + "learning_rate": 6.7534299170620846e-06, + "loss": 0.7705, + "step": 14086 + }, + { + "epoch": 0.7753316087841928, + "grad_norm": 0.7242750525474548, + "learning_rate": 6.7530239719741084e-06, + "loss": 0.7683, + "step": 14087 + }, + { + "epoch": 0.7753866475865485, + "grad_norm": 0.6997398138046265, + "learning_rate": 6.752618013710874e-06, + "loss": 0.8023, + "step": 14088 + }, + { + "epoch": 0.7754416863889042, + "grad_norm": 0.7041590809822083, + "learning_rate": 6.752212042275431e-06, + "loss": 0.7013, + "step": 14089 + }, + { + "epoch": 0.7754967251912598, + "grad_norm": 0.7027721405029297, + "learning_rate": 6.751806057670832e-06, + "loss": 0.7678, + "step": 14090 + }, + { + "epoch": 0.7755517639936155, + "grad_norm": 0.714290201663971, + "learning_rate": 6.751400059900128e-06, + "loss": 0.6769, + "step": 14091 + }, + { + "epoch": 0.7756068027959712, + "grad_norm": 0.7385110855102539, + "learning_rate": 6.750994048966369e-06, + "loss": 0.6576, + "step": 14092 + }, + { + "epoch": 0.7756618415983269, + "grad_norm": 0.7665147185325623, + "learning_rate": 6.750588024872607e-06, + "loss": 0.8127, + "step": 14093 + }, + { + "epoch": 0.7757168804006824, + "grad_norm": 0.6774508953094482, + "learning_rate": 6.750181987621895e-06, + "loss": 0.8112, + "step": 14094 + }, + { + "epoch": 0.7757719192030381, + "grad_norm": 0.666394054889679, + "learning_rate": 6.749775937217285e-06, + "loss": 0.6444, + "step": 14095 + }, + { + "epoch": 0.7758269580053938, + "grad_norm": 0.6557022929191589, + "learning_rate": 6.749369873661825e-06, + "loss": 0.7613, + "step": 14096 + }, + { + "epoch": 0.7758819968077495, + "grad_norm": 0.7090621590614319, + "learning_rate": 6.74896379695857e-06, + "loss": 0.7229, + "step": 14097 + }, + { + "epoch": 0.7759370356101051, + "grad_norm": 0.8117626309394836, + "learning_rate": 6.7485577071105734e-06, + "loss": 0.8002, + "step": 14098 + }, + { + "epoch": 0.7759920744124608, + "grad_norm": 0.6743370294570923, + "learning_rate": 6.748151604120883e-06, + "loss": 0.7457, + "step": 14099 + }, + { + "epoch": 0.7760471132148165, + "grad_norm": 0.7637452483177185, + "learning_rate": 6.747745487992553e-06, + "loss": 0.7471, + "step": 14100 + }, + { + "epoch": 0.7761021520171721, + "grad_norm": 0.6732922196388245, + "learning_rate": 6.747339358728636e-06, + "loss": 0.7471, + "step": 14101 + }, + { + "epoch": 0.7761571908195277, + "grad_norm": 0.7510336637496948, + "learning_rate": 6.746933216332184e-06, + "loss": 0.7252, + "step": 14102 + }, + { + "epoch": 0.7762122296218834, + "grad_norm": 0.731719434261322, + "learning_rate": 6.746527060806251e-06, + "loss": 0.8706, + "step": 14103 + }, + { + "epoch": 0.7762672684242391, + "grad_norm": 0.7625692486763, + "learning_rate": 6.746120892153886e-06, + "loss": 0.7518, + "step": 14104 + }, + { + "epoch": 0.7763223072265948, + "grad_norm": 0.6809547543525696, + "learning_rate": 6.745714710378145e-06, + "loss": 0.7172, + "step": 14105 + }, + { + "epoch": 0.7763773460289504, + "grad_norm": 0.709996223449707, + "learning_rate": 6.745308515482079e-06, + "loss": 0.7925, + "step": 14106 + }, + { + "epoch": 0.776432384831306, + "grad_norm": 0.6675372123718262, + "learning_rate": 6.744902307468742e-06, + "loss": 0.8175, + "step": 14107 + }, + { + "epoch": 0.7764874236336617, + "grad_norm": 0.6978115439414978, + "learning_rate": 6.744496086341186e-06, + "loss": 0.7895, + "step": 14108 + }, + { + "epoch": 0.7765424624360174, + "grad_norm": 0.6593814492225647, + "learning_rate": 6.7440898521024634e-06, + "loss": 0.7791, + "step": 14109 + }, + { + "epoch": 0.776597501238373, + "grad_norm": 0.7169299721717834, + "learning_rate": 6.743683604755631e-06, + "loss": 0.7944, + "step": 14110 + }, + { + "epoch": 0.7766525400407287, + "grad_norm": 0.6805511713027954, + "learning_rate": 6.743277344303738e-06, + "loss": 0.7671, + "step": 14111 + }, + { + "epoch": 0.7767075788430844, + "grad_norm": 0.7300780415534973, + "learning_rate": 6.742871070749838e-06, + "loss": 0.7789, + "step": 14112 + }, + { + "epoch": 0.7767626176454401, + "grad_norm": 0.6475857496261597, + "learning_rate": 6.742464784096987e-06, + "loss": 0.6652, + "step": 14113 + }, + { + "epoch": 0.7768176564477957, + "grad_norm": 0.6941269040107727, + "learning_rate": 6.742058484348236e-06, + "loss": 0.8138, + "step": 14114 + }, + { + "epoch": 0.7768726952501513, + "grad_norm": 0.6175981760025024, + "learning_rate": 6.7416521715066405e-06, + "loss": 0.7667, + "step": 14115 + }, + { + "epoch": 0.776927734052507, + "grad_norm": 0.6499401330947876, + "learning_rate": 6.741245845575252e-06, + "loss": 0.7415, + "step": 14116 + }, + { + "epoch": 0.7769827728548627, + "grad_norm": 0.6601547598838806, + "learning_rate": 6.740839506557127e-06, + "loss": 0.732, + "step": 14117 + }, + { + "epoch": 0.7770378116572183, + "grad_norm": 0.7939042448997498, + "learning_rate": 6.740433154455319e-06, + "loss": 0.7043, + "step": 14118 + }, + { + "epoch": 0.777092850459574, + "grad_norm": 0.7381628751754761, + "learning_rate": 6.740026789272881e-06, + "loss": 0.8256, + "step": 14119 + }, + { + "epoch": 0.7771478892619297, + "grad_norm": 0.6131769418716431, + "learning_rate": 6.739620411012866e-06, + "loss": 0.726, + "step": 14120 + }, + { + "epoch": 0.7772029280642854, + "grad_norm": 1.201745867729187, + "learning_rate": 6.739214019678332e-06, + "loss": 0.7097, + "step": 14121 + }, + { + "epoch": 0.777257966866641, + "grad_norm": 0.6618456244468689, + "learning_rate": 6.7388076152723295e-06, + "loss": 0.6396, + "step": 14122 + }, + { + "epoch": 0.7773130056689966, + "grad_norm": 0.7490836977958679, + "learning_rate": 6.738401197797915e-06, + "loss": 0.6475, + "step": 14123 + }, + { + "epoch": 0.7773680444713523, + "grad_norm": 0.8125407099723816, + "learning_rate": 6.737994767258142e-06, + "loss": 0.7693, + "step": 14124 + }, + { + "epoch": 0.777423083273708, + "grad_norm": 0.7501794099807739, + "learning_rate": 6.737588323656065e-06, + "loss": 0.7333, + "step": 14125 + }, + { + "epoch": 0.7774781220760636, + "grad_norm": 1.3062889575958252, + "learning_rate": 6.73718186699474e-06, + "loss": 0.6909, + "step": 14126 + }, + { + "epoch": 0.7775331608784193, + "grad_norm": 0.6784525513648987, + "learning_rate": 6.736775397277221e-06, + "loss": 0.7256, + "step": 14127 + }, + { + "epoch": 0.777588199680775, + "grad_norm": 0.7018646597862244, + "learning_rate": 6.736368914506562e-06, + "loss": 0.7632, + "step": 14128 + }, + { + "epoch": 0.7776432384831307, + "grad_norm": 0.7596307992935181, + "learning_rate": 6.735962418685821e-06, + "loss": 0.7117, + "step": 14129 + }, + { + "epoch": 0.7776982772854862, + "grad_norm": 0.7582107186317444, + "learning_rate": 6.7355559098180504e-06, + "loss": 0.7808, + "step": 14130 + }, + { + "epoch": 0.7777533160878419, + "grad_norm": 0.6460647583007812, + "learning_rate": 6.7351493879063056e-06, + "loss": 0.675, + "step": 14131 + }, + { + "epoch": 0.7778083548901976, + "grad_norm": 0.6801304221153259, + "learning_rate": 6.7347428529536415e-06, + "loss": 0.6504, + "step": 14132 + }, + { + "epoch": 0.7778633936925532, + "grad_norm": 0.8122933506965637, + "learning_rate": 6.7343363049631176e-06, + "loss": 0.7949, + "step": 14133 + }, + { + "epoch": 0.7779184324949089, + "grad_norm": 0.6750267744064331, + "learning_rate": 6.733929743937784e-06, + "loss": 0.7689, + "step": 14134 + }, + { + "epoch": 0.7779734712972646, + "grad_norm": 0.7141891121864319, + "learning_rate": 6.7335231698807005e-06, + "loss": 0.7099, + "step": 14135 + }, + { + "epoch": 0.7780285100996203, + "grad_norm": 0.7904065251350403, + "learning_rate": 6.733116582794918e-06, + "loss": 0.8458, + "step": 14136 + }, + { + "epoch": 0.7780835489019758, + "grad_norm": 0.6905248165130615, + "learning_rate": 6.732709982683496e-06, + "loss": 0.7848, + "step": 14137 + }, + { + "epoch": 0.7781385877043315, + "grad_norm": 0.6707245707511902, + "learning_rate": 6.732303369549491e-06, + "loss": 0.8319, + "step": 14138 + }, + { + "epoch": 0.7781936265066872, + "grad_norm": 0.6611519455909729, + "learning_rate": 6.731896743395957e-06, + "loss": 0.7025, + "step": 14139 + }, + { + "epoch": 0.7782486653090429, + "grad_norm": 0.7113156914710999, + "learning_rate": 6.73149010422595e-06, + "loss": 0.8297, + "step": 14140 + }, + { + "epoch": 0.7783037041113985, + "grad_norm": 0.7279486060142517, + "learning_rate": 6.7310834520425265e-06, + "loss": 0.8134, + "step": 14141 + }, + { + "epoch": 0.7783587429137542, + "grad_norm": 0.7561796307563782, + "learning_rate": 6.730676786848744e-06, + "loss": 0.806, + "step": 14142 + }, + { + "epoch": 0.7784137817161099, + "grad_norm": 0.6724728345870972, + "learning_rate": 6.7302701086476585e-06, + "loss": 0.7782, + "step": 14143 + }, + { + "epoch": 0.7784688205184656, + "grad_norm": 0.6363211274147034, + "learning_rate": 6.729863417442325e-06, + "loss": 0.6298, + "step": 14144 + }, + { + "epoch": 0.7785238593208211, + "grad_norm": 0.6920950412750244, + "learning_rate": 6.729456713235803e-06, + "loss": 0.5804, + "step": 14145 + }, + { + "epoch": 0.7785788981231768, + "grad_norm": 0.7388806343078613, + "learning_rate": 6.729049996031145e-06, + "loss": 0.6594, + "step": 14146 + }, + { + "epoch": 0.7786339369255325, + "grad_norm": 0.7736972570419312, + "learning_rate": 6.728643265831412e-06, + "loss": 0.8244, + "step": 14147 + }, + { + "epoch": 0.7786889757278882, + "grad_norm": 0.6928302049636841, + "learning_rate": 6.728236522639658e-06, + "loss": 0.6713, + "step": 14148 + }, + { + "epoch": 0.7787440145302438, + "grad_norm": 0.8058464527130127, + "learning_rate": 6.72782976645894e-06, + "loss": 0.7647, + "step": 14149 + }, + { + "epoch": 0.7787990533325995, + "grad_norm": 0.7111127376556396, + "learning_rate": 6.727422997292317e-06, + "loss": 0.7629, + "step": 14150 + }, + { + "epoch": 0.7788540921349552, + "grad_norm": 0.9375373721122742, + "learning_rate": 6.7270162151428455e-06, + "loss": 0.8306, + "step": 14151 + }, + { + "epoch": 0.7789091309373108, + "grad_norm": 0.6894392371177673, + "learning_rate": 6.726609420013581e-06, + "loss": 0.6995, + "step": 14152 + }, + { + "epoch": 0.7789641697396664, + "grad_norm": 0.7058690786361694, + "learning_rate": 6.726202611907583e-06, + "loss": 0.844, + "step": 14153 + }, + { + "epoch": 0.7790192085420221, + "grad_norm": 0.7672932744026184, + "learning_rate": 6.725795790827909e-06, + "loss": 0.6613, + "step": 14154 + }, + { + "epoch": 0.7790742473443778, + "grad_norm": 0.8575173020362854, + "learning_rate": 6.7253889567776146e-06, + "loss": 0.6946, + "step": 14155 + }, + { + "epoch": 0.7791292861467335, + "grad_norm": 0.6832261085510254, + "learning_rate": 6.724982109759759e-06, + "loss": 0.7121, + "step": 14156 + }, + { + "epoch": 0.7791843249490891, + "grad_norm": 0.8188209533691406, + "learning_rate": 6.724575249777401e-06, + "loss": 0.6479, + "step": 14157 + }, + { + "epoch": 0.7792393637514448, + "grad_norm": 0.6514336466789246, + "learning_rate": 6.724168376833595e-06, + "loss": 0.6117, + "step": 14158 + }, + { + "epoch": 0.7792944025538004, + "grad_norm": 0.7283767461776733, + "learning_rate": 6.723761490931403e-06, + "loss": 0.6882, + "step": 14159 + }, + { + "epoch": 0.7793494413561561, + "grad_norm": 0.7681146860122681, + "learning_rate": 6.7233545920738785e-06, + "loss": 0.8028, + "step": 14160 + }, + { + "epoch": 0.7794044801585117, + "grad_norm": 0.6202995181083679, + "learning_rate": 6.722947680264084e-06, + "loss": 0.713, + "step": 14161 + }, + { + "epoch": 0.7794595189608674, + "grad_norm": 0.7137139439582825, + "learning_rate": 6.722540755505076e-06, + "loss": 0.7842, + "step": 14162 + }, + { + "epoch": 0.7795145577632231, + "grad_norm": 0.6852554678916931, + "learning_rate": 6.722133817799913e-06, + "loss": 0.7329, + "step": 14163 + }, + { + "epoch": 0.7795695965655788, + "grad_norm": 0.7520774602890015, + "learning_rate": 6.7217268671516525e-06, + "loss": 0.7498, + "step": 14164 + }, + { + "epoch": 0.7796246353679344, + "grad_norm": 0.708577573299408, + "learning_rate": 6.7213199035633525e-06, + "loss": 0.675, + "step": 14165 + }, + { + "epoch": 0.77967967417029, + "grad_norm": 0.8061410188674927, + "learning_rate": 6.7209129270380744e-06, + "loss": 0.7176, + "step": 14166 + }, + { + "epoch": 0.7797347129726457, + "grad_norm": 0.8070787787437439, + "learning_rate": 6.720505937578876e-06, + "loss": 0.8138, + "step": 14167 + }, + { + "epoch": 0.7797897517750014, + "grad_norm": 0.7127004265785217, + "learning_rate": 6.720098935188815e-06, + "loss": 0.7004, + "step": 14168 + }, + { + "epoch": 0.779844790577357, + "grad_norm": 0.7188708782196045, + "learning_rate": 6.719691919870951e-06, + "loss": 0.6996, + "step": 14169 + }, + { + "epoch": 0.7798998293797127, + "grad_norm": 0.6346360445022583, + "learning_rate": 6.719284891628342e-06, + "loss": 0.7349, + "step": 14170 + }, + { + "epoch": 0.7799548681820684, + "grad_norm": 0.6262187361717224, + "learning_rate": 6.71887785046405e-06, + "loss": 0.7279, + "step": 14171 + }, + { + "epoch": 0.7800099069844241, + "grad_norm": 0.7538053393363953, + "learning_rate": 6.718470796381129e-06, + "loss": 0.754, + "step": 14172 + }, + { + "epoch": 0.7800649457867797, + "grad_norm": 0.6569569706916809, + "learning_rate": 6.718063729382643e-06, + "loss": 0.6787, + "step": 14173 + }, + { + "epoch": 0.7801199845891353, + "grad_norm": 0.6446678042411804, + "learning_rate": 6.71765664947165e-06, + "loss": 0.6338, + "step": 14174 + }, + { + "epoch": 0.780175023391491, + "grad_norm": 0.7559269666671753, + "learning_rate": 6.7172495566512095e-06, + "loss": 0.7472, + "step": 14175 + }, + { + "epoch": 0.7802300621938466, + "grad_norm": 0.6920101642608643, + "learning_rate": 6.71684245092438e-06, + "loss": 0.7189, + "step": 14176 + }, + { + "epoch": 0.7802851009962023, + "grad_norm": 0.6513105034828186, + "learning_rate": 6.716435332294223e-06, + "loss": 0.6104, + "step": 14177 + }, + { + "epoch": 0.780340139798558, + "grad_norm": 0.7076418399810791, + "learning_rate": 6.716028200763798e-06, + "loss": 0.7974, + "step": 14178 + }, + { + "epoch": 0.7803951786009137, + "grad_norm": 0.7291662693023682, + "learning_rate": 6.715621056336164e-06, + "loss": 0.7661, + "step": 14179 + }, + { + "epoch": 0.7804502174032693, + "grad_norm": 0.682321310043335, + "learning_rate": 6.715213899014381e-06, + "loss": 0.7345, + "step": 14180 + }, + { + "epoch": 0.7805052562056249, + "grad_norm": 0.7170400619506836, + "learning_rate": 6.71480672880151e-06, + "loss": 0.6968, + "step": 14181 + }, + { + "epoch": 0.7805602950079806, + "grad_norm": 0.7504192590713501, + "learning_rate": 6.714399545700611e-06, + "loss": 0.7868, + "step": 14182 + }, + { + "epoch": 0.7806153338103363, + "grad_norm": 0.7334801554679871, + "learning_rate": 6.713992349714744e-06, + "loss": 0.8806, + "step": 14183 + }, + { + "epoch": 0.7806703726126919, + "grad_norm": 0.6495537161827087, + "learning_rate": 6.713585140846969e-06, + "loss": 0.7272, + "step": 14184 + }, + { + "epoch": 0.7807254114150476, + "grad_norm": 0.7101101279258728, + "learning_rate": 6.713177919100347e-06, + "loss": 0.8038, + "step": 14185 + }, + { + "epoch": 0.7807804502174033, + "grad_norm": 0.7013083100318909, + "learning_rate": 6.712770684477937e-06, + "loss": 0.7576, + "step": 14186 + }, + { + "epoch": 0.780835489019759, + "grad_norm": 0.7535369992256165, + "learning_rate": 6.712363436982802e-06, + "loss": 0.6537, + "step": 14187 + }, + { + "epoch": 0.7808905278221145, + "grad_norm": 0.7432667016983032, + "learning_rate": 6.711956176618001e-06, + "loss": 0.7734, + "step": 14188 + }, + { + "epoch": 0.7809455666244702, + "grad_norm": 0.718006432056427, + "learning_rate": 6.711548903386597e-06, + "loss": 0.7291, + "step": 14189 + }, + { + "epoch": 0.7810006054268259, + "grad_norm": 0.7983072400093079, + "learning_rate": 6.711141617291649e-06, + "loss": 0.8403, + "step": 14190 + }, + { + "epoch": 0.7810556442291816, + "grad_norm": 0.7017259001731873, + "learning_rate": 6.710734318336218e-06, + "loss": 0.7293, + "step": 14191 + }, + { + "epoch": 0.7811106830315372, + "grad_norm": 0.6061737537384033, + "learning_rate": 6.710327006523366e-06, + "loss": 0.6624, + "step": 14192 + }, + { + "epoch": 0.7811657218338929, + "grad_norm": 0.6876726746559143, + "learning_rate": 6.709919681856155e-06, + "loss": 0.723, + "step": 14193 + }, + { + "epoch": 0.7812207606362486, + "grad_norm": 0.6926757097244263, + "learning_rate": 6.709512344337646e-06, + "loss": 0.7392, + "step": 14194 + }, + { + "epoch": 0.7812757994386043, + "grad_norm": 0.6464381217956543, + "learning_rate": 6.7091049939708985e-06, + "loss": 0.7301, + "step": 14195 + }, + { + "epoch": 0.7813308382409598, + "grad_norm": 0.7292629480361938, + "learning_rate": 6.708697630758974e-06, + "loss": 0.7511, + "step": 14196 + }, + { + "epoch": 0.7813858770433155, + "grad_norm": 0.7483099102973938, + "learning_rate": 6.708290254704937e-06, + "loss": 0.7981, + "step": 14197 + }, + { + "epoch": 0.7814409158456712, + "grad_norm": 0.6766877770423889, + "learning_rate": 6.707882865811848e-06, + "loss": 0.7987, + "step": 14198 + }, + { + "epoch": 0.7814959546480269, + "grad_norm": 0.7340181469917297, + "learning_rate": 6.707475464082769e-06, + "loss": 0.799, + "step": 14199 + }, + { + "epoch": 0.7815509934503825, + "grad_norm": 0.6247759461402893, + "learning_rate": 6.707068049520759e-06, + "loss": 0.7299, + "step": 14200 + }, + { + "epoch": 0.7816060322527382, + "grad_norm": 0.6783067584037781, + "learning_rate": 6.706660622128885e-06, + "loss": 0.6987, + "step": 14201 + }, + { + "epoch": 0.7816610710550939, + "grad_norm": 0.7613719701766968, + "learning_rate": 6.706253181910205e-06, + "loss": 0.7894, + "step": 14202 + }, + { + "epoch": 0.7817161098574495, + "grad_norm": 0.6673761606216431, + "learning_rate": 6.705845728867784e-06, + "loss": 0.8015, + "step": 14203 + }, + { + "epoch": 0.7817711486598051, + "grad_norm": 0.6551307439804077, + "learning_rate": 6.705438263004683e-06, + "loss": 0.7057, + "step": 14204 + }, + { + "epoch": 0.7818261874621608, + "grad_norm": 0.6815405488014221, + "learning_rate": 6.705030784323965e-06, + "loss": 0.7466, + "step": 14205 + }, + { + "epoch": 0.7818812262645165, + "grad_norm": 0.6838087439537048, + "learning_rate": 6.704623292828692e-06, + "loss": 0.8226, + "step": 14206 + }, + { + "epoch": 0.7819362650668722, + "grad_norm": 0.6704637408256531, + "learning_rate": 6.704215788521925e-06, + "loss": 0.8101, + "step": 14207 + }, + { + "epoch": 0.7819913038692278, + "grad_norm": 0.6606172919273376, + "learning_rate": 6.70380827140673e-06, + "loss": 0.7824, + "step": 14208 + }, + { + "epoch": 0.7820463426715835, + "grad_norm": 0.6641090512275696, + "learning_rate": 6.703400741486166e-06, + "loss": 0.7507, + "step": 14209 + }, + { + "epoch": 0.7821013814739392, + "grad_norm": 1.6413429975509644, + "learning_rate": 6.702993198763299e-06, + "loss": 0.7793, + "step": 14210 + }, + { + "epoch": 0.7821564202762948, + "grad_norm": 0.6664854884147644, + "learning_rate": 6.7025856432411915e-06, + "loss": 0.7304, + "step": 14211 + }, + { + "epoch": 0.7822114590786504, + "grad_norm": 0.6968172192573547, + "learning_rate": 6.7021780749229075e-06, + "loss": 0.7506, + "step": 14212 + }, + { + "epoch": 0.7822664978810061, + "grad_norm": 0.6443943381309509, + "learning_rate": 6.701770493811506e-06, + "loss": 0.7511, + "step": 14213 + }, + { + "epoch": 0.7823215366833618, + "grad_norm": 0.67723548412323, + "learning_rate": 6.701362899910053e-06, + "loss": 0.6839, + "step": 14214 + }, + { + "epoch": 0.7823765754857175, + "grad_norm": 0.7601221203804016, + "learning_rate": 6.700955293221614e-06, + "loss": 0.7397, + "step": 14215 + }, + { + "epoch": 0.7824316142880731, + "grad_norm": 0.6056920289993286, + "learning_rate": 6.700547673749249e-06, + "loss": 0.7706, + "step": 14216 + }, + { + "epoch": 0.7824866530904288, + "grad_norm": 0.6421142816543579, + "learning_rate": 6.700140041496024e-06, + "loss": 0.7209, + "step": 14217 + }, + { + "epoch": 0.7825416918927844, + "grad_norm": 0.6653133034706116, + "learning_rate": 6.6997323964650005e-06, + "loss": 0.708, + "step": 14218 + }, + { + "epoch": 0.78259673069514, + "grad_norm": 0.8854939937591553, + "learning_rate": 6.699324738659243e-06, + "loss": 0.7658, + "step": 14219 + }, + { + "epoch": 0.7826517694974957, + "grad_norm": 0.7130745649337769, + "learning_rate": 6.6989170680818175e-06, + "loss": 0.7827, + "step": 14220 + }, + { + "epoch": 0.7827068082998514, + "grad_norm": 0.953117847442627, + "learning_rate": 6.698509384735783e-06, + "loss": 0.7852, + "step": 14221 + }, + { + "epoch": 0.7827618471022071, + "grad_norm": 0.655768871307373, + "learning_rate": 6.698101688624209e-06, + "loss": 0.8461, + "step": 14222 + }, + { + "epoch": 0.7828168859045627, + "grad_norm": 0.656775951385498, + "learning_rate": 6.6976939797501575e-06, + "loss": 0.7254, + "step": 14223 + }, + { + "epoch": 0.7828719247069184, + "grad_norm": 0.6901991963386536, + "learning_rate": 6.697286258116691e-06, + "loss": 0.7242, + "step": 14224 + }, + { + "epoch": 0.782926963509274, + "grad_norm": 0.8289571404457092, + "learning_rate": 6.696878523726875e-06, + "loss": 0.8578, + "step": 14225 + }, + { + "epoch": 0.7829820023116297, + "grad_norm": 0.6268846392631531, + "learning_rate": 6.696470776583775e-06, + "loss": 0.737, + "step": 14226 + }, + { + "epoch": 0.7830370411139853, + "grad_norm": 0.7026770114898682, + "learning_rate": 6.696063016690455e-06, + "loss": 0.6771, + "step": 14227 + }, + { + "epoch": 0.783092079916341, + "grad_norm": 0.7377839088439941, + "learning_rate": 6.69565524404998e-06, + "loss": 0.7174, + "step": 14228 + }, + { + "epoch": 0.7831471187186967, + "grad_norm": 0.6778523921966553, + "learning_rate": 6.695247458665414e-06, + "loss": 0.8255, + "step": 14229 + }, + { + "epoch": 0.7832021575210524, + "grad_norm": 0.7624330520629883, + "learning_rate": 6.69483966053982e-06, + "loss": 0.7495, + "step": 14230 + }, + { + "epoch": 0.783257196323408, + "grad_norm": 0.8944052457809448, + "learning_rate": 6.694431849676267e-06, + "loss": 0.868, + "step": 14231 + }, + { + "epoch": 0.7833122351257636, + "grad_norm": 0.7391701936721802, + "learning_rate": 6.694024026077816e-06, + "loss": 0.7032, + "step": 14232 + }, + { + "epoch": 0.7833672739281193, + "grad_norm": 0.7548620104789734, + "learning_rate": 6.693616189747535e-06, + "loss": 0.8272, + "step": 14233 + }, + { + "epoch": 0.783422312730475, + "grad_norm": 0.6795994639396667, + "learning_rate": 6.693208340688489e-06, + "loss": 0.703, + "step": 14234 + }, + { + "epoch": 0.7834773515328306, + "grad_norm": 0.6580816507339478, + "learning_rate": 6.69280047890374e-06, + "loss": 0.7454, + "step": 14235 + }, + { + "epoch": 0.7835323903351863, + "grad_norm": 0.7124443650245667, + "learning_rate": 6.6923926043963576e-06, + "loss": 0.6655, + "step": 14236 + }, + { + "epoch": 0.783587429137542, + "grad_norm": 0.6730241179466248, + "learning_rate": 6.691984717169404e-06, + "loss": 0.7522, + "step": 14237 + }, + { + "epoch": 0.7836424679398977, + "grad_norm": 0.8156033158302307, + "learning_rate": 6.6915768172259466e-06, + "loss": 0.8955, + "step": 14238 + }, + { + "epoch": 0.7836975067422532, + "grad_norm": 0.8041443228721619, + "learning_rate": 6.6911689045690506e-06, + "loss": 0.8019, + "step": 14239 + }, + { + "epoch": 0.7837525455446089, + "grad_norm": 0.7252053618431091, + "learning_rate": 6.690760979201782e-06, + "loss": 0.7014, + "step": 14240 + }, + { + "epoch": 0.7838075843469646, + "grad_norm": 0.6969071626663208, + "learning_rate": 6.690353041127208e-06, + "loss": 0.7304, + "step": 14241 + }, + { + "epoch": 0.7838626231493203, + "grad_norm": 0.8254885673522949, + "learning_rate": 6.6899450903483906e-06, + "loss": 0.7193, + "step": 14242 + }, + { + "epoch": 0.7839176619516759, + "grad_norm": 0.7426590323448181, + "learning_rate": 6.6895371268684e-06, + "loss": 0.697, + "step": 14243 + }, + { + "epoch": 0.7839727007540316, + "grad_norm": 0.6744338274002075, + "learning_rate": 6.6891291506903e-06, + "loss": 0.8363, + "step": 14244 + }, + { + "epoch": 0.7840277395563873, + "grad_norm": 0.6609839797019958, + "learning_rate": 6.688721161817156e-06, + "loss": 0.7756, + "step": 14245 + }, + { + "epoch": 0.784082778358743, + "grad_norm": 0.8377131223678589, + "learning_rate": 6.688313160252038e-06, + "loss": 0.8355, + "step": 14246 + }, + { + "epoch": 0.7841378171610985, + "grad_norm": 0.6922308802604675, + "learning_rate": 6.687905145998009e-06, + "loss": 0.756, + "step": 14247 + }, + { + "epoch": 0.7841928559634542, + "grad_norm": 0.7217739820480347, + "learning_rate": 6.687497119058137e-06, + "loss": 0.7309, + "step": 14248 + }, + { + "epoch": 0.7842478947658099, + "grad_norm": 0.6906038522720337, + "learning_rate": 6.687089079435488e-06, + "loss": 0.6645, + "step": 14249 + }, + { + "epoch": 0.7843029335681656, + "grad_norm": 0.6800183057785034, + "learning_rate": 6.6866810271331305e-06, + "loss": 0.6791, + "step": 14250 + }, + { + "epoch": 0.7843579723705212, + "grad_norm": 0.6835503578186035, + "learning_rate": 6.686272962154129e-06, + "loss": 0.699, + "step": 14251 + }, + { + "epoch": 0.7844130111728769, + "grad_norm": 0.6643723845481873, + "learning_rate": 6.685864884501552e-06, + "loss": 0.7808, + "step": 14252 + }, + { + "epoch": 0.7844680499752326, + "grad_norm": 0.6742954850196838, + "learning_rate": 6.685456794178464e-06, + "loss": 0.7704, + "step": 14253 + }, + { + "epoch": 0.7845230887775883, + "grad_norm": 0.6374711990356445, + "learning_rate": 6.6850486911879355e-06, + "loss": 0.7557, + "step": 14254 + }, + { + "epoch": 0.7845781275799438, + "grad_norm": 0.7354347109794617, + "learning_rate": 6.684640575533031e-06, + "loss": 0.7928, + "step": 14255 + }, + { + "epoch": 0.7846331663822995, + "grad_norm": 0.6694937348365784, + "learning_rate": 6.684232447216821e-06, + "loss": 0.7247, + "step": 14256 + }, + { + "epoch": 0.7846882051846552, + "grad_norm": 0.716623842716217, + "learning_rate": 6.683824306242368e-06, + "loss": 0.8638, + "step": 14257 + }, + { + "epoch": 0.7847432439870109, + "grad_norm": 0.667164146900177, + "learning_rate": 6.683416152612743e-06, + "loss": 0.7455, + "step": 14258 + }, + { + "epoch": 0.7847982827893665, + "grad_norm": 0.7302100658416748, + "learning_rate": 6.683007986331014e-06, + "loss": 0.707, + "step": 14259 + }, + { + "epoch": 0.7848533215917222, + "grad_norm": 0.7605045437812805, + "learning_rate": 6.682599807400246e-06, + "loss": 0.7727, + "step": 14260 + }, + { + "epoch": 0.7849083603940779, + "grad_norm": 0.6819437146186829, + "learning_rate": 6.682191615823508e-06, + "loss": 0.7538, + "step": 14261 + }, + { + "epoch": 0.7849633991964334, + "grad_norm": 0.7399439811706543, + "learning_rate": 6.6817834116038695e-06, + "loss": 0.7499, + "step": 14262 + }, + { + "epoch": 0.7850184379987891, + "grad_norm": 0.7864901423454285, + "learning_rate": 6.681375194744397e-06, + "loss": 0.7128, + "step": 14263 + }, + { + "epoch": 0.7850734768011448, + "grad_norm": 0.7308626174926758, + "learning_rate": 6.680966965248159e-06, + "loss": 0.7239, + "step": 14264 + }, + { + "epoch": 0.7851285156035005, + "grad_norm": 0.6553478837013245, + "learning_rate": 6.680558723118222e-06, + "loss": 0.6984, + "step": 14265 + }, + { + "epoch": 0.7851835544058561, + "grad_norm": 0.621415376663208, + "learning_rate": 6.680150468357656e-06, + "loss": 0.6428, + "step": 14266 + }, + { + "epoch": 0.7852385932082118, + "grad_norm": 1.0505764484405518, + "learning_rate": 6.679742200969529e-06, + "loss": 0.8073, + "step": 14267 + }, + { + "epoch": 0.7852936320105675, + "grad_norm": 0.7393355369567871, + "learning_rate": 6.67933392095691e-06, + "loss": 0.7396, + "step": 14268 + }, + { + "epoch": 0.7853486708129231, + "grad_norm": 0.7346563935279846, + "learning_rate": 6.678925628322864e-06, + "loss": 0.7398, + "step": 14269 + }, + { + "epoch": 0.7854037096152787, + "grad_norm": 0.6694674491882324, + "learning_rate": 6.678517323070465e-06, + "loss": 0.7346, + "step": 14270 + }, + { + "epoch": 0.7854587484176344, + "grad_norm": 0.6907033920288086, + "learning_rate": 6.678109005202779e-06, + "loss": 0.7617, + "step": 14271 + }, + { + "epoch": 0.7855137872199901, + "grad_norm": 0.6588131189346313, + "learning_rate": 6.677700674722873e-06, + "loss": 0.7514, + "step": 14272 + }, + { + "epoch": 0.7855688260223458, + "grad_norm": 0.6535136699676514, + "learning_rate": 6.677292331633819e-06, + "loss": 0.7154, + "step": 14273 + }, + { + "epoch": 0.7856238648247014, + "grad_norm": 0.7013682723045349, + "learning_rate": 6.676883975938685e-06, + "loss": 0.8506, + "step": 14274 + }, + { + "epoch": 0.7856789036270571, + "grad_norm": 0.7128416895866394, + "learning_rate": 6.67647560764054e-06, + "loss": 0.7669, + "step": 14275 + }, + { + "epoch": 0.7857339424294127, + "grad_norm": 0.7021318674087524, + "learning_rate": 6.676067226742453e-06, + "loss": 0.8236, + "step": 14276 + }, + { + "epoch": 0.7857889812317684, + "grad_norm": 0.7067561745643616, + "learning_rate": 6.675658833247493e-06, + "loss": 0.6848, + "step": 14277 + }, + { + "epoch": 0.785844020034124, + "grad_norm": 0.6488254070281982, + "learning_rate": 6.675250427158731e-06, + "loss": 0.7877, + "step": 14278 + }, + { + "epoch": 0.7858990588364797, + "grad_norm": 0.7153946757316589, + "learning_rate": 6.674842008479234e-06, + "loss": 0.7994, + "step": 14279 + }, + { + "epoch": 0.7859540976388354, + "grad_norm": 0.7290914058685303, + "learning_rate": 6.6744335772120735e-06, + "loss": 0.8074, + "step": 14280 + }, + { + "epoch": 0.7860091364411911, + "grad_norm": 0.726309061050415, + "learning_rate": 6.674025133360316e-06, + "loss": 0.7789, + "step": 14281 + }, + { + "epoch": 0.7860641752435467, + "grad_norm": 0.6294347047805786, + "learning_rate": 6.673616676927037e-06, + "loss": 0.6405, + "step": 14282 + }, + { + "epoch": 0.7861192140459023, + "grad_norm": 0.654400646686554, + "learning_rate": 6.673208207915302e-06, + "loss": 0.7876, + "step": 14283 + }, + { + "epoch": 0.786174252848258, + "grad_norm": 0.6729328632354736, + "learning_rate": 6.672799726328182e-06, + "loss": 0.7773, + "step": 14284 + }, + { + "epoch": 0.7862292916506137, + "grad_norm": 0.7607905268669128, + "learning_rate": 6.672391232168745e-06, + "loss": 0.8262, + "step": 14285 + }, + { + "epoch": 0.7862843304529693, + "grad_norm": 0.6475018858909607, + "learning_rate": 6.671982725440065e-06, + "loss": 0.7383, + "step": 14286 + }, + { + "epoch": 0.786339369255325, + "grad_norm": 0.8290789723396301, + "learning_rate": 6.671574206145211e-06, + "loss": 0.7968, + "step": 14287 + }, + { + "epoch": 0.7863944080576807, + "grad_norm": 0.7462177872657776, + "learning_rate": 6.671165674287252e-06, + "loss": 0.7465, + "step": 14288 + }, + { + "epoch": 0.7864494468600364, + "grad_norm": 0.7029373049736023, + "learning_rate": 6.6707571298692595e-06, + "loss": 0.7342, + "step": 14289 + }, + { + "epoch": 0.786504485662392, + "grad_norm": 0.8253761529922485, + "learning_rate": 6.670348572894303e-06, + "loss": 0.8196, + "step": 14290 + }, + { + "epoch": 0.7865595244647476, + "grad_norm": 0.7234970331192017, + "learning_rate": 6.669940003365455e-06, + "loss": 0.7966, + "step": 14291 + }, + { + "epoch": 0.7866145632671033, + "grad_norm": 0.8699348568916321, + "learning_rate": 6.6695314212857845e-06, + "loss": 0.8761, + "step": 14292 + }, + { + "epoch": 0.786669602069459, + "grad_norm": 0.6620158553123474, + "learning_rate": 6.66912282665836e-06, + "loss": 0.7534, + "step": 14293 + }, + { + "epoch": 0.7867246408718146, + "grad_norm": 0.6469776630401611, + "learning_rate": 6.668714219486259e-06, + "loss": 0.7812, + "step": 14294 + }, + { + "epoch": 0.7867796796741703, + "grad_norm": 0.6477407813072205, + "learning_rate": 6.668305599772546e-06, + "loss": 0.7144, + "step": 14295 + }, + { + "epoch": 0.786834718476526, + "grad_norm": 0.6626473665237427, + "learning_rate": 6.667896967520297e-06, + "loss": 0.7283, + "step": 14296 + }, + { + "epoch": 0.7868897572788817, + "grad_norm": 0.6214945316314697, + "learning_rate": 6.667488322732578e-06, + "loss": 0.6835, + "step": 14297 + }, + { + "epoch": 0.7869447960812372, + "grad_norm": 0.6199555397033691, + "learning_rate": 6.667079665412465e-06, + "loss": 0.706, + "step": 14298 + }, + { + "epoch": 0.7869998348835929, + "grad_norm": 0.8127612471580505, + "learning_rate": 6.666670995563027e-06, + "loss": 0.7099, + "step": 14299 + }, + { + "epoch": 0.7870548736859486, + "grad_norm": 0.6241362690925598, + "learning_rate": 6.6662623131873374e-06, + "loss": 0.7076, + "step": 14300 + }, + { + "epoch": 0.7871099124883043, + "grad_norm": 0.7260692715644836, + "learning_rate": 6.665853618288465e-06, + "loss": 0.7842, + "step": 14301 + }, + { + "epoch": 0.7871649512906599, + "grad_norm": 0.6644107103347778, + "learning_rate": 6.665444910869482e-06, + "loss": 0.6515, + "step": 14302 + }, + { + "epoch": 0.7872199900930156, + "grad_norm": 0.6629641056060791, + "learning_rate": 6.6650361909334616e-06, + "loss": 0.7062, + "step": 14303 + }, + { + "epoch": 0.7872750288953713, + "grad_norm": 0.6616516709327698, + "learning_rate": 6.6646274584834745e-06, + "loss": 0.8195, + "step": 14304 + }, + { + "epoch": 0.7873300676977268, + "grad_norm": 0.7184805870056152, + "learning_rate": 6.664218713522593e-06, + "loss": 0.8699, + "step": 14305 + }, + { + "epoch": 0.7873851065000825, + "grad_norm": 0.6567219495773315, + "learning_rate": 6.6638099560538905e-06, + "loss": 0.7679, + "step": 14306 + }, + { + "epoch": 0.7874401453024382, + "grad_norm": 0.6952399611473083, + "learning_rate": 6.663401186080436e-06, + "loss": 0.603, + "step": 14307 + }, + { + "epoch": 0.7874951841047939, + "grad_norm": 0.7298767566680908, + "learning_rate": 6.662992403605304e-06, + "loss": 0.7655, + "step": 14308 + }, + { + "epoch": 0.7875502229071495, + "grad_norm": 0.7162219882011414, + "learning_rate": 6.662583608631567e-06, + "loss": 0.7797, + "step": 14309 + }, + { + "epoch": 0.7876052617095052, + "grad_norm": 0.6489827036857605, + "learning_rate": 6.662174801162296e-06, + "loss": 0.8165, + "step": 14310 + }, + { + "epoch": 0.7876603005118609, + "grad_norm": 0.7893611192703247, + "learning_rate": 6.6617659812005635e-06, + "loss": 0.8082, + "step": 14311 + }, + { + "epoch": 0.7877153393142166, + "grad_norm": 0.6709675192832947, + "learning_rate": 6.661357148749443e-06, + "loss": 0.7549, + "step": 14312 + }, + { + "epoch": 0.7877703781165721, + "grad_norm": 0.6166689991950989, + "learning_rate": 6.660948303812009e-06, + "loss": 0.7116, + "step": 14313 + }, + { + "epoch": 0.7878254169189278, + "grad_norm": 0.7941738367080688, + "learning_rate": 6.660539446391329e-06, + "loss": 0.7981, + "step": 14314 + }, + { + "epoch": 0.7878804557212835, + "grad_norm": 0.6339346170425415, + "learning_rate": 6.660130576490481e-06, + "loss": 0.7306, + "step": 14315 + }, + { + "epoch": 0.7879354945236392, + "grad_norm": 0.7044192552566528, + "learning_rate": 6.659721694112535e-06, + "loss": 0.7811, + "step": 14316 + }, + { + "epoch": 0.7879905333259948, + "grad_norm": 0.7853406071662903, + "learning_rate": 6.659312799260565e-06, + "loss": 0.7652, + "step": 14317 + }, + { + "epoch": 0.7880455721283505, + "grad_norm": 0.7076637148857117, + "learning_rate": 6.658903891937645e-06, + "loss": 0.7672, + "step": 14318 + }, + { + "epoch": 0.7881006109307062, + "grad_norm": 0.7043278813362122, + "learning_rate": 6.658494972146847e-06, + "loss": 0.726, + "step": 14319 + }, + { + "epoch": 0.7881556497330618, + "grad_norm": 0.8903809785842896, + "learning_rate": 6.658086039891245e-06, + "loss": 0.8, + "step": 14320 + }, + { + "epoch": 0.7882106885354174, + "grad_norm": 0.8239984512329102, + "learning_rate": 6.657677095173911e-06, + "loss": 0.7283, + "step": 14321 + }, + { + "epoch": 0.7882657273377731, + "grad_norm": 0.7221176028251648, + "learning_rate": 6.6572681379979206e-06, + "loss": 0.8058, + "step": 14322 + }, + { + "epoch": 0.7883207661401288, + "grad_norm": 0.8297285437583923, + "learning_rate": 6.6568591683663475e-06, + "loss": 0.8064, + "step": 14323 + }, + { + "epoch": 0.7883758049424845, + "grad_norm": 0.680659294128418, + "learning_rate": 6.656450186282264e-06, + "loss": 0.7259, + "step": 14324 + }, + { + "epoch": 0.7884308437448401, + "grad_norm": 0.7067807912826538, + "learning_rate": 6.656041191748744e-06, + "loss": 0.8414, + "step": 14325 + }, + { + "epoch": 0.7884858825471958, + "grad_norm": 0.6053900718688965, + "learning_rate": 6.655632184768861e-06, + "loss": 0.6762, + "step": 14326 + }, + { + "epoch": 0.7885409213495514, + "grad_norm": 0.7123621106147766, + "learning_rate": 6.65522316534569e-06, + "loss": 0.6968, + "step": 14327 + }, + { + "epoch": 0.7885959601519071, + "grad_norm": 0.7308228015899658, + "learning_rate": 6.6548141334823045e-06, + "loss": 0.6715, + "step": 14328 + }, + { + "epoch": 0.7886509989542627, + "grad_norm": 0.7508199214935303, + "learning_rate": 6.654405089181779e-06, + "loss": 0.7884, + "step": 14329 + }, + { + "epoch": 0.7887060377566184, + "grad_norm": 0.7317141890525818, + "learning_rate": 6.653996032447188e-06, + "loss": 0.7319, + "step": 14330 + }, + { + "epoch": 0.7887610765589741, + "grad_norm": 0.6797091364860535, + "learning_rate": 6.653586963281607e-06, + "loss": 0.7898, + "step": 14331 + }, + { + "epoch": 0.7888161153613298, + "grad_norm": 0.6293582320213318, + "learning_rate": 6.6531778816881065e-06, + "loss": 0.6784, + "step": 14332 + }, + { + "epoch": 0.7888711541636854, + "grad_norm": 0.7604238986968994, + "learning_rate": 6.652768787669763e-06, + "loss": 0.7226, + "step": 14333 + }, + { + "epoch": 0.788926192966041, + "grad_norm": 0.6921128034591675, + "learning_rate": 6.652359681229654e-06, + "loss": 0.7375, + "step": 14334 + }, + { + "epoch": 0.7889812317683967, + "grad_norm": 0.6532993316650391, + "learning_rate": 6.651950562370851e-06, + "loss": 0.703, + "step": 14335 + }, + { + "epoch": 0.7890362705707524, + "grad_norm": 0.6739360094070435, + "learning_rate": 6.651541431096431e-06, + "loss": 0.7488, + "step": 14336 + }, + { + "epoch": 0.789091309373108, + "grad_norm": 0.7503200173377991, + "learning_rate": 6.651132287409466e-06, + "loss": 0.7492, + "step": 14337 + }, + { + "epoch": 0.7891463481754637, + "grad_norm": 0.6537551879882812, + "learning_rate": 6.650723131313035e-06, + "loss": 0.723, + "step": 14338 + }, + { + "epoch": 0.7892013869778194, + "grad_norm": 0.6378511786460876, + "learning_rate": 6.650313962810208e-06, + "loss": 0.7764, + "step": 14339 + }, + { + "epoch": 0.7892564257801751, + "grad_norm": 0.7948685884475708, + "learning_rate": 6.649904781904065e-06, + "loss": 0.7996, + "step": 14340 + }, + { + "epoch": 0.7893114645825307, + "grad_norm": 0.7558071613311768, + "learning_rate": 6.649495588597678e-06, + "loss": 0.8249, + "step": 14341 + }, + { + "epoch": 0.7893665033848863, + "grad_norm": 0.7158063054084778, + "learning_rate": 6.649086382894124e-06, + "loss": 0.815, + "step": 14342 + }, + { + "epoch": 0.789421542187242, + "grad_norm": 0.7551599144935608, + "learning_rate": 6.648677164796479e-06, + "loss": 0.7151, + "step": 14343 + }, + { + "epoch": 0.7894765809895977, + "grad_norm": 0.6966339349746704, + "learning_rate": 6.648267934307817e-06, + "loss": 0.8057, + "step": 14344 + }, + { + "epoch": 0.7895316197919533, + "grad_norm": 0.6863396167755127, + "learning_rate": 6.647858691431214e-06, + "loss": 0.7819, + "step": 14345 + }, + { + "epoch": 0.789586658594309, + "grad_norm": 0.7352383136749268, + "learning_rate": 6.647449436169747e-06, + "loss": 0.8101, + "step": 14346 + }, + { + "epoch": 0.7896416973966647, + "grad_norm": 0.7630855441093445, + "learning_rate": 6.64704016852649e-06, + "loss": 0.7155, + "step": 14347 + }, + { + "epoch": 0.7896967361990203, + "grad_norm": 0.6740198135375977, + "learning_rate": 6.646630888504522e-06, + "loss": 0.7255, + "step": 14348 + }, + { + "epoch": 0.7897517750013759, + "grad_norm": 0.7095367908477783, + "learning_rate": 6.646221596106917e-06, + "loss": 0.7527, + "step": 14349 + }, + { + "epoch": 0.7898068138037316, + "grad_norm": 0.6096131801605225, + "learning_rate": 6.645812291336749e-06, + "loss": 0.7116, + "step": 14350 + }, + { + "epoch": 0.7898618526060873, + "grad_norm": 0.7212585210800171, + "learning_rate": 6.645402974197097e-06, + "loss": 0.7647, + "step": 14351 + }, + { + "epoch": 0.7899168914084429, + "grad_norm": 0.7145454287528992, + "learning_rate": 6.6449936446910376e-06, + "loss": 0.7988, + "step": 14352 + }, + { + "epoch": 0.7899719302107986, + "grad_norm": 0.668269693851471, + "learning_rate": 6.644584302821646e-06, + "loss": 0.8453, + "step": 14353 + }, + { + "epoch": 0.7900269690131543, + "grad_norm": 0.7431649565696716, + "learning_rate": 6.644174948591998e-06, + "loss": 0.6981, + "step": 14354 + }, + { + "epoch": 0.79008200781551, + "grad_norm": 0.6727485060691833, + "learning_rate": 6.643765582005172e-06, + "loss": 0.792, + "step": 14355 + }, + { + "epoch": 0.7901370466178655, + "grad_norm": 0.7102059721946716, + "learning_rate": 6.643356203064244e-06, + "loss": 0.7469, + "step": 14356 + }, + { + "epoch": 0.7901920854202212, + "grad_norm": 0.6719706654548645, + "learning_rate": 6.642946811772291e-06, + "loss": 0.7542, + "step": 14357 + }, + { + "epoch": 0.7902471242225769, + "grad_norm": 0.7044880986213684, + "learning_rate": 6.6425374081323875e-06, + "loss": 0.7884, + "step": 14358 + }, + { + "epoch": 0.7903021630249326, + "grad_norm": 0.656411349773407, + "learning_rate": 6.642127992147614e-06, + "loss": 0.7596, + "step": 14359 + }, + { + "epoch": 0.7903572018272882, + "grad_norm": 0.6256445050239563, + "learning_rate": 6.641718563821047e-06, + "loss": 0.6257, + "step": 14360 + }, + { + "epoch": 0.7904122406296439, + "grad_norm": 0.6761715412139893, + "learning_rate": 6.641309123155761e-06, + "loss": 0.7024, + "step": 14361 + }, + { + "epoch": 0.7904672794319996, + "grad_norm": 0.7567794322967529, + "learning_rate": 6.640899670154837e-06, + "loss": 0.7948, + "step": 14362 + }, + { + "epoch": 0.7905223182343553, + "grad_norm": 0.6192977428436279, + "learning_rate": 6.640490204821349e-06, + "loss": 0.7307, + "step": 14363 + }, + { + "epoch": 0.7905773570367108, + "grad_norm": 0.8120929002761841, + "learning_rate": 6.640080727158376e-06, + "loss": 0.7173, + "step": 14364 + }, + { + "epoch": 0.7906323958390665, + "grad_norm": 0.7303271293640137, + "learning_rate": 6.639671237168996e-06, + "loss": 0.8118, + "step": 14365 + }, + { + "epoch": 0.7906874346414222, + "grad_norm": 0.6731529831886292, + "learning_rate": 6.639261734856284e-06, + "loss": 0.76, + "step": 14366 + }, + { + "epoch": 0.7907424734437779, + "grad_norm": 0.6909935474395752, + "learning_rate": 6.638852220223321e-06, + "loss": 0.7732, + "step": 14367 + }, + { + "epoch": 0.7907975122461335, + "grad_norm": 0.6543979048728943, + "learning_rate": 6.638442693273183e-06, + "loss": 0.7408, + "step": 14368 + }, + { + "epoch": 0.7908525510484892, + "grad_norm": 0.6411511301994324, + "learning_rate": 6.6380331540089485e-06, + "loss": 0.6963, + "step": 14369 + }, + { + "epoch": 0.7909075898508449, + "grad_norm": 0.6657214164733887, + "learning_rate": 6.637623602433694e-06, + "loss": 0.7417, + "step": 14370 + }, + { + "epoch": 0.7909626286532006, + "grad_norm": 0.6852405071258545, + "learning_rate": 6.6372140385505e-06, + "loss": 0.7176, + "step": 14371 + }, + { + "epoch": 0.7910176674555561, + "grad_norm": 0.6453777551651001, + "learning_rate": 6.636804462362444e-06, + "loss": 0.7791, + "step": 14372 + }, + { + "epoch": 0.7910727062579118, + "grad_norm": 0.6806328296661377, + "learning_rate": 6.636394873872603e-06, + "loss": 0.7856, + "step": 14373 + }, + { + "epoch": 0.7911277450602675, + "grad_norm": 0.6819495558738708, + "learning_rate": 6.635985273084058e-06, + "loss": 0.7865, + "step": 14374 + }, + { + "epoch": 0.7911827838626232, + "grad_norm": 0.7372999787330627, + "learning_rate": 6.635575659999883e-06, + "loss": 0.8549, + "step": 14375 + }, + { + "epoch": 0.7912378226649788, + "grad_norm": 0.8146817684173584, + "learning_rate": 6.635166034623162e-06, + "loss": 0.7253, + "step": 14376 + }, + { + "epoch": 0.7912928614673345, + "grad_norm": 0.8205630779266357, + "learning_rate": 6.634756396956969e-06, + "loss": 0.6915, + "step": 14377 + }, + { + "epoch": 0.7913479002696902, + "grad_norm": 0.7168713808059692, + "learning_rate": 6.634346747004383e-06, + "loss": 0.7495, + "step": 14378 + }, + { + "epoch": 0.7914029390720458, + "grad_norm": 0.7210709452629089, + "learning_rate": 6.6339370847684854e-06, + "loss": 0.7323, + "step": 14379 + }, + { + "epoch": 0.7914579778744014, + "grad_norm": 0.9042065143585205, + "learning_rate": 6.633527410252355e-06, + "loss": 0.847, + "step": 14380 + }, + { + "epoch": 0.7915130166767571, + "grad_norm": 0.6700118184089661, + "learning_rate": 6.633117723459071e-06, + "loss": 0.7975, + "step": 14381 + }, + { + "epoch": 0.7915680554791128, + "grad_norm": 0.6355725526809692, + "learning_rate": 6.632708024391707e-06, + "loss": 0.7398, + "step": 14382 + }, + { + "epoch": 0.7916230942814685, + "grad_norm": 0.8274535536766052, + "learning_rate": 6.6322983130533505e-06, + "loss": 0.8641, + "step": 14383 + }, + { + "epoch": 0.7916781330838241, + "grad_norm": 0.5835573077201843, + "learning_rate": 6.631888589447075e-06, + "loss": 0.636, + "step": 14384 + }, + { + "epoch": 0.7917331718861798, + "grad_norm": 0.6933130621910095, + "learning_rate": 6.631478853575963e-06, + "loss": 0.7874, + "step": 14385 + }, + { + "epoch": 0.7917882106885354, + "grad_norm": 0.8125241994857788, + "learning_rate": 6.631069105443092e-06, + "loss": 0.7961, + "step": 14386 + }, + { + "epoch": 0.7918432494908911, + "grad_norm": 0.6661116480827332, + "learning_rate": 6.630659345051542e-06, + "loss": 0.6498, + "step": 14387 + }, + { + "epoch": 0.7918982882932467, + "grad_norm": 0.6807548403739929, + "learning_rate": 6.630249572404393e-06, + "loss": 0.6952, + "step": 14388 + }, + { + "epoch": 0.7919533270956024, + "grad_norm": 0.6886214017868042, + "learning_rate": 6.629839787504726e-06, + "loss": 0.7416, + "step": 14389 + }, + { + "epoch": 0.7920083658979581, + "grad_norm": 0.7633732557296753, + "learning_rate": 6.629429990355617e-06, + "loss": 0.8008, + "step": 14390 + }, + { + "epoch": 0.7920634047003137, + "grad_norm": 0.8401023745536804, + "learning_rate": 6.6290201809601494e-06, + "loss": 0.8312, + "step": 14391 + }, + { + "epoch": 0.7921184435026694, + "grad_norm": 0.6608526706695557, + "learning_rate": 6.628610359321403e-06, + "loss": 0.563, + "step": 14392 + }, + { + "epoch": 0.792173482305025, + "grad_norm": 0.687045156955719, + "learning_rate": 6.6282005254424566e-06, + "loss": 0.7451, + "step": 14393 + }, + { + "epoch": 0.7922285211073807, + "grad_norm": 0.7129287123680115, + "learning_rate": 6.627790679326389e-06, + "loss": 0.8495, + "step": 14394 + }, + { + "epoch": 0.7922835599097363, + "grad_norm": 0.6951952576637268, + "learning_rate": 6.627380820976283e-06, + "loss": 0.7895, + "step": 14395 + }, + { + "epoch": 0.792338598712092, + "grad_norm": 0.8020780086517334, + "learning_rate": 6.626970950395221e-06, + "loss": 0.7136, + "step": 14396 + }, + { + "epoch": 0.7923936375144477, + "grad_norm": 0.6654007434844971, + "learning_rate": 6.626561067586279e-06, + "loss": 0.7865, + "step": 14397 + }, + { + "epoch": 0.7924486763168034, + "grad_norm": 0.844744861125946, + "learning_rate": 6.62615117255254e-06, + "loss": 0.7856, + "step": 14398 + }, + { + "epoch": 0.792503715119159, + "grad_norm": 0.6890879273414612, + "learning_rate": 6.625741265297083e-06, + "loss": 0.7574, + "step": 14399 + }, + { + "epoch": 0.7925587539215146, + "grad_norm": 0.7559735774993896, + "learning_rate": 6.625331345822992e-06, + "loss": 0.634, + "step": 14400 + }, + { + "epoch": 0.7926137927238703, + "grad_norm": 0.6918107867240906, + "learning_rate": 6.624921414133344e-06, + "loss": 0.6935, + "step": 14401 + }, + { + "epoch": 0.792668831526226, + "grad_norm": 0.7468792200088501, + "learning_rate": 6.624511470231221e-06, + "loss": 0.7301, + "step": 14402 + }, + { + "epoch": 0.7927238703285816, + "grad_norm": 0.6749486327171326, + "learning_rate": 6.624101514119705e-06, + "loss": 0.7143, + "step": 14403 + }, + { + "epoch": 0.7927789091309373, + "grad_norm": 0.7765836119651794, + "learning_rate": 6.623691545801878e-06, + "loss": 0.7201, + "step": 14404 + }, + { + "epoch": 0.792833947933293, + "grad_norm": 0.6263312697410583, + "learning_rate": 6.623281565280819e-06, + "loss": 0.5866, + "step": 14405 + }, + { + "epoch": 0.7928889867356487, + "grad_norm": 0.6325232982635498, + "learning_rate": 6.62287157255961e-06, + "loss": 0.7389, + "step": 14406 + }, + { + "epoch": 0.7929440255380042, + "grad_norm": 0.7165958881378174, + "learning_rate": 6.622461567641333e-06, + "loss": 0.7378, + "step": 14407 + }, + { + "epoch": 0.7929990643403599, + "grad_norm": 0.7611519694328308, + "learning_rate": 6.62205155052907e-06, + "loss": 0.7146, + "step": 14408 + }, + { + "epoch": 0.7930541031427156, + "grad_norm": 0.6764969825744629, + "learning_rate": 6.6216415212259e-06, + "loss": 0.7802, + "step": 14409 + }, + { + "epoch": 0.7931091419450713, + "grad_norm": 0.7266956567764282, + "learning_rate": 6.621231479734908e-06, + "loss": 0.7065, + "step": 14410 + }, + { + "epoch": 0.7931641807474269, + "grad_norm": 0.7540454268455505, + "learning_rate": 6.620821426059174e-06, + "loss": 0.7327, + "step": 14411 + }, + { + "epoch": 0.7932192195497826, + "grad_norm": 0.7931423783302307, + "learning_rate": 6.620411360201779e-06, + "loss": 0.8032, + "step": 14412 + }, + { + "epoch": 0.7932742583521383, + "grad_norm": 1.2976648807525635, + "learning_rate": 6.620001282165808e-06, + "loss": 0.7422, + "step": 14413 + }, + { + "epoch": 0.793329297154494, + "grad_norm": 0.6525906920433044, + "learning_rate": 6.619591191954338e-06, + "loss": 0.6857, + "step": 14414 + }, + { + "epoch": 0.7933843359568495, + "grad_norm": 0.6153263449668884, + "learning_rate": 6.619181089570456e-06, + "loss": 0.6117, + "step": 14415 + }, + { + "epoch": 0.7934393747592052, + "grad_norm": 0.7076815962791443, + "learning_rate": 6.6187709750172425e-06, + "loss": 0.8053, + "step": 14416 + }, + { + "epoch": 0.7934944135615609, + "grad_norm": 0.6999046802520752, + "learning_rate": 6.618360848297779e-06, + "loss": 0.6275, + "step": 14417 + }, + { + "epoch": 0.7935494523639166, + "grad_norm": 0.7043859958648682, + "learning_rate": 6.6179507094151484e-06, + "loss": 0.8273, + "step": 14418 + }, + { + "epoch": 0.7936044911662722, + "grad_norm": 0.6295393705368042, + "learning_rate": 6.617540558372434e-06, + "loss": 0.6394, + "step": 14419 + }, + { + "epoch": 0.7936595299686279, + "grad_norm": 0.8165664076805115, + "learning_rate": 6.617130395172718e-06, + "loss": 0.8473, + "step": 14420 + }, + { + "epoch": 0.7937145687709836, + "grad_norm": 0.7598135471343994, + "learning_rate": 6.616720219819082e-06, + "loss": 0.729, + "step": 14421 + }, + { + "epoch": 0.7937696075733393, + "grad_norm": 0.7222034335136414, + "learning_rate": 6.6163100323146105e-06, + "loss": 0.7526, + "step": 14422 + }, + { + "epoch": 0.7938246463756948, + "grad_norm": 0.7994693517684937, + "learning_rate": 6.615899832662385e-06, + "loss": 0.8346, + "step": 14423 + }, + { + "epoch": 0.7938796851780505, + "grad_norm": 0.6603162884712219, + "learning_rate": 6.615489620865489e-06, + "loss": 0.7546, + "step": 14424 + }, + { + "epoch": 0.7939347239804062, + "grad_norm": 0.6525929570198059, + "learning_rate": 6.615079396927005e-06, + "loss": 0.7344, + "step": 14425 + }, + { + "epoch": 0.7939897627827619, + "grad_norm": 0.6144835948944092, + "learning_rate": 6.614669160850016e-06, + "loss": 0.6776, + "step": 14426 + }, + { + "epoch": 0.7940448015851175, + "grad_norm": 0.7205507159233093, + "learning_rate": 6.614258912637607e-06, + "loss": 0.809, + "step": 14427 + }, + { + "epoch": 0.7940998403874732, + "grad_norm": 0.6757732629776001, + "learning_rate": 6.61384865229286e-06, + "loss": 0.7403, + "step": 14428 + }, + { + "epoch": 0.7941548791898289, + "grad_norm": 0.6392103433609009, + "learning_rate": 6.6134383798188586e-06, + "loss": 0.7689, + "step": 14429 + }, + { + "epoch": 0.7942099179921845, + "grad_norm": 0.6647289395332336, + "learning_rate": 6.613028095218685e-06, + "loss": 0.6611, + "step": 14430 + }, + { + "epoch": 0.7942649567945401, + "grad_norm": 0.6961668133735657, + "learning_rate": 6.612617798495426e-06, + "loss": 0.7784, + "step": 14431 + }, + { + "epoch": 0.7943199955968958, + "grad_norm": 1.1188037395477295, + "learning_rate": 6.6122074896521615e-06, + "loss": 0.6518, + "step": 14432 + }, + { + "epoch": 0.7943750343992515, + "grad_norm": 0.6382507085800171, + "learning_rate": 6.611797168691978e-06, + "loss": 0.6954, + "step": 14433 + }, + { + "epoch": 0.7944300732016071, + "grad_norm": 0.6720117330551147, + "learning_rate": 6.6113868356179585e-06, + "loss": 0.7267, + "step": 14434 + }, + { + "epoch": 0.7944851120039628, + "grad_norm": 0.6667274832725525, + "learning_rate": 6.610976490433186e-06, + "loss": 0.6867, + "step": 14435 + }, + { + "epoch": 0.7945401508063185, + "grad_norm": 0.658217191696167, + "learning_rate": 6.610566133140747e-06, + "loss": 0.66, + "step": 14436 + }, + { + "epoch": 0.7945951896086741, + "grad_norm": 0.6820386648178101, + "learning_rate": 6.610155763743723e-06, + "loss": 0.7352, + "step": 14437 + }, + { + "epoch": 0.7946502284110297, + "grad_norm": 0.788696825504303, + "learning_rate": 6.609745382245198e-06, + "loss": 0.6822, + "step": 14438 + }, + { + "epoch": 0.7947052672133854, + "grad_norm": 0.6485540270805359, + "learning_rate": 6.6093349886482596e-06, + "loss": 0.718, + "step": 14439 + }, + { + "epoch": 0.7947603060157411, + "grad_norm": 0.717659056186676, + "learning_rate": 6.60892458295599e-06, + "loss": 0.7898, + "step": 14440 + }, + { + "epoch": 0.7948153448180968, + "grad_norm": 0.6576352119445801, + "learning_rate": 6.608514165171473e-06, + "loss": 0.8041, + "step": 14441 + }, + { + "epoch": 0.7948703836204524, + "grad_norm": 0.7034726738929749, + "learning_rate": 6.608103735297795e-06, + "loss": 0.7901, + "step": 14442 + }, + { + "epoch": 0.7949254224228081, + "grad_norm": 0.7001451253890991, + "learning_rate": 6.6076932933380386e-06, + "loss": 0.6814, + "step": 14443 + }, + { + "epoch": 0.7949804612251637, + "grad_norm": 0.789359450340271, + "learning_rate": 6.607282839295291e-06, + "loss": 0.744, + "step": 14444 + }, + { + "epoch": 0.7950355000275194, + "grad_norm": 0.7830412983894348, + "learning_rate": 6.606872373172636e-06, + "loss": 0.8161, + "step": 14445 + }, + { + "epoch": 0.795090538829875, + "grad_norm": 0.6462455987930298, + "learning_rate": 6.606461894973157e-06, + "loss": 0.7723, + "step": 14446 + }, + { + "epoch": 0.7951455776322307, + "grad_norm": 0.6232526898384094, + "learning_rate": 6.606051404699943e-06, + "loss": 0.6723, + "step": 14447 + }, + { + "epoch": 0.7952006164345864, + "grad_norm": 0.7790026068687439, + "learning_rate": 6.605640902356074e-06, + "loss": 0.7687, + "step": 14448 + }, + { + "epoch": 0.7952556552369421, + "grad_norm": 0.7281851768493652, + "learning_rate": 6.605230387944639e-06, + "loss": 0.827, + "step": 14449 + }, + { + "epoch": 0.7953106940392977, + "grad_norm": 0.6519556045532227, + "learning_rate": 6.604819861468721e-06, + "loss": 0.7039, + "step": 14450 + }, + { + "epoch": 0.7953657328416534, + "grad_norm": 0.6768763661384583, + "learning_rate": 6.604409322931406e-06, + "loss": 0.7288, + "step": 14451 + }, + { + "epoch": 0.795420771644009, + "grad_norm": 0.7457320094108582, + "learning_rate": 6.6039987723357825e-06, + "loss": 0.8386, + "step": 14452 + }, + { + "epoch": 0.7954758104463647, + "grad_norm": 0.9579072594642639, + "learning_rate": 6.6035882096849325e-06, + "loss": 0.7552, + "step": 14453 + }, + { + "epoch": 0.7955308492487203, + "grad_norm": 0.6709916591644287, + "learning_rate": 6.603177634981941e-06, + "loss": 0.724, + "step": 14454 + }, + { + "epoch": 0.795585888051076, + "grad_norm": 0.6097317934036255, + "learning_rate": 6.602767048229897e-06, + "loss": 0.6866, + "step": 14455 + }, + { + "epoch": 0.7956409268534317, + "grad_norm": 0.7303394675254822, + "learning_rate": 6.602356449431885e-06, + "loss": 0.682, + "step": 14456 + }, + { + "epoch": 0.7956959656557874, + "grad_norm": 0.775979220867157, + "learning_rate": 6.601945838590991e-06, + "loss": 0.7784, + "step": 14457 + }, + { + "epoch": 0.795751004458143, + "grad_norm": 0.7016483545303345, + "learning_rate": 6.6015352157103e-06, + "loss": 0.7557, + "step": 14458 + }, + { + "epoch": 0.7958060432604986, + "grad_norm": 0.688946545124054, + "learning_rate": 6.6011245807929e-06, + "loss": 0.707, + "step": 14459 + }, + { + "epoch": 0.7958610820628543, + "grad_norm": 0.7286174297332764, + "learning_rate": 6.600713933841877e-06, + "loss": 0.784, + "step": 14460 + }, + { + "epoch": 0.79591612086521, + "grad_norm": 0.7604749798774719, + "learning_rate": 6.600303274860316e-06, + "loss": 0.7099, + "step": 14461 + }, + { + "epoch": 0.7959711596675656, + "grad_norm": 0.6626706123352051, + "learning_rate": 6.599892603851301e-06, + "loss": 0.7137, + "step": 14462 + }, + { + "epoch": 0.7960261984699213, + "grad_norm": 0.7692080736160278, + "learning_rate": 6.599481920817925e-06, + "loss": 0.847, + "step": 14463 + }, + { + "epoch": 0.796081237272277, + "grad_norm": 0.6811042428016663, + "learning_rate": 6.599071225763269e-06, + "loss": 0.7888, + "step": 14464 + }, + { + "epoch": 0.7961362760746327, + "grad_norm": 0.654481053352356, + "learning_rate": 6.598660518690424e-06, + "loss": 0.6973, + "step": 14465 + }, + { + "epoch": 0.7961913148769882, + "grad_norm": 0.7332738637924194, + "learning_rate": 6.598249799602472e-06, + "loss": 0.8311, + "step": 14466 + }, + { + "epoch": 0.7962463536793439, + "grad_norm": 0.7098381519317627, + "learning_rate": 6.597839068502503e-06, + "loss": 0.8265, + "step": 14467 + }, + { + "epoch": 0.7963013924816996, + "grad_norm": 0.6338212490081787, + "learning_rate": 6.597428325393604e-06, + "loss": 0.6889, + "step": 14468 + }, + { + "epoch": 0.7963564312840553, + "grad_norm": 0.7001339197158813, + "learning_rate": 6.597017570278861e-06, + "loss": 0.7613, + "step": 14469 + }, + { + "epoch": 0.7964114700864109, + "grad_norm": 0.6565783619880676, + "learning_rate": 6.596606803161361e-06, + "loss": 0.6284, + "step": 14470 + }, + { + "epoch": 0.7964665088887666, + "grad_norm": 0.6638015508651733, + "learning_rate": 6.5961960240441935e-06, + "loss": 0.6635, + "step": 14471 + }, + { + "epoch": 0.7965215476911223, + "grad_norm": 0.6389575600624084, + "learning_rate": 6.595785232930443e-06, + "loss": 0.6588, + "step": 14472 + }, + { + "epoch": 0.796576586493478, + "grad_norm": 0.9486858248710632, + "learning_rate": 6.595374429823197e-06, + "loss": 0.8314, + "step": 14473 + }, + { + "epoch": 0.7966316252958335, + "grad_norm": 0.7555649280548096, + "learning_rate": 6.594963614725544e-06, + "loss": 0.8173, + "step": 14474 + }, + { + "epoch": 0.7966866640981892, + "grad_norm": 0.63021320104599, + "learning_rate": 6.5945527876405715e-06, + "loss": 0.7038, + "step": 14475 + }, + { + "epoch": 0.7967417029005449, + "grad_norm": 0.802980899810791, + "learning_rate": 6.594141948571366e-06, + "loss": 0.8031, + "step": 14476 + }, + { + "epoch": 0.7967967417029005, + "grad_norm": 0.7204614281654358, + "learning_rate": 6.593731097521019e-06, + "loss": 0.827, + "step": 14477 + }, + { + "epoch": 0.7968517805052562, + "grad_norm": 0.6805211305618286, + "learning_rate": 6.593320234492613e-06, + "loss": 0.7405, + "step": 14478 + }, + { + "epoch": 0.7969068193076119, + "grad_norm": 0.7011345028877258, + "learning_rate": 6.59290935948924e-06, + "loss": 0.7241, + "step": 14479 + }, + { + "epoch": 0.7969618581099676, + "grad_norm": 0.8995540738105774, + "learning_rate": 6.592498472513986e-06, + "loss": 0.6864, + "step": 14480 + }, + { + "epoch": 0.7970168969123231, + "grad_norm": 0.7518284320831299, + "learning_rate": 6.592087573569941e-06, + "loss": 0.7561, + "step": 14481 + }, + { + "epoch": 0.7970719357146788, + "grad_norm": 0.6359231472015381, + "learning_rate": 6.591676662660191e-06, + "loss": 0.6402, + "step": 14482 + }, + { + "epoch": 0.7971269745170345, + "grad_norm": 0.6610120534896851, + "learning_rate": 6.5912657397878264e-06, + "loss": 0.6419, + "step": 14483 + }, + { + "epoch": 0.7971820133193902, + "grad_norm": 0.7054341435432434, + "learning_rate": 6.590854804955934e-06, + "loss": 0.7252, + "step": 14484 + }, + { + "epoch": 0.7972370521217458, + "grad_norm": 0.6929903626441956, + "learning_rate": 6.5904438581676025e-06, + "loss": 0.6566, + "step": 14485 + }, + { + "epoch": 0.7972920909241015, + "grad_norm": 0.7354124188423157, + "learning_rate": 6.59003289942592e-06, + "loss": 0.763, + "step": 14486 + }, + { + "epoch": 0.7973471297264572, + "grad_norm": 0.6366610527038574, + "learning_rate": 6.5896219287339755e-06, + "loss": 0.6601, + "step": 14487 + }, + { + "epoch": 0.7974021685288128, + "grad_norm": 0.6916924715042114, + "learning_rate": 6.589210946094859e-06, + "loss": 0.7683, + "step": 14488 + }, + { + "epoch": 0.7974572073311684, + "grad_norm": 0.6567399501800537, + "learning_rate": 6.5887999515116586e-06, + "loss": 0.7487, + "step": 14489 + }, + { + "epoch": 0.7975122461335241, + "grad_norm": 0.8082888722419739, + "learning_rate": 6.5883889449874626e-06, + "loss": 0.7579, + "step": 14490 + }, + { + "epoch": 0.7975672849358798, + "grad_norm": 0.7138401865959167, + "learning_rate": 6.58797792652536e-06, + "loss": 0.7256, + "step": 14491 + }, + { + "epoch": 0.7976223237382355, + "grad_norm": 0.6514482498168945, + "learning_rate": 6.587566896128441e-06, + "loss": 0.6612, + "step": 14492 + }, + { + "epoch": 0.7976773625405911, + "grad_norm": 0.6770455837249756, + "learning_rate": 6.587155853799795e-06, + "loss": 0.677, + "step": 14493 + }, + { + "epoch": 0.7977324013429468, + "grad_norm": 0.6956327557563782, + "learning_rate": 6.586744799542511e-06, + "loss": 0.7824, + "step": 14494 + }, + { + "epoch": 0.7977874401453025, + "grad_norm": 0.6565653085708618, + "learning_rate": 6.586333733359676e-06, + "loss": 0.7496, + "step": 14495 + }, + { + "epoch": 0.7978424789476581, + "grad_norm": 0.6353399157524109, + "learning_rate": 6.585922655254382e-06, + "loss": 0.7264, + "step": 14496 + }, + { + "epoch": 0.7978975177500137, + "grad_norm": 1.037051796913147, + "learning_rate": 6.585511565229717e-06, + "loss": 0.7562, + "step": 14497 + }, + { + "epoch": 0.7979525565523694, + "grad_norm": 0.6447896957397461, + "learning_rate": 6.5851004632887725e-06, + "loss": 0.7509, + "step": 14498 + }, + { + "epoch": 0.7980075953547251, + "grad_norm": 0.7022401690483093, + "learning_rate": 6.584689349434636e-06, + "loss": 0.7752, + "step": 14499 + }, + { + "epoch": 0.7980626341570808, + "grad_norm": 0.7033591270446777, + "learning_rate": 6.5842782236703996e-06, + "loss": 0.7693, + "step": 14500 + }, + { + "epoch": 0.7981176729594364, + "grad_norm": 0.7061769962310791, + "learning_rate": 6.583867085999151e-06, + "loss": 0.6833, + "step": 14501 + }, + { + "epoch": 0.798172711761792, + "grad_norm": 0.7934882640838623, + "learning_rate": 6.583455936423984e-06, + "loss": 0.799, + "step": 14502 + }, + { + "epoch": 0.7982277505641477, + "grad_norm": 0.6968011260032654, + "learning_rate": 6.5830447749479835e-06, + "loss": 0.7132, + "step": 14503 + }, + { + "epoch": 0.7982827893665034, + "grad_norm": 1.7348299026489258, + "learning_rate": 6.582633601574243e-06, + "loss": 0.8996, + "step": 14504 + }, + { + "epoch": 0.798337828168859, + "grad_norm": 0.6822964549064636, + "learning_rate": 6.582222416305852e-06, + "loss": 0.7381, + "step": 14505 + }, + { + "epoch": 0.7983928669712147, + "grad_norm": 0.6600543856620789, + "learning_rate": 6.581811219145902e-06, + "loss": 0.711, + "step": 14506 + }, + { + "epoch": 0.7984479057735704, + "grad_norm": 0.8719834089279175, + "learning_rate": 6.581400010097481e-06, + "loss": 0.7567, + "step": 14507 + }, + { + "epoch": 0.7985029445759261, + "grad_norm": 0.7221046090126038, + "learning_rate": 6.580988789163681e-06, + "loss": 0.7417, + "step": 14508 + }, + { + "epoch": 0.7985579833782817, + "grad_norm": 0.6720401048660278, + "learning_rate": 6.580577556347592e-06, + "loss": 0.7467, + "step": 14509 + }, + { + "epoch": 0.7986130221806373, + "grad_norm": 0.7007263898849487, + "learning_rate": 6.580166311652306e-06, + "loss": 0.7356, + "step": 14510 + }, + { + "epoch": 0.798668060982993, + "grad_norm": 0.7384739518165588, + "learning_rate": 6.579755055080912e-06, + "loss": 0.7807, + "step": 14511 + }, + { + "epoch": 0.7987230997853487, + "grad_norm": 0.8054519295692444, + "learning_rate": 6.579343786636503e-06, + "loss": 0.7737, + "step": 14512 + }, + { + "epoch": 0.7987781385877043, + "grad_norm": 1.042319655418396, + "learning_rate": 6.578932506322169e-06, + "loss": 0.8708, + "step": 14513 + }, + { + "epoch": 0.79883317739006, + "grad_norm": 0.7122198343276978, + "learning_rate": 6.578521214141e-06, + "loss": 0.7818, + "step": 14514 + }, + { + "epoch": 0.7988882161924157, + "grad_norm": 0.9158271551132202, + "learning_rate": 6.578109910096088e-06, + "loss": 0.7439, + "step": 14515 + }, + { + "epoch": 0.7989432549947714, + "grad_norm": 0.7280082106590271, + "learning_rate": 6.577698594190524e-06, + "loss": 0.7888, + "step": 14516 + }, + { + "epoch": 0.798998293797127, + "grad_norm": 0.8203748464584351, + "learning_rate": 6.577287266427401e-06, + "loss": 0.7669, + "step": 14517 + }, + { + "epoch": 0.7990533325994826, + "grad_norm": 0.6998257637023926, + "learning_rate": 6.576875926809809e-06, + "loss": 0.7819, + "step": 14518 + }, + { + "epoch": 0.7991083714018383, + "grad_norm": 0.672575831413269, + "learning_rate": 6.57646457534084e-06, + "loss": 0.7359, + "step": 14519 + }, + { + "epoch": 0.7991634102041939, + "grad_norm": 0.931996762752533, + "learning_rate": 6.5760532120235845e-06, + "loss": 0.8816, + "step": 14520 + }, + { + "epoch": 0.7992184490065496, + "grad_norm": 0.7250553369522095, + "learning_rate": 6.575641836861134e-06, + "loss": 0.7924, + "step": 14521 + }, + { + "epoch": 0.7992734878089053, + "grad_norm": 0.6658768057823181, + "learning_rate": 6.575230449856582e-06, + "loss": 0.7064, + "step": 14522 + }, + { + "epoch": 0.799328526611261, + "grad_norm": 0.6901206374168396, + "learning_rate": 6.57481905101302e-06, + "loss": 0.7826, + "step": 14523 + }, + { + "epoch": 0.7993835654136165, + "grad_norm": 0.6772152781486511, + "learning_rate": 6.5744076403335386e-06, + "loss": 0.8143, + "step": 14524 + }, + { + "epoch": 0.7994386042159722, + "grad_norm": 0.6718147397041321, + "learning_rate": 6.5739962178212325e-06, + "loss": 0.765, + "step": 14525 + }, + { + "epoch": 0.7994936430183279, + "grad_norm": 0.7435488700866699, + "learning_rate": 6.573584783479191e-06, + "loss": 0.8685, + "step": 14526 + }, + { + "epoch": 0.7995486818206836, + "grad_norm": 0.7146314382553101, + "learning_rate": 6.573173337310506e-06, + "loss": 0.7605, + "step": 14527 + }, + { + "epoch": 0.7996037206230392, + "grad_norm": 0.6808409690856934, + "learning_rate": 6.572761879318274e-06, + "loss": 0.6996, + "step": 14528 + }, + { + "epoch": 0.7996587594253949, + "grad_norm": 1.1303905248641968, + "learning_rate": 6.572350409505584e-06, + "loss": 0.6107, + "step": 14529 + }, + { + "epoch": 0.7997137982277506, + "grad_norm": 0.7584583163261414, + "learning_rate": 6.571938927875529e-06, + "loss": 0.771, + "step": 14530 + }, + { + "epoch": 0.7997688370301063, + "grad_norm": 0.808233916759491, + "learning_rate": 6.5715274344312015e-06, + "loss": 0.7179, + "step": 14531 + }, + { + "epoch": 0.7998238758324618, + "grad_norm": 0.7067314386367798, + "learning_rate": 6.571115929175695e-06, + "loss": 0.7519, + "step": 14532 + }, + { + "epoch": 0.7998789146348175, + "grad_norm": 0.7611628174781799, + "learning_rate": 6.570704412112101e-06, + "loss": 0.8727, + "step": 14533 + }, + { + "epoch": 0.7999339534371732, + "grad_norm": 0.6485727429389954, + "learning_rate": 6.5702928832435145e-06, + "loss": 0.8455, + "step": 14534 + }, + { + "epoch": 0.7999889922395289, + "grad_norm": 1.5309134721755981, + "learning_rate": 6.569881342573024e-06, + "loss": 0.8362, + "step": 14535 + }, + { + "epoch": 0.8000440310418845, + "grad_norm": 0.7068225145339966, + "learning_rate": 6.569469790103729e-06, + "loss": 0.7924, + "step": 14536 + }, + { + "epoch": 0.8000990698442402, + "grad_norm": 0.7326669692993164, + "learning_rate": 6.569058225838717e-06, + "loss": 0.7594, + "step": 14537 + }, + { + "epoch": 0.8001541086465959, + "grad_norm": 0.6705706119537354, + "learning_rate": 6.568646649781085e-06, + "loss": 0.7331, + "step": 14538 + }, + { + "epoch": 0.8002091474489516, + "grad_norm": 0.7303051948547363, + "learning_rate": 6.568235061933923e-06, + "loss": 0.7274, + "step": 14539 + }, + { + "epoch": 0.8002641862513071, + "grad_norm": 0.6334550380706787, + "learning_rate": 6.567823462300326e-06, + "loss": 0.7105, + "step": 14540 + }, + { + "epoch": 0.8003192250536628, + "grad_norm": 0.7183839678764343, + "learning_rate": 6.56741185088339e-06, + "loss": 0.657, + "step": 14541 + }, + { + "epoch": 0.8003742638560185, + "grad_norm": 0.6896400451660156, + "learning_rate": 6.567000227686204e-06, + "loss": 0.7752, + "step": 14542 + }, + { + "epoch": 0.8004293026583742, + "grad_norm": 0.7214651703834534, + "learning_rate": 6.566588592711864e-06, + "loss": 0.753, + "step": 14543 + }, + { + "epoch": 0.8004843414607298, + "grad_norm": 0.7064470648765564, + "learning_rate": 6.566176945963464e-06, + "loss": 0.744, + "step": 14544 } ], "logging_steps": 1, @@ -95471,7 +101834,7 @@ "attributes": {} } }, - "total_flos": 4.023772071182991e+19, + "total_flos": 4.292023542595191e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null