diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.05003027134129561, + "epoch": 0.10006054268259122, "eval_steps": 500, - "global_step": 909, + "global_step": 1818, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -6370,6 +6370,6369 @@ "learning_rate": 9.987707779910499e-06, "loss": 0.9263, "step": 909 + }, + { + "epoch": 0.05008531014365127, + "grad_norm": 0.9423969388008118, + "learning_rate": 9.987677384947402e-06, + "loss": 0.9667, + "step": 910 + }, + { + "epoch": 0.050140348946006934, + "grad_norm": 1.5940319299697876, + "learning_rate": 9.987646952498256e-06, + "loss": 0.9223, + "step": 911 + }, + { + "epoch": 0.050195387748362595, + "grad_norm": 0.941792368888855, + "learning_rate": 9.987616482563292e-06, + "loss": 0.895, + "step": 912 + }, + { + "epoch": 0.05025042655071826, + "grad_norm": 3.1945221424102783, + "learning_rate": 9.987585975142738e-06, + "loss": 0.837, + "step": 913 + }, + { + "epoch": 0.05030546535307392, + "grad_norm": 2.0819199085235596, + "learning_rate": 9.98755543023682e-06, + "loss": 0.918, + "step": 914 + }, + { + "epoch": 0.05036050415542958, + "grad_norm": 0.984282910823822, + "learning_rate": 9.987524847845773e-06, + "loss": 0.8589, + "step": 915 + }, + { + "epoch": 0.05041554295778524, + "grad_norm": 0.9021026492118835, + "learning_rate": 9.987494227969823e-06, + "loss": 0.9053, + "step": 916 + }, + { + "epoch": 0.0504705817601409, + "grad_norm": 2.6515462398529053, + "learning_rate": 9.9874635706092e-06, + "loss": 0.8874, + "step": 917 + }, + { + "epoch": 0.050525620562496563, + "grad_norm": 0.8893095254898071, + "learning_rate": 9.98743287576414e-06, + "loss": 0.8259, + "step": 918 + }, + { + "epoch": 0.05058065936485222, + "grad_norm": 0.9897775650024414, + "learning_rate": 9.987402143434868e-06, + "loss": 0.877, + "step": 919 + }, + { + "epoch": 0.05063569816720788, + "grad_norm": 0.9391944408416748, + "learning_rate": 9.987371373621614e-06, + "loss": 0.9363, + "step": 920 + }, + { + "epoch": 0.05069073696956354, + "grad_norm": 0.9585913419723511, + "learning_rate": 9.987340566324615e-06, + "loss": 0.8704, + "step": 921 + }, + { + "epoch": 0.0507457757719192, + "grad_norm": 0.9210980534553528, + "learning_rate": 9.987309721544098e-06, + "loss": 0.9321, + "step": 922 + }, + { + "epoch": 0.05080081457427486, + "grad_norm": 1.0713307857513428, + "learning_rate": 9.987278839280295e-06, + "loss": 0.9489, + "step": 923 + }, + { + "epoch": 0.050855853376630525, + "grad_norm": 1.0178636312484741, + "learning_rate": 9.98724791953344e-06, + "loss": 0.853, + "step": 924 + }, + { + "epoch": 0.050910892178986186, + "grad_norm": 0.9782636761665344, + "learning_rate": 9.987216962303766e-06, + "loss": 0.924, + "step": 925 + }, + { + "epoch": 0.05096593098134185, + "grad_norm": 0.9474522471427917, + "learning_rate": 9.987185967591503e-06, + "loss": 0.8619, + "step": 926 + }, + { + "epoch": 0.05102096978369751, + "grad_norm": 1.1875778436660767, + "learning_rate": 9.987154935396885e-06, + "loss": 1.012, + "step": 927 + }, + { + "epoch": 0.05107600858605317, + "grad_norm": 1.0585243701934814, + "learning_rate": 9.987123865720147e-06, + "loss": 0.9019, + "step": 928 + }, + { + "epoch": 0.051131047388408825, + "grad_norm": 0.9848800897598267, + "learning_rate": 9.98709275856152e-06, + "loss": 0.9665, + "step": 929 + }, + { + "epoch": 0.051186086190764486, + "grad_norm": 1.04201078414917, + "learning_rate": 9.987061613921238e-06, + "loss": 0.9269, + "step": 930 + }, + { + "epoch": 0.05124112499312015, + "grad_norm": 1.1748600006103516, + "learning_rate": 9.987030431799537e-06, + "loss": 0.8565, + "step": 931 + }, + { + "epoch": 0.05129616379547581, + "grad_norm": 1.879232406616211, + "learning_rate": 9.98699921219665e-06, + "loss": 0.8535, + "step": 932 + }, + { + "epoch": 0.05135120259783147, + "grad_norm": 0.9837847948074341, + "learning_rate": 9.986967955112812e-06, + "loss": 0.927, + "step": 933 + }, + { + "epoch": 0.05140624140018713, + "grad_norm": 0.8637211918830872, + "learning_rate": 9.986936660548257e-06, + "loss": 0.7903, + "step": 934 + }, + { + "epoch": 0.05146128020254279, + "grad_norm": 0.9078792929649353, + "learning_rate": 9.986905328503222e-06, + "loss": 0.9135, + "step": 935 + }, + { + "epoch": 0.051516319004898455, + "grad_norm": 0.9763005971908569, + "learning_rate": 9.98687395897794e-06, + "loss": 0.9006, + "step": 936 + }, + { + "epoch": 0.051571357807254116, + "grad_norm": 1.0174345970153809, + "learning_rate": 9.98684255197265e-06, + "loss": 0.9294, + "step": 937 + }, + { + "epoch": 0.05162639660960978, + "grad_norm": 0.8709769248962402, + "learning_rate": 9.986811107487584e-06, + "loss": 0.7986, + "step": 938 + }, + { + "epoch": 0.05168143541196544, + "grad_norm": 0.8717525601387024, + "learning_rate": 9.986779625522983e-06, + "loss": 0.8705, + "step": 939 + }, + { + "epoch": 0.05173647421432109, + "grad_norm": 0.9682945013046265, + "learning_rate": 9.98674810607908e-06, + "loss": 0.8127, + "step": 940 + }, + { + "epoch": 0.051791513016676755, + "grad_norm": 1.0248037576675415, + "learning_rate": 9.986716549156113e-06, + "loss": 0.9217, + "step": 941 + }, + { + "epoch": 0.051846551819032416, + "grad_norm": 0.9883397221565247, + "learning_rate": 9.98668495475432e-06, + "loss": 0.853, + "step": 942 + }, + { + "epoch": 0.05190159062138808, + "grad_norm": 0.9271108508110046, + "learning_rate": 9.986653322873937e-06, + "loss": 0.8807, + "step": 943 + }, + { + "epoch": 0.05195662942374374, + "grad_norm": 0.9027101397514343, + "learning_rate": 9.986621653515203e-06, + "loss": 0.88, + "step": 944 + }, + { + "epoch": 0.0520116682260994, + "grad_norm": 0.9807021617889404, + "learning_rate": 9.986589946678354e-06, + "loss": 0.8922, + "step": 945 + }, + { + "epoch": 0.05206670702845506, + "grad_norm": 0.8779157400131226, + "learning_rate": 9.98655820236363e-06, + "loss": 0.8988, + "step": 946 + }, + { + "epoch": 0.05212174583081072, + "grad_norm": 0.8182910680770874, + "learning_rate": 9.986526420571272e-06, + "loss": 0.7534, + "step": 947 + }, + { + "epoch": 0.052176784633166384, + "grad_norm": 0.9205981492996216, + "learning_rate": 9.986494601301513e-06, + "loss": 0.7516, + "step": 948 + }, + { + "epoch": 0.052231823435522046, + "grad_norm": 0.9829681515693665, + "learning_rate": 9.986462744554598e-06, + "loss": 0.9358, + "step": 949 + }, + { + "epoch": 0.05228686223787771, + "grad_norm": 0.8869890570640564, + "learning_rate": 9.986430850330762e-06, + "loss": 0.7933, + "step": 950 + }, + { + "epoch": 0.05234190104023336, + "grad_norm": 1.0226716995239258, + "learning_rate": 9.986398918630248e-06, + "loss": 0.9523, + "step": 951 + }, + { + "epoch": 0.05239693984258902, + "grad_norm": 0.9549778699874878, + "learning_rate": 9.986366949453293e-06, + "loss": 0.9368, + "step": 952 + }, + { + "epoch": 0.052451978644944684, + "grad_norm": 0.860454797744751, + "learning_rate": 9.98633494280014e-06, + "loss": 0.7618, + "step": 953 + }, + { + "epoch": 0.052507017447300346, + "grad_norm": 0.9623841643333435, + "learning_rate": 9.986302898671027e-06, + "loss": 0.8356, + "step": 954 + }, + { + "epoch": 0.05256205624965601, + "grad_norm": 0.9236606359481812, + "learning_rate": 9.986270817066196e-06, + "loss": 0.921, + "step": 955 + }, + { + "epoch": 0.05261709505201167, + "grad_norm": 1.0599812269210815, + "learning_rate": 9.98623869798589e-06, + "loss": 0.8082, + "step": 956 + }, + { + "epoch": 0.05267213385436733, + "grad_norm": 1.0321687459945679, + "learning_rate": 9.986206541430347e-06, + "loss": 0.9001, + "step": 957 + }, + { + "epoch": 0.05272717265672299, + "grad_norm": 0.8884543776512146, + "learning_rate": 9.986174347399813e-06, + "loss": 0.8317, + "step": 958 + }, + { + "epoch": 0.05278221145907865, + "grad_norm": 0.9592668414115906, + "learning_rate": 9.986142115894526e-06, + "loss": 0.9955, + "step": 959 + }, + { + "epoch": 0.052837250261434314, + "grad_norm": 0.9604032039642334, + "learning_rate": 9.986109846914729e-06, + "loss": 0.876, + "step": 960 + }, + { + "epoch": 0.052892289063789975, + "grad_norm": 0.9837536811828613, + "learning_rate": 9.986077540460664e-06, + "loss": 0.8247, + "step": 961 + }, + { + "epoch": 0.05294732786614563, + "grad_norm": 0.8570861220359802, + "learning_rate": 9.986045196532576e-06, + "loss": 0.879, + "step": 962 + }, + { + "epoch": 0.05300236666850129, + "grad_norm": 0.8441471457481384, + "learning_rate": 9.986012815130708e-06, + "loss": 0.8979, + "step": 963 + }, + { + "epoch": 0.05305740547085695, + "grad_norm": 0.8976197838783264, + "learning_rate": 9.985980396255302e-06, + "loss": 0.9382, + "step": 964 + }, + { + "epoch": 0.053112444273212614, + "grad_norm": 0.9685307741165161, + "learning_rate": 9.985947939906599e-06, + "loss": 0.8627, + "step": 965 + }, + { + "epoch": 0.053167483075568275, + "grad_norm": 0.8939018249511719, + "learning_rate": 9.98591544608485e-06, + "loss": 0.9221, + "step": 966 + }, + { + "epoch": 0.05322252187792394, + "grad_norm": 0.9218310713768005, + "learning_rate": 9.985882914790292e-06, + "loss": 0.8356, + "step": 967 + }, + { + "epoch": 0.0532775606802796, + "grad_norm": 0.9342261552810669, + "learning_rate": 9.985850346023174e-06, + "loss": 0.971, + "step": 968 + }, + { + "epoch": 0.05333259948263526, + "grad_norm": 1.0860705375671387, + "learning_rate": 9.985817739783741e-06, + "loss": 0.906, + "step": 969 + }, + { + "epoch": 0.05338763828499092, + "grad_norm": 0.8675006031990051, + "learning_rate": 9.985785096072234e-06, + "loss": 0.906, + "step": 970 + }, + { + "epoch": 0.05344267708734658, + "grad_norm": 0.8170626163482666, + "learning_rate": 9.985752414888903e-06, + "loss": 0.8109, + "step": 971 + }, + { + "epoch": 0.05349771588970224, + "grad_norm": 0.936434805393219, + "learning_rate": 9.98571969623399e-06, + "loss": 0.9219, + "step": 972 + }, + { + "epoch": 0.0535527546920579, + "grad_norm": 0.9316715002059937, + "learning_rate": 9.985686940107741e-06, + "loss": 0.8569, + "step": 973 + }, + { + "epoch": 0.05360779349441356, + "grad_norm": 1.183008074760437, + "learning_rate": 9.985654146510405e-06, + "loss": 0.837, + "step": 974 + }, + { + "epoch": 0.05366283229676922, + "grad_norm": 1.0788745880126953, + "learning_rate": 9.98562131544223e-06, + "loss": 0.8822, + "step": 975 + }, + { + "epoch": 0.05371787109912488, + "grad_norm": 0.9285461902618408, + "learning_rate": 9.985588446903455e-06, + "loss": 0.9279, + "step": 976 + }, + { + "epoch": 0.053772909901480544, + "grad_norm": 0.9389022588729858, + "learning_rate": 9.985555540894334e-06, + "loss": 0.9839, + "step": 977 + }, + { + "epoch": 0.053827948703836205, + "grad_norm": 0.8920616507530212, + "learning_rate": 9.985522597415112e-06, + "loss": 0.9205, + "step": 978 + }, + { + "epoch": 0.053882987506191866, + "grad_norm": 0.9755093455314636, + "learning_rate": 9.985489616466035e-06, + "loss": 0.8987, + "step": 979 + }, + { + "epoch": 0.05393802630854753, + "grad_norm": 0.96027010679245, + "learning_rate": 9.985456598047356e-06, + "loss": 0.8543, + "step": 980 + }, + { + "epoch": 0.05399306511090319, + "grad_norm": 1.0489718914031982, + "learning_rate": 9.985423542159317e-06, + "loss": 0.9179, + "step": 981 + }, + { + "epoch": 0.05404810391325885, + "grad_norm": 0.8665526509284973, + "learning_rate": 9.985390448802171e-06, + "loss": 0.9047, + "step": 982 + }, + { + "epoch": 0.054103142715614505, + "grad_norm": 0.8849464654922485, + "learning_rate": 9.985357317976163e-06, + "loss": 0.8892, + "step": 983 + }, + { + "epoch": 0.054158181517970166, + "grad_norm": 1.0083115100860596, + "learning_rate": 9.985324149681545e-06, + "loss": 0.7713, + "step": 984 + }, + { + "epoch": 0.05421322032032583, + "grad_norm": 0.8233863711357117, + "learning_rate": 9.985290943918565e-06, + "loss": 0.7967, + "step": 985 + }, + { + "epoch": 0.05426825912268149, + "grad_norm": 0.9615303874015808, + "learning_rate": 9.985257700687472e-06, + "loss": 0.8576, + "step": 986 + }, + { + "epoch": 0.05432329792503715, + "grad_norm": 0.8856416344642639, + "learning_rate": 9.985224419988517e-06, + "loss": 0.8614, + "step": 987 + }, + { + "epoch": 0.05437833672739281, + "grad_norm": 0.968325674533844, + "learning_rate": 9.98519110182195e-06, + "loss": 0.8247, + "step": 988 + }, + { + "epoch": 0.05443337552974847, + "grad_norm": 0.878402054309845, + "learning_rate": 9.985157746188021e-06, + "loss": 0.8661, + "step": 989 + }, + { + "epoch": 0.054488414332104135, + "grad_norm": 0.8376438021659851, + "learning_rate": 9.985124353086981e-06, + "loss": 0.8554, + "step": 990 + }, + { + "epoch": 0.054543453134459796, + "grad_norm": 1.0293036699295044, + "learning_rate": 9.98509092251908e-06, + "loss": 0.8049, + "step": 991 + }, + { + "epoch": 0.05459849193681546, + "grad_norm": 1.2345234155654907, + "learning_rate": 9.98505745448457e-06, + "loss": 1.0358, + "step": 992 + }, + { + "epoch": 0.05465353073917112, + "grad_norm": 0.9974482655525208, + "learning_rate": 9.985023948983703e-06, + "loss": 0.9329, + "step": 993 + }, + { + "epoch": 0.05470856954152677, + "grad_norm": 1.383955478668213, + "learning_rate": 9.984990406016732e-06, + "loss": 0.8688, + "step": 994 + }, + { + "epoch": 0.054763608343882435, + "grad_norm": 0.9369306564331055, + "learning_rate": 9.984956825583906e-06, + "loss": 0.8308, + "step": 995 + }, + { + "epoch": 0.054818647146238096, + "grad_norm": 0.8676120042800903, + "learning_rate": 9.984923207685478e-06, + "loss": 0.8283, + "step": 996 + }, + { + "epoch": 0.05487368594859376, + "grad_norm": 0.9218453168869019, + "learning_rate": 9.984889552321704e-06, + "loss": 0.7247, + "step": 997 + }, + { + "epoch": 0.05492872475094942, + "grad_norm": 0.8575478196144104, + "learning_rate": 9.984855859492833e-06, + "loss": 0.8462, + "step": 998 + }, + { + "epoch": 0.05498376355330508, + "grad_norm": 1.0042616128921509, + "learning_rate": 9.98482212919912e-06, + "loss": 0.9383, + "step": 999 + }, + { + "epoch": 0.05503880235566074, + "grad_norm": 0.8642181158065796, + "learning_rate": 9.984788361440817e-06, + "loss": 0.8805, + "step": 1000 + }, + { + "epoch": 0.0550938411580164, + "grad_norm": 0.8413823843002319, + "learning_rate": 9.984754556218178e-06, + "loss": 0.8161, + "step": 1001 + }, + { + "epoch": 0.055148879960372064, + "grad_norm": 0.9473856091499329, + "learning_rate": 9.984720713531462e-06, + "loss": 0.8425, + "step": 1002 + }, + { + "epoch": 0.055203918762727726, + "grad_norm": 0.7854379415512085, + "learning_rate": 9.984686833380917e-06, + "loss": 0.7506, + "step": 1003 + }, + { + "epoch": 0.05525895756508339, + "grad_norm": 0.9481745958328247, + "learning_rate": 9.984652915766801e-06, + "loss": 0.954, + "step": 1004 + }, + { + "epoch": 0.05531399636743904, + "grad_norm": 0.767803966999054, + "learning_rate": 9.984618960689366e-06, + "loss": 0.8113, + "step": 1005 + }, + { + "epoch": 0.0553690351697947, + "grad_norm": 0.8957781195640564, + "learning_rate": 9.984584968148871e-06, + "loss": 0.9042, + "step": 1006 + }, + { + "epoch": 0.055424073972150364, + "grad_norm": 1.116646409034729, + "learning_rate": 9.98455093814557e-06, + "loss": 0.8648, + "step": 1007 + }, + { + "epoch": 0.055479112774506026, + "grad_norm": 0.9567018151283264, + "learning_rate": 9.98451687067972e-06, + "loss": 0.9446, + "step": 1008 + }, + { + "epoch": 0.05553415157686169, + "grad_norm": 0.8470665812492371, + "learning_rate": 9.98448276575157e-06, + "loss": 0.8186, + "step": 1009 + }, + { + "epoch": 0.05558919037921735, + "grad_norm": 0.9595193862915039, + "learning_rate": 9.984448623361387e-06, + "loss": 0.8406, + "step": 1010 + }, + { + "epoch": 0.05564422918157301, + "grad_norm": 1.0579735040664673, + "learning_rate": 9.98441444350942e-06, + "loss": 0.9676, + "step": 1011 + }, + { + "epoch": 0.05569926798392867, + "grad_norm": 0.8693701028823853, + "learning_rate": 9.98438022619593e-06, + "loss": 0.9451, + "step": 1012 + }, + { + "epoch": 0.05575430678628433, + "grad_norm": 0.9251859784126282, + "learning_rate": 9.98434597142117e-06, + "loss": 0.7858, + "step": 1013 + }, + { + "epoch": 0.055809345588639994, + "grad_norm": 0.8584280014038086, + "learning_rate": 9.984311679185402e-06, + "loss": 0.8481, + "step": 1014 + }, + { + "epoch": 0.05586438439099565, + "grad_norm": 0.8903968334197998, + "learning_rate": 9.98427734948888e-06, + "loss": 0.7832, + "step": 1015 + }, + { + "epoch": 0.05591942319335131, + "grad_norm": 0.905581533908844, + "learning_rate": 9.984242982331864e-06, + "loss": 0.8088, + "step": 1016 + }, + { + "epoch": 0.05597446199570697, + "grad_norm": 0.9866476655006409, + "learning_rate": 9.984208577714612e-06, + "loss": 0.8366, + "step": 1017 + }, + { + "epoch": 0.05602950079806263, + "grad_norm": 0.8843809962272644, + "learning_rate": 9.984174135637384e-06, + "loss": 0.8961, + "step": 1018 + }, + { + "epoch": 0.056084539600418294, + "grad_norm": 0.9071753621101379, + "learning_rate": 9.984139656100435e-06, + "loss": 0.8671, + "step": 1019 + }, + { + "epoch": 0.056139578402773956, + "grad_norm": 0.9894018173217773, + "learning_rate": 9.984105139104028e-06, + "loss": 0.9099, + "step": 1020 + }, + { + "epoch": 0.05619461720512962, + "grad_norm": 0.8432741165161133, + "learning_rate": 9.98407058464842e-06, + "loss": 0.7817, + "step": 1021 + }, + { + "epoch": 0.05624965600748528, + "grad_norm": 0.9538390040397644, + "learning_rate": 9.984035992733873e-06, + "loss": 0.8689, + "step": 1022 + }, + { + "epoch": 0.05630469480984094, + "grad_norm": 0.9263421297073364, + "learning_rate": 9.984001363360645e-06, + "loss": 0.9066, + "step": 1023 + }, + { + "epoch": 0.0563597336121966, + "grad_norm": 0.8921047449111938, + "learning_rate": 9.983966696528996e-06, + "loss": 0.8304, + "step": 1024 + }, + { + "epoch": 0.05641477241455226, + "grad_norm": 0.8379812240600586, + "learning_rate": 9.983931992239188e-06, + "loss": 0.866, + "step": 1025 + }, + { + "epoch": 0.05646981121690792, + "grad_norm": 0.9444219470024109, + "learning_rate": 9.983897250491481e-06, + "loss": 0.9456, + "step": 1026 + }, + { + "epoch": 0.05652485001926358, + "grad_norm": 1.0268759727478027, + "learning_rate": 9.983862471286137e-06, + "loss": 0.8277, + "step": 1027 + }, + { + "epoch": 0.05657988882161924, + "grad_norm": 1.3949217796325684, + "learning_rate": 9.983827654623418e-06, + "loss": 0.9721, + "step": 1028 + }, + { + "epoch": 0.0566349276239749, + "grad_norm": 0.8899377584457397, + "learning_rate": 9.983792800503582e-06, + "loss": 0.8794, + "step": 1029 + }, + { + "epoch": 0.05668996642633056, + "grad_norm": 0.989072322845459, + "learning_rate": 9.983757908926895e-06, + "loss": 0.8852, + "step": 1030 + }, + { + "epoch": 0.056745005228686224, + "grad_norm": 0.9797759056091309, + "learning_rate": 9.983722979893615e-06, + "loss": 1.0405, + "step": 1031 + }, + { + "epoch": 0.056800044031041885, + "grad_norm": 0.9044767618179321, + "learning_rate": 9.98368801340401e-06, + "loss": 0.7243, + "step": 1032 + }, + { + "epoch": 0.05685508283339755, + "grad_norm": 1.116324782371521, + "learning_rate": 9.983653009458338e-06, + "loss": 0.9183, + "step": 1033 + }, + { + "epoch": 0.05691012163575321, + "grad_norm": 0.9373337030410767, + "learning_rate": 9.983617968056866e-06, + "loss": 0.9417, + "step": 1034 + }, + { + "epoch": 0.05696516043810887, + "grad_norm": 1.0587197542190552, + "learning_rate": 9.983582889199855e-06, + "loss": 0.896, + "step": 1035 + }, + { + "epoch": 0.05702019924046453, + "grad_norm": 1.0080119371414185, + "learning_rate": 9.983547772887568e-06, + "loss": 0.924, + "step": 1036 + }, + { + "epoch": 0.057075238042820185, + "grad_norm": 0.847091019153595, + "learning_rate": 9.98351261912027e-06, + "loss": 0.7443, + "step": 1037 + }, + { + "epoch": 0.05713027684517585, + "grad_norm": 0.9876272082328796, + "learning_rate": 9.983477427898225e-06, + "loss": 0.9365, + "step": 1038 + }, + { + "epoch": 0.05718531564753151, + "grad_norm": 0.9188169240951538, + "learning_rate": 9.983442199221698e-06, + "loss": 0.9213, + "step": 1039 + }, + { + "epoch": 0.05724035444988717, + "grad_norm": 0.932399332523346, + "learning_rate": 9.983406933090954e-06, + "loss": 0.958, + "step": 1040 + }, + { + "epoch": 0.05729539325224283, + "grad_norm": 0.9126465320587158, + "learning_rate": 9.983371629506258e-06, + "loss": 0.8913, + "step": 1041 + }, + { + "epoch": 0.05735043205459849, + "grad_norm": 0.80904620885849, + "learning_rate": 9.983336288467873e-06, + "loss": 0.7719, + "step": 1042 + }, + { + "epoch": 0.057405470856954154, + "grad_norm": 0.873833417892456, + "learning_rate": 9.983300909976067e-06, + "loss": 0.9201, + "step": 1043 + }, + { + "epoch": 0.057460509659309815, + "grad_norm": 0.8331829309463501, + "learning_rate": 9.983265494031107e-06, + "loss": 0.8605, + "step": 1044 + }, + { + "epoch": 0.057515548461665476, + "grad_norm": 0.8364768624305725, + "learning_rate": 9.983230040633255e-06, + "loss": 0.8627, + "step": 1045 + }, + { + "epoch": 0.05757058726402114, + "grad_norm": 0.9226736426353455, + "learning_rate": 9.98319454978278e-06, + "loss": 0.9759, + "step": 1046 + }, + { + "epoch": 0.05762562606637679, + "grad_norm": 0.8174427151679993, + "learning_rate": 9.98315902147995e-06, + "loss": 0.8066, + "step": 1047 + }, + { + "epoch": 0.057680664868732454, + "grad_norm": 0.9154924750328064, + "learning_rate": 9.98312345572503e-06, + "loss": 0.9112, + "step": 1048 + }, + { + "epoch": 0.057735703671088115, + "grad_norm": 0.8884655237197876, + "learning_rate": 9.983087852518289e-06, + "loss": 0.8699, + "step": 1049 + }, + { + "epoch": 0.057790742473443776, + "grad_norm": 0.8849230408668518, + "learning_rate": 9.983052211859992e-06, + "loss": 0.8999, + "step": 1050 + }, + { + "epoch": 0.05784578127579944, + "grad_norm": 1.025843858718872, + "learning_rate": 9.98301653375041e-06, + "loss": 0.7764, + "step": 1051 + }, + { + "epoch": 0.0579008200781551, + "grad_norm": 0.900505006313324, + "learning_rate": 9.98298081818981e-06, + "loss": 0.9196, + "step": 1052 + }, + { + "epoch": 0.05795585888051076, + "grad_norm": 0.9506704211235046, + "learning_rate": 9.982945065178457e-06, + "loss": 0.8319, + "step": 1053 + }, + { + "epoch": 0.05801089768286642, + "grad_norm": 0.9439849853515625, + "learning_rate": 9.982909274716626e-06, + "loss": 0.8561, + "step": 1054 + }, + { + "epoch": 0.05806593648522208, + "grad_norm": 0.8761240243911743, + "learning_rate": 9.982873446804579e-06, + "loss": 0.9681, + "step": 1055 + }, + { + "epoch": 0.058120975287577745, + "grad_norm": 0.8756145238876343, + "learning_rate": 9.982837581442592e-06, + "loss": 0.8452, + "step": 1056 + }, + { + "epoch": 0.058176014089933406, + "grad_norm": 0.8732383847236633, + "learning_rate": 9.982801678630932e-06, + "loss": 0.9018, + "step": 1057 + }, + { + "epoch": 0.05823105289228906, + "grad_norm": 0.8338272571563721, + "learning_rate": 9.982765738369867e-06, + "loss": 0.9308, + "step": 1058 + }, + { + "epoch": 0.05828609169464472, + "grad_norm": 0.843163013458252, + "learning_rate": 9.982729760659669e-06, + "loss": 0.7802, + "step": 1059 + }, + { + "epoch": 0.05834113049700038, + "grad_norm": 1.2007580995559692, + "learning_rate": 9.982693745500606e-06, + "loss": 0.8406, + "step": 1060 + }, + { + "epoch": 0.058396169299356045, + "grad_norm": 0.8760073781013489, + "learning_rate": 9.982657692892954e-06, + "loss": 0.8528, + "step": 1061 + }, + { + "epoch": 0.058451208101711706, + "grad_norm": 0.925309419631958, + "learning_rate": 9.982621602836978e-06, + "loss": 0.9601, + "step": 1062 + }, + { + "epoch": 0.05850624690406737, + "grad_norm": 0.9277135133743286, + "learning_rate": 9.982585475332952e-06, + "loss": 0.8405, + "step": 1063 + }, + { + "epoch": 0.05856128570642303, + "grad_norm": 0.928044319152832, + "learning_rate": 9.98254931038115e-06, + "loss": 0.8259, + "step": 1064 + }, + { + "epoch": 0.05861632450877869, + "grad_norm": 0.8363838195800781, + "learning_rate": 9.982513107981837e-06, + "loss": 0.8655, + "step": 1065 + }, + { + "epoch": 0.05867136331113435, + "grad_norm": 0.9800984859466553, + "learning_rate": 9.982476868135292e-06, + "loss": 0.9285, + "step": 1066 + }, + { + "epoch": 0.05872640211349001, + "grad_norm": 0.8062636256217957, + "learning_rate": 9.982440590841785e-06, + "loss": 0.754, + "step": 1067 + }, + { + "epoch": 0.058781440915845674, + "grad_norm": 1.2010705471038818, + "learning_rate": 9.982404276101586e-06, + "loss": 0.9872, + "step": 1068 + }, + { + "epoch": 0.05883647971820133, + "grad_norm": 1.0036406517028809, + "learning_rate": 9.982367923914971e-06, + "loss": 0.8724, + "step": 1069 + }, + { + "epoch": 0.05889151852055699, + "grad_norm": 0.8768866658210754, + "learning_rate": 9.982331534282212e-06, + "loss": 0.838, + "step": 1070 + }, + { + "epoch": 0.05894655732291265, + "grad_norm": 0.7892739176750183, + "learning_rate": 9.982295107203584e-06, + "loss": 0.6974, + "step": 1071 + }, + { + "epoch": 0.05900159612526831, + "grad_norm": 0.863315999507904, + "learning_rate": 9.982258642679358e-06, + "loss": 0.9282, + "step": 1072 + }, + { + "epoch": 0.059056634927623974, + "grad_norm": 0.8645132780075073, + "learning_rate": 9.982222140709812e-06, + "loss": 0.8504, + "step": 1073 + }, + { + "epoch": 0.059111673729979636, + "grad_norm": 1.0003199577331543, + "learning_rate": 9.982185601295216e-06, + "loss": 1.0293, + "step": 1074 + }, + { + "epoch": 0.0591667125323353, + "grad_norm": 0.8391831517219543, + "learning_rate": 9.982149024435848e-06, + "loss": 0.8609, + "step": 1075 + }, + { + "epoch": 0.05922175133469096, + "grad_norm": 0.9940230846405029, + "learning_rate": 9.982112410131981e-06, + "loss": 0.9623, + "step": 1076 + }, + { + "epoch": 0.05927679013704662, + "grad_norm": 1.0670262575149536, + "learning_rate": 9.98207575838389e-06, + "loss": 0.9952, + "step": 1077 + }, + { + "epoch": 0.05933182893940228, + "grad_norm": 0.8506165742874146, + "learning_rate": 9.982039069191853e-06, + "loss": 0.8401, + "step": 1078 + }, + { + "epoch": 0.05938686774175794, + "grad_norm": 0.8956409096717834, + "learning_rate": 9.982002342556144e-06, + "loss": 0.8779, + "step": 1079 + }, + { + "epoch": 0.0594419065441136, + "grad_norm": 0.8955749273300171, + "learning_rate": 9.981965578477038e-06, + "loss": 0.8946, + "step": 1080 + }, + { + "epoch": 0.05949694534646926, + "grad_norm": 0.9035234451293945, + "learning_rate": 9.981928776954811e-06, + "loss": 0.9352, + "step": 1081 + }, + { + "epoch": 0.05955198414882492, + "grad_norm": 0.8748759627342224, + "learning_rate": 9.981891937989743e-06, + "loss": 0.8803, + "step": 1082 + }, + { + "epoch": 0.05960702295118058, + "grad_norm": 0.9966281056404114, + "learning_rate": 9.981855061582108e-06, + "loss": 0.9304, + "step": 1083 + }, + { + "epoch": 0.05966206175353624, + "grad_norm": 0.8696668148040771, + "learning_rate": 9.981818147732183e-06, + "loss": 0.8706, + "step": 1084 + }, + { + "epoch": 0.059717100555891904, + "grad_norm": 0.9823188185691833, + "learning_rate": 9.981781196440249e-06, + "loss": 0.9431, + "step": 1085 + }, + { + "epoch": 0.059772139358247565, + "grad_norm": 0.8401583433151245, + "learning_rate": 9.981744207706577e-06, + "loss": 0.8369, + "step": 1086 + }, + { + "epoch": 0.05982717816060323, + "grad_norm": 0.8775757551193237, + "learning_rate": 9.981707181531452e-06, + "loss": 0.9516, + "step": 1087 + }, + { + "epoch": 0.05988221696295889, + "grad_norm": 0.9153465628623962, + "learning_rate": 9.981670117915148e-06, + "loss": 0.8997, + "step": 1088 + }, + { + "epoch": 0.05993725576531455, + "grad_norm": 0.9053078889846802, + "learning_rate": 9.981633016857946e-06, + "loss": 0.9452, + "step": 1089 + }, + { + "epoch": 0.059992294567670204, + "grad_norm": 0.9154480695724487, + "learning_rate": 9.981595878360123e-06, + "loss": 0.8293, + "step": 1090 + }, + { + "epoch": 0.060047333370025865, + "grad_norm": 0.85718834400177, + "learning_rate": 9.981558702421958e-06, + "loss": 0.876, + "step": 1091 + }, + { + "epoch": 0.06010237217238153, + "grad_norm": 0.9437130689620972, + "learning_rate": 9.981521489043734e-06, + "loss": 0.9731, + "step": 1092 + }, + { + "epoch": 0.06015741097473719, + "grad_norm": 0.9014891386032104, + "learning_rate": 9.981484238225725e-06, + "loss": 0.811, + "step": 1093 + }, + { + "epoch": 0.06021244977709285, + "grad_norm": 0.8942846655845642, + "learning_rate": 9.981446949968216e-06, + "loss": 0.808, + "step": 1094 + }, + { + "epoch": 0.06026748857944851, + "grad_norm": 0.855297863483429, + "learning_rate": 9.981409624271483e-06, + "loss": 0.8319, + "step": 1095 + }, + { + "epoch": 0.06032252738180417, + "grad_norm": 0.9310913681983948, + "learning_rate": 9.981372261135811e-06, + "loss": 0.899, + "step": 1096 + }, + { + "epoch": 0.060377566184159834, + "grad_norm": 0.8472979664802551, + "learning_rate": 9.981334860561478e-06, + "loss": 0.8818, + "step": 1097 + }, + { + "epoch": 0.060432604986515495, + "grad_norm": 0.896617591381073, + "learning_rate": 9.981297422548764e-06, + "loss": 0.8991, + "step": 1098 + }, + { + "epoch": 0.06048764378887116, + "grad_norm": 0.8543037176132202, + "learning_rate": 9.981259947097954e-06, + "loss": 0.8595, + "step": 1099 + }, + { + "epoch": 0.06054268259122682, + "grad_norm": 0.8794904947280884, + "learning_rate": 9.981222434209327e-06, + "loss": 0.8561, + "step": 1100 + }, + { + "epoch": 0.06059772139358247, + "grad_norm": 0.8882116675376892, + "learning_rate": 9.981184883883165e-06, + "loss": 0.8099, + "step": 1101 + }, + { + "epoch": 0.060652760195938134, + "grad_norm": 1.0068262815475464, + "learning_rate": 9.98114729611975e-06, + "loss": 0.8104, + "step": 1102 + }, + { + "epoch": 0.060707798998293795, + "grad_norm": 1.072316288948059, + "learning_rate": 9.981109670919366e-06, + "loss": 0.9877, + "step": 1103 + }, + { + "epoch": 0.06076283780064946, + "grad_norm": 0.9959045052528381, + "learning_rate": 9.981072008282298e-06, + "loss": 0.906, + "step": 1104 + }, + { + "epoch": 0.06081787660300512, + "grad_norm": 0.8712790608406067, + "learning_rate": 9.981034308208823e-06, + "loss": 0.8725, + "step": 1105 + }, + { + "epoch": 0.06087291540536078, + "grad_norm": 0.9114679098129272, + "learning_rate": 9.980996570699228e-06, + "loss": 0.8385, + "step": 1106 + }, + { + "epoch": 0.06092795420771644, + "grad_norm": 1.0024466514587402, + "learning_rate": 9.980958795753796e-06, + "loss": 0.8661, + "step": 1107 + }, + { + "epoch": 0.0609829930100721, + "grad_norm": 0.9578461050987244, + "learning_rate": 9.98092098337281e-06, + "loss": 0.9358, + "step": 1108 + }, + { + "epoch": 0.061038031812427763, + "grad_norm": 0.8677787780761719, + "learning_rate": 9.980883133556557e-06, + "loss": 0.8146, + "step": 1109 + }, + { + "epoch": 0.061093070614783425, + "grad_norm": 0.9072276949882507, + "learning_rate": 9.98084524630532e-06, + "loss": 0.91, + "step": 1110 + }, + { + "epoch": 0.061148109417139086, + "grad_norm": 0.8827292919158936, + "learning_rate": 9.980807321619381e-06, + "loss": 0.8854, + "step": 1111 + }, + { + "epoch": 0.06120314821949474, + "grad_norm": 1.0012744665145874, + "learning_rate": 9.98076935949903e-06, + "loss": 0.8242, + "step": 1112 + }, + { + "epoch": 0.0612581870218504, + "grad_norm": 0.9152620434761047, + "learning_rate": 9.980731359944548e-06, + "loss": 0.8832, + "step": 1113 + }, + { + "epoch": 0.061313225824206063, + "grad_norm": 0.8986824750900269, + "learning_rate": 9.980693322956222e-06, + "loss": 0.7975, + "step": 1114 + }, + { + "epoch": 0.061368264626561725, + "grad_norm": 0.9373019933700562, + "learning_rate": 9.98065524853434e-06, + "loss": 0.9541, + "step": 1115 + }, + { + "epoch": 0.061423303428917386, + "grad_norm": 0.9875593781471252, + "learning_rate": 9.980617136679185e-06, + "loss": 1.0052, + "step": 1116 + }, + { + "epoch": 0.06147834223127305, + "grad_norm": 1.0664819478988647, + "learning_rate": 9.980578987391045e-06, + "loss": 0.8584, + "step": 1117 + }, + { + "epoch": 0.06153338103362871, + "grad_norm": 0.9149377942085266, + "learning_rate": 9.980540800670207e-06, + "loss": 0.8467, + "step": 1118 + }, + { + "epoch": 0.06158841983598437, + "grad_norm": 0.9303194284439087, + "learning_rate": 9.980502576516959e-06, + "loss": 0.8219, + "step": 1119 + }, + { + "epoch": 0.06164345863834003, + "grad_norm": 0.9059457778930664, + "learning_rate": 9.980464314931583e-06, + "loss": 0.8459, + "step": 1120 + }, + { + "epoch": 0.06169849744069569, + "grad_norm": 0.9368849396705627, + "learning_rate": 9.980426015914375e-06, + "loss": 0.8933, + "step": 1121 + }, + { + "epoch": 0.061753536243051355, + "grad_norm": 0.8188626766204834, + "learning_rate": 9.980387679465615e-06, + "loss": 0.807, + "step": 1122 + }, + { + "epoch": 0.06180857504540701, + "grad_norm": 1.027171015739441, + "learning_rate": 9.980349305585595e-06, + "loss": 0.8919, + "step": 1123 + }, + { + "epoch": 0.06186361384776267, + "grad_norm": 0.831649899482727, + "learning_rate": 9.980310894274603e-06, + "loss": 0.8109, + "step": 1124 + }, + { + "epoch": 0.06191865265011833, + "grad_norm": 1.0170252323150635, + "learning_rate": 9.980272445532928e-06, + "loss": 0.9537, + "step": 1125 + }, + { + "epoch": 0.06197369145247399, + "grad_norm": 0.97837233543396, + "learning_rate": 9.980233959360858e-06, + "loss": 0.9104, + "step": 1126 + }, + { + "epoch": 0.062028730254829655, + "grad_norm": 0.9548324942588806, + "learning_rate": 9.980195435758681e-06, + "loss": 0.9473, + "step": 1127 + }, + { + "epoch": 0.062083769057185316, + "grad_norm": 0.8675842881202698, + "learning_rate": 9.980156874726692e-06, + "loss": 0.8313, + "step": 1128 + }, + { + "epoch": 0.06213880785954098, + "grad_norm": 0.8948968052864075, + "learning_rate": 9.980118276265173e-06, + "loss": 0.8008, + "step": 1129 + }, + { + "epoch": 0.06219384666189664, + "grad_norm": 0.8914239406585693, + "learning_rate": 9.98007964037442e-06, + "loss": 0.7642, + "step": 1130 + }, + { + "epoch": 0.0622488854642523, + "grad_norm": 0.9499951004981995, + "learning_rate": 9.980040967054723e-06, + "loss": 0.8669, + "step": 1131 + }, + { + "epoch": 0.06230392426660796, + "grad_norm": 0.8959251642227173, + "learning_rate": 9.980002256306369e-06, + "loss": 0.9177, + "step": 1132 + }, + { + "epoch": 0.062358963068963616, + "grad_norm": 0.8634380102157593, + "learning_rate": 9.97996350812965e-06, + "loss": 0.8252, + "step": 1133 + }, + { + "epoch": 0.06241400187131928, + "grad_norm": 0.9380598068237305, + "learning_rate": 9.97992472252486e-06, + "loss": 0.9335, + "step": 1134 + }, + { + "epoch": 0.06246904067367494, + "grad_norm": 0.8373183608055115, + "learning_rate": 9.97988589949229e-06, + "loss": 0.848, + "step": 1135 + }, + { + "epoch": 0.0625240794760306, + "grad_norm": 0.9649023413658142, + "learning_rate": 9.97984703903223e-06, + "loss": 0.9648, + "step": 1136 + }, + { + "epoch": 0.06257911827838626, + "grad_norm": 0.9972373843193054, + "learning_rate": 9.979808141144972e-06, + "loss": 0.9104, + "step": 1137 + }, + { + "epoch": 0.06263415708074192, + "grad_norm": 0.8230985403060913, + "learning_rate": 9.97976920583081e-06, + "loss": 0.8393, + "step": 1138 + }, + { + "epoch": 0.06268919588309758, + "grad_norm": 0.9775324463844299, + "learning_rate": 9.979730233090034e-06, + "loss": 0.8385, + "step": 1139 + }, + { + "epoch": 0.06274423468545325, + "grad_norm": 0.8288110494613647, + "learning_rate": 9.97969122292294e-06, + "loss": 0.7308, + "step": 1140 + }, + { + "epoch": 0.06279927348780891, + "grad_norm": 0.8980758786201477, + "learning_rate": 9.979652175329819e-06, + "loss": 0.863, + "step": 1141 + }, + { + "epoch": 0.06285431229016457, + "grad_norm": 7.43889045715332, + "learning_rate": 9.979613090310965e-06, + "loss": 0.9412, + "step": 1142 + }, + { + "epoch": 0.06290935109252023, + "grad_norm": 0.9758191704750061, + "learning_rate": 9.97957396786667e-06, + "loss": 0.8896, + "step": 1143 + }, + { + "epoch": 0.06296438989487589, + "grad_norm": 0.8211693167686462, + "learning_rate": 9.979534807997234e-06, + "loss": 0.7352, + "step": 1144 + }, + { + "epoch": 0.06301942869723155, + "grad_norm": 0.8643441796302795, + "learning_rate": 9.979495610702945e-06, + "loss": 0.8701, + "step": 1145 + }, + { + "epoch": 0.06307446749958721, + "grad_norm": 1.0199437141418457, + "learning_rate": 9.9794563759841e-06, + "loss": 0.9025, + "step": 1146 + }, + { + "epoch": 0.06312950630194288, + "grad_norm": 0.8367893695831299, + "learning_rate": 9.979417103840994e-06, + "loss": 0.8491, + "step": 1147 + }, + { + "epoch": 0.06318454510429854, + "grad_norm": 0.9411819577217102, + "learning_rate": 9.979377794273923e-06, + "loss": 0.8501, + "step": 1148 + }, + { + "epoch": 0.06323958390665418, + "grad_norm": 1.1497365236282349, + "learning_rate": 9.97933844728318e-06, + "loss": 1.0227, + "step": 1149 + }, + { + "epoch": 0.06329462270900985, + "grad_norm": 0.9892984628677368, + "learning_rate": 9.979299062869064e-06, + "loss": 0.8942, + "step": 1150 + }, + { + "epoch": 0.06334966151136551, + "grad_norm": 0.947952926158905, + "learning_rate": 9.979259641031867e-06, + "loss": 1.0149, + "step": 1151 + }, + { + "epoch": 0.06340470031372117, + "grad_norm": 0.9060251712799072, + "learning_rate": 9.979220181771889e-06, + "loss": 0.8607, + "step": 1152 + }, + { + "epoch": 0.06345973911607683, + "grad_norm": 0.8331984281539917, + "learning_rate": 9.979180685089424e-06, + "loss": 0.8777, + "step": 1153 + }, + { + "epoch": 0.06351477791843249, + "grad_norm": 0.9133188724517822, + "learning_rate": 9.97914115098477e-06, + "loss": 0.7409, + "step": 1154 + }, + { + "epoch": 0.06356981672078815, + "grad_norm": 0.9095513820648193, + "learning_rate": 9.979101579458224e-06, + "loss": 0.8938, + "step": 1155 + }, + { + "epoch": 0.06362485552314381, + "grad_norm": 0.9584553241729736, + "learning_rate": 9.979061970510082e-06, + "loss": 0.8765, + "step": 1156 + }, + { + "epoch": 0.06367989432549948, + "grad_norm": 0.8742124438285828, + "learning_rate": 9.979022324140644e-06, + "loss": 0.8564, + "step": 1157 + }, + { + "epoch": 0.06373493312785514, + "grad_norm": 0.8776904344558716, + "learning_rate": 9.978982640350208e-06, + "loss": 0.8713, + "step": 1158 + }, + { + "epoch": 0.0637899719302108, + "grad_norm": 0.8667464852333069, + "learning_rate": 9.97894291913907e-06, + "loss": 0.8705, + "step": 1159 + }, + { + "epoch": 0.06384501073256646, + "grad_norm": 0.9028087854385376, + "learning_rate": 9.978903160507531e-06, + "loss": 0.8297, + "step": 1160 + }, + { + "epoch": 0.06390004953492212, + "grad_norm": 0.900812029838562, + "learning_rate": 9.978863364455887e-06, + "loss": 0.8456, + "step": 1161 + }, + { + "epoch": 0.06395508833727778, + "grad_norm": 0.9667207598686218, + "learning_rate": 9.97882353098444e-06, + "loss": 0.8081, + "step": 1162 + }, + { + "epoch": 0.06401012713963344, + "grad_norm": 0.8959711194038391, + "learning_rate": 9.978783660093488e-06, + "loss": 0.8455, + "step": 1163 + }, + { + "epoch": 0.0640651659419891, + "grad_norm": 0.8519117832183838, + "learning_rate": 9.97874375178333e-06, + "loss": 0.849, + "step": 1164 + }, + { + "epoch": 0.06412020474434477, + "grad_norm": 1.0532654523849487, + "learning_rate": 9.978703806054267e-06, + "loss": 0.7356, + "step": 1165 + }, + { + "epoch": 0.06417524354670043, + "grad_norm": 1.0313252210617065, + "learning_rate": 9.9786638229066e-06, + "loss": 1.024, + "step": 1166 + }, + { + "epoch": 0.06423028234905609, + "grad_norm": 1.0567537546157837, + "learning_rate": 9.978623802340627e-06, + "loss": 0.9423, + "step": 1167 + }, + { + "epoch": 0.06428532115141175, + "grad_norm": 0.8198097348213196, + "learning_rate": 9.97858374435665e-06, + "loss": 0.829, + "step": 1168 + }, + { + "epoch": 0.06434035995376741, + "grad_norm": 0.8718193173408508, + "learning_rate": 9.97854364895497e-06, + "loss": 0.7184, + "step": 1169 + }, + { + "epoch": 0.06439539875612307, + "grad_norm": 0.8037594556808472, + "learning_rate": 9.978503516135892e-06, + "loss": 0.7961, + "step": 1170 + }, + { + "epoch": 0.06445043755847872, + "grad_norm": 0.9052229523658752, + "learning_rate": 9.978463345899709e-06, + "loss": 0.8016, + "step": 1171 + }, + { + "epoch": 0.06450547636083438, + "grad_norm": 1.0194638967514038, + "learning_rate": 9.978423138246731e-06, + "loss": 0.9045, + "step": 1172 + }, + { + "epoch": 0.06456051516319004, + "grad_norm": 0.953078031539917, + "learning_rate": 9.978382893177259e-06, + "loss": 0.9661, + "step": 1173 + }, + { + "epoch": 0.0646155539655457, + "grad_norm": 0.8777341842651367, + "learning_rate": 9.978342610691592e-06, + "loss": 0.8685, + "step": 1174 + }, + { + "epoch": 0.06467059276790137, + "grad_norm": 1.0178394317626953, + "learning_rate": 9.978302290790034e-06, + "loss": 0.9075, + "step": 1175 + }, + { + "epoch": 0.06472563157025703, + "grad_norm": 0.935694694519043, + "learning_rate": 9.978261933472889e-06, + "loss": 0.8438, + "step": 1176 + }, + { + "epoch": 0.06478067037261269, + "grad_norm": 1.0022411346435547, + "learning_rate": 9.97822153874046e-06, + "loss": 0.8701, + "step": 1177 + }, + { + "epoch": 0.06483570917496835, + "grad_norm": 1.0371203422546387, + "learning_rate": 9.97818110659305e-06, + "loss": 0.9111, + "step": 1178 + }, + { + "epoch": 0.06489074797732401, + "grad_norm": 0.7972478866577148, + "learning_rate": 9.978140637030963e-06, + "loss": 0.8602, + "step": 1179 + }, + { + "epoch": 0.06494578677967967, + "grad_norm": 0.8556679487228394, + "learning_rate": 9.978100130054505e-06, + "loss": 0.9149, + "step": 1180 + }, + { + "epoch": 0.06500082558203533, + "grad_norm": 0.92474365234375, + "learning_rate": 9.978059585663979e-06, + "loss": 0.8608, + "step": 1181 + }, + { + "epoch": 0.065055864384391, + "grad_norm": 1.0170830488204956, + "learning_rate": 9.978019003859687e-06, + "loss": 0.9986, + "step": 1182 + }, + { + "epoch": 0.06511090318674666, + "grad_norm": 0.9405049681663513, + "learning_rate": 9.97797838464194e-06, + "loss": 0.9023, + "step": 1183 + }, + { + "epoch": 0.06516594198910232, + "grad_norm": 0.9351203441619873, + "learning_rate": 9.977937728011038e-06, + "loss": 0.8698, + "step": 1184 + }, + { + "epoch": 0.06522098079145798, + "grad_norm": 0.8620241284370422, + "learning_rate": 9.97789703396729e-06, + "loss": 0.9393, + "step": 1185 + }, + { + "epoch": 0.06527601959381364, + "grad_norm": 0.9440441131591797, + "learning_rate": 9.977856302511e-06, + "loss": 0.8249, + "step": 1186 + }, + { + "epoch": 0.0653310583961693, + "grad_norm": 0.8311079144477844, + "learning_rate": 9.977815533642474e-06, + "loss": 0.8614, + "step": 1187 + }, + { + "epoch": 0.06538609719852496, + "grad_norm": 0.8911672830581665, + "learning_rate": 9.977774727362018e-06, + "loss": 0.7909, + "step": 1188 + }, + { + "epoch": 0.06544113600088063, + "grad_norm": 0.9237088561058044, + "learning_rate": 9.97773388366994e-06, + "loss": 0.7116, + "step": 1189 + }, + { + "epoch": 0.06549617480323629, + "grad_norm": 1.1155747175216675, + "learning_rate": 9.977693002566549e-06, + "loss": 0.9248, + "step": 1190 + }, + { + "epoch": 0.06555121360559195, + "grad_norm": 0.9386736750602722, + "learning_rate": 9.977652084052148e-06, + "loss": 0.8307, + "step": 1191 + }, + { + "epoch": 0.0656062524079476, + "grad_norm": 1.1666040420532227, + "learning_rate": 9.977611128127044e-06, + "loss": 0.9723, + "step": 1192 + }, + { + "epoch": 0.06566129121030326, + "grad_norm": 1.2366368770599365, + "learning_rate": 9.977570134791552e-06, + "loss": 0.8253, + "step": 1193 + }, + { + "epoch": 0.06571633001265892, + "grad_norm": 0.823443591594696, + "learning_rate": 9.977529104045971e-06, + "loss": 0.7472, + "step": 1194 + }, + { + "epoch": 0.06577136881501458, + "grad_norm": 0.9481683969497681, + "learning_rate": 9.977488035890617e-06, + "loss": 0.887, + "step": 1195 + }, + { + "epoch": 0.06582640761737024, + "grad_norm": 0.9120422005653381, + "learning_rate": 9.977446930325794e-06, + "loss": 0.867, + "step": 1196 + }, + { + "epoch": 0.0658814464197259, + "grad_norm": 0.8595587015151978, + "learning_rate": 9.977405787351811e-06, + "loss": 0.8532, + "step": 1197 + }, + { + "epoch": 0.06593648522208156, + "grad_norm": 0.8590419888496399, + "learning_rate": 9.97736460696898e-06, + "loss": 0.8998, + "step": 1198 + }, + { + "epoch": 0.06599152402443723, + "grad_norm": 0.9670939445495605, + "learning_rate": 9.977323389177609e-06, + "loss": 0.8964, + "step": 1199 + }, + { + "epoch": 0.06604656282679289, + "grad_norm": 0.8870261907577515, + "learning_rate": 9.977282133978006e-06, + "loss": 0.9542, + "step": 1200 + }, + { + "epoch": 0.06610160162914855, + "grad_norm": 0.942294180393219, + "learning_rate": 9.977240841370484e-06, + "loss": 0.8681, + "step": 1201 + }, + { + "epoch": 0.06615664043150421, + "grad_norm": 0.9632517099380493, + "learning_rate": 9.977199511355353e-06, + "loss": 0.7327, + "step": 1202 + }, + { + "epoch": 0.06621167923385987, + "grad_norm": 4.8085479736328125, + "learning_rate": 9.97715814393292e-06, + "loss": 0.8528, + "step": 1203 + }, + { + "epoch": 0.06626671803621553, + "grad_norm": 0.9084093570709229, + "learning_rate": 9.977116739103503e-06, + "loss": 0.7836, + "step": 1204 + }, + { + "epoch": 0.0663217568385712, + "grad_norm": 0.8961902260780334, + "learning_rate": 9.977075296867406e-06, + "loss": 0.854, + "step": 1205 + }, + { + "epoch": 0.06637679564092686, + "grad_norm": 0.8727987408638, + "learning_rate": 9.977033817224945e-06, + "loss": 0.7931, + "step": 1206 + }, + { + "epoch": 0.06643183444328252, + "grad_norm": 0.8263267874717712, + "learning_rate": 9.976992300176428e-06, + "loss": 0.852, + "step": 1207 + }, + { + "epoch": 0.06648687324563818, + "grad_norm": 1.0499639511108398, + "learning_rate": 9.97695074572217e-06, + "loss": 1.0427, + "step": 1208 + }, + { + "epoch": 0.06654191204799384, + "grad_norm": 0.9337313771247864, + "learning_rate": 9.976909153862482e-06, + "loss": 0.8035, + "step": 1209 + }, + { + "epoch": 0.0665969508503495, + "grad_norm": 0.8795992732048035, + "learning_rate": 9.976867524597678e-06, + "loss": 0.9022, + "step": 1210 + }, + { + "epoch": 0.06665198965270516, + "grad_norm": 0.9787294268608093, + "learning_rate": 9.976825857928069e-06, + "loss": 0.8259, + "step": 1211 + }, + { + "epoch": 0.06670702845506082, + "grad_norm": 0.8570082187652588, + "learning_rate": 9.976784153853969e-06, + "loss": 0.8567, + "step": 1212 + }, + { + "epoch": 0.06676206725741649, + "grad_norm": 1.0620380640029907, + "learning_rate": 9.976742412375694e-06, + "loss": 0.851, + "step": 1213 + }, + { + "epoch": 0.06681710605977213, + "grad_norm": 0.8545439839363098, + "learning_rate": 9.976700633493551e-06, + "loss": 0.8827, + "step": 1214 + }, + { + "epoch": 0.0668721448621278, + "grad_norm": 0.8543682098388672, + "learning_rate": 9.97665881720786e-06, + "loss": 0.8524, + "step": 1215 + }, + { + "epoch": 0.06692718366448346, + "grad_norm": 0.7748527526855469, + "learning_rate": 9.976616963518935e-06, + "loss": 0.7459, + "step": 1216 + }, + { + "epoch": 0.06698222246683912, + "grad_norm": 0.9876659512519836, + "learning_rate": 9.976575072427087e-06, + "loss": 0.8426, + "step": 1217 + }, + { + "epoch": 0.06703726126919478, + "grad_norm": 0.8763901591300964, + "learning_rate": 9.976533143932635e-06, + "loss": 0.8561, + "step": 1218 + }, + { + "epoch": 0.06709230007155044, + "grad_norm": 0.7816654443740845, + "learning_rate": 9.97649117803589e-06, + "loss": 0.8361, + "step": 1219 + }, + { + "epoch": 0.0671473388739061, + "grad_norm": 0.8659802675247192, + "learning_rate": 9.97644917473717e-06, + "loss": 0.897, + "step": 1220 + }, + { + "epoch": 0.06720237767626176, + "grad_norm": 0.9180877208709717, + "learning_rate": 9.97640713403679e-06, + "loss": 0.9516, + "step": 1221 + }, + { + "epoch": 0.06725741647861742, + "grad_norm": 0.9624410271644592, + "learning_rate": 9.976365055935067e-06, + "loss": 0.9119, + "step": 1222 + }, + { + "epoch": 0.06731245528097309, + "grad_norm": 0.8291105031967163, + "learning_rate": 9.976322940432314e-06, + "loss": 0.788, + "step": 1223 + }, + { + "epoch": 0.06736749408332875, + "grad_norm": 0.9858983755111694, + "learning_rate": 9.976280787528854e-06, + "loss": 0.8794, + "step": 1224 + }, + { + "epoch": 0.06742253288568441, + "grad_norm": 0.8283948302268982, + "learning_rate": 9.976238597224996e-06, + "loss": 0.8571, + "step": 1225 + }, + { + "epoch": 0.06747757168804007, + "grad_norm": 0.8585363626480103, + "learning_rate": 9.976196369521063e-06, + "loss": 0.9005, + "step": 1226 + }, + { + "epoch": 0.06753261049039573, + "grad_norm": 0.847882091999054, + "learning_rate": 9.976154104417369e-06, + "loss": 0.8058, + "step": 1227 + }, + { + "epoch": 0.06758764929275139, + "grad_norm": 0.9045611023902893, + "learning_rate": 9.976111801914232e-06, + "loss": 0.7864, + "step": 1228 + }, + { + "epoch": 0.06764268809510705, + "grad_norm": 0.805932879447937, + "learning_rate": 9.976069462011972e-06, + "loss": 0.8436, + "step": 1229 + }, + { + "epoch": 0.06769772689746271, + "grad_norm": 0.8809003233909607, + "learning_rate": 9.976027084710906e-06, + "loss": 0.7876, + "step": 1230 + }, + { + "epoch": 0.06775276569981838, + "grad_norm": 0.8681740760803223, + "learning_rate": 9.975984670011352e-06, + "loss": 0.877, + "step": 1231 + }, + { + "epoch": 0.06780780450217404, + "grad_norm": 0.9909854531288147, + "learning_rate": 9.975942217913627e-06, + "loss": 0.8957, + "step": 1232 + }, + { + "epoch": 0.0678628433045297, + "grad_norm": 0.9213934540748596, + "learning_rate": 9.975899728418056e-06, + "loss": 0.8344, + "step": 1233 + }, + { + "epoch": 0.06791788210688536, + "grad_norm": 0.8289967179298401, + "learning_rate": 9.975857201524952e-06, + "loss": 0.876, + "step": 1234 + }, + { + "epoch": 0.06797292090924101, + "grad_norm": 0.891812264919281, + "learning_rate": 9.97581463723464e-06, + "loss": 0.8611, + "step": 1235 + }, + { + "epoch": 0.06802795971159667, + "grad_norm": 1.0301382541656494, + "learning_rate": 9.975772035547435e-06, + "loss": 0.8177, + "step": 1236 + }, + { + "epoch": 0.06808299851395233, + "grad_norm": 0.8380662798881531, + "learning_rate": 9.975729396463659e-06, + "loss": 0.8631, + "step": 1237 + }, + { + "epoch": 0.06813803731630799, + "grad_norm": 0.9226046204566956, + "learning_rate": 9.975686719983633e-06, + "loss": 0.8927, + "step": 1238 + }, + { + "epoch": 0.06819307611866365, + "grad_norm": 0.8917136192321777, + "learning_rate": 9.975644006107679e-06, + "loss": 0.9048, + "step": 1239 + }, + { + "epoch": 0.06824811492101931, + "grad_norm": 0.8559191226959229, + "learning_rate": 9.975601254836114e-06, + "loss": 0.8169, + "step": 1240 + }, + { + "epoch": 0.06830315372337498, + "grad_norm": 0.9345341920852661, + "learning_rate": 9.975558466169263e-06, + "loss": 0.7929, + "step": 1241 + }, + { + "epoch": 0.06835819252573064, + "grad_norm": 0.9155850410461426, + "learning_rate": 9.975515640107447e-06, + "loss": 0.8825, + "step": 1242 + }, + { + "epoch": 0.0684132313280863, + "grad_norm": 0.899712860584259, + "learning_rate": 9.975472776650987e-06, + "loss": 0.825, + "step": 1243 + }, + { + "epoch": 0.06846827013044196, + "grad_norm": 0.8280880451202393, + "learning_rate": 9.975429875800206e-06, + "loss": 0.8539, + "step": 1244 + }, + { + "epoch": 0.06852330893279762, + "grad_norm": 0.9589636325836182, + "learning_rate": 9.975386937555426e-06, + "loss": 0.9465, + "step": 1245 + }, + { + "epoch": 0.06857834773515328, + "grad_norm": 1.1027253866195679, + "learning_rate": 9.97534396191697e-06, + "loss": 0.87, + "step": 1246 + }, + { + "epoch": 0.06863338653750894, + "grad_norm": 1.0510318279266357, + "learning_rate": 9.975300948885158e-06, + "loss": 0.8569, + "step": 1247 + }, + { + "epoch": 0.0686884253398646, + "grad_norm": 0.8897958397865295, + "learning_rate": 9.975257898460317e-06, + "loss": 0.8431, + "step": 1248 + }, + { + "epoch": 0.06874346414222027, + "grad_norm": 0.8827036619186401, + "learning_rate": 9.975214810642771e-06, + "loss": 0.922, + "step": 1249 + }, + { + "epoch": 0.06879850294457593, + "grad_norm": 0.8798324465751648, + "learning_rate": 9.97517168543284e-06, + "loss": 0.7837, + "step": 1250 + }, + { + "epoch": 0.06885354174693159, + "grad_norm": 0.9053803086280823, + "learning_rate": 9.975128522830853e-06, + "loss": 0.82, + "step": 1251 + }, + { + "epoch": 0.06890858054928725, + "grad_norm": 0.8362607359886169, + "learning_rate": 9.975085322837129e-06, + "loss": 0.7684, + "step": 1252 + }, + { + "epoch": 0.06896361935164291, + "grad_norm": 0.8898602724075317, + "learning_rate": 9.975042085451997e-06, + "loss": 0.8205, + "step": 1253 + }, + { + "epoch": 0.06901865815399857, + "grad_norm": 0.9210274815559387, + "learning_rate": 9.97499881067578e-06, + "loss": 0.8364, + "step": 1254 + }, + { + "epoch": 0.06907369695635424, + "grad_norm": 1.0881952047348022, + "learning_rate": 9.974955498508804e-06, + "loss": 0.8234, + "step": 1255 + }, + { + "epoch": 0.0691287357587099, + "grad_norm": 0.8875024914741516, + "learning_rate": 9.974912148951394e-06, + "loss": 0.7974, + "step": 1256 + }, + { + "epoch": 0.06918377456106554, + "grad_norm": 0.9065666794776917, + "learning_rate": 9.974868762003876e-06, + "loss": 0.7721, + "step": 1257 + }, + { + "epoch": 0.0692388133634212, + "grad_norm": 0.8904553651809692, + "learning_rate": 9.974825337666576e-06, + "loss": 0.8551, + "step": 1258 + }, + { + "epoch": 0.06929385216577687, + "grad_norm": 0.8586102724075317, + "learning_rate": 9.974781875939821e-06, + "loss": 0.8666, + "step": 1259 + }, + { + "epoch": 0.06934889096813253, + "grad_norm": 0.9103402495384216, + "learning_rate": 9.974738376823935e-06, + "loss": 0.8361, + "step": 1260 + }, + { + "epoch": 0.06940392977048819, + "grad_norm": 0.8657701015472412, + "learning_rate": 9.974694840319249e-06, + "loss": 0.8217, + "step": 1261 + }, + { + "epoch": 0.06945896857284385, + "grad_norm": 0.865703821182251, + "learning_rate": 9.974651266426088e-06, + "loss": 0.8751, + "step": 1262 + }, + { + "epoch": 0.06951400737519951, + "grad_norm": 0.8932577967643738, + "learning_rate": 9.974607655144779e-06, + "loss": 0.8709, + "step": 1263 + }, + { + "epoch": 0.06956904617755517, + "grad_norm": 0.8417405486106873, + "learning_rate": 9.97456400647565e-06, + "loss": 0.8104, + "step": 1264 + }, + { + "epoch": 0.06962408497991084, + "grad_norm": 0.8578035235404968, + "learning_rate": 9.974520320419032e-06, + "loss": 0.9173, + "step": 1265 + }, + { + "epoch": 0.0696791237822665, + "grad_norm": 0.957539439201355, + "learning_rate": 9.974476596975249e-06, + "loss": 0.8955, + "step": 1266 + }, + { + "epoch": 0.06973416258462216, + "grad_norm": 0.851222038269043, + "learning_rate": 9.974432836144632e-06, + "loss": 0.8696, + "step": 1267 + }, + { + "epoch": 0.06978920138697782, + "grad_norm": 0.8178789615631104, + "learning_rate": 9.974389037927508e-06, + "loss": 0.7921, + "step": 1268 + }, + { + "epoch": 0.06984424018933348, + "grad_norm": 0.954091489315033, + "learning_rate": 9.97434520232421e-06, + "loss": 0.9362, + "step": 1269 + }, + { + "epoch": 0.06989927899168914, + "grad_norm": 0.8525053858757019, + "learning_rate": 9.974301329335063e-06, + "loss": 0.7996, + "step": 1270 + }, + { + "epoch": 0.0699543177940448, + "grad_norm": 0.9340476393699646, + "learning_rate": 9.9742574189604e-06, + "loss": 0.9091, + "step": 1271 + }, + { + "epoch": 0.07000935659640047, + "grad_norm": 0.7946187257766724, + "learning_rate": 9.974213471200548e-06, + "loss": 0.874, + "step": 1272 + }, + { + "epoch": 0.07006439539875613, + "grad_norm": 0.8048381209373474, + "learning_rate": 9.97416948605584e-06, + "loss": 0.8557, + "step": 1273 + }, + { + "epoch": 0.07011943420111179, + "grad_norm": 0.9849064946174622, + "learning_rate": 9.974125463526607e-06, + "loss": 0.8154, + "step": 1274 + }, + { + "epoch": 0.07017447300346745, + "grad_norm": 0.9030239582061768, + "learning_rate": 9.974081403613178e-06, + "loss": 0.9411, + "step": 1275 + }, + { + "epoch": 0.07022951180582311, + "grad_norm": 0.8869300484657288, + "learning_rate": 9.974037306315882e-06, + "loss": 0.8978, + "step": 1276 + }, + { + "epoch": 0.07028455060817877, + "grad_norm": 0.8558536767959595, + "learning_rate": 9.973993171635057e-06, + "loss": 0.8937, + "step": 1277 + }, + { + "epoch": 0.07033958941053442, + "grad_norm": 0.9005453586578369, + "learning_rate": 9.973948999571029e-06, + "loss": 0.9336, + "step": 1278 + }, + { + "epoch": 0.07039462821289008, + "grad_norm": 0.8489978909492493, + "learning_rate": 9.973904790124131e-06, + "loss": 0.8267, + "step": 1279 + }, + { + "epoch": 0.07044966701524574, + "grad_norm": 0.8295948505401611, + "learning_rate": 9.973860543294696e-06, + "loss": 0.8478, + "step": 1280 + }, + { + "epoch": 0.0705047058176014, + "grad_norm": 0.8111379742622375, + "learning_rate": 9.973816259083058e-06, + "loss": 0.8333, + "step": 1281 + }, + { + "epoch": 0.07055974461995707, + "grad_norm": 0.9380189776420593, + "learning_rate": 9.973771937489547e-06, + "loss": 0.9718, + "step": 1282 + }, + { + "epoch": 0.07061478342231273, + "grad_norm": 1.251194953918457, + "learning_rate": 9.973727578514499e-06, + "loss": 0.9531, + "step": 1283 + }, + { + "epoch": 0.07066982222466839, + "grad_norm": 0.9897224307060242, + "learning_rate": 9.973683182158243e-06, + "loss": 0.7853, + "step": 1284 + }, + { + "epoch": 0.07072486102702405, + "grad_norm": 0.8409335017204285, + "learning_rate": 9.973638748421119e-06, + "loss": 0.7692, + "step": 1285 + }, + { + "epoch": 0.07077989982937971, + "grad_norm": 0.9019681215286255, + "learning_rate": 9.973594277303456e-06, + "loss": 0.8135, + "step": 1286 + }, + { + "epoch": 0.07083493863173537, + "grad_norm": 0.9236096739768982, + "learning_rate": 9.973549768805588e-06, + "loss": 0.9304, + "step": 1287 + }, + { + "epoch": 0.07088997743409103, + "grad_norm": 0.9244743585586548, + "learning_rate": 9.973505222927854e-06, + "loss": 0.9056, + "step": 1288 + }, + { + "epoch": 0.0709450162364467, + "grad_norm": 1.3418753147125244, + "learning_rate": 9.973460639670585e-06, + "loss": 0.8419, + "step": 1289 + }, + { + "epoch": 0.07100005503880236, + "grad_norm": 0.8715767860412598, + "learning_rate": 9.973416019034117e-06, + "loss": 0.9704, + "step": 1290 + }, + { + "epoch": 0.07105509384115802, + "grad_norm": 0.9609012007713318, + "learning_rate": 9.973371361018787e-06, + "loss": 0.8807, + "step": 1291 + }, + { + "epoch": 0.07111013264351368, + "grad_norm": 0.8085873126983643, + "learning_rate": 9.973326665624927e-06, + "loss": 0.7947, + "step": 1292 + }, + { + "epoch": 0.07116517144586934, + "grad_norm": 0.919280469417572, + "learning_rate": 9.973281932852877e-06, + "loss": 0.9743, + "step": 1293 + }, + { + "epoch": 0.071220210248225, + "grad_norm": 1.0651074647903442, + "learning_rate": 9.973237162702968e-06, + "loss": 0.7164, + "step": 1294 + }, + { + "epoch": 0.07127524905058066, + "grad_norm": 0.987251341342926, + "learning_rate": 9.973192355175542e-06, + "loss": 0.9286, + "step": 1295 + }, + { + "epoch": 0.07133028785293632, + "grad_norm": 1.5507274866104126, + "learning_rate": 9.973147510270935e-06, + "loss": 0.9733, + "step": 1296 + }, + { + "epoch": 0.07138532665529199, + "grad_norm": 0.8439416885375977, + "learning_rate": 9.97310262798948e-06, + "loss": 0.7462, + "step": 1297 + }, + { + "epoch": 0.07144036545764765, + "grad_norm": 0.9604889750480652, + "learning_rate": 9.973057708331519e-06, + "loss": 1.0006, + "step": 1298 + }, + { + "epoch": 0.07149540426000331, + "grad_norm": 0.8568960428237915, + "learning_rate": 9.973012751297386e-06, + "loss": 0.878, + "step": 1299 + }, + { + "epoch": 0.07155044306235896, + "grad_norm": 0.8169522285461426, + "learning_rate": 9.972967756887419e-06, + "loss": 0.8241, + "step": 1300 + }, + { + "epoch": 0.07160548186471462, + "grad_norm": 0.875738799571991, + "learning_rate": 9.97292272510196e-06, + "loss": 0.854, + "step": 1301 + }, + { + "epoch": 0.07166052066707028, + "grad_norm": 0.7877739071846008, + "learning_rate": 9.972877655941345e-06, + "loss": 0.779, + "step": 1302 + }, + { + "epoch": 0.07171555946942594, + "grad_norm": 0.8148574829101562, + "learning_rate": 9.972832549405912e-06, + "loss": 0.6965, + "step": 1303 + }, + { + "epoch": 0.0717705982717816, + "grad_norm": 0.936720609664917, + "learning_rate": 9.972787405495998e-06, + "loss": 0.798, + "step": 1304 + }, + { + "epoch": 0.07182563707413726, + "grad_norm": 0.8932886123657227, + "learning_rate": 9.972742224211949e-06, + "loss": 0.9196, + "step": 1305 + }, + { + "epoch": 0.07188067587649292, + "grad_norm": 0.899246871471405, + "learning_rate": 9.972697005554099e-06, + "loss": 0.8081, + "step": 1306 + }, + { + "epoch": 0.07193571467884859, + "grad_norm": 0.8789899349212646, + "learning_rate": 9.972651749522788e-06, + "loss": 0.89, + "step": 1307 + }, + { + "epoch": 0.07199075348120425, + "grad_norm": 1.2412173748016357, + "learning_rate": 9.97260645611836e-06, + "loss": 0.9866, + "step": 1308 + }, + { + "epoch": 0.07204579228355991, + "grad_norm": 0.8655833005905151, + "learning_rate": 9.972561125341152e-06, + "loss": 0.8144, + "step": 1309 + }, + { + "epoch": 0.07210083108591557, + "grad_norm": 0.8705299496650696, + "learning_rate": 9.972515757191506e-06, + "loss": 0.8431, + "step": 1310 + }, + { + "epoch": 0.07215586988827123, + "grad_norm": 0.8813188672065735, + "learning_rate": 9.972470351669761e-06, + "loss": 0.859, + "step": 1311 + }, + { + "epoch": 0.0722109086906269, + "grad_norm": 2.043627977371216, + "learning_rate": 9.972424908776262e-06, + "loss": 0.9886, + "step": 1312 + }, + { + "epoch": 0.07226594749298255, + "grad_norm": 0.9167500734329224, + "learning_rate": 9.972379428511348e-06, + "loss": 0.7203, + "step": 1313 + }, + { + "epoch": 0.07232098629533822, + "grad_norm": 1.3145136833190918, + "learning_rate": 9.972333910875358e-06, + "loss": 0.9325, + "step": 1314 + }, + { + "epoch": 0.07237602509769388, + "grad_norm": 0.834710419178009, + "learning_rate": 9.972288355868641e-06, + "loss": 0.9361, + "step": 1315 + }, + { + "epoch": 0.07243106390004954, + "grad_norm": 0.9039230942726135, + "learning_rate": 9.972242763491535e-06, + "loss": 0.8027, + "step": 1316 + }, + { + "epoch": 0.0724861027024052, + "grad_norm": 0.8911495208740234, + "learning_rate": 9.972197133744384e-06, + "loss": 0.951, + "step": 1317 + }, + { + "epoch": 0.07254114150476086, + "grad_norm": 1.0752439498901367, + "learning_rate": 9.972151466627529e-06, + "loss": 0.8421, + "step": 1318 + }, + { + "epoch": 0.07259618030711652, + "grad_norm": 0.926135778427124, + "learning_rate": 9.972105762141314e-06, + "loss": 0.8901, + "step": 1319 + }, + { + "epoch": 0.07265121910947218, + "grad_norm": 0.8166295289993286, + "learning_rate": 9.972060020286085e-06, + "loss": 0.7845, + "step": 1320 + }, + { + "epoch": 0.07270625791182783, + "grad_norm": 1.0000934600830078, + "learning_rate": 9.972014241062182e-06, + "loss": 0.8383, + "step": 1321 + }, + { + "epoch": 0.0727612967141835, + "grad_norm": 1.2617899179458618, + "learning_rate": 9.971968424469951e-06, + "loss": 0.9826, + "step": 1322 + }, + { + "epoch": 0.07281633551653915, + "grad_norm": 0.8451040983200073, + "learning_rate": 9.971922570509738e-06, + "loss": 0.8262, + "step": 1323 + }, + { + "epoch": 0.07287137431889482, + "grad_norm": 0.8101939558982849, + "learning_rate": 9.971876679181884e-06, + "loss": 0.6904, + "step": 1324 + }, + { + "epoch": 0.07292641312125048, + "grad_norm": 0.8805514574050903, + "learning_rate": 9.971830750486736e-06, + "loss": 0.8491, + "step": 1325 + }, + { + "epoch": 0.07298145192360614, + "grad_norm": 0.8236901164054871, + "learning_rate": 9.97178478442464e-06, + "loss": 0.8462, + "step": 1326 + }, + { + "epoch": 0.0730364907259618, + "grad_norm": 0.9183042645454407, + "learning_rate": 9.971738780995938e-06, + "loss": 0.7577, + "step": 1327 + }, + { + "epoch": 0.07309152952831746, + "grad_norm": 0.8425934314727783, + "learning_rate": 9.971692740200982e-06, + "loss": 0.8462, + "step": 1328 + }, + { + "epoch": 0.07314656833067312, + "grad_norm": 0.9114993214607239, + "learning_rate": 9.971646662040112e-06, + "loss": 0.9132, + "step": 1329 + }, + { + "epoch": 0.07320160713302878, + "grad_norm": 0.8516649603843689, + "learning_rate": 9.971600546513675e-06, + "loss": 0.8819, + "step": 1330 + }, + { + "epoch": 0.07325664593538445, + "grad_norm": 1.0859558582305908, + "learning_rate": 9.971554393622023e-06, + "loss": 0.9929, + "step": 1331 + }, + { + "epoch": 0.07331168473774011, + "grad_norm": 0.8906900882720947, + "learning_rate": 9.971508203365497e-06, + "loss": 0.9166, + "step": 1332 + }, + { + "epoch": 0.07336672354009577, + "grad_norm": 0.8931803703308105, + "learning_rate": 9.971461975744445e-06, + "loss": 0.864, + "step": 1333 + }, + { + "epoch": 0.07342176234245143, + "grad_norm": 0.8404982686042786, + "learning_rate": 9.971415710759216e-06, + "loss": 0.8609, + "step": 1334 + }, + { + "epoch": 0.07347680114480709, + "grad_norm": 0.8016490340232849, + "learning_rate": 9.971369408410157e-06, + "loss": 0.7694, + "step": 1335 + }, + { + "epoch": 0.07353183994716275, + "grad_norm": 0.7700600028038025, + "learning_rate": 9.971323068697618e-06, + "loss": 0.7875, + "step": 1336 + }, + { + "epoch": 0.07358687874951841, + "grad_norm": 0.8679799437522888, + "learning_rate": 9.971276691621946e-06, + "loss": 0.8409, + "step": 1337 + }, + { + "epoch": 0.07364191755187408, + "grad_norm": 0.8329173922538757, + "learning_rate": 9.971230277183486e-06, + "loss": 0.8707, + "step": 1338 + }, + { + "epoch": 0.07369695635422974, + "grad_norm": 0.8790140151977539, + "learning_rate": 9.97118382538259e-06, + "loss": 0.7631, + "step": 1339 + }, + { + "epoch": 0.0737519951565854, + "grad_norm": 1.1895341873168945, + "learning_rate": 9.97113733621961e-06, + "loss": 0.8555, + "step": 1340 + }, + { + "epoch": 0.07380703395894106, + "grad_norm": 0.8531593680381775, + "learning_rate": 9.97109080969489e-06, + "loss": 0.7192, + "step": 1341 + }, + { + "epoch": 0.07386207276129672, + "grad_norm": 1.0388946533203125, + "learning_rate": 9.971044245808784e-06, + "loss": 0.8182, + "step": 1342 + }, + { + "epoch": 0.07391711156365237, + "grad_norm": 0.8858556747436523, + "learning_rate": 9.970997644561639e-06, + "loss": 0.7981, + "step": 1343 + }, + { + "epoch": 0.07397215036600803, + "grad_norm": 0.8710204362869263, + "learning_rate": 9.970951005953807e-06, + "loss": 0.7667, + "step": 1344 + }, + { + "epoch": 0.07402718916836369, + "grad_norm": 0.9788708090782166, + "learning_rate": 9.970904329985638e-06, + "loss": 0.9693, + "step": 1345 + }, + { + "epoch": 0.07408222797071935, + "grad_norm": 0.7805914878845215, + "learning_rate": 9.970857616657482e-06, + "loss": 0.6683, + "step": 1346 + }, + { + "epoch": 0.07413726677307501, + "grad_norm": 0.9977933168411255, + "learning_rate": 9.97081086596969e-06, + "loss": 0.8288, + "step": 1347 + }, + { + "epoch": 0.07419230557543068, + "grad_norm": 0.829115629196167, + "learning_rate": 9.970764077922617e-06, + "loss": 0.8361, + "step": 1348 + }, + { + "epoch": 0.07424734437778634, + "grad_norm": 1.226120114326477, + "learning_rate": 9.97071725251661e-06, + "loss": 1.0008, + "step": 1349 + }, + { + "epoch": 0.074302383180142, + "grad_norm": 0.8997750878334045, + "learning_rate": 9.970670389752021e-06, + "loss": 0.8048, + "step": 1350 + }, + { + "epoch": 0.07435742198249766, + "grad_norm": 1.0885238647460938, + "learning_rate": 9.970623489629205e-06, + "loss": 0.9202, + "step": 1351 + }, + { + "epoch": 0.07441246078485332, + "grad_norm": 0.8736100792884827, + "learning_rate": 9.970576552148515e-06, + "loss": 0.8515, + "step": 1352 + }, + { + "epoch": 0.07446749958720898, + "grad_norm": 0.9211294651031494, + "learning_rate": 9.970529577310301e-06, + "loss": 0.9389, + "step": 1353 + }, + { + "epoch": 0.07452253838956464, + "grad_norm": 0.9334765672683716, + "learning_rate": 9.970482565114917e-06, + "loss": 0.8165, + "step": 1354 + }, + { + "epoch": 0.0745775771919203, + "grad_norm": 0.8307162523269653, + "learning_rate": 9.970435515562717e-06, + "loss": 0.7829, + "step": 1355 + }, + { + "epoch": 0.07463261599427597, + "grad_norm": 0.987634003162384, + "learning_rate": 9.970388428654055e-06, + "loss": 0.848, + "step": 1356 + }, + { + "epoch": 0.07468765479663163, + "grad_norm": 1.094752311706543, + "learning_rate": 9.970341304389281e-06, + "loss": 1.003, + "step": 1357 + }, + { + "epoch": 0.07474269359898729, + "grad_norm": 0.9865909814834595, + "learning_rate": 9.970294142768755e-06, + "loss": 0.9116, + "step": 1358 + }, + { + "epoch": 0.07479773240134295, + "grad_norm": 0.8404149413108826, + "learning_rate": 9.970246943792828e-06, + "loss": 0.8699, + "step": 1359 + }, + { + "epoch": 0.07485277120369861, + "grad_norm": 0.9602416753768921, + "learning_rate": 9.970199707461855e-06, + "loss": 0.8166, + "step": 1360 + }, + { + "epoch": 0.07490781000605427, + "grad_norm": 0.9748693704605103, + "learning_rate": 9.970152433776193e-06, + "loss": 0.8767, + "step": 1361 + }, + { + "epoch": 0.07496284880840993, + "grad_norm": 0.8721657991409302, + "learning_rate": 9.970105122736194e-06, + "loss": 0.8825, + "step": 1362 + }, + { + "epoch": 0.0750178876107656, + "grad_norm": 0.8683610558509827, + "learning_rate": 9.970057774342215e-06, + "loss": 0.7873, + "step": 1363 + }, + { + "epoch": 0.07507292641312124, + "grad_norm": 0.856396496295929, + "learning_rate": 9.970010388594613e-06, + "loss": 0.8505, + "step": 1364 + }, + { + "epoch": 0.0751279652154769, + "grad_norm": 1.0709880590438843, + "learning_rate": 9.969962965493744e-06, + "loss": 0.9519, + "step": 1365 + }, + { + "epoch": 0.07518300401783257, + "grad_norm": 0.8839450478553772, + "learning_rate": 9.969915505039963e-06, + "loss": 0.8041, + "step": 1366 + }, + { + "epoch": 0.07523804282018823, + "grad_norm": 0.89545738697052, + "learning_rate": 9.969868007233627e-06, + "loss": 0.8713, + "step": 1367 + }, + { + "epoch": 0.07529308162254389, + "grad_norm": 0.9870849251747131, + "learning_rate": 9.969820472075094e-06, + "loss": 0.8655, + "step": 1368 + }, + { + "epoch": 0.07534812042489955, + "grad_norm": 1.3123797178268433, + "learning_rate": 9.96977289956472e-06, + "loss": 1.0425, + "step": 1369 + }, + { + "epoch": 0.07540315922725521, + "grad_norm": 0.8538400530815125, + "learning_rate": 9.969725289702865e-06, + "loss": 0.7052, + "step": 1370 + }, + { + "epoch": 0.07545819802961087, + "grad_norm": 0.933397114276886, + "learning_rate": 9.969677642489884e-06, + "loss": 0.9819, + "step": 1371 + }, + { + "epoch": 0.07551323683196653, + "grad_norm": 0.8428112268447876, + "learning_rate": 9.969629957926134e-06, + "loss": 0.7313, + "step": 1372 + }, + { + "epoch": 0.0755682756343222, + "grad_norm": 0.9023239612579346, + "learning_rate": 9.96958223601198e-06, + "loss": 0.8297, + "step": 1373 + }, + { + "epoch": 0.07562331443667786, + "grad_norm": 0.8971324563026428, + "learning_rate": 9.969534476747771e-06, + "loss": 0.8832, + "step": 1374 + }, + { + "epoch": 0.07567835323903352, + "grad_norm": 0.8709388375282288, + "learning_rate": 9.969486680133874e-06, + "loss": 0.743, + "step": 1375 + }, + { + "epoch": 0.07573339204138918, + "grad_norm": 0.9094591736793518, + "learning_rate": 9.969438846170644e-06, + "loss": 0.8294, + "step": 1376 + }, + { + "epoch": 0.07578843084374484, + "grad_norm": 1.0753988027572632, + "learning_rate": 9.969390974858444e-06, + "loss": 0.7479, + "step": 1377 + }, + { + "epoch": 0.0758434696461005, + "grad_norm": 0.933775007724762, + "learning_rate": 9.96934306619763e-06, + "loss": 0.8235, + "step": 1378 + }, + { + "epoch": 0.07589850844845616, + "grad_norm": 0.8419735431671143, + "learning_rate": 9.969295120188565e-06, + "loss": 0.8103, + "step": 1379 + }, + { + "epoch": 0.07595354725081183, + "grad_norm": 0.8912790417671204, + "learning_rate": 9.969247136831606e-06, + "loss": 0.911, + "step": 1380 + }, + { + "epoch": 0.07600858605316749, + "grad_norm": 0.8780983090400696, + "learning_rate": 9.969199116127118e-06, + "loss": 0.8619, + "step": 1381 + }, + { + "epoch": 0.07606362485552315, + "grad_norm": 0.8503809571266174, + "learning_rate": 9.969151058075459e-06, + "loss": 0.8093, + "step": 1382 + }, + { + "epoch": 0.07611866365787881, + "grad_norm": 0.8633087277412415, + "learning_rate": 9.96910296267699e-06, + "loss": 0.7524, + "step": 1383 + }, + { + "epoch": 0.07617370246023447, + "grad_norm": 1.1203595399856567, + "learning_rate": 9.969054829932074e-06, + "loss": 0.945, + "step": 1384 + }, + { + "epoch": 0.07622874126259013, + "grad_norm": 0.8766878843307495, + "learning_rate": 9.969006659841072e-06, + "loss": 0.7537, + "step": 1385 + }, + { + "epoch": 0.07628378006494578, + "grad_norm": 0.9795958399772644, + "learning_rate": 9.968958452404345e-06, + "loss": 0.7963, + "step": 1386 + }, + { + "epoch": 0.07633881886730144, + "grad_norm": 0.9117506146430969, + "learning_rate": 9.968910207622257e-06, + "loss": 0.9469, + "step": 1387 + }, + { + "epoch": 0.0763938576696571, + "grad_norm": 0.9731466770172119, + "learning_rate": 9.96886192549517e-06, + "loss": 0.9536, + "step": 1388 + }, + { + "epoch": 0.07644889647201276, + "grad_norm": 0.8923571109771729, + "learning_rate": 9.968813606023446e-06, + "loss": 0.8362, + "step": 1389 + }, + { + "epoch": 0.07650393527436843, + "grad_norm": 0.8819600343704224, + "learning_rate": 9.96876524920745e-06, + "loss": 0.6938, + "step": 1390 + }, + { + "epoch": 0.07655897407672409, + "grad_norm": 0.9629887342453003, + "learning_rate": 9.968716855047545e-06, + "loss": 0.9104, + "step": 1391 + }, + { + "epoch": 0.07661401287907975, + "grad_norm": 0.992770254611969, + "learning_rate": 9.968668423544093e-06, + "loss": 0.944, + "step": 1392 + }, + { + "epoch": 0.07666905168143541, + "grad_norm": 0.8578491806983948, + "learning_rate": 9.96861995469746e-06, + "loss": 0.898, + "step": 1393 + }, + { + "epoch": 0.07672409048379107, + "grad_norm": 1.1169229745864868, + "learning_rate": 9.968571448508008e-06, + "loss": 0.8324, + "step": 1394 + }, + { + "epoch": 0.07677912928614673, + "grad_norm": 0.9600160121917725, + "learning_rate": 9.968522904976106e-06, + "loss": 0.9519, + "step": 1395 + }, + { + "epoch": 0.0768341680885024, + "grad_norm": 0.8271373510360718, + "learning_rate": 9.968474324102112e-06, + "loss": 0.8576, + "step": 1396 + }, + { + "epoch": 0.07688920689085806, + "grad_norm": 0.9437325596809387, + "learning_rate": 9.968425705886397e-06, + "loss": 0.9201, + "step": 1397 + }, + { + "epoch": 0.07694424569321372, + "grad_norm": 0.8679039478302002, + "learning_rate": 9.968377050329325e-06, + "loss": 0.8893, + "step": 1398 + }, + { + "epoch": 0.07699928449556938, + "grad_norm": 1.0178717374801636, + "learning_rate": 9.96832835743126e-06, + "loss": 0.9718, + "step": 1399 + }, + { + "epoch": 0.07705432329792504, + "grad_norm": 0.8354432582855225, + "learning_rate": 9.96827962719257e-06, + "loss": 0.83, + "step": 1400 + }, + { + "epoch": 0.0771093621002807, + "grad_norm": 1.2244631052017212, + "learning_rate": 9.968230859613619e-06, + "loss": 0.907, + "step": 1401 + }, + { + "epoch": 0.07716440090263636, + "grad_norm": 0.9099625945091248, + "learning_rate": 9.968182054694775e-06, + "loss": 0.809, + "step": 1402 + }, + { + "epoch": 0.07721943970499202, + "grad_norm": 0.8591424226760864, + "learning_rate": 9.968133212436404e-06, + "loss": 0.8869, + "step": 1403 + }, + { + "epoch": 0.07727447850734769, + "grad_norm": 1.068003535270691, + "learning_rate": 9.968084332838876e-06, + "loss": 0.8747, + "step": 1404 + }, + { + "epoch": 0.07732951730970335, + "grad_norm": 0.8503691554069519, + "learning_rate": 9.968035415902555e-06, + "loss": 0.7478, + "step": 1405 + }, + { + "epoch": 0.07738455611205901, + "grad_norm": 0.9209537506103516, + "learning_rate": 9.967986461627808e-06, + "loss": 0.9052, + "step": 1406 + }, + { + "epoch": 0.07743959491441466, + "grad_norm": 0.8447962999343872, + "learning_rate": 9.967937470015006e-06, + "loss": 0.7897, + "step": 1407 + }, + { + "epoch": 0.07749463371677032, + "grad_norm": 0.8731846809387207, + "learning_rate": 9.967888441064515e-06, + "loss": 0.837, + "step": 1408 + }, + { + "epoch": 0.07754967251912598, + "grad_norm": 0.9810444712638855, + "learning_rate": 9.967839374776705e-06, + "loss": 0.8236, + "step": 1409 + }, + { + "epoch": 0.07760471132148164, + "grad_norm": 0.8283190131187439, + "learning_rate": 9.967790271151944e-06, + "loss": 0.8443, + "step": 1410 + }, + { + "epoch": 0.0776597501238373, + "grad_norm": 0.7999932765960693, + "learning_rate": 9.9677411301906e-06, + "loss": 0.7945, + "step": 1411 + }, + { + "epoch": 0.07771478892619296, + "grad_norm": 0.9435983300209045, + "learning_rate": 9.967691951893044e-06, + "loss": 0.9745, + "step": 1412 + }, + { + "epoch": 0.07776982772854862, + "grad_norm": 0.8885984420776367, + "learning_rate": 9.967642736259646e-06, + "loss": 0.9163, + "step": 1413 + }, + { + "epoch": 0.07782486653090429, + "grad_norm": 0.993928074836731, + "learning_rate": 9.967593483290776e-06, + "loss": 0.7797, + "step": 1414 + }, + { + "epoch": 0.07787990533325995, + "grad_norm": 1.058830976486206, + "learning_rate": 9.9675441929868e-06, + "loss": 0.8671, + "step": 1415 + }, + { + "epoch": 0.07793494413561561, + "grad_norm": 1.0469766855239868, + "learning_rate": 9.967494865348093e-06, + "loss": 0.8671, + "step": 1416 + }, + { + "epoch": 0.07798998293797127, + "grad_norm": 0.902729868888855, + "learning_rate": 9.967445500375025e-06, + "loss": 0.8748, + "step": 1417 + }, + { + "epoch": 0.07804502174032693, + "grad_norm": 0.90755295753479, + "learning_rate": 9.967396098067965e-06, + "loss": 0.8279, + "step": 1418 + }, + { + "epoch": 0.07810006054268259, + "grad_norm": 0.8822374939918518, + "learning_rate": 9.967346658427287e-06, + "loss": 0.9386, + "step": 1419 + }, + { + "epoch": 0.07815509934503825, + "grad_norm": 0.9201469421386719, + "learning_rate": 9.96729718145336e-06, + "loss": 0.8684, + "step": 1420 + }, + { + "epoch": 0.07821013814739392, + "grad_norm": 0.9451109766960144, + "learning_rate": 9.967247667146558e-06, + "loss": 0.7854, + "step": 1421 + }, + { + "epoch": 0.07826517694974958, + "grad_norm": 0.9146197438240051, + "learning_rate": 9.96719811550725e-06, + "loss": 0.8496, + "step": 1422 + }, + { + "epoch": 0.07832021575210524, + "grad_norm": 0.9771224856376648, + "learning_rate": 9.967148526535813e-06, + "loss": 0.9657, + "step": 1423 + }, + { + "epoch": 0.0783752545544609, + "grad_norm": 0.8437683582305908, + "learning_rate": 9.967098900232616e-06, + "loss": 0.8336, + "step": 1424 + }, + { + "epoch": 0.07843029335681656, + "grad_norm": 0.8232185244560242, + "learning_rate": 9.967049236598034e-06, + "loss": 0.8878, + "step": 1425 + }, + { + "epoch": 0.07848533215917222, + "grad_norm": 1.0200369358062744, + "learning_rate": 9.96699953563244e-06, + "loss": 0.8135, + "step": 1426 + }, + { + "epoch": 0.07854037096152788, + "grad_norm": 0.8779187202453613, + "learning_rate": 9.966949797336208e-06, + "loss": 0.9124, + "step": 1427 + }, + { + "epoch": 0.07859540976388354, + "grad_norm": 0.9557466506958008, + "learning_rate": 9.966900021709708e-06, + "loss": 0.9118, + "step": 1428 + }, + { + "epoch": 0.07865044856623919, + "grad_norm": 0.8431050777435303, + "learning_rate": 9.966850208753317e-06, + "loss": 0.8361, + "step": 1429 + }, + { + "epoch": 0.07870548736859485, + "grad_norm": 0.9269648194313049, + "learning_rate": 9.966800358467412e-06, + "loss": 0.9194, + "step": 1430 + }, + { + "epoch": 0.07876052617095052, + "grad_norm": 0.818681538105011, + "learning_rate": 9.966750470852363e-06, + "loss": 0.7483, + "step": 1431 + }, + { + "epoch": 0.07881556497330618, + "grad_norm": 0.8788284659385681, + "learning_rate": 9.966700545908547e-06, + "loss": 0.858, + "step": 1432 + }, + { + "epoch": 0.07887060377566184, + "grad_norm": 0.7734160423278809, + "learning_rate": 9.966650583636342e-06, + "loss": 0.694, + "step": 1433 + }, + { + "epoch": 0.0789256425780175, + "grad_norm": 0.8846608996391296, + "learning_rate": 9.966600584036117e-06, + "loss": 0.8144, + "step": 1434 + }, + { + "epoch": 0.07898068138037316, + "grad_norm": 0.9740058183670044, + "learning_rate": 9.966550547108254e-06, + "loss": 0.9314, + "step": 1435 + }, + { + "epoch": 0.07903572018272882, + "grad_norm": 0.8731759786605835, + "learning_rate": 9.966500472853124e-06, + "loss": 0.8475, + "step": 1436 + }, + { + "epoch": 0.07909075898508448, + "grad_norm": 0.8984843492507935, + "learning_rate": 9.966450361271109e-06, + "loss": 0.7803, + "step": 1437 + }, + { + "epoch": 0.07914579778744014, + "grad_norm": 0.8897966742515564, + "learning_rate": 9.96640021236258e-06, + "loss": 0.8879, + "step": 1438 + }, + { + "epoch": 0.0792008365897958, + "grad_norm": 0.80704265832901, + "learning_rate": 9.966350026127917e-06, + "loss": 0.7585, + "step": 1439 + }, + { + "epoch": 0.07925587539215147, + "grad_norm": 1.0807467699050903, + "learning_rate": 9.966299802567499e-06, + "loss": 1.078, + "step": 1440 + }, + { + "epoch": 0.07931091419450713, + "grad_norm": 0.7994028925895691, + "learning_rate": 9.966249541681697e-06, + "loss": 0.8074, + "step": 1441 + }, + { + "epoch": 0.07936595299686279, + "grad_norm": 0.877592921257019, + "learning_rate": 9.966199243470895e-06, + "loss": 0.8084, + "step": 1442 + }, + { + "epoch": 0.07942099179921845, + "grad_norm": 0.7704572081565857, + "learning_rate": 9.966148907935469e-06, + "loss": 0.7206, + "step": 1443 + }, + { + "epoch": 0.07947603060157411, + "grad_norm": 0.8222140669822693, + "learning_rate": 9.966098535075797e-06, + "loss": 0.7768, + "step": 1444 + }, + { + "epoch": 0.07953106940392977, + "grad_norm": 1.389320731163025, + "learning_rate": 9.966048124892257e-06, + "loss": 1.0356, + "step": 1445 + }, + { + "epoch": 0.07958610820628544, + "grad_norm": 0.9082457423210144, + "learning_rate": 9.965997677385229e-06, + "loss": 0.7379, + "step": 1446 + }, + { + "epoch": 0.0796411470086411, + "grad_norm": 0.8029153943061829, + "learning_rate": 9.965947192555093e-06, + "loss": 0.7826, + "step": 1447 + }, + { + "epoch": 0.07969618581099676, + "grad_norm": 0.8752758502960205, + "learning_rate": 9.965896670402227e-06, + "loss": 0.8526, + "step": 1448 + }, + { + "epoch": 0.07975122461335242, + "grad_norm": 1.0665404796600342, + "learning_rate": 9.965846110927009e-06, + "loss": 0.858, + "step": 1449 + }, + { + "epoch": 0.07980626341570807, + "grad_norm": 0.9468502402305603, + "learning_rate": 9.96579551412982e-06, + "loss": 0.9658, + "step": 1450 + }, + { + "epoch": 0.07986130221806373, + "grad_norm": 1.0239403247833252, + "learning_rate": 9.965744880011046e-06, + "loss": 0.7995, + "step": 1451 + }, + { + "epoch": 0.07991634102041939, + "grad_norm": 0.9808099865913391, + "learning_rate": 9.965694208571059e-06, + "loss": 1.0173, + "step": 1452 + }, + { + "epoch": 0.07997137982277505, + "grad_norm": 0.9338780641555786, + "learning_rate": 9.965643499810245e-06, + "loss": 0.7917, + "step": 1453 + }, + { + "epoch": 0.08002641862513071, + "grad_norm": 0.9294295310974121, + "learning_rate": 9.965592753728981e-06, + "loss": 0.88, + "step": 1454 + }, + { + "epoch": 0.08008145742748637, + "grad_norm": 1.0261508226394653, + "learning_rate": 9.965541970327654e-06, + "loss": 0.8825, + "step": 1455 + }, + { + "epoch": 0.08013649622984204, + "grad_norm": 0.8964946269989014, + "learning_rate": 9.965491149606642e-06, + "loss": 0.81, + "step": 1456 + }, + { + "epoch": 0.0801915350321977, + "grad_norm": 0.9468267560005188, + "learning_rate": 9.965440291566329e-06, + "loss": 0.9453, + "step": 1457 + }, + { + "epoch": 0.08024657383455336, + "grad_norm": 0.8289040327072144, + "learning_rate": 9.965389396207092e-06, + "loss": 0.7373, + "step": 1458 + }, + { + "epoch": 0.08030161263690902, + "grad_norm": 0.8782384991645813, + "learning_rate": 9.965338463529322e-06, + "loss": 0.9199, + "step": 1459 + }, + { + "epoch": 0.08035665143926468, + "grad_norm": 0.8613787293434143, + "learning_rate": 9.965287493533395e-06, + "loss": 0.8719, + "step": 1460 + }, + { + "epoch": 0.08041169024162034, + "grad_norm": 0.8474903106689453, + "learning_rate": 9.965236486219696e-06, + "loss": 0.8033, + "step": 1461 + }, + { + "epoch": 0.080466729043976, + "grad_norm": 1.1442681550979614, + "learning_rate": 9.965185441588609e-06, + "loss": 0.8996, + "step": 1462 + }, + { + "epoch": 0.08052176784633167, + "grad_norm": 1.564138412475586, + "learning_rate": 9.965134359640518e-06, + "loss": 0.7451, + "step": 1463 + }, + { + "epoch": 0.08057680664868733, + "grad_norm": 0.9211083054542542, + "learning_rate": 9.965083240375806e-06, + "loss": 0.8939, + "step": 1464 + }, + { + "epoch": 0.08063184545104299, + "grad_norm": 0.9503418207168579, + "learning_rate": 9.965032083794856e-06, + "loss": 0.8544, + "step": 1465 + }, + { + "epoch": 0.08068688425339865, + "grad_norm": 0.9304021596908569, + "learning_rate": 9.964980889898055e-06, + "loss": 0.9192, + "step": 1466 + }, + { + "epoch": 0.08074192305575431, + "grad_norm": 0.8430425524711609, + "learning_rate": 9.964929658685787e-06, + "loss": 0.8586, + "step": 1467 + }, + { + "epoch": 0.08079696185810997, + "grad_norm": 0.8671759366989136, + "learning_rate": 9.964878390158437e-06, + "loss": 0.8807, + "step": 1468 + }, + { + "epoch": 0.08085200066046563, + "grad_norm": 0.9548830986022949, + "learning_rate": 9.964827084316389e-06, + "loss": 0.9033, + "step": 1469 + }, + { + "epoch": 0.0809070394628213, + "grad_norm": 0.8736767768859863, + "learning_rate": 9.964775741160029e-06, + "loss": 0.8509, + "step": 1470 + }, + { + "epoch": 0.08096207826517696, + "grad_norm": 0.8827025890350342, + "learning_rate": 9.964724360689745e-06, + "loss": 0.897, + "step": 1471 + }, + { + "epoch": 0.0810171170675326, + "grad_norm": 1.02822744846344, + "learning_rate": 9.964672942905921e-06, + "loss": 1.0371, + "step": 1472 + }, + { + "epoch": 0.08107215586988827, + "grad_norm": 0.8619557619094849, + "learning_rate": 9.964621487808946e-06, + "loss": 0.7654, + "step": 1473 + }, + { + "epoch": 0.08112719467224393, + "grad_norm": 0.7855951189994812, + "learning_rate": 9.9645699953992e-06, + "loss": 0.7767, + "step": 1474 + }, + { + "epoch": 0.08118223347459959, + "grad_norm": 0.8139809370040894, + "learning_rate": 9.96451846567708e-06, + "loss": 0.7535, + "step": 1475 + }, + { + "epoch": 0.08123727227695525, + "grad_norm": 0.8491657376289368, + "learning_rate": 9.964466898642966e-06, + "loss": 0.854, + "step": 1476 + }, + { + "epoch": 0.08129231107931091, + "grad_norm": 0.8968605399131775, + "learning_rate": 9.964415294297247e-06, + "loss": 0.8914, + "step": 1477 + }, + { + "epoch": 0.08134734988166657, + "grad_norm": 0.8692505359649658, + "learning_rate": 9.964363652640313e-06, + "loss": 0.9245, + "step": 1478 + }, + { + "epoch": 0.08140238868402223, + "grad_norm": 0.8916530013084412, + "learning_rate": 9.964311973672549e-06, + "loss": 0.7662, + "step": 1479 + }, + { + "epoch": 0.0814574274863779, + "grad_norm": 0.8239215612411499, + "learning_rate": 9.964260257394347e-06, + "loss": 0.9191, + "step": 1480 + }, + { + "epoch": 0.08151246628873356, + "grad_norm": 0.8672100901603699, + "learning_rate": 9.964208503806092e-06, + "loss": 0.7656, + "step": 1481 + }, + { + "epoch": 0.08156750509108922, + "grad_norm": 0.9195712208747864, + "learning_rate": 9.964156712908177e-06, + "loss": 0.8656, + "step": 1482 + }, + { + "epoch": 0.08162254389344488, + "grad_norm": 0.8282535672187805, + "learning_rate": 9.964104884700986e-06, + "loss": 0.8264, + "step": 1483 + }, + { + "epoch": 0.08167758269580054, + "grad_norm": 0.8492032289505005, + "learning_rate": 9.964053019184913e-06, + "loss": 0.7816, + "step": 1484 + }, + { + "epoch": 0.0817326214981562, + "grad_norm": 0.8491117358207703, + "learning_rate": 9.964001116360347e-06, + "loss": 0.7885, + "step": 1485 + }, + { + "epoch": 0.08178766030051186, + "grad_norm": 0.9415153861045837, + "learning_rate": 9.963949176227677e-06, + "loss": 0.8165, + "step": 1486 + }, + { + "epoch": 0.08184269910286752, + "grad_norm": 0.8462526202201843, + "learning_rate": 9.963897198787294e-06, + "loss": 0.8498, + "step": 1487 + }, + { + "epoch": 0.08189773790522319, + "grad_norm": 0.8591959476470947, + "learning_rate": 9.963845184039586e-06, + "loss": 0.8906, + "step": 1488 + }, + { + "epoch": 0.08195277670757885, + "grad_norm": 0.840761661529541, + "learning_rate": 9.963793131984949e-06, + "loss": 0.7831, + "step": 1489 + }, + { + "epoch": 0.08200781550993451, + "grad_norm": 0.931404173374176, + "learning_rate": 9.96374104262377e-06, + "loss": 0.889, + "step": 1490 + }, + { + "epoch": 0.08206285431229017, + "grad_norm": 0.9048783779144287, + "learning_rate": 9.963688915956443e-06, + "loss": 0.8321, + "step": 1491 + }, + { + "epoch": 0.08211789311464583, + "grad_norm": 0.9145931601524353, + "learning_rate": 9.96363675198336e-06, + "loss": 0.9918, + "step": 1492 + }, + { + "epoch": 0.08217293191700148, + "grad_norm": 0.9256643652915955, + "learning_rate": 9.963584550704908e-06, + "loss": 0.8731, + "step": 1493 + }, + { + "epoch": 0.08222797071935714, + "grad_norm": 1.0212007761001587, + "learning_rate": 9.963532312121486e-06, + "loss": 0.9077, + "step": 1494 + }, + { + "epoch": 0.0822830095217128, + "grad_norm": 0.9206242561340332, + "learning_rate": 9.963480036233483e-06, + "loss": 0.9076, + "step": 1495 + }, + { + "epoch": 0.08233804832406846, + "grad_norm": 0.8846865296363831, + "learning_rate": 9.963427723041294e-06, + "loss": 0.6826, + "step": 1496 + }, + { + "epoch": 0.08239308712642412, + "grad_norm": 0.8745351433753967, + "learning_rate": 9.963375372545309e-06, + "loss": 0.7935, + "step": 1497 + }, + { + "epoch": 0.08244812592877979, + "grad_norm": 0.9019666314125061, + "learning_rate": 9.963322984745924e-06, + "loss": 0.8435, + "step": 1498 + }, + { + "epoch": 0.08250316473113545, + "grad_norm": 0.8586859703063965, + "learning_rate": 9.963270559643531e-06, + "loss": 0.8118, + "step": 1499 + }, + { + "epoch": 0.08255820353349111, + "grad_norm": 0.9192817807197571, + "learning_rate": 9.963218097238528e-06, + "loss": 0.824, + "step": 1500 + }, + { + "epoch": 0.08261324233584677, + "grad_norm": 0.8972243070602417, + "learning_rate": 9.963165597531304e-06, + "loss": 0.8404, + "step": 1501 + }, + { + "epoch": 0.08266828113820243, + "grad_norm": 0.8953961133956909, + "learning_rate": 9.963113060522256e-06, + "loss": 0.9031, + "step": 1502 + }, + { + "epoch": 0.0827233199405581, + "grad_norm": 0.9551270604133606, + "learning_rate": 9.963060486211779e-06, + "loss": 0.9177, + "step": 1503 + }, + { + "epoch": 0.08277835874291375, + "grad_norm": 0.8524616956710815, + "learning_rate": 9.963007874600268e-06, + "loss": 0.8582, + "step": 1504 + }, + { + "epoch": 0.08283339754526942, + "grad_norm": 0.8148764371871948, + "learning_rate": 9.962955225688118e-06, + "loss": 0.6859, + "step": 1505 + }, + { + "epoch": 0.08288843634762508, + "grad_norm": 0.9110590219497681, + "learning_rate": 9.962902539475728e-06, + "loss": 0.7189, + "step": 1506 + }, + { + "epoch": 0.08294347514998074, + "grad_norm": 0.8700116872787476, + "learning_rate": 9.962849815963487e-06, + "loss": 0.9462, + "step": 1507 + }, + { + "epoch": 0.0829985139523364, + "grad_norm": 0.877109706401825, + "learning_rate": 9.962797055151797e-06, + "loss": 0.8138, + "step": 1508 + }, + { + "epoch": 0.08305355275469206, + "grad_norm": 0.7818365097045898, + "learning_rate": 9.962744257041053e-06, + "loss": 0.8474, + "step": 1509 + }, + { + "epoch": 0.08310859155704772, + "grad_norm": 0.88360196352005, + "learning_rate": 9.96269142163165e-06, + "loss": 0.8724, + "step": 1510 + }, + { + "epoch": 0.08316363035940338, + "grad_norm": 0.8982682228088379, + "learning_rate": 9.962638548923988e-06, + "loss": 0.9687, + "step": 1511 + }, + { + "epoch": 0.08321866916175905, + "grad_norm": 0.7362002730369568, + "learning_rate": 9.962585638918462e-06, + "loss": 0.7666, + "step": 1512 + }, + { + "epoch": 0.08327370796411471, + "grad_norm": 1.0993375778198242, + "learning_rate": 9.962532691615472e-06, + "loss": 0.8869, + "step": 1513 + }, + { + "epoch": 0.08332874676647037, + "grad_norm": 0.8684842586517334, + "learning_rate": 9.962479707015415e-06, + "loss": 0.872, + "step": 1514 + }, + { + "epoch": 0.08338378556882602, + "grad_norm": 1.0598478317260742, + "learning_rate": 9.962426685118689e-06, + "loss": 0.9102, + "step": 1515 + }, + { + "epoch": 0.08343882437118168, + "grad_norm": 0.8492125272750854, + "learning_rate": 9.96237362592569e-06, + "loss": 0.7554, + "step": 1516 + }, + { + "epoch": 0.08349386317353734, + "grad_norm": 0.8489052653312683, + "learning_rate": 9.962320529436821e-06, + "loss": 0.9139, + "step": 1517 + }, + { + "epoch": 0.083548901975893, + "grad_norm": 0.8650774359703064, + "learning_rate": 9.962267395652479e-06, + "loss": 0.8717, + "step": 1518 + }, + { + "epoch": 0.08360394077824866, + "grad_norm": 0.8393206596374512, + "learning_rate": 9.962214224573064e-06, + "loss": 0.8256, + "step": 1519 + }, + { + "epoch": 0.08365897958060432, + "grad_norm": 0.8304896354675293, + "learning_rate": 9.962161016198974e-06, + "loss": 0.8232, + "step": 1520 + }, + { + "epoch": 0.08371401838295998, + "grad_norm": 0.8718386292457581, + "learning_rate": 9.962107770530612e-06, + "loss": 0.8206, + "step": 1521 + }, + { + "epoch": 0.08376905718531565, + "grad_norm": 0.9109341502189636, + "learning_rate": 9.962054487568373e-06, + "loss": 0.9576, + "step": 1522 + }, + { + "epoch": 0.08382409598767131, + "grad_norm": 0.9543303847312927, + "learning_rate": 9.962001167312663e-06, + "loss": 0.8816, + "step": 1523 + }, + { + "epoch": 0.08387913479002697, + "grad_norm": 0.9992844462394714, + "learning_rate": 9.961947809763881e-06, + "loss": 0.8682, + "step": 1524 + }, + { + "epoch": 0.08393417359238263, + "grad_norm": 0.8092770576477051, + "learning_rate": 9.961894414922425e-06, + "loss": 0.6352, + "step": 1525 + }, + { + "epoch": 0.08398921239473829, + "grad_norm": 0.9888653755187988, + "learning_rate": 9.961840982788703e-06, + "loss": 0.8721, + "step": 1526 + }, + { + "epoch": 0.08404425119709395, + "grad_norm": 1.0092703104019165, + "learning_rate": 9.961787513363108e-06, + "loss": 0.7776, + "step": 1527 + }, + { + "epoch": 0.08409928999944961, + "grad_norm": 0.8654646277427673, + "learning_rate": 9.961734006646049e-06, + "loss": 0.8835, + "step": 1528 + }, + { + "epoch": 0.08415432880180528, + "grad_norm": 0.7630153298377991, + "learning_rate": 9.961680462637924e-06, + "loss": 0.7501, + "step": 1529 + }, + { + "epoch": 0.08420936760416094, + "grad_norm": 1.1883158683776855, + "learning_rate": 9.961626881339138e-06, + "loss": 0.9476, + "step": 1530 + }, + { + "epoch": 0.0842644064065166, + "grad_norm": 0.8710927963256836, + "learning_rate": 9.96157326275009e-06, + "loss": 0.749, + "step": 1531 + }, + { + "epoch": 0.08431944520887226, + "grad_norm": 0.9500633478164673, + "learning_rate": 9.961519606871188e-06, + "loss": 0.8994, + "step": 1532 + }, + { + "epoch": 0.08437448401122792, + "grad_norm": 0.873257577419281, + "learning_rate": 9.961465913702833e-06, + "loss": 0.816, + "step": 1533 + }, + { + "epoch": 0.08442952281358358, + "grad_norm": 0.8007022142410278, + "learning_rate": 9.961412183245426e-06, + "loss": 0.787, + "step": 1534 + }, + { + "epoch": 0.08448456161593924, + "grad_norm": 0.8998435139656067, + "learning_rate": 9.961358415499374e-06, + "loss": 0.8741, + "step": 1535 + }, + { + "epoch": 0.08453960041829489, + "grad_norm": 0.9152502417564392, + "learning_rate": 9.961304610465081e-06, + "loss": 0.9749, + "step": 1536 + }, + { + "epoch": 0.08459463922065055, + "grad_norm": 0.8961958289146423, + "learning_rate": 9.961250768142949e-06, + "loss": 0.8683, + "step": 1537 + }, + { + "epoch": 0.08464967802300621, + "grad_norm": 0.8683995008468628, + "learning_rate": 9.961196888533387e-06, + "loss": 0.8347, + "step": 1538 + }, + { + "epoch": 0.08470471682536188, + "grad_norm": 0.835221529006958, + "learning_rate": 9.961142971636795e-06, + "loss": 0.8936, + "step": 1539 + }, + { + "epoch": 0.08475975562771754, + "grad_norm": 0.8666725158691406, + "learning_rate": 9.96108901745358e-06, + "loss": 0.7344, + "step": 1540 + }, + { + "epoch": 0.0848147944300732, + "grad_norm": 0.9509082436561584, + "learning_rate": 9.96103502598415e-06, + "loss": 0.8965, + "step": 1541 + }, + { + "epoch": 0.08486983323242886, + "grad_norm": 0.8134233951568604, + "learning_rate": 9.960980997228908e-06, + "loss": 0.797, + "step": 1542 + }, + { + "epoch": 0.08492487203478452, + "grad_norm": 1.0432242155075073, + "learning_rate": 9.96092693118826e-06, + "loss": 0.8754, + "step": 1543 + }, + { + "epoch": 0.08497991083714018, + "grad_norm": 0.9560218453407288, + "learning_rate": 9.960872827862613e-06, + "loss": 0.9238, + "step": 1544 + }, + { + "epoch": 0.08503494963949584, + "grad_norm": 0.8471649885177612, + "learning_rate": 9.960818687252374e-06, + "loss": 0.8622, + "step": 1545 + }, + { + "epoch": 0.0850899884418515, + "grad_norm": 1.2584747076034546, + "learning_rate": 9.960764509357951e-06, + "loss": 0.8007, + "step": 1546 + }, + { + "epoch": 0.08514502724420717, + "grad_norm": 0.8730618953704834, + "learning_rate": 9.960710294179748e-06, + "loss": 0.7412, + "step": 1547 + }, + { + "epoch": 0.08520006604656283, + "grad_norm": 0.8361592292785645, + "learning_rate": 9.960656041718176e-06, + "loss": 0.7018, + "step": 1548 + }, + { + "epoch": 0.08525510484891849, + "grad_norm": 0.8351722359657288, + "learning_rate": 9.96060175197364e-06, + "loss": 0.843, + "step": 1549 + }, + { + "epoch": 0.08531014365127415, + "grad_norm": 0.8665090203285217, + "learning_rate": 9.960547424946549e-06, + "loss": 0.8235, + "step": 1550 + }, + { + "epoch": 0.08536518245362981, + "grad_norm": 0.9254478812217712, + "learning_rate": 9.960493060637313e-06, + "loss": 0.8122, + "step": 1551 + }, + { + "epoch": 0.08542022125598547, + "grad_norm": 0.8712261319160461, + "learning_rate": 9.960438659046337e-06, + "loss": 0.823, + "step": 1552 + }, + { + "epoch": 0.08547526005834113, + "grad_norm": 0.9027207493782043, + "learning_rate": 9.960384220174033e-06, + "loss": 0.7964, + "step": 1553 + }, + { + "epoch": 0.0855302988606968, + "grad_norm": 0.854626476764679, + "learning_rate": 9.960329744020808e-06, + "loss": 0.755, + "step": 1554 + }, + { + "epoch": 0.08558533766305246, + "grad_norm": 0.9398048520088196, + "learning_rate": 9.960275230587073e-06, + "loss": 0.8607, + "step": 1555 + }, + { + "epoch": 0.08564037646540812, + "grad_norm": 1.008002758026123, + "learning_rate": 9.960220679873238e-06, + "loss": 0.9711, + "step": 1556 + }, + { + "epoch": 0.08569541526776378, + "grad_norm": 0.8999453783035278, + "learning_rate": 9.96016609187971e-06, + "loss": 0.8233, + "step": 1557 + }, + { + "epoch": 0.08575045407011943, + "grad_norm": 0.8912106156349182, + "learning_rate": 9.960111466606903e-06, + "loss": 0.8271, + "step": 1558 + }, + { + "epoch": 0.08580549287247509, + "grad_norm": 0.9269998073577881, + "learning_rate": 9.960056804055227e-06, + "loss": 0.7959, + "step": 1559 + }, + { + "epoch": 0.08586053167483075, + "grad_norm": 1.083815336227417, + "learning_rate": 9.96000210422509e-06, + "loss": 0.9436, + "step": 1560 + }, + { + "epoch": 0.08591557047718641, + "grad_norm": 0.8906280398368835, + "learning_rate": 9.959947367116905e-06, + "loss": 0.9317, + "step": 1561 + }, + { + "epoch": 0.08597060927954207, + "grad_norm": 1.211696743965149, + "learning_rate": 9.959892592731084e-06, + "loss": 0.9076, + "step": 1562 + }, + { + "epoch": 0.08602564808189773, + "grad_norm": 0.9050534963607788, + "learning_rate": 9.959837781068038e-06, + "loss": 0.8728, + "step": 1563 + }, + { + "epoch": 0.0860806868842534, + "grad_norm": 0.9384796619415283, + "learning_rate": 9.959782932128178e-06, + "loss": 0.9277, + "step": 1564 + }, + { + "epoch": 0.08613572568660906, + "grad_norm": 0.795844316482544, + "learning_rate": 9.959728045911915e-06, + "loss": 0.7666, + "step": 1565 + }, + { + "epoch": 0.08619076448896472, + "grad_norm": 0.925956666469574, + "learning_rate": 9.959673122419668e-06, + "loss": 0.815, + "step": 1566 + }, + { + "epoch": 0.08624580329132038, + "grad_norm": 0.898047924041748, + "learning_rate": 9.959618161651843e-06, + "loss": 0.8131, + "step": 1567 + }, + { + "epoch": 0.08630084209367604, + "grad_norm": 0.8656220436096191, + "learning_rate": 9.959563163608856e-06, + "loss": 0.9336, + "step": 1568 + }, + { + "epoch": 0.0863558808960317, + "grad_norm": 0.9184645414352417, + "learning_rate": 9.95950812829112e-06, + "loss": 0.9557, + "step": 1569 + }, + { + "epoch": 0.08641091969838736, + "grad_norm": 0.8607667684555054, + "learning_rate": 9.959453055699048e-06, + "loss": 0.8272, + "step": 1570 + }, + { + "epoch": 0.08646595850074303, + "grad_norm": 0.9561272263526917, + "learning_rate": 9.959397945833056e-06, + "loss": 0.8876, + "step": 1571 + }, + { + "epoch": 0.08652099730309869, + "grad_norm": 0.8562412261962891, + "learning_rate": 9.959342798693556e-06, + "loss": 0.8404, + "step": 1572 + }, + { + "epoch": 0.08657603610545435, + "grad_norm": 0.8924610614776611, + "learning_rate": 9.95928761428096e-06, + "loss": 0.8779, + "step": 1573 + }, + { + "epoch": 0.08663107490781001, + "grad_norm": 0.8343208432197571, + "learning_rate": 9.95923239259569e-06, + "loss": 0.8992, + "step": 1574 + }, + { + "epoch": 0.08668611371016567, + "grad_norm": 0.8835015296936035, + "learning_rate": 9.959177133638155e-06, + "loss": 1.0026, + "step": 1575 + }, + { + "epoch": 0.08674115251252133, + "grad_norm": 0.9540221095085144, + "learning_rate": 9.959121837408771e-06, + "loss": 0.8507, + "step": 1576 + }, + { + "epoch": 0.086796191314877, + "grad_norm": 1.087817668914795, + "learning_rate": 9.959066503907957e-06, + "loss": 0.8607, + "step": 1577 + }, + { + "epoch": 0.08685123011723266, + "grad_norm": 0.8072447180747986, + "learning_rate": 9.959011133136124e-06, + "loss": 0.882, + "step": 1578 + }, + { + "epoch": 0.0869062689195883, + "grad_norm": 0.7646876573562622, + "learning_rate": 9.958955725093694e-06, + "loss": 0.7653, + "step": 1579 + }, + { + "epoch": 0.08696130772194396, + "grad_norm": 0.8979537487030029, + "learning_rate": 9.958900279781078e-06, + "loss": 0.9033, + "step": 1580 + }, + { + "epoch": 0.08701634652429963, + "grad_norm": 0.9445611834526062, + "learning_rate": 9.958844797198696e-06, + "loss": 0.9423, + "step": 1581 + }, + { + "epoch": 0.08707138532665529, + "grad_norm": 0.8836671113967896, + "learning_rate": 9.958789277346963e-06, + "loss": 0.839, + "step": 1582 + }, + { + "epoch": 0.08712642412901095, + "grad_norm": 1.0333542823791504, + "learning_rate": 9.958733720226296e-06, + "loss": 0.9211, + "step": 1583 + }, + { + "epoch": 0.08718146293136661, + "grad_norm": 0.8084085583686829, + "learning_rate": 9.958678125837117e-06, + "loss": 0.8387, + "step": 1584 + }, + { + "epoch": 0.08723650173372227, + "grad_norm": 0.7769419550895691, + "learning_rate": 9.958622494179838e-06, + "loss": 0.8307, + "step": 1585 + }, + { + "epoch": 0.08729154053607793, + "grad_norm": 0.8387578129768372, + "learning_rate": 9.95856682525488e-06, + "loss": 0.8001, + "step": 1586 + }, + { + "epoch": 0.0873465793384336, + "grad_norm": 0.8989812731742859, + "learning_rate": 9.95851111906266e-06, + "loss": 0.7752, + "step": 1587 + }, + { + "epoch": 0.08740161814078926, + "grad_norm": 0.8558734655380249, + "learning_rate": 9.958455375603602e-06, + "loss": 0.8149, + "step": 1588 + }, + { + "epoch": 0.08745665694314492, + "grad_norm": 0.8890896439552307, + "learning_rate": 9.958399594878117e-06, + "loss": 0.8232, + "step": 1589 + }, + { + "epoch": 0.08751169574550058, + "grad_norm": 0.875912070274353, + "learning_rate": 9.95834377688663e-06, + "loss": 0.7458, + "step": 1590 + }, + { + "epoch": 0.08756673454785624, + "grad_norm": 0.808355987071991, + "learning_rate": 9.958287921629557e-06, + "loss": 0.8296, + "step": 1591 + }, + { + "epoch": 0.0876217733502119, + "grad_norm": 0.9637090563774109, + "learning_rate": 9.958232029107318e-06, + "loss": 0.8769, + "step": 1592 + }, + { + "epoch": 0.08767681215256756, + "grad_norm": 0.8980715870857239, + "learning_rate": 9.958176099320336e-06, + "loss": 0.7995, + "step": 1593 + }, + { + "epoch": 0.08773185095492322, + "grad_norm": 0.9369860291481018, + "learning_rate": 9.95812013226903e-06, + "loss": 0.8545, + "step": 1594 + }, + { + "epoch": 0.08778688975727889, + "grad_norm": 0.8589349389076233, + "learning_rate": 9.958064127953819e-06, + "loss": 0.8693, + "step": 1595 + }, + { + "epoch": 0.08784192855963455, + "grad_norm": 0.929207444190979, + "learning_rate": 9.958008086375126e-06, + "loss": 0.811, + "step": 1596 + }, + { + "epoch": 0.08789696736199021, + "grad_norm": 1.0825661420822144, + "learning_rate": 9.957952007533371e-06, + "loss": 1.0145, + "step": 1597 + }, + { + "epoch": 0.08795200616434587, + "grad_norm": 0.8818382024765015, + "learning_rate": 9.957895891428978e-06, + "loss": 0.7771, + "step": 1598 + }, + { + "epoch": 0.08800704496670153, + "grad_norm": 0.882780909538269, + "learning_rate": 9.957839738062363e-06, + "loss": 0.8857, + "step": 1599 + }, + { + "epoch": 0.08806208376905718, + "grad_norm": 0.9136924743652344, + "learning_rate": 9.957783547433955e-06, + "loss": 0.8873, + "step": 1600 + }, + { + "epoch": 0.08811712257141284, + "grad_norm": 0.8896858096122742, + "learning_rate": 9.95772731954417e-06, + "loss": 0.8463, + "step": 1601 + }, + { + "epoch": 0.0881721613737685, + "grad_norm": 0.8671631813049316, + "learning_rate": 9.957671054393436e-06, + "loss": 0.8333, + "step": 1602 + }, + { + "epoch": 0.08822720017612416, + "grad_norm": 0.9442896246910095, + "learning_rate": 9.957614751982172e-06, + "loss": 0.9676, + "step": 1603 + }, + { + "epoch": 0.08828223897847982, + "grad_norm": 0.8249240517616272, + "learning_rate": 9.957558412310803e-06, + "loss": 0.7746, + "step": 1604 + }, + { + "epoch": 0.08833727778083549, + "grad_norm": 0.8125253319740295, + "learning_rate": 9.957502035379751e-06, + "loss": 0.7816, + "step": 1605 + }, + { + "epoch": 0.08839231658319115, + "grad_norm": 0.8467233777046204, + "learning_rate": 9.957445621189442e-06, + "loss": 0.7697, + "step": 1606 + }, + { + "epoch": 0.08844735538554681, + "grad_norm": 0.8322175145149231, + "learning_rate": 9.957389169740299e-06, + "loss": 0.7561, + "step": 1607 + }, + { + "epoch": 0.08850239418790247, + "grad_norm": 0.869163453578949, + "learning_rate": 9.957332681032746e-06, + "loss": 0.8984, + "step": 1608 + }, + { + "epoch": 0.08855743299025813, + "grad_norm": 0.8755944967269897, + "learning_rate": 9.957276155067206e-06, + "loss": 0.8016, + "step": 1609 + }, + { + "epoch": 0.08861247179261379, + "grad_norm": 0.8152669668197632, + "learning_rate": 9.957219591844108e-06, + "loss": 0.7763, + "step": 1610 + }, + { + "epoch": 0.08866751059496945, + "grad_norm": 0.979752779006958, + "learning_rate": 9.957162991363871e-06, + "loss": 0.7755, + "step": 1611 + }, + { + "epoch": 0.08872254939732512, + "grad_norm": 1.0481054782867432, + "learning_rate": 9.957106353626926e-06, + "loss": 0.9395, + "step": 1612 + }, + { + "epoch": 0.08877758819968078, + "grad_norm": 0.7773686647415161, + "learning_rate": 9.957049678633697e-06, + "loss": 0.7713, + "step": 1613 + }, + { + "epoch": 0.08883262700203644, + "grad_norm": 0.838979959487915, + "learning_rate": 9.956992966384609e-06, + "loss": 0.7909, + "step": 1614 + }, + { + "epoch": 0.0888876658043921, + "grad_norm": 0.9527049660682678, + "learning_rate": 9.956936216880089e-06, + "loss": 0.7944, + "step": 1615 + }, + { + "epoch": 0.08894270460674776, + "grad_norm": 0.7967305183410645, + "learning_rate": 9.956879430120561e-06, + "loss": 0.7703, + "step": 1616 + }, + { + "epoch": 0.08899774340910342, + "grad_norm": 0.9065802097320557, + "learning_rate": 9.956822606106456e-06, + "loss": 0.8188, + "step": 1617 + }, + { + "epoch": 0.08905278221145908, + "grad_norm": 0.7329322099685669, + "learning_rate": 9.956765744838199e-06, + "loss": 0.8043, + "step": 1618 + }, + { + "epoch": 0.08910782101381474, + "grad_norm": 0.864973247051239, + "learning_rate": 9.95670884631622e-06, + "loss": 0.8334, + "step": 1619 + }, + { + "epoch": 0.0891628598161704, + "grad_norm": 1.073559045791626, + "learning_rate": 9.95665191054094e-06, + "loss": 0.7755, + "step": 1620 + }, + { + "epoch": 0.08921789861852607, + "grad_norm": 0.7347918748855591, + "learning_rate": 9.956594937512794e-06, + "loss": 0.7556, + "step": 1621 + }, + { + "epoch": 0.08927293742088172, + "grad_norm": 0.8756610751152039, + "learning_rate": 9.956537927232205e-06, + "loss": 0.8129, + "step": 1622 + }, + { + "epoch": 0.08932797622323738, + "grad_norm": 0.9132435917854309, + "learning_rate": 9.956480879699605e-06, + "loss": 0.8221, + "step": 1623 + }, + { + "epoch": 0.08938301502559304, + "grad_norm": 1.1978256702423096, + "learning_rate": 9.956423794915421e-06, + "loss": 0.8651, + "step": 1624 + }, + { + "epoch": 0.0894380538279487, + "grad_norm": 0.8493894934654236, + "learning_rate": 9.956366672880082e-06, + "loss": 0.7267, + "step": 1625 + }, + { + "epoch": 0.08949309263030436, + "grad_norm": 1.0971951484680176, + "learning_rate": 9.956309513594019e-06, + "loss": 0.7852, + "step": 1626 + }, + { + "epoch": 0.08954813143266002, + "grad_norm": 0.899974524974823, + "learning_rate": 9.95625231705766e-06, + "loss": 0.8868, + "step": 1627 + }, + { + "epoch": 0.08960317023501568, + "grad_norm": 0.8995566368103027, + "learning_rate": 9.956195083271436e-06, + "loss": 0.87, + "step": 1628 + }, + { + "epoch": 0.08965820903737134, + "grad_norm": 0.8924218416213989, + "learning_rate": 9.956137812235776e-06, + "loss": 0.7885, + "step": 1629 + }, + { + "epoch": 0.089713247839727, + "grad_norm": 0.9232820868492126, + "learning_rate": 9.956080503951108e-06, + "loss": 0.7923, + "step": 1630 + }, + { + "epoch": 0.08976828664208267, + "grad_norm": 0.9298982620239258, + "learning_rate": 9.956023158417869e-06, + "loss": 0.8625, + "step": 1631 + }, + { + "epoch": 0.08982332544443833, + "grad_norm": 0.86515212059021, + "learning_rate": 9.955965775636488e-06, + "loss": 0.7683, + "step": 1632 + }, + { + "epoch": 0.08987836424679399, + "grad_norm": 0.8016952276229858, + "learning_rate": 9.955908355607392e-06, + "loss": 0.8122, + "step": 1633 + }, + { + "epoch": 0.08993340304914965, + "grad_norm": 0.842703640460968, + "learning_rate": 9.955850898331015e-06, + "loss": 0.8487, + "step": 1634 + }, + { + "epoch": 0.08998844185150531, + "grad_norm": 0.8239083886146545, + "learning_rate": 9.95579340380779e-06, + "loss": 0.8701, + "step": 1635 + }, + { + "epoch": 0.09004348065386097, + "grad_norm": 0.8575418591499329, + "learning_rate": 9.955735872038149e-06, + "loss": 0.8263, + "step": 1636 + }, + { + "epoch": 0.09009851945621664, + "grad_norm": 0.8884586095809937, + "learning_rate": 9.955678303022522e-06, + "loss": 0.8112, + "step": 1637 + }, + { + "epoch": 0.0901535582585723, + "grad_norm": 0.9024681448936462, + "learning_rate": 9.955620696761345e-06, + "loss": 0.9174, + "step": 1638 + }, + { + "epoch": 0.09020859706092796, + "grad_norm": 0.8151944875717163, + "learning_rate": 9.955563053255049e-06, + "loss": 0.806, + "step": 1639 + }, + { + "epoch": 0.09026363586328362, + "grad_norm": 0.8292184472084045, + "learning_rate": 9.955505372504069e-06, + "loss": 0.8007, + "step": 1640 + }, + { + "epoch": 0.09031867466563928, + "grad_norm": 0.9445936679840088, + "learning_rate": 9.955447654508835e-06, + "loss": 0.7089, + "step": 1641 + }, + { + "epoch": 0.09037371346799494, + "grad_norm": 0.781579315662384, + "learning_rate": 9.955389899269782e-06, + "loss": 0.8224, + "step": 1642 + }, + { + "epoch": 0.09042875227035059, + "grad_norm": 0.9028880596160889, + "learning_rate": 9.955332106787348e-06, + "loss": 0.7976, + "step": 1643 + }, + { + "epoch": 0.09048379107270625, + "grad_norm": 1.0336887836456299, + "learning_rate": 9.955274277061963e-06, + "loss": 0.9296, + "step": 1644 + }, + { + "epoch": 0.09053882987506191, + "grad_norm": 0.8894197940826416, + "learning_rate": 9.955216410094062e-06, + "loss": 0.815, + "step": 1645 + }, + { + "epoch": 0.09059386867741757, + "grad_norm": 0.8955528140068054, + "learning_rate": 9.955158505884083e-06, + "loss": 0.8707, + "step": 1646 + }, + { + "epoch": 0.09064890747977324, + "grad_norm": 0.8012683987617493, + "learning_rate": 9.955100564432458e-06, + "loss": 0.7467, + "step": 1647 + }, + { + "epoch": 0.0907039462821289, + "grad_norm": 0.917969286441803, + "learning_rate": 9.955042585739623e-06, + "loss": 0.8835, + "step": 1648 + }, + { + "epoch": 0.09075898508448456, + "grad_norm": 0.8066666722297668, + "learning_rate": 9.954984569806014e-06, + "loss": 0.8338, + "step": 1649 + }, + { + "epoch": 0.09081402388684022, + "grad_norm": 1.1324070692062378, + "learning_rate": 9.954926516632069e-06, + "loss": 0.8245, + "step": 1650 + }, + { + "epoch": 0.09086906268919588, + "grad_norm": 0.8196014761924744, + "learning_rate": 9.954868426218222e-06, + "loss": 0.7897, + "step": 1651 + }, + { + "epoch": 0.09092410149155154, + "grad_norm": 0.8713478446006775, + "learning_rate": 9.95481029856491e-06, + "loss": 0.891, + "step": 1652 + }, + { + "epoch": 0.0909791402939072, + "grad_norm": 0.8489059805870056, + "learning_rate": 9.954752133672569e-06, + "loss": 0.7748, + "step": 1653 + }, + { + "epoch": 0.09103417909626287, + "grad_norm": 0.8914602994918823, + "learning_rate": 9.954693931541638e-06, + "loss": 0.8657, + "step": 1654 + }, + { + "epoch": 0.09108921789861853, + "grad_norm": 0.9031614661216736, + "learning_rate": 9.954635692172555e-06, + "loss": 0.7409, + "step": 1655 + }, + { + "epoch": 0.09114425670097419, + "grad_norm": 0.8680000305175781, + "learning_rate": 9.954577415565756e-06, + "loss": 0.8535, + "step": 1656 + }, + { + "epoch": 0.09119929550332985, + "grad_norm": 0.830596923828125, + "learning_rate": 9.954519101721679e-06, + "loss": 0.8601, + "step": 1657 + }, + { + "epoch": 0.09125433430568551, + "grad_norm": 0.9041332602500916, + "learning_rate": 9.954460750640762e-06, + "loss": 0.9104, + "step": 1658 + }, + { + "epoch": 0.09130937310804117, + "grad_norm": 0.7786296606063843, + "learning_rate": 9.954402362323445e-06, + "loss": 0.7671, + "step": 1659 + }, + { + "epoch": 0.09136441191039683, + "grad_norm": 1.0363564491271973, + "learning_rate": 9.954343936770165e-06, + "loss": 0.9339, + "step": 1660 + }, + { + "epoch": 0.0914194507127525, + "grad_norm": 0.8049986958503723, + "learning_rate": 9.954285473981363e-06, + "loss": 0.8125, + "step": 1661 + }, + { + "epoch": 0.09147448951510816, + "grad_norm": 0.7842011451721191, + "learning_rate": 9.954226973957477e-06, + "loss": 0.7153, + "step": 1662 + }, + { + "epoch": 0.09152952831746382, + "grad_norm": 0.8929729461669922, + "learning_rate": 9.954168436698948e-06, + "loss": 0.9563, + "step": 1663 + }, + { + "epoch": 0.09158456711981948, + "grad_norm": 0.8850226402282715, + "learning_rate": 9.954109862206216e-06, + "loss": 0.8257, + "step": 1664 + }, + { + "epoch": 0.09163960592217513, + "grad_norm": 0.8673348426818848, + "learning_rate": 9.954051250479719e-06, + "loss": 0.9489, + "step": 1665 + }, + { + "epoch": 0.09169464472453079, + "grad_norm": 0.8726119995117188, + "learning_rate": 9.9539926015199e-06, + "loss": 0.8222, + "step": 1666 + }, + { + "epoch": 0.09174968352688645, + "grad_norm": 0.7609312534332275, + "learning_rate": 9.953933915327196e-06, + "loss": 0.7749, + "step": 1667 + }, + { + "epoch": 0.09180472232924211, + "grad_norm": 0.857404887676239, + "learning_rate": 9.953875191902055e-06, + "loss": 0.8496, + "step": 1668 + }, + { + "epoch": 0.09185976113159777, + "grad_norm": 0.7835526466369629, + "learning_rate": 9.953816431244909e-06, + "loss": 0.7258, + "step": 1669 + }, + { + "epoch": 0.09191479993395343, + "grad_norm": 0.944984495639801, + "learning_rate": 9.95375763335621e-06, + "loss": 0.902, + "step": 1670 + }, + { + "epoch": 0.0919698387363091, + "grad_norm": 0.9038936495780945, + "learning_rate": 9.953698798236391e-06, + "loss": 0.7559, + "step": 1671 + }, + { + "epoch": 0.09202487753866476, + "grad_norm": 0.8450848460197449, + "learning_rate": 9.953639925885898e-06, + "loss": 0.8338, + "step": 1672 + }, + { + "epoch": 0.09207991634102042, + "grad_norm": 0.827419102191925, + "learning_rate": 9.953581016305175e-06, + "loss": 0.8167, + "step": 1673 + }, + { + "epoch": 0.09213495514337608, + "grad_norm": 0.8517075777053833, + "learning_rate": 9.953522069494663e-06, + "loss": 0.8681, + "step": 1674 + }, + { + "epoch": 0.09218999394573174, + "grad_norm": 0.9504323601722717, + "learning_rate": 9.953463085454804e-06, + "loss": 0.8688, + "step": 1675 + }, + { + "epoch": 0.0922450327480874, + "grad_norm": 0.8905719518661499, + "learning_rate": 9.953404064186044e-06, + "loss": 0.8818, + "step": 1676 + }, + { + "epoch": 0.09230007155044306, + "grad_norm": 0.9223340153694153, + "learning_rate": 9.953345005688822e-06, + "loss": 0.8752, + "step": 1677 + }, + { + "epoch": 0.09235511035279872, + "grad_norm": 1.0500547885894775, + "learning_rate": 9.953285909963588e-06, + "loss": 0.7816, + "step": 1678 + }, + { + "epoch": 0.09241014915515439, + "grad_norm": 0.8407441973686218, + "learning_rate": 9.953226777010781e-06, + "loss": 0.745, + "step": 1679 + }, + { + "epoch": 0.09246518795751005, + "grad_norm": 0.7997288107872009, + "learning_rate": 9.953167606830847e-06, + "loss": 0.8171, + "step": 1680 + }, + { + "epoch": 0.09252022675986571, + "grad_norm": 0.9752318859100342, + "learning_rate": 9.953108399424234e-06, + "loss": 0.8719, + "step": 1681 + }, + { + "epoch": 0.09257526556222137, + "grad_norm": 0.8524298667907715, + "learning_rate": 9.953049154791382e-06, + "loss": 0.8257, + "step": 1682 + }, + { + "epoch": 0.09263030436457703, + "grad_norm": 0.9460529088973999, + "learning_rate": 9.952989872932739e-06, + "loss": 0.7278, + "step": 1683 + }, + { + "epoch": 0.0926853431669327, + "grad_norm": 0.8959575891494751, + "learning_rate": 9.95293055384875e-06, + "loss": 0.903, + "step": 1684 + }, + { + "epoch": 0.09274038196928835, + "grad_norm": 0.8764386177062988, + "learning_rate": 9.95287119753986e-06, + "loss": 0.7958, + "step": 1685 + }, + { + "epoch": 0.092795420771644, + "grad_norm": 0.9611337184906006, + "learning_rate": 9.952811804006517e-06, + "loss": 0.8726, + "step": 1686 + }, + { + "epoch": 0.09285045957399966, + "grad_norm": 0.8155574202537537, + "learning_rate": 9.952752373249165e-06, + "loss": 0.7882, + "step": 1687 + }, + { + "epoch": 0.09290549837635532, + "grad_norm": 0.8789697289466858, + "learning_rate": 9.952692905268253e-06, + "loss": 0.8642, + "step": 1688 + }, + { + "epoch": 0.09296053717871099, + "grad_norm": 0.7910027503967285, + "learning_rate": 9.952633400064227e-06, + "loss": 0.7852, + "step": 1689 + }, + { + "epoch": 0.09301557598106665, + "grad_norm": 0.815819501876831, + "learning_rate": 9.952573857637533e-06, + "loss": 0.8606, + "step": 1690 + }, + { + "epoch": 0.09307061478342231, + "grad_norm": 0.9840701818466187, + "learning_rate": 9.95251427798862e-06, + "loss": 0.9349, + "step": 1691 + }, + { + "epoch": 0.09312565358577797, + "grad_norm": 0.8715788722038269, + "learning_rate": 9.952454661117936e-06, + "loss": 0.813, + "step": 1692 + }, + { + "epoch": 0.09318069238813363, + "grad_norm": 0.8287779092788696, + "learning_rate": 9.952395007025926e-06, + "loss": 0.8346, + "step": 1693 + }, + { + "epoch": 0.0932357311904893, + "grad_norm": 0.9375059008598328, + "learning_rate": 9.952335315713044e-06, + "loss": 0.8868, + "step": 1694 + }, + { + "epoch": 0.09329076999284495, + "grad_norm": 0.9063667058944702, + "learning_rate": 9.952275587179734e-06, + "loss": 0.9562, + "step": 1695 + }, + { + "epoch": 0.09334580879520062, + "grad_norm": 0.816643476486206, + "learning_rate": 9.952215821426447e-06, + "loss": 0.7456, + "step": 1696 + }, + { + "epoch": 0.09340084759755628, + "grad_norm": 0.9004347324371338, + "learning_rate": 9.95215601845363e-06, + "loss": 0.8545, + "step": 1697 + }, + { + "epoch": 0.09345588639991194, + "grad_norm": 0.919195830821991, + "learning_rate": 9.952096178261736e-06, + "loss": 0.9347, + "step": 1698 + }, + { + "epoch": 0.0935109252022676, + "grad_norm": 0.8313261866569519, + "learning_rate": 9.952036300851211e-06, + "loss": 0.9169, + "step": 1699 + }, + { + "epoch": 0.09356596400462326, + "grad_norm": 0.8674910664558411, + "learning_rate": 9.951976386222507e-06, + "loss": 0.7621, + "step": 1700 + }, + { + "epoch": 0.09362100280697892, + "grad_norm": 0.8931052684783936, + "learning_rate": 9.951916434376074e-06, + "loss": 0.8702, + "step": 1701 + }, + { + "epoch": 0.09367604160933458, + "grad_norm": 0.8748393058776855, + "learning_rate": 9.951856445312364e-06, + "loss": 0.7446, + "step": 1702 + }, + { + "epoch": 0.09373108041169025, + "grad_norm": 1.005459189414978, + "learning_rate": 9.951796419031825e-06, + "loss": 0.9843, + "step": 1703 + }, + { + "epoch": 0.09378611921404591, + "grad_norm": 1.0155184268951416, + "learning_rate": 9.95173635553491e-06, + "loss": 0.8868, + "step": 1704 + }, + { + "epoch": 0.09384115801640157, + "grad_norm": 2.1387271881103516, + "learning_rate": 9.951676254822072e-06, + "loss": 0.8691, + "step": 1705 + }, + { + "epoch": 0.09389619681875723, + "grad_norm": 0.9768403768539429, + "learning_rate": 9.951616116893757e-06, + "loss": 0.8409, + "step": 1706 + }, + { + "epoch": 0.09395123562111289, + "grad_norm": 0.7994607090950012, + "learning_rate": 9.951555941750424e-06, + "loss": 0.7836, + "step": 1707 + }, + { + "epoch": 0.09400627442346854, + "grad_norm": 0.8460201025009155, + "learning_rate": 9.95149572939252e-06, + "loss": 0.8216, + "step": 1708 + }, + { + "epoch": 0.0940613132258242, + "grad_norm": 0.8904135227203369, + "learning_rate": 9.951435479820499e-06, + "loss": 0.9053, + "step": 1709 + }, + { + "epoch": 0.09411635202817986, + "grad_norm": 0.9084494113922119, + "learning_rate": 9.951375193034815e-06, + "loss": 0.9308, + "step": 1710 + }, + { + "epoch": 0.09417139083053552, + "grad_norm": 1.0826482772827148, + "learning_rate": 9.951314869035921e-06, + "loss": 0.8468, + "step": 1711 + }, + { + "epoch": 0.09422642963289118, + "grad_norm": 0.8068915009498596, + "learning_rate": 9.95125450782427e-06, + "loss": 0.8253, + "step": 1712 + }, + { + "epoch": 0.09428146843524685, + "grad_norm": 0.8445400595664978, + "learning_rate": 9.951194109400316e-06, + "loss": 0.8386, + "step": 1713 + }, + { + "epoch": 0.09433650723760251, + "grad_norm": 0.8180645704269409, + "learning_rate": 9.951133673764513e-06, + "loss": 0.7907, + "step": 1714 + }, + { + "epoch": 0.09439154603995817, + "grad_norm": 0.8111036419868469, + "learning_rate": 9.951073200917311e-06, + "loss": 0.7918, + "step": 1715 + }, + { + "epoch": 0.09444658484231383, + "grad_norm": 0.862042248249054, + "learning_rate": 9.951012690859172e-06, + "loss": 0.783, + "step": 1716 + }, + { + "epoch": 0.09450162364466949, + "grad_norm": 0.8189615607261658, + "learning_rate": 9.950952143590544e-06, + "loss": 0.8192, + "step": 1717 + }, + { + "epoch": 0.09455666244702515, + "grad_norm": 0.9714062809944153, + "learning_rate": 9.950891559111887e-06, + "loss": 0.774, + "step": 1718 + }, + { + "epoch": 0.09461170124938081, + "grad_norm": 0.9691846370697021, + "learning_rate": 9.950830937423655e-06, + "loss": 0.8347, + "step": 1719 + }, + { + "epoch": 0.09466674005173648, + "grad_norm": 0.8488250970840454, + "learning_rate": 9.950770278526301e-06, + "loss": 0.8228, + "step": 1720 + }, + { + "epoch": 0.09472177885409214, + "grad_norm": 0.8638359904289246, + "learning_rate": 9.950709582420282e-06, + "loss": 0.8973, + "step": 1721 + }, + { + "epoch": 0.0947768176564478, + "grad_norm": 1.0148643255233765, + "learning_rate": 9.950648849106058e-06, + "loss": 0.9638, + "step": 1722 + }, + { + "epoch": 0.09483185645880346, + "grad_norm": 0.8870131969451904, + "learning_rate": 9.95058807858408e-06, + "loss": 0.8259, + "step": 1723 + }, + { + "epoch": 0.09488689526115912, + "grad_norm": 0.9134769439697266, + "learning_rate": 9.950527270854807e-06, + "loss": 0.865, + "step": 1724 + }, + { + "epoch": 0.09494193406351478, + "grad_norm": 0.7221654653549194, + "learning_rate": 9.950466425918697e-06, + "loss": 0.7593, + "step": 1725 + }, + { + "epoch": 0.09499697286587044, + "grad_norm": 0.9386674165725708, + "learning_rate": 9.950405543776207e-06, + "loss": 0.9508, + "step": 1726 + }, + { + "epoch": 0.0950520116682261, + "grad_norm": 0.7850627899169922, + "learning_rate": 9.950344624427795e-06, + "loss": 0.7999, + "step": 1727 + }, + { + "epoch": 0.09510705047058177, + "grad_norm": 0.921198308467865, + "learning_rate": 9.950283667873916e-06, + "loss": 0.8249, + "step": 1728 + }, + { + "epoch": 0.09516208927293741, + "grad_norm": 0.9503389000892639, + "learning_rate": 9.95022267411503e-06, + "loss": 0.901, + "step": 1729 + }, + { + "epoch": 0.09521712807529308, + "grad_norm": 0.7977343201637268, + "learning_rate": 9.950161643151597e-06, + "loss": 0.838, + "step": 1730 + }, + { + "epoch": 0.09527216687764874, + "grad_norm": 0.9056238532066345, + "learning_rate": 9.950100574984072e-06, + "loss": 0.9756, + "step": 1731 + }, + { + "epoch": 0.0953272056800044, + "grad_norm": 0.8092935681343079, + "learning_rate": 9.950039469612918e-06, + "loss": 0.8812, + "step": 1732 + }, + { + "epoch": 0.09538224448236006, + "grad_norm": 0.823693573474884, + "learning_rate": 9.949978327038592e-06, + "loss": 0.7914, + "step": 1733 + }, + { + "epoch": 0.09543728328471572, + "grad_norm": 0.9114876389503479, + "learning_rate": 9.949917147261554e-06, + "loss": 0.7944, + "step": 1734 + }, + { + "epoch": 0.09549232208707138, + "grad_norm": 1.0084123611450195, + "learning_rate": 9.949855930282262e-06, + "loss": 0.8544, + "step": 1735 + }, + { + "epoch": 0.09554736088942704, + "grad_norm": 0.842462956905365, + "learning_rate": 9.949794676101181e-06, + "loss": 0.7056, + "step": 1736 + }, + { + "epoch": 0.0956023996917827, + "grad_norm": 1.00497305393219, + "learning_rate": 9.949733384718766e-06, + "loss": 0.8372, + "step": 1737 + }, + { + "epoch": 0.09565743849413837, + "grad_norm": 1.0166410207748413, + "learning_rate": 9.94967205613548e-06, + "loss": 0.9316, + "step": 1738 + }, + { + "epoch": 0.09571247729649403, + "grad_norm": 0.8520192503929138, + "learning_rate": 9.949610690351784e-06, + "loss": 0.786, + "step": 1739 + }, + { + "epoch": 0.09576751609884969, + "grad_norm": 0.8003227114677429, + "learning_rate": 9.949549287368139e-06, + "loss": 0.8003, + "step": 1740 + }, + { + "epoch": 0.09582255490120535, + "grad_norm": 0.8657151460647583, + "learning_rate": 9.949487847185006e-06, + "loss": 0.8407, + "step": 1741 + }, + { + "epoch": 0.09587759370356101, + "grad_norm": 1.1119858026504517, + "learning_rate": 9.949426369802848e-06, + "loss": 0.8594, + "step": 1742 + }, + { + "epoch": 0.09593263250591667, + "grad_norm": 0.8968474864959717, + "learning_rate": 9.949364855222126e-06, + "loss": 0.8254, + "step": 1743 + }, + { + "epoch": 0.09598767130827233, + "grad_norm": 0.8740531206130981, + "learning_rate": 9.949303303443304e-06, + "loss": 0.8748, + "step": 1744 + }, + { + "epoch": 0.096042710110628, + "grad_norm": 0.8833459615707397, + "learning_rate": 9.94924171446684e-06, + "loss": 0.838, + "step": 1745 + }, + { + "epoch": 0.09609774891298366, + "grad_norm": 0.8783486485481262, + "learning_rate": 9.949180088293201e-06, + "loss": 0.7972, + "step": 1746 + }, + { + "epoch": 0.09615278771533932, + "grad_norm": 0.9197877049446106, + "learning_rate": 9.949118424922852e-06, + "loss": 0.8669, + "step": 1747 + }, + { + "epoch": 0.09620782651769498, + "grad_norm": 0.9771283864974976, + "learning_rate": 9.949056724356251e-06, + "loss": 0.8461, + "step": 1748 + }, + { + "epoch": 0.09626286532005064, + "grad_norm": 0.8325022459030151, + "learning_rate": 9.948994986593864e-06, + "loss": 0.8482, + "step": 1749 + }, + { + "epoch": 0.0963179041224063, + "grad_norm": 0.9732363224029541, + "learning_rate": 9.948933211636158e-06, + "loss": 0.8825, + "step": 1750 + }, + { + "epoch": 0.09637294292476195, + "grad_norm": 0.8229798078536987, + "learning_rate": 9.948871399483592e-06, + "loss": 0.8079, + "step": 1751 + }, + { + "epoch": 0.09642798172711761, + "grad_norm": 0.8861554265022278, + "learning_rate": 9.948809550136635e-06, + "loss": 0.8323, + "step": 1752 + }, + { + "epoch": 0.09648302052947327, + "grad_norm": 1.0618904829025269, + "learning_rate": 9.94874766359575e-06, + "loss": 0.8519, + "step": 1753 + }, + { + "epoch": 0.09653805933182893, + "grad_norm": 0.8494864702224731, + "learning_rate": 9.948685739861403e-06, + "loss": 0.961, + "step": 1754 + }, + { + "epoch": 0.0965930981341846, + "grad_norm": 0.8872213959693909, + "learning_rate": 9.948623778934058e-06, + "loss": 0.9367, + "step": 1755 + }, + { + "epoch": 0.09664813693654026, + "grad_norm": 0.8441230058670044, + "learning_rate": 9.948561780814181e-06, + "loss": 0.7654, + "step": 1756 + }, + { + "epoch": 0.09670317573889592, + "grad_norm": 0.8072223663330078, + "learning_rate": 9.948499745502239e-06, + "loss": 0.7894, + "step": 1757 + }, + { + "epoch": 0.09675821454125158, + "grad_norm": 0.8285261392593384, + "learning_rate": 9.948437672998696e-06, + "loss": 0.8351, + "step": 1758 + }, + { + "epoch": 0.09681325334360724, + "grad_norm": 0.9272124767303467, + "learning_rate": 9.94837556330402e-06, + "loss": 0.8708, + "step": 1759 + }, + { + "epoch": 0.0968682921459629, + "grad_norm": 0.8689375519752502, + "learning_rate": 9.94831341641868e-06, + "loss": 0.8478, + "step": 1760 + }, + { + "epoch": 0.09692333094831856, + "grad_norm": 1.040784239768982, + "learning_rate": 9.94825123234314e-06, + "loss": 0.8915, + "step": 1761 + }, + { + "epoch": 0.09697836975067423, + "grad_norm": 0.7819718718528748, + "learning_rate": 9.948189011077867e-06, + "loss": 0.7728, + "step": 1762 + }, + { + "epoch": 0.09703340855302989, + "grad_norm": 0.7959379553794861, + "learning_rate": 9.948126752623331e-06, + "loss": 0.8248, + "step": 1763 + }, + { + "epoch": 0.09708844735538555, + "grad_norm": 0.8844753503799438, + "learning_rate": 9.94806445698e-06, + "loss": 0.7742, + "step": 1764 + }, + { + "epoch": 0.09714348615774121, + "grad_norm": 0.9168505668640137, + "learning_rate": 9.948002124148339e-06, + "loss": 0.9145, + "step": 1765 + }, + { + "epoch": 0.09719852496009687, + "grad_norm": 0.7199662923812866, + "learning_rate": 9.947939754128819e-06, + "loss": 0.6652, + "step": 1766 + }, + { + "epoch": 0.09725356376245253, + "grad_norm": 0.866470992565155, + "learning_rate": 9.947877346921909e-06, + "loss": 0.8293, + "step": 1767 + }, + { + "epoch": 0.0973086025648082, + "grad_norm": 0.9124754667282104, + "learning_rate": 9.947814902528078e-06, + "loss": 0.8599, + "step": 1768 + }, + { + "epoch": 0.09736364136716386, + "grad_norm": 0.9169870615005493, + "learning_rate": 9.947752420947792e-06, + "loss": 0.8382, + "step": 1769 + }, + { + "epoch": 0.09741868016951952, + "grad_norm": 1.0147640705108643, + "learning_rate": 9.947689902181526e-06, + "loss": 0.8425, + "step": 1770 + }, + { + "epoch": 0.09747371897187518, + "grad_norm": 0.778575599193573, + "learning_rate": 9.947627346229745e-06, + "loss": 0.6979, + "step": 1771 + }, + { + "epoch": 0.09752875777423083, + "grad_norm": 0.815101146697998, + "learning_rate": 9.947564753092922e-06, + "loss": 0.8617, + "step": 1772 + }, + { + "epoch": 0.09758379657658649, + "grad_norm": 0.9556358456611633, + "learning_rate": 9.947502122771527e-06, + "loss": 0.9009, + "step": 1773 + }, + { + "epoch": 0.09763883537894215, + "grad_norm": 0.8603761196136475, + "learning_rate": 9.94743945526603e-06, + "loss": 0.9443, + "step": 1774 + }, + { + "epoch": 0.09769387418129781, + "grad_norm": 0.8621761798858643, + "learning_rate": 9.947376750576903e-06, + "loss": 0.7537, + "step": 1775 + }, + { + "epoch": 0.09774891298365347, + "grad_norm": 0.7399948835372925, + "learning_rate": 9.947314008704616e-06, + "loss": 0.7477, + "step": 1776 + }, + { + "epoch": 0.09780395178600913, + "grad_norm": 0.8855582475662231, + "learning_rate": 9.947251229649641e-06, + "loss": 0.8745, + "step": 1777 + }, + { + "epoch": 0.0978589905883648, + "grad_norm": 0.8718472719192505, + "learning_rate": 9.947188413412452e-06, + "loss": 0.9672, + "step": 1778 + }, + { + "epoch": 0.09791402939072046, + "grad_norm": 0.8598514795303345, + "learning_rate": 9.947125559993517e-06, + "loss": 0.8278, + "step": 1779 + }, + { + "epoch": 0.09796906819307612, + "grad_norm": 1.0373798608779907, + "learning_rate": 9.947062669393312e-06, + "loss": 0.8123, + "step": 1780 + }, + { + "epoch": 0.09802410699543178, + "grad_norm": 1.0198705196380615, + "learning_rate": 9.946999741612306e-06, + "loss": 0.9039, + "step": 1781 + }, + { + "epoch": 0.09807914579778744, + "grad_norm": 0.8770025968551636, + "learning_rate": 9.946936776650977e-06, + "loss": 0.8326, + "step": 1782 + }, + { + "epoch": 0.0981341846001431, + "grad_norm": 0.7970215678215027, + "learning_rate": 9.946873774509794e-06, + "loss": 0.848, + "step": 1783 + }, + { + "epoch": 0.09818922340249876, + "grad_norm": 0.90342777967453, + "learning_rate": 9.946810735189231e-06, + "loss": 0.7993, + "step": 1784 + }, + { + "epoch": 0.09824426220485442, + "grad_norm": 1.2095681428909302, + "learning_rate": 9.946747658689763e-06, + "loss": 0.8544, + "step": 1785 + }, + { + "epoch": 0.09829930100721009, + "grad_norm": 0.8500953316688538, + "learning_rate": 9.946684545011866e-06, + "loss": 0.8398, + "step": 1786 + }, + { + "epoch": 0.09835433980956575, + "grad_norm": 0.8570724725723267, + "learning_rate": 9.946621394156011e-06, + "loss": 0.9255, + "step": 1787 + }, + { + "epoch": 0.09840937861192141, + "grad_norm": 0.8314846158027649, + "learning_rate": 9.946558206122672e-06, + "loss": 0.8398, + "step": 1788 + }, + { + "epoch": 0.09846441741427707, + "grad_norm": 0.8894716501235962, + "learning_rate": 9.946494980912326e-06, + "loss": 0.8612, + "step": 1789 + }, + { + "epoch": 0.09851945621663273, + "grad_norm": 0.9555756449699402, + "learning_rate": 9.94643171852545e-06, + "loss": 0.9551, + "step": 1790 + }, + { + "epoch": 0.09857449501898839, + "grad_norm": 0.9556692838668823, + "learning_rate": 9.946368418962515e-06, + "loss": 0.8175, + "step": 1791 + }, + { + "epoch": 0.09862953382134405, + "grad_norm": 0.7288535833358765, + "learning_rate": 9.946305082224e-06, + "loss": 0.6162, + "step": 1792 + }, + { + "epoch": 0.09868457262369972, + "grad_norm": 0.95478355884552, + "learning_rate": 9.94624170831038e-06, + "loss": 0.9089, + "step": 1793 + }, + { + "epoch": 0.09873961142605536, + "grad_norm": 0.9080137610435486, + "learning_rate": 9.946178297222133e-06, + "loss": 0.9443, + "step": 1794 + }, + { + "epoch": 0.09879465022841102, + "grad_norm": 0.8060124516487122, + "learning_rate": 9.946114848959732e-06, + "loss": 0.7412, + "step": 1795 + }, + { + "epoch": 0.09884968903076669, + "grad_norm": 0.8487932085990906, + "learning_rate": 9.946051363523655e-06, + "loss": 0.7098, + "step": 1796 + }, + { + "epoch": 0.09890472783312235, + "grad_norm": 0.8982037901878357, + "learning_rate": 9.945987840914381e-06, + "loss": 0.8304, + "step": 1797 + }, + { + "epoch": 0.09895976663547801, + "grad_norm": 0.8124602437019348, + "learning_rate": 9.945924281132386e-06, + "loss": 0.8441, + "step": 1798 + }, + { + "epoch": 0.09901480543783367, + "grad_norm": 0.8081663250923157, + "learning_rate": 9.945860684178147e-06, + "loss": 0.732, + "step": 1799 + }, + { + "epoch": 0.09906984424018933, + "grad_norm": 0.7662907242774963, + "learning_rate": 9.945797050052147e-06, + "loss": 0.7538, + "step": 1800 + }, + { + "epoch": 0.09912488304254499, + "grad_norm": 0.8418399095535278, + "learning_rate": 9.945733378754856e-06, + "loss": 0.8488, + "step": 1801 + }, + { + "epoch": 0.09917992184490065, + "grad_norm": 0.7298988699913025, + "learning_rate": 9.94566967028676e-06, + "loss": 0.7822, + "step": 1802 + }, + { + "epoch": 0.09923496064725632, + "grad_norm": 0.7788695693016052, + "learning_rate": 9.945605924648332e-06, + "loss": 0.8037, + "step": 1803 + }, + { + "epoch": 0.09928999944961198, + "grad_norm": 0.939297080039978, + "learning_rate": 9.945542141840054e-06, + "loss": 0.8654, + "step": 1804 + }, + { + "epoch": 0.09934503825196764, + "grad_norm": 0.9274358749389648, + "learning_rate": 9.945478321862406e-06, + "loss": 0.7712, + "step": 1805 + }, + { + "epoch": 0.0994000770543233, + "grad_norm": 0.816561222076416, + "learning_rate": 9.945414464715866e-06, + "loss": 0.7676, + "step": 1806 + }, + { + "epoch": 0.09945511585667896, + "grad_norm": 0.867915153503418, + "learning_rate": 9.945350570400916e-06, + "loss": 0.8343, + "step": 1807 + }, + { + "epoch": 0.09951015465903462, + "grad_norm": 0.8446162939071655, + "learning_rate": 9.945286638918034e-06, + "loss": 0.8128, + "step": 1808 + }, + { + "epoch": 0.09956519346139028, + "grad_norm": 0.8372986316680908, + "learning_rate": 9.945222670267703e-06, + "loss": 0.8611, + "step": 1809 + }, + { + "epoch": 0.09962023226374594, + "grad_norm": 0.787836492061615, + "learning_rate": 9.945158664450399e-06, + "loss": 0.7286, + "step": 1810 + }, + { + "epoch": 0.0996752710661016, + "grad_norm": 0.9293436408042908, + "learning_rate": 9.945094621466609e-06, + "loss": 0.8699, + "step": 1811 + }, + { + "epoch": 0.09973030986845727, + "grad_norm": 0.8336932063102722, + "learning_rate": 9.94503054131681e-06, + "loss": 0.8222, + "step": 1812 + }, + { + "epoch": 0.09978534867081293, + "grad_norm": 0.8310953378677368, + "learning_rate": 9.944966424001486e-06, + "loss": 0.8131, + "step": 1813 + }, + { + "epoch": 0.09984038747316859, + "grad_norm": 0.7703443169593811, + "learning_rate": 9.944902269521117e-06, + "loss": 0.8135, + "step": 1814 + }, + { + "epoch": 0.09989542627552424, + "grad_norm": 0.750990092754364, + "learning_rate": 9.944838077876186e-06, + "loss": 0.8137, + "step": 1815 + }, + { + "epoch": 0.0999504650778799, + "grad_norm": 0.8502481579780579, + "learning_rate": 9.944773849067178e-06, + "loss": 0.8973, + "step": 1816 + }, + { + "epoch": 0.10000550388023556, + "grad_norm": 0.8299791812896729, + "learning_rate": 9.94470958309457e-06, + "loss": 0.8341, + "step": 1817 + }, + { + "epoch": 0.10006054268259122, + "grad_norm": 0.8519022464752197, + "learning_rate": 9.94464527995885e-06, + "loss": 0.8529, + "step": 1818 } ], "logging_steps": 1, @@ -6389,7 +12752,7 @@ "attributes": {} } }, - "total_flos": 2.682514714121994e+18, + "total_flos": 5.365029428243988e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null