diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.6003632560955474, + "epoch": 0.650393527436843, "eval_steps": 500, - "global_step": 10908, + "global_step": 11817, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -76363,6 +76363,6369 @@ "learning_rate": 7.961114987144781e-06, "loss": 0.6374, "step": 10908 + }, + { + "epoch": 0.600418294897903, + "grad_norm": 0.7149941921234131, + "learning_rate": 7.960765699838854e-06, + "loss": 0.8422, + "step": 10909 + }, + { + "epoch": 0.6004733337002587, + "grad_norm": 0.7040171027183533, + "learning_rate": 7.960416390280608e-06, + "loss": 0.8261, + "step": 10910 + }, + { + "epoch": 0.6005283725026144, + "grad_norm": 0.713591456413269, + "learning_rate": 7.960067058472663e-06, + "loss": 0.7908, + "step": 10911 + }, + { + "epoch": 0.60058341130497, + "grad_norm": 0.654086172580719, + "learning_rate": 7.959717704417645e-06, + "loss": 0.6971, + "step": 10912 + }, + { + "epoch": 0.6006384501073256, + "grad_norm": 0.7293223738670349, + "learning_rate": 7.959368328118183e-06, + "loss": 0.7032, + "step": 10913 + }, + { + "epoch": 0.6006934889096813, + "grad_norm": 0.705434262752533, + "learning_rate": 7.959018929576898e-06, + "loss": 0.7193, + "step": 10914 + }, + { + "epoch": 0.600748527712037, + "grad_norm": 0.7406907677650452, + "learning_rate": 7.958669508796422e-06, + "loss": 0.8464, + "step": 10915 + }, + { + "epoch": 0.6008035665143926, + "grad_norm": 0.6683858036994934, + "learning_rate": 7.958320065779377e-06, + "loss": 0.699, + "step": 10916 + }, + { + "epoch": 0.6008586053167483, + "grad_norm": 0.7380560636520386, + "learning_rate": 7.95797060052839e-06, + "loss": 0.7409, + "step": 10917 + }, + { + "epoch": 0.600913644119104, + "grad_norm": 0.7729377746582031, + "learning_rate": 7.957621113046088e-06, + "loss": 0.8838, + "step": 10918 + }, + { + "epoch": 0.6009686829214597, + "grad_norm": 0.6842743158340454, + "learning_rate": 7.957271603335097e-06, + "loss": 0.781, + "step": 10919 + }, + { + "epoch": 0.6010237217238152, + "grad_norm": 0.6864648461341858, + "learning_rate": 7.956922071398045e-06, + "loss": 0.6717, + "step": 10920 + }, + { + "epoch": 0.6010787605261709, + "grad_norm": 0.7718262672424316, + "learning_rate": 7.956572517237557e-06, + "loss": 0.8023, + "step": 10921 + }, + { + "epoch": 0.6011337993285266, + "grad_norm": 0.686338484287262, + "learning_rate": 7.956222940856261e-06, + "loss": 0.7139, + "step": 10922 + }, + { + "epoch": 0.6011888381308823, + "grad_norm": 0.7064465284347534, + "learning_rate": 7.955873342256789e-06, + "loss": 0.845, + "step": 10923 + }, + { + "epoch": 0.6012438769332379, + "grad_norm": 0.6847875714302063, + "learning_rate": 7.955523721441761e-06, + "loss": 0.7078, + "step": 10924 + }, + { + "epoch": 0.6012989157355936, + "grad_norm": 0.6879494786262512, + "learning_rate": 7.955174078413806e-06, + "loss": 0.7532, + "step": 10925 + }, + { + "epoch": 0.6013539545379493, + "grad_norm": 0.6569855213165283, + "learning_rate": 7.954824413175554e-06, + "loss": 0.7529, + "step": 10926 + }, + { + "epoch": 0.601408993340305, + "grad_norm": 0.6225974559783936, + "learning_rate": 7.954474725729635e-06, + "loss": 0.6595, + "step": 10927 + }, + { + "epoch": 0.6014640321426605, + "grad_norm": 0.7067761421203613, + "learning_rate": 7.954125016078675e-06, + "loss": 0.7851, + "step": 10928 + }, + { + "epoch": 0.6015190709450162, + "grad_norm": 0.683030903339386, + "learning_rate": 7.9537752842253e-06, + "loss": 0.7461, + "step": 10929 + }, + { + "epoch": 0.6015741097473719, + "grad_norm": 0.6411080956459045, + "learning_rate": 7.953425530172143e-06, + "loss": 0.6945, + "step": 10930 + }, + { + "epoch": 0.6016291485497276, + "grad_norm": 0.6254550814628601, + "learning_rate": 7.953075753921829e-06, + "loss": 0.7143, + "step": 10931 + }, + { + "epoch": 0.6016841873520832, + "grad_norm": 0.684100866317749, + "learning_rate": 7.952725955476987e-06, + "loss": 0.8137, + "step": 10932 + }, + { + "epoch": 0.6017392261544389, + "grad_norm": 0.6341036558151245, + "learning_rate": 7.95237613484025e-06, + "loss": 0.6692, + "step": 10933 + }, + { + "epoch": 0.6017942649567946, + "grad_norm": 0.7311153411865234, + "learning_rate": 7.952026292014242e-06, + "loss": 0.7091, + "step": 10934 + }, + { + "epoch": 0.6018493037591502, + "grad_norm": 0.7265943884849548, + "learning_rate": 7.951676427001596e-06, + "loss": 0.765, + "step": 10935 + }, + { + "epoch": 0.6019043425615058, + "grad_norm": 0.8777397274971008, + "learning_rate": 7.951326539804938e-06, + "loss": 0.7824, + "step": 10936 + }, + { + "epoch": 0.6019593813638615, + "grad_norm": 0.7241179347038269, + "learning_rate": 7.9509766304269e-06, + "loss": 0.7913, + "step": 10937 + }, + { + "epoch": 0.6020144201662172, + "grad_norm": 0.8090667128562927, + "learning_rate": 7.950626698870113e-06, + "loss": 0.8208, + "step": 10938 + }, + { + "epoch": 0.6020694589685729, + "grad_norm": 0.7376043796539307, + "learning_rate": 7.950276745137206e-06, + "loss": 0.7176, + "step": 10939 + }, + { + "epoch": 0.6021244977709285, + "grad_norm": 0.7149157524108887, + "learning_rate": 7.949926769230809e-06, + "loss": 0.7949, + "step": 10940 + }, + { + "epoch": 0.6021795365732842, + "grad_norm": 0.8721579909324646, + "learning_rate": 7.949576771153549e-06, + "loss": 0.8433, + "step": 10941 + }, + { + "epoch": 0.6022345753756398, + "grad_norm": 0.7946182489395142, + "learning_rate": 7.949226750908062e-06, + "loss": 0.7412, + "step": 10942 + }, + { + "epoch": 0.6022896141779955, + "grad_norm": 0.6661237478256226, + "learning_rate": 7.948876708496975e-06, + "loss": 0.725, + "step": 10943 + }, + { + "epoch": 0.6023446529803511, + "grad_norm": 0.8346213698387146, + "learning_rate": 7.948526643922922e-06, + "loss": 0.6817, + "step": 10944 + }, + { + "epoch": 0.6023996917827068, + "grad_norm": 0.7911655306816101, + "learning_rate": 7.94817655718853e-06, + "loss": 0.7398, + "step": 10945 + }, + { + "epoch": 0.6024547305850625, + "grad_norm": 0.6480078101158142, + "learning_rate": 7.947826448296432e-06, + "loss": 0.6822, + "step": 10946 + }, + { + "epoch": 0.6025097693874182, + "grad_norm": 0.6950085759162903, + "learning_rate": 7.94747631724926e-06, + "loss": 0.8073, + "step": 10947 + }, + { + "epoch": 0.6025648081897738, + "grad_norm": 0.7142168879508972, + "learning_rate": 7.947126164049645e-06, + "loss": 0.6159, + "step": 10948 + }, + { + "epoch": 0.6026198469921294, + "grad_norm": 0.7459015846252441, + "learning_rate": 7.946775988700219e-06, + "loss": 0.8377, + "step": 10949 + }, + { + "epoch": 0.6026748857944851, + "grad_norm": 1.050179362297058, + "learning_rate": 7.946425791203614e-06, + "loss": 0.8098, + "step": 10950 + }, + { + "epoch": 0.6027299245968408, + "grad_norm": 0.7473265528678894, + "learning_rate": 7.94607557156246e-06, + "loss": 0.6846, + "step": 10951 + }, + { + "epoch": 0.6027849633991964, + "grad_norm": 0.7990789413452148, + "learning_rate": 7.945725329779392e-06, + "loss": 0.8216, + "step": 10952 + }, + { + "epoch": 0.6028400022015521, + "grad_norm": 0.6461700201034546, + "learning_rate": 7.94537506585704e-06, + "loss": 0.7864, + "step": 10953 + }, + { + "epoch": 0.6028950410039078, + "grad_norm": 0.661123514175415, + "learning_rate": 7.945024779798038e-06, + "loss": 0.7466, + "step": 10954 + }, + { + "epoch": 0.6029500798062634, + "grad_norm": 0.6998088359832764, + "learning_rate": 7.944674471605018e-06, + "loss": 0.7846, + "step": 10955 + }, + { + "epoch": 0.603005118608619, + "grad_norm": 0.6917386651039124, + "learning_rate": 7.944324141280613e-06, + "loss": 0.7699, + "step": 10956 + }, + { + "epoch": 0.6030601574109747, + "grad_norm": 0.7304503321647644, + "learning_rate": 7.943973788827455e-06, + "loss": 0.8015, + "step": 10957 + }, + { + "epoch": 0.6031151962133304, + "grad_norm": 0.7996858358383179, + "learning_rate": 7.94362341424818e-06, + "loss": 0.7093, + "step": 10958 + }, + { + "epoch": 0.603170235015686, + "grad_norm": 0.7445322871208191, + "learning_rate": 7.943273017545419e-06, + "loss": 0.7388, + "step": 10959 + }, + { + "epoch": 0.6032252738180417, + "grad_norm": 0.6672174334526062, + "learning_rate": 7.942922598721805e-06, + "loss": 0.7703, + "step": 10960 + }, + { + "epoch": 0.6032803126203974, + "grad_norm": 0.7313557267189026, + "learning_rate": 7.94257215777997e-06, + "loss": 0.6637, + "step": 10961 + }, + { + "epoch": 0.6033353514227531, + "grad_norm": 0.7248823642730713, + "learning_rate": 7.942221694722553e-06, + "loss": 0.836, + "step": 10962 + }, + { + "epoch": 0.6033903902251087, + "grad_norm": 0.6583372354507446, + "learning_rate": 7.941871209552187e-06, + "loss": 0.7582, + "step": 10963 + }, + { + "epoch": 0.6034454290274643, + "grad_norm": 0.7502591013908386, + "learning_rate": 7.941520702271503e-06, + "loss": 0.7455, + "step": 10964 + }, + { + "epoch": 0.60350046782982, + "grad_norm": 0.6899349689483643, + "learning_rate": 7.941170172883135e-06, + "loss": 0.7677, + "step": 10965 + }, + { + "epoch": 0.6035555066321757, + "grad_norm": 0.693321943283081, + "learning_rate": 7.940819621389722e-06, + "loss": 0.7754, + "step": 10966 + }, + { + "epoch": 0.6036105454345313, + "grad_norm": 0.7376342415809631, + "learning_rate": 7.940469047793893e-06, + "loss": 0.7761, + "step": 10967 + }, + { + "epoch": 0.603665584236887, + "grad_norm": 0.6377952694892883, + "learning_rate": 7.940118452098289e-06, + "loss": 0.6612, + "step": 10968 + }, + { + "epoch": 0.6037206230392427, + "grad_norm": 0.8041388988494873, + "learning_rate": 7.939767834305538e-06, + "loss": 0.8358, + "step": 10969 + }, + { + "epoch": 0.6037756618415984, + "grad_norm": 1.5993521213531494, + "learning_rate": 7.939417194418282e-06, + "loss": 0.8536, + "step": 10970 + }, + { + "epoch": 0.6038307006439539, + "grad_norm": 0.6718295216560364, + "learning_rate": 7.939066532439153e-06, + "loss": 0.717, + "step": 10971 + }, + { + "epoch": 0.6038857394463096, + "grad_norm": 0.7951062917709351, + "learning_rate": 7.938715848370787e-06, + "loss": 0.6919, + "step": 10972 + }, + { + "epoch": 0.6039407782486653, + "grad_norm": 0.707804262638092, + "learning_rate": 7.938365142215816e-06, + "loss": 0.7346, + "step": 10973 + }, + { + "epoch": 0.603995817051021, + "grad_norm": 0.7244500517845154, + "learning_rate": 7.938014413976883e-06, + "loss": 0.708, + "step": 10974 + }, + { + "epoch": 0.6040508558533766, + "grad_norm": 0.7533566951751709, + "learning_rate": 7.937663663656617e-06, + "loss": 0.6761, + "step": 10975 + }, + { + "epoch": 0.6041058946557323, + "grad_norm": 0.8844665288925171, + "learning_rate": 7.93731289125766e-06, + "loss": 0.7833, + "step": 10976 + }, + { + "epoch": 0.604160933458088, + "grad_norm": 0.6413047313690186, + "learning_rate": 7.936962096782643e-06, + "loss": 0.7175, + "step": 10977 + }, + { + "epoch": 0.6042159722604437, + "grad_norm": 0.765943706035614, + "learning_rate": 7.936611280234206e-06, + "loss": 0.7654, + "step": 10978 + }, + { + "epoch": 0.6042710110627992, + "grad_norm": 0.6833398938179016, + "learning_rate": 7.936260441614985e-06, + "loss": 0.7459, + "step": 10979 + }, + { + "epoch": 0.6043260498651549, + "grad_norm": 0.6363481283187866, + "learning_rate": 7.935909580927617e-06, + "loss": 0.7173, + "step": 10980 + }, + { + "epoch": 0.6043810886675106, + "grad_norm": 0.7731046080589294, + "learning_rate": 7.935558698174738e-06, + "loss": 0.8428, + "step": 10981 + }, + { + "epoch": 0.6044361274698663, + "grad_norm": 0.7346602082252502, + "learning_rate": 7.935207793358986e-06, + "loss": 0.832, + "step": 10982 + }, + { + "epoch": 0.6044911662722219, + "grad_norm": 0.6711193919181824, + "learning_rate": 7.934856866482998e-06, + "loss": 0.742, + "step": 10983 + }, + { + "epoch": 0.6045462050745776, + "grad_norm": 0.6931266784667969, + "learning_rate": 7.934505917549411e-06, + "loss": 0.7779, + "step": 10984 + }, + { + "epoch": 0.6046012438769333, + "grad_norm": 0.7624725699424744, + "learning_rate": 7.934154946560862e-06, + "loss": 0.7229, + "step": 10985 + }, + { + "epoch": 0.604656282679289, + "grad_norm": 0.6594272255897522, + "learning_rate": 7.933803953519991e-06, + "loss": 0.7776, + "step": 10986 + }, + { + "epoch": 0.6047113214816445, + "grad_norm": 0.674521803855896, + "learning_rate": 7.933452938429435e-06, + "loss": 0.6904, + "step": 10987 + }, + { + "epoch": 0.6047663602840002, + "grad_norm": 0.7352569699287415, + "learning_rate": 7.933101901291831e-06, + "loss": 0.7655, + "step": 10988 + }, + { + "epoch": 0.6048213990863559, + "grad_norm": 0.8560347557067871, + "learning_rate": 7.932750842109817e-06, + "loss": 0.7894, + "step": 10989 + }, + { + "epoch": 0.6048764378887116, + "grad_norm": 0.769496500492096, + "learning_rate": 7.932399760886037e-06, + "loss": 0.8255, + "step": 10990 + }, + { + "epoch": 0.6049314766910672, + "grad_norm": 0.9399588108062744, + "learning_rate": 7.932048657623122e-06, + "loss": 0.8554, + "step": 10991 + }, + { + "epoch": 0.6049865154934229, + "grad_norm": 0.6662001609802246, + "learning_rate": 7.931697532323716e-06, + "loss": 0.7788, + "step": 10992 + }, + { + "epoch": 0.6050415542957785, + "grad_norm": 0.758263111114502, + "learning_rate": 7.931346384990455e-06, + "loss": 0.7907, + "step": 10993 + }, + { + "epoch": 0.6050965930981342, + "grad_norm": 0.7283937335014343, + "learning_rate": 7.930995215625978e-06, + "loss": 0.8415, + "step": 10994 + }, + { + "epoch": 0.6051516319004898, + "grad_norm": 0.6611599922180176, + "learning_rate": 7.930644024232927e-06, + "loss": 0.7145, + "step": 10995 + }, + { + "epoch": 0.6052066707028455, + "grad_norm": 0.8450857400894165, + "learning_rate": 7.93029281081394e-06, + "loss": 0.7208, + "step": 10996 + }, + { + "epoch": 0.6052617095052012, + "grad_norm": 0.649010181427002, + "learning_rate": 7.929941575371655e-06, + "loss": 0.6928, + "step": 10997 + }, + { + "epoch": 0.6053167483075568, + "grad_norm": 0.7022100687026978, + "learning_rate": 7.929590317908718e-06, + "loss": 0.7329, + "step": 10998 + }, + { + "epoch": 0.6053717871099125, + "grad_norm": 0.768598198890686, + "learning_rate": 7.92923903842776e-06, + "loss": 0.7799, + "step": 10999 + }, + { + "epoch": 0.6054268259122682, + "grad_norm": 0.6648436784744263, + "learning_rate": 7.928887736931428e-06, + "loss": 0.7728, + "step": 11000 + }, + { + "epoch": 0.6054818647146238, + "grad_norm": 0.6946157813072205, + "learning_rate": 7.928536413422357e-06, + "loss": 0.7609, + "step": 11001 + }, + { + "epoch": 0.6055369035169794, + "grad_norm": 0.7779337167739868, + "learning_rate": 7.928185067903191e-06, + "loss": 0.7679, + "step": 11002 + }, + { + "epoch": 0.6055919423193351, + "grad_norm": 0.6520814895629883, + "learning_rate": 7.927833700376573e-06, + "loss": 0.6734, + "step": 11003 + }, + { + "epoch": 0.6056469811216908, + "grad_norm": 0.7724258899688721, + "learning_rate": 7.927482310845138e-06, + "loss": 0.7564, + "step": 11004 + }, + { + "epoch": 0.6057020199240465, + "grad_norm": 0.6649174690246582, + "learning_rate": 7.927130899311529e-06, + "loss": 0.7217, + "step": 11005 + }, + { + "epoch": 0.6057570587264021, + "grad_norm": 0.6807287931442261, + "learning_rate": 7.926779465778389e-06, + "loss": 0.6966, + "step": 11006 + }, + { + "epoch": 0.6058120975287578, + "grad_norm": 0.6644826531410217, + "learning_rate": 7.926428010248357e-06, + "loss": 0.7238, + "step": 11007 + }, + { + "epoch": 0.6058671363311134, + "grad_norm": 0.7533535957336426, + "learning_rate": 7.926076532724077e-06, + "loss": 0.855, + "step": 11008 + }, + { + "epoch": 0.6059221751334691, + "grad_norm": 0.6457169055938721, + "learning_rate": 7.925725033208187e-06, + "loss": 0.6717, + "step": 11009 + }, + { + "epoch": 0.6059772139358247, + "grad_norm": 0.724719762802124, + "learning_rate": 7.925373511703332e-06, + "loss": 0.8701, + "step": 11010 + }, + { + "epoch": 0.6060322527381804, + "grad_norm": 0.746755063533783, + "learning_rate": 7.925021968212153e-06, + "loss": 0.8509, + "step": 11011 + }, + { + "epoch": 0.6060872915405361, + "grad_norm": 0.7377174496650696, + "learning_rate": 7.924670402737292e-06, + "loss": 0.8053, + "step": 11012 + }, + { + "epoch": 0.6061423303428918, + "grad_norm": 0.9791839718818665, + "learning_rate": 7.92431881528139e-06, + "loss": 0.7893, + "step": 11013 + }, + { + "epoch": 0.6061973691452474, + "grad_norm": 0.7472195029258728, + "learning_rate": 7.923967205847089e-06, + "loss": 0.7195, + "step": 11014 + }, + { + "epoch": 0.606252407947603, + "grad_norm": 0.672851026058197, + "learning_rate": 7.923615574437037e-06, + "loss": 0.8234, + "step": 11015 + }, + { + "epoch": 0.6063074467499587, + "grad_norm": 0.739942729473114, + "learning_rate": 7.923263921053872e-06, + "loss": 0.8582, + "step": 11016 + }, + { + "epoch": 0.6063624855523144, + "grad_norm": 0.7337772846221924, + "learning_rate": 7.922912245700236e-06, + "loss": 0.8008, + "step": 11017 + }, + { + "epoch": 0.60641752435467, + "grad_norm": 0.6707174777984619, + "learning_rate": 7.922560548378774e-06, + "loss": 0.8531, + "step": 11018 + }, + { + "epoch": 0.6064725631570257, + "grad_norm": 0.6783839464187622, + "learning_rate": 7.922208829092133e-06, + "loss": 0.7963, + "step": 11019 + }, + { + "epoch": 0.6065276019593814, + "grad_norm": 0.6133253574371338, + "learning_rate": 7.92185708784295e-06, + "loss": 0.7375, + "step": 11020 + }, + { + "epoch": 0.6065826407617371, + "grad_norm": 0.8300097584724426, + "learning_rate": 7.921505324633868e-06, + "loss": 0.7976, + "step": 11021 + }, + { + "epoch": 0.6066376795640926, + "grad_norm": 0.6800658702850342, + "learning_rate": 7.921153539467538e-06, + "loss": 0.7321, + "step": 11022 + }, + { + "epoch": 0.6066927183664483, + "grad_norm": 0.6849787831306458, + "learning_rate": 7.920801732346602e-06, + "loss": 0.7134, + "step": 11023 + }, + { + "epoch": 0.606747757168804, + "grad_norm": 0.7675080895423889, + "learning_rate": 7.920449903273697e-06, + "loss": 0.7402, + "step": 11024 + }, + { + "epoch": 0.6068027959711597, + "grad_norm": 0.7431055903434753, + "learning_rate": 7.920098052251476e-06, + "loss": 0.7872, + "step": 11025 + }, + { + "epoch": 0.6068578347735153, + "grad_norm": 0.6264036297798157, + "learning_rate": 7.919746179282577e-06, + "loss": 0.7496, + "step": 11026 + }, + { + "epoch": 0.606912873575871, + "grad_norm": 0.7800843715667725, + "learning_rate": 7.919394284369648e-06, + "loss": 0.7917, + "step": 11027 + }, + { + "epoch": 0.6069679123782267, + "grad_norm": 0.7665574550628662, + "learning_rate": 7.919042367515336e-06, + "loss": 0.7905, + "step": 11028 + }, + { + "epoch": 0.6070229511805824, + "grad_norm": 0.7473214864730835, + "learning_rate": 7.918690428722279e-06, + "loss": 0.7732, + "step": 11029 + }, + { + "epoch": 0.6070779899829379, + "grad_norm": 0.6717211008071899, + "learning_rate": 7.918338467993127e-06, + "loss": 0.8221, + "step": 11030 + }, + { + "epoch": 0.6071330287852936, + "grad_norm": 0.6745431423187256, + "learning_rate": 7.917986485330525e-06, + "loss": 0.6899, + "step": 11031 + }, + { + "epoch": 0.6071880675876493, + "grad_norm": 0.6838263273239136, + "learning_rate": 7.917634480737117e-06, + "loss": 0.7133, + "step": 11032 + }, + { + "epoch": 0.607243106390005, + "grad_norm": 0.7975682020187378, + "learning_rate": 7.91728245421555e-06, + "loss": 0.8283, + "step": 11033 + }, + { + "epoch": 0.6072981451923606, + "grad_norm": 0.7112031579017639, + "learning_rate": 7.916930405768468e-06, + "loss": 0.7423, + "step": 11034 + }, + { + "epoch": 0.6073531839947163, + "grad_norm": 0.7006776928901672, + "learning_rate": 7.91657833539852e-06, + "loss": 0.716, + "step": 11035 + }, + { + "epoch": 0.607408222797072, + "grad_norm": 0.7523549795150757, + "learning_rate": 7.916226243108348e-06, + "loss": 0.8591, + "step": 11036 + }, + { + "epoch": 0.6074632615994277, + "grad_norm": 0.7257835268974304, + "learning_rate": 7.9158741289006e-06, + "loss": 0.7471, + "step": 11037 + }, + { + "epoch": 0.6075183004017832, + "grad_norm": 0.8100149631500244, + "learning_rate": 7.915521992777922e-06, + "loss": 0.8373, + "step": 11038 + }, + { + "epoch": 0.6075733392041389, + "grad_norm": 0.7781035304069519, + "learning_rate": 7.915169834742964e-06, + "loss": 0.8471, + "step": 11039 + }, + { + "epoch": 0.6076283780064946, + "grad_norm": 0.7426049709320068, + "learning_rate": 7.914817654798368e-06, + "loss": 0.753, + "step": 11040 + }, + { + "epoch": 0.6076834168088502, + "grad_norm": 0.6990010738372803, + "learning_rate": 7.914465452946782e-06, + "loss": 0.7556, + "step": 11041 + }, + { + "epoch": 0.6077384556112059, + "grad_norm": 0.8038754463195801, + "learning_rate": 7.914113229190856e-06, + "loss": 0.7787, + "step": 11042 + }, + { + "epoch": 0.6077934944135616, + "grad_norm": 0.6434115767478943, + "learning_rate": 7.913760983533233e-06, + "loss": 0.7831, + "step": 11043 + }, + { + "epoch": 0.6078485332159173, + "grad_norm": 0.8119033575057983, + "learning_rate": 7.913408715976562e-06, + "loss": 0.7691, + "step": 11044 + }, + { + "epoch": 0.6079035720182728, + "grad_norm": 0.6710149049758911, + "learning_rate": 7.913056426523493e-06, + "loss": 0.7542, + "step": 11045 + }, + { + "epoch": 0.6079586108206285, + "grad_norm": 0.7458183765411377, + "learning_rate": 7.912704115176671e-06, + "loss": 0.7673, + "step": 11046 + }, + { + "epoch": 0.6080136496229842, + "grad_norm": 0.8061705827713013, + "learning_rate": 7.912351781938745e-06, + "loss": 0.9255, + "step": 11047 + }, + { + "epoch": 0.6080686884253399, + "grad_norm": 0.7193130850791931, + "learning_rate": 7.91199942681236e-06, + "loss": 0.8154, + "step": 11048 + }, + { + "epoch": 0.6081237272276955, + "grad_norm": 0.7785167098045349, + "learning_rate": 7.911647049800171e-06, + "loss": 0.7747, + "step": 11049 + }, + { + "epoch": 0.6081787660300512, + "grad_norm": 0.665765106678009, + "learning_rate": 7.911294650904818e-06, + "loss": 0.7573, + "step": 11050 + }, + { + "epoch": 0.6082338048324069, + "grad_norm": 0.7940623760223389, + "learning_rate": 7.910942230128956e-06, + "loss": 0.6628, + "step": 11051 + }, + { + "epoch": 0.6082888436347625, + "grad_norm": 0.8364549875259399, + "learning_rate": 7.910589787475232e-06, + "loss": 0.8103, + "step": 11052 + }, + { + "epoch": 0.6083438824371181, + "grad_norm": 0.6153101325035095, + "learning_rate": 7.910237322946292e-06, + "loss": 0.76, + "step": 11053 + }, + { + "epoch": 0.6083989212394738, + "grad_norm": 0.8381257653236389, + "learning_rate": 7.909884836544789e-06, + "loss": 0.8366, + "step": 11054 + }, + { + "epoch": 0.6084539600418295, + "grad_norm": 0.6602391600608826, + "learning_rate": 7.90953232827337e-06, + "loss": 0.7389, + "step": 11055 + }, + { + "epoch": 0.6085089988441852, + "grad_norm": 0.7329971194267273, + "learning_rate": 7.909179798134685e-06, + "loss": 0.8217, + "step": 11056 + }, + { + "epoch": 0.6085640376465408, + "grad_norm": 0.7319926023483276, + "learning_rate": 7.908827246131383e-06, + "loss": 0.78, + "step": 11057 + }, + { + "epoch": 0.6086190764488965, + "grad_norm": 0.6491387486457825, + "learning_rate": 7.908474672266114e-06, + "loss": 0.7496, + "step": 11058 + }, + { + "epoch": 0.6086741152512521, + "grad_norm": 0.656434953212738, + "learning_rate": 7.908122076541529e-06, + "loss": 0.7462, + "step": 11059 + }, + { + "epoch": 0.6087291540536078, + "grad_norm": 0.6908577680587769, + "learning_rate": 7.907769458960275e-06, + "loss": 0.7505, + "step": 11060 + }, + { + "epoch": 0.6087841928559634, + "grad_norm": 0.774424135684967, + "learning_rate": 7.907416819525007e-06, + "loss": 0.8275, + "step": 11061 + }, + { + "epoch": 0.6088392316583191, + "grad_norm": 0.6796718835830688, + "learning_rate": 7.90706415823837e-06, + "loss": 0.7606, + "step": 11062 + }, + { + "epoch": 0.6088942704606748, + "grad_norm": 0.9576514959335327, + "learning_rate": 7.906711475103016e-06, + "loss": 0.807, + "step": 11063 + }, + { + "epoch": 0.6089493092630305, + "grad_norm": 0.9848490953445435, + "learning_rate": 7.9063587701216e-06, + "loss": 0.7856, + "step": 11064 + }, + { + "epoch": 0.6090043480653861, + "grad_norm": 0.9490165710449219, + "learning_rate": 7.906006043296768e-06, + "loss": 0.8519, + "step": 11065 + }, + { + "epoch": 0.6090593868677417, + "grad_norm": 0.631382942199707, + "learning_rate": 7.905653294631172e-06, + "loss": 0.7041, + "step": 11066 + }, + { + "epoch": 0.6091144256700974, + "grad_norm": 0.6969574093818665, + "learning_rate": 7.905300524127464e-06, + "loss": 0.7556, + "step": 11067 + }, + { + "epoch": 0.6091694644724531, + "grad_norm": 0.6990532279014587, + "learning_rate": 7.904947731788295e-06, + "loss": 0.799, + "step": 11068 + }, + { + "epoch": 0.6092245032748087, + "grad_norm": 0.7216916084289551, + "learning_rate": 7.904594917616315e-06, + "loss": 0.7617, + "step": 11069 + }, + { + "epoch": 0.6092795420771644, + "grad_norm": 0.6874147653579712, + "learning_rate": 7.904242081614179e-06, + "loss": 0.7616, + "step": 11070 + }, + { + "epoch": 0.6093345808795201, + "grad_norm": 0.6909550428390503, + "learning_rate": 7.903889223784535e-06, + "loss": 0.7649, + "step": 11071 + }, + { + "epoch": 0.6093896196818758, + "grad_norm": 0.7796370387077332, + "learning_rate": 7.90353634413004e-06, + "loss": 0.7557, + "step": 11072 + }, + { + "epoch": 0.6094446584842313, + "grad_norm": 0.807448148727417, + "learning_rate": 7.903183442653341e-06, + "loss": 0.7519, + "step": 11073 + }, + { + "epoch": 0.609499697286587, + "grad_norm": 0.846371054649353, + "learning_rate": 7.902830519357092e-06, + "loss": 0.9342, + "step": 11074 + }, + { + "epoch": 0.6095547360889427, + "grad_norm": 1.0386929512023926, + "learning_rate": 7.902477574243947e-06, + "loss": 0.6802, + "step": 11075 + }, + { + "epoch": 0.6096097748912984, + "grad_norm": 0.8011854887008667, + "learning_rate": 7.902124607316558e-06, + "loss": 0.7756, + "step": 11076 + }, + { + "epoch": 0.609664813693654, + "grad_norm": 0.6560170650482178, + "learning_rate": 7.901771618577574e-06, + "loss": 0.7831, + "step": 11077 + }, + { + "epoch": 0.6097198524960097, + "grad_norm": 0.656891942024231, + "learning_rate": 7.901418608029655e-06, + "loss": 0.7239, + "step": 11078 + }, + { + "epoch": 0.6097748912983654, + "grad_norm": 0.7451794743537903, + "learning_rate": 7.901065575675448e-06, + "loss": 0.7426, + "step": 11079 + }, + { + "epoch": 0.6098299301007211, + "grad_norm": 0.6805453300476074, + "learning_rate": 7.90071252151761e-06, + "loss": 0.7257, + "step": 11080 + }, + { + "epoch": 0.6098849689030766, + "grad_norm": 0.7747140526771545, + "learning_rate": 7.900359445558791e-06, + "loss": 0.8554, + "step": 11081 + }, + { + "epoch": 0.6099400077054323, + "grad_norm": 0.7276260256767273, + "learning_rate": 7.900006347801649e-06, + "loss": 0.7608, + "step": 11082 + }, + { + "epoch": 0.609995046507788, + "grad_norm": 0.7496321201324463, + "learning_rate": 7.899653228248836e-06, + "loss": 0.7707, + "step": 11083 + }, + { + "epoch": 0.6100500853101436, + "grad_norm": 0.6810722947120667, + "learning_rate": 7.899300086903006e-06, + "loss": 0.7425, + "step": 11084 + }, + { + "epoch": 0.6101051241124993, + "grad_norm": 0.7245593070983887, + "learning_rate": 7.89894692376681e-06, + "loss": 0.8404, + "step": 11085 + }, + { + "epoch": 0.610160162914855, + "grad_norm": 0.7139402627944946, + "learning_rate": 7.898593738842906e-06, + "loss": 0.7219, + "step": 11086 + }, + { + "epoch": 0.6102152017172107, + "grad_norm": 0.6483772397041321, + "learning_rate": 7.898240532133947e-06, + "loss": 0.7571, + "step": 11087 + }, + { + "epoch": 0.6102702405195662, + "grad_norm": 0.7347467541694641, + "learning_rate": 7.89788730364259e-06, + "loss": 0.7666, + "step": 11088 + }, + { + "epoch": 0.6103252793219219, + "grad_norm": 0.8899261355400085, + "learning_rate": 7.897534053371485e-06, + "loss": 0.6886, + "step": 11089 + }, + { + "epoch": 0.6103803181242776, + "grad_norm": 0.7005650401115417, + "learning_rate": 7.89718078132329e-06, + "loss": 0.6771, + "step": 11090 + }, + { + "epoch": 0.6104353569266333, + "grad_norm": 0.776589035987854, + "learning_rate": 7.896827487500662e-06, + "loss": 0.7731, + "step": 11091 + }, + { + "epoch": 0.6104903957289889, + "grad_norm": 0.7039395570755005, + "learning_rate": 7.896474171906252e-06, + "loss": 0.7415, + "step": 11092 + }, + { + "epoch": 0.6105454345313446, + "grad_norm": 0.7453792095184326, + "learning_rate": 7.896120834542718e-06, + "loss": 0.8507, + "step": 11093 + }, + { + "epoch": 0.6106004733337003, + "grad_norm": 0.7516497373580933, + "learning_rate": 7.895767475412717e-06, + "loss": 0.8271, + "step": 11094 + }, + { + "epoch": 0.610655512136056, + "grad_norm": 0.6751283407211304, + "learning_rate": 7.895414094518901e-06, + "loss": 0.7788, + "step": 11095 + }, + { + "epoch": 0.6107105509384115, + "grad_norm": 0.7240836024284363, + "learning_rate": 7.895060691863927e-06, + "loss": 0.7507, + "step": 11096 + }, + { + "epoch": 0.6107655897407672, + "grad_norm": 0.8286149501800537, + "learning_rate": 7.894707267450451e-06, + "loss": 0.7033, + "step": 11097 + }, + { + "epoch": 0.6108206285431229, + "grad_norm": 0.8814655542373657, + "learning_rate": 7.894353821281131e-06, + "loss": 0.73, + "step": 11098 + }, + { + "epoch": 0.6108756673454786, + "grad_norm": 0.6792872548103333, + "learning_rate": 7.894000353358624e-06, + "loss": 0.7445, + "step": 11099 + }, + { + "epoch": 0.6109307061478342, + "grad_norm": 0.6442595720291138, + "learning_rate": 7.893646863685584e-06, + "loss": 0.7228, + "step": 11100 + }, + { + "epoch": 0.6109857449501899, + "grad_norm": 0.6775944828987122, + "learning_rate": 7.89329335226467e-06, + "loss": 0.7937, + "step": 11101 + }, + { + "epoch": 0.6110407837525456, + "grad_norm": 0.6315211653709412, + "learning_rate": 7.892939819098534e-06, + "loss": 0.7328, + "step": 11102 + }, + { + "epoch": 0.6110958225549012, + "grad_norm": 0.7419382929801941, + "learning_rate": 7.89258626418984e-06, + "loss": 0.8088, + "step": 11103 + }, + { + "epoch": 0.6111508613572568, + "grad_norm": 0.6645117402076721, + "learning_rate": 7.89223268754124e-06, + "loss": 0.7844, + "step": 11104 + }, + { + "epoch": 0.6112059001596125, + "grad_norm": 0.6389926075935364, + "learning_rate": 7.891879089155397e-06, + "loss": 0.6353, + "step": 11105 + }, + { + "epoch": 0.6112609389619682, + "grad_norm": 0.8223785758018494, + "learning_rate": 7.891525469034963e-06, + "loss": 0.7377, + "step": 11106 + }, + { + "epoch": 0.6113159777643239, + "grad_norm": 0.7627747058868408, + "learning_rate": 7.891171827182595e-06, + "loss": 0.8317, + "step": 11107 + }, + { + "epoch": 0.6113710165666795, + "grad_norm": 0.8015971183776855, + "learning_rate": 7.890818163600956e-06, + "loss": 0.8324, + "step": 11108 + }, + { + "epoch": 0.6114260553690352, + "grad_norm": 0.7180280089378357, + "learning_rate": 7.8904644782927e-06, + "loss": 0.8211, + "step": 11109 + }, + { + "epoch": 0.6114810941713908, + "grad_norm": 0.7855646014213562, + "learning_rate": 7.890110771260487e-06, + "loss": 0.8629, + "step": 11110 + }, + { + "epoch": 0.6115361329737465, + "grad_norm": 0.7389342784881592, + "learning_rate": 7.889757042506976e-06, + "loss": 0.6917, + "step": 11111 + }, + { + "epoch": 0.6115911717761021, + "grad_norm": 0.7996030449867249, + "learning_rate": 7.889403292034825e-06, + "loss": 0.7361, + "step": 11112 + }, + { + "epoch": 0.6116462105784578, + "grad_norm": 0.6658353805541992, + "learning_rate": 7.88904951984669e-06, + "loss": 0.7048, + "step": 11113 + }, + { + "epoch": 0.6117012493808135, + "grad_norm": 0.8128555417060852, + "learning_rate": 7.888695725945235e-06, + "loss": 0.7772, + "step": 11114 + }, + { + "epoch": 0.6117562881831692, + "grad_norm": 0.7597428560256958, + "learning_rate": 7.888341910333114e-06, + "loss": 0.7447, + "step": 11115 + }, + { + "epoch": 0.6118113269855248, + "grad_norm": 0.7330088019371033, + "learning_rate": 7.88798807301299e-06, + "loss": 0.849, + "step": 11116 + }, + { + "epoch": 0.6118663657878805, + "grad_norm": 0.8374074101448059, + "learning_rate": 7.88763421398752e-06, + "loss": 0.6149, + "step": 11117 + }, + { + "epoch": 0.6119214045902361, + "grad_norm": 0.7507160305976868, + "learning_rate": 7.887280333259364e-06, + "loss": 0.7737, + "step": 11118 + }, + { + "epoch": 0.6119764433925918, + "grad_norm": 0.7218281626701355, + "learning_rate": 7.886926430831181e-06, + "loss": 0.8151, + "step": 11119 + }, + { + "epoch": 0.6120314821949474, + "grad_norm": 0.6761744618415833, + "learning_rate": 7.886572506705634e-06, + "loss": 0.7429, + "step": 11120 + }, + { + "epoch": 0.6120865209973031, + "grad_norm": 0.8243520259857178, + "learning_rate": 7.886218560885379e-06, + "loss": 0.819, + "step": 11121 + }, + { + "epoch": 0.6121415597996588, + "grad_norm": 0.9675465822219849, + "learning_rate": 7.885864593373078e-06, + "loss": 0.7834, + "step": 11122 + }, + { + "epoch": 0.6121965986020145, + "grad_norm": 0.7220338582992554, + "learning_rate": 7.885510604171391e-06, + "loss": 0.8266, + "step": 11123 + }, + { + "epoch": 0.61225163740437, + "grad_norm": 0.7185316681861877, + "learning_rate": 7.88515659328298e-06, + "loss": 0.7949, + "step": 11124 + }, + { + "epoch": 0.6123066762067257, + "grad_norm": 0.67637038230896, + "learning_rate": 7.884802560710503e-06, + "loss": 0.7456, + "step": 11125 + }, + { + "epoch": 0.6123617150090814, + "grad_norm": 0.7886855602264404, + "learning_rate": 7.884448506456622e-06, + "loss": 0.7181, + "step": 11126 + }, + { + "epoch": 0.612416753811437, + "grad_norm": 0.7250227928161621, + "learning_rate": 7.884094430523999e-06, + "loss": 0.7537, + "step": 11127 + }, + { + "epoch": 0.6124717926137927, + "grad_norm": 0.6771906614303589, + "learning_rate": 7.883740332915295e-06, + "loss": 0.7642, + "step": 11128 + }, + { + "epoch": 0.6125268314161484, + "grad_norm": 0.8375886082649231, + "learning_rate": 7.88338621363317e-06, + "loss": 0.7231, + "step": 11129 + }, + { + "epoch": 0.6125818702185041, + "grad_norm": 0.6782773733139038, + "learning_rate": 7.883032072680285e-06, + "loss": 0.8391, + "step": 11130 + }, + { + "epoch": 0.6126369090208597, + "grad_norm": 0.7103945016860962, + "learning_rate": 7.882677910059304e-06, + "loss": 0.7838, + "step": 11131 + }, + { + "epoch": 0.6126919478232153, + "grad_norm": 0.7037224769592285, + "learning_rate": 7.882323725772887e-06, + "loss": 0.7906, + "step": 11132 + }, + { + "epoch": 0.612746986625571, + "grad_norm": 0.6872009634971619, + "learning_rate": 7.881969519823695e-06, + "loss": 0.7764, + "step": 11133 + }, + { + "epoch": 0.6128020254279267, + "grad_norm": 0.7377448678016663, + "learning_rate": 7.881615292214393e-06, + "loss": 0.8231, + "step": 11134 + }, + { + "epoch": 0.6128570642302823, + "grad_norm": 0.62479168176651, + "learning_rate": 7.881261042947642e-06, + "loss": 0.6522, + "step": 11135 + }, + { + "epoch": 0.612912103032638, + "grad_norm": 0.7989023923873901, + "learning_rate": 7.880906772026105e-06, + "loss": 0.7326, + "step": 11136 + }, + { + "epoch": 0.6129671418349937, + "grad_norm": 0.6322734951972961, + "learning_rate": 7.880552479452441e-06, + "loss": 0.6775, + "step": 11137 + }, + { + "epoch": 0.6130221806373494, + "grad_norm": 0.8628767132759094, + "learning_rate": 7.880198165229318e-06, + "loss": 0.7705, + "step": 11138 + }, + { + "epoch": 0.613077219439705, + "grad_norm": 0.7386173605918884, + "learning_rate": 7.879843829359396e-06, + "loss": 0.7297, + "step": 11139 + }, + { + "epoch": 0.6131322582420606, + "grad_norm": 0.6882045269012451, + "learning_rate": 7.879489471845339e-06, + "loss": 0.6875, + "step": 11140 + }, + { + "epoch": 0.6131872970444163, + "grad_norm": 0.5986032485961914, + "learning_rate": 7.879135092689809e-06, + "loss": 0.6329, + "step": 11141 + }, + { + "epoch": 0.613242335846772, + "grad_norm": 0.7973099946975708, + "learning_rate": 7.878780691895472e-06, + "loss": 0.809, + "step": 11142 + }, + { + "epoch": 0.6132973746491276, + "grad_norm": 0.6828579902648926, + "learning_rate": 7.878426269464989e-06, + "loss": 0.7777, + "step": 11143 + }, + { + "epoch": 0.6133524134514833, + "grad_norm": 0.8179183006286621, + "learning_rate": 7.878071825401024e-06, + "loss": 0.7275, + "step": 11144 + }, + { + "epoch": 0.613407452253839, + "grad_norm": 0.7290762066841125, + "learning_rate": 7.877717359706242e-06, + "loss": 0.7424, + "step": 11145 + }, + { + "epoch": 0.6134624910561947, + "grad_norm": 0.732510507106781, + "learning_rate": 7.877362872383305e-06, + "loss": 0.6157, + "step": 11146 + }, + { + "epoch": 0.6135175298585502, + "grad_norm": 0.9205982685089111, + "learning_rate": 7.877008363434881e-06, + "loss": 0.7723, + "step": 11147 + }, + { + "epoch": 0.6135725686609059, + "grad_norm": 0.7138587832450867, + "learning_rate": 7.876653832863633e-06, + "loss": 0.7773, + "step": 11148 + }, + { + "epoch": 0.6136276074632616, + "grad_norm": 0.7323171496391296, + "learning_rate": 7.876299280672224e-06, + "loss": 0.8265, + "step": 11149 + }, + { + "epoch": 0.6136826462656173, + "grad_norm": 0.6717494130134583, + "learning_rate": 7.875944706863318e-06, + "loss": 0.788, + "step": 11150 + }, + { + "epoch": 0.6137376850679729, + "grad_norm": 0.7779331207275391, + "learning_rate": 7.875590111439582e-06, + "loss": 0.7864, + "step": 11151 + }, + { + "epoch": 0.6137927238703286, + "grad_norm": 0.6706684827804565, + "learning_rate": 7.875235494403683e-06, + "loss": 0.6673, + "step": 11152 + }, + { + "epoch": 0.6138477626726843, + "grad_norm": 0.7142137885093689, + "learning_rate": 7.874880855758281e-06, + "loss": 0.8031, + "step": 11153 + }, + { + "epoch": 0.61390280147504, + "grad_norm": 0.6962595582008362, + "learning_rate": 7.874526195506045e-06, + "loss": 0.692, + "step": 11154 + }, + { + "epoch": 0.6139578402773955, + "grad_norm": 0.7237100601196289, + "learning_rate": 7.874171513649638e-06, + "loss": 0.7504, + "step": 11155 + }, + { + "epoch": 0.6140128790797512, + "grad_norm": 0.8235127925872803, + "learning_rate": 7.87381681019173e-06, + "loss": 0.8132, + "step": 11156 + }, + { + "epoch": 0.6140679178821069, + "grad_norm": 0.7483351826667786, + "learning_rate": 7.873462085134981e-06, + "loss": 0.7589, + "step": 11157 + }, + { + "epoch": 0.6141229566844626, + "grad_norm": 0.7309976816177368, + "learning_rate": 7.873107338482062e-06, + "loss": 0.7722, + "step": 11158 + }, + { + "epoch": 0.6141779954868182, + "grad_norm": 0.8871245384216309, + "learning_rate": 7.872752570235639e-06, + "loss": 0.882, + "step": 11159 + }, + { + "epoch": 0.6142330342891739, + "grad_norm": 0.5987886190414429, + "learning_rate": 7.872397780398374e-06, + "loss": 0.6312, + "step": 11160 + }, + { + "epoch": 0.6142880730915296, + "grad_norm": 0.7320038080215454, + "learning_rate": 7.872042968972937e-06, + "loss": 0.7444, + "step": 11161 + }, + { + "epoch": 0.6143431118938852, + "grad_norm": 0.8111129999160767, + "learning_rate": 7.871688135961995e-06, + "loss": 0.7413, + "step": 11162 + }, + { + "epoch": 0.6143981506962408, + "grad_norm": 0.7497085332870483, + "learning_rate": 7.871333281368211e-06, + "loss": 0.8413, + "step": 11163 + }, + { + "epoch": 0.6144531894985965, + "grad_norm": 0.8341198563575745, + "learning_rate": 7.870978405194256e-06, + "loss": 0.7959, + "step": 11164 + }, + { + "epoch": 0.6145082283009522, + "grad_norm": 0.6293482780456543, + "learning_rate": 7.870623507442797e-06, + "loss": 0.6429, + "step": 11165 + }, + { + "epoch": 0.6145632671033079, + "grad_norm": 1.2423945665359497, + "learning_rate": 7.870268588116499e-06, + "loss": 0.6309, + "step": 11166 + }, + { + "epoch": 0.6146183059056635, + "grad_norm": 0.7811731100082397, + "learning_rate": 7.86991364721803e-06, + "loss": 0.738, + "step": 11167 + }, + { + "epoch": 0.6146733447080192, + "grad_norm": 0.6904361248016357, + "learning_rate": 7.869558684750061e-06, + "loss": 0.7995, + "step": 11168 + }, + { + "epoch": 0.6147283835103748, + "grad_norm": 0.7267210483551025, + "learning_rate": 7.869203700715254e-06, + "loss": 0.6989, + "step": 11169 + }, + { + "epoch": 0.6147834223127304, + "grad_norm": 0.7183068990707397, + "learning_rate": 7.868848695116282e-06, + "loss": 0.7872, + "step": 11170 + }, + { + "epoch": 0.6148384611150861, + "grad_norm": 0.6774286031723022, + "learning_rate": 7.868493667955808e-06, + "loss": 0.7502, + "step": 11171 + }, + { + "epoch": 0.6148934999174418, + "grad_norm": 0.7587934732437134, + "learning_rate": 7.868138619236507e-06, + "loss": 0.8037, + "step": 11172 + }, + { + "epoch": 0.6149485387197975, + "grad_norm": 0.6825854182243347, + "learning_rate": 7.867783548961043e-06, + "loss": 0.7924, + "step": 11173 + }, + { + "epoch": 0.6150035775221531, + "grad_norm": 0.6243380904197693, + "learning_rate": 7.867428457132084e-06, + "loss": 0.5953, + "step": 11174 + }, + { + "epoch": 0.6150586163245088, + "grad_norm": 0.6630006432533264, + "learning_rate": 7.8670733437523e-06, + "loss": 0.7102, + "step": 11175 + }, + { + "epoch": 0.6151136551268644, + "grad_norm": 0.7059652805328369, + "learning_rate": 7.866718208824362e-06, + "loss": 0.6847, + "step": 11176 + }, + { + "epoch": 0.6151686939292201, + "grad_norm": 0.6768305897712708, + "learning_rate": 7.866363052350938e-06, + "loss": 0.7152, + "step": 11177 + }, + { + "epoch": 0.6152237327315757, + "grad_norm": 0.6850628852844238, + "learning_rate": 7.866007874334696e-06, + "loss": 0.767, + "step": 11178 + }, + { + "epoch": 0.6152787715339314, + "grad_norm": 0.6767143607139587, + "learning_rate": 7.865652674778305e-06, + "loss": 0.6826, + "step": 11179 + }, + { + "epoch": 0.6153338103362871, + "grad_norm": 0.8240014314651489, + "learning_rate": 7.865297453684436e-06, + "loss": 0.8493, + "step": 11180 + }, + { + "epoch": 0.6153888491386428, + "grad_norm": 0.7725485563278198, + "learning_rate": 7.864942211055758e-06, + "loss": 0.8704, + "step": 11181 + }, + { + "epoch": 0.6154438879409984, + "grad_norm": 0.9260931015014648, + "learning_rate": 7.864586946894941e-06, + "loss": 0.7926, + "step": 11182 + }, + { + "epoch": 0.615498926743354, + "grad_norm": 0.7558152079582214, + "learning_rate": 7.864231661204655e-06, + "loss": 0.8436, + "step": 11183 + }, + { + "epoch": 0.6155539655457097, + "grad_norm": 0.7899817824363708, + "learning_rate": 7.863876353987571e-06, + "loss": 0.7579, + "step": 11184 + }, + { + "epoch": 0.6156090043480654, + "grad_norm": 0.7757478952407837, + "learning_rate": 7.863521025246362e-06, + "loss": 0.7534, + "step": 11185 + }, + { + "epoch": 0.615664043150421, + "grad_norm": 0.6563131809234619, + "learning_rate": 7.863165674983693e-06, + "loss": 0.728, + "step": 11186 + }, + { + "epoch": 0.6157190819527767, + "grad_norm": 0.6516488790512085, + "learning_rate": 7.862810303202234e-06, + "loss": 0.736, + "step": 11187 + }, + { + "epoch": 0.6157741207551324, + "grad_norm": 0.6867820620536804, + "learning_rate": 7.862454909904665e-06, + "loss": 0.8032, + "step": 11188 + }, + { + "epoch": 0.6158291595574881, + "grad_norm": 0.7399753928184509, + "learning_rate": 7.862099495093647e-06, + "loss": 0.8681, + "step": 11189 + }, + { + "epoch": 0.6158841983598436, + "grad_norm": 0.7249311804771423, + "learning_rate": 7.861744058771857e-06, + "loss": 0.7868, + "step": 11190 + }, + { + "epoch": 0.6159392371621993, + "grad_norm": 0.8579045534133911, + "learning_rate": 7.861388600941964e-06, + "loss": 0.7915, + "step": 11191 + }, + { + "epoch": 0.615994275964555, + "grad_norm": 0.6855454444885254, + "learning_rate": 7.86103312160664e-06, + "loss": 0.8442, + "step": 11192 + }, + { + "epoch": 0.6160493147669107, + "grad_norm": 0.7412910461425781, + "learning_rate": 7.860677620768558e-06, + "loss": 0.7684, + "step": 11193 + }, + { + "epoch": 0.6161043535692663, + "grad_norm": 0.8567430377006531, + "learning_rate": 7.860322098430389e-06, + "loss": 0.8801, + "step": 11194 + }, + { + "epoch": 0.616159392371622, + "grad_norm": 0.7504804134368896, + "learning_rate": 7.859966554594802e-06, + "loss": 0.7359, + "step": 11195 + }, + { + "epoch": 0.6162144311739777, + "grad_norm": 0.7086803317070007, + "learning_rate": 7.859610989264474e-06, + "loss": 0.8498, + "step": 11196 + }, + { + "epoch": 0.6162694699763334, + "grad_norm": 0.7201757431030273, + "learning_rate": 7.859255402442075e-06, + "loss": 0.608, + "step": 11197 + }, + { + "epoch": 0.6163245087786889, + "grad_norm": 0.8968291282653809, + "learning_rate": 7.858899794130279e-06, + "loss": 0.8067, + "step": 11198 + }, + { + "epoch": 0.6163795475810446, + "grad_norm": 0.7474254965782166, + "learning_rate": 7.858544164331756e-06, + "loss": 0.8355, + "step": 11199 + }, + { + "epoch": 0.6164345863834003, + "grad_norm": 0.6907560229301453, + "learning_rate": 7.85818851304918e-06, + "loss": 0.788, + "step": 11200 + }, + { + "epoch": 0.616489625185756, + "grad_norm": 0.725330650806427, + "learning_rate": 7.857832840285224e-06, + "loss": 0.8157, + "step": 11201 + }, + { + "epoch": 0.6165446639881116, + "grad_norm": 0.682722270488739, + "learning_rate": 7.857477146042562e-06, + "loss": 0.7939, + "step": 11202 + }, + { + "epoch": 0.6165997027904673, + "grad_norm": 0.661533534526825, + "learning_rate": 7.857121430323866e-06, + "loss": 0.7173, + "step": 11203 + }, + { + "epoch": 0.616654741592823, + "grad_norm": 0.6922706961631775, + "learning_rate": 7.856765693131811e-06, + "loss": 0.7719, + "step": 11204 + }, + { + "epoch": 0.6167097803951787, + "grad_norm": 0.72809898853302, + "learning_rate": 7.856409934469071e-06, + "loss": 0.7362, + "step": 11205 + }, + { + "epoch": 0.6167648191975342, + "grad_norm": 0.7540956735610962, + "learning_rate": 7.856054154338317e-06, + "loss": 0.7883, + "step": 11206 + }, + { + "epoch": 0.6168198579998899, + "grad_norm": 0.6777094006538391, + "learning_rate": 7.855698352742224e-06, + "loss": 0.6938, + "step": 11207 + }, + { + "epoch": 0.6168748968022456, + "grad_norm": 0.6771852970123291, + "learning_rate": 7.855342529683467e-06, + "loss": 0.697, + "step": 11208 + }, + { + "epoch": 0.6169299356046013, + "grad_norm": 0.7810118198394775, + "learning_rate": 7.854986685164721e-06, + "loss": 0.6875, + "step": 11209 + }, + { + "epoch": 0.6169849744069569, + "grad_norm": 0.6992766261100769, + "learning_rate": 7.854630819188658e-06, + "loss": 0.6553, + "step": 11210 + }, + { + "epoch": 0.6170400132093126, + "grad_norm": 0.7409703135490417, + "learning_rate": 7.854274931757954e-06, + "loss": 0.7685, + "step": 11211 + }, + { + "epoch": 0.6170950520116683, + "grad_norm": 0.7263410687446594, + "learning_rate": 7.853919022875285e-06, + "loss": 0.7939, + "step": 11212 + }, + { + "epoch": 0.6171500908140238, + "grad_norm": 0.8451918959617615, + "learning_rate": 7.853563092543323e-06, + "loss": 0.7522, + "step": 11213 + }, + { + "epoch": 0.6172051296163795, + "grad_norm": 0.672926664352417, + "learning_rate": 7.853207140764745e-06, + "loss": 0.732, + "step": 11214 + }, + { + "epoch": 0.6172601684187352, + "grad_norm": 0.6607885956764221, + "learning_rate": 7.852851167542226e-06, + "loss": 0.7441, + "step": 11215 + }, + { + "epoch": 0.6173152072210909, + "grad_norm": 0.730385422706604, + "learning_rate": 7.85249517287844e-06, + "loss": 0.7925, + "step": 11216 + }, + { + "epoch": 0.6173702460234465, + "grad_norm": 0.7338821887969971, + "learning_rate": 7.852139156776067e-06, + "loss": 0.8106, + "step": 11217 + }, + { + "epoch": 0.6174252848258022, + "grad_norm": 0.7662163376808167, + "learning_rate": 7.851783119237777e-06, + "loss": 0.8166, + "step": 11218 + }, + { + "epoch": 0.6174803236281579, + "grad_norm": 0.7738409042358398, + "learning_rate": 7.85142706026625e-06, + "loss": 0.7898, + "step": 11219 + }, + { + "epoch": 0.6175353624305135, + "grad_norm": 0.8129978775978088, + "learning_rate": 7.851070979864159e-06, + "loss": 0.7618, + "step": 11220 + }, + { + "epoch": 0.6175904012328691, + "grad_norm": 0.7923482060432434, + "learning_rate": 7.850714878034183e-06, + "loss": 0.7341, + "step": 11221 + }, + { + "epoch": 0.6176454400352248, + "grad_norm": 0.7189306020736694, + "learning_rate": 7.850358754778996e-06, + "loss": 0.7775, + "step": 11222 + }, + { + "epoch": 0.6177004788375805, + "grad_norm": 0.9873724579811096, + "learning_rate": 7.850002610101276e-06, + "loss": 0.8521, + "step": 11223 + }, + { + "epoch": 0.6177555176399362, + "grad_norm": 0.6350038051605225, + "learning_rate": 7.8496464440037e-06, + "loss": 0.6356, + "step": 11224 + }, + { + "epoch": 0.6178105564422918, + "grad_norm": 0.8059771060943604, + "learning_rate": 7.849290256488941e-06, + "loss": 0.821, + "step": 11225 + }, + { + "epoch": 0.6178655952446475, + "grad_norm": 0.7469610571861267, + "learning_rate": 7.848934047559684e-06, + "loss": 0.7782, + "step": 11226 + }, + { + "epoch": 0.6179206340470031, + "grad_norm": 0.6423176527023315, + "learning_rate": 7.848577817218597e-06, + "loss": 0.6693, + "step": 11227 + }, + { + "epoch": 0.6179756728493588, + "grad_norm": 0.7298387885093689, + "learning_rate": 7.848221565468363e-06, + "loss": 0.775, + "step": 11228 + }, + { + "epoch": 0.6180307116517144, + "grad_norm": 0.7125145196914673, + "learning_rate": 7.84786529231166e-06, + "loss": 0.7507, + "step": 11229 + }, + { + "epoch": 0.6180857504540701, + "grad_norm": 0.6658627390861511, + "learning_rate": 7.847508997751163e-06, + "loss": 0.7506, + "step": 11230 + }, + { + "epoch": 0.6181407892564258, + "grad_norm": 0.6425275206565857, + "learning_rate": 7.847152681789549e-06, + "loss": 0.657, + "step": 11231 + }, + { + "epoch": 0.6181958280587815, + "grad_norm": 0.8075960278511047, + "learning_rate": 7.846796344429498e-06, + "loss": 0.5434, + "step": 11232 + }, + { + "epoch": 0.6182508668611371, + "grad_norm": 0.8481889367103577, + "learning_rate": 7.846439985673689e-06, + "loss": 0.8303, + "step": 11233 + }, + { + "epoch": 0.6183059056634927, + "grad_norm": 0.7216358184814453, + "learning_rate": 7.846083605524799e-06, + "loss": 0.7589, + "step": 11234 + }, + { + "epoch": 0.6183609444658484, + "grad_norm": 0.8399745225906372, + "learning_rate": 7.845727203985504e-06, + "loss": 0.8096, + "step": 11235 + }, + { + "epoch": 0.6184159832682041, + "grad_norm": 0.6708692908287048, + "learning_rate": 7.845370781058489e-06, + "loss": 0.6858, + "step": 11236 + }, + { + "epoch": 0.6184710220705597, + "grad_norm": 0.6309100389480591, + "learning_rate": 7.845014336746426e-06, + "loss": 0.6093, + "step": 11237 + }, + { + "epoch": 0.6185260608729154, + "grad_norm": 0.8138728141784668, + "learning_rate": 7.844657871051997e-06, + "loss": 0.8259, + "step": 11238 + }, + { + "epoch": 0.6185810996752711, + "grad_norm": 0.6763564348220825, + "learning_rate": 7.844301383977882e-06, + "loss": 0.7056, + "step": 11239 + }, + { + "epoch": 0.6186361384776268, + "grad_norm": 0.792085587978363, + "learning_rate": 7.843944875526758e-06, + "loss": 0.7364, + "step": 11240 + }, + { + "epoch": 0.6186911772799824, + "grad_norm": 0.8738027811050415, + "learning_rate": 7.843588345701306e-06, + "loss": 0.7092, + "step": 11241 + }, + { + "epoch": 0.618746216082338, + "grad_norm": 0.7694413065910339, + "learning_rate": 7.843231794504205e-06, + "loss": 0.852, + "step": 11242 + }, + { + "epoch": 0.6188012548846937, + "grad_norm": 0.8211640119552612, + "learning_rate": 7.842875221938135e-06, + "loss": 0.8218, + "step": 11243 + }, + { + "epoch": 0.6188562936870494, + "grad_norm": 0.620566189289093, + "learning_rate": 7.842518628005776e-06, + "loss": 0.7176, + "step": 11244 + }, + { + "epoch": 0.618911332489405, + "grad_norm": 0.7044099569320679, + "learning_rate": 7.84216201270981e-06, + "loss": 0.8068, + "step": 11245 + }, + { + "epoch": 0.6189663712917607, + "grad_norm": 0.765209436416626, + "learning_rate": 7.841805376052912e-06, + "loss": 0.8002, + "step": 11246 + }, + { + "epoch": 0.6190214100941164, + "grad_norm": 0.7565444707870483, + "learning_rate": 7.841448718037765e-06, + "loss": 0.7997, + "step": 11247 + }, + { + "epoch": 0.6190764488964721, + "grad_norm": 0.9544101357460022, + "learning_rate": 7.841092038667052e-06, + "loss": 0.647, + "step": 11248 + }, + { + "epoch": 0.6191314876988276, + "grad_norm": 0.7319634556770325, + "learning_rate": 7.840735337943452e-06, + "loss": 0.7982, + "step": 11249 + }, + { + "epoch": 0.6191865265011833, + "grad_norm": 0.6017479300498962, + "learning_rate": 7.840378615869645e-06, + "loss": 0.6817, + "step": 11250 + }, + { + "epoch": 0.619241565303539, + "grad_norm": 0.6936477422714233, + "learning_rate": 7.840021872448312e-06, + "loss": 0.7227, + "step": 11251 + }, + { + "epoch": 0.6192966041058947, + "grad_norm": 0.6962631940841675, + "learning_rate": 7.839665107682135e-06, + "loss": 0.779, + "step": 11252 + }, + { + "epoch": 0.6193516429082503, + "grad_norm": 0.9580947160720825, + "learning_rate": 7.839308321573797e-06, + "loss": 0.8821, + "step": 11253 + }, + { + "epoch": 0.619406681710606, + "grad_norm": 0.7721261978149414, + "learning_rate": 7.838951514125977e-06, + "loss": 0.7146, + "step": 11254 + }, + { + "epoch": 0.6194617205129617, + "grad_norm": 0.7349434494972229, + "learning_rate": 7.838594685341354e-06, + "loss": 0.7601, + "step": 11255 + }, + { + "epoch": 0.6195167593153172, + "grad_norm": 0.6787356734275818, + "learning_rate": 7.838237835222618e-06, + "loss": 0.706, + "step": 11256 + }, + { + "epoch": 0.6195717981176729, + "grad_norm": 0.7658288478851318, + "learning_rate": 7.837880963772445e-06, + "loss": 0.7102, + "step": 11257 + }, + { + "epoch": 0.6196268369200286, + "grad_norm": 0.8083927035331726, + "learning_rate": 7.837524070993516e-06, + "loss": 0.8501, + "step": 11258 + }, + { + "epoch": 0.6196818757223843, + "grad_norm": 0.7656283974647522, + "learning_rate": 7.837167156888516e-06, + "loss": 0.7558, + "step": 11259 + }, + { + "epoch": 0.6197369145247399, + "grad_norm": 0.7897886037826538, + "learning_rate": 7.836810221460128e-06, + "loss": 0.7567, + "step": 11260 + }, + { + "epoch": 0.6197919533270956, + "grad_norm": 0.6858190298080444, + "learning_rate": 7.836453264711035e-06, + "loss": 0.717, + "step": 11261 + }, + { + "epoch": 0.6198469921294513, + "grad_norm": 0.7423431873321533, + "learning_rate": 7.836096286643917e-06, + "loss": 0.7047, + "step": 11262 + }, + { + "epoch": 0.619902030931807, + "grad_norm": 0.8277921676635742, + "learning_rate": 7.835739287261458e-06, + "loss": 0.7418, + "step": 11263 + }, + { + "epoch": 0.6199570697341625, + "grad_norm": 0.7102510929107666, + "learning_rate": 7.835382266566343e-06, + "loss": 0.8202, + "step": 11264 + }, + { + "epoch": 0.6200121085365182, + "grad_norm": 0.6705429553985596, + "learning_rate": 7.835025224561252e-06, + "loss": 0.7332, + "step": 11265 + }, + { + "epoch": 0.6200671473388739, + "grad_norm": 0.6529950499534607, + "learning_rate": 7.834668161248873e-06, + "loss": 0.7579, + "step": 11266 + }, + { + "epoch": 0.6201221861412296, + "grad_norm": 0.7189938426017761, + "learning_rate": 7.834311076631885e-06, + "loss": 0.7323, + "step": 11267 + }, + { + "epoch": 0.6201772249435852, + "grad_norm": 0.6559470891952515, + "learning_rate": 7.833953970712973e-06, + "loss": 0.5973, + "step": 11268 + }, + { + "epoch": 0.6202322637459409, + "grad_norm": 0.7971723675727844, + "learning_rate": 7.833596843494824e-06, + "loss": 0.804, + "step": 11269 + }, + { + "epoch": 0.6202873025482966, + "grad_norm": 0.7800958752632141, + "learning_rate": 7.833239694980118e-06, + "loss": 0.772, + "step": 11270 + }, + { + "epoch": 0.6203423413506522, + "grad_norm": 0.6831466555595398, + "learning_rate": 7.83288252517154e-06, + "loss": 0.7341, + "step": 11271 + }, + { + "epoch": 0.6203973801530078, + "grad_norm": 0.6504807472229004, + "learning_rate": 7.832525334071776e-06, + "loss": 0.6462, + "step": 11272 + }, + { + "epoch": 0.6204524189553635, + "grad_norm": 0.6973552703857422, + "learning_rate": 7.832168121683512e-06, + "loss": 0.7504, + "step": 11273 + }, + { + "epoch": 0.6205074577577192, + "grad_norm": 0.6772480607032776, + "learning_rate": 7.831810888009427e-06, + "loss": 0.7273, + "step": 11274 + }, + { + "epoch": 0.6205624965600749, + "grad_norm": 0.7077416777610779, + "learning_rate": 7.831453633052212e-06, + "loss": 0.7365, + "step": 11275 + }, + { + "epoch": 0.6206175353624305, + "grad_norm": 0.7338337898254395, + "learning_rate": 7.831096356814548e-06, + "loss": 0.7959, + "step": 11276 + }, + { + "epoch": 0.6206725741647862, + "grad_norm": 0.6313255429267883, + "learning_rate": 7.830739059299123e-06, + "loss": 0.7027, + "step": 11277 + }, + { + "epoch": 0.6207276129671419, + "grad_norm": 0.7377570867538452, + "learning_rate": 7.830381740508619e-06, + "loss": 0.6903, + "step": 11278 + }, + { + "epoch": 0.6207826517694975, + "grad_norm": 0.6868650317192078, + "learning_rate": 7.830024400445724e-06, + "loss": 0.6882, + "step": 11279 + }, + { + "epoch": 0.6208376905718531, + "grad_norm": 0.7632661461830139, + "learning_rate": 7.829667039113124e-06, + "loss": 0.8437, + "step": 11280 + }, + { + "epoch": 0.6208927293742088, + "grad_norm": 0.9241608381271362, + "learning_rate": 7.829309656513504e-06, + "loss": 0.779, + "step": 11281 + }, + { + "epoch": 0.6209477681765645, + "grad_norm": 0.6857842206954956, + "learning_rate": 7.828952252649551e-06, + "loss": 0.7882, + "step": 11282 + }, + { + "epoch": 0.6210028069789202, + "grad_norm": 0.695659875869751, + "learning_rate": 7.828594827523947e-06, + "loss": 0.7471, + "step": 11283 + }, + { + "epoch": 0.6210578457812758, + "grad_norm": 0.6398521661758423, + "learning_rate": 7.828237381139383e-06, + "loss": 0.7328, + "step": 11284 + }, + { + "epoch": 0.6211128845836315, + "grad_norm": 0.7386063933372498, + "learning_rate": 7.827879913498544e-06, + "loss": 0.748, + "step": 11285 + }, + { + "epoch": 0.6211679233859871, + "grad_norm": 0.6740923523902893, + "learning_rate": 7.827522424604117e-06, + "loss": 0.6866, + "step": 11286 + }, + { + "epoch": 0.6212229621883428, + "grad_norm": 0.6794413924217224, + "learning_rate": 7.82716491445879e-06, + "loss": 0.7299, + "step": 11287 + }, + { + "epoch": 0.6212780009906984, + "grad_norm": 0.6471715569496155, + "learning_rate": 7.826807383065245e-06, + "loss": 0.7071, + "step": 11288 + }, + { + "epoch": 0.6213330397930541, + "grad_norm": 0.9716162085533142, + "learning_rate": 7.826449830426174e-06, + "loss": 0.7417, + "step": 11289 + }, + { + "epoch": 0.6213880785954098, + "grad_norm": 0.6928716897964478, + "learning_rate": 7.826092256544263e-06, + "loss": 0.7757, + "step": 11290 + }, + { + "epoch": 0.6214431173977655, + "grad_norm": 0.6739227175712585, + "learning_rate": 7.825734661422197e-06, + "loss": 0.7576, + "step": 11291 + }, + { + "epoch": 0.621498156200121, + "grad_norm": 1.2619935274124146, + "learning_rate": 7.825377045062668e-06, + "loss": 0.7454, + "step": 11292 + }, + { + "epoch": 0.6215531950024767, + "grad_norm": 0.6713572144508362, + "learning_rate": 7.825019407468361e-06, + "loss": 0.7916, + "step": 11293 + }, + { + "epoch": 0.6216082338048324, + "grad_norm": 0.6143541932106018, + "learning_rate": 7.824661748641964e-06, + "loss": 0.6765, + "step": 11294 + }, + { + "epoch": 0.6216632726071881, + "grad_norm": 0.7141658067703247, + "learning_rate": 7.824304068586163e-06, + "loss": 0.7773, + "step": 11295 + }, + { + "epoch": 0.6217183114095437, + "grad_norm": 0.7320290803909302, + "learning_rate": 7.823946367303653e-06, + "loss": 0.8062, + "step": 11296 + }, + { + "epoch": 0.6217733502118994, + "grad_norm": 0.7523403167724609, + "learning_rate": 7.823588644797115e-06, + "loss": 0.7126, + "step": 11297 + }, + { + "epoch": 0.6218283890142551, + "grad_norm": 0.6512221097946167, + "learning_rate": 7.823230901069242e-06, + "loss": 0.7563, + "step": 11298 + }, + { + "epoch": 0.6218834278166107, + "grad_norm": 0.6512733697891235, + "learning_rate": 7.82287313612272e-06, + "loss": 0.7603, + "step": 11299 + }, + { + "epoch": 0.6219384666189663, + "grad_norm": 1.0590927600860596, + "learning_rate": 7.82251534996024e-06, + "loss": 0.8325, + "step": 11300 + }, + { + "epoch": 0.621993505421322, + "grad_norm": 0.6763397455215454, + "learning_rate": 7.82215754258449e-06, + "loss": 0.7915, + "step": 11301 + }, + { + "epoch": 0.6220485442236777, + "grad_norm": 0.6640639901161194, + "learning_rate": 7.82179971399816e-06, + "loss": 0.6953, + "step": 11302 + }, + { + "epoch": 0.6221035830260333, + "grad_norm": 0.6611515283584595, + "learning_rate": 7.821441864203938e-06, + "loss": 0.8331, + "step": 11303 + }, + { + "epoch": 0.622158621828389, + "grad_norm": 0.8226057887077332, + "learning_rate": 7.821083993204514e-06, + "loss": 0.7448, + "step": 11304 + }, + { + "epoch": 0.6222136606307447, + "grad_norm": 0.6798059940338135, + "learning_rate": 7.820726101002578e-06, + "loss": 0.717, + "step": 11305 + }, + { + "epoch": 0.6222686994331004, + "grad_norm": 0.7623499631881714, + "learning_rate": 7.820368187600821e-06, + "loss": 0.7343, + "step": 11306 + }, + { + "epoch": 0.622323738235456, + "grad_norm": 0.703886866569519, + "learning_rate": 7.82001025300193e-06, + "loss": 0.8008, + "step": 11307 + }, + { + "epoch": 0.6223787770378116, + "grad_norm": 0.6817659735679626, + "learning_rate": 7.819652297208597e-06, + "loss": 0.7534, + "step": 11308 + }, + { + "epoch": 0.6224338158401673, + "grad_norm": 0.8991402983665466, + "learning_rate": 7.819294320223513e-06, + "loss": 0.6236, + "step": 11309 + }, + { + "epoch": 0.622488854642523, + "grad_norm": 0.791199803352356, + "learning_rate": 7.818936322049366e-06, + "loss": 0.772, + "step": 11310 + }, + { + "epoch": 0.6225438934448786, + "grad_norm": 0.6401470303535461, + "learning_rate": 7.81857830268885e-06, + "loss": 0.7749, + "step": 11311 + }, + { + "epoch": 0.6225989322472343, + "grad_norm": 0.6731516122817993, + "learning_rate": 7.818220262144653e-06, + "loss": 0.7506, + "step": 11312 + }, + { + "epoch": 0.62265397104959, + "grad_norm": 0.7391661405563354, + "learning_rate": 7.817862200419467e-06, + "loss": 0.7288, + "step": 11313 + }, + { + "epoch": 0.6227090098519457, + "grad_norm": 0.7363784909248352, + "learning_rate": 7.817504117515984e-06, + "loss": 0.7087, + "step": 11314 + }, + { + "epoch": 0.6227640486543012, + "grad_norm": 0.7609296441078186, + "learning_rate": 7.817146013436893e-06, + "loss": 0.7553, + "step": 11315 + }, + { + "epoch": 0.6228190874566569, + "grad_norm": 0.6818829774856567, + "learning_rate": 7.816787888184886e-06, + "loss": 0.7534, + "step": 11316 + }, + { + "epoch": 0.6228741262590126, + "grad_norm": 0.7434844374656677, + "learning_rate": 7.816429741762657e-06, + "loss": 0.8008, + "step": 11317 + }, + { + "epoch": 0.6229291650613683, + "grad_norm": 0.6881742477416992, + "learning_rate": 7.816071574172895e-06, + "loss": 0.7324, + "step": 11318 + }, + { + "epoch": 0.6229842038637239, + "grad_norm": 0.7109540104866028, + "learning_rate": 7.815713385418293e-06, + "loss": 0.7954, + "step": 11319 + }, + { + "epoch": 0.6230392426660796, + "grad_norm": 0.6868860721588135, + "learning_rate": 7.815355175501542e-06, + "loss": 0.6703, + "step": 11320 + }, + { + "epoch": 0.6230942814684353, + "grad_norm": 0.7851449847221375, + "learning_rate": 7.814996944425337e-06, + "loss": 0.8321, + "step": 11321 + }, + { + "epoch": 0.623149320270791, + "grad_norm": 0.7966809272766113, + "learning_rate": 7.814638692192367e-06, + "loss": 0.7603, + "step": 11322 + }, + { + "epoch": 0.6232043590731465, + "grad_norm": 0.6612964272499084, + "learning_rate": 7.814280418805327e-06, + "loss": 0.8096, + "step": 11323 + }, + { + "epoch": 0.6232593978755022, + "grad_norm": 0.6398881077766418, + "learning_rate": 7.813922124266908e-06, + "loss": 0.7559, + "step": 11324 + }, + { + "epoch": 0.6233144366778579, + "grad_norm": 0.8062521815299988, + "learning_rate": 7.813563808579804e-06, + "loss": 0.7863, + "step": 11325 + }, + { + "epoch": 0.6233694754802136, + "grad_norm": 0.7083317041397095, + "learning_rate": 7.813205471746708e-06, + "loss": 0.7358, + "step": 11326 + }, + { + "epoch": 0.6234245142825692, + "grad_norm": 0.6190419793128967, + "learning_rate": 7.812847113770312e-06, + "loss": 0.637, + "step": 11327 + }, + { + "epoch": 0.6234795530849249, + "grad_norm": 0.7036548256874084, + "learning_rate": 7.812488734653309e-06, + "loss": 0.8049, + "step": 11328 + }, + { + "epoch": 0.6235345918872806, + "grad_norm": 0.7952288389205933, + "learning_rate": 7.812130334398395e-06, + "loss": 0.781, + "step": 11329 + }, + { + "epoch": 0.6235896306896362, + "grad_norm": 0.7925593852996826, + "learning_rate": 7.811771913008262e-06, + "loss": 0.7913, + "step": 11330 + }, + { + "epoch": 0.6236446694919918, + "grad_norm": 0.7190900444984436, + "learning_rate": 7.811413470485604e-06, + "loss": 0.7464, + "step": 11331 + }, + { + "epoch": 0.6236997082943475, + "grad_norm": 0.6476338505744934, + "learning_rate": 7.811055006833114e-06, + "loss": 0.699, + "step": 11332 + }, + { + "epoch": 0.6237547470967032, + "grad_norm": 0.7412729263305664, + "learning_rate": 7.810696522053487e-06, + "loss": 0.7958, + "step": 11333 + }, + { + "epoch": 0.6238097858990589, + "grad_norm": 0.6646767854690552, + "learning_rate": 7.81033801614942e-06, + "loss": 0.6276, + "step": 11334 + }, + { + "epoch": 0.6238648247014145, + "grad_norm": 0.6912583112716675, + "learning_rate": 7.809979489123601e-06, + "loss": 0.7611, + "step": 11335 + }, + { + "epoch": 0.6239198635037702, + "grad_norm": 0.7324331998825073, + "learning_rate": 7.80962094097873e-06, + "loss": 0.7436, + "step": 11336 + }, + { + "epoch": 0.6239749023061258, + "grad_norm": 0.7046643495559692, + "learning_rate": 7.809262371717501e-06, + "loss": 0.7287, + "step": 11337 + }, + { + "epoch": 0.6240299411084815, + "grad_norm": 0.6013771891593933, + "learning_rate": 7.808903781342607e-06, + "loss": 0.6822, + "step": 11338 + }, + { + "epoch": 0.6240849799108371, + "grad_norm": 0.633074164390564, + "learning_rate": 7.808545169856745e-06, + "loss": 0.7758, + "step": 11339 + }, + { + "epoch": 0.6241400187131928, + "grad_norm": 0.6603411436080933, + "learning_rate": 7.808186537262608e-06, + "loss": 0.6797, + "step": 11340 + }, + { + "epoch": 0.6241950575155485, + "grad_norm": 0.8316327929496765, + "learning_rate": 7.807827883562894e-06, + "loss": 0.777, + "step": 11341 + }, + { + "epoch": 0.6242500963179041, + "grad_norm": 0.7954252362251282, + "learning_rate": 7.807469208760295e-06, + "loss": 0.6581, + "step": 11342 + }, + { + "epoch": 0.6243051351202598, + "grad_norm": 0.6108134984970093, + "learning_rate": 7.80711051285751e-06, + "loss": 0.7126, + "step": 11343 + }, + { + "epoch": 0.6243601739226154, + "grad_norm": 0.7224909067153931, + "learning_rate": 7.806751795857235e-06, + "loss": 0.8677, + "step": 11344 + }, + { + "epoch": 0.6244152127249711, + "grad_norm": 0.720923125743866, + "learning_rate": 7.806393057762165e-06, + "loss": 0.7174, + "step": 11345 + }, + { + "epoch": 0.6244702515273267, + "grad_norm": 0.6837444305419922, + "learning_rate": 7.806034298574993e-06, + "loss": 0.7431, + "step": 11346 + }, + { + "epoch": 0.6245252903296824, + "grad_norm": 0.8486534953117371, + "learning_rate": 7.80567551829842e-06, + "loss": 0.7955, + "step": 11347 + }, + { + "epoch": 0.6245803291320381, + "grad_norm": 0.6459395885467529, + "learning_rate": 7.805316716935143e-06, + "loss": 0.7681, + "step": 11348 + }, + { + "epoch": 0.6246353679343938, + "grad_norm": 0.8414636850357056, + "learning_rate": 7.804957894487854e-06, + "loss": 0.8985, + "step": 11349 + }, + { + "epoch": 0.6246904067367494, + "grad_norm": 0.7930828928947449, + "learning_rate": 7.804599050959254e-06, + "loss": 0.7389, + "step": 11350 + }, + { + "epoch": 0.624745445539105, + "grad_norm": 0.7102516889572144, + "learning_rate": 7.804240186352038e-06, + "loss": 0.8072, + "step": 11351 + }, + { + "epoch": 0.6248004843414607, + "grad_norm": 0.773341178894043, + "learning_rate": 7.803881300668901e-06, + "loss": 0.7531, + "step": 11352 + }, + { + "epoch": 0.6248555231438164, + "grad_norm": 0.6354981064796448, + "learning_rate": 7.803522393912544e-06, + "loss": 0.6761, + "step": 11353 + }, + { + "epoch": 0.624910561946172, + "grad_norm": 0.7833859324455261, + "learning_rate": 7.803163466085663e-06, + "loss": 0.7768, + "step": 11354 + }, + { + "epoch": 0.6249656007485277, + "grad_norm": 0.6982376575469971, + "learning_rate": 7.802804517190957e-06, + "loss": 0.7472, + "step": 11355 + }, + { + "epoch": 0.6250206395508834, + "grad_norm": 0.7214694023132324, + "learning_rate": 7.80244554723112e-06, + "loss": 0.7919, + "step": 11356 + }, + { + "epoch": 0.6250756783532391, + "grad_norm": 0.8002933859825134, + "learning_rate": 7.802086556208855e-06, + "loss": 0.8278, + "step": 11357 + }, + { + "epoch": 0.6251307171555947, + "grad_norm": 0.7619680762290955, + "learning_rate": 7.801727544126858e-06, + "loss": 0.7775, + "step": 11358 + }, + { + "epoch": 0.6251857559579503, + "grad_norm": 0.6340392827987671, + "learning_rate": 7.801368510987825e-06, + "loss": 0.7324, + "step": 11359 + }, + { + "epoch": 0.625240794760306, + "grad_norm": 0.6754844784736633, + "learning_rate": 7.801009456794457e-06, + "loss": 0.7296, + "step": 11360 + }, + { + "epoch": 0.6252958335626617, + "grad_norm": 0.6871771216392517, + "learning_rate": 7.80065038154945e-06, + "loss": 0.7398, + "step": 11361 + }, + { + "epoch": 0.6253508723650173, + "grad_norm": 0.6610772013664246, + "learning_rate": 7.800291285255505e-06, + "loss": 0.738, + "step": 11362 + }, + { + "epoch": 0.625405911167373, + "grad_norm": 0.6858081221580505, + "learning_rate": 7.799932167915322e-06, + "loss": 0.7353, + "step": 11363 + }, + { + "epoch": 0.6254609499697287, + "grad_norm": 0.6698840856552124, + "learning_rate": 7.799573029531597e-06, + "loss": 0.7505, + "step": 11364 + }, + { + "epoch": 0.6255159887720844, + "grad_norm": 0.7374000549316406, + "learning_rate": 7.799213870107031e-06, + "loss": 0.7974, + "step": 11365 + }, + { + "epoch": 0.6255710275744399, + "grad_norm": 0.6962621808052063, + "learning_rate": 7.798854689644324e-06, + "loss": 0.8183, + "step": 11366 + }, + { + "epoch": 0.6256260663767956, + "grad_norm": 0.8477681279182434, + "learning_rate": 7.798495488146173e-06, + "loss": 0.7533, + "step": 11367 + }, + { + "epoch": 0.6256811051791513, + "grad_norm": 0.6963459253311157, + "learning_rate": 7.798136265615278e-06, + "loss": 0.6362, + "step": 11368 + }, + { + "epoch": 0.625736143981507, + "grad_norm": 0.7125601172447205, + "learning_rate": 7.79777702205434e-06, + "loss": 0.7296, + "step": 11369 + }, + { + "epoch": 0.6257911827838626, + "grad_norm": 0.6650554537773132, + "learning_rate": 7.79741775746606e-06, + "loss": 0.8231, + "step": 11370 + }, + { + "epoch": 0.6258462215862183, + "grad_norm": 0.6556620597839355, + "learning_rate": 7.797058471853138e-06, + "loss": 0.6952, + "step": 11371 + }, + { + "epoch": 0.625901260388574, + "grad_norm": 0.6350956559181213, + "learning_rate": 7.79669916521827e-06, + "loss": 0.686, + "step": 11372 + }, + { + "epoch": 0.6259562991909297, + "grad_norm": 0.6346702575683594, + "learning_rate": 7.796339837564163e-06, + "loss": 0.7234, + "step": 11373 + }, + { + "epoch": 0.6260113379932852, + "grad_norm": 0.741437554359436, + "learning_rate": 7.795980488893514e-06, + "loss": 0.8096, + "step": 11374 + }, + { + "epoch": 0.6260663767956409, + "grad_norm": 0.7057582139968872, + "learning_rate": 7.795621119209021e-06, + "loss": 0.8022, + "step": 11375 + }, + { + "epoch": 0.6261214155979966, + "grad_norm": 0.658107578754425, + "learning_rate": 7.79526172851339e-06, + "loss": 0.7564, + "step": 11376 + }, + { + "epoch": 0.6261764544003523, + "grad_norm": 0.7974086403846741, + "learning_rate": 7.79490231680932e-06, + "loss": 0.7721, + "step": 11377 + }, + { + "epoch": 0.6262314932027079, + "grad_norm": 0.6669130921363831, + "learning_rate": 7.794542884099513e-06, + "loss": 0.7652, + "step": 11378 + }, + { + "epoch": 0.6262865320050636, + "grad_norm": 0.7364919185638428, + "learning_rate": 7.794183430386669e-06, + "loss": 0.8679, + "step": 11379 + }, + { + "epoch": 0.6263415708074193, + "grad_norm": 0.7383667230606079, + "learning_rate": 7.793823955673489e-06, + "loss": 0.7715, + "step": 11380 + }, + { + "epoch": 0.626396609609775, + "grad_norm": 0.6688774228096008, + "learning_rate": 7.793464459962679e-06, + "loss": 0.7503, + "step": 11381 + }, + { + "epoch": 0.6264516484121305, + "grad_norm": 0.6771709322929382, + "learning_rate": 7.793104943256935e-06, + "loss": 0.7479, + "step": 11382 + }, + { + "epoch": 0.6265066872144862, + "grad_norm": 0.7121349573135376, + "learning_rate": 7.792745405558964e-06, + "loss": 0.7655, + "step": 11383 + }, + { + "epoch": 0.6265617260168419, + "grad_norm": 0.6315643787384033, + "learning_rate": 7.792385846871465e-06, + "loss": 0.7418, + "step": 11384 + }, + { + "epoch": 0.6266167648191975, + "grad_norm": 0.6701569557189941, + "learning_rate": 7.792026267197142e-06, + "loss": 0.7669, + "step": 11385 + }, + { + "epoch": 0.6266718036215532, + "grad_norm": 0.6890652179718018, + "learning_rate": 7.791666666538697e-06, + "loss": 0.7659, + "step": 11386 + }, + { + "epoch": 0.6267268424239089, + "grad_norm": 0.7636297345161438, + "learning_rate": 7.791307044898833e-06, + "loss": 0.7272, + "step": 11387 + }, + { + "epoch": 0.6267818812262645, + "grad_norm": 0.6563602089881897, + "learning_rate": 7.790947402280252e-06, + "loss": 0.7603, + "step": 11388 + }, + { + "epoch": 0.6268369200286201, + "grad_norm": 0.7252678275108337, + "learning_rate": 7.790587738685655e-06, + "loss": 0.7789, + "step": 11389 + }, + { + "epoch": 0.6268919588309758, + "grad_norm": 0.6703618764877319, + "learning_rate": 7.79022805411775e-06, + "loss": 0.6883, + "step": 11390 + }, + { + "epoch": 0.6269469976333315, + "grad_norm": 0.7165848612785339, + "learning_rate": 7.789868348579239e-06, + "loss": 0.7944, + "step": 11391 + }, + { + "epoch": 0.6270020364356872, + "grad_norm": 0.9325329065322876, + "learning_rate": 7.789508622072822e-06, + "loss": 0.9059, + "step": 11392 + }, + { + "epoch": 0.6270570752380428, + "grad_norm": 0.6875555515289307, + "learning_rate": 7.789148874601204e-06, + "loss": 0.7115, + "step": 11393 + }, + { + "epoch": 0.6271121140403985, + "grad_norm": 0.6470181941986084, + "learning_rate": 7.788789106167093e-06, + "loss": 0.7603, + "step": 11394 + }, + { + "epoch": 0.6271671528427541, + "grad_norm": 0.688685417175293, + "learning_rate": 7.788429316773188e-06, + "loss": 0.8397, + "step": 11395 + }, + { + "epoch": 0.6272221916451098, + "grad_norm": 0.6299887895584106, + "learning_rate": 7.788069506422193e-06, + "loss": 0.7026, + "step": 11396 + }, + { + "epoch": 0.6272772304474654, + "grad_norm": 0.8046191930770874, + "learning_rate": 7.787709675116817e-06, + "loss": 0.8573, + "step": 11397 + }, + { + "epoch": 0.6273322692498211, + "grad_norm": 0.6700685620307922, + "learning_rate": 7.78734982285976e-06, + "loss": 0.7225, + "step": 11398 + }, + { + "epoch": 0.6273873080521768, + "grad_norm": 0.6968538761138916, + "learning_rate": 7.786989949653726e-06, + "loss": 0.6571, + "step": 11399 + }, + { + "epoch": 0.6274423468545325, + "grad_norm": 0.6857314705848694, + "learning_rate": 7.786630055501425e-06, + "loss": 0.8131, + "step": 11400 + }, + { + "epoch": 0.6274973856568881, + "grad_norm": 0.702316403388977, + "learning_rate": 7.786270140405557e-06, + "loss": 0.7222, + "step": 11401 + }, + { + "epoch": 0.6275524244592438, + "grad_norm": 0.6987283825874329, + "learning_rate": 7.785910204368827e-06, + "loss": 0.7171, + "step": 11402 + }, + { + "epoch": 0.6276074632615994, + "grad_norm": 0.6835529208183289, + "learning_rate": 7.785550247393943e-06, + "loss": 0.8077, + "step": 11403 + }, + { + "epoch": 0.6276625020639551, + "grad_norm": 0.6423392295837402, + "learning_rate": 7.785190269483609e-06, + "loss": 0.6689, + "step": 11404 + }, + { + "epoch": 0.6277175408663107, + "grad_norm": 0.6995517611503601, + "learning_rate": 7.78483027064053e-06, + "loss": 0.7417, + "step": 11405 + }, + { + "epoch": 0.6277725796686664, + "grad_norm": 0.6639729142189026, + "learning_rate": 7.784470250867413e-06, + "loss": 0.6521, + "step": 11406 + }, + { + "epoch": 0.6278276184710221, + "grad_norm": 0.7280262112617493, + "learning_rate": 7.784110210166961e-06, + "loss": 0.7686, + "step": 11407 + }, + { + "epoch": 0.6278826572733778, + "grad_norm": 0.6741863489151001, + "learning_rate": 7.783750148541884e-06, + "loss": 0.7794, + "step": 11408 + }, + { + "epoch": 0.6279376960757334, + "grad_norm": 0.8160151243209839, + "learning_rate": 7.783390065994885e-06, + "loss": 0.7065, + "step": 11409 + }, + { + "epoch": 0.627992734878089, + "grad_norm": 0.7288973927497864, + "learning_rate": 7.783029962528672e-06, + "loss": 0.8337, + "step": 11410 + }, + { + "epoch": 0.6280477736804447, + "grad_norm": 0.7764643430709839, + "learning_rate": 7.782669838145952e-06, + "loss": 0.8812, + "step": 11411 + }, + { + "epoch": 0.6281028124828004, + "grad_norm": 0.8145303130149841, + "learning_rate": 7.782309692849425e-06, + "loss": 0.9206, + "step": 11412 + }, + { + "epoch": 0.628157851285156, + "grad_norm": 0.6883288621902466, + "learning_rate": 7.781949526641808e-06, + "loss": 0.7779, + "step": 11413 + }, + { + "epoch": 0.6282128900875117, + "grad_norm": 0.7281043529510498, + "learning_rate": 7.781589339525803e-06, + "loss": 0.7933, + "step": 11414 + }, + { + "epoch": 0.6282679288898674, + "grad_norm": 0.7998347878456116, + "learning_rate": 7.781229131504115e-06, + "loss": 0.8772, + "step": 11415 + }, + { + "epoch": 0.6283229676922231, + "grad_norm": 0.7591177225112915, + "learning_rate": 7.780868902579455e-06, + "loss": 0.9054, + "step": 11416 + }, + { + "epoch": 0.6283780064945786, + "grad_norm": 0.7209650278091431, + "learning_rate": 7.780508652754528e-06, + "loss": 0.7781, + "step": 11417 + }, + { + "epoch": 0.6284330452969343, + "grad_norm": 1.2373511791229248, + "learning_rate": 7.780148382032042e-06, + "loss": 0.7501, + "step": 11418 + }, + { + "epoch": 0.62848808409929, + "grad_norm": 0.6281551122665405, + "learning_rate": 7.779788090414704e-06, + "loss": 0.8122, + "step": 11419 + }, + { + "epoch": 0.6285431229016457, + "grad_norm": 0.6954115629196167, + "learning_rate": 7.779427777905224e-06, + "loss": 0.7815, + "step": 11420 + }, + { + "epoch": 0.6285981617040013, + "grad_norm": 0.727043628692627, + "learning_rate": 7.77906744450631e-06, + "loss": 0.7116, + "step": 11421 + }, + { + "epoch": 0.628653200506357, + "grad_norm": 0.6979809403419495, + "learning_rate": 7.778707090220667e-06, + "loss": 0.7707, + "step": 11422 + }, + { + "epoch": 0.6287082393087127, + "grad_norm": 0.6851169466972351, + "learning_rate": 7.778346715051006e-06, + "loss": 0.811, + "step": 11423 + }, + { + "epoch": 0.6287632781110684, + "grad_norm": 0.70259028673172, + "learning_rate": 7.777986319000036e-06, + "loss": 0.7766, + "step": 11424 + }, + { + "epoch": 0.6288183169134239, + "grad_norm": 0.7436364889144897, + "learning_rate": 7.777625902070463e-06, + "loss": 0.8449, + "step": 11425 + }, + { + "epoch": 0.6288733557157796, + "grad_norm": 0.6452080607414246, + "learning_rate": 7.777265464264998e-06, + "loss": 0.7138, + "step": 11426 + }, + { + "epoch": 0.6289283945181353, + "grad_norm": 0.6329460144042969, + "learning_rate": 7.776905005586349e-06, + "loss": 0.6482, + "step": 11427 + }, + { + "epoch": 0.6289834333204909, + "grad_norm": 0.7521186470985413, + "learning_rate": 7.776544526037225e-06, + "loss": 0.751, + "step": 11428 + }, + { + "epoch": 0.6290384721228466, + "grad_norm": 0.7105319499969482, + "learning_rate": 7.776184025620334e-06, + "loss": 0.843, + "step": 11429 + }, + { + "epoch": 0.6290935109252023, + "grad_norm": 0.7329964637756348, + "learning_rate": 7.77582350433839e-06, + "loss": 0.6992, + "step": 11430 + }, + { + "epoch": 0.629148549727558, + "grad_norm": 0.7492092847824097, + "learning_rate": 7.775462962194098e-06, + "loss": 0.7579, + "step": 11431 + }, + { + "epoch": 0.6292035885299135, + "grad_norm": 0.7332866191864014, + "learning_rate": 7.77510239919017e-06, + "loss": 0.7758, + "step": 11432 + }, + { + "epoch": 0.6292586273322692, + "grad_norm": 0.7532867193222046, + "learning_rate": 7.774741815329315e-06, + "loss": 0.8157, + "step": 11433 + }, + { + "epoch": 0.6293136661346249, + "grad_norm": 0.7498316168785095, + "learning_rate": 7.774381210614244e-06, + "loss": 0.7671, + "step": 11434 + }, + { + "epoch": 0.6293687049369806, + "grad_norm": 0.8017444610595703, + "learning_rate": 7.774020585047666e-06, + "loss": 0.6989, + "step": 11435 + }, + { + "epoch": 0.6294237437393362, + "grad_norm": 0.7827737927436829, + "learning_rate": 7.77365993863229e-06, + "loss": 0.852, + "step": 11436 + }, + { + "epoch": 0.6294787825416919, + "grad_norm": 1.1411668062210083, + "learning_rate": 7.77329927137083e-06, + "loss": 0.9303, + "step": 11437 + }, + { + "epoch": 0.6295338213440476, + "grad_norm": 1.2931067943572998, + "learning_rate": 7.772938583265995e-06, + "loss": 0.8913, + "step": 11438 + }, + { + "epoch": 0.6295888601464033, + "grad_norm": 0.7407616376876831, + "learning_rate": 7.772577874320494e-06, + "loss": 0.9247, + "step": 11439 + }, + { + "epoch": 0.6296438989487588, + "grad_norm": 0.6544716954231262, + "learning_rate": 7.772217144537043e-06, + "loss": 0.7879, + "step": 11440 + }, + { + "epoch": 0.6296989377511145, + "grad_norm": 0.7467932105064392, + "learning_rate": 7.77185639391835e-06, + "loss": 0.7624, + "step": 11441 + }, + { + "epoch": 0.6297539765534702, + "grad_norm": 0.6845136880874634, + "learning_rate": 7.771495622467123e-06, + "loss": 0.691, + "step": 11442 + }, + { + "epoch": 0.6298090153558259, + "grad_norm": 0.7881575226783752, + "learning_rate": 7.771134830186079e-06, + "loss": 0.7567, + "step": 11443 + }, + { + "epoch": 0.6298640541581815, + "grad_norm": 0.6910528540611267, + "learning_rate": 7.770774017077928e-06, + "loss": 0.7527, + "step": 11444 + }, + { + "epoch": 0.6299190929605372, + "grad_norm": 0.7395550608634949, + "learning_rate": 7.770413183145379e-06, + "loss": 0.8288, + "step": 11445 + }, + { + "epoch": 0.6299741317628929, + "grad_norm": 0.6876364350318909, + "learning_rate": 7.770052328391147e-06, + "loss": 0.7759, + "step": 11446 + }, + { + "epoch": 0.6300291705652485, + "grad_norm": 0.7936999201774597, + "learning_rate": 7.769691452817945e-06, + "loss": 0.6885, + "step": 11447 + }, + { + "epoch": 0.6300842093676041, + "grad_norm": 0.721479058265686, + "learning_rate": 7.769330556428482e-06, + "loss": 0.7215, + "step": 11448 + }, + { + "epoch": 0.6301392481699598, + "grad_norm": 0.6549312472343445, + "learning_rate": 7.76896963922547e-06, + "loss": 0.7523, + "step": 11449 + }, + { + "epoch": 0.6301942869723155, + "grad_norm": 0.6684648394584656, + "learning_rate": 7.768608701211627e-06, + "loss": 0.768, + "step": 11450 + }, + { + "epoch": 0.6302493257746712, + "grad_norm": 0.7014286518096924, + "learning_rate": 7.76824774238966e-06, + "loss": 0.7534, + "step": 11451 + }, + { + "epoch": 0.6303043645770268, + "grad_norm": 0.9186445474624634, + "learning_rate": 7.767886762762284e-06, + "loss": 0.8398, + "step": 11452 + }, + { + "epoch": 0.6303594033793825, + "grad_norm": 0.787187933921814, + "learning_rate": 7.76752576233221e-06, + "loss": 0.8035, + "step": 11453 + }, + { + "epoch": 0.6304144421817381, + "grad_norm": 0.7471121549606323, + "learning_rate": 7.767164741102157e-06, + "loss": 0.7983, + "step": 11454 + }, + { + "epoch": 0.6304694809840938, + "grad_norm": 0.6810591816902161, + "learning_rate": 7.766803699074834e-06, + "loss": 0.7132, + "step": 11455 + }, + { + "epoch": 0.6305245197864494, + "grad_norm": 0.7154163122177124, + "learning_rate": 7.766442636252953e-06, + "loss": 0.7942, + "step": 11456 + }, + { + "epoch": 0.6305795585888051, + "grad_norm": 0.6990880966186523, + "learning_rate": 7.766081552639231e-06, + "loss": 0.7296, + "step": 11457 + }, + { + "epoch": 0.6306345973911608, + "grad_norm": 0.8848066926002502, + "learning_rate": 7.76572044823638e-06, + "loss": 0.621, + "step": 11458 + }, + { + "epoch": 0.6306896361935165, + "grad_norm": 0.6929910182952881, + "learning_rate": 7.765359323047116e-06, + "loss": 0.5917, + "step": 11459 + }, + { + "epoch": 0.6307446749958721, + "grad_norm": 0.6874505281448364, + "learning_rate": 7.764998177074149e-06, + "loss": 0.7244, + "step": 11460 + }, + { + "epoch": 0.6307997137982277, + "grad_norm": 0.6823066473007202, + "learning_rate": 7.764637010320197e-06, + "loss": 0.7299, + "step": 11461 + }, + { + "epoch": 0.6308547526005834, + "grad_norm": 0.7315061688423157, + "learning_rate": 7.764275822787972e-06, + "loss": 0.7759, + "step": 11462 + }, + { + "epoch": 0.6309097914029391, + "grad_norm": 0.6186662316322327, + "learning_rate": 7.763914614480192e-06, + "loss": 0.6746, + "step": 11463 + }, + { + "epoch": 0.6309648302052947, + "grad_norm": 0.6751530170440674, + "learning_rate": 7.763553385399569e-06, + "loss": 0.8371, + "step": 11464 + }, + { + "epoch": 0.6310198690076504, + "grad_norm": 1.0283396244049072, + "learning_rate": 7.763192135548818e-06, + "loss": 0.7743, + "step": 11465 + }, + { + "epoch": 0.6310749078100061, + "grad_norm": 0.7695029973983765, + "learning_rate": 7.762830864930655e-06, + "loss": 0.7387, + "step": 11466 + }, + { + "epoch": 0.6311299466123618, + "grad_norm": 0.8087024688720703, + "learning_rate": 7.762469573547795e-06, + "loss": 0.8357, + "step": 11467 + }, + { + "epoch": 0.6311849854147173, + "grad_norm": 0.9203382134437561, + "learning_rate": 7.762108261402951e-06, + "loss": 0.8191, + "step": 11468 + }, + { + "epoch": 0.631240024217073, + "grad_norm": 0.6569168567657471, + "learning_rate": 7.761746928498843e-06, + "loss": 0.7035, + "step": 11469 + }, + { + "epoch": 0.6312950630194287, + "grad_norm": 0.7903677225112915, + "learning_rate": 7.761385574838183e-06, + "loss": 0.8295, + "step": 11470 + }, + { + "epoch": 0.6313501018217843, + "grad_norm": 0.6780279278755188, + "learning_rate": 7.76102420042369e-06, + "loss": 0.6497, + "step": 11471 + }, + { + "epoch": 0.63140514062414, + "grad_norm": 0.7150516510009766, + "learning_rate": 7.760662805258076e-06, + "loss": 0.7979, + "step": 11472 + }, + { + "epoch": 0.6314601794264957, + "grad_norm": 0.7278215885162354, + "learning_rate": 7.760301389344061e-06, + "loss": 0.8503, + "step": 11473 + }, + { + "epoch": 0.6315152182288514, + "grad_norm": 0.8695063591003418, + "learning_rate": 7.75993995268436e-06, + "loss": 0.7796, + "step": 11474 + }, + { + "epoch": 0.631570257031207, + "grad_norm": 0.7154332399368286, + "learning_rate": 7.759578495281688e-06, + "loss": 0.725, + "step": 11475 + }, + { + "epoch": 0.6316252958335626, + "grad_norm": 0.7151778936386108, + "learning_rate": 7.759217017138763e-06, + "loss": 0.6932, + "step": 11476 + }, + { + "epoch": 0.6316803346359183, + "grad_norm": 0.6328319311141968, + "learning_rate": 7.758855518258301e-06, + "loss": 0.7382, + "step": 11477 + }, + { + "epoch": 0.631735373438274, + "grad_norm": 0.8377438187599182, + "learning_rate": 7.75849399864302e-06, + "loss": 0.7782, + "step": 11478 + }, + { + "epoch": 0.6317904122406296, + "grad_norm": 0.6654751896858215, + "learning_rate": 7.758132458295637e-06, + "loss": 0.8076, + "step": 11479 + }, + { + "epoch": 0.6318454510429853, + "grad_norm": 0.6841873526573181, + "learning_rate": 7.757770897218869e-06, + "loss": 0.7195, + "step": 11480 + }, + { + "epoch": 0.631900489845341, + "grad_norm": 0.7791223526000977, + "learning_rate": 7.757409315415431e-06, + "loss": 0.7858, + "step": 11481 + }, + { + "epoch": 0.6319555286476967, + "grad_norm": 0.6412019729614258, + "learning_rate": 7.757047712888044e-06, + "loss": 0.6853, + "step": 11482 + }, + { + "epoch": 0.6320105674500522, + "grad_norm": 0.7058777213096619, + "learning_rate": 7.756686089639425e-06, + "loss": 0.8955, + "step": 11483 + }, + { + "epoch": 0.6320656062524079, + "grad_norm": 0.6950271725654602, + "learning_rate": 7.75632444567229e-06, + "loss": 0.7213, + "step": 11484 + }, + { + "epoch": 0.6321206450547636, + "grad_norm": 0.6938642859458923, + "learning_rate": 7.755962780989359e-06, + "loss": 0.749, + "step": 11485 + }, + { + "epoch": 0.6321756838571193, + "grad_norm": 4.447030544281006, + "learning_rate": 7.755601095593348e-06, + "loss": 0.7603, + "step": 11486 + }, + { + "epoch": 0.6322307226594749, + "grad_norm": 0.6693708896636963, + "learning_rate": 7.755239389486979e-06, + "loss": 0.769, + "step": 11487 + }, + { + "epoch": 0.6322857614618306, + "grad_norm": 0.830352246761322, + "learning_rate": 7.754877662672968e-06, + "loss": 0.8069, + "step": 11488 + }, + { + "epoch": 0.6323408002641863, + "grad_norm": 0.7211840748786926, + "learning_rate": 7.754515915154033e-06, + "loss": 0.7972, + "step": 11489 + }, + { + "epoch": 0.632395839066542, + "grad_norm": 0.723101019859314, + "learning_rate": 7.754154146932893e-06, + "loss": 0.7385, + "step": 11490 + }, + { + "epoch": 0.6324508778688975, + "grad_norm": 0.6515377759933472, + "learning_rate": 7.75379235801227e-06, + "loss": 0.7527, + "step": 11491 + }, + { + "epoch": 0.6325059166712532, + "grad_norm": 0.6296554803848267, + "learning_rate": 7.75343054839488e-06, + "loss": 0.7135, + "step": 11492 + }, + { + "epoch": 0.6325609554736089, + "grad_norm": 0.8153911232948303, + "learning_rate": 7.753068718083441e-06, + "loss": 0.7298, + "step": 11493 + }, + { + "epoch": 0.6326159942759646, + "grad_norm": 0.6735014915466309, + "learning_rate": 7.752706867080676e-06, + "loss": 0.6851, + "step": 11494 + }, + { + "epoch": 0.6326710330783202, + "grad_norm": 0.7077293992042542, + "learning_rate": 7.752344995389303e-06, + "loss": 0.7806, + "step": 11495 + }, + { + "epoch": 0.6327260718806759, + "grad_norm": 0.6928272843360901, + "learning_rate": 7.751983103012042e-06, + "loss": 0.7538, + "step": 11496 + }, + { + "epoch": 0.6327811106830316, + "grad_norm": 0.7058837413787842, + "learning_rate": 7.751621189951612e-06, + "loss": 0.7065, + "step": 11497 + }, + { + "epoch": 0.6328361494853872, + "grad_norm": 0.7272600531578064, + "learning_rate": 7.751259256210735e-06, + "loss": 0.7468, + "step": 11498 + }, + { + "epoch": 0.6328911882877428, + "grad_norm": 0.6175968050956726, + "learning_rate": 7.75089730179213e-06, + "loss": 0.7195, + "step": 11499 + }, + { + "epoch": 0.6329462270900985, + "grad_norm": 0.6567386984825134, + "learning_rate": 7.750535326698514e-06, + "loss": 0.8147, + "step": 11500 + }, + { + "epoch": 0.6330012658924542, + "grad_norm": 0.6325315237045288, + "learning_rate": 7.750173330932613e-06, + "loss": 0.7087, + "step": 11501 + }, + { + "epoch": 0.6330563046948099, + "grad_norm": 0.8607509732246399, + "learning_rate": 7.749811314497147e-06, + "loss": 0.8009, + "step": 11502 + }, + { + "epoch": 0.6331113434971655, + "grad_norm": 0.7452824711799622, + "learning_rate": 7.749449277394833e-06, + "loss": 0.7497, + "step": 11503 + }, + { + "epoch": 0.6331663822995212, + "grad_norm": 0.7371357679367065, + "learning_rate": 7.749087219628395e-06, + "loss": 0.8936, + "step": 11504 + }, + { + "epoch": 0.6332214211018768, + "grad_norm": 0.7177306413650513, + "learning_rate": 7.748725141200552e-06, + "loss": 0.8327, + "step": 11505 + }, + { + "epoch": 0.6332764599042325, + "grad_norm": 0.5938527584075928, + "learning_rate": 7.748363042114028e-06, + "loss": 0.6471, + "step": 11506 + }, + { + "epoch": 0.6333314987065881, + "grad_norm": 0.8827341198921204, + "learning_rate": 7.748000922371543e-06, + "loss": 0.7247, + "step": 11507 + }, + { + "epoch": 0.6333865375089438, + "grad_norm": 0.7008641958236694, + "learning_rate": 7.747638781975818e-06, + "loss": 0.684, + "step": 11508 + }, + { + "epoch": 0.6334415763112995, + "grad_norm": 0.7752355337142944, + "learning_rate": 7.747276620929576e-06, + "loss": 0.7993, + "step": 11509 + }, + { + "epoch": 0.6334966151136552, + "grad_norm": 0.6928088068962097, + "learning_rate": 7.74691443923554e-06, + "loss": 0.7213, + "step": 11510 + }, + { + "epoch": 0.6335516539160108, + "grad_norm": 0.8197296261787415, + "learning_rate": 7.746552236896428e-06, + "loss": 0.847, + "step": 11511 + }, + { + "epoch": 0.6336066927183664, + "grad_norm": 0.7912493348121643, + "learning_rate": 7.746190013914966e-06, + "loss": 0.8217, + "step": 11512 + }, + { + "epoch": 0.6336617315207221, + "grad_norm": 0.7726556062698364, + "learning_rate": 7.745827770293871e-06, + "loss": 0.7626, + "step": 11513 + }, + { + "epoch": 0.6337167703230777, + "grad_norm": 0.668569028377533, + "learning_rate": 7.745465506035873e-06, + "loss": 0.7141, + "step": 11514 + }, + { + "epoch": 0.6337718091254334, + "grad_norm": 0.7226139903068542, + "learning_rate": 7.745103221143694e-06, + "loss": 0.7262, + "step": 11515 + }, + { + "epoch": 0.6338268479277891, + "grad_norm": 0.7315354943275452, + "learning_rate": 7.744740915620051e-06, + "loss": 0.7955, + "step": 11516 + }, + { + "epoch": 0.6338818867301448, + "grad_norm": 0.6815279126167297, + "learning_rate": 7.744378589467668e-06, + "loss": 0.7347, + "step": 11517 + }, + { + "epoch": 0.6339369255325004, + "grad_norm": 0.6931445598602295, + "learning_rate": 7.744016242689272e-06, + "loss": 0.7959, + "step": 11518 + }, + { + "epoch": 0.633991964334856, + "grad_norm": 0.7156991362571716, + "learning_rate": 7.743653875287584e-06, + "loss": 0.7793, + "step": 11519 + }, + { + "epoch": 0.6340470031372117, + "grad_norm": 0.8503926396369934, + "learning_rate": 7.74329148726533e-06, + "loss": 0.823, + "step": 11520 + }, + { + "epoch": 0.6341020419395674, + "grad_norm": 0.6280057430267334, + "learning_rate": 7.742929078625228e-06, + "loss": 0.6729, + "step": 11521 + }, + { + "epoch": 0.634157080741923, + "grad_norm": 0.7004517316818237, + "learning_rate": 7.742566649370008e-06, + "loss": 0.7578, + "step": 11522 + }, + { + "epoch": 0.6342121195442787, + "grad_norm": 0.7147908210754395, + "learning_rate": 7.74220419950239e-06, + "loss": 0.7705, + "step": 11523 + }, + { + "epoch": 0.6342671583466344, + "grad_norm": 0.7191137671470642, + "learning_rate": 7.7418417290251e-06, + "loss": 0.789, + "step": 11524 + }, + { + "epoch": 0.6343221971489901, + "grad_norm": 0.7288943529129028, + "learning_rate": 7.741479237940862e-06, + "loss": 0.8204, + "step": 11525 + }, + { + "epoch": 0.6343772359513457, + "grad_norm": 0.714821994304657, + "learning_rate": 7.741116726252398e-06, + "loss": 0.8252, + "step": 11526 + }, + { + "epoch": 0.6344322747537013, + "grad_norm": 0.6869103312492371, + "learning_rate": 7.740754193962435e-06, + "loss": 0.8136, + "step": 11527 + }, + { + "epoch": 0.634487313556057, + "grad_norm": 0.6629248857498169, + "learning_rate": 7.740391641073698e-06, + "loss": 0.7049, + "step": 11528 + }, + { + "epoch": 0.6345423523584127, + "grad_norm": 0.7078685164451599, + "learning_rate": 7.74002906758891e-06, + "loss": 0.7345, + "step": 11529 + }, + { + "epoch": 0.6345973911607683, + "grad_norm": 0.7748367190361023, + "learning_rate": 7.739666473510798e-06, + "loss": 0.7085, + "step": 11530 + }, + { + "epoch": 0.634652429963124, + "grad_norm": 0.6661930084228516, + "learning_rate": 7.739303858842086e-06, + "loss": 0.7795, + "step": 11531 + }, + { + "epoch": 0.6347074687654797, + "grad_norm": 0.6847965121269226, + "learning_rate": 7.738941223585499e-06, + "loss": 0.797, + "step": 11532 + }, + { + "epoch": 0.6347625075678354, + "grad_norm": 0.695184051990509, + "learning_rate": 7.738578567743762e-06, + "loss": 0.8184, + "step": 11533 + }, + { + "epoch": 0.6348175463701909, + "grad_norm": 0.6620088815689087, + "learning_rate": 7.738215891319603e-06, + "loss": 0.721, + "step": 11534 + }, + { + "epoch": 0.6348725851725466, + "grad_norm": 0.6802023649215698, + "learning_rate": 7.737853194315745e-06, + "loss": 0.9207, + "step": 11535 + }, + { + "epoch": 0.6349276239749023, + "grad_norm": 1.0193618535995483, + "learning_rate": 7.737490476734916e-06, + "loss": 0.8495, + "step": 11536 + }, + { + "epoch": 0.634982662777258, + "grad_norm": 0.6578189730644226, + "learning_rate": 7.737127738579841e-06, + "loss": 0.7455, + "step": 11537 + }, + { + "epoch": 0.6350377015796136, + "grad_norm": 0.70018470287323, + "learning_rate": 7.736764979853248e-06, + "loss": 0.7414, + "step": 11538 + }, + { + "epoch": 0.6350927403819693, + "grad_norm": 0.8136304616928101, + "learning_rate": 7.736402200557862e-06, + "loss": 0.7327, + "step": 11539 + }, + { + "epoch": 0.635147779184325, + "grad_norm": 0.7805309295654297, + "learning_rate": 7.736039400696408e-06, + "loss": 0.7659, + "step": 11540 + }, + { + "epoch": 0.6352028179866807, + "grad_norm": 0.675215482711792, + "learning_rate": 7.735676580271615e-06, + "loss": 0.7532, + "step": 11541 + }, + { + "epoch": 0.6352578567890362, + "grad_norm": 0.6873239874839783, + "learning_rate": 7.735313739286208e-06, + "loss": 0.8123, + "step": 11542 + }, + { + "epoch": 0.6353128955913919, + "grad_norm": 0.6624773144721985, + "learning_rate": 7.734950877742917e-06, + "loss": 0.7642, + "step": 11543 + }, + { + "epoch": 0.6353679343937476, + "grad_norm": 0.8047438859939575, + "learning_rate": 7.734587995644468e-06, + "loss": 0.7452, + "step": 11544 + }, + { + "epoch": 0.6354229731961033, + "grad_norm": 0.7449815273284912, + "learning_rate": 7.734225092993585e-06, + "loss": 0.7756, + "step": 11545 + }, + { + "epoch": 0.6354780119984589, + "grad_norm": 0.693081259727478, + "learning_rate": 7.733862169792999e-06, + "loss": 0.7029, + "step": 11546 + }, + { + "epoch": 0.6355330508008146, + "grad_norm": 0.6593700051307678, + "learning_rate": 7.733499226045437e-06, + "loss": 0.6009, + "step": 11547 + }, + { + "epoch": 0.6355880896031703, + "grad_norm": 0.7402041554450989, + "learning_rate": 7.733136261753627e-06, + "loss": 0.6921, + "step": 11548 + }, + { + "epoch": 0.635643128405526, + "grad_norm": 0.7686228156089783, + "learning_rate": 7.732773276920294e-06, + "loss": 0.855, + "step": 11549 + }, + { + "epoch": 0.6356981672078815, + "grad_norm": 0.6776669025421143, + "learning_rate": 7.732410271548171e-06, + "loss": 0.7146, + "step": 11550 + }, + { + "epoch": 0.6357532060102372, + "grad_norm": 0.6055952906608582, + "learning_rate": 7.732047245639983e-06, + "loss": 0.6926, + "step": 11551 + }, + { + "epoch": 0.6358082448125929, + "grad_norm": 0.7452635765075684, + "learning_rate": 7.731684199198461e-06, + "loss": 0.7766, + "step": 11552 + }, + { + "epoch": 0.6358632836149486, + "grad_norm": 0.7482720017433167, + "learning_rate": 7.73132113222633e-06, + "loss": 0.7725, + "step": 11553 + }, + { + "epoch": 0.6359183224173042, + "grad_norm": 0.6534025073051453, + "learning_rate": 7.73095804472632e-06, + "loss": 0.7902, + "step": 11554 + }, + { + "epoch": 0.6359733612196599, + "grad_norm": 0.7364560961723328, + "learning_rate": 7.730594936701162e-06, + "loss": 0.7998, + "step": 11555 + }, + { + "epoch": 0.6360284000220155, + "grad_norm": 0.6881458163261414, + "learning_rate": 7.730231808153582e-06, + "loss": 0.7586, + "step": 11556 + }, + { + "epoch": 0.6360834388243711, + "grad_norm": 0.6574262976646423, + "learning_rate": 7.72986865908631e-06, + "loss": 0.6999, + "step": 11557 + }, + { + "epoch": 0.6361384776267268, + "grad_norm": 0.6976385712623596, + "learning_rate": 7.729505489502078e-06, + "loss": 0.7387, + "step": 11558 + }, + { + "epoch": 0.6361935164290825, + "grad_norm": 0.6482532620429993, + "learning_rate": 7.729142299403613e-06, + "loss": 0.7715, + "step": 11559 + }, + { + "epoch": 0.6362485552314382, + "grad_norm": 0.7140287160873413, + "learning_rate": 7.728779088793643e-06, + "loss": 0.8562, + "step": 11560 + }, + { + "epoch": 0.6363035940337938, + "grad_norm": 0.6579470634460449, + "learning_rate": 7.728415857674901e-06, + "loss": 0.727, + "step": 11561 + }, + { + "epoch": 0.6363586328361495, + "grad_norm": 0.8670933246612549, + "learning_rate": 7.728052606050116e-06, + "loss": 0.7459, + "step": 11562 + }, + { + "epoch": 0.6364136716385052, + "grad_norm": 0.7995489835739136, + "learning_rate": 7.72768933392202e-06, + "loss": 0.8228, + "step": 11563 + }, + { + "epoch": 0.6364687104408608, + "grad_norm": 0.6467362642288208, + "learning_rate": 7.727326041293336e-06, + "loss": 0.7545, + "step": 11564 + }, + { + "epoch": 0.6365237492432164, + "grad_norm": 0.6646577715873718, + "learning_rate": 7.726962728166803e-06, + "loss": 0.7824, + "step": 11565 + }, + { + "epoch": 0.6365787880455721, + "grad_norm": 0.6576912999153137, + "learning_rate": 7.726599394545149e-06, + "loss": 0.7324, + "step": 11566 + }, + { + "epoch": 0.6366338268479278, + "grad_norm": 0.7514963150024414, + "learning_rate": 7.726236040431101e-06, + "loss": 0.7712, + "step": 11567 + }, + { + "epoch": 0.6366888656502835, + "grad_norm": 0.7313328981399536, + "learning_rate": 7.725872665827394e-06, + "loss": 0.7361, + "step": 11568 + }, + { + "epoch": 0.6367439044526391, + "grad_norm": 0.7109994292259216, + "learning_rate": 7.725509270736759e-06, + "loss": 0.812, + "step": 11569 + }, + { + "epoch": 0.6367989432549948, + "grad_norm": 1.128675103187561, + "learning_rate": 7.725145855161924e-06, + "loss": 0.726, + "step": 11570 + }, + { + "epoch": 0.6368539820573504, + "grad_norm": 0.7357437014579773, + "learning_rate": 7.724782419105622e-06, + "loss": 0.7958, + "step": 11571 + }, + { + "epoch": 0.6369090208597061, + "grad_norm": 0.6874725222587585, + "learning_rate": 7.724418962570587e-06, + "loss": 0.751, + "step": 11572 + }, + { + "epoch": 0.6369640596620617, + "grad_norm": 0.7175989747047424, + "learning_rate": 7.724055485559545e-06, + "loss": 0.7191, + "step": 11573 + }, + { + "epoch": 0.6370190984644174, + "grad_norm": 0.6424688100814819, + "learning_rate": 7.723691988075235e-06, + "loss": 0.608, + "step": 11574 + }, + { + "epoch": 0.6370741372667731, + "grad_norm": 0.6845381855964661, + "learning_rate": 7.723328470120383e-06, + "loss": 0.7465, + "step": 11575 + }, + { + "epoch": 0.6371291760691288, + "grad_norm": 0.7955030202865601, + "learning_rate": 7.722964931697723e-06, + "loss": 0.745, + "step": 11576 + }, + { + "epoch": 0.6371842148714844, + "grad_norm": 0.6855689883232117, + "learning_rate": 7.722601372809989e-06, + "loss": 0.7764, + "step": 11577 + }, + { + "epoch": 0.63723925367384, + "grad_norm": 0.7505692839622498, + "learning_rate": 7.722237793459909e-06, + "loss": 0.8324, + "step": 11578 + }, + { + "epoch": 0.6372942924761957, + "grad_norm": 0.6852842569351196, + "learning_rate": 7.721874193650221e-06, + "loss": 0.7599, + "step": 11579 + }, + { + "epoch": 0.6373493312785514, + "grad_norm": 0.698210597038269, + "learning_rate": 7.721510573383654e-06, + "loss": 0.843, + "step": 11580 + }, + { + "epoch": 0.637404370080907, + "grad_norm": 0.8344444632530212, + "learning_rate": 7.721146932662942e-06, + "loss": 0.8602, + "step": 11581 + }, + { + "epoch": 0.6374594088832627, + "grad_norm": 0.6385721564292908, + "learning_rate": 7.72078327149082e-06, + "loss": 0.7449, + "step": 11582 + }, + { + "epoch": 0.6375144476856184, + "grad_norm": 0.6474401354789734, + "learning_rate": 7.720419589870016e-06, + "loss": 0.6328, + "step": 11583 + }, + { + "epoch": 0.6375694864879741, + "grad_norm": 0.6554263234138489, + "learning_rate": 7.720055887803268e-06, + "loss": 0.6672, + "step": 11584 + }, + { + "epoch": 0.6376245252903296, + "grad_norm": 0.6551910638809204, + "learning_rate": 7.719692165293309e-06, + "loss": 0.8024, + "step": 11585 + }, + { + "epoch": 0.6376795640926853, + "grad_norm": 0.693418025970459, + "learning_rate": 7.719328422342871e-06, + "loss": 0.726, + "step": 11586 + }, + { + "epoch": 0.637734602895041, + "grad_norm": 0.8642090559005737, + "learning_rate": 7.718964658954689e-06, + "loss": 0.8274, + "step": 11587 + }, + { + "epoch": 0.6377896416973967, + "grad_norm": 0.8255778551101685, + "learning_rate": 7.718600875131494e-06, + "loss": 0.7259, + "step": 11588 + }, + { + "epoch": 0.6378446804997523, + "grad_norm": 0.7492913007736206, + "learning_rate": 7.718237070876025e-06, + "loss": 0.7093, + "step": 11589 + }, + { + "epoch": 0.637899719302108, + "grad_norm": 0.7154868245124817, + "learning_rate": 7.717873246191013e-06, + "loss": 0.7909, + "step": 11590 + }, + { + "epoch": 0.6379547581044637, + "grad_norm": 0.7751424312591553, + "learning_rate": 7.717509401079194e-06, + "loss": 0.8528, + "step": 11591 + }, + { + "epoch": 0.6380097969068194, + "grad_norm": 0.68199223279953, + "learning_rate": 7.7171455355433e-06, + "loss": 0.7077, + "step": 11592 + }, + { + "epoch": 0.6380648357091749, + "grad_norm": 0.7340414524078369, + "learning_rate": 7.716781649586069e-06, + "loss": 0.693, + "step": 11593 + }, + { + "epoch": 0.6381198745115306, + "grad_norm": 0.6278988122940063, + "learning_rate": 7.716417743210234e-06, + "loss": 0.7049, + "step": 11594 + }, + { + "epoch": 0.6381749133138863, + "grad_norm": 0.9113193154335022, + "learning_rate": 7.716053816418532e-06, + "loss": 0.7757, + "step": 11595 + }, + { + "epoch": 0.638229952116242, + "grad_norm": 0.7059371471405029, + "learning_rate": 7.715689869213694e-06, + "loss": 0.7805, + "step": 11596 + }, + { + "epoch": 0.6382849909185976, + "grad_norm": 0.7508488297462463, + "learning_rate": 7.71532590159846e-06, + "loss": 0.7394, + "step": 11597 + }, + { + "epoch": 0.6383400297209533, + "grad_norm": 0.8222774863243103, + "learning_rate": 7.71496191357556e-06, + "loss": 0.7675, + "step": 11598 + }, + { + "epoch": 0.638395068523309, + "grad_norm": 0.7295246124267578, + "learning_rate": 7.714597905147736e-06, + "loss": 0.7766, + "step": 11599 + }, + { + "epoch": 0.6384501073256645, + "grad_norm": 0.7482065558433533, + "learning_rate": 7.71423387631772e-06, + "loss": 0.7334, + "step": 11600 + }, + { + "epoch": 0.6385051461280202, + "grad_norm": 0.7654659748077393, + "learning_rate": 7.71386982708825e-06, + "loss": 0.8097, + "step": 11601 + }, + { + "epoch": 0.6385601849303759, + "grad_norm": 0.9125531911849976, + "learning_rate": 7.71350575746206e-06, + "loss": 0.7776, + "step": 11602 + }, + { + "epoch": 0.6386152237327316, + "grad_norm": 0.8063878417015076, + "learning_rate": 7.713141667441886e-06, + "loss": 0.7899, + "step": 11603 + }, + { + "epoch": 0.6386702625350872, + "grad_norm": 0.7315171360969543, + "learning_rate": 7.712777557030466e-06, + "loss": 0.7884, + "step": 11604 + }, + { + "epoch": 0.6387253013374429, + "grad_norm": 0.7306345105171204, + "learning_rate": 7.712413426230536e-06, + "loss": 0.8646, + "step": 11605 + }, + { + "epoch": 0.6387803401397986, + "grad_norm": 0.8300313353538513, + "learning_rate": 7.712049275044833e-06, + "loss": 0.8131, + "step": 11606 + }, + { + "epoch": 0.6388353789421543, + "grad_norm": 0.7513623237609863, + "learning_rate": 7.711685103476093e-06, + "loss": 0.8115, + "step": 11607 + }, + { + "epoch": 0.6388904177445098, + "grad_norm": 0.7126060128211975, + "learning_rate": 7.711320911527054e-06, + "loss": 0.8198, + "step": 11608 + }, + { + "epoch": 0.6389454565468655, + "grad_norm": 0.7017398476600647, + "learning_rate": 7.710956699200454e-06, + "loss": 0.8088, + "step": 11609 + }, + { + "epoch": 0.6390004953492212, + "grad_norm": 0.7345026135444641, + "learning_rate": 7.710592466499027e-06, + "loss": 0.8228, + "step": 11610 + }, + { + "epoch": 0.6390555341515769, + "grad_norm": 0.6903058886528015, + "learning_rate": 7.710228213425514e-06, + "loss": 0.7058, + "step": 11611 + }, + { + "epoch": 0.6391105729539325, + "grad_norm": 0.6838604211807251, + "learning_rate": 7.70986393998265e-06, + "loss": 0.7091, + "step": 11612 + }, + { + "epoch": 0.6391656117562882, + "grad_norm": 0.7067943811416626, + "learning_rate": 7.709499646173177e-06, + "loss": 0.7631, + "step": 11613 + }, + { + "epoch": 0.6392206505586439, + "grad_norm": 0.7577057480812073, + "learning_rate": 7.709135331999827e-06, + "loss": 0.7545, + "step": 11614 + }, + { + "epoch": 0.6392756893609995, + "grad_norm": 0.6425572633743286, + "learning_rate": 7.70877099746534e-06, + "loss": 0.7188, + "step": 11615 + }, + { + "epoch": 0.6393307281633551, + "grad_norm": 0.7257497310638428, + "learning_rate": 7.708406642572459e-06, + "loss": 0.7514, + "step": 11616 + }, + { + "epoch": 0.6393857669657108, + "grad_norm": 0.8214251399040222, + "learning_rate": 7.708042267323916e-06, + "loss": 0.7824, + "step": 11617 + }, + { + "epoch": 0.6394408057680665, + "grad_norm": 0.7879108786582947, + "learning_rate": 7.707677871722453e-06, + "loss": 0.6122, + "step": 11618 + }, + { + "epoch": 0.6394958445704222, + "grad_norm": 0.6656795740127563, + "learning_rate": 7.707313455770808e-06, + "loss": 0.754, + "step": 11619 + }, + { + "epoch": 0.6395508833727778, + "grad_norm": 0.7196451425552368, + "learning_rate": 7.70694901947172e-06, + "loss": 0.7662, + "step": 11620 + }, + { + "epoch": 0.6396059221751335, + "grad_norm": 0.8213779926300049, + "learning_rate": 7.706584562827928e-06, + "loss": 0.8732, + "step": 11621 + }, + { + "epoch": 0.6396609609774891, + "grad_norm": 0.7114893794059753, + "learning_rate": 7.70622008584217e-06, + "loss": 0.8493, + "step": 11622 + }, + { + "epoch": 0.6397159997798448, + "grad_norm": 0.7009783983230591, + "learning_rate": 7.705855588517188e-06, + "loss": 0.738, + "step": 11623 + }, + { + "epoch": 0.6397710385822004, + "grad_norm": 0.7576995491981506, + "learning_rate": 7.705491070855717e-06, + "loss": 0.8839, + "step": 11624 + }, + { + "epoch": 0.6398260773845561, + "grad_norm": 0.705784022808075, + "learning_rate": 7.7051265328605e-06, + "loss": 0.7246, + "step": 11625 + }, + { + "epoch": 0.6398811161869118, + "grad_norm": 0.6696903109550476, + "learning_rate": 7.704761974534277e-06, + "loss": 0.7418, + "step": 11626 + }, + { + "epoch": 0.6399361549892675, + "grad_norm": 0.8617024421691895, + "learning_rate": 7.704397395879786e-06, + "loss": 0.8109, + "step": 11627 + }, + { + "epoch": 0.6399911937916231, + "grad_norm": 0.6819054484367371, + "learning_rate": 7.70403279689977e-06, + "loss": 0.6438, + "step": 11628 + }, + { + "epoch": 0.6400462325939787, + "grad_norm": 0.6145044565200806, + "learning_rate": 7.703668177596966e-06, + "loss": 0.6712, + "step": 11629 + }, + { + "epoch": 0.6401012713963344, + "grad_norm": 0.6946390271186829, + "learning_rate": 7.703303537974116e-06, + "loss": 0.8099, + "step": 11630 + }, + { + "epoch": 0.6401563101986901, + "grad_norm": 0.6791605949401855, + "learning_rate": 7.702938878033961e-06, + "loss": 0.7494, + "step": 11631 + }, + { + "epoch": 0.6402113490010457, + "grad_norm": 0.6718626618385315, + "learning_rate": 7.70257419777924e-06, + "loss": 0.7471, + "step": 11632 + }, + { + "epoch": 0.6402663878034014, + "grad_norm": 0.8051798343658447, + "learning_rate": 7.702209497212694e-06, + "loss": 0.8569, + "step": 11633 + }, + { + "epoch": 0.6403214266057571, + "grad_norm": 0.6602774858474731, + "learning_rate": 7.701844776337067e-06, + "loss": 0.7396, + "step": 11634 + }, + { + "epoch": 0.6403764654081128, + "grad_norm": 0.672363817691803, + "learning_rate": 7.701480035155096e-06, + "loss": 0.7584, + "step": 11635 + }, + { + "epoch": 0.6404315042104683, + "grad_norm": 0.7363641262054443, + "learning_rate": 7.701115273669524e-06, + "loss": 0.8149, + "step": 11636 + }, + { + "epoch": 0.640486543012824, + "grad_norm": 0.7238422632217407, + "learning_rate": 7.700750491883094e-06, + "loss": 0.7598, + "step": 11637 + }, + { + "epoch": 0.6405415818151797, + "grad_norm": 1.3627614974975586, + "learning_rate": 7.700385689798544e-06, + "loss": 0.8303, + "step": 11638 + }, + { + "epoch": 0.6405966206175354, + "grad_norm": 0.6339633464813232, + "learning_rate": 7.70002086741862e-06, + "loss": 0.7308, + "step": 11639 + }, + { + "epoch": 0.640651659419891, + "grad_norm": 0.6821589469909668, + "learning_rate": 7.699656024746062e-06, + "loss": 0.6728, + "step": 11640 + }, + { + "epoch": 0.6407066982222467, + "grad_norm": 0.8514766097068787, + "learning_rate": 7.699291161783611e-06, + "loss": 0.8693, + "step": 11641 + }, + { + "epoch": 0.6407617370246024, + "grad_norm": 0.649075984954834, + "learning_rate": 7.698926278534011e-06, + "loss": 0.7482, + "step": 11642 + }, + { + "epoch": 0.640816775826958, + "grad_norm": 0.6507017016410828, + "learning_rate": 7.698561375000001e-06, + "loss": 0.7841, + "step": 11643 + }, + { + "epoch": 0.6408718146293136, + "grad_norm": 0.6736069321632385, + "learning_rate": 7.69819645118433e-06, + "loss": 0.74, + "step": 11644 + }, + { + "epoch": 0.6409268534316693, + "grad_norm": 0.6727941632270813, + "learning_rate": 7.697831507089734e-06, + "loss": 0.806, + "step": 11645 + }, + { + "epoch": 0.640981892234025, + "grad_norm": 0.7089083194732666, + "learning_rate": 7.697466542718959e-06, + "loss": 0.8091, + "step": 11646 + }, + { + "epoch": 0.6410369310363806, + "grad_norm": 0.6355387568473816, + "learning_rate": 7.69710155807475e-06, + "loss": 0.7033, + "step": 11647 + }, + { + "epoch": 0.6410919698387363, + "grad_norm": 0.6327098608016968, + "learning_rate": 7.696736553159846e-06, + "loss": 0.7664, + "step": 11648 + }, + { + "epoch": 0.641147008641092, + "grad_norm": 0.6971945762634277, + "learning_rate": 7.69637152797699e-06, + "loss": 0.7441, + "step": 11649 + }, + { + "epoch": 0.6412020474434477, + "grad_norm": 0.7420539855957031, + "learning_rate": 7.696006482528929e-06, + "loss": 0.7909, + "step": 11650 + }, + { + "epoch": 0.6412570862458032, + "grad_norm": 0.6877853274345398, + "learning_rate": 7.695641416818405e-06, + "loss": 0.7624, + "step": 11651 + }, + { + "epoch": 0.6413121250481589, + "grad_norm": 0.7337075471878052, + "learning_rate": 7.695276330848162e-06, + "loss": 0.7829, + "step": 11652 + }, + { + "epoch": 0.6413671638505146, + "grad_norm": 0.6423582434654236, + "learning_rate": 7.694911224620944e-06, + "loss": 0.6686, + "step": 11653 + }, + { + "epoch": 0.6414222026528703, + "grad_norm": 0.7826602458953857, + "learning_rate": 7.694546098139492e-06, + "loss": 0.774, + "step": 11654 + }, + { + "epoch": 0.6414772414552259, + "grad_norm": 0.7678147554397583, + "learning_rate": 7.694180951406556e-06, + "loss": 0.8067, + "step": 11655 + }, + { + "epoch": 0.6415322802575816, + "grad_norm": 0.6400566101074219, + "learning_rate": 7.693815784424875e-06, + "loss": 0.7796, + "step": 11656 + }, + { + "epoch": 0.6415873190599373, + "grad_norm": 0.6606197357177734, + "learning_rate": 7.693450597197196e-06, + "loss": 0.7381, + "step": 11657 + }, + { + "epoch": 0.641642357862293, + "grad_norm": 0.7953683137893677, + "learning_rate": 7.693085389726262e-06, + "loss": 0.8867, + "step": 11658 + }, + { + "epoch": 0.6416973966646485, + "grad_norm": 0.6763843894004822, + "learning_rate": 7.692720162014822e-06, + "loss": 0.7579, + "step": 11659 + }, + { + "epoch": 0.6417524354670042, + "grad_norm": 0.6456292867660522, + "learning_rate": 7.692354914065617e-06, + "loss": 0.7814, + "step": 11660 + }, + { + "epoch": 0.6418074742693599, + "grad_norm": 0.702803373336792, + "learning_rate": 7.691989645881393e-06, + "loss": 0.7393, + "step": 11661 + }, + { + "epoch": 0.6418625130717156, + "grad_norm": 0.8328298926353455, + "learning_rate": 7.691624357464895e-06, + "loss": 0.6587, + "step": 11662 + }, + { + "epoch": 0.6419175518740712, + "grad_norm": 0.8409613966941833, + "learning_rate": 7.691259048818871e-06, + "loss": 0.8075, + "step": 11663 + }, + { + "epoch": 0.6419725906764269, + "grad_norm": 0.6969256401062012, + "learning_rate": 7.690893719946062e-06, + "loss": 0.8061, + "step": 11664 + }, + { + "epoch": 0.6420276294787826, + "grad_norm": 0.7689732313156128, + "learning_rate": 7.690528370849217e-06, + "loss": 0.7709, + "step": 11665 + }, + { + "epoch": 0.6420826682811382, + "grad_norm": 0.8239523768424988, + "learning_rate": 7.69016300153108e-06, + "loss": 0.7421, + "step": 11666 + }, + { + "epoch": 0.6421377070834938, + "grad_norm": 0.7199227809906006, + "learning_rate": 7.689797611994398e-06, + "loss": 0.7877, + "step": 11667 + }, + { + "epoch": 0.6421927458858495, + "grad_norm": 0.8315985798835754, + "learning_rate": 7.689432202241919e-06, + "loss": 0.8458, + "step": 11668 + }, + { + "epoch": 0.6422477846882052, + "grad_norm": 0.7213512063026428, + "learning_rate": 7.689066772276385e-06, + "loss": 0.7199, + "step": 11669 + }, + { + "epoch": 0.6423028234905609, + "grad_norm": 0.6023604273796082, + "learning_rate": 7.688701322100547e-06, + "loss": 0.6485, + "step": 11670 + }, + { + "epoch": 0.6423578622929165, + "grad_norm": 0.8171319365501404, + "learning_rate": 7.688335851717148e-06, + "loss": 0.7561, + "step": 11671 + }, + { + "epoch": 0.6424129010952722, + "grad_norm": 0.6545816659927368, + "learning_rate": 7.687970361128937e-06, + "loss": 0.6796, + "step": 11672 + }, + { + "epoch": 0.6424679398976278, + "grad_norm": 0.8093686103820801, + "learning_rate": 7.687604850338661e-06, + "loss": 0.8538, + "step": 11673 + }, + { + "epoch": 0.6425229786999835, + "grad_norm": 0.6438135504722595, + "learning_rate": 7.687239319349066e-06, + "loss": 0.7046, + "step": 11674 + }, + { + "epoch": 0.6425780175023391, + "grad_norm": 0.685100257396698, + "learning_rate": 7.6868737681629e-06, + "loss": 0.7568, + "step": 11675 + }, + { + "epoch": 0.6426330563046948, + "grad_norm": 0.6850112676620483, + "learning_rate": 7.68650819678291e-06, + "loss": 0.7082, + "step": 11676 + }, + { + "epoch": 0.6426880951070505, + "grad_norm": 0.7524490356445312, + "learning_rate": 7.686142605211843e-06, + "loss": 0.7285, + "step": 11677 + }, + { + "epoch": 0.6427431339094062, + "grad_norm": 0.7706617116928101, + "learning_rate": 7.685776993452446e-06, + "loss": 0.7934, + "step": 11678 + }, + { + "epoch": 0.6427981727117618, + "grad_norm": 0.6612235307693481, + "learning_rate": 7.68541136150747e-06, + "loss": 0.6538, + "step": 11679 + }, + { + "epoch": 0.6428532115141175, + "grad_norm": 0.6380587816238403, + "learning_rate": 7.68504570937966e-06, + "loss": 0.7, + "step": 11680 + }, + { + "epoch": 0.6429082503164731, + "grad_norm": 0.6563882231712341, + "learning_rate": 7.684680037071765e-06, + "loss": 0.6912, + "step": 11681 + }, + { + "epoch": 0.6429632891188288, + "grad_norm": 0.6579793095588684, + "learning_rate": 7.684314344586534e-06, + "loss": 0.7263, + "step": 11682 + }, + { + "epoch": 0.6430183279211844, + "grad_norm": 0.7029374837875366, + "learning_rate": 7.683948631926713e-06, + "loss": 0.7151, + "step": 11683 + }, + { + "epoch": 0.6430733667235401, + "grad_norm": 0.6683217883110046, + "learning_rate": 7.683582899095056e-06, + "loss": 0.7643, + "step": 11684 + }, + { + "epoch": 0.6431284055258958, + "grad_norm": 1.0482646226882935, + "learning_rate": 7.683217146094308e-06, + "loss": 0.8889, + "step": 11685 + }, + { + "epoch": 0.6431834443282514, + "grad_norm": 0.7101102471351624, + "learning_rate": 7.682851372927216e-06, + "loss": 0.7762, + "step": 11686 + }, + { + "epoch": 0.643238483130607, + "grad_norm": 0.674961268901825, + "learning_rate": 7.682485579596533e-06, + "loss": 0.736, + "step": 11687 + }, + { + "epoch": 0.6432935219329627, + "grad_norm": 0.7071837782859802, + "learning_rate": 7.682119766105005e-06, + "loss": 0.7231, + "step": 11688 + }, + { + "epoch": 0.6433485607353184, + "grad_norm": 0.6982744932174683, + "learning_rate": 7.681753932455383e-06, + "loss": 0.7498, + "step": 11689 + }, + { + "epoch": 0.643403599537674, + "grad_norm": 0.6927201747894287, + "learning_rate": 7.681388078650415e-06, + "loss": 0.803, + "step": 11690 + }, + { + "epoch": 0.6434586383400297, + "grad_norm": 0.7299236059188843, + "learning_rate": 7.681022204692854e-06, + "loss": 0.7386, + "step": 11691 + }, + { + "epoch": 0.6435136771423854, + "grad_norm": 0.8809047937393188, + "learning_rate": 7.680656310585449e-06, + "loss": 0.741, + "step": 11692 + }, + { + "epoch": 0.6435687159447411, + "grad_norm": 0.862843930721283, + "learning_rate": 7.680290396330947e-06, + "loss": 0.8357, + "step": 11693 + }, + { + "epoch": 0.6436237547470967, + "grad_norm": 0.7436664700508118, + "learning_rate": 7.679924461932098e-06, + "loss": 0.8352, + "step": 11694 + }, + { + "epoch": 0.6436787935494523, + "grad_norm": 0.6582232713699341, + "learning_rate": 7.679558507391657e-06, + "loss": 0.7107, + "step": 11695 + }, + { + "epoch": 0.643733832351808, + "grad_norm": 0.6798850297927856, + "learning_rate": 7.67919253271237e-06, + "loss": 0.6968, + "step": 11696 + }, + { + "epoch": 0.6437888711541637, + "grad_norm": 0.7747187614440918, + "learning_rate": 7.67882653789699e-06, + "loss": 0.7611, + "step": 11697 + }, + { + "epoch": 0.6438439099565193, + "grad_norm": 0.7097567915916443, + "learning_rate": 7.678460522948267e-06, + "loss": 0.7275, + "step": 11698 + }, + { + "epoch": 0.643898948758875, + "grad_norm": 0.6958394050598145, + "learning_rate": 7.678094487868952e-06, + "loss": 0.7441, + "step": 11699 + }, + { + "epoch": 0.6439539875612307, + "grad_norm": 0.9129040837287903, + "learning_rate": 7.677728432661794e-06, + "loss": 0.7693, + "step": 11700 + }, + { + "epoch": 0.6440090263635864, + "grad_norm": 1.1396137475967407, + "learning_rate": 7.677362357329548e-06, + "loss": 0.7479, + "step": 11701 + }, + { + "epoch": 0.644064065165942, + "grad_norm": 0.8163042664527893, + "learning_rate": 7.67699626187496e-06, + "loss": 0.835, + "step": 11702 + }, + { + "epoch": 0.6441191039682976, + "grad_norm": 0.9869117736816406, + "learning_rate": 7.676630146300787e-06, + "loss": 0.769, + "step": 11703 + }, + { + "epoch": 0.6441741427706533, + "grad_norm": 0.7439526915550232, + "learning_rate": 7.676264010609777e-06, + "loss": 0.8239, + "step": 11704 + }, + { + "epoch": 0.644229181573009, + "grad_norm": 0.6943735480308533, + "learning_rate": 7.675897854804685e-06, + "loss": 0.7702, + "step": 11705 + }, + { + "epoch": 0.6442842203753646, + "grad_norm": 0.7384238243103027, + "learning_rate": 7.67553167888826e-06, + "loss": 0.6911, + "step": 11706 + }, + { + "epoch": 0.6443392591777203, + "grad_norm": 0.660022497177124, + "learning_rate": 7.675165482863254e-06, + "loss": 0.7359, + "step": 11707 + }, + { + "epoch": 0.644394297980076, + "grad_norm": 0.6956108808517456, + "learning_rate": 7.674799266732422e-06, + "loss": 0.7845, + "step": 11708 + }, + { + "epoch": 0.6444493367824317, + "grad_norm": 0.7361618280410767, + "learning_rate": 7.674433030498513e-06, + "loss": 0.7391, + "step": 11709 + }, + { + "epoch": 0.6445043755847872, + "grad_norm": 0.7655043005943298, + "learning_rate": 7.674066774164284e-06, + "loss": 0.8305, + "step": 11710 + }, + { + "epoch": 0.6445594143871429, + "grad_norm": 0.7160911560058594, + "learning_rate": 7.673700497732483e-06, + "loss": 0.7654, + "step": 11711 + }, + { + "epoch": 0.6446144531894986, + "grad_norm": 0.7812016010284424, + "learning_rate": 7.673334201205866e-06, + "loss": 0.8212, + "step": 11712 + }, + { + "epoch": 0.6446694919918543, + "grad_norm": 0.7457767128944397, + "learning_rate": 7.672967884587184e-06, + "loss": 0.8084, + "step": 11713 + }, + { + "epoch": 0.6447245307942099, + "grad_norm": 0.7524051070213318, + "learning_rate": 7.672601547879189e-06, + "loss": 0.7525, + "step": 11714 + }, + { + "epoch": 0.6447795695965656, + "grad_norm": 0.7271043062210083, + "learning_rate": 7.672235191084638e-06, + "loss": 0.7627, + "step": 11715 + }, + { + "epoch": 0.6448346083989213, + "grad_norm": 0.6893014907836914, + "learning_rate": 7.671868814206283e-06, + "loss": 0.7969, + "step": 11716 + }, + { + "epoch": 0.644889647201277, + "grad_norm": 0.7057414054870605, + "learning_rate": 7.671502417246876e-06, + "loss": 0.7448, + "step": 11717 + }, + { + "epoch": 0.6449446860036325, + "grad_norm": 0.7490910887718201, + "learning_rate": 7.671136000209172e-06, + "loss": 0.8046, + "step": 11718 + }, + { + "epoch": 0.6449997248059882, + "grad_norm": 0.7338950634002686, + "learning_rate": 7.670769563095926e-06, + "loss": 0.8521, + "step": 11719 + }, + { + "epoch": 0.6450547636083439, + "grad_norm": 0.8669398427009583, + "learning_rate": 7.670403105909891e-06, + "loss": 0.7803, + "step": 11720 + }, + { + "epoch": 0.6451098024106996, + "grad_norm": 0.7012562155723572, + "learning_rate": 7.67003662865382e-06, + "loss": 0.8047, + "step": 11721 + }, + { + "epoch": 0.6451648412130552, + "grad_norm": 0.9933050274848938, + "learning_rate": 7.66967013133047e-06, + "loss": 0.7081, + "step": 11722 + }, + { + "epoch": 0.6452198800154109, + "grad_norm": 1.12044358253479, + "learning_rate": 7.669303613942592e-06, + "loss": 0.7315, + "step": 11723 + }, + { + "epoch": 0.6452749188177666, + "grad_norm": 0.8654733300209045, + "learning_rate": 7.668937076492943e-06, + "loss": 0.6849, + "step": 11724 + }, + { + "epoch": 0.6453299576201222, + "grad_norm": 0.7081291675567627, + "learning_rate": 7.668570518984277e-06, + "loss": 0.7584, + "step": 11725 + }, + { + "epoch": 0.6453849964224778, + "grad_norm": 0.7473898530006409, + "learning_rate": 7.66820394141935e-06, + "loss": 0.8364, + "step": 11726 + }, + { + "epoch": 0.6454400352248335, + "grad_norm": 0.7863657474517822, + "learning_rate": 7.667837343800916e-06, + "loss": 0.7235, + "step": 11727 + }, + { + "epoch": 0.6454950740271892, + "grad_norm": 0.6664546728134155, + "learning_rate": 7.667470726131732e-06, + "loss": 0.7203, + "step": 11728 + }, + { + "epoch": 0.6455501128295448, + "grad_norm": 0.7182374596595764, + "learning_rate": 7.667104088414552e-06, + "loss": 0.7376, + "step": 11729 + }, + { + "epoch": 0.6456051516319005, + "grad_norm": 0.6518070697784424, + "learning_rate": 7.666737430652128e-06, + "loss": 0.6804, + "step": 11730 + }, + { + "epoch": 0.6456601904342562, + "grad_norm": 0.7354047894477844, + "learning_rate": 7.666370752847223e-06, + "loss": 0.7648, + "step": 11731 + }, + { + "epoch": 0.6457152292366118, + "grad_norm": 0.7440805435180664, + "learning_rate": 7.666004055002588e-06, + "loss": 0.7674, + "step": 11732 + }, + { + "epoch": 0.6457702680389674, + "grad_norm": 1.6423569917678833, + "learning_rate": 7.665637337120981e-06, + "loss": 0.8957, + "step": 11733 + }, + { + "epoch": 0.6458253068413231, + "grad_norm": 0.6960558295249939, + "learning_rate": 7.665270599205156e-06, + "loss": 0.7278, + "step": 11734 + }, + { + "epoch": 0.6458803456436788, + "grad_norm": 0.6983850002288818, + "learning_rate": 7.664903841257871e-06, + "loss": 0.7351, + "step": 11735 + }, + { + "epoch": 0.6459353844460345, + "grad_norm": 0.6905686855316162, + "learning_rate": 7.664537063281883e-06, + "loss": 0.7558, + "step": 11736 + }, + { + "epoch": 0.6459904232483901, + "grad_norm": 0.7483980655670166, + "learning_rate": 7.664170265279946e-06, + "loss": 0.813, + "step": 11737 + }, + { + "epoch": 0.6460454620507458, + "grad_norm": 0.767756998538971, + "learning_rate": 7.66380344725482e-06, + "loss": 0.8397, + "step": 11738 + }, + { + "epoch": 0.6461005008531014, + "grad_norm": 0.7813250422477722, + "learning_rate": 7.66343660920926e-06, + "loss": 0.8034, + "step": 11739 + }, + { + "epoch": 0.6461555396554571, + "grad_norm": 0.7357046604156494, + "learning_rate": 7.663069751146022e-06, + "loss": 0.7604, + "step": 11740 + }, + { + "epoch": 0.6462105784578127, + "grad_norm": 0.620285153388977, + "learning_rate": 7.662702873067866e-06, + "loss": 0.6191, + "step": 11741 + }, + { + "epoch": 0.6462656172601684, + "grad_norm": 0.6711301803588867, + "learning_rate": 7.662335974977549e-06, + "loss": 0.7674, + "step": 11742 + }, + { + "epoch": 0.6463206560625241, + "grad_norm": 0.756258487701416, + "learning_rate": 7.661969056877824e-06, + "loss": 0.7074, + "step": 11743 + }, + { + "epoch": 0.6463756948648798, + "grad_norm": 0.8121050596237183, + "learning_rate": 7.661602118771456e-06, + "loss": 0.8028, + "step": 11744 + }, + { + "epoch": 0.6464307336672354, + "grad_norm": 0.735906720161438, + "learning_rate": 7.661235160661197e-06, + "loss": 0.7197, + "step": 11745 + }, + { + "epoch": 0.646485772469591, + "grad_norm": 0.644490122795105, + "learning_rate": 7.660868182549807e-06, + "loss": 0.6172, + "step": 11746 + }, + { + "epoch": 0.6465408112719467, + "grad_norm": 0.7228739261627197, + "learning_rate": 7.660501184440045e-06, + "loss": 0.8302, + "step": 11747 + }, + { + "epoch": 0.6465958500743024, + "grad_norm": 0.8292868137359619, + "learning_rate": 7.660134166334668e-06, + "loss": 0.7506, + "step": 11748 + }, + { + "epoch": 0.646650888876658, + "grad_norm": 0.7224695086479187, + "learning_rate": 7.659767128236433e-06, + "loss": 0.8043, + "step": 11749 + }, + { + "epoch": 0.6467059276790137, + "grad_norm": 0.7092188000679016, + "learning_rate": 7.659400070148102e-06, + "loss": 0.7838, + "step": 11750 + }, + { + "epoch": 0.6467609664813694, + "grad_norm": 0.6975178122520447, + "learning_rate": 7.65903299207243e-06, + "loss": 0.7576, + "step": 11751 + }, + { + "epoch": 0.6468160052837251, + "grad_norm": 0.6524471044540405, + "learning_rate": 7.658665894012179e-06, + "loss": 0.7822, + "step": 11752 + }, + { + "epoch": 0.6468710440860806, + "grad_norm": 0.8134269118309021, + "learning_rate": 7.658298775970107e-06, + "loss": 0.8116, + "step": 11753 + }, + { + "epoch": 0.6469260828884363, + "grad_norm": 0.7166362404823303, + "learning_rate": 7.657931637948974e-06, + "loss": 0.768, + "step": 11754 + }, + { + "epoch": 0.646981121690792, + "grad_norm": 0.6418643593788147, + "learning_rate": 7.657564479951535e-06, + "loss": 0.7488, + "step": 11755 + }, + { + "epoch": 0.6470361604931477, + "grad_norm": 0.7104085087776184, + "learning_rate": 7.657197301980556e-06, + "loss": 0.7518, + "step": 11756 + }, + { + "epoch": 0.6470911992955033, + "grad_norm": 0.7297894358634949, + "learning_rate": 7.656830104038793e-06, + "loss": 0.7877, + "step": 11757 + }, + { + "epoch": 0.647146238097859, + "grad_norm": 0.8037092089653015, + "learning_rate": 7.656462886129006e-06, + "loss": 0.7375, + "step": 11758 + }, + { + "epoch": 0.6472012769002147, + "grad_norm": 0.7498913407325745, + "learning_rate": 7.656095648253955e-06, + "loss": 0.7899, + "step": 11759 + }, + { + "epoch": 0.6472563157025704, + "grad_norm": 0.7383849620819092, + "learning_rate": 7.655728390416398e-06, + "loss": 0.8276, + "step": 11760 + }, + { + "epoch": 0.6473113545049259, + "grad_norm": 0.750481367111206, + "learning_rate": 7.6553611126191e-06, + "loss": 0.7649, + "step": 11761 + }, + { + "epoch": 0.6473663933072816, + "grad_norm": 0.8483286499977112, + "learning_rate": 7.654993814864817e-06, + "loss": 0.877, + "step": 11762 + }, + { + "epoch": 0.6474214321096373, + "grad_norm": 0.7938307523727417, + "learning_rate": 7.654626497156311e-06, + "loss": 0.8159, + "step": 11763 + }, + { + "epoch": 0.647476470911993, + "grad_norm": 0.6576653122901917, + "learning_rate": 7.654259159496343e-06, + "loss": 0.797, + "step": 11764 + }, + { + "epoch": 0.6475315097143486, + "grad_norm": 0.6495664715766907, + "learning_rate": 7.653891801887675e-06, + "loss": 0.6641, + "step": 11765 + }, + { + "epoch": 0.6475865485167043, + "grad_norm": 0.7447353601455688, + "learning_rate": 7.653524424333065e-06, + "loss": 0.667, + "step": 11766 + }, + { + "epoch": 0.64764158731906, + "grad_norm": 0.6565769910812378, + "learning_rate": 7.653157026835277e-06, + "loss": 0.7123, + "step": 11767 + }, + { + "epoch": 0.6476966261214157, + "grad_norm": 0.8406145572662354, + "learning_rate": 7.652789609397072e-06, + "loss": 0.7582, + "step": 11768 + }, + { + "epoch": 0.6477516649237712, + "grad_norm": 0.8478217720985413, + "learning_rate": 7.652422172021207e-06, + "loss": 0.6758, + "step": 11769 + }, + { + "epoch": 0.6478067037261269, + "grad_norm": 0.7230110168457031, + "learning_rate": 7.652054714710448e-06, + "loss": 0.8216, + "step": 11770 + }, + { + "epoch": 0.6478617425284826, + "grad_norm": 0.6718668341636658, + "learning_rate": 7.651687237467558e-06, + "loss": 0.7204, + "step": 11771 + }, + { + "epoch": 0.6479167813308382, + "grad_norm": 1.062383770942688, + "learning_rate": 7.651319740295296e-06, + "loss": 0.6853, + "step": 11772 + }, + { + "epoch": 0.6479718201331939, + "grad_norm": 0.7157385945320129, + "learning_rate": 7.650952223196423e-06, + "loss": 0.6826, + "step": 11773 + }, + { + "epoch": 0.6480268589355496, + "grad_norm": 0.6762190461158752, + "learning_rate": 7.650584686173703e-06, + "loss": 0.7673, + "step": 11774 + }, + { + "epoch": 0.6480818977379053, + "grad_norm": 0.7540121674537659, + "learning_rate": 7.650217129229897e-06, + "loss": 0.7361, + "step": 11775 + }, + { + "epoch": 0.6481369365402608, + "grad_norm": 1.0383096933364868, + "learning_rate": 7.649849552367771e-06, + "loss": 0.7936, + "step": 11776 + }, + { + "epoch": 0.6481919753426165, + "grad_norm": 0.6430917382240295, + "learning_rate": 7.649481955590084e-06, + "loss": 0.7738, + "step": 11777 + }, + { + "epoch": 0.6482470141449722, + "grad_norm": 0.7846735715866089, + "learning_rate": 7.6491143388996e-06, + "loss": 0.6892, + "step": 11778 + }, + { + "epoch": 0.6483020529473279, + "grad_norm": 0.7154437899589539, + "learning_rate": 7.64874670229908e-06, + "loss": 0.6889, + "step": 11779 + }, + { + "epoch": 0.6483570917496835, + "grad_norm": 0.731270432472229, + "learning_rate": 7.648379045791291e-06, + "loss": 0.6405, + "step": 11780 + }, + { + "epoch": 0.6484121305520392, + "grad_norm": 0.6782581210136414, + "learning_rate": 7.648011369378993e-06, + "loss": 0.7822, + "step": 11781 + }, + { + "epoch": 0.6484671693543949, + "grad_norm": 0.7025747299194336, + "learning_rate": 7.64764367306495e-06, + "loss": 0.6929, + "step": 11782 + }, + { + "epoch": 0.6485222081567505, + "grad_norm": 0.6791071891784668, + "learning_rate": 7.647275956851928e-06, + "loss": 0.7507, + "step": 11783 + }, + { + "epoch": 0.6485772469591061, + "grad_norm": 0.7598931193351746, + "learning_rate": 7.646908220742686e-06, + "loss": 0.776, + "step": 11784 + }, + { + "epoch": 0.6486322857614618, + "grad_norm": 0.6930273771286011, + "learning_rate": 7.646540464739993e-06, + "loss": 0.7653, + "step": 11785 + }, + { + "epoch": 0.6486873245638175, + "grad_norm": 0.7276393175125122, + "learning_rate": 7.646172688846608e-06, + "loss": 0.8102, + "step": 11786 + }, + { + "epoch": 0.6487423633661732, + "grad_norm": 0.6826562285423279, + "learning_rate": 7.645804893065298e-06, + "loss": 0.6182, + "step": 11787 + }, + { + "epoch": 0.6487974021685288, + "grad_norm": 0.7837507128715515, + "learning_rate": 7.645437077398827e-06, + "loss": 0.8124, + "step": 11788 + }, + { + "epoch": 0.6488524409708845, + "grad_norm": 0.6937540769577026, + "learning_rate": 7.645069241849959e-06, + "loss": 0.7831, + "step": 11789 + }, + { + "epoch": 0.6489074797732401, + "grad_norm": 0.6531546115875244, + "learning_rate": 7.644701386421458e-06, + "loss": 0.755, + "step": 11790 + }, + { + "epoch": 0.6489625185755958, + "grad_norm": 0.8563246726989746, + "learning_rate": 7.644333511116088e-06, + "loss": 0.7715, + "step": 11791 + }, + { + "epoch": 0.6490175573779514, + "grad_norm": 0.8330580592155457, + "learning_rate": 7.643965615936619e-06, + "loss": 0.6651, + "step": 11792 + }, + { + "epoch": 0.6490725961803071, + "grad_norm": 0.6478384137153625, + "learning_rate": 7.643597700885809e-06, + "loss": 0.7063, + "step": 11793 + }, + { + "epoch": 0.6491276349826628, + "grad_norm": 0.7169124484062195, + "learning_rate": 7.643229765966428e-06, + "loss": 0.7578, + "step": 11794 + }, + { + "epoch": 0.6491826737850185, + "grad_norm": 0.726198136806488, + "learning_rate": 7.642861811181239e-06, + "loss": 0.783, + "step": 11795 + }, + { + "epoch": 0.6492377125873741, + "grad_norm": 0.7167587280273438, + "learning_rate": 7.642493836533008e-06, + "loss": 0.81, + "step": 11796 + }, + { + "epoch": 0.6492927513897297, + "grad_norm": 0.7215337157249451, + "learning_rate": 7.642125842024502e-06, + "loss": 0.8176, + "step": 11797 + }, + { + "epoch": 0.6493477901920854, + "grad_norm": 0.7041502594947815, + "learning_rate": 7.641757827658484e-06, + "loss": 0.8117, + "step": 11798 + }, + { + "epoch": 0.6494028289944411, + "grad_norm": 1.0303698778152466, + "learning_rate": 7.64138979343772e-06, + "loss": 0.781, + "step": 11799 + }, + { + "epoch": 0.6494578677967967, + "grad_norm": 0.626518189907074, + "learning_rate": 7.64102173936498e-06, + "loss": 0.6668, + "step": 11800 + }, + { + "epoch": 0.6495129065991524, + "grad_norm": 0.8889065980911255, + "learning_rate": 7.640653665443025e-06, + "loss": 0.8076, + "step": 11801 + }, + { + "epoch": 0.6495679454015081, + "grad_norm": 0.8333556652069092, + "learning_rate": 7.640285571674626e-06, + "loss": 0.8111, + "step": 11802 + }, + { + "epoch": 0.6496229842038638, + "grad_norm": 0.7248615622520447, + "learning_rate": 7.639917458062547e-06, + "loss": 0.7876, + "step": 11803 + }, + { + "epoch": 0.6496780230062194, + "grad_norm": 0.8870820999145508, + "learning_rate": 7.639549324609554e-06, + "loss": 0.8586, + "step": 11804 + }, + { + "epoch": 0.649733061808575, + "grad_norm": 0.7777245044708252, + "learning_rate": 7.639181171318417e-06, + "loss": 0.7793, + "step": 11805 + }, + { + "epoch": 0.6497881006109307, + "grad_norm": 0.7858467102050781, + "learning_rate": 7.638812998191897e-06, + "loss": 0.7842, + "step": 11806 + }, + { + "epoch": 0.6498431394132864, + "grad_norm": 0.6278610825538635, + "learning_rate": 7.638444805232769e-06, + "loss": 0.6659, + "step": 11807 + }, + { + "epoch": 0.649898178215642, + "grad_norm": 0.6758826971054077, + "learning_rate": 7.638076592443795e-06, + "loss": 0.7047, + "step": 11808 + }, + { + "epoch": 0.6499532170179977, + "grad_norm": 0.745007336139679, + "learning_rate": 7.637708359827743e-06, + "loss": 0.8557, + "step": 11809 + }, + { + "epoch": 0.6500082558203534, + "grad_norm": 0.8092321157455444, + "learning_rate": 7.63734010738738e-06, + "loss": 0.7895, + "step": 11810 + }, + { + "epoch": 0.6500632946227091, + "grad_norm": 0.7055220603942871, + "learning_rate": 7.636971835125476e-06, + "loss": 0.7678, + "step": 11811 + }, + { + "epoch": 0.6501183334250646, + "grad_norm": 0.7130264043807983, + "learning_rate": 7.636603543044797e-06, + "loss": 0.7648, + "step": 11812 + }, + { + "epoch": 0.6501733722274203, + "grad_norm": 0.7494268417358398, + "learning_rate": 7.636235231148112e-06, + "loss": 0.7883, + "step": 11813 + }, + { + "epoch": 0.650228411029776, + "grad_norm": 0.7998068332672119, + "learning_rate": 7.635866899438189e-06, + "loss": 0.7849, + "step": 11814 + }, + { + "epoch": 0.6502834498321316, + "grad_norm": 0.6749094128608704, + "learning_rate": 7.635498547917795e-06, + "loss": 0.8488, + "step": 11815 + }, + { + "epoch": 0.6503384886344873, + "grad_norm": 0.743679940700531, + "learning_rate": 7.635130176589698e-06, + "loss": 0.7562, + "step": 11816 + }, + { + "epoch": 0.650393527436843, + "grad_norm": 0.8368289470672607, + "learning_rate": 7.634761785456671e-06, + "loss": 0.7012, + "step": 11817 } ], "logging_steps": 1, @@ -76382,7 +82745,7 @@ "attributes": {} } }, - "total_flos": 3.219017656946393e+19, + "total_flos": 3.4872691283585925e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null