diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12517 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 3564, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016835016835016834, + "grad_norm": 9.827384948730469, + "learning_rate": 1.1173184357541899e-08, + "loss": 1.7055253982543945, + "step": 2 + }, + { + "epoch": 0.003367003367003367, + "grad_norm": 9.42782211303711, + "learning_rate": 3.3519553072625695e-08, + "loss": 1.2431578636169434, + "step": 4 + }, + { + "epoch": 0.005050505050505051, + "grad_norm": 9.667145729064941, + "learning_rate": 5.586592178770949e-08, + "loss": 1.6887383460998535, + "step": 6 + }, + { + "epoch": 0.006734006734006734, + "grad_norm": 11.942709922790527, + "learning_rate": 7.82122905027933e-08, + "loss": 1.6064767837524414, + "step": 8 + }, + { + "epoch": 0.008417508417508417, + "grad_norm": 131.6094207763672, + "learning_rate": 1.005586592178771e-07, + "loss": 4.499759674072266, + "step": 10 + }, + { + "epoch": 0.010101010101010102, + "grad_norm": 6.955765724182129, + "learning_rate": 1.2290502793296089e-07, + "loss": 1.9788310527801514, + "step": 12 + }, + { + "epoch": 0.011784511784511785, + "grad_norm": 4.201331615447998, + "learning_rate": 1.452513966480447e-07, + "loss": 1.6753560304641724, + "step": 14 + }, + { + "epoch": 0.013468013468013467, + "grad_norm": 17.091062545776367, + "learning_rate": 1.6759776536312846e-07, + "loss": 1.6581202745437622, + "step": 16 + }, + { + "epoch": 0.015151515151515152, + "grad_norm": 22.55893325805664, + "learning_rate": 1.8994413407821228e-07, + "loss": 2.7158942222595215, + "step": 18 + }, + { + "epoch": 0.016835016835016835, + "grad_norm": 6.976036548614502, + "learning_rate": 2.122905027932961e-07, + "loss": 1.9487460851669312, + "step": 20 + }, + { + "epoch": 0.018518518518518517, + "grad_norm": 4.87603759765625, + "learning_rate": 2.3463687150837988e-07, + "loss": 1.845729947090149, + "step": 22 + }, + { + "epoch": 0.020202020202020204, + "grad_norm": 13.902255058288574, + "learning_rate": 2.5698324022346367e-07, + "loss": 3.498323917388916, + "step": 24 + }, + { + "epoch": 0.021885521885521887, + "grad_norm": 14.09145450592041, + "learning_rate": 2.7932960893854745e-07, + "loss": 2.7927517890930176, + "step": 26 + }, + { + "epoch": 0.02356902356902357, + "grad_norm": 12.507741928100586, + "learning_rate": 3.016759776536313e-07, + "loss": 2.1394832134246826, + "step": 28 + }, + { + "epoch": 0.025252525252525252, + "grad_norm": 50.04438018798828, + "learning_rate": 3.240223463687151e-07, + "loss": 3.230577230453491, + "step": 30 + }, + { + "epoch": 0.026936026936026935, + "grad_norm": 22.915058135986328, + "learning_rate": 3.4636871508379887e-07, + "loss": 1.7826504707336426, + "step": 32 + }, + { + "epoch": 0.02861952861952862, + "grad_norm": 34.94866943359375, + "learning_rate": 3.6871508379888266e-07, + "loss": 3.590939998626709, + "step": 34 + }, + { + "epoch": 0.030303030303030304, + "grad_norm": 9.724323272705078, + "learning_rate": 3.9106145251396645e-07, + "loss": 1.9341622591018677, + "step": 36 + }, + { + "epoch": 0.03198653198653199, + "grad_norm": 16.15651512145996, + "learning_rate": 4.134078212290503e-07, + "loss": 1.4625201225280762, + "step": 38 + }, + { + "epoch": 0.03367003367003367, + "grad_norm": 7.4519453048706055, + "learning_rate": 4.35754189944134e-07, + "loss": 2.242250919342041, + "step": 40 + }, + { + "epoch": 0.03535353535353535, + "grad_norm": 6.571437835693359, + "learning_rate": 4.5810055865921786e-07, + "loss": 2.679516315460205, + "step": 42 + }, + { + "epoch": 0.037037037037037035, + "grad_norm": 19.185373306274414, + "learning_rate": 4.804469273743016e-07, + "loss": 2.1858067512512207, + "step": 44 + }, + { + "epoch": 0.03872053872053872, + "grad_norm": 18.07056999206543, + "learning_rate": 5.027932960893855e-07, + "loss": 1.433751106262207, + "step": 46 + }, + { + "epoch": 0.04040404040404041, + "grad_norm": 24.015710830688477, + "learning_rate": 5.251396648044693e-07, + "loss": 2.102412700653076, + "step": 48 + }, + { + "epoch": 0.04208754208754209, + "grad_norm": 22.281003952026367, + "learning_rate": 5.474860335195531e-07, + "loss": 1.8496794700622559, + "step": 50 + }, + { + "epoch": 0.04377104377104377, + "grad_norm": 16.242393493652344, + "learning_rate": 5.698324022346367e-07, + "loss": 1.9199731349945068, + "step": 52 + }, + { + "epoch": 0.045454545454545456, + "grad_norm": 11.205278396606445, + "learning_rate": 5.921787709497206e-07, + "loss": 1.8013508319854736, + "step": 54 + }, + { + "epoch": 0.04713804713804714, + "grad_norm": 4.4281840324401855, + "learning_rate": 6.145251396648044e-07, + "loss": 1.5387322902679443, + "step": 56 + }, + { + "epoch": 0.04882154882154882, + "grad_norm": 27.68507194519043, + "learning_rate": 6.368715083798882e-07, + "loss": 1.7617017030715942, + "step": 58 + }, + { + "epoch": 0.050505050505050504, + "grad_norm": 13.444940567016602, + "learning_rate": 6.59217877094972e-07, + "loss": 1.5345146656036377, + "step": 60 + }, + { + "epoch": 0.05218855218855219, + "grad_norm": 12.37048625946045, + "learning_rate": 6.815642458100558e-07, + "loss": 1.5472785234451294, + "step": 62 + }, + { + "epoch": 0.05387205387205387, + "grad_norm": 5.660282135009766, + "learning_rate": 7.039106145251397e-07, + "loss": 1.3724396228790283, + "step": 64 + }, + { + "epoch": 0.05555555555555555, + "grad_norm": 32.10633087158203, + "learning_rate": 7.262569832402235e-07, + "loss": 1.7364461421966553, + "step": 66 + }, + { + "epoch": 0.05723905723905724, + "grad_norm": 15.033787727355957, + "learning_rate": 7.486033519553073e-07, + "loss": 1.5618245601654053, + "step": 68 + }, + { + "epoch": 0.058922558922558925, + "grad_norm": 5.500316143035889, + "learning_rate": 7.709497206703909e-07, + "loss": 1.4692459106445312, + "step": 70 + }, + { + "epoch": 0.06060606060606061, + "grad_norm": 7.862852096557617, + "learning_rate": 7.932960893854748e-07, + "loss": 1.767068862915039, + "step": 72 + }, + { + "epoch": 0.06228956228956229, + "grad_norm": 3.3375768661499023, + "learning_rate": 8.156424581005586e-07, + "loss": 1.5882585048675537, + "step": 74 + }, + { + "epoch": 0.06397306397306397, + "grad_norm": 4.3638529777526855, + "learning_rate": 8.379888268156424e-07, + "loss": 1.0791618824005127, + "step": 76 + }, + { + "epoch": 0.06565656565656566, + "grad_norm": 3.2826614379882812, + "learning_rate": 8.603351955307262e-07, + "loss": 1.623827338218689, + "step": 78 + }, + { + "epoch": 0.06734006734006734, + "grad_norm": 13.223998069763184, + "learning_rate": 8.8268156424581e-07, + "loss": 1.4189568758010864, + "step": 80 + }, + { + "epoch": 0.06902356902356903, + "grad_norm": 8.176948547363281, + "learning_rate": 9.050279329608939e-07, + "loss": 1.5663306713104248, + "step": 82 + }, + { + "epoch": 0.0707070707070707, + "grad_norm": 8.477921485900879, + "learning_rate": 9.273743016759777e-07, + "loss": 1.3473039865493774, + "step": 84 + }, + { + "epoch": 0.0723905723905724, + "grad_norm": 5.039812088012695, + "learning_rate": 9.497206703910615e-07, + "loss": 1.4909709692001343, + "step": 86 + }, + { + "epoch": 0.07407407407407407, + "grad_norm": 4.436509132385254, + "learning_rate": 9.720670391061452e-07, + "loss": 1.3051445484161377, + "step": 88 + }, + { + "epoch": 0.07575757575757576, + "grad_norm": 10.7329740524292, + "learning_rate": 9.94413407821229e-07, + "loss": 1.4471063613891602, + "step": 90 + }, + { + "epoch": 0.07744107744107744, + "grad_norm": 48.17202377319336, + "learning_rate": 1.0167597765363128e-06, + "loss": 1.1504158973693848, + "step": 92 + }, + { + "epoch": 0.07912457912457913, + "grad_norm": 9.630391120910645, + "learning_rate": 1.0391061452513965e-06, + "loss": 1.238828182220459, + "step": 94 + }, + { + "epoch": 0.08080808080808081, + "grad_norm": 3.6707308292388916, + "learning_rate": 1.0614525139664804e-06, + "loss": 1.29024076461792, + "step": 96 + }, + { + "epoch": 0.08249158249158249, + "grad_norm": 20.06619644165039, + "learning_rate": 1.0837988826815643e-06, + "loss": 1.2375919818878174, + "step": 98 + }, + { + "epoch": 0.08417508417508418, + "grad_norm": 6.117098331451416, + "learning_rate": 1.106145251396648e-06, + "loss": 1.2162528038024902, + "step": 100 + }, + { + "epoch": 0.08585858585858586, + "grad_norm": 7.965595245361328, + "learning_rate": 1.1284916201117319e-06, + "loss": 1.0878969430923462, + "step": 102 + }, + { + "epoch": 0.08754208754208755, + "grad_norm": 3.471269369125366, + "learning_rate": 1.1508379888268155e-06, + "loss": 0.8488566875457764, + "step": 104 + }, + { + "epoch": 0.08922558922558922, + "grad_norm": 19.03371238708496, + "learning_rate": 1.1731843575418994e-06, + "loss": 0.9605998992919922, + "step": 106 + }, + { + "epoch": 0.09090909090909091, + "grad_norm": 4.8145551681518555, + "learning_rate": 1.1955307262569831e-06, + "loss": 1.2580342292785645, + "step": 108 + }, + { + "epoch": 0.09259259259259259, + "grad_norm": 12.215010643005371, + "learning_rate": 1.217877094972067e-06, + "loss": 0.8208008408546448, + "step": 110 + }, + { + "epoch": 0.09427609427609428, + "grad_norm": 5.212827682495117, + "learning_rate": 1.2402234636871507e-06, + "loss": 1.2487308979034424, + "step": 112 + }, + { + "epoch": 0.09595959595959595, + "grad_norm": 110.1784439086914, + "learning_rate": 1.2625698324022344e-06, + "loss": 1.0615664720535278, + "step": 114 + }, + { + "epoch": 0.09764309764309764, + "grad_norm": 8.633198738098145, + "learning_rate": 1.2849162011173185e-06, + "loss": 0.7479297518730164, + "step": 116 + }, + { + "epoch": 0.09932659932659933, + "grad_norm": 3.4412970542907715, + "learning_rate": 1.3072625698324022e-06, + "loss": 1.1516764163970947, + "step": 118 + }, + { + "epoch": 0.10101010101010101, + "grad_norm": 2.8980441093444824, + "learning_rate": 1.329608938547486e-06, + "loss": 1.0023488998413086, + "step": 120 + }, + { + "epoch": 0.1026936026936027, + "grad_norm": 4.491576671600342, + "learning_rate": 1.3519553072625697e-06, + "loss": 1.207779884338379, + "step": 122 + }, + { + "epoch": 0.10437710437710437, + "grad_norm": 5.334079742431641, + "learning_rate": 1.3743016759776536e-06, + "loss": 0.8073678612709045, + "step": 124 + }, + { + "epoch": 0.10606060606060606, + "grad_norm": 5.402129650115967, + "learning_rate": 1.3966480446927373e-06, + "loss": 0.7180484533309937, + "step": 126 + }, + { + "epoch": 0.10774410774410774, + "grad_norm": 33.15776824951172, + "learning_rate": 1.4189944134078212e-06, + "loss": 1.076992392539978, + "step": 128 + }, + { + "epoch": 0.10942760942760943, + "grad_norm": 12.190916061401367, + "learning_rate": 1.441340782122905e-06, + "loss": 0.9793660640716553, + "step": 130 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 5.5417070388793945, + "learning_rate": 1.4636871508379886e-06, + "loss": 0.9299952387809753, + "step": 132 + }, + { + "epoch": 0.1127946127946128, + "grad_norm": 3.002917766571045, + "learning_rate": 1.4860335195530727e-06, + "loss": 1.1973538398742676, + "step": 134 + }, + { + "epoch": 0.11447811447811448, + "grad_norm": 13.795450210571289, + "learning_rate": 1.5083798882681564e-06, + "loss": 1.1933711767196655, + "step": 136 + }, + { + "epoch": 0.11616161616161616, + "grad_norm": 3.4793336391448975, + "learning_rate": 1.5307262569832403e-06, + "loss": 1.5386559963226318, + "step": 138 + }, + { + "epoch": 0.11784511784511785, + "grad_norm": 9.980926513671875, + "learning_rate": 1.553072625698324e-06, + "loss": 1.125044584274292, + "step": 140 + }, + { + "epoch": 0.11952861952861953, + "grad_norm": 4.957187175750732, + "learning_rate": 1.5754189944134078e-06, + "loss": 1.0593317747116089, + "step": 142 + }, + { + "epoch": 0.12121212121212122, + "grad_norm": 14.749825477600098, + "learning_rate": 1.5977653631284915e-06, + "loss": 0.9547094702720642, + "step": 144 + }, + { + "epoch": 0.12289562289562289, + "grad_norm": 3.5250778198242188, + "learning_rate": 1.6201117318435752e-06, + "loss": 1.1345624923706055, + "step": 146 + }, + { + "epoch": 0.12457912457912458, + "grad_norm": 3.4003188610076904, + "learning_rate": 1.642458100558659e-06, + "loss": 0.9924101829528809, + "step": 148 + }, + { + "epoch": 0.12626262626262627, + "grad_norm": 18.434391021728516, + "learning_rate": 1.6648044692737428e-06, + "loss": 1.2128210067749023, + "step": 150 + }, + { + "epoch": 0.12794612794612795, + "grad_norm": 6.9610066413879395, + "learning_rate": 1.6871508379888269e-06, + "loss": 0.9494305849075317, + "step": 152 + }, + { + "epoch": 0.12962962962962962, + "grad_norm": 42.241188049316406, + "learning_rate": 1.7094972067039106e-06, + "loss": 1.1769180297851562, + "step": 154 + }, + { + "epoch": 0.13131313131313133, + "grad_norm": 19.53082275390625, + "learning_rate": 1.7318435754189945e-06, + "loss": 1.0955569744110107, + "step": 156 + }, + { + "epoch": 0.132996632996633, + "grad_norm": 4.005194187164307, + "learning_rate": 1.7541899441340781e-06, + "loss": 1.0531185865402222, + "step": 158 + }, + { + "epoch": 0.13468013468013468, + "grad_norm": 5.709774494171143, + "learning_rate": 1.776536312849162e-06, + "loss": 1.1533485651016235, + "step": 160 + }, + { + "epoch": 0.13636363636363635, + "grad_norm": 2.3597922325134277, + "learning_rate": 1.7988826815642457e-06, + "loss": 1.0321946144104004, + "step": 162 + }, + { + "epoch": 0.13804713804713806, + "grad_norm": 16.570262908935547, + "learning_rate": 1.8212290502793294e-06, + "loss": 0.9637615084648132, + "step": 164 + }, + { + "epoch": 0.13973063973063973, + "grad_norm": 8.452648162841797, + "learning_rate": 1.8435754189944133e-06, + "loss": 0.9408825039863586, + "step": 166 + }, + { + "epoch": 0.1414141414141414, + "grad_norm": 2.8005619049072266, + "learning_rate": 1.865921787709497e-06, + "loss": 1.127833366394043, + "step": 168 + }, + { + "epoch": 0.14309764309764308, + "grad_norm": 6.316201686859131, + "learning_rate": 1.8882681564245809e-06, + "loss": 1.0138617753982544, + "step": 170 + }, + { + "epoch": 0.1447811447811448, + "grad_norm": 14.958882331848145, + "learning_rate": 1.9106145251396648e-06, + "loss": 1.0158287286758423, + "step": 172 + }, + { + "epoch": 0.14646464646464646, + "grad_norm": 4.5443267822265625, + "learning_rate": 1.9329608938547484e-06, + "loss": 0.7117235064506531, + "step": 174 + }, + { + "epoch": 0.14814814814814814, + "grad_norm": 4.039905548095703, + "learning_rate": 1.9553072625698325e-06, + "loss": 1.0871771574020386, + "step": 176 + }, + { + "epoch": 0.14983164983164984, + "grad_norm": 3.271326780319214, + "learning_rate": 1.9776536312849162e-06, + "loss": 1.267643690109253, + "step": 178 + }, + { + "epoch": 0.15151515151515152, + "grad_norm": 5.037292957305908, + "learning_rate": 2e-06, + "loss": 1.0257434844970703, + "step": 180 + }, + { + "epoch": 0.1531986531986532, + "grad_norm": 4.92929220199585, + "learning_rate": 1.9999984495606584e-06, + "loss": 1.4013102054595947, + "step": 182 + }, + { + "epoch": 0.15488215488215487, + "grad_norm": 23.51206398010254, + "learning_rate": 1.999993798247977e-06, + "loss": 1.0038059949874878, + "step": 184 + }, + { + "epoch": 0.15656565656565657, + "grad_norm": 14.101850509643555, + "learning_rate": 1.99998604607798e-06, + "loss": 1.1263923645019531, + "step": 186 + }, + { + "epoch": 0.15824915824915825, + "grad_norm": 11.950604438781738, + "learning_rate": 1.9999751930773778e-06, + "loss": 0.9272401332855225, + "step": 188 + }, + { + "epoch": 0.15993265993265993, + "grad_norm": 21.03433609008789, + "learning_rate": 1.999961239283563e-06, + "loss": 0.7770416140556335, + "step": 190 + }, + { + "epoch": 0.16161616161616163, + "grad_norm": 3.4966766834259033, + "learning_rate": 1.999944184744613e-06, + "loss": 1.348158597946167, + "step": 192 + }, + { + "epoch": 0.1632996632996633, + "grad_norm": 3.6538894176483154, + "learning_rate": 1.999924029519287e-06, + "loss": 1.2516090869903564, + "step": 194 + }, + { + "epoch": 0.16498316498316498, + "grad_norm": 4.83535623550415, + "learning_rate": 1.9999007736770295e-06, + "loss": 1.072089672088623, + "step": 196 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 3.3021559715270996, + "learning_rate": 1.9998744172979654e-06, + "loss": 1.1623098850250244, + "step": 198 + }, + { + "epoch": 0.16835016835016836, + "grad_norm": 14.60655689239502, + "learning_rate": 1.9998449604729044e-06, + "loss": 0.8636209964752197, + "step": 200 + }, + { + "epoch": 0.17003367003367004, + "grad_norm": 12.559534072875977, + "learning_rate": 1.9998124033033366e-06, + "loss": 0.895442008972168, + "step": 202 + }, + { + "epoch": 0.1717171717171717, + "grad_norm": 4.964874744415283, + "learning_rate": 1.9997767459014363e-06, + "loss": 1.0330384969711304, + "step": 204 + }, + { + "epoch": 0.1734006734006734, + "grad_norm": 3.3170907497406006, + "learning_rate": 1.9997379883900572e-06, + "loss": 0.9942055940628052, + "step": 206 + }, + { + "epoch": 0.1750841750841751, + "grad_norm": 4.744529724121094, + "learning_rate": 1.999696130902736e-06, + "loss": 1.2099803686141968, + "step": 208 + }, + { + "epoch": 0.17676767676767677, + "grad_norm": 11.850593566894531, + "learning_rate": 1.9996511735836895e-06, + "loss": 0.7535406351089478, + "step": 210 + }, + { + "epoch": 0.17845117845117844, + "grad_norm": 16.69972038269043, + "learning_rate": 1.999603116587814e-06, + "loss": 0.9160436987876892, + "step": 212 + }, + { + "epoch": 0.18013468013468015, + "grad_norm": 2.5802817344665527, + "learning_rate": 1.9995519600806863e-06, + "loss": 1.3276009559631348, + "step": 214 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 9.903021812438965, + "learning_rate": 1.999497704238562e-06, + "loss": 0.8258368372917175, + "step": 216 + }, + { + "epoch": 0.1835016835016835, + "grad_norm": 10.159919738769531, + "learning_rate": 1.9994403492483755e-06, + "loss": 0.6640470027923584, + "step": 218 + }, + { + "epoch": 0.18518518518518517, + "grad_norm": 3.8735828399658203, + "learning_rate": 1.999379895307739e-06, + "loss": 1.3416516780853271, + "step": 220 + }, + { + "epoch": 0.18686868686868688, + "grad_norm": 3.4755043983459473, + "learning_rate": 1.999316342624941e-06, + "loss": 0.9075236320495605, + "step": 222 + }, + { + "epoch": 0.18855218855218855, + "grad_norm": 5.18587064743042, + "learning_rate": 1.999249691418948e-06, + "loss": 1.193176507949829, + "step": 224 + }, + { + "epoch": 0.19023569023569023, + "grad_norm": 6.766015529632568, + "learning_rate": 1.999179941919401e-06, + "loss": 0.9458363056182861, + "step": 226 + }, + { + "epoch": 0.1919191919191919, + "grad_norm": 12.469842910766602, + "learning_rate": 1.999107094366617e-06, + "loss": 1.1906776428222656, + "step": 228 + }, + { + "epoch": 0.1936026936026936, + "grad_norm": 15.036520004272461, + "learning_rate": 1.9990311490115858e-06, + "loss": 1.3650178909301758, + "step": 230 + }, + { + "epoch": 0.19528619528619529, + "grad_norm": 5.799370288848877, + "learning_rate": 1.9989521061159715e-06, + "loss": 1.0698531866073608, + "step": 232 + }, + { + "epoch": 0.19696969696969696, + "grad_norm": 5.714483737945557, + "learning_rate": 1.9988699659521098e-06, + "loss": 1.1641753911972046, + "step": 234 + }, + { + "epoch": 0.19865319865319866, + "grad_norm": 10.119220733642578, + "learning_rate": 1.9987847288030083e-06, + "loss": 0.9833089113235474, + "step": 236 + }, + { + "epoch": 0.20033670033670034, + "grad_norm": 3.4788730144500732, + "learning_rate": 1.998696394962345e-06, + "loss": 1.1086716651916504, + "step": 238 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 3.9894561767578125, + "learning_rate": 1.998604964734467e-06, + "loss": 0.9258865118026733, + "step": 240 + }, + { + "epoch": 0.2037037037037037, + "grad_norm": 4.706192970275879, + "learning_rate": 1.99851043843439e-06, + "loss": 1.1667051315307617, + "step": 242 + }, + { + "epoch": 0.2053872053872054, + "grad_norm": 15.748969078063965, + "learning_rate": 1.9984128163877964e-06, + "loss": 0.9964404106140137, + "step": 244 + }, + { + "epoch": 0.20707070707070707, + "grad_norm": 9.65405559539795, + "learning_rate": 1.998312098931036e-06, + "loss": 0.6644821166992188, + "step": 246 + }, + { + "epoch": 0.20875420875420875, + "grad_norm": 13.462628364562988, + "learning_rate": 1.998208286411122e-06, + "loss": 1.2101833820343018, + "step": 248 + }, + { + "epoch": 0.21043771043771045, + "grad_norm": 2.0463879108428955, + "learning_rate": 1.9981013791857327e-06, + "loss": 0.9958995580673218, + "step": 250 + }, + { + "epoch": 0.21212121212121213, + "grad_norm": 3.3968567848205566, + "learning_rate": 1.997991377623209e-06, + "loss": 0.8969879150390625, + "step": 252 + }, + { + "epoch": 0.2138047138047138, + "grad_norm": 17.595094680786133, + "learning_rate": 1.9978782821025513e-06, + "loss": 1.0462696552276611, + "step": 254 + }, + { + "epoch": 0.21548821548821548, + "grad_norm": 13.578154563903809, + "learning_rate": 1.9977620930134223e-06, + "loss": 1.1988019943237305, + "step": 256 + }, + { + "epoch": 0.21717171717171718, + "grad_norm": 4.280734062194824, + "learning_rate": 1.9976428107561415e-06, + "loss": 0.8459457755088806, + "step": 258 + }, + { + "epoch": 0.21885521885521886, + "grad_norm": 2.570441246032715, + "learning_rate": 1.997520435741687e-06, + "loss": 1.0279544591903687, + "step": 260 + }, + { + "epoch": 0.22053872053872053, + "grad_norm": 6.806192398071289, + "learning_rate": 1.9973949683916927e-06, + "loss": 1.0510814189910889, + "step": 262 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 4.318380832672119, + "learning_rate": 1.9972664091384454e-06, + "loss": 1.1062796115875244, + "step": 264 + }, + { + "epoch": 0.2239057239057239, + "grad_norm": 3.807039976119995, + "learning_rate": 1.997134758424886e-06, + "loss": 1.1960452795028687, + "step": 266 + }, + { + "epoch": 0.2255892255892256, + "grad_norm": 6.313713550567627, + "learning_rate": 1.9970000167046075e-06, + "loss": 0.6546218991279602, + "step": 268 + }, + { + "epoch": 0.22727272727272727, + "grad_norm": 3.2756094932556152, + "learning_rate": 1.996862184441851e-06, + "loss": 0.9819681644439697, + "step": 270 + }, + { + "epoch": 0.22895622895622897, + "grad_norm": 13.153508186340332, + "learning_rate": 1.9967212621115065e-06, + "loss": 1.3135335445404053, + "step": 272 + }, + { + "epoch": 0.23063973063973064, + "grad_norm": 14.49177074432373, + "learning_rate": 1.996577250199111e-06, + "loss": 1.1486749649047852, + "step": 274 + }, + { + "epoch": 0.23232323232323232, + "grad_norm": 26.132858276367188, + "learning_rate": 1.9964301492008464e-06, + "loss": 0.9009004831314087, + "step": 276 + }, + { + "epoch": 0.234006734006734, + "grad_norm": 3.963716506958008, + "learning_rate": 1.996279959623537e-06, + "loss": 1.1650899648666382, + "step": 278 + }, + { + "epoch": 0.2356902356902357, + "grad_norm": 13.785598754882812, + "learning_rate": 1.9961266819846495e-06, + "loss": 0.9621269702911377, + "step": 280 + }, + { + "epoch": 0.23737373737373738, + "grad_norm": 6.935214042663574, + "learning_rate": 1.9959703168122897e-06, + "loss": 0.9427906274795532, + "step": 282 + }, + { + "epoch": 0.23905723905723905, + "grad_norm": 3.0722286701202393, + "learning_rate": 1.995810864645202e-06, + "loss": 1.2749511003494263, + "step": 284 + }, + { + "epoch": 0.24074074074074073, + "grad_norm": 4.774331092834473, + "learning_rate": 1.995648326032765e-06, + "loss": 0.9315462112426758, + "step": 286 + }, + { + "epoch": 0.24242424242424243, + "grad_norm": 4.373500823974609, + "learning_rate": 1.9954827015349937e-06, + "loss": 0.8452310562133789, + "step": 288 + }, + { + "epoch": 0.2441077441077441, + "grad_norm": 9.997944831848145, + "learning_rate": 1.9953139917225333e-06, + "loss": 1.1583993434906006, + "step": 290 + }, + { + "epoch": 0.24579124579124578, + "grad_norm": 9.785924911499023, + "learning_rate": 1.995142197176661e-06, + "loss": 0.6743492484092712, + "step": 292 + }, + { + "epoch": 0.2474747474747475, + "grad_norm": 9.52839183807373, + "learning_rate": 1.9949673184892803e-06, + "loss": 1.274944543838501, + "step": 294 + }, + { + "epoch": 0.24915824915824916, + "grad_norm": 13.619229316711426, + "learning_rate": 1.9947893562629227e-06, + "loss": 1.085368037223816, + "step": 296 + }, + { + "epoch": 0.25084175084175087, + "grad_norm": 6.220252513885498, + "learning_rate": 1.9946083111107425e-06, + "loss": 0.6333813667297363, + "step": 298 + }, + { + "epoch": 0.25252525252525254, + "grad_norm": 12.346251487731934, + "learning_rate": 1.9944241836565167e-06, + "loss": 0.7786128520965576, + "step": 300 + }, + { + "epoch": 0.2542087542087542, + "grad_norm": 10.734468460083008, + "learning_rate": 1.9942369745346417e-06, + "loss": 1.0820167064666748, + "step": 302 + }, + { + "epoch": 0.2558922558922559, + "grad_norm": 25.510744094848633, + "learning_rate": 1.9940466843901318e-06, + "loss": 0.9161986112594604, + "step": 304 + }, + { + "epoch": 0.25757575757575757, + "grad_norm": 5.673551559448242, + "learning_rate": 1.9938533138786163e-06, + "loss": 1.3526289463043213, + "step": 306 + }, + { + "epoch": 0.25925925925925924, + "grad_norm": 11.891182899475098, + "learning_rate": 1.9936568636663383e-06, + "loss": 1.1077102422714233, + "step": 308 + }, + { + "epoch": 0.2609427609427609, + "grad_norm": 7.852316856384277, + "learning_rate": 1.9934573344301514e-06, + "loss": 1.0809465646743774, + "step": 310 + }, + { + "epoch": 0.26262626262626265, + "grad_norm": 20.96988296508789, + "learning_rate": 1.993254726857518e-06, + "loss": 1.225387454032898, + "step": 312 + }, + { + "epoch": 0.26430976430976433, + "grad_norm": 5.888166427612305, + "learning_rate": 1.9930490416465057e-06, + "loss": 1.086962103843689, + "step": 314 + }, + { + "epoch": 0.265993265993266, + "grad_norm": 2.8382439613342285, + "learning_rate": 1.992840279505787e-06, + "loss": 1.225638508796692, + "step": 316 + }, + { + "epoch": 0.2676767676767677, + "grad_norm": 4.078027725219727, + "learning_rate": 1.9926284411546355e-06, + "loss": 0.99470055103302, + "step": 318 + }, + { + "epoch": 0.26936026936026936, + "grad_norm": 14.269658088684082, + "learning_rate": 1.9924135273229235e-06, + "loss": 0.727924108505249, + "step": 320 + }, + { + "epoch": 0.27104377104377103, + "grad_norm": 3.730602502822876, + "learning_rate": 1.9921955387511195e-06, + "loss": 0.9582691192626953, + "step": 322 + }, + { + "epoch": 0.2727272727272727, + "grad_norm": 3.153249979019165, + "learning_rate": 1.991974476190285e-06, + "loss": 1.263975977897644, + "step": 324 + }, + { + "epoch": 0.27441077441077444, + "grad_norm": 2.4196362495422363, + "learning_rate": 1.9917503404020747e-06, + "loss": 1.0396244525909424, + "step": 326 + }, + { + "epoch": 0.2760942760942761, + "grad_norm": 12.836146354675293, + "learning_rate": 1.9915231321587305e-06, + "loss": 0.8178722262382507, + "step": 328 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 5.543509483337402, + "learning_rate": 1.99129285224308e-06, + "loss": 0.9038114547729492, + "step": 330 + }, + { + "epoch": 0.27946127946127947, + "grad_norm": 5.564317226409912, + "learning_rate": 1.9910595014485347e-06, + "loss": 1.0971403121948242, + "step": 332 + }, + { + "epoch": 0.28114478114478114, + "grad_norm": 5.212599754333496, + "learning_rate": 1.990823080579086e-06, + "loss": 1.0671043395996094, + "step": 334 + }, + { + "epoch": 0.2828282828282828, + "grad_norm": 5.401691436767578, + "learning_rate": 1.990583590449303e-06, + "loss": 1.0057094097137451, + "step": 336 + }, + { + "epoch": 0.2845117845117845, + "grad_norm": 3.39033579826355, + "learning_rate": 1.990341031884331e-06, + "loss": 1.1939620971679688, + "step": 338 + }, + { + "epoch": 0.28619528619528617, + "grad_norm": 12.433296203613281, + "learning_rate": 1.9900954057198856e-06, + "loss": 0.9549685120582581, + "step": 340 + }, + { + "epoch": 0.2878787878787879, + "grad_norm": 23.119340896606445, + "learning_rate": 1.989846712802252e-06, + "loss": 1.1277296543121338, + "step": 342 + }, + { + "epoch": 0.2895622895622896, + "grad_norm": 42.77076721191406, + "learning_rate": 1.9895949539882827e-06, + "loss": 0.8779406547546387, + "step": 344 + }, + { + "epoch": 0.29124579124579125, + "grad_norm": 2.3723807334899902, + "learning_rate": 1.9893401301453926e-06, + "loss": 1.1096537113189697, + "step": 346 + }, + { + "epoch": 0.29292929292929293, + "grad_norm": 7.652088165283203, + "learning_rate": 1.989082242151556e-06, + "loss": 1.053053379058838, + "step": 348 + }, + { + "epoch": 0.2946127946127946, + "grad_norm": 8.224458694458008, + "learning_rate": 1.988821290895307e-06, + "loss": 0.7571377754211426, + "step": 350 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 3.486557722091675, + "learning_rate": 1.988557277275732e-06, + "loss": 0.5875279903411865, + "step": 352 + }, + { + "epoch": 0.29797979797979796, + "grad_norm": 3.368520498275757, + "learning_rate": 1.9882902022024683e-06, + "loss": 1.0230705738067627, + "step": 354 + }, + { + "epoch": 0.2996632996632997, + "grad_norm": 7.633305549621582, + "learning_rate": 1.9880200665957026e-06, + "loss": 1.0808613300323486, + "step": 356 + }, + { + "epoch": 0.30134680134680136, + "grad_norm": 15.621920585632324, + "learning_rate": 1.9877468713861656e-06, + "loss": 0.9313445687294006, + "step": 358 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 3.499727249145508, + "learning_rate": 1.98747061751513e-06, + "loss": 0.8186299800872803, + "step": 360 + }, + { + "epoch": 0.3047138047138047, + "grad_norm": 3.564624547958374, + "learning_rate": 1.987191305934406e-06, + "loss": 0.9808353185653687, + "step": 362 + }, + { + "epoch": 0.3063973063973064, + "grad_norm": 2.6821398735046387, + "learning_rate": 1.98690893760634e-06, + "loss": 1.2293064594268799, + "step": 364 + }, + { + "epoch": 0.30808080808080807, + "grad_norm": 8.49547004699707, + "learning_rate": 1.9866235135038095e-06, + "loss": 0.8337675333023071, + "step": 366 + }, + { + "epoch": 0.30976430976430974, + "grad_norm": 3.7763280868530273, + "learning_rate": 1.986335034610221e-06, + "loss": 0.8535688519477844, + "step": 368 + }, + { + "epoch": 0.3114478114478115, + "grad_norm": 6.456183910369873, + "learning_rate": 1.9860435019195054e-06, + "loss": 1.0865236520767212, + "step": 370 + }, + { + "epoch": 0.31313131313131315, + "grad_norm": 6.974287509918213, + "learning_rate": 1.9857489164361147e-06, + "loss": 1.2327494621276855, + "step": 372 + }, + { + "epoch": 0.3148148148148148, + "grad_norm": 12.779848098754883, + "learning_rate": 1.9854512791750214e-06, + "loss": 0.6957528591156006, + "step": 374 + }, + { + "epoch": 0.3164983164983165, + "grad_norm": 3.759835720062256, + "learning_rate": 1.9851505911617097e-06, + "loss": 0.9909141659736633, + "step": 376 + }, + { + "epoch": 0.3181818181818182, + "grad_norm": 7.0778608322143555, + "learning_rate": 1.984846853432177e-06, + "loss": 1.3244696855545044, + "step": 378 + }, + { + "epoch": 0.31986531986531985, + "grad_norm": 24.917316436767578, + "learning_rate": 1.9845400670329275e-06, + "loss": 0.7233332991600037, + "step": 380 + }, + { + "epoch": 0.32154882154882153, + "grad_norm": 10.7407865524292, + "learning_rate": 1.98423023302097e-06, + "loss": 0.9228682518005371, + "step": 382 + }, + { + "epoch": 0.32323232323232326, + "grad_norm": 4.701694011688232, + "learning_rate": 1.9839173524638115e-06, + "loss": 1.1106748580932617, + "step": 384 + }, + { + "epoch": 0.32491582491582494, + "grad_norm": 5.9592976570129395, + "learning_rate": 1.9836014264394587e-06, + "loss": 0.7204115390777588, + "step": 386 + }, + { + "epoch": 0.3265993265993266, + "grad_norm": 24.467937469482422, + "learning_rate": 1.9832824560364093e-06, + "loss": 0.9101235866546631, + "step": 388 + }, + { + "epoch": 0.3282828282828283, + "grad_norm": 55.19502258300781, + "learning_rate": 1.98296044235365e-06, + "loss": 1.0853596925735474, + "step": 390 + }, + { + "epoch": 0.32996632996632996, + "grad_norm": 110.57111358642578, + "learning_rate": 1.9826353865006538e-06, + "loss": 0.7398289442062378, + "step": 392 + }, + { + "epoch": 0.33164983164983164, + "grad_norm": 6.112462520599365, + "learning_rate": 1.9823072895973748e-06, + "loss": 1.3101907968521118, + "step": 394 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 18.562759399414062, + "learning_rate": 1.981976152774245e-06, + "loss": 1.1832518577575684, + "step": 396 + }, + { + "epoch": 0.335016835016835, + "grad_norm": 10.324470520019531, + "learning_rate": 1.98164197717217e-06, + "loss": 0.7631848454475403, + "step": 398 + }, + { + "epoch": 0.3367003367003367, + "grad_norm": 5.662529468536377, + "learning_rate": 1.9813047639425253e-06, + "loss": 0.9376566410064697, + "step": 400 + }, + { + "epoch": 0.3383838383838384, + "grad_norm": 17.23822784423828, + "learning_rate": 1.9809645142471528e-06, + "loss": 0.9629780650138855, + "step": 402 + }, + { + "epoch": 0.3400673400673401, + "grad_norm": 8.00967025756836, + "learning_rate": 1.980621229258355e-06, + "loss": 1.0150327682495117, + "step": 404 + }, + { + "epoch": 0.34175084175084175, + "grad_norm": 4.659936904907227, + "learning_rate": 1.9802749101588942e-06, + "loss": 1.0681769847869873, + "step": 406 + }, + { + "epoch": 0.3434343434343434, + "grad_norm": 5.872868061065674, + "learning_rate": 1.9799255581419844e-06, + "loss": 0.9499913454055786, + "step": 408 + }, + { + "epoch": 0.3451178451178451, + "grad_norm": 4.6081109046936035, + "learning_rate": 1.9795731744112908e-06, + "loss": 0.5379456877708435, + "step": 410 + }, + { + "epoch": 0.3468013468013468, + "grad_norm": 4.34984016418457, + "learning_rate": 1.9792177601809234e-06, + "loss": 0.8700510263442993, + "step": 412 + }, + { + "epoch": 0.3484848484848485, + "grad_norm": 12.086810111999512, + "learning_rate": 1.9788593166754343e-06, + "loss": 0.8910826444625854, + "step": 414 + }, + { + "epoch": 0.3501683501683502, + "grad_norm": 15.385903358459473, + "learning_rate": 1.9784978451298115e-06, + "loss": 1.1716386079788208, + "step": 416 + }, + { + "epoch": 0.35185185185185186, + "grad_norm": 7.822863578796387, + "learning_rate": 1.9781333467894773e-06, + "loss": 0.687047004699707, + "step": 418 + }, + { + "epoch": 0.35353535353535354, + "grad_norm": 10.231508255004883, + "learning_rate": 1.9777658229102807e-06, + "loss": 0.8759807348251343, + "step": 420 + }, + { + "epoch": 0.3552188552188552, + "grad_norm": 10.260309219360352, + "learning_rate": 1.9773952747584976e-06, + "loss": 1.1332191228866577, + "step": 422 + }, + { + "epoch": 0.3569023569023569, + "grad_norm": 8.660632133483887, + "learning_rate": 1.9770217036108212e-06, + "loss": 0.5898092985153198, + "step": 424 + }, + { + "epoch": 0.35858585858585856, + "grad_norm": 24.724945068359375, + "learning_rate": 1.9766451107543614e-06, + "loss": 0.9762297868728638, + "step": 426 + }, + { + "epoch": 0.3602693602693603, + "grad_norm": 10.698787689208984, + "learning_rate": 1.9762654974866396e-06, + "loss": 0.7858309149742126, + "step": 428 + }, + { + "epoch": 0.36195286195286197, + "grad_norm": 9.971443176269531, + "learning_rate": 1.975882865115583e-06, + "loss": 1.2292566299438477, + "step": 430 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 7.011922359466553, + "learning_rate": 1.9754972149595204e-06, + "loss": 0.9748165607452393, + "step": 432 + }, + { + "epoch": 0.3653198653198653, + "grad_norm": 12.33168888092041, + "learning_rate": 1.97510854834718e-06, + "loss": 0.8448182940483093, + "step": 434 + }, + { + "epoch": 0.367003367003367, + "grad_norm": 2.4483745098114014, + "learning_rate": 1.9747168666176813e-06, + "loss": 1.008624792098999, + "step": 436 + }, + { + "epoch": 0.3686868686868687, + "grad_norm": 10.966385841369629, + "learning_rate": 1.9743221711205323e-06, + "loss": 1.0692952871322632, + "step": 438 + }, + { + "epoch": 0.37037037037037035, + "grad_norm": 2.965273141860962, + "learning_rate": 1.9739244632156256e-06, + "loss": 0.9337837100028992, + "step": 440 + }, + { + "epoch": 0.3720538720538721, + "grad_norm": 12.18703556060791, + "learning_rate": 1.973523744273232e-06, + "loss": 0.9473227262496948, + "step": 442 + }, + { + "epoch": 0.37373737373737376, + "grad_norm": 8.538522720336914, + "learning_rate": 1.973120015673997e-06, + "loss": 0.7716883420944214, + "step": 444 + }, + { + "epoch": 0.37542087542087543, + "grad_norm": 11.410622596740723, + "learning_rate": 1.9727132788089354e-06, + "loss": 0.6292431354522705, + "step": 446 + }, + { + "epoch": 0.3771043771043771, + "grad_norm": 3.9945926666259766, + "learning_rate": 1.972303535079427e-06, + "loss": 1.1218082904815674, + "step": 448 + }, + { + "epoch": 0.3787878787878788, + "grad_norm": 19.375045776367188, + "learning_rate": 1.971890785897211e-06, + "loss": 1.007505178451538, + "step": 450 + }, + { + "epoch": 0.38047138047138046, + "grad_norm": 3.713459014892578, + "learning_rate": 1.9714750326843825e-06, + "loss": 0.7216253280639648, + "step": 452 + }, + { + "epoch": 0.38215488215488214, + "grad_norm": 6.826941013336182, + "learning_rate": 1.9710562768733857e-06, + "loss": 0.9892054796218872, + "step": 454 + }, + { + "epoch": 0.3838383838383838, + "grad_norm": 7.63702392578125, + "learning_rate": 1.9706345199070107e-06, + "loss": 0.7905744314193726, + "step": 456 + }, + { + "epoch": 0.38552188552188554, + "grad_norm": 11.529894828796387, + "learning_rate": 1.970209763238388e-06, + "loss": 0.9695171117782593, + "step": 458 + }, + { + "epoch": 0.3872053872053872, + "grad_norm": 2.9292163848876953, + "learning_rate": 1.969782008330983e-06, + "loss": 1.1221948862075806, + "step": 460 + }, + { + "epoch": 0.3888888888888889, + "grad_norm": 4.672982215881348, + "learning_rate": 1.969351256658591e-06, + "loss": 0.8763028979301453, + "step": 462 + }, + { + "epoch": 0.39057239057239057, + "grad_norm": 4.81404972076416, + "learning_rate": 1.968917509705333e-06, + "loss": 0.8340336680412292, + "step": 464 + }, + { + "epoch": 0.39225589225589225, + "grad_norm": 19.125089645385742, + "learning_rate": 1.9684807689656497e-06, + "loss": 0.9119417071342468, + "step": 466 + }, + { + "epoch": 0.3939393939393939, + "grad_norm": 2.594858407974243, + "learning_rate": 1.9680410359442972e-06, + "loss": 0.9458074569702148, + "step": 468 + }, + { + "epoch": 0.3956228956228956, + "grad_norm": 3.8974621295928955, + "learning_rate": 1.9675983121563397e-06, + "loss": 0.9553569555282593, + "step": 470 + }, + { + "epoch": 0.39730639730639733, + "grad_norm": 6.4163641929626465, + "learning_rate": 1.9671525991271478e-06, + "loss": 0.7942986488342285, + "step": 472 + }, + { + "epoch": 0.398989898989899, + "grad_norm": 3.718247890472412, + "learning_rate": 1.9667038983923902e-06, + "loss": 0.9940693378448486, + "step": 474 + }, + { + "epoch": 0.4006734006734007, + "grad_norm": 25.65456199645996, + "learning_rate": 1.9662522114980296e-06, + "loss": 0.7515483498573303, + "step": 476 + }, + { + "epoch": 0.40235690235690236, + "grad_norm": 3.7314107418060303, + "learning_rate": 1.965797540000318e-06, + "loss": 0.9622472524642944, + "step": 478 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 16.488338470458984, + "learning_rate": 1.9653398854657887e-06, + "loss": 1.041235089302063, + "step": 480 + }, + { + "epoch": 0.4057239057239057, + "grad_norm": 8.276439666748047, + "learning_rate": 1.9648792494712553e-06, + "loss": 1.0389721393585205, + "step": 482 + }, + { + "epoch": 0.4074074074074074, + "grad_norm": 10.357524871826172, + "learning_rate": 1.9644156336038024e-06, + "loss": 0.8473480343818665, + "step": 484 + }, + { + "epoch": 0.4090909090909091, + "grad_norm": 12.934167861938477, + "learning_rate": 1.9639490394607813e-06, + "loss": 0.8664846420288086, + "step": 486 + }, + { + "epoch": 0.4107744107744108, + "grad_norm": 2.63865327835083, + "learning_rate": 1.9634794686498055e-06, + "loss": 1.0735490322113037, + "step": 488 + }, + { + "epoch": 0.41245791245791247, + "grad_norm": 9.611379623413086, + "learning_rate": 1.9630069227887444e-06, + "loss": 1.097601294517517, + "step": 490 + }, + { + "epoch": 0.41414141414141414, + "grad_norm": 37.54718780517578, + "learning_rate": 1.9625314035057167e-06, + "loss": 1.0461905002593994, + "step": 492 + }, + { + "epoch": 0.4158249158249158, + "grad_norm": 5.95384407043457, + "learning_rate": 1.9620529124390863e-06, + "loss": 0.9309274554252625, + "step": 494 + }, + { + "epoch": 0.4175084175084175, + "grad_norm": 5.1661763191223145, + "learning_rate": 1.9615714512374567e-06, + "loss": 1.0628364086151123, + "step": 496 + }, + { + "epoch": 0.41919191919191917, + "grad_norm": 4.157014846801758, + "learning_rate": 1.9610870215596643e-06, + "loss": 1.0677950382232666, + "step": 498 + }, + { + "epoch": 0.4208754208754209, + "grad_norm": 6.916998863220215, + "learning_rate": 1.960599625074773e-06, + "loss": 0.8103325366973877, + "step": 500 + }, + { + "epoch": 0.4225589225589226, + "grad_norm": 6.891815185546875, + "learning_rate": 1.9601092634620687e-06, + "loss": 0.6272333264350891, + "step": 502 + }, + { + "epoch": 0.42424242424242425, + "grad_norm": 9.089258193969727, + "learning_rate": 1.9596159384110535e-06, + "loss": 0.8941874504089355, + "step": 504 + }, + { + "epoch": 0.42592592592592593, + "grad_norm": 16.94425392150879, + "learning_rate": 1.95911965162144e-06, + "loss": 0.938546359539032, + "step": 506 + }, + { + "epoch": 0.4276094276094276, + "grad_norm": 15.095925331115723, + "learning_rate": 1.958620404803145e-06, + "loss": 1.293353796005249, + "step": 508 + }, + { + "epoch": 0.4292929292929293, + "grad_norm": 3.3025577068328857, + "learning_rate": 1.9581181996762834e-06, + "loss": 1.0367740392684937, + "step": 510 + }, + { + "epoch": 0.43097643097643096, + "grad_norm": 3.0691745281219482, + "learning_rate": 1.9576130379711634e-06, + "loss": 1.178546667098999, + "step": 512 + }, + { + "epoch": 0.43265993265993263, + "grad_norm": 3.2468979358673096, + "learning_rate": 1.95710492142828e-06, + "loss": 1.115210771560669, + "step": 514 + }, + { + "epoch": 0.43434343434343436, + "grad_norm": 12.401965141296387, + "learning_rate": 1.956593851798308e-06, + "loss": 1.0290696620941162, + "step": 516 + }, + { + "epoch": 0.43602693602693604, + "grad_norm": 8.208135604858398, + "learning_rate": 1.9560798308420974e-06, + "loss": 1.0394536256790161, + "step": 518 + }, + { + "epoch": 0.4377104377104377, + "grad_norm": 15.533670425415039, + "learning_rate": 1.955562860330667e-06, + "loss": 0.9136192798614502, + "step": 520 + }, + { + "epoch": 0.4393939393939394, + "grad_norm": 3.0875625610351562, + "learning_rate": 1.9550429420451973e-06, + "loss": 0.7975887060165405, + "step": 522 + }, + { + "epoch": 0.44107744107744107, + "grad_norm": 8.5232572555542, + "learning_rate": 1.954520077777026e-06, + "loss": 1.1077611446380615, + "step": 524 + }, + { + "epoch": 0.44276094276094274, + "grad_norm": 11.362956047058105, + "learning_rate": 1.9539942693276405e-06, + "loss": 0.7790743112564087, + "step": 526 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 2.6764779090881348, + "learning_rate": 1.9534655185086717e-06, + "loss": 1.1893084049224854, + "step": 528 + }, + { + "epoch": 0.44612794612794615, + "grad_norm": 11.054378509521484, + "learning_rate": 1.9529338271418886e-06, + "loss": 0.8206809759140015, + "step": 530 + }, + { + "epoch": 0.4478114478114478, + "grad_norm": 15.93736743927002, + "learning_rate": 1.952399197059192e-06, + "loss": 0.8338401317596436, + "step": 532 + }, + { + "epoch": 0.4494949494949495, + "grad_norm": 5.404129505157471, + "learning_rate": 1.9518616301026077e-06, + "loss": 0.9456153512001038, + "step": 534 + }, + { + "epoch": 0.4511784511784512, + "grad_norm": 4.291036128997803, + "learning_rate": 1.9513211281242795e-06, + "loss": 1.2254921197891235, + "step": 536 + }, + { + "epoch": 0.45286195286195285, + "grad_norm": 7.2202582359313965, + "learning_rate": 1.9507776929864643e-06, + "loss": 1.092686653137207, + "step": 538 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 8.635713577270508, + "learning_rate": 1.950231326561525e-06, + "loss": 0.8675233125686646, + "step": 540 + }, + { + "epoch": 0.4562289562289562, + "grad_norm": 8.679670333862305, + "learning_rate": 1.9496820307319237e-06, + "loss": 1.0159896612167358, + "step": 542 + }, + { + "epoch": 0.45791245791245794, + "grad_norm": 3.453657865524292, + "learning_rate": 1.9491298073902157e-06, + "loss": 1.118143081665039, + "step": 544 + }, + { + "epoch": 0.4595959595959596, + "grad_norm": 7.604466438293457, + "learning_rate": 1.9485746584390426e-06, + "loss": 1.1383062601089478, + "step": 546 + }, + { + "epoch": 0.4612794612794613, + "grad_norm": 10.454069137573242, + "learning_rate": 1.948016585791127e-06, + "loss": 1.3462685346603394, + "step": 548 + }, + { + "epoch": 0.46296296296296297, + "grad_norm": 7.511162757873535, + "learning_rate": 1.9474555913692627e-06, + "loss": 0.8798332214355469, + "step": 550 + }, + { + "epoch": 0.46464646464646464, + "grad_norm": 22.986238479614258, + "learning_rate": 1.946891677106312e-06, + "loss": 0.8471826314926147, + "step": 552 + }, + { + "epoch": 0.4663299663299663, + "grad_norm": 4.494133949279785, + "learning_rate": 1.946324844945197e-06, + "loss": 1.0384173393249512, + "step": 554 + }, + { + "epoch": 0.468013468013468, + "grad_norm": 9.850350379943848, + "learning_rate": 1.9457550968388928e-06, + "loss": 0.7141643166542053, + "step": 556 + }, + { + "epoch": 0.4696969696969697, + "grad_norm": 6.887972831726074, + "learning_rate": 1.9451824347504213e-06, + "loss": 1.190050721168518, + "step": 558 + }, + { + "epoch": 0.4713804713804714, + "grad_norm": 5.237252235412598, + "learning_rate": 1.944606860652845e-06, + "loss": 0.41058096289634705, + "step": 560 + }, + { + "epoch": 0.4730639730639731, + "grad_norm": 15.578932762145996, + "learning_rate": 1.944028376529258e-06, + "loss": 0.598914384841919, + "step": 562 + }, + { + "epoch": 0.47474747474747475, + "grad_norm": 3.727078437805176, + "learning_rate": 1.943446984372782e-06, + "loss": 1.2833001613616943, + "step": 564 + }, + { + "epoch": 0.4764309764309764, + "grad_norm": 8.145559310913086, + "learning_rate": 1.942862686186557e-06, + "loss": 1.1502578258514404, + "step": 566 + }, + { + "epoch": 0.4781144781144781, + "grad_norm": 8.36186408996582, + "learning_rate": 1.9422754839837366e-06, + "loss": 0.45712798833847046, + "step": 568 + }, + { + "epoch": 0.4797979797979798, + "grad_norm": 32.920475006103516, + "learning_rate": 1.9416853797874797e-06, + "loss": 1.1332796812057495, + "step": 570 + }, + { + "epoch": 0.48148148148148145, + "grad_norm": 17.55156135559082, + "learning_rate": 1.941092375630943e-06, + "loss": 0.6961038112640381, + "step": 572 + }, + { + "epoch": 0.4831649831649832, + "grad_norm": 4.492574214935303, + "learning_rate": 1.9404964735572754e-06, + "loss": 0.9653905630111694, + "step": 574 + }, + { + "epoch": 0.48484848484848486, + "grad_norm": 6.348426818847656, + "learning_rate": 1.939897675619611e-06, + "loss": 0.871944785118103, + "step": 576 + }, + { + "epoch": 0.48653198653198654, + "grad_norm": 25.369014739990234, + "learning_rate": 1.9392959838810597e-06, + "loss": 1.0709469318389893, + "step": 578 + }, + { + "epoch": 0.4882154882154882, + "grad_norm": 10.82548999786377, + "learning_rate": 1.9386914004147034e-06, + "loss": 0.7998636960983276, + "step": 580 + }, + { + "epoch": 0.4898989898989899, + "grad_norm": 10.758012771606445, + "learning_rate": 1.938083927303586e-06, + "loss": 1.3598113059997559, + "step": 582 + }, + { + "epoch": 0.49158249158249157, + "grad_norm": 6.756187915802002, + "learning_rate": 1.937473566640708e-06, + "loss": 0.9948703050613403, + "step": 584 + }, + { + "epoch": 0.49326599326599324, + "grad_norm": 2.756861686706543, + "learning_rate": 1.9368603205290196e-06, + "loss": 0.8475466966629028, + "step": 586 + }, + { + "epoch": 0.494949494949495, + "grad_norm": 5.148032188415527, + "learning_rate": 1.9362441910814105e-06, + "loss": 0.6347664594650269, + "step": 588 + }, + { + "epoch": 0.49663299663299665, + "grad_norm": 2.980475425720215, + "learning_rate": 1.935625180420706e-06, + "loss": 1.1008853912353516, + "step": 590 + }, + { + "epoch": 0.4983164983164983, + "grad_norm": 3.5861027240753174, + "learning_rate": 1.935003290679659e-06, + "loss": 1.1105575561523438, + "step": 592 + }, + { + "epoch": 0.5, + "grad_norm": 37.69801712036133, + "learning_rate": 1.934378524000941e-06, + "loss": 0.7997324466705322, + "step": 594 + }, + { + "epoch": 0.5016835016835017, + "grad_norm": 10.022683143615723, + "learning_rate": 1.933750882537136e-06, + "loss": 0.9395183324813843, + "step": 596 + }, + { + "epoch": 0.5033670033670034, + "grad_norm": 3.6454007625579834, + "learning_rate": 1.9331203684507333e-06, + "loss": 1.2922556400299072, + "step": 598 + }, + { + "epoch": 0.5050505050505051, + "grad_norm": 11.494460105895996, + "learning_rate": 1.9324869839141184e-06, + "loss": 0.7769290804862976, + "step": 600 + }, + { + "epoch": 0.5067340067340067, + "grad_norm": 101.31135559082031, + "learning_rate": 1.9318507311095686e-06, + "loss": 1.0425605773925781, + "step": 602 + }, + { + "epoch": 0.5084175084175084, + "grad_norm": 29.326383590698242, + "learning_rate": 1.9312116122292414e-06, + "loss": 1.0084577798843384, + "step": 604 + }, + { + "epoch": 0.51010101010101, + "grad_norm": 4.6560163497924805, + "learning_rate": 1.9305696294751707e-06, + "loss": 1.0687224864959717, + "step": 606 + }, + { + "epoch": 0.5117845117845118, + "grad_norm": 31.829082489013672, + "learning_rate": 1.9299247850592575e-06, + "loss": 0.5714974999427795, + "step": 608 + }, + { + "epoch": 0.5134680134680135, + "grad_norm": 3.3935041427612305, + "learning_rate": 1.9292770812032626e-06, + "loss": 0.9293146133422852, + "step": 610 + }, + { + "epoch": 0.5151515151515151, + "grad_norm": 35.04014587402344, + "learning_rate": 1.9286265201387966e-06, + "loss": 0.8598051071166992, + "step": 612 + }, + { + "epoch": 0.5168350168350169, + "grad_norm": 5.506503105163574, + "learning_rate": 1.9279731041073177e-06, + "loss": 0.7148240804672241, + "step": 614 + }, + { + "epoch": 0.5185185185185185, + "grad_norm": 7.014071941375732, + "learning_rate": 1.9273168353601185e-06, + "loss": 1.0927050113677979, + "step": 616 + }, + { + "epoch": 0.5202020202020202, + "grad_norm": 11.175944328308105, + "learning_rate": 1.9266577161583207e-06, + "loss": 1.0155811309814453, + "step": 618 + }, + { + "epoch": 0.5218855218855218, + "grad_norm": 4.795597076416016, + "learning_rate": 1.925995748772868e-06, + "loss": 0.9794735312461853, + "step": 620 + }, + { + "epoch": 0.5235690235690236, + "grad_norm": 24.483413696289062, + "learning_rate": 1.925330935484516e-06, + "loss": 1.045680284500122, + "step": 622 + }, + { + "epoch": 0.5252525252525253, + "grad_norm": 2.9763712882995605, + "learning_rate": 1.9246632785838263e-06, + "loss": 0.7627449631690979, + "step": 624 + }, + { + "epoch": 0.5269360269360269, + "grad_norm": 19.479745864868164, + "learning_rate": 1.9239927803711578e-06, + "loss": 0.945065975189209, + "step": 626 + }, + { + "epoch": 0.5286195286195287, + "grad_norm": 2.6288349628448486, + "learning_rate": 1.923319443156659e-06, + "loss": 0.839026153087616, + "step": 628 + }, + { + "epoch": 0.5303030303030303, + "grad_norm": 14.550789833068848, + "learning_rate": 1.92264326926026e-06, + "loss": 0.7562347054481506, + "step": 630 + }, + { + "epoch": 0.531986531986532, + "grad_norm": 7.969823360443115, + "learning_rate": 1.9219642610116647e-06, + "loss": 1.1040418148040771, + "step": 632 + }, + { + "epoch": 0.5336700336700336, + "grad_norm": 9.72048568725586, + "learning_rate": 1.9212824207503415e-06, + "loss": 0.9238873720169067, + "step": 634 + }, + { + "epoch": 0.5353535353535354, + "grad_norm": 4.213377475738525, + "learning_rate": 1.920597750825517e-06, + "loss": 0.8101857900619507, + "step": 636 + }, + { + "epoch": 0.5370370370370371, + "grad_norm": 13.104752540588379, + "learning_rate": 1.919910253596168e-06, + "loss": 0.9694643020629883, + "step": 638 + }, + { + "epoch": 0.5387205387205387, + "grad_norm": 10.729632377624512, + "learning_rate": 1.919219931431011e-06, + "loss": 0.8188080191612244, + "step": 640 + }, + { + "epoch": 0.5404040404040404, + "grad_norm": 4.642938613891602, + "learning_rate": 1.918526786708497e-06, + "loss": 0.944012463092804, + "step": 642 + }, + { + "epoch": 0.5420875420875421, + "grad_norm": 4.087347984313965, + "learning_rate": 1.9178308218168e-06, + "loss": 0.8914910554885864, + "step": 644 + }, + { + "epoch": 0.5437710437710438, + "grad_norm": 3.8000528812408447, + "learning_rate": 1.9171320391538132e-06, + "loss": 0.893518328666687, + "step": 646 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 9.262425422668457, + "learning_rate": 1.9164304411271364e-06, + "loss": 0.984040379524231, + "step": 648 + }, + { + "epoch": 0.5471380471380471, + "grad_norm": 10.015108108520508, + "learning_rate": 1.9157260301540697e-06, + "loss": 1.140836477279663, + "step": 650 + }, + { + "epoch": 0.5488215488215489, + "grad_norm": 160.21282958984375, + "learning_rate": 1.9150188086616055e-06, + "loss": 1.0449649095535278, + "step": 652 + }, + { + "epoch": 0.5505050505050505, + "grad_norm": 4.650694847106934, + "learning_rate": 1.91430877908642e-06, + "loss": 1.0726298093795776, + "step": 654 + }, + { + "epoch": 0.5521885521885522, + "grad_norm": 11.116467475891113, + "learning_rate": 1.9135959438748626e-06, + "loss": 0.9272226095199585, + "step": 656 + }, + { + "epoch": 0.5538720538720538, + "grad_norm": 7.265547752380371, + "learning_rate": 1.9128803054829515e-06, + "loss": 0.7893900871276855, + "step": 658 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 35.09156799316406, + "learning_rate": 1.912161866376362e-06, + "loss": 0.7798557281494141, + "step": 660 + }, + { + "epoch": 0.5572390572390572, + "grad_norm": 3.4387574195861816, + "learning_rate": 1.9114406290304186e-06, + "loss": 1.0308525562286377, + "step": 662 + }, + { + "epoch": 0.5589225589225589, + "grad_norm": 3.3560092449188232, + "learning_rate": 1.910716595930088e-06, + "loss": 1.0922589302062988, + "step": 664 + }, + { + "epoch": 0.5606060606060606, + "grad_norm": 12.50266170501709, + "learning_rate": 1.9099897695699684e-06, + "loss": 0.4920412600040436, + "step": 666 + }, + { + "epoch": 0.5622895622895623, + "grad_norm": 5.19976282119751, + "learning_rate": 1.9092601524542828e-06, + "loss": 0.6655771136283875, + "step": 668 + }, + { + "epoch": 0.563973063973064, + "grad_norm": 17.65725326538086, + "learning_rate": 1.9085277470968692e-06, + "loss": 1.0704545974731445, + "step": 670 + }, + { + "epoch": 0.5656565656565656, + "grad_norm": 13.295573234558105, + "learning_rate": 1.907792556021171e-06, + "loss": 0.5930483341217041, + "step": 672 + }, + { + "epoch": 0.5673400673400674, + "grad_norm": 5.582085609436035, + "learning_rate": 1.9070545817602328e-06, + "loss": 0.5818225145339966, + "step": 674 + }, + { + "epoch": 0.569023569023569, + "grad_norm": 7.926098823547363, + "learning_rate": 1.9063138268566851e-06, + "loss": 0.6757692098617554, + "step": 676 + }, + { + "epoch": 0.5707070707070707, + "grad_norm": 9.610929489135742, + "learning_rate": 1.9055702938627407e-06, + "loss": 1.3059725761413574, + "step": 678 + }, + { + "epoch": 0.5723905723905723, + "grad_norm": 14.765951156616211, + "learning_rate": 1.9048239853401833e-06, + "loss": 0.42610985040664673, + "step": 680 + }, + { + "epoch": 0.5740740740740741, + "grad_norm": 6.197120189666748, + "learning_rate": 1.9040749038603602e-06, + "loss": 1.0255128145217896, + "step": 682 + }, + { + "epoch": 0.5757575757575758, + "grad_norm": 6.4059038162231445, + "learning_rate": 1.9033230520041719e-06, + "loss": 1.1382319927215576, + "step": 684 + }, + { + "epoch": 0.5774410774410774, + "grad_norm": 6.532130241394043, + "learning_rate": 1.9025684323620645e-06, + "loss": 1.1159263849258423, + "step": 686 + }, + { + "epoch": 0.5791245791245792, + "grad_norm": 2.4945201873779297, + "learning_rate": 1.9018110475340203e-06, + "loss": 0.8307312726974487, + "step": 688 + }, + { + "epoch": 0.5808080808080808, + "grad_norm": 20.23617935180664, + "learning_rate": 1.9010509001295485e-06, + "loss": 0.7440475821495056, + "step": 690 + }, + { + "epoch": 0.5824915824915825, + "grad_norm": 4.1981072425842285, + "learning_rate": 1.9002879927676767e-06, + "loss": 0.8382600545883179, + "step": 692 + }, + { + "epoch": 0.5841750841750841, + "grad_norm": 2.907876491546631, + "learning_rate": 1.8995223280769424e-06, + "loss": 0.9814774990081787, + "step": 694 + }, + { + "epoch": 0.5858585858585859, + "grad_norm": 5.83011531829834, + "learning_rate": 1.8987539086953819e-06, + "loss": 0.8996963500976562, + "step": 696 + }, + { + "epoch": 0.5875420875420876, + "grad_norm": 8.185150146484375, + "learning_rate": 1.8979827372705233e-06, + "loss": 0.8781136274337769, + "step": 698 + }, + { + "epoch": 0.5892255892255892, + "grad_norm": 9.394926071166992, + "learning_rate": 1.8972088164593771e-06, + "loss": 0.8234498500823975, + "step": 700 + }, + { + "epoch": 0.5909090909090909, + "grad_norm": 15.942888259887695, + "learning_rate": 1.896432148928426e-06, + "loss": 0.9446474313735962, + "step": 702 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 4.5268330574035645, + "learning_rate": 1.895652737353616e-06, + "loss": 1.0645607709884644, + "step": 704 + }, + { + "epoch": 0.5942760942760943, + "grad_norm": 6.5960612297058105, + "learning_rate": 1.8948705844203482e-06, + "loss": 0.9992242455482483, + "step": 706 + }, + { + "epoch": 0.5959595959595959, + "grad_norm": 25.13721466064453, + "learning_rate": 1.8940856928234689e-06, + "loss": 0.746535062789917, + "step": 708 + }, + { + "epoch": 0.5976430976430976, + "grad_norm": 6.828306674957275, + "learning_rate": 1.8932980652672597e-06, + "loss": 0.8305199146270752, + "step": 710 + }, + { + "epoch": 0.5993265993265994, + "grad_norm": 5.863089561462402, + "learning_rate": 1.8925077044654288e-06, + "loss": 1.1452956199645996, + "step": 712 + }, + { + "epoch": 0.601010101010101, + "grad_norm": 3.158170700073242, + "learning_rate": 1.8917146131411015e-06, + "loss": 1.0598926544189453, + "step": 714 + }, + { + "epoch": 0.6026936026936027, + "grad_norm": 6.218857288360596, + "learning_rate": 1.8909187940268115e-06, + "loss": 0.7409163117408752, + "step": 716 + }, + { + "epoch": 0.6043771043771043, + "grad_norm": 6.748631000518799, + "learning_rate": 1.89012024986449e-06, + "loss": 0.9013140201568604, + "step": 718 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 4.563135623931885, + "learning_rate": 1.8893189834054586e-06, + "loss": 0.9499297738075256, + "step": 720 + }, + { + "epoch": 0.6077441077441077, + "grad_norm": 12.914100646972656, + "learning_rate": 1.8885149974104164e-06, + "loss": 0.9684711694717407, + "step": 722 + }, + { + "epoch": 0.6094276094276094, + "grad_norm": 16.68248748779297, + "learning_rate": 1.8877082946494339e-06, + "loss": 0.8916200995445251, + "step": 724 + }, + { + "epoch": 0.6111111111111112, + "grad_norm": 31.8973388671875, + "learning_rate": 1.8868988779019414e-06, + "loss": 0.9836832284927368, + "step": 726 + }, + { + "epoch": 0.6127946127946128, + "grad_norm": 38.546356201171875, + "learning_rate": 1.8860867499567203e-06, + "loss": 0.8979325294494629, + "step": 728 + }, + { + "epoch": 0.6144781144781145, + "grad_norm": 3.1298513412475586, + "learning_rate": 1.885271913611893e-06, + "loss": 1.1511611938476562, + "step": 730 + }, + { + "epoch": 0.6161616161616161, + "grad_norm": 4.0303263664245605, + "learning_rate": 1.8844543716749134e-06, + "loss": 1.0997979640960693, + "step": 732 + }, + { + "epoch": 0.6178451178451179, + "grad_norm": 4.650604724884033, + "learning_rate": 1.8836341269625578e-06, + "loss": 0.7802401781082153, + "step": 734 + }, + { + "epoch": 0.6195286195286195, + "grad_norm": 8.960386276245117, + "learning_rate": 1.882811182300914e-06, + "loss": 0.8063424229621887, + "step": 736 + }, + { + "epoch": 0.6212121212121212, + "grad_norm": 20.323410034179688, + "learning_rate": 1.881985540525373e-06, + "loss": 0.689705491065979, + "step": 738 + }, + { + "epoch": 0.622895622895623, + "grad_norm": 4.956573963165283, + "learning_rate": 1.8811572044806178e-06, + "loss": 1.2354564666748047, + "step": 740 + }, + { + "epoch": 0.6245791245791246, + "grad_norm": 4.285037040710449, + "learning_rate": 1.8803261770206149e-06, + "loss": 1.0013043880462646, + "step": 742 + }, + { + "epoch": 0.6262626262626263, + "grad_norm": 2.563471794128418, + "learning_rate": 1.8794924610086031e-06, + "loss": 1.2029197216033936, + "step": 744 + }, + { + "epoch": 0.6279461279461279, + "grad_norm": 2.987870216369629, + "learning_rate": 1.8786560593170854e-06, + "loss": 0.9561195969581604, + "step": 746 + }, + { + "epoch": 0.6296296296296297, + "grad_norm": 3.021315336227417, + "learning_rate": 1.877816974827817e-06, + "loss": 1.202516794204712, + "step": 748 + }, + { + "epoch": 0.6313131313131313, + "grad_norm": 3.505037307739258, + "learning_rate": 1.8769752104317973e-06, + "loss": 1.2894848585128784, + "step": 750 + }, + { + "epoch": 0.632996632996633, + "grad_norm": 8.464410781860352, + "learning_rate": 1.8761307690292589e-06, + "loss": 0.7271798849105835, + "step": 752 + }, + { + "epoch": 0.6346801346801347, + "grad_norm": 26.4637508392334, + "learning_rate": 1.875283653529658e-06, + "loss": 0.9941682815551758, + "step": 754 + }, + { + "epoch": 0.6363636363636364, + "grad_norm": 2.6587889194488525, + "learning_rate": 1.874433866851663e-06, + "loss": 0.7514116168022156, + "step": 756 + }, + { + "epoch": 0.6380471380471381, + "grad_norm": 10.891627311706543, + "learning_rate": 1.8735814119231475e-06, + "loss": 0.8671576976776123, + "step": 758 + }, + { + "epoch": 0.6397306397306397, + "grad_norm": 25.072734832763672, + "learning_rate": 1.872726291681177e-06, + "loss": 0.6143717169761658, + "step": 760 + }, + { + "epoch": 0.6414141414141414, + "grad_norm": 4.057854175567627, + "learning_rate": 1.8718685090720004e-06, + "loss": 0.46186384558677673, + "step": 762 + }, + { + "epoch": 0.6430976430976431, + "grad_norm": 10.258670806884766, + "learning_rate": 1.8710080670510402e-06, + "loss": 1.0092180967330933, + "step": 764 + }, + { + "epoch": 0.6447811447811448, + "grad_norm": 4.200110912322998, + "learning_rate": 1.8701449685828806e-06, + "loss": 1.0899416208267212, + "step": 766 + }, + { + "epoch": 0.6464646464646465, + "grad_norm": 10.581267356872559, + "learning_rate": 1.8692792166412595e-06, + "loss": 0.7667125463485718, + "step": 768 + }, + { + "epoch": 0.6481481481481481, + "grad_norm": 5.673297882080078, + "learning_rate": 1.8684108142090562e-06, + "loss": 0.7934967279434204, + "step": 770 + }, + { + "epoch": 0.6498316498316499, + "grad_norm": 3.9210774898529053, + "learning_rate": 1.8675397642782827e-06, + "loss": 0.7912408113479614, + "step": 772 + }, + { + "epoch": 0.6515151515151515, + "grad_norm": 12.99809455871582, + "learning_rate": 1.8666660698500726e-06, + "loss": 0.6966930627822876, + "step": 774 + }, + { + "epoch": 0.6531986531986532, + "grad_norm": 2.608152389526367, + "learning_rate": 1.8657897339346707e-06, + "loss": 0.9161090850830078, + "step": 776 + }, + { + "epoch": 0.6548821548821548, + "grad_norm": 4.8470282554626465, + "learning_rate": 1.8649107595514226e-06, + "loss": 1.050070881843567, + "step": 778 + }, + { + "epoch": 0.6565656565656566, + "grad_norm": 38.622154235839844, + "learning_rate": 1.8640291497287654e-06, + "loss": 0.948337197303772, + "step": 780 + }, + { + "epoch": 0.6582491582491582, + "grad_norm": 19.695106506347656, + "learning_rate": 1.8631449075042156e-06, + "loss": 1.065544605255127, + "step": 782 + }, + { + "epoch": 0.6599326599326599, + "grad_norm": 6.196758270263672, + "learning_rate": 1.8622580359243601e-06, + "loss": 0.9903167486190796, + "step": 784 + }, + { + "epoch": 0.6616161616161617, + "grad_norm": 11.652655601501465, + "learning_rate": 1.8613685380448441e-06, + "loss": 1.0705502033233643, + "step": 786 + }, + { + "epoch": 0.6632996632996633, + "grad_norm": 21.967121124267578, + "learning_rate": 1.8604764169303626e-06, + "loss": 0.8703781366348267, + "step": 788 + }, + { + "epoch": 0.664983164983165, + "grad_norm": 2.8076608180999756, + "learning_rate": 1.8595816756546477e-06, + "loss": 0.9413682222366333, + "step": 790 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 12.699344635009766, + "learning_rate": 1.8586843173004598e-06, + "loss": 0.9941300749778748, + "step": 792 + }, + { + "epoch": 0.6683501683501684, + "grad_norm": 2.5356881618499756, + "learning_rate": 1.8577843449595763e-06, + "loss": 0.6315573453903198, + "step": 794 + }, + { + "epoch": 0.67003367003367, + "grad_norm": 3.684738874435425, + "learning_rate": 1.85688176173278e-06, + "loss": 0.9797836542129517, + "step": 796 + }, + { + "epoch": 0.6717171717171717, + "grad_norm": 4.553958415985107, + "learning_rate": 1.8559765707298502e-06, + "loss": 1.0133525133132935, + "step": 798 + }, + { + "epoch": 0.6734006734006734, + "grad_norm": 5.8083367347717285, + "learning_rate": 1.8550687750695509e-06, + "loss": 0.635034441947937, + "step": 800 + }, + { + "epoch": 0.6750841750841751, + "grad_norm": 2.6168251037597656, + "learning_rate": 1.8541583778796196e-06, + "loss": 0.9916131496429443, + "step": 802 + }, + { + "epoch": 0.6767676767676768, + "grad_norm": 10.899927139282227, + "learning_rate": 1.8532453822967584e-06, + "loss": 0.7682900428771973, + "step": 804 + }, + { + "epoch": 0.6784511784511784, + "grad_norm": 11.195059776306152, + "learning_rate": 1.8523297914666207e-06, + "loss": 0.6411112546920776, + "step": 806 + }, + { + "epoch": 0.6801346801346801, + "grad_norm": 8.76089859008789, + "learning_rate": 1.8514116085438027e-06, + "loss": 1.0669599771499634, + "step": 808 + }, + { + "epoch": 0.6818181818181818, + "grad_norm": 2.9080264568328857, + "learning_rate": 1.8504908366918302e-06, + "loss": 0.9828901886940002, + "step": 810 + }, + { + "epoch": 0.6835016835016835, + "grad_norm": 4.848678112030029, + "learning_rate": 1.84956747908315e-06, + "loss": 1.1542444229125977, + "step": 812 + }, + { + "epoch": 0.6851851851851852, + "grad_norm": 6.960413932800293, + "learning_rate": 1.8486415388991173e-06, + "loss": 0.5982141494750977, + "step": 814 + }, + { + "epoch": 0.6868686868686869, + "grad_norm": 2.6384944915771484, + "learning_rate": 1.8477130193299863e-06, + "loss": 1.1131889820098877, + "step": 816 + }, + { + "epoch": 0.6885521885521886, + "grad_norm": 9.800881385803223, + "learning_rate": 1.846781923574897e-06, + "loss": 0.7944687604904175, + "step": 818 + }, + { + "epoch": 0.6902356902356902, + "grad_norm": 40.63787078857422, + "learning_rate": 1.8458482548418661e-06, + "loss": 0.7440886497497559, + "step": 820 + }, + { + "epoch": 0.6919191919191919, + "grad_norm": 3.366387367248535, + "learning_rate": 1.8449120163477753e-06, + "loss": 0.7828149199485779, + "step": 822 + }, + { + "epoch": 0.6936026936026936, + "grad_norm": 4.786665916442871, + "learning_rate": 1.8439732113183607e-06, + "loss": 0.8565751314163208, + "step": 824 + }, + { + "epoch": 0.6952861952861953, + "grad_norm": 9.01762866973877, + "learning_rate": 1.8430318429881997e-06, + "loss": 0.8942912817001343, + "step": 826 + }, + { + "epoch": 0.696969696969697, + "grad_norm": 2.231179714202881, + "learning_rate": 1.8420879146007025e-06, + "loss": 0.8027513027191162, + "step": 828 + }, + { + "epoch": 0.6986531986531986, + "grad_norm": 3.190427541732788, + "learning_rate": 1.8411414294081003e-06, + "loss": 1.2244315147399902, + "step": 830 + }, + { + "epoch": 0.7003367003367004, + "grad_norm": 8.976424217224121, + "learning_rate": 1.8401923906714321e-06, + "loss": 0.8990939855575562, + "step": 832 + }, + { + "epoch": 0.702020202020202, + "grad_norm": 11.49886703491211, + "learning_rate": 1.8392408016605358e-06, + "loss": 0.6986100673675537, + "step": 834 + }, + { + "epoch": 0.7037037037037037, + "grad_norm": 10.203569412231445, + "learning_rate": 1.8382866656540361e-06, + "loss": 0.8804981708526611, + "step": 836 + }, + { + "epoch": 0.7053872053872053, + "grad_norm": 6.145118713378906, + "learning_rate": 1.8373299859393326e-06, + "loss": 0.5913242697715759, + "step": 838 + }, + { + "epoch": 0.7070707070707071, + "grad_norm": 4.84503698348999, + "learning_rate": 1.8363707658125905e-06, + "loss": 1.2492575645446777, + "step": 840 + }, + { + "epoch": 0.7087542087542088, + "grad_norm": 6.014354228973389, + "learning_rate": 1.8354090085787252e-06, + "loss": 1.122812271118164, + "step": 842 + }, + { + "epoch": 0.7104377104377104, + "grad_norm": 10.91385269165039, + "learning_rate": 1.8344447175513965e-06, + "loss": 1.0250314474105835, + "step": 844 + }, + { + "epoch": 0.7121212121212122, + "grad_norm": 5.709978103637695, + "learning_rate": 1.8334778960529916e-06, + "loss": 0.8772053718566895, + "step": 846 + }, + { + "epoch": 0.7138047138047138, + "grad_norm": 25.334754943847656, + "learning_rate": 1.8325085474146178e-06, + "loss": 0.7974849939346313, + "step": 848 + }, + { + "epoch": 0.7154882154882155, + "grad_norm": 30.209260940551758, + "learning_rate": 1.8315366749760892e-06, + "loss": 0.9543988704681396, + "step": 850 + }, + { + "epoch": 0.7171717171717171, + "grad_norm": 3.697704315185547, + "learning_rate": 1.8305622820859153e-06, + "loss": 0.7927026748657227, + "step": 852 + }, + { + "epoch": 0.7188552188552189, + "grad_norm": 10.00793743133545, + "learning_rate": 1.829585372101289e-06, + "loss": 0.78277987241745, + "step": 854 + }, + { + "epoch": 0.7205387205387206, + "grad_norm": 7.505032539367676, + "learning_rate": 1.828605948388077e-06, + "loss": 1.1311378479003906, + "step": 856 + }, + { + "epoch": 0.7222222222222222, + "grad_norm": 4.7181572914123535, + "learning_rate": 1.8276240143208054e-06, + "loss": 0.7503079175949097, + "step": 858 + }, + { + "epoch": 0.7239057239057239, + "grad_norm": 3.523047924041748, + "learning_rate": 1.8266395732826508e-06, + "loss": 0.9047625064849854, + "step": 860 + }, + { + "epoch": 0.7255892255892256, + "grad_norm": 2.024121046066284, + "learning_rate": 1.8256526286654264e-06, + "loss": 1.1868062019348145, + "step": 862 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 14.294280052185059, + "learning_rate": 1.824663183869572e-06, + "loss": 1.0042986869812012, + "step": 864 + }, + { + "epoch": 0.7289562289562289, + "grad_norm": 17.085304260253906, + "learning_rate": 1.8236712423041408e-06, + "loss": 0.9877347946166992, + "step": 866 + }, + { + "epoch": 0.7306397306397306, + "grad_norm": 6.132120609283447, + "learning_rate": 1.822676807386789e-06, + "loss": 1.2511956691741943, + "step": 868 + }, + { + "epoch": 0.7323232323232324, + "grad_norm": 5.884708881378174, + "learning_rate": 1.8216798825437635e-06, + "loss": 1.1776090860366821, + "step": 870 + }, + { + "epoch": 0.734006734006734, + "grad_norm": 5.7460737228393555, + "learning_rate": 1.8206804712098903e-06, + "loss": 1.0924787521362305, + "step": 872 + }, + { + "epoch": 0.7356902356902357, + "grad_norm": 2.724154233932495, + "learning_rate": 1.819678576828561e-06, + "loss": 1.0940457582473755, + "step": 874 + }, + { + "epoch": 0.7373737373737373, + "grad_norm": 21.470823287963867, + "learning_rate": 1.8186742028517237e-06, + "loss": 0.8332981467247009, + "step": 876 + }, + { + "epoch": 0.7390572390572391, + "grad_norm": 7.482705116271973, + "learning_rate": 1.8176673527398694e-06, + "loss": 0.6369479894638062, + "step": 878 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 15.344402313232422, + "learning_rate": 1.8166580299620202e-06, + "loss": 0.612411618232727, + "step": 880 + }, + { + "epoch": 0.7424242424242424, + "grad_norm": 2.4508793354034424, + "learning_rate": 1.815646237995718e-06, + "loss": 1.1662663221359253, + "step": 882 + }, + { + "epoch": 0.7441077441077442, + "grad_norm": 3.4642128944396973, + "learning_rate": 1.814631980327012e-06, + "loss": 1.1108534336090088, + "step": 884 + }, + { + "epoch": 0.7457912457912458, + "grad_norm": 2.681384801864624, + "learning_rate": 1.813615260450446e-06, + "loss": 0.6596791744232178, + "step": 886 + }, + { + "epoch": 0.7474747474747475, + "grad_norm": 1.7828519344329834, + "learning_rate": 1.8125960818690485e-06, + "loss": 1.0084741115570068, + "step": 888 + }, + { + "epoch": 0.7491582491582491, + "grad_norm": 34.723270416259766, + "learning_rate": 1.811574448094318e-06, + "loss": 0.9112769961357117, + "step": 890 + }, + { + "epoch": 0.7508417508417509, + "grad_norm": 10.580464363098145, + "learning_rate": 1.8105503626462129e-06, + "loss": 0.9600024819374084, + "step": 892 + }, + { + "epoch": 0.7525252525252525, + "grad_norm": 17.393407821655273, + "learning_rate": 1.8095238290531385e-06, + "loss": 0.7573001384735107, + "step": 894 + }, + { + "epoch": 0.7542087542087542, + "grad_norm": 8.820290565490723, + "learning_rate": 1.8084948508519346e-06, + "loss": 0.8571316003799438, + "step": 896 + }, + { + "epoch": 0.7558922558922558, + "grad_norm": 15.848811149597168, + "learning_rate": 1.8074634315878644e-06, + "loss": 0.6229598522186279, + "step": 898 + }, + { + "epoch": 0.7575757575757576, + "grad_norm": 5.893372058868408, + "learning_rate": 1.8064295748146014e-06, + "loss": 0.8924508094787598, + "step": 900 + }, + { + "epoch": 0.7592592592592593, + "grad_norm": 21.465091705322266, + "learning_rate": 1.8053932840942175e-06, + "loss": 0.6515762209892273, + "step": 902 + }, + { + "epoch": 0.7609427609427609, + "grad_norm": 3.3033552169799805, + "learning_rate": 1.8043545629971689e-06, + "loss": 1.2100439071655273, + "step": 904 + }, + { + "epoch": 0.7626262626262627, + "grad_norm": 3.6212236881256104, + "learning_rate": 1.8033134151022881e-06, + "loss": 0.9367895126342773, + "step": 906 + }, + { + "epoch": 0.7643097643097643, + "grad_norm": 11.270123481750488, + "learning_rate": 1.8022698439967673e-06, + "loss": 0.9181069731712341, + "step": 908 + }, + { + "epoch": 0.765993265993266, + "grad_norm": 4.863030433654785, + "learning_rate": 1.8012238532761476e-06, + "loss": 0.8502522110939026, + "step": 910 + }, + { + "epoch": 0.7676767676767676, + "grad_norm": 7.718131065368652, + "learning_rate": 1.8001754465443078e-06, + "loss": 0.9918288588523865, + "step": 912 + }, + { + "epoch": 0.7693602693602694, + "grad_norm": 10.74516773223877, + "learning_rate": 1.79912462741345e-06, + "loss": 0.8540866374969482, + "step": 914 + }, + { + "epoch": 0.7710437710437711, + "grad_norm": 6.144227027893066, + "learning_rate": 1.798071399504088e-06, + "loss": 0.9551119804382324, + "step": 916 + }, + { + "epoch": 0.7727272727272727, + "grad_norm": 3.8601930141448975, + "learning_rate": 1.7970157664450357e-06, + "loss": 0.6338967084884644, + "step": 918 + }, + { + "epoch": 0.7744107744107744, + "grad_norm": 11.050410270690918, + "learning_rate": 1.7959577318733925e-06, + "loss": 0.5116314888000488, + "step": 920 + }, + { + "epoch": 0.7760942760942761, + "grad_norm": 4.513789176940918, + "learning_rate": 1.7948972994345328e-06, + "loss": 0.6171036958694458, + "step": 922 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 8.82806396484375, + "learning_rate": 1.7938344727820928e-06, + "loss": 0.9206382632255554, + "step": 924 + }, + { + "epoch": 0.7794612794612794, + "grad_norm": 4.373292446136475, + "learning_rate": 1.7927692555779577e-06, + "loss": 1.1664514541625977, + "step": 926 + }, + { + "epoch": 0.7811447811447811, + "grad_norm": 3.1802244186401367, + "learning_rate": 1.791701651492248e-06, + "loss": 0.48759081959724426, + "step": 928 + }, + { + "epoch": 0.7828282828282829, + "grad_norm": 6.313639163970947, + "learning_rate": 1.7906316642033099e-06, + "loss": 1.3327703475952148, + "step": 930 + }, + { + "epoch": 0.7845117845117845, + "grad_norm": 22.747098922729492, + "learning_rate": 1.7895592973976998e-06, + "loss": 0.8829092383384705, + "step": 932 + }, + { + "epoch": 0.7861952861952862, + "grad_norm": 3.2088170051574707, + "learning_rate": 1.7884845547701721e-06, + "loss": 1.0014090538024902, + "step": 934 + }, + { + "epoch": 0.7878787878787878, + "grad_norm": 12.781431198120117, + "learning_rate": 1.7874074400236677e-06, + "loss": 0.8620262145996094, + "step": 936 + }, + { + "epoch": 0.7895622895622896, + "grad_norm": 2.6499383449554443, + "learning_rate": 1.7863279568692999e-06, + "loss": 0.8909909725189209, + "step": 938 + }, + { + "epoch": 0.7912457912457912, + "grad_norm": 2.3473894596099854, + "learning_rate": 1.7852461090263422e-06, + "loss": 1.0048516988754272, + "step": 940 + }, + { + "epoch": 0.7929292929292929, + "grad_norm": 16.40445327758789, + "learning_rate": 1.7841619002222164e-06, + "loss": 0.3737819790840149, + "step": 942 + }, + { + "epoch": 0.7946127946127947, + "grad_norm": 3.327476978302002, + "learning_rate": 1.7830753341924768e-06, + "loss": 0.9010682106018066, + "step": 944 + }, + { + "epoch": 0.7962962962962963, + "grad_norm": 2.6396255493164062, + "learning_rate": 1.781986414680802e-06, + "loss": 0.925070583820343, + "step": 946 + }, + { + "epoch": 0.797979797979798, + "grad_norm": 3.3719475269317627, + "learning_rate": 1.7808951454389761e-06, + "loss": 1.036871075630188, + "step": 948 + }, + { + "epoch": 0.7996632996632996, + "grad_norm": 5.47444486618042, + "learning_rate": 1.7798015302268826e-06, + "loss": 0.8623565435409546, + "step": 950 + }, + { + "epoch": 0.8013468013468014, + "grad_norm": 11.89119815826416, + "learning_rate": 1.7787055728124853e-06, + "loss": 0.4426053762435913, + "step": 952 + }, + { + "epoch": 0.803030303030303, + "grad_norm": 3.086700916290283, + "learning_rate": 1.777607276971818e-06, + "loss": 0.9516481161117554, + "step": 954 + }, + { + "epoch": 0.8047138047138047, + "grad_norm": 11.045938491821289, + "learning_rate": 1.7765066464889729e-06, + "loss": 0.9658932685852051, + "step": 956 + }, + { + "epoch": 0.8063973063973064, + "grad_norm": 10.93420696258545, + "learning_rate": 1.775403685156085e-06, + "loss": 1.1045958995819092, + "step": 958 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 2.5317461490631104, + "learning_rate": 1.77429839677332e-06, + "loss": 0.673387348651886, + "step": 960 + }, + { + "epoch": 0.8097643097643098, + "grad_norm": 4.62790584564209, + "learning_rate": 1.773190785148861e-06, + "loss": 0.771082878112793, + "step": 962 + }, + { + "epoch": 0.8114478114478114, + "grad_norm": 6.418295860290527, + "learning_rate": 1.7720808540988965e-06, + "loss": 0.6905859112739563, + "step": 964 + }, + { + "epoch": 0.8131313131313131, + "grad_norm": 2.9778709411621094, + "learning_rate": 1.770968607447606e-06, + "loss": 0.9952410459518433, + "step": 966 + }, + { + "epoch": 0.8148148148148148, + "grad_norm": 17.664697647094727, + "learning_rate": 1.7698540490271475e-06, + "loss": 1.1883214712142944, + "step": 968 + }, + { + "epoch": 0.8164983164983165, + "grad_norm": 3.8164806365966797, + "learning_rate": 1.7687371826776432e-06, + "loss": 0.9806801080703735, + "step": 970 + }, + { + "epoch": 0.8181818181818182, + "grad_norm": 10.780609130859375, + "learning_rate": 1.7676180122471677e-06, + "loss": 0.9630722403526306, + "step": 972 + }, + { + "epoch": 0.8198653198653199, + "grad_norm": 6.188197612762451, + "learning_rate": 1.7664965415917342e-06, + "loss": 0.7298092842102051, + "step": 974 + }, + { + "epoch": 0.8215488215488216, + "grad_norm": 4.687350749969482, + "learning_rate": 1.765372774575281e-06, + "loss": 0.9373712539672852, + "step": 976 + }, + { + "epoch": 0.8232323232323232, + "grad_norm": 5.430413722991943, + "learning_rate": 1.764246715069658e-06, + "loss": 1.1954350471496582, + "step": 978 + }, + { + "epoch": 0.8249158249158249, + "grad_norm": 3.7986605167388916, + "learning_rate": 1.7631183669546146e-06, + "loss": 1.161393404006958, + "step": 980 + }, + { + "epoch": 0.8265993265993266, + "grad_norm": 4.60081672668457, + "learning_rate": 1.761987734117784e-06, + "loss": 1.046337366104126, + "step": 982 + }, + { + "epoch": 0.8282828282828283, + "grad_norm": 3.7046844959259033, + "learning_rate": 1.7608548204546724e-06, + "loss": 1.0424065589904785, + "step": 984 + }, + { + "epoch": 0.82996632996633, + "grad_norm": 19.03668212890625, + "learning_rate": 1.7597196298686446e-06, + "loss": 0.9536873698234558, + "step": 986 + }, + { + "epoch": 0.8316498316498316, + "grad_norm": 32.48857498168945, + "learning_rate": 1.7585821662709088e-06, + "loss": 0.8443811535835266, + "step": 988 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 11.665223121643066, + "learning_rate": 1.7574424335805066e-06, + "loss": 0.8324294686317444, + "step": 990 + }, + { + "epoch": 0.835016835016835, + "grad_norm": 21.848285675048828, + "learning_rate": 1.7563004357242962e-06, + "loss": 0.6908457279205322, + "step": 992 + }, + { + "epoch": 0.8367003367003367, + "grad_norm": 2.1612720489501953, + "learning_rate": 1.755156176636941e-06, + "loss": 0.9239605069160461, + "step": 994 + }, + { + "epoch": 0.8383838383838383, + "grad_norm": 4.865361213684082, + "learning_rate": 1.7540096602608946e-06, + "loss": 0.6591212153434753, + "step": 996 + }, + { + "epoch": 0.8400673400673401, + "grad_norm": 3.861494779586792, + "learning_rate": 1.7528608905463881e-06, + "loss": 0.9056419134140015, + "step": 998 + }, + { + "epoch": 0.8417508417508418, + "grad_norm": 2.9562947750091553, + "learning_rate": 1.7517098714514175e-06, + "loss": 1.0812749862670898, + "step": 1000 + }, + { + "epoch": 0.8434343434343434, + "grad_norm": 3.0346264839172363, + "learning_rate": 1.7505566069417272e-06, + "loss": 0.7617006301879883, + "step": 1002 + }, + { + "epoch": 0.8451178451178452, + "grad_norm": 3.785036325454712, + "learning_rate": 1.749401100990799e-06, + "loss": 0.6745568513870239, + "step": 1004 + }, + { + "epoch": 0.8468013468013468, + "grad_norm": 5.557058334350586, + "learning_rate": 1.748243357579837e-06, + "loss": 1.0811188220977783, + "step": 1006 + }, + { + "epoch": 0.8484848484848485, + "grad_norm": 1.9689534902572632, + "learning_rate": 1.747083380697754e-06, + "loss": 0.5900795459747314, + "step": 1008 + }, + { + "epoch": 0.8501683501683501, + "grad_norm": 14.491848945617676, + "learning_rate": 1.7459211743411589e-06, + "loss": 0.9504165649414062, + "step": 1010 + }, + { + "epoch": 0.8518518518518519, + "grad_norm": 21.8311767578125, + "learning_rate": 1.7447567425143413e-06, + "loss": 0.8922120928764343, + "step": 1012 + }, + { + "epoch": 0.8535353535353535, + "grad_norm": 13.790666580200195, + "learning_rate": 1.7435900892292593e-06, + "loss": 0.7710224390029907, + "step": 1014 + }, + { + "epoch": 0.8552188552188552, + "grad_norm": 20.326784133911133, + "learning_rate": 1.7424212185055236e-06, + "loss": 0.6666241884231567, + "step": 1016 + }, + { + "epoch": 0.8569023569023569, + "grad_norm": 18.170595169067383, + "learning_rate": 1.7412501343703858e-06, + "loss": 0.967223048210144, + "step": 1018 + }, + { + "epoch": 0.8585858585858586, + "grad_norm": 3.054368257522583, + "learning_rate": 1.740076840858724e-06, + "loss": 1.2456423044204712, + "step": 1020 + }, + { + "epoch": 0.8602693602693603, + "grad_norm": 26.2432861328125, + "learning_rate": 1.7389013420130278e-06, + "loss": 0.9183678030967712, + "step": 1022 + }, + { + "epoch": 0.8619528619528619, + "grad_norm": 4.530948162078857, + "learning_rate": 1.7377236418833855e-06, + "loss": 0.953632652759552, + "step": 1024 + }, + { + "epoch": 0.8636363636363636, + "grad_norm": 4.451155185699463, + "learning_rate": 1.736543744527469e-06, + "loss": 0.8909140825271606, + "step": 1026 + }, + { + "epoch": 0.8653198653198653, + "grad_norm": 3.3854105472564697, + "learning_rate": 1.7353616540105214e-06, + "loss": 0.9759948253631592, + "step": 1028 + }, + { + "epoch": 0.867003367003367, + "grad_norm": 7.278261184692383, + "learning_rate": 1.7341773744053423e-06, + "loss": 0.643425703048706, + "step": 1030 + }, + { + "epoch": 0.8686868686868687, + "grad_norm": 3.562976360321045, + "learning_rate": 1.7329909097922726e-06, + "loss": 0.8528425693511963, + "step": 1032 + }, + { + "epoch": 0.8703703703703703, + "grad_norm": 4.631925106048584, + "learning_rate": 1.7318022642591826e-06, + "loss": 0.9317729473114014, + "step": 1034 + }, + { + "epoch": 0.8720538720538721, + "grad_norm": 2.9623520374298096, + "learning_rate": 1.730611441901456e-06, + "loss": 0.9544110298156738, + "step": 1036 + }, + { + "epoch": 0.8737373737373737, + "grad_norm": 7.970090389251709, + "learning_rate": 1.7294184468219768e-06, + "loss": 1.1069408655166626, + "step": 1038 + }, + { + "epoch": 0.8754208754208754, + "grad_norm": 5.28152322769165, + "learning_rate": 1.728223283131116e-06, + "loss": 1.0873464345932007, + "step": 1040 + }, + { + "epoch": 0.877104377104377, + "grad_norm": 5.224731922149658, + "learning_rate": 1.727025954946714e-06, + "loss": 0.9729514718055725, + "step": 1042 + }, + { + "epoch": 0.8787878787878788, + "grad_norm": 13.218440055847168, + "learning_rate": 1.7258264663940706e-06, + "loss": 1.0898833274841309, + "step": 1044 + }, + { + "epoch": 0.8804713804713805, + "grad_norm": 2.7989261150360107, + "learning_rate": 1.724624821605929e-06, + "loss": 1.0561833381652832, + "step": 1046 + }, + { + "epoch": 0.8821548821548821, + "grad_norm": 13.938822746276855, + "learning_rate": 1.7234210247224608e-06, + "loss": 0.9620407223701477, + "step": 1048 + }, + { + "epoch": 0.8838383838383839, + "grad_norm": 14.411212921142578, + "learning_rate": 1.7222150798912527e-06, + "loss": 0.7809741497039795, + "step": 1050 + }, + { + "epoch": 0.8855218855218855, + "grad_norm": 6.374806880950928, + "learning_rate": 1.7210069912672924e-06, + "loss": 1.0467114448547363, + "step": 1052 + }, + { + "epoch": 0.8872053872053872, + "grad_norm": 9.24715805053711, + "learning_rate": 1.7197967630129533e-06, + "loss": 0.5621042251586914, + "step": 1054 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 3.0764286518096924, + "learning_rate": 1.7185843992979805e-06, + "loss": 0.9588031768798828, + "step": 1056 + }, + { + "epoch": 0.8905723905723906, + "grad_norm": 3.0444071292877197, + "learning_rate": 1.7173699042994778e-06, + "loss": 0.9131466150283813, + "step": 1058 + }, + { + "epoch": 0.8922558922558923, + "grad_norm": 7.547487735748291, + "learning_rate": 1.716153282201891e-06, + "loss": 0.9909827709197998, + "step": 1060 + }, + { + "epoch": 0.8939393939393939, + "grad_norm": 3.859555959701538, + "learning_rate": 1.7149345371969958e-06, + "loss": 0.8949623107910156, + "step": 1062 + }, + { + "epoch": 0.8956228956228957, + "grad_norm": 10.671557426452637, + "learning_rate": 1.7137136734838809e-06, + "loss": 0.8130732774734497, + "step": 1064 + }, + { + "epoch": 0.8973063973063973, + "grad_norm": 2.6384527683258057, + "learning_rate": 1.7124906952689354e-06, + "loss": 1.0677348375320435, + "step": 1066 + }, + { + "epoch": 0.898989898989899, + "grad_norm": 2.3599157333374023, + "learning_rate": 1.7112656067658345e-06, + "loss": 0.8169218301773071, + "step": 1068 + }, + { + "epoch": 0.9006734006734006, + "grad_norm": 6.580990314483643, + "learning_rate": 1.7100384121955229e-06, + "loss": 0.9567373991012573, + "step": 1070 + }, + { + "epoch": 0.9023569023569024, + "grad_norm": 2.7122886180877686, + "learning_rate": 1.7088091157862026e-06, + "loss": 1.2019579410552979, + "step": 1072 + }, + { + "epoch": 0.9040404040404041, + "grad_norm": 2.5349674224853516, + "learning_rate": 1.7075777217733169e-06, + "loss": 0.8406597971916199, + "step": 1074 + }, + { + "epoch": 0.9057239057239057, + "grad_norm": 6.190466403961182, + "learning_rate": 1.7063442343995361e-06, + "loss": 0.4906361401081085, + "step": 1076 + }, + { + "epoch": 0.9074074074074074, + "grad_norm": 26.555025100708008, + "learning_rate": 1.7051086579147436e-06, + "loss": 1.0886037349700928, + "step": 1078 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 3.0735490322113037, + "learning_rate": 1.7038709965760198e-06, + "loss": 0.9269078969955444, + "step": 1080 + }, + { + "epoch": 0.9107744107744108, + "grad_norm": 2.295616865158081, + "learning_rate": 1.7026312546476292e-06, + "loss": 0.9460815191268921, + "step": 1082 + }, + { + "epoch": 0.9124579124579124, + "grad_norm": 14.62086009979248, + "learning_rate": 1.701389436401004e-06, + "loss": 0.7059042453765869, + "step": 1084 + }, + { + "epoch": 0.9141414141414141, + "grad_norm": 4.020232200622559, + "learning_rate": 1.700145546114731e-06, + "loss": 1.15854811668396, + "step": 1086 + }, + { + "epoch": 0.9158249158249159, + "grad_norm": 4.303004264831543, + "learning_rate": 1.698899588074535e-06, + "loss": 0.9253766536712646, + "step": 1088 + }, + { + "epoch": 0.9175084175084175, + "grad_norm": 2.722356081008911, + "learning_rate": 1.6976515665732663e-06, + "loss": 0.9150590896606445, + "step": 1090 + }, + { + "epoch": 0.9191919191919192, + "grad_norm": 8.33704948425293, + "learning_rate": 1.6964014859108837e-06, + "loss": 1.0268497467041016, + "step": 1092 + }, + { + "epoch": 0.9208754208754208, + "grad_norm": 4.683021068572998, + "learning_rate": 1.6951493503944414e-06, + "loss": 0.9068109393119812, + "step": 1094 + }, + { + "epoch": 0.9225589225589226, + "grad_norm": 10.631436347961426, + "learning_rate": 1.693895164338073e-06, + "loss": 0.7467716932296753, + "step": 1096 + }, + { + "epoch": 0.9242424242424242, + "grad_norm": 8.113303184509277, + "learning_rate": 1.6926389320629768e-06, + "loss": 0.384426474571228, + "step": 1098 + }, + { + "epoch": 0.9259259259259259, + "grad_norm": 5.846349239349365, + "learning_rate": 1.6913806578974016e-06, + "loss": 0.9705697298049927, + "step": 1100 + }, + { + "epoch": 0.9276094276094277, + "grad_norm": 23.626840591430664, + "learning_rate": 1.690120346176632e-06, + "loss": 0.5436959266662598, + "step": 1102 + }, + { + "epoch": 0.9292929292929293, + "grad_norm": 4.793126106262207, + "learning_rate": 1.6888580012429717e-06, + "loss": 1.117484450340271, + "step": 1104 + }, + { + "epoch": 0.930976430976431, + "grad_norm": 10.387064933776855, + "learning_rate": 1.68759362744573e-06, + "loss": 1.031156301498413, + "step": 1106 + }, + { + "epoch": 0.9326599326599326, + "grad_norm": 14.877448081970215, + "learning_rate": 1.686327229141207e-06, + "loss": 0.8722270131111145, + "step": 1108 + }, + { + "epoch": 0.9343434343434344, + "grad_norm": 3.464400053024292, + "learning_rate": 1.6850588106926773e-06, + "loss": 1.2158129215240479, + "step": 1110 + }, + { + "epoch": 0.936026936026936, + "grad_norm": 4.9829421043396, + "learning_rate": 1.6837883764703765e-06, + "loss": 1.1986503601074219, + "step": 1112 + }, + { + "epoch": 0.9377104377104377, + "grad_norm": 3.5053603649139404, + "learning_rate": 1.6825159308514847e-06, + "loss": 1.0430546998977661, + "step": 1114 + }, + { + "epoch": 0.9393939393939394, + "grad_norm": 6.993835926055908, + "learning_rate": 1.6812414782201127e-06, + "loss": 1.1407470703125, + "step": 1116 + }, + { + "epoch": 0.9410774410774411, + "grad_norm": 6.774454116821289, + "learning_rate": 1.6799650229672862e-06, + "loss": 1.0087709426879883, + "step": 1118 + }, + { + "epoch": 0.9427609427609428, + "grad_norm": 3.8694427013397217, + "learning_rate": 1.6786865694909301e-06, + "loss": 1.2728749513626099, + "step": 1120 + }, + { + "epoch": 0.9444444444444444, + "grad_norm": 8.199234962463379, + "learning_rate": 1.6774061221958552e-06, + "loss": 0.7386917471885681, + "step": 1122 + }, + { + "epoch": 0.9461279461279462, + "grad_norm": 3.474858283996582, + "learning_rate": 1.6761236854937406e-06, + "loss": 0.8540256023406982, + "step": 1124 + }, + { + "epoch": 0.9478114478114478, + "grad_norm": 5.611124038696289, + "learning_rate": 1.674839263803121e-06, + "loss": 0.849441409111023, + "step": 1126 + }, + { + "epoch": 0.9494949494949495, + "grad_norm": 3.0861027240753174, + "learning_rate": 1.6735528615493686e-06, + "loss": 0.9585309028625488, + "step": 1128 + }, + { + "epoch": 0.9511784511784511, + "grad_norm": 20.665544509887695, + "learning_rate": 1.6722644831646815e-06, + "loss": 0.9195750951766968, + "step": 1130 + }, + { + "epoch": 0.9528619528619529, + "grad_norm": 2.3980801105499268, + "learning_rate": 1.6709741330880644e-06, + "loss": 0.9300163984298706, + "step": 1132 + }, + { + "epoch": 0.9545454545454546, + "grad_norm": 11.30346393585205, + "learning_rate": 1.6696818157653172e-06, + "loss": 0.9436147212982178, + "step": 1134 + }, + { + "epoch": 0.9562289562289562, + "grad_norm": 15.200255393981934, + "learning_rate": 1.6683875356490157e-06, + "loss": 0.83840012550354, + "step": 1136 + }, + { + "epoch": 0.9579124579124579, + "grad_norm": 11.014248847961426, + "learning_rate": 1.6670912971985002e-06, + "loss": 0.7340762615203857, + "step": 1138 + }, + { + "epoch": 0.9595959595959596, + "grad_norm": 3.3604698181152344, + "learning_rate": 1.6657931048798576e-06, + "loss": 0.5434874296188354, + "step": 1140 + }, + { + "epoch": 0.9612794612794613, + "grad_norm": 8.75454330444336, + "learning_rate": 1.6644929631659061e-06, + "loss": 0.8939019441604614, + "step": 1142 + }, + { + "epoch": 0.9629629629629629, + "grad_norm": 14.948843955993652, + "learning_rate": 1.6631908765361818e-06, + "loss": 0.6150766611099243, + "step": 1144 + }, + { + "epoch": 0.9646464646464646, + "grad_norm": 2.9250028133392334, + "learning_rate": 1.6618868494769202e-06, + "loss": 0.8925027847290039, + "step": 1146 + }, + { + "epoch": 0.9663299663299664, + "grad_norm": 10.11111831665039, + "learning_rate": 1.6605808864810437e-06, + "loss": 0.7491191029548645, + "step": 1148 + }, + { + "epoch": 0.968013468013468, + "grad_norm": 8.039884567260742, + "learning_rate": 1.6592729920481443e-06, + "loss": 0.9510982036590576, + "step": 1150 + }, + { + "epoch": 0.9696969696969697, + "grad_norm": 11.84205150604248, + "learning_rate": 1.6579631706844683e-06, + "loss": 0.6039742231369019, + "step": 1152 + }, + { + "epoch": 0.9713804713804713, + "grad_norm": 26.592609405517578, + "learning_rate": 1.6566514269029015e-06, + "loss": 0.9072830677032471, + "step": 1154 + }, + { + "epoch": 0.9730639730639731, + "grad_norm": 4.943899154663086, + "learning_rate": 1.6553377652229536e-06, + "loss": 0.5825839042663574, + "step": 1156 + }, + { + "epoch": 0.9747474747474747, + "grad_norm": 5.413260459899902, + "learning_rate": 1.6540221901707413e-06, + "loss": 0.9307392835617065, + "step": 1158 + }, + { + "epoch": 0.9764309764309764, + "grad_norm": 6.360762119293213, + "learning_rate": 1.6527047062789743e-06, + "loss": 0.4215626120567322, + "step": 1160 + }, + { + "epoch": 0.9781144781144782, + "grad_norm": 9.286370277404785, + "learning_rate": 1.6513853180869391e-06, + "loss": 1.088386058807373, + "step": 1162 + }, + { + "epoch": 0.9797979797979798, + "grad_norm": 6.5988993644714355, + "learning_rate": 1.6500640301404832e-06, + "loss": 0.6811473965644836, + "step": 1164 + }, + { + "epoch": 0.9814814814814815, + "grad_norm": 9.595373153686523, + "learning_rate": 1.6487408469919992e-06, + "loss": 0.7789331674575806, + "step": 1166 + }, + { + "epoch": 0.9831649831649831, + "grad_norm": 5.964288234710693, + "learning_rate": 1.6474157732004101e-06, + "loss": 0.8091530203819275, + "step": 1168 + }, + { + "epoch": 0.9848484848484849, + "grad_norm": 11.993547439575195, + "learning_rate": 1.6460888133311526e-06, + "loss": 0.832628607749939, + "step": 1170 + }, + { + "epoch": 0.9865319865319865, + "grad_norm": 3.2034716606140137, + "learning_rate": 1.6447599719561616e-06, + "loss": 0.612036406993866, + "step": 1172 + }, + { + "epoch": 0.9882154882154882, + "grad_norm": 5.53648567199707, + "learning_rate": 1.6434292536538547e-06, + "loss": 0.9042845964431763, + "step": 1174 + }, + { + "epoch": 0.98989898989899, + "grad_norm": 5.690428733825684, + "learning_rate": 1.6420966630091168e-06, + "loss": 0.44773343205451965, + "step": 1176 + }, + { + "epoch": 0.9915824915824916, + "grad_norm": 11.099560737609863, + "learning_rate": 1.6407622046132831e-06, + "loss": 1.0306243896484375, + "step": 1178 + }, + { + "epoch": 0.9932659932659933, + "grad_norm": 11.031452178955078, + "learning_rate": 1.6394258830641243e-06, + "loss": 0.42686060070991516, + "step": 1180 + }, + { + "epoch": 0.9949494949494949, + "grad_norm": 2.295154094696045, + "learning_rate": 1.6380877029658303e-06, + "loss": 0.8935648202896118, + "step": 1182 + }, + { + "epoch": 0.9966329966329966, + "grad_norm": 5.188049793243408, + "learning_rate": 1.6367476689289947e-06, + "loss": 1.000899076461792, + "step": 1184 + }, + { + "epoch": 0.9983164983164983, + "grad_norm": 5.049581527709961, + "learning_rate": 1.6354057855705984e-06, + "loss": 0.6279634833335876, + "step": 1186 + }, + { + "epoch": 1.0, + "grad_norm": 15.246573448181152, + "learning_rate": 1.6340620575139947e-06, + "loss": 0.6900116205215454, + "step": 1188 + }, + { + "epoch": 1.0016835016835017, + "grad_norm": 5.413362503051758, + "learning_rate": 1.6327164893888913e-06, + "loss": 0.39591357111930847, + "step": 1190 + }, + { + "epoch": 1.0033670033670035, + "grad_norm": 7.250094890594482, + "learning_rate": 1.6313690858313374e-06, + "loss": 0.41023939847946167, + "step": 1192 + }, + { + "epoch": 1.005050505050505, + "grad_norm": 4.482004642486572, + "learning_rate": 1.6300198514837045e-06, + "loss": 1.090850591659546, + "step": 1194 + }, + { + "epoch": 1.0067340067340067, + "grad_norm": 15.401289939880371, + "learning_rate": 1.6286687909946732e-06, + "loss": 0.8496726751327515, + "step": 1196 + }, + { + "epoch": 1.0084175084175084, + "grad_norm": 2.563889741897583, + "learning_rate": 1.6273159090192152e-06, + "loss": 0.9915731549263, + "step": 1198 + }, + { + "epoch": 1.0101010101010102, + "grad_norm": 8.505236625671387, + "learning_rate": 1.6259612102185778e-06, + "loss": 1.0761607885360718, + "step": 1200 + }, + { + "epoch": 1.0117845117845117, + "grad_norm": 2.467069625854492, + "learning_rate": 1.6246046992602685e-06, + "loss": 0.9234099984169006, + "step": 1202 + }, + { + "epoch": 1.0134680134680134, + "grad_norm": 2.2489092350006104, + "learning_rate": 1.6232463808180385e-06, + "loss": 0.9091596007347107, + "step": 1204 + }, + { + "epoch": 1.0151515151515151, + "grad_norm": 24.074737548828125, + "learning_rate": 1.6218862595718664e-06, + "loss": 1.0585005283355713, + "step": 1206 + }, + { + "epoch": 1.0168350168350169, + "grad_norm": 11.167364120483398, + "learning_rate": 1.620524340207942e-06, + "loss": 0.6014789938926697, + "step": 1208 + }, + { + "epoch": 1.0185185185185186, + "grad_norm": 9.423373222351074, + "learning_rate": 1.6191606274186504e-06, + "loss": 0.5883907079696655, + "step": 1210 + }, + { + "epoch": 1.02020202020202, + "grad_norm": 4.673365592956543, + "learning_rate": 1.6177951259025562e-06, + "loss": 0.5414766669273376, + "step": 1212 + }, + { + "epoch": 1.0218855218855218, + "grad_norm": 8.354643821716309, + "learning_rate": 1.6164278403643867e-06, + "loss": 0.7363089919090271, + "step": 1214 + }, + { + "epoch": 1.0235690235690236, + "grad_norm": 6.500521183013916, + "learning_rate": 1.6150587755150158e-06, + "loss": 0.38967499136924744, + "step": 1216 + }, + { + "epoch": 1.0252525252525253, + "grad_norm": 24.9106388092041, + "learning_rate": 1.6136879360714478e-06, + "loss": 0.9002467393875122, + "step": 1218 + }, + { + "epoch": 1.026936026936027, + "grad_norm": 3.819883346557617, + "learning_rate": 1.612315326756802e-06, + "loss": 0.7683883905410767, + "step": 1220 + }, + { + "epoch": 1.0286195286195285, + "grad_norm": 34.932952880859375, + "learning_rate": 1.6109409523002942e-06, + "loss": 0.9174226522445679, + "step": 1222 + }, + { + "epoch": 1.0303030303030303, + "grad_norm": 2.4514238834381104, + "learning_rate": 1.6095648174372231e-06, + "loss": 1.0709283351898193, + "step": 1224 + }, + { + "epoch": 1.031986531986532, + "grad_norm": 4.087513446807861, + "learning_rate": 1.6081869269089522e-06, + "loss": 0.6256165504455566, + "step": 1226 + }, + { + "epoch": 1.0336700336700337, + "grad_norm": 3.7036447525024414, + "learning_rate": 1.606807285462894e-06, + "loss": 0.8476806282997131, + "step": 1228 + }, + { + "epoch": 1.0353535353535352, + "grad_norm": 2.504366397857666, + "learning_rate": 1.6054258978524943e-06, + "loss": 0.8022794127464294, + "step": 1230 + }, + { + "epoch": 1.037037037037037, + "grad_norm": 11.632919311523438, + "learning_rate": 1.6040427688372143e-06, + "loss": 0.4790239632129669, + "step": 1232 + }, + { + "epoch": 1.0387205387205387, + "grad_norm": 1.2272193431854248, + "learning_rate": 1.602657903182515e-06, + "loss": 0.7812309265136719, + "step": 1234 + }, + { + "epoch": 1.0404040404040404, + "grad_norm": 1.8513426780700684, + "learning_rate": 1.6012713056598423e-06, + "loss": 0.7921426892280579, + "step": 1236 + }, + { + "epoch": 1.0420875420875422, + "grad_norm": 4.828263282775879, + "learning_rate": 1.599882981046607e-06, + "loss": 0.5412895679473877, + "step": 1238 + }, + { + "epoch": 1.0437710437710437, + "grad_norm": 2.7645084857940674, + "learning_rate": 1.5984929341261724e-06, + "loss": 0.9840224981307983, + "step": 1240 + }, + { + "epoch": 1.0454545454545454, + "grad_norm": 3.864872455596924, + "learning_rate": 1.5971011696878342e-06, + "loss": 0.9463930130004883, + "step": 1242 + }, + { + "epoch": 1.0471380471380471, + "grad_norm": 4.084227561950684, + "learning_rate": 1.5957076925268072e-06, + "loss": 0.639992356300354, + "step": 1244 + }, + { + "epoch": 1.0488215488215489, + "grad_norm": 3.3840675354003906, + "learning_rate": 1.5943125074442064e-06, + "loss": 0.6726884841918945, + "step": 1246 + }, + { + "epoch": 1.0505050505050506, + "grad_norm": 2.852729558944702, + "learning_rate": 1.5929156192470313e-06, + "loss": 0.9147169589996338, + "step": 1248 + }, + { + "epoch": 1.0521885521885521, + "grad_norm": 4.347400665283203, + "learning_rate": 1.5915170327481491e-06, + "loss": 0.7575803995132446, + "step": 1250 + }, + { + "epoch": 1.0538720538720538, + "grad_norm": 12.422771453857422, + "learning_rate": 1.5901167527662796e-06, + "loss": 0.6838544607162476, + "step": 1252 + }, + { + "epoch": 1.0555555555555556, + "grad_norm": 11.088696479797363, + "learning_rate": 1.5887147841259758e-06, + "loss": 0.9683138728141785, + "step": 1254 + }, + { + "epoch": 1.0572390572390573, + "grad_norm": 5.527649879455566, + "learning_rate": 1.5873111316576102e-06, + "loss": 0.7508020401000977, + "step": 1256 + }, + { + "epoch": 1.0589225589225588, + "grad_norm": 4.718619346618652, + "learning_rate": 1.5859058001973555e-06, + "loss": 0.5126559734344482, + "step": 1258 + }, + { + "epoch": 1.0606060606060606, + "grad_norm": 5.101532459259033, + "learning_rate": 1.5844987945871701e-06, + "loss": 0.77130526304245, + "step": 1260 + }, + { + "epoch": 1.0622895622895623, + "grad_norm": 5.325422763824463, + "learning_rate": 1.5830901196747805e-06, + "loss": 0.6283507347106934, + "step": 1262 + }, + { + "epoch": 1.063973063973064, + "grad_norm": 15.08485221862793, + "learning_rate": 1.5816797803136647e-06, + "loss": 0.7283768653869629, + "step": 1264 + }, + { + "epoch": 1.0656565656565657, + "grad_norm": 3.9415273666381836, + "learning_rate": 1.5802677813630348e-06, + "loss": 0.6957473754882812, + "step": 1266 + }, + { + "epoch": 1.0673400673400673, + "grad_norm": 10.470375061035156, + "learning_rate": 1.5788541276878212e-06, + "loss": 0.6225847005844116, + "step": 1268 + }, + { + "epoch": 1.069023569023569, + "grad_norm": 13.44847583770752, + "learning_rate": 1.577438824158656e-06, + "loss": 0.6269044280052185, + "step": 1270 + }, + { + "epoch": 1.0707070707070707, + "grad_norm": 1.2674486637115479, + "learning_rate": 1.5760218756518548e-06, + "loss": 0.6266012191772461, + "step": 1272 + }, + { + "epoch": 1.0723905723905724, + "grad_norm": 25.154924392700195, + "learning_rate": 1.5746032870494022e-06, + "loss": 0.4940655827522278, + "step": 1274 + }, + { + "epoch": 1.074074074074074, + "grad_norm": 5.607649326324463, + "learning_rate": 1.5731830632389322e-06, + "loss": 0.6989841461181641, + "step": 1276 + }, + { + "epoch": 1.0757575757575757, + "grad_norm": 111.35026550292969, + "learning_rate": 1.5717612091137137e-06, + "loss": 0.9674046039581299, + "step": 1278 + }, + { + "epoch": 1.0774410774410774, + "grad_norm": 36.46900939941406, + "learning_rate": 1.570337729572632e-06, + "loss": 0.5374500751495361, + "step": 1280 + }, + { + "epoch": 1.0791245791245792, + "grad_norm": 7.345931529998779, + "learning_rate": 1.5689126295201738e-06, + "loss": 0.3302645683288574, + "step": 1282 + }, + { + "epoch": 1.0808080808080809, + "grad_norm": 4.141447067260742, + "learning_rate": 1.5674859138664076e-06, + "loss": 1.053006887435913, + "step": 1284 + }, + { + "epoch": 1.0824915824915824, + "grad_norm": 18.335811614990234, + "learning_rate": 1.5660575875269696e-06, + "loss": 0.9029141664505005, + "step": 1286 + }, + { + "epoch": 1.0841750841750841, + "grad_norm": 4.0398850440979, + "learning_rate": 1.5646276554230454e-06, + "loss": 0.5438280701637268, + "step": 1288 + }, + { + "epoch": 1.0858585858585859, + "grad_norm": 20.008378982543945, + "learning_rate": 1.563196122481352e-06, + "loss": 0.6676660776138306, + "step": 1290 + }, + { + "epoch": 1.0875420875420876, + "grad_norm": 3.3898210525512695, + "learning_rate": 1.5617629936341225e-06, + "loss": 1.1070988178253174, + "step": 1292 + }, + { + "epoch": 1.0892255892255893, + "grad_norm": 5.172207355499268, + "learning_rate": 1.5603282738190898e-06, + "loss": 0.7852774858474731, + "step": 1294 + }, + { + "epoch": 1.0909090909090908, + "grad_norm": 14.538901329040527, + "learning_rate": 1.5588919679794668e-06, + "loss": 0.583429753780365, + "step": 1296 + }, + { + "epoch": 1.0925925925925926, + "grad_norm": 6.987974166870117, + "learning_rate": 1.5574540810639312e-06, + "loss": 0.6342300176620483, + "step": 1298 + }, + { + "epoch": 1.0942760942760943, + "grad_norm": 13.806412696838379, + "learning_rate": 1.556014618026609e-06, + "loss": 0.6277361512184143, + "step": 1300 + }, + { + "epoch": 1.095959595959596, + "grad_norm": 11.233121871948242, + "learning_rate": 1.5545735838270556e-06, + "loss": 0.6347372531890869, + "step": 1302 + }, + { + "epoch": 1.0976430976430978, + "grad_norm": 4.906972885131836, + "learning_rate": 1.5531309834302403e-06, + "loss": 0.5694692134857178, + "step": 1304 + }, + { + "epoch": 1.0993265993265993, + "grad_norm": 13.255314826965332, + "learning_rate": 1.5516868218065283e-06, + "loss": 0.5988457798957825, + "step": 1306 + }, + { + "epoch": 1.101010101010101, + "grad_norm": 18.89320182800293, + "learning_rate": 1.5502411039316642e-06, + "loss": 0.5894651412963867, + "step": 1308 + }, + { + "epoch": 1.1026936026936027, + "grad_norm": 2.3720078468322754, + "learning_rate": 1.5487938347867542e-06, + "loss": 0.39072656631469727, + "step": 1310 + }, + { + "epoch": 1.1043771043771045, + "grad_norm": 3.8021674156188965, + "learning_rate": 1.5473450193582498e-06, + "loss": 1.1303743124008179, + "step": 1312 + }, + { + "epoch": 1.106060606060606, + "grad_norm": 12.77686882019043, + "learning_rate": 1.5458946626379293e-06, + "loss": 0.9466381072998047, + "step": 1314 + }, + { + "epoch": 1.1077441077441077, + "grad_norm": 16.367809295654297, + "learning_rate": 1.5444427696228822e-06, + "loss": 0.896185576915741, + "step": 1316 + }, + { + "epoch": 1.1094276094276094, + "grad_norm": 4.367947578430176, + "learning_rate": 1.5429893453154906e-06, + "loss": 0.9018317461013794, + "step": 1318 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 11.2949857711792, + "learning_rate": 1.5415343947234132e-06, + "loss": 0.5716771483421326, + "step": 1320 + }, + { + "epoch": 1.112794612794613, + "grad_norm": 3.638136386871338, + "learning_rate": 1.5400779228595663e-06, + "loss": 0.8265483379364014, + "step": 1322 + }, + { + "epoch": 1.1144781144781144, + "grad_norm": 23.661731719970703, + "learning_rate": 1.538619934742109e-06, + "loss": 0.5200953483581543, + "step": 1324 + }, + { + "epoch": 1.1161616161616161, + "grad_norm": 5.394420146942139, + "learning_rate": 1.5371604353944235e-06, + "loss": 0.8769002556800842, + "step": 1326 + }, + { + "epoch": 1.1178451178451179, + "grad_norm": 3.2108795642852783, + "learning_rate": 1.5356994298450989e-06, + "loss": 0.6526933312416077, + "step": 1328 + }, + { + "epoch": 1.1195286195286196, + "grad_norm": 6.397909164428711, + "learning_rate": 1.5342369231279145e-06, + "loss": 0.994263768196106, + "step": 1330 + }, + { + "epoch": 1.121212121212121, + "grad_norm": 5.88171911239624, + "learning_rate": 1.5327729202818212e-06, + "loss": 0.7015285491943359, + "step": 1332 + }, + { + "epoch": 1.1228956228956228, + "grad_norm": 2.6668052673339844, + "learning_rate": 1.5313074263509242e-06, + "loss": 1.0788037776947021, + "step": 1334 + }, + { + "epoch": 1.1245791245791246, + "grad_norm": 5.609066009521484, + "learning_rate": 1.5298404463844675e-06, + "loss": 0.5919516086578369, + "step": 1336 + }, + { + "epoch": 1.1262626262626263, + "grad_norm": 3.103581428527832, + "learning_rate": 1.5283719854368142e-06, + "loss": 0.6757215857505798, + "step": 1338 + }, + { + "epoch": 1.127946127946128, + "grad_norm": 2.8614747524261475, + "learning_rate": 1.5269020485674299e-06, + "loss": 0.4805062413215637, + "step": 1340 + }, + { + "epoch": 1.1296296296296295, + "grad_norm": 4.264964580535889, + "learning_rate": 1.5254306408408657e-06, + "loss": 0.8218073844909668, + "step": 1342 + }, + { + "epoch": 1.1313131313131313, + "grad_norm": 3.358206272125244, + "learning_rate": 1.5239577673267401e-06, + "loss": 1.1272187232971191, + "step": 1344 + }, + { + "epoch": 1.132996632996633, + "grad_norm": 5.68251371383667, + "learning_rate": 1.5224834330997222e-06, + "loss": 1.0079560279846191, + "step": 1346 + }, + { + "epoch": 1.1346801346801347, + "grad_norm": 5.610229969024658, + "learning_rate": 1.5210076432395138e-06, + "loss": 0.6960790157318115, + "step": 1348 + }, + { + "epoch": 1.1363636363636362, + "grad_norm": 6.409191608428955, + "learning_rate": 1.5195304028308324e-06, + "loss": 0.48329275846481323, + "step": 1350 + }, + { + "epoch": 1.138047138047138, + "grad_norm": 17.214502334594727, + "learning_rate": 1.5180517169633914e-06, + "loss": 0.2905687391757965, + "step": 1352 + }, + { + "epoch": 1.1397306397306397, + "grad_norm": 4.7634406089782715, + "learning_rate": 1.5165715907318874e-06, + "loss": 0.9956916570663452, + "step": 1354 + }, + { + "epoch": 1.1414141414141414, + "grad_norm": 3.8894872665405273, + "learning_rate": 1.5150900292359775e-06, + "loss": 1.0472840070724487, + "step": 1356 + }, + { + "epoch": 1.1430976430976432, + "grad_norm": 28.076671600341797, + "learning_rate": 1.513607037580264e-06, + "loss": 0.7530080676078796, + "step": 1358 + }, + { + "epoch": 1.144781144781145, + "grad_norm": 5.491020679473877, + "learning_rate": 1.5121226208742771e-06, + "loss": 0.6445476412773132, + "step": 1360 + }, + { + "epoch": 1.1464646464646464, + "grad_norm": 2.686913251876831, + "learning_rate": 1.5106367842324578e-06, + "loss": 0.8437654376029968, + "step": 1362 + }, + { + "epoch": 1.1481481481481481, + "grad_norm": 35.050662994384766, + "learning_rate": 1.5091495327741375e-06, + "loss": 0.8638776540756226, + "step": 1364 + }, + { + "epoch": 1.1498316498316499, + "grad_norm": 3.9783761501312256, + "learning_rate": 1.507660871623524e-06, + "loss": 0.7111606597900391, + "step": 1366 + }, + { + "epoch": 1.1515151515151516, + "grad_norm": 14.50291633605957, + "learning_rate": 1.5061708059096807e-06, + "loss": 0.764883279800415, + "step": 1368 + }, + { + "epoch": 1.1531986531986531, + "grad_norm": 2.154838800430298, + "learning_rate": 1.5046793407665114e-06, + "loss": 1.0397025346755981, + "step": 1370 + }, + { + "epoch": 1.1548821548821548, + "grad_norm": 2.365380048751831, + "learning_rate": 1.503186481332741e-06, + "loss": 1.0539653301239014, + "step": 1372 + }, + { + "epoch": 1.1565656565656566, + "grad_norm": 8.504420280456543, + "learning_rate": 1.5016922327518986e-06, + "loss": 0.4366611838340759, + "step": 1374 + }, + { + "epoch": 1.1582491582491583, + "grad_norm": 2.675044298171997, + "learning_rate": 1.5001966001722986e-06, + "loss": 0.398744136095047, + "step": 1376 + }, + { + "epoch": 1.15993265993266, + "grad_norm": 8.629570960998535, + "learning_rate": 1.4986995887470248e-06, + "loss": 0.8844636678695679, + "step": 1378 + }, + { + "epoch": 1.1616161616161615, + "grad_norm": 2.5665788650512695, + "learning_rate": 1.497201203633912e-06, + "loss": 0.6772328019142151, + "step": 1380 + }, + { + "epoch": 1.1632996632996633, + "grad_norm": 9.2289457321167, + "learning_rate": 1.4957014499955265e-06, + "loss": 0.5273948907852173, + "step": 1382 + }, + { + "epoch": 1.164983164983165, + "grad_norm": 4.406887054443359, + "learning_rate": 1.4942003329991513e-06, + "loss": 0.36302030086517334, + "step": 1384 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 14.721182823181152, + "learning_rate": 1.492697857816766e-06, + "loss": 0.5152138471603394, + "step": 1386 + }, + { + "epoch": 1.1683501683501682, + "grad_norm": 2.9244027137756348, + "learning_rate": 1.491194029625029e-06, + "loss": 0.6069843173027039, + "step": 1388 + }, + { + "epoch": 1.17003367003367, + "grad_norm": 5.622206687927246, + "learning_rate": 1.489688853605262e-06, + "loss": 0.8698340654373169, + "step": 1390 + }, + { + "epoch": 1.1717171717171717, + "grad_norm": 3.113487482070923, + "learning_rate": 1.4881823349434296e-06, + "loss": 0.8122848272323608, + "step": 1392 + }, + { + "epoch": 1.1734006734006734, + "grad_norm": 8.594972610473633, + "learning_rate": 1.4866744788301226e-06, + "loss": 0.681936502456665, + "step": 1394 + }, + { + "epoch": 1.1750841750841752, + "grad_norm": 2.1322364807128906, + "learning_rate": 1.485165290460539e-06, + "loss": 0.571365237236023, + "step": 1396 + }, + { + "epoch": 1.1767676767676767, + "grad_norm": 3.1892471313476562, + "learning_rate": 1.4836547750344688e-06, + "loss": 0.7035370469093323, + "step": 1398 + }, + { + "epoch": 1.1784511784511784, + "grad_norm": 15.387435913085938, + "learning_rate": 1.4821429377562725e-06, + "loss": 0.49107053875923157, + "step": 1400 + }, + { + "epoch": 1.1801346801346801, + "grad_norm": 2.782883644104004, + "learning_rate": 1.4806297838348653e-06, + "loss": 0.9246771931648254, + "step": 1402 + }, + { + "epoch": 1.1818181818181819, + "grad_norm": 5.081911563873291, + "learning_rate": 1.4791153184837e-06, + "loss": 0.7164801955223083, + "step": 1404 + }, + { + "epoch": 1.1835016835016834, + "grad_norm": 11.42972469329834, + "learning_rate": 1.4775995469207467e-06, + "loss": 0.6407367587089539, + "step": 1406 + }, + { + "epoch": 1.1851851851851851, + "grad_norm": 5.799728870391846, + "learning_rate": 1.476082474368476e-06, + "loss": 0.9986523389816284, + "step": 1408 + }, + { + "epoch": 1.1868686868686869, + "grad_norm": 4.796317100524902, + "learning_rate": 1.4745641060538407e-06, + "loss": 0.700546145439148, + "step": 1410 + }, + { + "epoch": 1.1885521885521886, + "grad_norm": 21.660324096679688, + "learning_rate": 1.4730444472082597e-06, + "loss": 0.741939902305603, + "step": 1412 + }, + { + "epoch": 1.1902356902356903, + "grad_norm": 3.5754830837249756, + "learning_rate": 1.471523503067596e-06, + "loss": 0.7933897972106934, + "step": 1414 + }, + { + "epoch": 1.1919191919191918, + "grad_norm": 6.275886535644531, + "learning_rate": 1.4700012788721431e-06, + "loss": 0.7294763326644897, + "step": 1416 + }, + { + "epoch": 1.1936026936026936, + "grad_norm": 11.374263763427734, + "learning_rate": 1.4684777798666028e-06, + "loss": 1.066422939300537, + "step": 1418 + }, + { + "epoch": 1.1952861952861953, + "grad_norm": 8.107324600219727, + "learning_rate": 1.4669530113000712e-06, + "loss": 0.8409990072250366, + "step": 1420 + }, + { + "epoch": 1.196969696969697, + "grad_norm": 5.618307590484619, + "learning_rate": 1.465426978426017e-06, + "loss": 0.750501275062561, + "step": 1422 + }, + { + "epoch": 1.1986531986531987, + "grad_norm": 3.1983511447906494, + "learning_rate": 1.4638996865022658e-06, + "loss": 0.611116886138916, + "step": 1424 + }, + { + "epoch": 1.2003367003367003, + "grad_norm": 7.185869216918945, + "learning_rate": 1.4623711407909802e-06, + "loss": 0.8342564105987549, + "step": 1426 + }, + { + "epoch": 1.202020202020202, + "grad_norm": 5.156131267547607, + "learning_rate": 1.4608413465586444e-06, + "loss": 0.528020441532135, + "step": 1428 + }, + { + "epoch": 1.2037037037037037, + "grad_norm": 4.284945964813232, + "learning_rate": 1.4593103090760426e-06, + "loss": 0.867672324180603, + "step": 1430 + }, + { + "epoch": 1.2053872053872055, + "grad_norm": 4.11072301864624, + "learning_rate": 1.4577780336182429e-06, + "loss": 0.6711719036102295, + "step": 1432 + }, + { + "epoch": 1.2070707070707072, + "grad_norm": 2.3299851417541504, + "learning_rate": 1.4562445254645793e-06, + "loss": 1.1435985565185547, + "step": 1434 + }, + { + "epoch": 1.2087542087542087, + "grad_norm": 7.548894882202148, + "learning_rate": 1.4547097898986332e-06, + "loss": 0.5709949731826782, + "step": 1436 + }, + { + "epoch": 1.2104377104377104, + "grad_norm": 12.143434524536133, + "learning_rate": 1.453173832208213e-06, + "loss": 0.40696626901626587, + "step": 1438 + }, + { + "epoch": 1.2121212121212122, + "grad_norm": 3.1169068813323975, + "learning_rate": 1.4516366576853406e-06, + "loss": 0.4268173575401306, + "step": 1440 + }, + { + "epoch": 1.2138047138047139, + "grad_norm": 4.227779388427734, + "learning_rate": 1.450098271626228e-06, + "loss": 0.7122896313667297, + "step": 1442 + }, + { + "epoch": 1.2154882154882154, + "grad_norm": 7.247793674468994, + "learning_rate": 1.448558679331263e-06, + "loss": 0.8614311814308167, + "step": 1444 + }, + { + "epoch": 1.2171717171717171, + "grad_norm": 6.6793212890625, + "learning_rate": 1.4470178861049886e-06, + "loss": 0.8972820043563843, + "step": 1446 + }, + { + "epoch": 1.2188552188552189, + "grad_norm": 4.615921974182129, + "learning_rate": 1.4454758972560863e-06, + "loss": 0.6717212200164795, + "step": 1448 + }, + { + "epoch": 1.2205387205387206, + "grad_norm": 4.018466949462891, + "learning_rate": 1.4439327180973556e-06, + "loss": 0.8775206208229065, + "step": 1450 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 4.282815456390381, + "learning_rate": 1.4423883539456987e-06, + "loss": 0.867609977722168, + "step": 1452 + }, + { + "epoch": 1.2239057239057238, + "grad_norm": 5.375484466552734, + "learning_rate": 1.4408428101220997e-06, + "loss": 0.6089876294136047, + "step": 1454 + }, + { + "epoch": 1.2255892255892256, + "grad_norm": 4.924765110015869, + "learning_rate": 1.439296091951607e-06, + "loss": 0.852953314781189, + "step": 1456 + }, + { + "epoch": 1.2272727272727273, + "grad_norm": 6.108055591583252, + "learning_rate": 1.4377482047633162e-06, + "loss": 0.8556865453720093, + "step": 1458 + }, + { + "epoch": 1.228956228956229, + "grad_norm": 7.242824077606201, + "learning_rate": 1.4361991538903495e-06, + "loss": 0.9425716400146484, + "step": 1460 + }, + { + "epoch": 1.2306397306397305, + "grad_norm": 8.90245532989502, + "learning_rate": 1.4346489446698388e-06, + "loss": 0.6341677904129028, + "step": 1462 + }, + { + "epoch": 1.2323232323232323, + "grad_norm": 4.452878475189209, + "learning_rate": 1.4330975824429076e-06, + "loss": 0.6499779224395752, + "step": 1464 + }, + { + "epoch": 1.234006734006734, + "grad_norm": 2.3086910247802734, + "learning_rate": 1.4315450725546516e-06, + "loss": 0.8102267384529114, + "step": 1466 + }, + { + "epoch": 1.2356902356902357, + "grad_norm": 4.407566070556641, + "learning_rate": 1.42999142035412e-06, + "loss": 0.9032129049301147, + "step": 1468 + }, + { + "epoch": 1.2373737373737375, + "grad_norm": 3.0299272537231445, + "learning_rate": 1.4284366311942985e-06, + "loss": 1.0671682357788086, + "step": 1470 + }, + { + "epoch": 1.239057239057239, + "grad_norm": 5.777866840362549, + "learning_rate": 1.42688071043209e-06, + "loss": 0.5841819047927856, + "step": 1472 + }, + { + "epoch": 1.2407407407407407, + "grad_norm": 11.622872352600098, + "learning_rate": 1.4253236634282964e-06, + "loss": 0.6392555236816406, + "step": 1474 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 25.52138328552246, + "learning_rate": 1.4237654955475997e-06, + "loss": 0.45820027589797974, + "step": 1476 + }, + { + "epoch": 1.2441077441077442, + "grad_norm": 7.492943286895752, + "learning_rate": 1.4222062121585438e-06, + "loss": 0.6932016611099243, + "step": 1478 + }, + { + "epoch": 1.2457912457912457, + "grad_norm": 4.440412998199463, + "learning_rate": 1.4206458186335158e-06, + "loss": 0.7317427396774292, + "step": 1480 + }, + { + "epoch": 1.2474747474747474, + "grad_norm": 3.7973439693450928, + "learning_rate": 1.4190843203487285e-06, + "loss": 0.7156742811203003, + "step": 1482 + }, + { + "epoch": 1.2491582491582491, + "grad_norm": 5.348301410675049, + "learning_rate": 1.4175217226842e-06, + "loss": 0.4319908320903778, + "step": 1484 + }, + { + "epoch": 1.2508417508417509, + "grad_norm": 3.68155574798584, + "learning_rate": 1.4159580310237368e-06, + "loss": 0.5716394186019897, + "step": 1486 + }, + { + "epoch": 1.2525252525252526, + "grad_norm": 12.937089920043945, + "learning_rate": 1.414393250754915e-06, + "loss": 0.7173076272010803, + "step": 1488 + }, + { + "epoch": 1.2542087542087543, + "grad_norm": 4.815293312072754, + "learning_rate": 1.4128273872690608e-06, + "loss": 0.6426496505737305, + "step": 1490 + }, + { + "epoch": 1.2558922558922558, + "grad_norm": 6.455201148986816, + "learning_rate": 1.4112604459612326e-06, + "loss": 0.7094147801399231, + "step": 1492 + }, + { + "epoch": 1.2575757575757576, + "grad_norm": 2.647298812866211, + "learning_rate": 1.4096924322302025e-06, + "loss": 0.7964801788330078, + "step": 1494 + }, + { + "epoch": 1.2592592592592593, + "grad_norm": 10.454304695129395, + "learning_rate": 1.4081233514784377e-06, + "loss": 0.6100042462348938, + "step": 1496 + }, + { + "epoch": 1.2609427609427608, + "grad_norm": 3.6101741790771484, + "learning_rate": 1.4065532091120815e-06, + "loss": 0.9467732906341553, + "step": 1498 + }, + { + "epoch": 1.2626262626262625, + "grad_norm": 4.737046718597412, + "learning_rate": 1.4049820105409354e-06, + "loss": 0.9984631538391113, + "step": 1500 + }, + { + "epoch": 1.2643097643097643, + "grad_norm": 7.123760223388672, + "learning_rate": 1.4034097611784388e-06, + "loss": 0.5069697499275208, + "step": 1502 + }, + { + "epoch": 1.265993265993266, + "grad_norm": 6.340135097503662, + "learning_rate": 1.4018364664416531e-06, + "loss": 0.7557004690170288, + "step": 1504 + }, + { + "epoch": 1.2676767676767677, + "grad_norm": 2.5414600372314453, + "learning_rate": 1.4002621317512402e-06, + "loss": 1.086498498916626, + "step": 1506 + }, + { + "epoch": 1.2693602693602695, + "grad_norm": 6.803100109100342, + "learning_rate": 1.3986867625314453e-06, + "loss": 1.1087901592254639, + "step": 1508 + }, + { + "epoch": 1.271043771043771, + "grad_norm": 17.501358032226562, + "learning_rate": 1.397110364210079e-06, + "loss": 0.5395207405090332, + "step": 1510 + }, + { + "epoch": 1.2727272727272727, + "grad_norm": 17.035667419433594, + "learning_rate": 1.395532942218496e-06, + "loss": 0.5006218552589417, + "step": 1512 + }, + { + "epoch": 1.2744107744107744, + "grad_norm": 13.554049491882324, + "learning_rate": 1.393954501991579e-06, + "loss": 0.597407341003418, + "step": 1514 + }, + { + "epoch": 1.2760942760942762, + "grad_norm": 5.359893321990967, + "learning_rate": 1.3923750489677192e-06, + "loss": 0.7979379892349243, + "step": 1516 + }, + { + "epoch": 1.2777777777777777, + "grad_norm": 3.440288782119751, + "learning_rate": 1.3907945885887963e-06, + "loss": 0.7031858563423157, + "step": 1518 + }, + { + "epoch": 1.2794612794612794, + "grad_norm": 2.3797640800476074, + "learning_rate": 1.389213126300161e-06, + "loss": 0.8979378342628479, + "step": 1520 + }, + { + "epoch": 1.2811447811447811, + "grad_norm": 14.381575584411621, + "learning_rate": 1.3876306675506176e-06, + "loss": 0.6173551082611084, + "step": 1522 + }, + { + "epoch": 1.2828282828282829, + "grad_norm": 22.606948852539062, + "learning_rate": 1.3860472177924008e-06, + "loss": 0.5981260538101196, + "step": 1524 + }, + { + "epoch": 1.2845117845117846, + "grad_norm": 9.574856758117676, + "learning_rate": 1.3844627824811623e-06, + "loss": 0.8161386847496033, + "step": 1526 + }, + { + "epoch": 1.2861952861952861, + "grad_norm": 23.1750431060791, + "learning_rate": 1.3828773670759476e-06, + "loss": 0.7269278764724731, + "step": 1528 + }, + { + "epoch": 1.2878787878787878, + "grad_norm": 4.434001922607422, + "learning_rate": 1.3812909770391808e-06, + "loss": 0.3289014399051666, + "step": 1530 + }, + { + "epoch": 1.2895622895622896, + "grad_norm": 4.015097141265869, + "learning_rate": 1.3797036178366422e-06, + "loss": 0.7394604086875916, + "step": 1532 + }, + { + "epoch": 1.2912457912457913, + "grad_norm": 2.247042179107666, + "learning_rate": 1.3781152949374526e-06, + "loss": 1.0114760398864746, + "step": 1534 + }, + { + "epoch": 1.2929292929292928, + "grad_norm": 10.264386177062988, + "learning_rate": 1.3765260138140523e-06, + "loss": 0.9329554438591003, + "step": 1536 + }, + { + "epoch": 1.2946127946127945, + "grad_norm": 7.6681647300720215, + "learning_rate": 1.3749357799421846e-06, + "loss": 0.5743855237960815, + "step": 1538 + }, + { + "epoch": 1.2962962962962963, + "grad_norm": 51.10832977294922, + "learning_rate": 1.3733445988008729e-06, + "loss": 0.6765563488006592, + "step": 1540 + }, + { + "epoch": 1.297979797979798, + "grad_norm": 7.140315055847168, + "learning_rate": 1.3717524758724065e-06, + "loss": 0.5998942255973816, + "step": 1542 + }, + { + "epoch": 1.2996632996632997, + "grad_norm": 5.197514533996582, + "learning_rate": 1.3701594166423182e-06, + "loss": 0.8821581602096558, + "step": 1544 + }, + { + "epoch": 1.3013468013468015, + "grad_norm": 6.277469158172607, + "learning_rate": 1.3685654265993682e-06, + "loss": 0.767001211643219, + "step": 1546 + }, + { + "epoch": 1.303030303030303, + "grad_norm": 7.22768497467041, + "learning_rate": 1.366970511235522e-06, + "loss": 0.7709823250770569, + "step": 1548 + }, + { + "epoch": 1.3047138047138047, + "grad_norm": 4.289220333099365, + "learning_rate": 1.3653746760459345e-06, + "loss": 0.5894149541854858, + "step": 1550 + }, + { + "epoch": 1.3063973063973064, + "grad_norm": 7.390477657318115, + "learning_rate": 1.3637779265289299e-06, + "loss": 0.8726404905319214, + "step": 1552 + }, + { + "epoch": 1.308080808080808, + "grad_norm": 10.008243560791016, + "learning_rate": 1.3621802681859812e-06, + "loss": 0.947807788848877, + "step": 1554 + }, + { + "epoch": 1.3097643097643097, + "grad_norm": 2.8453805446624756, + "learning_rate": 1.3605817065216944e-06, + "loss": 0.8847697973251343, + "step": 1556 + }, + { + "epoch": 1.3114478114478114, + "grad_norm": 7.134622573852539, + "learning_rate": 1.3589822470437864e-06, + "loss": 0.8395899534225464, + "step": 1558 + }, + { + "epoch": 1.3131313131313131, + "grad_norm": 22.481409072875977, + "learning_rate": 1.3573818952630683e-06, + "loss": 0.42701858282089233, + "step": 1560 + }, + { + "epoch": 1.3148148148148149, + "grad_norm": 8.535077095031738, + "learning_rate": 1.3557806566934256e-06, + "loss": 0.5510627627372742, + "step": 1562 + }, + { + "epoch": 1.3164983164983166, + "grad_norm": 14.953362464904785, + "learning_rate": 1.354178536851799e-06, + "loss": 0.5616642236709595, + "step": 1564 + }, + { + "epoch": 1.3181818181818181, + "grad_norm": 3.324460983276367, + "learning_rate": 1.3525755412581645e-06, + "loss": 1.04994535446167, + "step": 1566 + }, + { + "epoch": 1.3198653198653199, + "grad_norm": 11.0078706741333, + "learning_rate": 1.3509716754355174e-06, + "loss": 0.5438690185546875, + "step": 1568 + }, + { + "epoch": 1.3215488215488216, + "grad_norm": 9.554030418395996, + "learning_rate": 1.34936694490985e-06, + "loss": 0.901394248008728, + "step": 1570 + }, + { + "epoch": 1.3232323232323233, + "grad_norm": 9.29176139831543, + "learning_rate": 1.3477613552101344e-06, + "loss": 0.7927477359771729, + "step": 1572 + }, + { + "epoch": 1.3249158249158248, + "grad_norm": 3.3643555641174316, + "learning_rate": 1.3461549118683023e-06, + "loss": 0.6502416133880615, + "step": 1574 + }, + { + "epoch": 1.3265993265993266, + "grad_norm": 3.0709450244903564, + "learning_rate": 1.344547620419227e-06, + "loss": 0.9406764507293701, + "step": 1576 + }, + { + "epoch": 1.3282828282828283, + "grad_norm": 74.16036224365234, + "learning_rate": 1.3429394864007037e-06, + "loss": 0.6865894794464111, + "step": 1578 + }, + { + "epoch": 1.32996632996633, + "grad_norm": 14.486356735229492, + "learning_rate": 1.3413305153534313e-06, + "loss": 0.49478814005851746, + "step": 1580 + }, + { + "epoch": 1.3316498316498318, + "grad_norm": 64.50064849853516, + "learning_rate": 1.3397207128209916e-06, + "loss": 0.6601588726043701, + "step": 1582 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 2.4977774620056152, + "learning_rate": 1.3381100843498315e-06, + "loss": 0.9941089153289795, + "step": 1584 + }, + { + "epoch": 1.335016835016835, + "grad_norm": 5.635324478149414, + "learning_rate": 1.3364986354892442e-06, + "loss": 0.8192329406738281, + "step": 1586 + }, + { + "epoch": 1.3367003367003367, + "grad_norm": 3.7212777137756348, + "learning_rate": 1.3348863717913485e-06, + "loss": 0.4632367491722107, + "step": 1588 + }, + { + "epoch": 1.3383838383838385, + "grad_norm": 2.295429229736328, + "learning_rate": 1.3332732988110717e-06, + "loss": 0.6560972332954407, + "step": 1590 + }, + { + "epoch": 1.34006734006734, + "grad_norm": 14.497373580932617, + "learning_rate": 1.3316594221061293e-06, + "loss": 0.553842306137085, + "step": 1592 + }, + { + "epoch": 1.3417508417508417, + "grad_norm": 2.9581053256988525, + "learning_rate": 1.3300447472370047e-06, + "loss": 0.9532322883605957, + "step": 1594 + }, + { + "epoch": 1.3434343434343434, + "grad_norm": 19.73745346069336, + "learning_rate": 1.3284292797669325e-06, + "loss": 0.3680313229560852, + "step": 1596 + }, + { + "epoch": 1.3451178451178452, + "grad_norm": 3.8030846118927, + "learning_rate": 1.326813025261878e-06, + "loss": 0.8829873204231262, + "step": 1598 + }, + { + "epoch": 1.3468013468013469, + "grad_norm": 9.470124244689941, + "learning_rate": 1.3251959892905183e-06, + "loss": 0.7422173023223877, + "step": 1600 + }, + { + "epoch": 1.3484848484848486, + "grad_norm": 4.198265075683594, + "learning_rate": 1.3235781774242221e-06, + "loss": 0.6670169830322266, + "step": 1602 + }, + { + "epoch": 1.3501683501683501, + "grad_norm": 11.831036567687988, + "learning_rate": 1.321959595237032e-06, + "loss": 0.8272008895874023, + "step": 1604 + }, + { + "epoch": 1.3518518518518519, + "grad_norm": 4.924741744995117, + "learning_rate": 1.3203402483056457e-06, + "loss": 1.091449499130249, + "step": 1606 + }, + { + "epoch": 1.3535353535353536, + "grad_norm": 4.869316101074219, + "learning_rate": 1.3187201422093937e-06, + "loss": 0.8597755432128906, + "step": 1608 + }, + { + "epoch": 1.355218855218855, + "grad_norm": 9.370150566101074, + "learning_rate": 1.3170992825302231e-06, + "loss": 0.38254064321517944, + "step": 1610 + }, + { + "epoch": 1.3569023569023568, + "grad_norm": 5.126072883605957, + "learning_rate": 1.315477674852678e-06, + "loss": 0.9957524538040161, + "step": 1612 + }, + { + "epoch": 1.3585858585858586, + "grad_norm": 4.2908172607421875, + "learning_rate": 1.3138553247638793e-06, + "loss": 0.6559964418411255, + "step": 1614 + }, + { + "epoch": 1.3602693602693603, + "grad_norm": 9.646893501281738, + "learning_rate": 1.3122322378535052e-06, + "loss": 0.6425015330314636, + "step": 1616 + }, + { + "epoch": 1.361952861952862, + "grad_norm": 2.957890510559082, + "learning_rate": 1.310608419713773e-06, + "loss": 0.8944872617721558, + "step": 1618 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 3.4394900798797607, + "learning_rate": 1.3089838759394198e-06, + "loss": 0.6483921408653259, + "step": 1620 + }, + { + "epoch": 1.3653198653198653, + "grad_norm": 2.6076972484588623, + "learning_rate": 1.3073586121276824e-06, + "loss": 0.9400961995124817, + "step": 1622 + }, + { + "epoch": 1.367003367003367, + "grad_norm": 2.1458706855773926, + "learning_rate": 1.3057326338782782e-06, + "loss": 0.8825739622116089, + "step": 1624 + }, + { + "epoch": 1.3686868686868687, + "grad_norm": 21.359161376953125, + "learning_rate": 1.3041059467933864e-06, + "loss": 0.6030191779136658, + "step": 1626 + }, + { + "epoch": 1.3703703703703702, + "grad_norm": 19.883914947509766, + "learning_rate": 1.3024785564776287e-06, + "loss": 0.8803253173828125, + "step": 1628 + }, + { + "epoch": 1.372053872053872, + "grad_norm": 5.972216606140137, + "learning_rate": 1.3008504685380493e-06, + "loss": 0.8786773085594177, + "step": 1630 + }, + { + "epoch": 1.3737373737373737, + "grad_norm": 4.644904613494873, + "learning_rate": 1.2992216885840964e-06, + "loss": 1.0024290084838867, + "step": 1632 + }, + { + "epoch": 1.3754208754208754, + "grad_norm": 6.252418041229248, + "learning_rate": 1.297592222227602e-06, + "loss": 0.6154271364212036, + "step": 1634 + }, + { + "epoch": 1.3771043771043772, + "grad_norm": 5.154648780822754, + "learning_rate": 1.2959620750827637e-06, + "loss": 0.3709207773208618, + "step": 1636 + }, + { + "epoch": 1.378787878787879, + "grad_norm": 4.736825466156006, + "learning_rate": 1.2943312527661236e-06, + "loss": 0.5821201801300049, + "step": 1638 + }, + { + "epoch": 1.3804713804713804, + "grad_norm": 2.9232895374298096, + "learning_rate": 1.2926997608965515e-06, + "loss": 0.6593613624572754, + "step": 1640 + }, + { + "epoch": 1.3821548821548821, + "grad_norm": 3.258718729019165, + "learning_rate": 1.2910676050952232e-06, + "loss": 0.9339215755462646, + "step": 1642 + }, + { + "epoch": 1.3838383838383839, + "grad_norm": 2.4435172080993652, + "learning_rate": 1.2894347909856021e-06, + "loss": 1.130608081817627, + "step": 1644 + }, + { + "epoch": 1.3855218855218856, + "grad_norm": 5.7142791748046875, + "learning_rate": 1.2878013241934195e-06, + "loss": 0.7692638635635376, + "step": 1646 + }, + { + "epoch": 1.387205387205387, + "grad_norm": 2.420278310775757, + "learning_rate": 1.2861672103466564e-06, + "loss": 0.93665611743927, + "step": 1648 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 3.4516067504882812, + "learning_rate": 1.284532455075522e-06, + "loss": 0.8558226823806763, + "step": 1650 + }, + { + "epoch": 1.3905723905723906, + "grad_norm": 4.455197811126709, + "learning_rate": 1.2828970640124361e-06, + "loss": 1.1693918704986572, + "step": 1652 + }, + { + "epoch": 1.3922558922558923, + "grad_norm": 4.881862640380859, + "learning_rate": 1.281261042792009e-06, + "loss": 0.9461103677749634, + "step": 1654 + }, + { + "epoch": 1.393939393939394, + "grad_norm": 10.862548828125, + "learning_rate": 1.2796243970510232e-06, + "loss": 0.5996136665344238, + "step": 1656 + }, + { + "epoch": 1.3956228956228955, + "grad_norm": 3.589484930038452, + "learning_rate": 1.2779871324284106e-06, + "loss": 0.6074084043502808, + "step": 1658 + }, + { + "epoch": 1.3973063973063973, + "grad_norm": 11.17980670928955, + "learning_rate": 1.2763492545652373e-06, + "loss": 0.9331209659576416, + "step": 1660 + }, + { + "epoch": 1.398989898989899, + "grad_norm": 19.434432983398438, + "learning_rate": 1.2747107691046815e-06, + "loss": 0.7953930497169495, + "step": 1662 + }, + { + "epoch": 1.4006734006734007, + "grad_norm": 42.425941467285156, + "learning_rate": 1.2730716816920151e-06, + "loss": 0.7052454352378845, + "step": 1664 + }, + { + "epoch": 1.4023569023569022, + "grad_norm": 5.138425827026367, + "learning_rate": 1.271431997974584e-06, + "loss": 0.424437016248703, + "step": 1666 + }, + { + "epoch": 1.404040404040404, + "grad_norm": 9.087939262390137, + "learning_rate": 1.2697917236017886e-06, + "loss": 0.814346194267273, + "step": 1668 + }, + { + "epoch": 1.4057239057239057, + "grad_norm": 3.4287939071655273, + "learning_rate": 1.2681508642250637e-06, + "loss": 0.7924845218658447, + "step": 1670 + }, + { + "epoch": 1.4074074074074074, + "grad_norm": 2.349846601486206, + "learning_rate": 1.266509425497861e-06, + "loss": 0.7972933650016785, + "step": 1672 + }, + { + "epoch": 1.4090909090909092, + "grad_norm": 3.433432102203369, + "learning_rate": 1.2648674130756271e-06, + "loss": 1.136865258216858, + "step": 1674 + }, + { + "epoch": 1.410774410774411, + "grad_norm": 18.93527603149414, + "learning_rate": 1.2632248326157854e-06, + "loss": 0.4568125009536743, + "step": 1676 + }, + { + "epoch": 1.4124579124579124, + "grad_norm": 21.089004516601562, + "learning_rate": 1.2615816897777176e-06, + "loss": 0.9250065088272095, + "step": 1678 + }, + { + "epoch": 1.4141414141414141, + "grad_norm": 3.9571752548217773, + "learning_rate": 1.2599379902227419e-06, + "loss": 1.0160582065582275, + "step": 1680 + }, + { + "epoch": 1.4158249158249159, + "grad_norm": 2.4356608390808105, + "learning_rate": 1.258293739614094e-06, + "loss": 0.5913569927215576, + "step": 1682 + }, + { + "epoch": 1.4175084175084174, + "grad_norm": 14.787010192871094, + "learning_rate": 1.2566489436169101e-06, + "loss": 0.46613961458206177, + "step": 1684 + }, + { + "epoch": 1.4191919191919191, + "grad_norm": 11.936421394348145, + "learning_rate": 1.255003607898204e-06, + "loss": 0.6293203830718994, + "step": 1686 + }, + { + "epoch": 1.4208754208754208, + "grad_norm": 3.085696220397949, + "learning_rate": 1.2533577381268495e-06, + "loss": 1.1134471893310547, + "step": 1688 + }, + { + "epoch": 1.4225589225589226, + "grad_norm": 8.348203659057617, + "learning_rate": 1.2517113399735608e-06, + "loss": 0.5143088698387146, + "step": 1690 + }, + { + "epoch": 1.4242424242424243, + "grad_norm": 21.37081527709961, + "learning_rate": 1.250064419110872e-06, + "loss": 0.6192675828933716, + "step": 1692 + }, + { + "epoch": 1.425925925925926, + "grad_norm": 3.3926167488098145, + "learning_rate": 1.2484169812131184e-06, + "loss": 0.563998818397522, + "step": 1694 + }, + { + "epoch": 1.4276094276094276, + "grad_norm": 2.4411673545837402, + "learning_rate": 1.246769031956417e-06, + "loss": 1.2114120721817017, + "step": 1696 + }, + { + "epoch": 1.4292929292929293, + "grad_norm": 4.939236640930176, + "learning_rate": 1.245120577018646e-06, + "loss": 1.056166410446167, + "step": 1698 + }, + { + "epoch": 1.430976430976431, + "grad_norm": 3.1179447174072266, + "learning_rate": 1.2434716220794265e-06, + "loss": 0.8100858926773071, + "step": 1700 + }, + { + "epoch": 1.4326599326599325, + "grad_norm": 2.682645320892334, + "learning_rate": 1.2418221728201023e-06, + "loss": 0.8299959897994995, + "step": 1702 + }, + { + "epoch": 1.4343434343434343, + "grad_norm": 3.0754740238189697, + "learning_rate": 1.2401722349237198e-06, + "loss": 0.33164000511169434, + "step": 1704 + }, + { + "epoch": 1.436026936026936, + "grad_norm": 4.3346381187438965, + "learning_rate": 1.238521814075009e-06, + "loss": 0.4199884235858917, + "step": 1706 + }, + { + "epoch": 1.4377104377104377, + "grad_norm": 12.329163551330566, + "learning_rate": 1.236870915960365e-06, + "loss": 0.9520546197891235, + "step": 1708 + }, + { + "epoch": 1.4393939393939394, + "grad_norm": 2.5863959789276123, + "learning_rate": 1.2352195462678257e-06, + "loss": 1.0822396278381348, + "step": 1710 + }, + { + "epoch": 1.4410774410774412, + "grad_norm": 5.638743877410889, + "learning_rate": 1.2335677106870546e-06, + "loss": 0.9755090475082397, + "step": 1712 + }, + { + "epoch": 1.4427609427609427, + "grad_norm": 2.6220881938934326, + "learning_rate": 1.2319154149093202e-06, + "loss": 0.8935360312461853, + "step": 1714 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 10.807649612426758, + "learning_rate": 1.2302626646274773e-06, + "loss": 0.8985303044319153, + "step": 1716 + }, + { + "epoch": 1.4461279461279462, + "grad_norm": 3.802117109298706, + "learning_rate": 1.228609465535946e-06, + "loss": 0.6814161539077759, + "step": 1718 + }, + { + "epoch": 1.4478114478114479, + "grad_norm": 8.011700630187988, + "learning_rate": 1.2269558233306918e-06, + "loss": 0.7456521391868591, + "step": 1720 + }, + { + "epoch": 1.4494949494949494, + "grad_norm": 6.23107385635376, + "learning_rate": 1.2253017437092088e-06, + "loss": 0.589634358882904, + "step": 1722 + }, + { + "epoch": 1.4511784511784511, + "grad_norm": 3.2185349464416504, + "learning_rate": 1.2236472323704971e-06, + "loss": 0.7695318460464478, + "step": 1724 + }, + { + "epoch": 1.4528619528619529, + "grad_norm": 5.373349189758301, + "learning_rate": 1.221992295015044e-06, + "loss": 0.8508809208869934, + "step": 1726 + }, + { + "epoch": 1.4545454545454546, + "grad_norm": 6.226076602935791, + "learning_rate": 1.2203369373448053e-06, + "loss": 0.664426863193512, + "step": 1728 + }, + { + "epoch": 1.4562289562289563, + "grad_norm": 3.2036166191101074, + "learning_rate": 1.2186811650631847e-06, + "loss": 0.9715543389320374, + "step": 1730 + }, + { + "epoch": 1.457912457912458, + "grad_norm": 2.1510095596313477, + "learning_rate": 1.217024983875014e-06, + "loss": 1.2159640789031982, + "step": 1732 + }, + { + "epoch": 1.4595959595959596, + "grad_norm": 2.128190040588379, + "learning_rate": 1.2153683994865354e-06, + "loss": 0.8712791800498962, + "step": 1734 + }, + { + "epoch": 1.4612794612794613, + "grad_norm": 15.694469451904297, + "learning_rate": 1.213711417605378e-06, + "loss": 0.6612798571586609, + "step": 1736 + }, + { + "epoch": 1.462962962962963, + "grad_norm": 3.5540852546691895, + "learning_rate": 1.2120540439405418e-06, + "loss": 0.6000321507453918, + "step": 1738 + }, + { + "epoch": 1.4646464646464645, + "grad_norm": 5.9053730964660645, + "learning_rate": 1.2103962842023765e-06, + "loss": 1.0903751850128174, + "step": 1740 + }, + { + "epoch": 1.4663299663299663, + "grad_norm": 3.0747792720794678, + "learning_rate": 1.2087381441025624e-06, + "loss": 0.6912112236022949, + "step": 1742 + }, + { + "epoch": 1.468013468013468, + "grad_norm": 4.498322010040283, + "learning_rate": 1.2070796293540887e-06, + "loss": 0.5265808701515198, + "step": 1744 + }, + { + "epoch": 1.4696969696969697, + "grad_norm": 3.914283275604248, + "learning_rate": 1.2054207456712377e-06, + "loss": 0.9266606569290161, + "step": 1746 + }, + { + "epoch": 1.4713804713804715, + "grad_norm": 3.2208728790283203, + "learning_rate": 1.2037614987695609e-06, + "loss": 0.9809207916259766, + "step": 1748 + }, + { + "epoch": 1.4730639730639732, + "grad_norm": 4.662408828735352, + "learning_rate": 1.2021018943658623e-06, + "loss": 0.7404388189315796, + "step": 1750 + }, + { + "epoch": 1.4747474747474747, + "grad_norm": 2.950866460800171, + "learning_rate": 1.2004419381781779e-06, + "loss": 0.6600291728973389, + "step": 1752 + }, + { + "epoch": 1.4764309764309764, + "grad_norm": 7.190127372741699, + "learning_rate": 1.1987816359257543e-06, + "loss": 0.6781315803527832, + "step": 1754 + }, + { + "epoch": 1.4781144781144782, + "grad_norm": 9.120945930480957, + "learning_rate": 1.1971209933290318e-06, + "loss": 0.8286664485931396, + "step": 1756 + }, + { + "epoch": 1.4797979797979797, + "grad_norm": 46.43217468261719, + "learning_rate": 1.1954600161096226e-06, + "loss": 0.6408827900886536, + "step": 1758 + }, + { + "epoch": 1.4814814814814814, + "grad_norm": 3.931215286254883, + "learning_rate": 1.1937987099902927e-06, + "loss": 0.7160297632217407, + "step": 1760 + }, + { + "epoch": 1.4831649831649831, + "grad_norm": 2.768970251083374, + "learning_rate": 1.19213708069494e-06, + "loss": 0.9132235050201416, + "step": 1762 + }, + { + "epoch": 1.4848484848484849, + "grad_norm": 3.2081525325775146, + "learning_rate": 1.190475133948577e-06, + "loss": 0.8853850364685059, + "step": 1764 + }, + { + "epoch": 1.4865319865319866, + "grad_norm": 7.524960041046143, + "learning_rate": 1.1888128754773092e-06, + "loss": 0.6852905750274658, + "step": 1766 + }, + { + "epoch": 1.4882154882154883, + "grad_norm": 4.307741165161133, + "learning_rate": 1.1871503110083167e-06, + "loss": 0.7655327320098877, + "step": 1768 + }, + { + "epoch": 1.4898989898989898, + "grad_norm": 3.650569200515747, + "learning_rate": 1.1854874462698337e-06, + "loss": 0.9417293071746826, + "step": 1770 + }, + { + "epoch": 1.4915824915824916, + "grad_norm": 5.581574440002441, + "learning_rate": 1.1838242869911285e-06, + "loss": 0.3258330821990967, + "step": 1772 + }, + { + "epoch": 1.4932659932659933, + "grad_norm": 2.098912000656128, + "learning_rate": 1.182160838902485e-06, + "loss": 0.826897144317627, + "step": 1774 + }, + { + "epoch": 1.494949494949495, + "grad_norm": 7.627374172210693, + "learning_rate": 1.1804971077351818e-06, + "loss": 0.7514946460723877, + "step": 1776 + }, + { + "epoch": 1.4966329966329965, + "grad_norm": 3.7137930393218994, + "learning_rate": 1.1788330992214724e-06, + "loss": 0.8887453079223633, + "step": 1778 + }, + { + "epoch": 1.4983164983164983, + "grad_norm": 8.848133087158203, + "learning_rate": 1.1771688190945664e-06, + "loss": 0.9019075036048889, + "step": 1780 + }, + { + "epoch": 1.5, + "grad_norm": 8.9419584274292, + "learning_rate": 1.1755042730886093e-06, + "loss": 0.5869305729866028, + "step": 1782 + }, + { + "epoch": 1.5016835016835017, + "grad_norm": 2.39841365814209, + "learning_rate": 1.1738394669386621e-06, + "loss": 1.1196240186691284, + "step": 1784 + }, + { + "epoch": 1.5033670033670035, + "grad_norm": 6.431698322296143, + "learning_rate": 1.172174406380683e-06, + "loss": 0.807545006275177, + "step": 1786 + }, + { + "epoch": 1.5050505050505052, + "grad_norm": 3.8912956714630127, + "learning_rate": 1.170509097151506e-06, + "loss": 0.9450180530548096, + "step": 1788 + }, + { + "epoch": 1.5067340067340067, + "grad_norm": 22.158241271972656, + "learning_rate": 1.168843544988822e-06, + "loss": 0.6185091733932495, + "step": 1790 + }, + { + "epoch": 1.5084175084175084, + "grad_norm": 7.974305629730225, + "learning_rate": 1.1671777556311587e-06, + "loss": 0.6012750267982483, + "step": 1792 + }, + { + "epoch": 1.51010101010101, + "grad_norm": 2.431042432785034, + "learning_rate": 1.1655117348178619e-06, + "loss": 0.8983908891677856, + "step": 1794 + }, + { + "epoch": 1.5117845117845117, + "grad_norm": 10.86044692993164, + "learning_rate": 1.163845488289074e-06, + "loss": 0.8865917921066284, + "step": 1796 + }, + { + "epoch": 1.5134680134680134, + "grad_norm": 12.615477561950684, + "learning_rate": 1.1621790217857153e-06, + "loss": 0.9755824208259583, + "step": 1798 + }, + { + "epoch": 1.5151515151515151, + "grad_norm": 4.471153736114502, + "learning_rate": 1.1605123410494643e-06, + "loss": 0.678105890750885, + "step": 1800 + }, + { + "epoch": 1.5168350168350169, + "grad_norm": 2.3955981731414795, + "learning_rate": 1.1588454518227375e-06, + "loss": 1.0274368524551392, + "step": 1802 + }, + { + "epoch": 1.5185185185185186, + "grad_norm": 3.6730523109436035, + "learning_rate": 1.157178359848669e-06, + "loss": 0.9852594137191772, + "step": 1804 + }, + { + "epoch": 1.5202020202020203, + "grad_norm": 4.832586288452148, + "learning_rate": 1.155511070871093e-06, + "loss": 0.7990705966949463, + "step": 1806 + }, + { + "epoch": 1.5218855218855218, + "grad_norm": 7.295440196990967, + "learning_rate": 1.1538435906345213e-06, + "loss": 0.7585336565971375, + "step": 1808 + }, + { + "epoch": 1.5235690235690236, + "grad_norm": 5.79640531539917, + "learning_rate": 1.1521759248841237e-06, + "loss": 0.6978878974914551, + "step": 1810 + }, + { + "epoch": 1.5252525252525253, + "grad_norm": 3.875293016433716, + "learning_rate": 1.1505080793657124e-06, + "loss": 0.22595882415771484, + "step": 1812 + }, + { + "epoch": 1.5269360269360268, + "grad_norm": 3.867565870285034, + "learning_rate": 1.1488400598257157e-06, + "loss": 1.1055881977081299, + "step": 1814 + }, + { + "epoch": 1.5286195286195285, + "grad_norm": 50.10768127441406, + "learning_rate": 1.1471718720111629e-06, + "loss": 0.7640130519866943, + "step": 1816 + }, + { + "epoch": 1.5303030303030303, + "grad_norm": 20.99407196044922, + "learning_rate": 1.1455035216696634e-06, + "loss": 0.8898581266403198, + "step": 1818 + }, + { + "epoch": 1.531986531986532, + "grad_norm": 3.8618974685668945, + "learning_rate": 1.1438350145493853e-06, + "loss": 0.7621004581451416, + "step": 1820 + }, + { + "epoch": 1.5336700336700337, + "grad_norm": 5.8136162757873535, + "learning_rate": 1.1421663563990383e-06, + "loss": 0.7234241962432861, + "step": 1822 + }, + { + "epoch": 1.5353535353535355, + "grad_norm": 2.8319544792175293, + "learning_rate": 1.1404975529678515e-06, + "loss": 0.9921367168426514, + "step": 1824 + }, + { + "epoch": 1.5370370370370372, + "grad_norm": 2.6894915103912354, + "learning_rate": 1.1388286100055555e-06, + "loss": 0.841090738773346, + "step": 1826 + }, + { + "epoch": 1.5387205387205387, + "grad_norm": 2.3952138423919678, + "learning_rate": 1.1371595332623601e-06, + "loss": 0.8845152258872986, + "step": 1828 + }, + { + "epoch": 1.5404040404040404, + "grad_norm": 7.501322269439697, + "learning_rate": 1.1354903284889377e-06, + "loss": 0.7155517935752869, + "step": 1830 + }, + { + "epoch": 1.542087542087542, + "grad_norm": 7.9082136154174805, + "learning_rate": 1.133821001436401e-06, + "loss": 0.7049411535263062, + "step": 1832 + }, + { + "epoch": 1.5437710437710437, + "grad_norm": 2.185568332672119, + "learning_rate": 1.1321515578562835e-06, + "loss": 1.0648796558380127, + "step": 1834 + }, + { + "epoch": 1.5454545454545454, + "grad_norm": 17.329938888549805, + "learning_rate": 1.1304820035005211e-06, + "loss": 0.8813831806182861, + "step": 1836 + }, + { + "epoch": 1.5471380471380471, + "grad_norm": 1.5673277378082275, + "learning_rate": 1.1288123441214315e-06, + "loss": 0.45255547761917114, + "step": 1838 + }, + { + "epoch": 1.5488215488215489, + "grad_norm": 3.232985258102417, + "learning_rate": 1.1271425854716931e-06, + "loss": 0.6964028477668762, + "step": 1840 + }, + { + "epoch": 1.5505050505050506, + "grad_norm": 4.322386741638184, + "learning_rate": 1.125472733304327e-06, + "loss": 0.6157456636428833, + "step": 1842 + }, + { + "epoch": 1.5521885521885523, + "grad_norm": 4.216830730438232, + "learning_rate": 1.1238027933726776e-06, + "loss": 0.4383459687232971, + "step": 1844 + }, + { + "epoch": 1.5538720538720538, + "grad_norm": 3.0813772678375244, + "learning_rate": 1.122132771430389e-06, + "loss": 0.9130579233169556, + "step": 1846 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 4.2144975662231445, + "learning_rate": 1.1204626732313907e-06, + "loss": 0.9694530963897705, + "step": 1848 + }, + { + "epoch": 1.557239057239057, + "grad_norm": 3.75293231010437, + "learning_rate": 1.1187925045298732e-06, + "loss": 0.7483557462692261, + "step": 1850 + }, + { + "epoch": 1.5589225589225588, + "grad_norm": 7.035089015960693, + "learning_rate": 1.1171222710802704e-06, + "loss": 0.9532842040061951, + "step": 1852 + }, + { + "epoch": 1.5606060606060606, + "grad_norm": 4.142365455627441, + "learning_rate": 1.1154519786372392e-06, + "loss": 0.5940355658531189, + "step": 1854 + }, + { + "epoch": 1.5622895622895623, + "grad_norm": 1.9475144147872925, + "learning_rate": 1.1137816329556403e-06, + "loss": 0.6380103826522827, + "step": 1856 + }, + { + "epoch": 1.563973063973064, + "grad_norm": 2.4910194873809814, + "learning_rate": 1.112111239790517e-06, + "loss": 0.9142417907714844, + "step": 1858 + }, + { + "epoch": 1.5656565656565657, + "grad_norm": 5.697439193725586, + "learning_rate": 1.1104408048970765e-06, + "loss": 0.4324049949645996, + "step": 1860 + }, + { + "epoch": 1.5673400673400675, + "grad_norm": 7.662766456604004, + "learning_rate": 1.1087703340306707e-06, + "loss": 0.9493654370307922, + "step": 1862 + }, + { + "epoch": 1.569023569023569, + "grad_norm": 2.1827774047851562, + "learning_rate": 1.1070998329467738e-06, + "loss": 0.355845183134079, + "step": 1864 + }, + { + "epoch": 1.5707070707070707, + "grad_norm": 7.288192272186279, + "learning_rate": 1.1054293074009646e-06, + "loss": 1.0024428367614746, + "step": 1866 + }, + { + "epoch": 1.5723905723905722, + "grad_norm": 7.846567630767822, + "learning_rate": 1.1037587631489077e-06, + "loss": 0.600260853767395, + "step": 1868 + }, + { + "epoch": 1.574074074074074, + "grad_norm": 3.9028728008270264, + "learning_rate": 1.1020882059463297e-06, + "loss": 0.8100777268409729, + "step": 1870 + }, + { + "epoch": 1.5757575757575757, + "grad_norm": 4.646785736083984, + "learning_rate": 1.1004176415490036e-06, + "loss": 0.7916046380996704, + "step": 1872 + }, + { + "epoch": 1.5774410774410774, + "grad_norm": 2.543654680252075, + "learning_rate": 1.0987470757127267e-06, + "loss": 0.9251663684844971, + "step": 1874 + }, + { + "epoch": 1.5791245791245792, + "grad_norm": 21.24106788635254, + "learning_rate": 1.0970765141933012e-06, + "loss": 0.5762704610824585, + "step": 1876 + }, + { + "epoch": 1.5808080808080809, + "grad_norm": 2.501488447189331, + "learning_rate": 1.0954059627465144e-06, + "loss": 1.1238887310028076, + "step": 1878 + }, + { + "epoch": 1.5824915824915826, + "grad_norm": 5.235997200012207, + "learning_rate": 1.093735427128119e-06, + "loss": 0.7707400321960449, + "step": 1880 + }, + { + "epoch": 1.5841750841750841, + "grad_norm": 5.779091835021973, + "learning_rate": 1.092064913093813e-06, + "loss": 0.4793959856033325, + "step": 1882 + }, + { + "epoch": 1.5858585858585859, + "grad_norm": 7.471992015838623, + "learning_rate": 1.09039442639922e-06, + "loss": 0.5366681814193726, + "step": 1884 + }, + { + "epoch": 1.5875420875420876, + "grad_norm": 3.628077983856201, + "learning_rate": 1.0887239727998697e-06, + "loss": 0.6487268209457397, + "step": 1886 + }, + { + "epoch": 1.589225589225589, + "grad_norm": 3.7435550689697266, + "learning_rate": 1.0870535580511778e-06, + "loss": 0.996959388256073, + "step": 1888 + }, + { + "epoch": 1.5909090909090908, + "grad_norm": 4.557770252227783, + "learning_rate": 1.0853831879084254e-06, + "loss": 0.2108735740184784, + "step": 1890 + }, + { + "epoch": 1.5925925925925926, + "grad_norm": 4.259451389312744, + "learning_rate": 1.0837128681267409e-06, + "loss": 1.057731032371521, + "step": 1892 + }, + { + "epoch": 1.5942760942760943, + "grad_norm": 3.0099260807037354, + "learning_rate": 1.082042604461079e-06, + "loss": 0.8130640983581543, + "step": 1894 + }, + { + "epoch": 1.595959595959596, + "grad_norm": 7.435500144958496, + "learning_rate": 1.0803724026662e-06, + "loss": 0.9344555139541626, + "step": 1896 + }, + { + "epoch": 1.5976430976430978, + "grad_norm": 4.205924034118652, + "learning_rate": 1.0787022684966524e-06, + "loss": 0.8660852313041687, + "step": 1898 + }, + { + "epoch": 1.5993265993265995, + "grad_norm": 14.64234447479248, + "learning_rate": 1.0770322077067512e-06, + "loss": 0.7825689315795898, + "step": 1900 + }, + { + "epoch": 1.601010101010101, + "grad_norm": 2.525815725326538, + "learning_rate": 1.0753622260505582e-06, + "loss": 0.8996245265007019, + "step": 1902 + }, + { + "epoch": 1.6026936026936027, + "grad_norm": 5.750382423400879, + "learning_rate": 1.0736923292818631e-06, + "loss": 0.7357829213142395, + "step": 1904 + }, + { + "epoch": 1.6043771043771042, + "grad_norm": 2.830305814743042, + "learning_rate": 1.0720225231541629e-06, + "loss": 1.1233978271484375, + "step": 1906 + }, + { + "epoch": 1.606060606060606, + "grad_norm": 6.201582908630371, + "learning_rate": 1.0703528134206418e-06, + "loss": 0.9390593767166138, + "step": 1908 + }, + { + "epoch": 1.6077441077441077, + "grad_norm": 2.511575698852539, + "learning_rate": 1.0686832058341534e-06, + "loss": 0.5838450789451599, + "step": 1910 + }, + { + "epoch": 1.6094276094276094, + "grad_norm": 9.2995023727417, + "learning_rate": 1.0670137061471972e-06, + "loss": 0.5779824256896973, + "step": 1912 + }, + { + "epoch": 1.6111111111111112, + "grad_norm": 10.087990760803223, + "learning_rate": 1.0653443201119026e-06, + "loss": 0.7840274572372437, + "step": 1914 + }, + { + "epoch": 1.612794612794613, + "grad_norm": 3.4181957244873047, + "learning_rate": 1.063675053480007e-06, + "loss": 0.6986541152000427, + "step": 1916 + }, + { + "epoch": 1.6144781144781146, + "grad_norm": 29.79077911376953, + "learning_rate": 1.0620059120028363e-06, + "loss": 0.6631942987442017, + "step": 1918 + }, + { + "epoch": 1.6161616161616161, + "grad_norm": 7.215582370758057, + "learning_rate": 1.0603369014312848e-06, + "loss": 0.6879869699478149, + "step": 1920 + }, + { + "epoch": 1.6178451178451179, + "grad_norm": 2.632085084915161, + "learning_rate": 1.0586680275157966e-06, + "loss": 0.8899586200714111, + "step": 1922 + }, + { + "epoch": 1.6195286195286194, + "grad_norm": 2.167722225189209, + "learning_rate": 1.0569992960063445e-06, + "loss": 0.5768526792526245, + "step": 1924 + }, + { + "epoch": 1.621212121212121, + "grad_norm": 4.157503604888916, + "learning_rate": 1.0553307126524105e-06, + "loss": 0.6109682321548462, + "step": 1926 + }, + { + "epoch": 1.6228956228956228, + "grad_norm": 2.805830478668213, + "learning_rate": 1.0536622832029663e-06, + "loss": 0.741910457611084, + "step": 1928 + }, + { + "epoch": 1.6245791245791246, + "grad_norm": 8.529329299926758, + "learning_rate": 1.0519940134064535e-06, + "loss": 0.8265746831893921, + "step": 1930 + }, + { + "epoch": 1.6262626262626263, + "grad_norm": 3.2494988441467285, + "learning_rate": 1.0503259090107635e-06, + "loss": 0.664577841758728, + "step": 1932 + }, + { + "epoch": 1.627946127946128, + "grad_norm": 5.897353172302246, + "learning_rate": 1.0486579757632177e-06, + "loss": 0.9694902896881104, + "step": 1934 + }, + { + "epoch": 1.6296296296296298, + "grad_norm": 5.868167400360107, + "learning_rate": 1.046990219410548e-06, + "loss": 0.9580270648002625, + "step": 1936 + }, + { + "epoch": 1.6313131313131313, + "grad_norm": 5.813265323638916, + "learning_rate": 1.0453226456988766e-06, + "loss": 1.0353319644927979, + "step": 1938 + }, + { + "epoch": 1.632996632996633, + "grad_norm": 8.491958618164062, + "learning_rate": 1.0436552603736967e-06, + "loss": 0.8483461141586304, + "step": 1940 + }, + { + "epoch": 1.6346801346801347, + "grad_norm": 2.543708086013794, + "learning_rate": 1.0419880691798526e-06, + "loss": 1.0242235660552979, + "step": 1942 + }, + { + "epoch": 1.6363636363636362, + "grad_norm": 2.253805160522461, + "learning_rate": 1.040321077861519e-06, + "loss": 0.7730292677879333, + "step": 1944 + }, + { + "epoch": 1.638047138047138, + "grad_norm": 2.954116106033325, + "learning_rate": 1.0386542921621824e-06, + "loss": 0.4111822545528412, + "step": 1946 + }, + { + "epoch": 1.6397306397306397, + "grad_norm": 6.733564853668213, + "learning_rate": 1.036987717824621e-06, + "loss": 0.9653711318969727, + "step": 1948 + }, + { + "epoch": 1.6414141414141414, + "grad_norm": 4.305788993835449, + "learning_rate": 1.0353213605908854e-06, + "loss": 0.9876930713653564, + "step": 1950 + }, + { + "epoch": 1.6430976430976432, + "grad_norm": 5.421419143676758, + "learning_rate": 1.0336552262022756e-06, + "loss": 0.49330899119377136, + "step": 1952 + }, + { + "epoch": 1.644781144781145, + "grad_norm": 6.326197624206543, + "learning_rate": 1.0319893203993276e-06, + "loss": 0.42090070247650146, + "step": 1954 + }, + { + "epoch": 1.6464646464646466, + "grad_norm": 16.150659561157227, + "learning_rate": 1.0303236489217863e-06, + "loss": 0.22029098868370056, + "step": 1956 + }, + { + "epoch": 1.6481481481481481, + "grad_norm": 5.668072700500488, + "learning_rate": 1.0286582175085913e-06, + "loss": 0.6529502868652344, + "step": 1958 + }, + { + "epoch": 1.6498316498316499, + "grad_norm": 2.8413267135620117, + "learning_rate": 1.0269930318978552e-06, + "loss": 0.7630746960639954, + "step": 1960 + }, + { + "epoch": 1.6515151515151514, + "grad_norm": 10.319131851196289, + "learning_rate": 1.0253280978268421e-06, + "loss": 0.6666793823242188, + "step": 1962 + }, + { + "epoch": 1.6531986531986531, + "grad_norm": 9.414068222045898, + "learning_rate": 1.0236634210319507e-06, + "loss": 0.5435478687286377, + "step": 1964 + }, + { + "epoch": 1.6548821548821548, + "grad_norm": 12.622198104858398, + "learning_rate": 1.0219990072486938e-06, + "loss": 0.6335460543632507, + "step": 1966 + }, + { + "epoch": 1.6565656565656566, + "grad_norm": 1.7483079433441162, + "learning_rate": 1.020334862211676e-06, + "loss": 0.8370047211647034, + "step": 1968 + }, + { + "epoch": 1.6582491582491583, + "grad_norm": 12.047608375549316, + "learning_rate": 1.0186709916545775e-06, + "loss": 0.7684140205383301, + "step": 1970 + }, + { + "epoch": 1.65993265993266, + "grad_norm": 10.904447555541992, + "learning_rate": 1.0170074013101329e-06, + "loss": 0.9606258869171143, + "step": 1972 + }, + { + "epoch": 1.6616161616161618, + "grad_norm": 2.283515453338623, + "learning_rate": 1.0153440969101103e-06, + "loss": 0.7740556001663208, + "step": 1974 + }, + { + "epoch": 1.6632996632996633, + "grad_norm": 3.3896608352661133, + "learning_rate": 1.0136810841852937e-06, + "loss": 0.7479045391082764, + "step": 1976 + }, + { + "epoch": 1.664983164983165, + "grad_norm": 11.400617599487305, + "learning_rate": 1.0120183688654616e-06, + "loss": 0.743224024772644, + "step": 1978 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 1.4617348909378052, + "learning_rate": 1.0103559566793679e-06, + "loss": 0.7983130216598511, + "step": 1980 + }, + { + "epoch": 1.6683501683501682, + "grad_norm": 7.328155994415283, + "learning_rate": 1.0086938533547213e-06, + "loss": 0.5365386009216309, + "step": 1982 + }, + { + "epoch": 1.67003367003367, + "grad_norm": 12.693415641784668, + "learning_rate": 1.0070320646181684e-06, + "loss": 0.46709537506103516, + "step": 1984 + }, + { + "epoch": 1.6717171717171717, + "grad_norm": 3.179992437362671, + "learning_rate": 1.0053705961952697e-06, + "loss": 1.0043718814849854, + "step": 1986 + }, + { + "epoch": 1.6734006734006734, + "grad_norm": 2.304699182510376, + "learning_rate": 1.0037094538104832e-06, + "loss": 0.8764192461967468, + "step": 1988 + }, + { + "epoch": 1.6750841750841752, + "grad_norm": 2.90543270111084, + "learning_rate": 1.002048643187143e-06, + "loss": 0.6470460891723633, + "step": 1990 + }, + { + "epoch": 1.676767676767677, + "grad_norm": 3.9131369590759277, + "learning_rate": 1.0003881700474415e-06, + "loss": 1.0713807344436646, + "step": 1992 + }, + { + "epoch": 1.6784511784511784, + "grad_norm": 12.474353790283203, + "learning_rate": 9.987280401124063e-07, + "loss": 0.6647155284881592, + "step": 1994 + }, + { + "epoch": 1.6801346801346801, + "grad_norm": 2.7717926502227783, + "learning_rate": 9.970682591018842e-07, + "loss": 0.6175976991653442, + "step": 1996 + }, + { + "epoch": 1.6818181818181817, + "grad_norm": 1.6829837560653687, + "learning_rate": 9.95408832734519e-07, + "loss": 0.9222723245620728, + "step": 1998 + }, + { + "epoch": 1.6835016835016834, + "grad_norm": 61.439422607421875, + "learning_rate": 9.937497667277322e-07, + "loss": 0.7147092819213867, + "step": 2000 + }, + { + "epoch": 1.6851851851851851, + "grad_norm": 4.989965438842773, + "learning_rate": 9.92091066797705e-07, + "loss": 0.6293914914131165, + "step": 2002 + }, + { + "epoch": 1.6868686868686869, + "grad_norm": 11.067621231079102, + "learning_rate": 9.904327386593563e-07, + "loss": 0.652735710144043, + "step": 2004 + }, + { + "epoch": 1.6885521885521886, + "grad_norm": 7.8212666511535645, + "learning_rate": 9.887747880263236e-07, + "loss": 0.6376103162765503, + "step": 2006 + }, + { + "epoch": 1.6902356902356903, + "grad_norm": 3.7688381671905518, + "learning_rate": 9.871172206109458e-07, + "loss": 0.9424273371696472, + "step": 2008 + }, + { + "epoch": 1.691919191919192, + "grad_norm": 5.420353889465332, + "learning_rate": 9.854600421242396e-07, + "loss": 0.5027921199798584, + "step": 2010 + }, + { + "epoch": 1.6936026936026936, + "grad_norm": 4.543862819671631, + "learning_rate": 9.838032582758814e-07, + "loss": 0.82335364818573, + "step": 2012 + }, + { + "epoch": 1.6952861952861953, + "grad_norm": 3.9203450679779053, + "learning_rate": 9.821468747741893e-07, + "loss": 0.5697500705718994, + "step": 2014 + }, + { + "epoch": 1.696969696969697, + "grad_norm": 4.254537582397461, + "learning_rate": 9.804908973261012e-07, + "loss": 0.7458208799362183, + "step": 2016 + }, + { + "epoch": 1.6986531986531985, + "grad_norm": 35.745418548583984, + "learning_rate": 9.788353316371562e-07, + "loss": 0.7252602577209473, + "step": 2018 + }, + { + "epoch": 1.7003367003367003, + "grad_norm": 5.118950366973877, + "learning_rate": 9.771801834114748e-07, + "loss": 0.721235454082489, + "step": 2020 + }, + { + "epoch": 1.702020202020202, + "grad_norm": 8.20414924621582, + "learning_rate": 9.755254583517394e-07, + "loss": 1.0950629711151123, + "step": 2022 + }, + { + "epoch": 1.7037037037037037, + "grad_norm": 3.2535030841827393, + "learning_rate": 9.738711621591733e-07, + "loss": 0.7883695363998413, + "step": 2024 + }, + { + "epoch": 1.7053872053872055, + "grad_norm": 2.4924561977386475, + "learning_rate": 9.722173005335235e-07, + "loss": 0.8893304467201233, + "step": 2026 + }, + { + "epoch": 1.7070707070707072, + "grad_norm": 3.33543062210083, + "learning_rate": 9.705638791730391e-07, + "loss": 0.9973706007003784, + "step": 2028 + }, + { + "epoch": 1.708754208754209, + "grad_norm": 12.050497055053711, + "learning_rate": 9.689109037744522e-07, + "loss": 0.6256110668182373, + "step": 2030 + }, + { + "epoch": 1.7104377104377104, + "grad_norm": 7.641107082366943, + "learning_rate": 9.672583800329585e-07, + "loss": 0.4611208438873291, + "step": 2032 + }, + { + "epoch": 1.7121212121212122, + "grad_norm": 4.1710405349731445, + "learning_rate": 9.65606313642198e-07, + "loss": 0.8477398157119751, + "step": 2034 + }, + { + "epoch": 1.7138047138047137, + "grad_norm": 12.162333488464355, + "learning_rate": 9.63954710294234e-07, + "loss": 0.7969092130661011, + "step": 2036 + }, + { + "epoch": 1.7154882154882154, + "grad_norm": 6.495959281921387, + "learning_rate": 9.623035756795352e-07, + "loss": 0.41181480884552, + "step": 2038 + }, + { + "epoch": 1.7171717171717171, + "grad_norm": 5.608903408050537, + "learning_rate": 9.606529154869556e-07, + "loss": 0.45445549488067627, + "step": 2040 + }, + { + "epoch": 1.7188552188552189, + "grad_norm": 3.937591552734375, + "learning_rate": 9.590027354037134e-07, + "loss": 0.8946130275726318, + "step": 2042 + }, + { + "epoch": 1.7205387205387206, + "grad_norm": 3.99568247795105, + "learning_rate": 9.573530411153732e-07, + "loss": 0.8655031323432922, + "step": 2044 + }, + { + "epoch": 1.7222222222222223, + "grad_norm": 7.455286502838135, + "learning_rate": 9.557038383058265e-07, + "loss": 1.0632479190826416, + "step": 2046 + }, + { + "epoch": 1.723905723905724, + "grad_norm": 2.330151081085205, + "learning_rate": 9.540551326572709e-07, + "loss": 1.0349470376968384, + "step": 2048 + }, + { + "epoch": 1.7255892255892256, + "grad_norm": 6.064199924468994, + "learning_rate": 9.524069298501902e-07, + "loss": 0.41284000873565674, + "step": 2050 + }, + { + "epoch": 1.7272727272727273, + "grad_norm": 1.6751161813735962, + "learning_rate": 9.507592355633376e-07, + "loss": 1.0285980701446533, + "step": 2052 + }, + { + "epoch": 1.7289562289562288, + "grad_norm": 26.606491088867188, + "learning_rate": 9.491120554737126e-07, + "loss": 0.9353586435317993, + "step": 2054 + }, + { + "epoch": 1.7306397306397305, + "grad_norm": 4.331685543060303, + "learning_rate": 9.474653952565439e-07, + "loss": 0.7286108732223511, + "step": 2056 + }, + { + "epoch": 1.7323232323232323, + "grad_norm": 2.1677701473236084, + "learning_rate": 9.458192605852691e-07, + "loss": 1.0569818019866943, + "step": 2058 + }, + { + "epoch": 1.734006734006734, + "grad_norm": 2.619204521179199, + "learning_rate": 9.441736571315142e-07, + "loss": 0.620589554309845, + "step": 2060 + }, + { + "epoch": 1.7356902356902357, + "grad_norm": 5.867666721343994, + "learning_rate": 9.425285905650755e-07, + "loss": 0.9633854627609253, + "step": 2062 + }, + { + "epoch": 1.7373737373737375, + "grad_norm": 16.939653396606445, + "learning_rate": 9.408840665538999e-07, + "loss": 0.6605305671691895, + "step": 2064 + }, + { + "epoch": 1.7390572390572392, + "grad_norm": 2.5597705841064453, + "learning_rate": 9.392400907640645e-07, + "loss": 0.6780143976211548, + "step": 2066 + }, + { + "epoch": 1.7407407407407407, + "grad_norm": 14.445930480957031, + "learning_rate": 9.375966688597572e-07, + "loss": 0.8258605003356934, + "step": 2068 + }, + { + "epoch": 1.7424242424242424, + "grad_norm": 5.176375389099121, + "learning_rate": 9.359538065032586e-07, + "loss": 0.7047204971313477, + "step": 2070 + }, + { + "epoch": 1.7441077441077442, + "grad_norm": 9.773624420166016, + "learning_rate": 9.343115093549203e-07, + "loss": 0.6722849011421204, + "step": 2072 + }, + { + "epoch": 1.7457912457912457, + "grad_norm": 3.369567394256592, + "learning_rate": 9.32669783073147e-07, + "loss": 0.49055272340774536, + "step": 2074 + }, + { + "epoch": 1.7474747474747474, + "grad_norm": 16.458398818969727, + "learning_rate": 9.310286333143767e-07, + "loss": 1.0591087341308594, + "step": 2076 + }, + { + "epoch": 1.7491582491582491, + "grad_norm": 3.6667587757110596, + "learning_rate": 9.293880657330604e-07, + "loss": 0.8024224042892456, + "step": 2078 + }, + { + "epoch": 1.7508417508417509, + "grad_norm": 3.5527923107147217, + "learning_rate": 9.277480859816444e-07, + "loss": 0.9343531131744385, + "step": 2080 + }, + { + "epoch": 1.7525252525252526, + "grad_norm": 4.238471984863281, + "learning_rate": 9.261086997105487e-07, + "loss": 0.6490952968597412, + "step": 2082 + }, + { + "epoch": 1.7542087542087543, + "grad_norm": 2.784026861190796, + "learning_rate": 9.244699125681485e-07, + "loss": 1.1208921670913696, + "step": 2084 + }, + { + "epoch": 1.7558922558922558, + "grad_norm": 3.683945655822754, + "learning_rate": 9.228317302007556e-07, + "loss": 0.788274884223938, + "step": 2086 + }, + { + "epoch": 1.7575757575757576, + "grad_norm": 8.775335311889648, + "learning_rate": 9.211941582525968e-07, + "loss": 0.4447941184043884, + "step": 2088 + }, + { + "epoch": 1.7592592592592593, + "grad_norm": 35.036190032958984, + "learning_rate": 9.195572023657969e-07, + "loss": 0.5342724323272705, + "step": 2090 + }, + { + "epoch": 1.7609427609427608, + "grad_norm": 11.131832122802734, + "learning_rate": 9.179208681803579e-07, + "loss": 0.535330057144165, + "step": 2092 + }, + { + "epoch": 1.7626262626262625, + "grad_norm": 4.160572052001953, + "learning_rate": 9.162851613341389e-07, + "loss": 0.3984565734863281, + "step": 2094 + }, + { + "epoch": 1.7643097643097643, + "grad_norm": 3.6985437870025635, + "learning_rate": 9.146500874628391e-07, + "loss": 0.6421704292297363, + "step": 2096 + }, + { + "epoch": 1.765993265993266, + "grad_norm": 2.077662467956543, + "learning_rate": 9.130156521999757e-07, + "loss": 1.0149686336517334, + "step": 2098 + }, + { + "epoch": 1.7676767676767677, + "grad_norm": 2.065174102783203, + "learning_rate": 9.113818611768654e-07, + "loss": 0.8843855857849121, + "step": 2100 + }, + { + "epoch": 1.7693602693602695, + "grad_norm": 2.7010414600372314, + "learning_rate": 9.097487200226059e-07, + "loss": 0.8571631908416748, + "step": 2102 + }, + { + "epoch": 1.7710437710437712, + "grad_norm": 9.685044288635254, + "learning_rate": 9.081162343640561e-07, + "loss": 0.5381686687469482, + "step": 2104 + }, + { + "epoch": 1.7727272727272727, + "grad_norm": 3.8229737281799316, + "learning_rate": 9.064844098258153e-07, + "loss": 0.6796019077301025, + "step": 2106 + }, + { + "epoch": 1.7744107744107744, + "grad_norm": 6.055543899536133, + "learning_rate": 9.048532520302061e-07, + "loss": 0.8706216812133789, + "step": 2108 + }, + { + "epoch": 1.776094276094276, + "grad_norm": 7.083333969116211, + "learning_rate": 9.032227665972534e-07, + "loss": 0.5699350237846375, + "step": 2110 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 2.0101730823516846, + "learning_rate": 9.015929591446651e-07, + "loss": 0.8485995531082153, + "step": 2112 + }, + { + "epoch": 1.7794612794612794, + "grad_norm": 2.6497552394866943, + "learning_rate": 8.999638352878142e-07, + "loss": 0.8866308927536011, + "step": 2114 + }, + { + "epoch": 1.7811447811447811, + "grad_norm": 2.9094290733337402, + "learning_rate": 8.983354006397177e-07, + "loss": 0.9138184785842896, + "step": 2116 + }, + { + "epoch": 1.7828282828282829, + "grad_norm": 2.6958985328674316, + "learning_rate": 8.96707660811018e-07, + "loss": 0.9850746989250183, + "step": 2118 + }, + { + "epoch": 1.7845117845117846, + "grad_norm": 21.947837829589844, + "learning_rate": 8.950806214099638e-07, + "loss": 0.6375728249549866, + "step": 2120 + }, + { + "epoch": 1.7861952861952863, + "grad_norm": 4.920895099639893, + "learning_rate": 8.934542880423903e-07, + "loss": 0.5961431860923767, + "step": 2122 + }, + { + "epoch": 1.7878787878787878, + "grad_norm": 2.1890132427215576, + "learning_rate": 8.918286663117005e-07, + "loss": 0.659866452217102, + "step": 2124 + }, + { + "epoch": 1.7895622895622896, + "grad_norm": 10.51028823852539, + "learning_rate": 8.902037618188449e-07, + "loss": 0.6706059575080872, + "step": 2126 + }, + { + "epoch": 1.791245791245791, + "grad_norm": 6.073541164398193, + "learning_rate": 8.885795801623035e-07, + "loss": 0.6864989995956421, + "step": 2128 + }, + { + "epoch": 1.7929292929292928, + "grad_norm": 19.274333953857422, + "learning_rate": 8.869561269380652e-07, + "loss": 0.674058198928833, + "step": 2130 + }, + { + "epoch": 1.7946127946127945, + "grad_norm": 3.4625072479248047, + "learning_rate": 8.853334077396098e-07, + "loss": 0.5736150741577148, + "step": 2132 + }, + { + "epoch": 1.7962962962962963, + "grad_norm": 1.9551900625228882, + "learning_rate": 8.837114281578872e-07, + "loss": 0.6773728728294373, + "step": 2134 + }, + { + "epoch": 1.797979797979798, + "grad_norm": 6.7064208984375, + "learning_rate": 8.820901937813003e-07, + "loss": 0.347098171710968, + "step": 2136 + }, + { + "epoch": 1.7996632996632997, + "grad_norm": 1.6629834175109863, + "learning_rate": 8.804697101956828e-07, + "loss": 0.9595216512680054, + "step": 2138 + }, + { + "epoch": 1.8013468013468015, + "grad_norm": 3.6944870948791504, + "learning_rate": 8.78849982984283e-07, + "loss": 0.7999200820922852, + "step": 2140 + }, + { + "epoch": 1.803030303030303, + "grad_norm": 3.7662339210510254, + "learning_rate": 8.772310177277427e-07, + "loss": 0.7555183172225952, + "step": 2142 + }, + { + "epoch": 1.8047138047138047, + "grad_norm": 2.7332985401153564, + "learning_rate": 8.756128200040782e-07, + "loss": 0.7414171099662781, + "step": 2144 + }, + { + "epoch": 1.8063973063973064, + "grad_norm": 5.167442798614502, + "learning_rate": 8.739953953886614e-07, + "loss": 0.904849112033844, + "step": 2146 + }, + { + "epoch": 1.808080808080808, + "grad_norm": 7.448000907897949, + "learning_rate": 8.72378749454201e-07, + "loss": 0.8806520104408264, + "step": 2148 + }, + { + "epoch": 1.8097643097643097, + "grad_norm": 2.8185012340545654, + "learning_rate": 8.707628877707221e-07, + "loss": 0.9877094030380249, + "step": 2150 + }, + { + "epoch": 1.8114478114478114, + "grad_norm": 3.56538987159729, + "learning_rate": 8.691478159055483e-07, + "loss": 0.9566267728805542, + "step": 2152 + }, + { + "epoch": 1.8131313131313131, + "grad_norm": 6.516078472137451, + "learning_rate": 8.675335394232819e-07, + "loss": 0.8102941513061523, + "step": 2154 + }, + { + "epoch": 1.8148148148148149, + "grad_norm": 5.387680530548096, + "learning_rate": 8.659200638857845e-07, + "loss": 0.655036449432373, + "step": 2156 + }, + { + "epoch": 1.8164983164983166, + "grad_norm": 4.142063140869141, + "learning_rate": 8.643073948521576e-07, + "loss": 0.44311749935150146, + "step": 2158 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 1.9489187002182007, + "learning_rate": 8.626955378787256e-07, + "loss": 0.8758860230445862, + "step": 2160 + }, + { + "epoch": 1.8198653198653199, + "grad_norm": 8.752238273620605, + "learning_rate": 8.610844985190127e-07, + "loss": 0.7219128608703613, + "step": 2162 + }, + { + "epoch": 1.8215488215488216, + "grad_norm": 8.243671417236328, + "learning_rate": 8.594742823237287e-07, + "loss": 0.8195970058441162, + "step": 2164 + }, + { + "epoch": 1.823232323232323, + "grad_norm": 8.444494247436523, + "learning_rate": 8.578648948407452e-07, + "loss": 0.9344632625579834, + "step": 2166 + }, + { + "epoch": 1.8249158249158248, + "grad_norm": 1.946562647819519, + "learning_rate": 8.562563416150794e-07, + "loss": 0.8328951597213745, + "step": 2168 + }, + { + "epoch": 1.8265993265993266, + "grad_norm": 4.5011749267578125, + "learning_rate": 8.546486281888739e-07, + "loss": 0.5535922050476074, + "step": 2170 + }, + { + "epoch": 1.8282828282828283, + "grad_norm": 10.435762405395508, + "learning_rate": 8.53041760101378e-07, + "loss": 0.733657956123352, + "step": 2172 + }, + { + "epoch": 1.82996632996633, + "grad_norm": 6.214064121246338, + "learning_rate": 8.51435742888928e-07, + "loss": 0.40798521041870117, + "step": 2174 + }, + { + "epoch": 1.8316498316498318, + "grad_norm": 4.490242958068848, + "learning_rate": 8.498305820849296e-07, + "loss": 0.45203477144241333, + "step": 2176 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 6.816056251525879, + "learning_rate": 8.482262832198365e-07, + "loss": 0.6513058543205261, + "step": 2178 + }, + { + "epoch": 1.835016835016835, + "grad_norm": 2.1644816398620605, + "learning_rate": 8.46622851821134e-07, + "loss": 0.7746816277503967, + "step": 2180 + }, + { + "epoch": 1.8367003367003367, + "grad_norm": 11.113990783691406, + "learning_rate": 8.450202934133174e-07, + "loss": 0.4632836580276489, + "step": 2182 + }, + { + "epoch": 1.8383838383838382, + "grad_norm": 4.4734086990356445, + "learning_rate": 8.434186135178749e-07, + "loss": 0.899796724319458, + "step": 2184 + }, + { + "epoch": 1.84006734006734, + "grad_norm": 2.3766531944274902, + "learning_rate": 8.418178176532674e-07, + "loss": 0.90257328748703, + "step": 2186 + }, + { + "epoch": 1.8417508417508417, + "grad_norm": 13.302746772766113, + "learning_rate": 8.402179113349106e-07, + "loss": 0.8778829574584961, + "step": 2188 + }, + { + "epoch": 1.8434343434343434, + "grad_norm": 10.324798583984375, + "learning_rate": 8.386189000751544e-07, + "loss": 0.5610869526863098, + "step": 2190 + }, + { + "epoch": 1.8451178451178452, + "grad_norm": 3.937783718109131, + "learning_rate": 8.370207893832661e-07, + "loss": 0.7988660335540771, + "step": 2192 + }, + { + "epoch": 1.8468013468013469, + "grad_norm": 7.830168724060059, + "learning_rate": 8.354235847654092e-07, + "loss": 0.6106054782867432, + "step": 2194 + }, + { + "epoch": 1.8484848484848486, + "grad_norm": 7.153279781341553, + "learning_rate": 8.338272917246252e-07, + "loss": 0.7764344215393066, + "step": 2196 + }, + { + "epoch": 1.8501683501683501, + "grad_norm": 6.39476203918457, + "learning_rate": 8.322319157608158e-07, + "loss": 0.48035871982574463, + "step": 2198 + }, + { + "epoch": 1.8518518518518519, + "grad_norm": 7.486396312713623, + "learning_rate": 8.306374623707222e-07, + "loss": 0.9800804853439331, + "step": 2200 + }, + { + "epoch": 1.8535353535353534, + "grad_norm": 3.6824681758880615, + "learning_rate": 8.29043937047907e-07, + "loss": 0.7192468643188477, + "step": 2202 + }, + { + "epoch": 1.855218855218855, + "grad_norm": 6.612771987915039, + "learning_rate": 8.274513452827361e-07, + "loss": 0.5936028957366943, + "step": 2204 + }, + { + "epoch": 1.8569023569023568, + "grad_norm": 3.079265832901001, + "learning_rate": 8.258596925623578e-07, + "loss": 0.9140318632125854, + "step": 2206 + }, + { + "epoch": 1.8585858585858586, + "grad_norm": 10.242953300476074, + "learning_rate": 8.242689843706852e-07, + "loss": 0.713873028755188, + "step": 2208 + }, + { + "epoch": 1.8602693602693603, + "grad_norm": 26.58353042602539, + "learning_rate": 8.226792261883777e-07, + "loss": 0.29191094636917114, + "step": 2210 + }, + { + "epoch": 1.861952861952862, + "grad_norm": 6.435546398162842, + "learning_rate": 8.210904234928213e-07, + "loss": 0.8298804759979248, + "step": 2212 + }, + { + "epoch": 1.8636363636363638, + "grad_norm": 2.913339853286743, + "learning_rate": 8.195025817581092e-07, + "loss": 1.0796676874160767, + "step": 2214 + }, + { + "epoch": 1.8653198653198653, + "grad_norm": 42.50606155395508, + "learning_rate": 8.179157064550246e-07, + "loss": 0.3906444311141968, + "step": 2216 + }, + { + "epoch": 1.867003367003367, + "grad_norm": 13.17294692993164, + "learning_rate": 8.163298030510208e-07, + "loss": 0.5464171171188354, + "step": 2218 + }, + { + "epoch": 1.8686868686868687, + "grad_norm": 17.247772216796875, + "learning_rate": 8.147448770102019e-07, + "loss": 0.48076120018959045, + "step": 2220 + }, + { + "epoch": 1.8703703703703702, + "grad_norm": 5.142391681671143, + "learning_rate": 8.131609337933054e-07, + "loss": 0.6968168616294861, + "step": 2222 + }, + { + "epoch": 1.872053872053872, + "grad_norm": 4.890412330627441, + "learning_rate": 8.115779788576818e-07, + "loss": 0.9484931230545044, + "step": 2224 + }, + { + "epoch": 1.8737373737373737, + "grad_norm": 4.0591044425964355, + "learning_rate": 8.099960176572768e-07, + "loss": 0.5798113346099854, + "step": 2226 + }, + { + "epoch": 1.8754208754208754, + "grad_norm": 16.09890365600586, + "learning_rate": 8.08415055642613e-07, + "loss": 0.35563382506370544, + "step": 2228 + }, + { + "epoch": 1.8771043771043772, + "grad_norm": 6.097412109375, + "learning_rate": 8.068350982607693e-07, + "loss": 1.0293006896972656, + "step": 2230 + }, + { + "epoch": 1.878787878787879, + "grad_norm": 3.246103525161743, + "learning_rate": 8.052561509553633e-07, + "loss": 0.9102228879928589, + "step": 2232 + }, + { + "epoch": 1.8804713804713806, + "grad_norm": 6.635921001434326, + "learning_rate": 8.03678219166533e-07, + "loss": 0.515903115272522, + "step": 2234 + }, + { + "epoch": 1.8821548821548821, + "grad_norm": 5.258808135986328, + "learning_rate": 8.021013083309181e-07, + "loss": 0.7250782251358032, + "step": 2236 + }, + { + "epoch": 1.8838383838383839, + "grad_norm": 27.69781494140625, + "learning_rate": 8.005254238816392e-07, + "loss": 0.9729253053665161, + "step": 2238 + }, + { + "epoch": 1.8855218855218854, + "grad_norm": 2.754936933517456, + "learning_rate": 7.989505712482814e-07, + "loss": 1.1490654945373535, + "step": 2240 + }, + { + "epoch": 1.887205387205387, + "grad_norm": 1.9234169721603394, + "learning_rate": 7.973767558568749e-07, + "loss": 0.9823436737060547, + "step": 2242 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 2.8880441188812256, + "learning_rate": 7.95803983129876e-07, + "loss": 0.8976832032203674, + "step": 2244 + }, + { + "epoch": 1.8905723905723906, + "grad_norm": 4.514529228210449, + "learning_rate": 7.942322584861476e-07, + "loss": 0.9340039491653442, + "step": 2246 + }, + { + "epoch": 1.8922558922558923, + "grad_norm": 7.478911876678467, + "learning_rate": 7.926615873409435e-07, + "loss": 0.8636904954910278, + "step": 2248 + }, + { + "epoch": 1.893939393939394, + "grad_norm": 2.7240192890167236, + "learning_rate": 7.910919751058863e-07, + "loss": 0.9821701049804688, + "step": 2250 + }, + { + "epoch": 1.8956228956228958, + "grad_norm": 2.6539080142974854, + "learning_rate": 7.895234271889502e-07, + "loss": 1.1389049291610718, + "step": 2252 + }, + { + "epoch": 1.8973063973063973, + "grad_norm": 2.555716037750244, + "learning_rate": 7.879559489944431e-07, + "loss": 0.8757186532020569, + "step": 2254 + }, + { + "epoch": 1.898989898989899, + "grad_norm": 3.2359490394592285, + "learning_rate": 7.86389545922987e-07, + "loss": 0.7967367172241211, + "step": 2256 + }, + { + "epoch": 1.9006734006734005, + "grad_norm": 2.5815160274505615, + "learning_rate": 7.848242233714992e-07, + "loss": 0.9813891649246216, + "step": 2258 + }, + { + "epoch": 1.9023569023569022, + "grad_norm": 5.316218852996826, + "learning_rate": 7.832599867331751e-07, + "loss": 0.6991989612579346, + "step": 2260 + }, + { + "epoch": 1.904040404040404, + "grad_norm": 3.514714241027832, + "learning_rate": 7.816968413974676e-07, + "loss": 0.7938976883888245, + "step": 2262 + }, + { + "epoch": 1.9057239057239057, + "grad_norm": 6.5592474937438965, + "learning_rate": 7.801347927500701e-07, + "loss": 0.46941909193992615, + "step": 2264 + }, + { + "epoch": 1.9074074074074074, + "grad_norm": 11.761022567749023, + "learning_rate": 7.785738461728975e-07, + "loss": 0.7285200953483582, + "step": 2266 + }, + { + "epoch": 1.9090909090909092, + "grad_norm": 7.991189002990723, + "learning_rate": 7.770140070440679e-07, + "loss": 0.6555970907211304, + "step": 2268 + }, + { + "epoch": 1.910774410774411, + "grad_norm": 4.922752857208252, + "learning_rate": 7.754552807378827e-07, + "loss": 0.7720062136650085, + "step": 2270 + }, + { + "epoch": 1.9124579124579124, + "grad_norm": 2.78389573097229, + "learning_rate": 7.738976726248105e-07, + "loss": 0.8745548725128174, + "step": 2272 + }, + { + "epoch": 1.9141414141414141, + "grad_norm": 10.283120155334473, + "learning_rate": 7.723411880714663e-07, + "loss": 0.7076643705368042, + "step": 2274 + }, + { + "epoch": 1.9158249158249159, + "grad_norm": 13.527719497680664, + "learning_rate": 7.707858324405945e-07, + "loss": 0.8855887651443481, + "step": 2276 + }, + { + "epoch": 1.9175084175084174, + "grad_norm": 13.780444145202637, + "learning_rate": 7.692316110910495e-07, + "loss": 0.5699777603149414, + "step": 2278 + }, + { + "epoch": 1.9191919191919191, + "grad_norm": 7.046093940734863, + "learning_rate": 7.676785293777779e-07, + "loss": 0.20726297795772552, + "step": 2280 + }, + { + "epoch": 1.9208754208754208, + "grad_norm": 5.450412750244141, + "learning_rate": 7.661265926517997e-07, + "loss": 0.960862398147583, + "step": 2282 + }, + { + "epoch": 1.9225589225589226, + "grad_norm": 13.540059089660645, + "learning_rate": 7.6457580626019e-07, + "loss": 0.44127357006073, + "step": 2284 + }, + { + "epoch": 1.9242424242424243, + "grad_norm": 5.831504821777344, + "learning_rate": 7.630261755460598e-07, + "loss": 0.5103174448013306, + "step": 2286 + }, + { + "epoch": 1.925925925925926, + "grad_norm": 7.158233165740967, + "learning_rate": 7.614777058485398e-07, + "loss": 0.9973621368408203, + "step": 2288 + }, + { + "epoch": 1.9276094276094278, + "grad_norm": 3.2046473026275635, + "learning_rate": 7.59930402502759e-07, + "loss": 0.6976436972618103, + "step": 2290 + }, + { + "epoch": 1.9292929292929293, + "grad_norm": 9.439109802246094, + "learning_rate": 7.58384270839829e-07, + "loss": 0.4523466229438782, + "step": 2292 + }, + { + "epoch": 1.930976430976431, + "grad_norm": 6.197632789611816, + "learning_rate": 7.568393161868234e-07, + "loss": 0.9106472134590149, + "step": 2294 + }, + { + "epoch": 1.9326599326599325, + "grad_norm": 8.470841407775879, + "learning_rate": 7.552955438667612e-07, + "loss": 0.7909121513366699, + "step": 2296 + }, + { + "epoch": 1.9343434343434343, + "grad_norm": 3.3162317276000977, + "learning_rate": 7.537529591985879e-07, + "loss": 0.7960456609725952, + "step": 2298 + }, + { + "epoch": 1.936026936026936, + "grad_norm": 7.409903526306152, + "learning_rate": 7.522115674971564e-07, + "loss": 0.6709874868392944, + "step": 2300 + }, + { + "epoch": 1.9377104377104377, + "grad_norm": 8.22396183013916, + "learning_rate": 7.506713740732098e-07, + "loss": 1.1500425338745117, + "step": 2302 + }, + { + "epoch": 1.9393939393939394, + "grad_norm": 3.9755733013153076, + "learning_rate": 7.491323842333626e-07, + "loss": 0.9240370988845825, + "step": 2304 + }, + { + "epoch": 1.9410774410774412, + "grad_norm": 7.245258331298828, + "learning_rate": 7.47594603280082e-07, + "loss": 0.30636048316955566, + "step": 2306 + }, + { + "epoch": 1.942760942760943, + "grad_norm": 4.102907180786133, + "learning_rate": 7.460580365116704e-07, + "loss": 0.8063202500343323, + "step": 2308 + }, + { + "epoch": 1.9444444444444444, + "grad_norm": 2.798117160797119, + "learning_rate": 7.445226892222476e-07, + "loss": 1.042150855064392, + "step": 2310 + }, + { + "epoch": 1.9461279461279462, + "grad_norm": 11.515227317810059, + "learning_rate": 7.429885667017301e-07, + "loss": 0.9472934603691101, + "step": 2312 + }, + { + "epoch": 1.9478114478114477, + "grad_norm": 5.401071548461914, + "learning_rate": 7.41455674235816e-07, + "loss": 0.9147957563400269, + "step": 2314 + }, + { + "epoch": 1.9494949494949494, + "grad_norm": 3.730478048324585, + "learning_rate": 7.399240171059649e-07, + "loss": 0.7157914638519287, + "step": 2316 + }, + { + "epoch": 1.9511784511784511, + "grad_norm": 4.426076889038086, + "learning_rate": 7.383936005893798e-07, + "loss": 0.8011871576309204, + "step": 2318 + }, + { + "epoch": 1.9528619528619529, + "grad_norm": 6.439156532287598, + "learning_rate": 7.368644299589894e-07, + "loss": 0.8518431186676025, + "step": 2320 + }, + { + "epoch": 1.9545454545454546, + "grad_norm": 2.613004446029663, + "learning_rate": 7.353365104834304e-07, + "loss": 0.936795711517334, + "step": 2322 + }, + { + "epoch": 1.9562289562289563, + "grad_norm": 6.956838130950928, + "learning_rate": 7.338098474270277e-07, + "loss": 0.7357702851295471, + "step": 2324 + }, + { + "epoch": 1.957912457912458, + "grad_norm": 13.74077320098877, + "learning_rate": 7.322844460497783e-07, + "loss": 0.5305231809616089, + "step": 2326 + }, + { + "epoch": 1.9595959595959596, + "grad_norm": 2.220991373062134, + "learning_rate": 7.307603116073317e-07, + "loss": 0.9905499219894409, + "step": 2328 + }, + { + "epoch": 1.9612794612794613, + "grad_norm": 1.9964042901992798, + "learning_rate": 7.292374493509725e-07, + "loss": 1.0259349346160889, + "step": 2330 + }, + { + "epoch": 1.9629629629629628, + "grad_norm": 3.4638054370880127, + "learning_rate": 7.277158645276014e-07, + "loss": 0.9553219079971313, + "step": 2332 + }, + { + "epoch": 1.9646464646464645, + "grad_norm": 2.130671977996826, + "learning_rate": 7.261955623797189e-07, + "loss": 0.9786357283592224, + "step": 2334 + }, + { + "epoch": 1.9663299663299663, + "grad_norm": 2.262347459793091, + "learning_rate": 7.246765481454056e-07, + "loss": 0.8999519348144531, + "step": 2336 + }, + { + "epoch": 1.968013468013468, + "grad_norm": 8.985565185546875, + "learning_rate": 7.23158827058304e-07, + "loss": 1.0301485061645508, + "step": 2338 + }, + { + "epoch": 1.9696969696969697, + "grad_norm": 15.289015769958496, + "learning_rate": 7.216424043476022e-07, + "loss": 0.4213113784790039, + "step": 2340 + }, + { + "epoch": 1.9713804713804715, + "grad_norm": 5.483232498168945, + "learning_rate": 7.20127285238015e-07, + "loss": 0.6755249500274658, + "step": 2342 + }, + { + "epoch": 1.9730639730639732, + "grad_norm": 5.321086883544922, + "learning_rate": 7.186134749497645e-07, + "loss": 0.5112136602401733, + "step": 2344 + }, + { + "epoch": 1.9747474747474747, + "grad_norm": 6.330574989318848, + "learning_rate": 7.171009786985642e-07, + "loss": 0.7962218523025513, + "step": 2346 + }, + { + "epoch": 1.9764309764309764, + "grad_norm": 7.868488788604736, + "learning_rate": 7.155898016956008e-07, + "loss": 0.6971943378448486, + "step": 2348 + }, + { + "epoch": 1.9781144781144782, + "grad_norm": 51.063167572021484, + "learning_rate": 7.14079949147514e-07, + "loss": 0.6931584477424622, + "step": 2350 + }, + { + "epoch": 1.9797979797979797, + "grad_norm": 5.527878761291504, + "learning_rate": 7.125714262563814e-07, + "loss": 0.6461153030395508, + "step": 2352 + }, + { + "epoch": 1.9814814814814814, + "grad_norm": 3.8143720626831055, + "learning_rate": 7.110642382196996e-07, + "loss": 0.4134939908981323, + "step": 2354 + }, + { + "epoch": 1.9831649831649831, + "grad_norm": 2.772143840789795, + "learning_rate": 7.095583902303648e-07, + "loss": 1.014623999595642, + "step": 2356 + }, + { + "epoch": 1.9848484848484849, + "grad_norm": 2.1666996479034424, + "learning_rate": 7.080538874766573e-07, + "loss": 0.8629425764083862, + "step": 2358 + }, + { + "epoch": 1.9865319865319866, + "grad_norm": 1.9438031911849976, + "learning_rate": 7.06550735142222e-07, + "loss": 0.8896007537841797, + "step": 2360 + }, + { + "epoch": 1.9882154882154883, + "grad_norm": 6.1856369972229, + "learning_rate": 7.050489384060512e-07, + "loss": 0.6207383275032043, + "step": 2362 + }, + { + "epoch": 1.98989898989899, + "grad_norm": 2.3403923511505127, + "learning_rate": 7.035485024424666e-07, + "loss": 0.912721574306488, + "step": 2364 + }, + { + "epoch": 1.9915824915824916, + "grad_norm": 11.149336814880371, + "learning_rate": 7.020494324211017e-07, + "loss": 0.8143168687820435, + "step": 2366 + }, + { + "epoch": 1.9932659932659933, + "grad_norm": 2.9151461124420166, + "learning_rate": 7.005517335068827e-07, + "loss": 0.9495657682418823, + "step": 2368 + }, + { + "epoch": 1.9949494949494948, + "grad_norm": 1.9637680053710938, + "learning_rate": 6.99055410860013e-07, + "loss": 0.26862990856170654, + "step": 2370 + }, + { + "epoch": 1.9966329966329965, + "grad_norm": 17.319799423217773, + "learning_rate": 6.975604696359542e-07, + "loss": 0.5134755969047546, + "step": 2372 + }, + { + "epoch": 1.9983164983164983, + "grad_norm": 5.046746730804443, + "learning_rate": 6.960669149854068e-07, + "loss": 0.8662137985229492, + "step": 2374 + }, + { + "epoch": 2.0, + "grad_norm": 3.1785898208618164, + "learning_rate": 6.945747520542955e-07, + "loss": 0.8281479477882385, + "step": 2376 + }, + { + "epoch": 2.0016835016835017, + "grad_norm": 15.919424057006836, + "learning_rate": 6.930839859837496e-07, + "loss": 0.5921661853790283, + "step": 2378 + }, + { + "epoch": 2.0033670033670035, + "grad_norm": 13.652657508850098, + "learning_rate": 6.915946219100852e-07, + "loss": 1.0555100440979004, + "step": 2380 + }, + { + "epoch": 2.005050505050505, + "grad_norm": 5.170054912567139, + "learning_rate": 6.901066649647887e-07, + "loss": 0.6134198904037476, + "step": 2382 + }, + { + "epoch": 2.006734006734007, + "grad_norm": 3.480863094329834, + "learning_rate": 6.886201202744972e-07, + "loss": 0.48556286096572876, + "step": 2384 + }, + { + "epoch": 2.008417508417508, + "grad_norm": 1.9658989906311035, + "learning_rate": 6.871349929609826e-07, + "loss": 0.6283817291259766, + "step": 2386 + }, + { + "epoch": 2.01010101010101, + "grad_norm": 3.805121421813965, + "learning_rate": 6.856512881411343e-07, + "loss": 0.7825635671615601, + "step": 2388 + }, + { + "epoch": 2.0117845117845117, + "grad_norm": 3.4738574028015137, + "learning_rate": 6.841690109269386e-07, + "loss": 0.9271956086158752, + "step": 2390 + }, + { + "epoch": 2.0134680134680134, + "grad_norm": 6.440873622894287, + "learning_rate": 6.826881664254646e-07, + "loss": 0.6064585447311401, + "step": 2392 + }, + { + "epoch": 2.015151515151515, + "grad_norm": 5.510295391082764, + "learning_rate": 6.812087597388452e-07, + "loss": 0.610366940498352, + "step": 2394 + }, + { + "epoch": 2.016835016835017, + "grad_norm": 2.200218439102173, + "learning_rate": 6.79730795964258e-07, + "loss": 0.7530055046081543, + "step": 2396 + }, + { + "epoch": 2.0185185185185186, + "grad_norm": 9.030868530273438, + "learning_rate": 6.782542801939105e-07, + "loss": 0.7531571388244629, + "step": 2398 + }, + { + "epoch": 2.0202020202020203, + "grad_norm": 3.04939866065979, + "learning_rate": 6.767792175150211e-07, + "loss": 0.4959731698036194, + "step": 2400 + }, + { + "epoch": 2.021885521885522, + "grad_norm": 10.346657752990723, + "learning_rate": 6.753056130098009e-07, + "loss": 0.31336265802383423, + "step": 2402 + }, + { + "epoch": 2.0235690235690234, + "grad_norm": 2.892493486404419, + "learning_rate": 6.738334717554373e-07, + "loss": 0.7610318660736084, + "step": 2404 + }, + { + "epoch": 2.025252525252525, + "grad_norm": 13.560941696166992, + "learning_rate": 6.723627988240772e-07, + "loss": 0.6177215576171875, + "step": 2406 + }, + { + "epoch": 2.026936026936027, + "grad_norm": 2.220264196395874, + "learning_rate": 6.708935992828068e-07, + "loss": 0.6627448797225952, + "step": 2408 + }, + { + "epoch": 2.0286195286195285, + "grad_norm": 4.267128944396973, + "learning_rate": 6.694258781936369e-07, + "loss": 0.664837121963501, + "step": 2410 + }, + { + "epoch": 2.0303030303030303, + "grad_norm": 3.3940136432647705, + "learning_rate": 6.679596406134844e-07, + "loss": 0.8382737636566162, + "step": 2412 + }, + { + "epoch": 2.031986531986532, + "grad_norm": 2.147282838821411, + "learning_rate": 6.664948915941546e-07, + "loss": 0.5983447432518005, + "step": 2414 + }, + { + "epoch": 2.0336700336700337, + "grad_norm": 2.9526758193969727, + "learning_rate": 6.65031636182324e-07, + "loss": 0.8206237554550171, + "step": 2416 + }, + { + "epoch": 2.0353535353535355, + "grad_norm": 15.74380874633789, + "learning_rate": 6.635698794195237e-07, + "loss": 0.5850080847740173, + "step": 2418 + }, + { + "epoch": 2.037037037037037, + "grad_norm": 63.14246368408203, + "learning_rate": 6.621096263421202e-07, + "loss": 0.4533715844154358, + "step": 2420 + }, + { + "epoch": 2.038720538720539, + "grad_norm": 3.8994693756103516, + "learning_rate": 6.606508819813001e-07, + "loss": 0.7626893520355225, + "step": 2422 + }, + { + "epoch": 2.04040404040404, + "grad_norm": 3.742114543914795, + "learning_rate": 6.591936513630514e-07, + "loss": 0.17822477221488953, + "step": 2424 + }, + { + "epoch": 2.042087542087542, + "grad_norm": 12.309547424316406, + "learning_rate": 6.577379395081466e-07, + "loss": 0.38434261083602905, + "step": 2426 + }, + { + "epoch": 2.0437710437710437, + "grad_norm": 3.1989083290100098, + "learning_rate": 6.562837514321258e-07, + "loss": 0.5980604290962219, + "step": 2428 + }, + { + "epoch": 2.0454545454545454, + "grad_norm": 32.80799865722656, + "learning_rate": 6.548310921452784e-07, + "loss": 0.716747522354126, + "step": 2430 + }, + { + "epoch": 2.047138047138047, + "grad_norm": 4.070531368255615, + "learning_rate": 6.533799666526275e-07, + "loss": 0.6677117347717285, + "step": 2432 + }, + { + "epoch": 2.048821548821549, + "grad_norm": 4.801085472106934, + "learning_rate": 6.519303799539104e-07, + "loss": 0.7861591577529907, + "step": 2434 + }, + { + "epoch": 2.0505050505050506, + "grad_norm": 3.876065731048584, + "learning_rate": 6.504823370435633e-07, + "loss": 1.105973720550537, + "step": 2436 + }, + { + "epoch": 2.0521885521885523, + "grad_norm": 2.630798578262329, + "learning_rate": 6.490358429107038e-07, + "loss": 0.6676466464996338, + "step": 2438 + }, + { + "epoch": 2.053872053872054, + "grad_norm": 3.058680534362793, + "learning_rate": 6.47590902539112e-07, + "loss": 0.824833869934082, + "step": 2440 + }, + { + "epoch": 2.0555555555555554, + "grad_norm": 5.962945461273193, + "learning_rate": 6.461475209072161e-07, + "loss": 0.7032083868980408, + "step": 2442 + }, + { + "epoch": 2.057239057239057, + "grad_norm": 2.236006021499634, + "learning_rate": 6.44705702988073e-07, + "loss": 0.7378408908843994, + "step": 2444 + }, + { + "epoch": 2.058922558922559, + "grad_norm": 5.968637943267822, + "learning_rate": 6.432654537493518e-07, + "loss": 0.9346398115158081, + "step": 2446 + }, + { + "epoch": 2.0606060606060606, + "grad_norm": 2.306854009628296, + "learning_rate": 6.418267781533173e-07, + "loss": 0.7191810607910156, + "step": 2448 + }, + { + "epoch": 2.0622895622895623, + "grad_norm": 9.214242935180664, + "learning_rate": 6.403896811568124e-07, + "loss": 0.760452389717102, + "step": 2450 + }, + { + "epoch": 2.063973063973064, + "grad_norm": 5.0180792808532715, + "learning_rate": 6.389541677112407e-07, + "loss": 0.8763862252235413, + "step": 2452 + }, + { + "epoch": 2.0656565656565657, + "grad_norm": 4.374032974243164, + "learning_rate": 6.375202427625505e-07, + "loss": 0.6157338619232178, + "step": 2454 + }, + { + "epoch": 2.0673400673400675, + "grad_norm": 5.687982082366943, + "learning_rate": 6.360879112512159e-07, + "loss": 0.7349066138267517, + "step": 2456 + }, + { + "epoch": 2.069023569023569, + "grad_norm": 2.7759313583374023, + "learning_rate": 6.346571781122218e-07, + "loss": 0.6915404796600342, + "step": 2458 + }, + { + "epoch": 2.0707070707070705, + "grad_norm": 7.065815448760986, + "learning_rate": 6.332280482750466e-07, + "loss": 0.561396062374115, + "step": 2460 + }, + { + "epoch": 2.0723905723905722, + "grad_norm": 16.879159927368164, + "learning_rate": 6.318005266636428e-07, + "loss": 0.5830413103103638, + "step": 2462 + }, + { + "epoch": 2.074074074074074, + "grad_norm": 9.45602798461914, + "learning_rate": 6.303746181964234e-07, + "loss": 0.6078395843505859, + "step": 2464 + }, + { + "epoch": 2.0757575757575757, + "grad_norm": 6.788721561431885, + "learning_rate": 6.289503277862438e-07, + "loss": 0.7341784238815308, + "step": 2466 + }, + { + "epoch": 2.0774410774410774, + "grad_norm": 2.810659408569336, + "learning_rate": 6.275276603403824e-07, + "loss": 0.5312877893447876, + "step": 2468 + }, + { + "epoch": 2.079124579124579, + "grad_norm": 5.600820541381836, + "learning_rate": 6.26106620760528e-07, + "loss": 0.961767315864563, + "step": 2470 + }, + { + "epoch": 2.080808080808081, + "grad_norm": 2.6611502170562744, + "learning_rate": 6.246872139427602e-07, + "loss": 0.9193134307861328, + "step": 2472 + }, + { + "epoch": 2.0824915824915826, + "grad_norm": 5.997580528259277, + "learning_rate": 6.232694447775316e-07, + "loss": 0.4731786549091339, + "step": 2474 + }, + { + "epoch": 2.0841750841750843, + "grad_norm": 6.405127048492432, + "learning_rate": 6.218533181496541e-07, + "loss": 0.57915198802948, + "step": 2476 + }, + { + "epoch": 2.0858585858585856, + "grad_norm": 2.907135486602783, + "learning_rate": 6.204388389382804e-07, + "loss": 0.8042079210281372, + "step": 2478 + }, + { + "epoch": 2.0875420875420874, + "grad_norm": 1.9114086627960205, + "learning_rate": 6.190260120168855e-07, + "loss": 0.6323788166046143, + "step": 2480 + }, + { + "epoch": 2.089225589225589, + "grad_norm": 1.966032862663269, + "learning_rate": 6.17614842253253e-07, + "loss": 0.594678521156311, + "step": 2482 + }, + { + "epoch": 2.090909090909091, + "grad_norm": 2.6483774185180664, + "learning_rate": 6.162053345094569e-07, + "loss": 0.9705860018730164, + "step": 2484 + }, + { + "epoch": 2.0925925925925926, + "grad_norm": 5.268326282501221, + "learning_rate": 6.147974936418436e-07, + "loss": 0.6276801228523254, + "step": 2486 + }, + { + "epoch": 2.0942760942760943, + "grad_norm": 2.3642375469207764, + "learning_rate": 6.133913245010181e-07, + "loss": 0.6014080047607422, + "step": 2488 + }, + { + "epoch": 2.095959595959596, + "grad_norm": 5.21682071685791, + "learning_rate": 6.119868319318244e-07, + "loss": 0.7621322870254517, + "step": 2490 + }, + { + "epoch": 2.0976430976430978, + "grad_norm": 3.3880903720855713, + "learning_rate": 6.105840207733302e-07, + "loss": 0.8144615888595581, + "step": 2492 + }, + { + "epoch": 2.0993265993265995, + "grad_norm": 2.8465569019317627, + "learning_rate": 6.091828958588101e-07, + "loss": 0.499761700630188, + "step": 2494 + }, + { + "epoch": 2.101010101010101, + "grad_norm": 4.338362216949463, + "learning_rate": 6.077834620157296e-07, + "loss": 0.9024825096130371, + "step": 2496 + }, + { + "epoch": 2.1026936026936025, + "grad_norm": 2.8251841068267822, + "learning_rate": 6.063857240657264e-07, + "loss": 0.4348450303077698, + "step": 2498 + }, + { + "epoch": 2.1043771043771042, + "grad_norm": 4.72477388381958, + "learning_rate": 6.049896868245962e-07, + "loss": 0.613303005695343, + "step": 2500 + }, + { + "epoch": 2.106060606060606, + "grad_norm": 2.4342687129974365, + "learning_rate": 6.035953551022748e-07, + "loss": 0.9862151145935059, + "step": 2502 + }, + { + "epoch": 2.1077441077441077, + "grad_norm": 11.250151634216309, + "learning_rate": 6.022027337028212e-07, + "loss": 0.7949624061584473, + "step": 2504 + }, + { + "epoch": 2.1094276094276094, + "grad_norm": 8.595945358276367, + "learning_rate": 6.008118274244025e-07, + "loss": 0.844199538230896, + "step": 2506 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 5.953275680541992, + "learning_rate": 5.994226410592762e-07, + "loss": 0.47989651560783386, + "step": 2508 + }, + { + "epoch": 2.112794612794613, + "grad_norm": 5.53914213180542, + "learning_rate": 5.980351793937734e-07, + "loss": 0.5320888757705688, + "step": 2510 + }, + { + "epoch": 2.1144781144781146, + "grad_norm": 19.394433975219727, + "learning_rate": 5.966494472082832e-07, + "loss": 0.7170990705490112, + "step": 2512 + }, + { + "epoch": 2.1161616161616164, + "grad_norm": 7.686086654663086, + "learning_rate": 5.952654492772369e-07, + "loss": 0.431751549243927, + "step": 2514 + }, + { + "epoch": 2.1178451178451176, + "grad_norm": 3.4454784393310547, + "learning_rate": 5.938831903690887e-07, + "loss": 0.840388834476471, + "step": 2516 + }, + { + "epoch": 2.1195286195286194, + "grad_norm": 4.63939094543457, + "learning_rate": 5.925026752463027e-07, + "loss": 0.17465031147003174, + "step": 2518 + }, + { + "epoch": 2.121212121212121, + "grad_norm": 23.637449264526367, + "learning_rate": 5.911239086653345e-07, + "loss": 0.3789297044277191, + "step": 2520 + }, + { + "epoch": 2.122895622895623, + "grad_norm": 4.005544185638428, + "learning_rate": 5.89746895376614e-07, + "loss": 0.20194318890571594, + "step": 2522 + }, + { + "epoch": 2.1245791245791246, + "grad_norm": 3.130404233932495, + "learning_rate": 5.883716401245329e-07, + "loss": 0.40525293350219727, + "step": 2524 + }, + { + "epoch": 2.1262626262626263, + "grad_norm": 14.998170852661133, + "learning_rate": 5.869981476474235e-07, + "loss": 0.2688121795654297, + "step": 2526 + }, + { + "epoch": 2.127946127946128, + "grad_norm": 18.963912963867188, + "learning_rate": 5.856264226775451e-07, + "loss": 0.3136770725250244, + "step": 2528 + }, + { + "epoch": 2.1296296296296298, + "grad_norm": 3.191150188446045, + "learning_rate": 5.842564699410676e-07, + "loss": 0.5626152753829956, + "step": 2530 + }, + { + "epoch": 2.1313131313131315, + "grad_norm": 3.6382803916931152, + "learning_rate": 5.828882941580548e-07, + "loss": 0.7779805660247803, + "step": 2532 + }, + { + "epoch": 2.1329966329966332, + "grad_norm": 3.3205113410949707, + "learning_rate": 5.815219000424475e-07, + "loss": 0.40261930227279663, + "step": 2534 + }, + { + "epoch": 2.1346801346801345, + "grad_norm": 5.565113544464111, + "learning_rate": 5.801572923020486e-07, + "loss": 0.6595053672790527, + "step": 2536 + }, + { + "epoch": 2.1363636363636362, + "grad_norm": 9.94298267364502, + "learning_rate": 5.787944756385061e-07, + "loss": 0.32748013734817505, + "step": 2538 + }, + { + "epoch": 2.138047138047138, + "grad_norm": 0.7888699173927307, + "learning_rate": 5.774334547472963e-07, + "loss": 0.34032267332077026, + "step": 2540 + }, + { + "epoch": 2.1397306397306397, + "grad_norm": 8.096704483032227, + "learning_rate": 5.760742343177091e-07, + "loss": 0.7002683281898499, + "step": 2542 + }, + { + "epoch": 2.1414141414141414, + "grad_norm": 3.1933655738830566, + "learning_rate": 5.747168190328313e-07, + "loss": 0.10309363156557083, + "step": 2544 + }, + { + "epoch": 2.143097643097643, + "grad_norm": 2.4028244018554688, + "learning_rate": 5.73361213569529e-07, + "loss": 0.323750376701355, + "step": 2546 + }, + { + "epoch": 2.144781144781145, + "grad_norm": 1.90052330493927, + "learning_rate": 5.720074225984335e-07, + "loss": 0.6766308546066284, + "step": 2548 + }, + { + "epoch": 2.1464646464646466, + "grad_norm": 2.1108572483062744, + "learning_rate": 5.706554507839247e-07, + "loss": 0.8565983772277832, + "step": 2550 + }, + { + "epoch": 2.148148148148148, + "grad_norm": 2.810182571411133, + "learning_rate": 5.693053027841139e-07, + "loss": 0.4966258108615875, + "step": 2552 + }, + { + "epoch": 2.1498316498316496, + "grad_norm": 75.41299438476562, + "learning_rate": 5.679569832508294e-07, + "loss": 0.2292374223470688, + "step": 2554 + }, + { + "epoch": 2.1515151515151514, + "grad_norm": 3.583876132965088, + "learning_rate": 5.666104968295993e-07, + "loss": 0.4831843674182892, + "step": 2556 + }, + { + "epoch": 2.153198653198653, + "grad_norm": 3.617044448852539, + "learning_rate": 5.652658481596355e-07, + "loss": 0.5890083312988281, + "step": 2558 + }, + { + "epoch": 2.154882154882155, + "grad_norm": 4.1594061851501465, + "learning_rate": 5.639230418738186e-07, + "loss": 0.416708379983902, + "step": 2560 + }, + { + "epoch": 2.1565656565656566, + "grad_norm": 4.285228252410889, + "learning_rate": 5.625820825986818e-07, + "loss": 0.477688729763031, + "step": 2562 + }, + { + "epoch": 2.1582491582491583, + "grad_norm": 3.6317057609558105, + "learning_rate": 5.61242974954393e-07, + "loss": 0.6931259632110596, + "step": 2564 + }, + { + "epoch": 2.15993265993266, + "grad_norm": 7.4866943359375, + "learning_rate": 5.599057235547422e-07, + "loss": 0.4877997040748596, + "step": 2566 + }, + { + "epoch": 2.1616161616161618, + "grad_norm": 5.388299465179443, + "learning_rate": 5.585703330071232e-07, + "loss": 0.391178697347641, + "step": 2568 + }, + { + "epoch": 2.1632996632996635, + "grad_norm": 2.264526605606079, + "learning_rate": 5.572368079125177e-07, + "loss": 0.9337778687477112, + "step": 2570 + }, + { + "epoch": 2.164983164983165, + "grad_norm": 3.827529191970825, + "learning_rate": 5.559051528654812e-07, + "loss": 1.0406713485717773, + "step": 2572 + }, + { + "epoch": 2.1666666666666665, + "grad_norm": 6.1171650886535645, + "learning_rate": 5.545753724541259e-07, + "loss": 0.7416504621505737, + "step": 2574 + }, + { + "epoch": 2.1683501683501682, + "grad_norm": 9.12820053100586, + "learning_rate": 5.532474712601041e-07, + "loss": 0.1839454025030136, + "step": 2576 + }, + { + "epoch": 2.17003367003367, + "grad_norm": 13.084949493408203, + "learning_rate": 5.519214538585945e-07, + "loss": 0.6754062175750732, + "step": 2578 + }, + { + "epoch": 2.1717171717171717, + "grad_norm": 8.969803810119629, + "learning_rate": 5.505973248182854e-07, + "loss": 0.22235676646232605, + "step": 2580 + }, + { + "epoch": 2.1734006734006734, + "grad_norm": 6.776020526885986, + "learning_rate": 5.492750887013576e-07, + "loss": 0.41986188292503357, + "step": 2582 + }, + { + "epoch": 2.175084175084175, + "grad_norm": 15.121447563171387, + "learning_rate": 5.479547500634716e-07, + "loss": 0.31534767150878906, + "step": 2584 + }, + { + "epoch": 2.176767676767677, + "grad_norm": 4.160110950469971, + "learning_rate": 5.466363134537495e-07, + "loss": 0.6025125980377197, + "step": 2586 + }, + { + "epoch": 2.1784511784511786, + "grad_norm": 12.059831619262695, + "learning_rate": 5.453197834147596e-07, + "loss": 0.5609304904937744, + "step": 2588 + }, + { + "epoch": 2.18013468013468, + "grad_norm": 8.022695541381836, + "learning_rate": 5.440051644825024e-07, + "loss": 0.6940740346908569, + "step": 2590 + }, + { + "epoch": 2.1818181818181817, + "grad_norm": 11.945213317871094, + "learning_rate": 5.426924611863932e-07, + "loss": 0.523178219795227, + "step": 2592 + }, + { + "epoch": 2.1835016835016834, + "grad_norm": 12.750484466552734, + "learning_rate": 5.413816780492464e-07, + "loss": 0.3450314402580261, + "step": 2594 + }, + { + "epoch": 2.185185185185185, + "grad_norm": 5.865060329437256, + "learning_rate": 5.400728195872627e-07, + "loss": 0.6967110633850098, + "step": 2596 + }, + { + "epoch": 2.186868686868687, + "grad_norm": 2.9188671112060547, + "learning_rate": 5.387658903100093e-07, + "loss": 0.8298006057739258, + "step": 2598 + }, + { + "epoch": 2.1885521885521886, + "grad_norm": 8.126681327819824, + "learning_rate": 5.374608947204078e-07, + "loss": 0.5891833901405334, + "step": 2600 + }, + { + "epoch": 2.1902356902356903, + "grad_norm": 1.921739101409912, + "learning_rate": 5.361578373147173e-07, + "loss": 0.7303223609924316, + "step": 2602 + }, + { + "epoch": 2.191919191919192, + "grad_norm": 10.952816009521484, + "learning_rate": 5.348567225825182e-07, + "loss": 0.785490870475769, + "step": 2604 + }, + { + "epoch": 2.1936026936026938, + "grad_norm": 9.251832008361816, + "learning_rate": 5.335575550066987e-07, + "loss": 0.46439725160598755, + "step": 2606 + }, + { + "epoch": 2.1952861952861955, + "grad_norm": 5.436981201171875, + "learning_rate": 5.322603390634379e-07, + "loss": 0.895796000957489, + "step": 2608 + }, + { + "epoch": 2.196969696969697, + "grad_norm": 3.214667320251465, + "learning_rate": 5.3096507922219e-07, + "loss": 0.6566123962402344, + "step": 2610 + }, + { + "epoch": 2.1986531986531985, + "grad_norm": 41.99171447753906, + "learning_rate": 5.296717799456703e-07, + "loss": 0.32645493745803833, + "step": 2612 + }, + { + "epoch": 2.2003367003367003, + "grad_norm": 6.42157506942749, + "learning_rate": 5.283804456898393e-07, + "loss": 0.7071173191070557, + "step": 2614 + }, + { + "epoch": 2.202020202020202, + "grad_norm": 5.982941627502441, + "learning_rate": 5.270910809038866e-07, + "loss": 0.5429423451423645, + "step": 2616 + }, + { + "epoch": 2.2037037037037037, + "grad_norm": 23.397838592529297, + "learning_rate": 5.258036900302162e-07, + "loss": 0.4608469009399414, + "step": 2618 + }, + { + "epoch": 2.2053872053872055, + "grad_norm": 1.3942065238952637, + "learning_rate": 5.245182775044319e-07, + "loss": 0.24561887979507446, + "step": 2620 + }, + { + "epoch": 2.207070707070707, + "grad_norm": 3.1465113162994385, + "learning_rate": 5.2323484775532e-07, + "loss": 0.5467818975448608, + "step": 2622 + }, + { + "epoch": 2.208754208754209, + "grad_norm": 12.308442115783691, + "learning_rate": 5.219534052048364e-07, + "loss": 0.48555779457092285, + "step": 2624 + }, + { + "epoch": 2.2104377104377106, + "grad_norm": 6.089041709899902, + "learning_rate": 5.206739542680903e-07, + "loss": 0.4167608618736267, + "step": 2626 + }, + { + "epoch": 2.212121212121212, + "grad_norm": 7.500848293304443, + "learning_rate": 5.193964993533275e-07, + "loss": 0.5702179074287415, + "step": 2628 + }, + { + "epoch": 2.2138047138047137, + "grad_norm": 10.495234489440918, + "learning_rate": 5.181210448619185e-07, + "loss": 0.2557629644870758, + "step": 2630 + }, + { + "epoch": 2.2154882154882154, + "grad_norm": 2.5270442962646484, + "learning_rate": 5.168475951883405e-07, + "loss": 0.39183729887008667, + "step": 2632 + }, + { + "epoch": 2.217171717171717, + "grad_norm": 2.1306686401367188, + "learning_rate": 5.155761547201631e-07, + "loss": 0.06966563314199448, + "step": 2634 + }, + { + "epoch": 2.218855218855219, + "grad_norm": 4.132006645202637, + "learning_rate": 5.143067278380339e-07, + "loss": 0.7425806522369385, + "step": 2636 + }, + { + "epoch": 2.2205387205387206, + "grad_norm": 2.9199447631835938, + "learning_rate": 5.13039318915663e-07, + "loss": 1.07930326461792, + "step": 2638 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 2.4841439723968506, + "learning_rate": 5.117739323198067e-07, + "loss": 0.982938289642334, + "step": 2640 + }, + { + "epoch": 2.223905723905724, + "grad_norm": 4.3581013679504395, + "learning_rate": 5.105105724102547e-07, + "loss": 0.5647614002227783, + "step": 2642 + }, + { + "epoch": 2.225589225589226, + "grad_norm": 6.911370754241943, + "learning_rate": 5.092492435398137e-07, + "loss": 0.5829119086265564, + "step": 2644 + }, + { + "epoch": 2.227272727272727, + "grad_norm": 4.011280059814453, + "learning_rate": 5.079899500542917e-07, + "loss": 0.5897196531295776, + "step": 2646 + }, + { + "epoch": 2.228956228956229, + "grad_norm": 4.96337890625, + "learning_rate": 5.067326962924848e-07, + "loss": 0.2728573977947235, + "step": 2648 + }, + { + "epoch": 2.2306397306397305, + "grad_norm": 6.272621154785156, + "learning_rate": 5.054774865861617e-07, + "loss": 0.9227702617645264, + "step": 2650 + }, + { + "epoch": 2.2323232323232323, + "grad_norm": 4.739163875579834, + "learning_rate": 5.042243252600475e-07, + "loss": 0.5031465888023376, + "step": 2652 + }, + { + "epoch": 2.234006734006734, + "grad_norm": 13.35574722290039, + "learning_rate": 5.029732166318106e-07, + "loss": 0.49748843908309937, + "step": 2654 + }, + { + "epoch": 2.2356902356902357, + "grad_norm": 4.151340484619141, + "learning_rate": 5.017241650120462e-07, + "loss": 0.585181713104248, + "step": 2656 + }, + { + "epoch": 2.2373737373737375, + "grad_norm": 17.889524459838867, + "learning_rate": 5.004771747042631e-07, + "loss": 0.7983870506286621, + "step": 2658 + }, + { + "epoch": 2.239057239057239, + "grad_norm": 6.143094539642334, + "learning_rate": 4.992322500048673e-07, + "loss": 0.6713172197341919, + "step": 2660 + }, + { + "epoch": 2.240740740740741, + "grad_norm": 3.4442899227142334, + "learning_rate": 4.979893952031483e-07, + "loss": 0.7296475768089294, + "step": 2662 + }, + { + "epoch": 2.242424242424242, + "grad_norm": 16.668384552001953, + "learning_rate": 4.96748614581264e-07, + "loss": 0.3102848529815674, + "step": 2664 + }, + { + "epoch": 2.244107744107744, + "grad_norm": 2.3950233459472656, + "learning_rate": 4.955099124142251e-07, + "loss": 0.712740421295166, + "step": 2666 + }, + { + "epoch": 2.2457912457912457, + "grad_norm": 4.428253650665283, + "learning_rate": 4.942732929698827e-07, + "loss": 0.5821852684020996, + "step": 2668 + }, + { + "epoch": 2.2474747474747474, + "grad_norm": 8.776701927185059, + "learning_rate": 4.930387605089104e-07, + "loss": 0.4474225640296936, + "step": 2670 + }, + { + "epoch": 2.249158249158249, + "grad_norm": 3.6381278038024902, + "learning_rate": 4.918063192847921e-07, + "loss": 0.33651861548423767, + "step": 2672 + }, + { + "epoch": 2.250841750841751, + "grad_norm": 4.837399482727051, + "learning_rate": 4.905759735438068e-07, + "loss": 0.5961496829986572, + "step": 2674 + }, + { + "epoch": 2.2525252525252526, + "grad_norm": 2.985142946243286, + "learning_rate": 4.893477275250127e-07, + "loss": 0.6518359184265137, + "step": 2676 + }, + { + "epoch": 2.2542087542087543, + "grad_norm": 7.1583943367004395, + "learning_rate": 4.881215854602342e-07, + "loss": 0.4896303117275238, + "step": 2678 + }, + { + "epoch": 2.255892255892256, + "grad_norm": 1.9810396432876587, + "learning_rate": 4.868975515740471e-07, + "loss": 0.8590680956840515, + "step": 2680 + }, + { + "epoch": 2.257575757575758, + "grad_norm": 7.562203884124756, + "learning_rate": 4.856756300837625e-07, + "loss": 0.18953704833984375, + "step": 2682 + }, + { + "epoch": 2.259259259259259, + "grad_norm": 7.8364481925964355, + "learning_rate": 4.844558251994146e-07, + "loss": 0.12749773263931274, + "step": 2684 + }, + { + "epoch": 2.260942760942761, + "grad_norm": 3.5520970821380615, + "learning_rate": 4.832381411237444e-07, + "loss": 0.6111665964126587, + "step": 2686 + }, + { + "epoch": 2.2626262626262625, + "grad_norm": 4.207799911499023, + "learning_rate": 4.820225820521855e-07, + "loss": 0.36922651529312134, + "step": 2688 + }, + { + "epoch": 2.2643097643097643, + "grad_norm": 1.94363534450531, + "learning_rate": 4.808091521728506e-07, + "loss": 0.9025669097900391, + "step": 2690 + }, + { + "epoch": 2.265993265993266, + "grad_norm": 14.200057029724121, + "learning_rate": 4.795978556665165e-07, + "loss": 0.8429475426673889, + "step": 2692 + }, + { + "epoch": 2.2676767676767677, + "grad_norm": 3.5672523975372314, + "learning_rate": 4.783886967066088e-07, + "loss": 0.6566574573516846, + "step": 2694 + }, + { + "epoch": 2.2693602693602695, + "grad_norm": 4.338009357452393, + "learning_rate": 4.77181679459189e-07, + "loss": 0.5327779054641724, + "step": 2696 + }, + { + "epoch": 2.271043771043771, + "grad_norm": 2.5908162593841553, + "learning_rate": 4.759768080829399e-07, + "loss": 0.624381959438324, + "step": 2698 + }, + { + "epoch": 2.2727272727272725, + "grad_norm": 6.710553169250488, + "learning_rate": 4.747740867291497e-07, + "loss": 0.7681624889373779, + "step": 2700 + }, + { + "epoch": 2.274410774410774, + "grad_norm": 2.840843915939331, + "learning_rate": 4.7357351954169973e-07, + "loss": 0.49092429876327515, + "step": 2702 + }, + { + "epoch": 2.276094276094276, + "grad_norm": 2.1035234928131104, + "learning_rate": 4.7237511065704933e-07, + "loss": 0.8667645454406738, + "step": 2704 + }, + { + "epoch": 2.2777777777777777, + "grad_norm": 3.245436429977417, + "learning_rate": 4.7117886420422094e-07, + "loss": 0.9094717502593994, + "step": 2706 + }, + { + "epoch": 2.2794612794612794, + "grad_norm": 2.4817285537719727, + "learning_rate": 4.6998478430478714e-07, + "loss": 0.351574569940567, + "step": 2708 + }, + { + "epoch": 2.281144781144781, + "grad_norm": 5.749747276306152, + "learning_rate": 4.6879287507285596e-07, + "loss": 0.5877597332000732, + "step": 2710 + }, + { + "epoch": 2.282828282828283, + "grad_norm": 9.687824249267578, + "learning_rate": 4.676031406150555e-07, + "loss": 0.5526677370071411, + "step": 2712 + }, + { + "epoch": 2.2845117845117846, + "grad_norm": 3.64471435546875, + "learning_rate": 4.66415585030522e-07, + "loss": 0.4332752227783203, + "step": 2714 + }, + { + "epoch": 2.2861952861952863, + "grad_norm": 7.181333065032959, + "learning_rate": 4.6523021241088416e-07, + "loss": 0.7148293256759644, + "step": 2716 + }, + { + "epoch": 2.287878787878788, + "grad_norm": 4.991126537322998, + "learning_rate": 4.6404702684024905e-07, + "loss": 0.5515605807304382, + "step": 2718 + }, + { + "epoch": 2.28956228956229, + "grad_norm": 10.846860885620117, + "learning_rate": 4.628660323951891e-07, + "loss": 0.5390480160713196, + "step": 2720 + }, + { + "epoch": 2.291245791245791, + "grad_norm": 3.9083449840545654, + "learning_rate": 4.616872331447272e-07, + "loss": 0.63498854637146, + "step": 2722 + }, + { + "epoch": 2.292929292929293, + "grad_norm": 6.314955234527588, + "learning_rate": 4.605106331503223e-07, + "loss": 0.6880998611450195, + "step": 2724 + }, + { + "epoch": 2.2946127946127945, + "grad_norm": 3.322652816772461, + "learning_rate": 4.5933623646585683e-07, + "loss": 0.6316101551055908, + "step": 2726 + }, + { + "epoch": 2.2962962962962963, + "grad_norm": 5.35445499420166, + "learning_rate": 4.581640471376215e-07, + "loss": 0.5416774749755859, + "step": 2728 + }, + { + "epoch": 2.297979797979798, + "grad_norm": 6.625260353088379, + "learning_rate": 4.5699406920430155e-07, + "loss": 0.972043514251709, + "step": 2730 + }, + { + "epoch": 2.2996632996632997, + "grad_norm": 3.9685635566711426, + "learning_rate": 4.5582630669696324e-07, + "loss": 0.5268035531044006, + "step": 2732 + }, + { + "epoch": 2.3013468013468015, + "grad_norm": 9.009088516235352, + "learning_rate": 4.5466076363904e-07, + "loss": 0.4689450263977051, + "step": 2734 + }, + { + "epoch": 2.303030303030303, + "grad_norm": 6.697409629821777, + "learning_rate": 4.5349744404631785e-07, + "loss": 0.43555888533592224, + "step": 2736 + }, + { + "epoch": 2.3047138047138045, + "grad_norm": 9.158797264099121, + "learning_rate": 4.5233635192692206e-07, + "loss": 0.5540938377380371, + "step": 2738 + }, + { + "epoch": 2.3063973063973062, + "grad_norm": 18.85773468017578, + "learning_rate": 4.511774912813043e-07, + "loss": 0.4014560580253601, + "step": 2740 + }, + { + "epoch": 2.308080808080808, + "grad_norm": 1.82210111618042, + "learning_rate": 4.5002086610222626e-07, + "loss": 0.7727656364440918, + "step": 2742 + }, + { + "epoch": 2.3097643097643097, + "grad_norm": 3.7924273014068604, + "learning_rate": 4.488664803747487e-07, + "loss": 0.7189053297042847, + "step": 2744 + }, + { + "epoch": 2.3114478114478114, + "grad_norm": 3.0608716011047363, + "learning_rate": 4.4771433807621644e-07, + "loss": 0.7668474912643433, + "step": 2746 + }, + { + "epoch": 2.313131313131313, + "grad_norm": 5.792914867401123, + "learning_rate": 4.4656444317624397e-07, + "loss": 0.6078014373779297, + "step": 2748 + }, + { + "epoch": 2.314814814814815, + "grad_norm": 1.747604250907898, + "learning_rate": 4.454167996367032e-07, + "loss": 0.10793264210224152, + "step": 2750 + }, + { + "epoch": 2.3164983164983166, + "grad_norm": 4.28343391418457, + "learning_rate": 4.442714114117092e-07, + "loss": 0.33263859152793884, + "step": 2752 + }, + { + "epoch": 2.3181818181818183, + "grad_norm": 2.2499372959136963, + "learning_rate": 4.4312828244760613e-07, + "loss": 0.39961159229278564, + "step": 2754 + }, + { + "epoch": 2.31986531986532, + "grad_norm": 3.355552911758423, + "learning_rate": 4.4198741668295425e-07, + "loss": 0.8770014047622681, + "step": 2756 + }, + { + "epoch": 2.3215488215488214, + "grad_norm": 2.2010586261749268, + "learning_rate": 4.4084881804851644e-07, + "loss": 0.5539072751998901, + "step": 2758 + }, + { + "epoch": 2.323232323232323, + "grad_norm": 4.903811931610107, + "learning_rate": 4.397124904672437e-07, + "loss": 0.6975724697113037, + "step": 2760 + }, + { + "epoch": 2.324915824915825, + "grad_norm": 5.035953044891357, + "learning_rate": 4.3857843785426263e-07, + "loss": 0.5050334334373474, + "step": 2762 + }, + { + "epoch": 2.3265993265993266, + "grad_norm": 3.3227932453155518, + "learning_rate": 4.374466641168622e-07, + "loss": 0.8777497410774231, + "step": 2764 + }, + { + "epoch": 2.3282828282828283, + "grad_norm": 4.905037879943848, + "learning_rate": 4.363171731544786e-07, + "loss": 0.7257252931594849, + "step": 2766 + }, + { + "epoch": 2.32996632996633, + "grad_norm": 2.3318030834198, + "learning_rate": 4.351899688586834e-07, + "loss": 0.5315639972686768, + "step": 2768 + }, + { + "epoch": 2.3316498316498318, + "grad_norm": 12.677505493164062, + "learning_rate": 4.3406505511317025e-07, + "loss": 0.6226543188095093, + "step": 2770 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 3.6738951206207275, + "learning_rate": 4.329424357937397e-07, + "loss": 0.5986767411231995, + "step": 2772 + }, + { + "epoch": 2.3350168350168348, + "grad_norm": 3.570671558380127, + "learning_rate": 4.318221147682879e-07, + "loss": 0.693830132484436, + "step": 2774 + }, + { + "epoch": 2.3367003367003365, + "grad_norm": 3.0889062881469727, + "learning_rate": 4.307040958967924e-07, + "loss": 0.6411426663398743, + "step": 2776 + }, + { + "epoch": 2.3383838383838382, + "grad_norm": 4.422166347503662, + "learning_rate": 4.2958838303129817e-07, + "loss": 0.45083481073379517, + "step": 2778 + }, + { + "epoch": 2.34006734006734, + "grad_norm": 29.303316116333008, + "learning_rate": 4.2847498001590573e-07, + "loss": 0.6881177425384521, + "step": 2780 + }, + { + "epoch": 2.3417508417508417, + "grad_norm": 2.217395544052124, + "learning_rate": 4.273638906867573e-07, + "loss": 0.5657017230987549, + "step": 2782 + }, + { + "epoch": 2.3434343434343434, + "grad_norm": 10.195280075073242, + "learning_rate": 4.2625511887202225e-07, + "loss": 0.7839221954345703, + "step": 2784 + }, + { + "epoch": 2.345117845117845, + "grad_norm": 2.6481029987335205, + "learning_rate": 4.2514866839188657e-07, + "loss": 0.5463940501213074, + "step": 2786 + }, + { + "epoch": 2.346801346801347, + "grad_norm": 2.2342593669891357, + "learning_rate": 4.2404454305853796e-07, + "loss": 0.8763151168823242, + "step": 2788 + }, + { + "epoch": 2.3484848484848486, + "grad_norm": 4.609320640563965, + "learning_rate": 4.229427466761522e-07, + "loss": 0.7232416868209839, + "step": 2790 + }, + { + "epoch": 2.3501683501683504, + "grad_norm": 6.990656852722168, + "learning_rate": 4.2184328304088164e-07, + "loss": 0.5656273365020752, + "step": 2792 + }, + { + "epoch": 2.351851851851852, + "grad_norm": 10.642841339111328, + "learning_rate": 4.2074615594084146e-07, + "loss": 0.6187400817871094, + "step": 2794 + }, + { + "epoch": 2.3535353535353534, + "grad_norm": 3.1630921363830566, + "learning_rate": 4.1965136915609543e-07, + "loss": 0.9885926246643066, + "step": 2796 + }, + { + "epoch": 2.355218855218855, + "grad_norm": 13.756888389587402, + "learning_rate": 4.1855892645864513e-07, + "loss": 0.45941799879074097, + "step": 2798 + }, + { + "epoch": 2.356902356902357, + "grad_norm": 2.228693962097168, + "learning_rate": 4.1746883161241555e-07, + "loss": 0.9851700067520142, + "step": 2800 + }, + { + "epoch": 2.3585858585858586, + "grad_norm": 2.863492965698242, + "learning_rate": 4.1638108837324137e-07, + "loss": 0.9169178009033203, + "step": 2802 + }, + { + "epoch": 2.3602693602693603, + "grad_norm": 3.3131117820739746, + "learning_rate": 4.152957004888563e-07, + "loss": 0.7946122884750366, + "step": 2804 + }, + { + "epoch": 2.361952861952862, + "grad_norm": 6.783644676208496, + "learning_rate": 4.142126716988784e-07, + "loss": 0.7735965847969055, + "step": 2806 + }, + { + "epoch": 2.3636363636363638, + "grad_norm": 3.6407532691955566, + "learning_rate": 4.131320057347969e-07, + "loss": 0.802727460861206, + "step": 2808 + }, + { + "epoch": 2.3653198653198655, + "grad_norm": 3.4392080307006836, + "learning_rate": 4.120537063199612e-07, + "loss": 1.0042896270751953, + "step": 2810 + }, + { + "epoch": 2.3670033670033668, + "grad_norm": 15.25992202758789, + "learning_rate": 4.109777771695663e-07, + "loss": 0.7024844288825989, + "step": 2812 + }, + { + "epoch": 2.3686868686868685, + "grad_norm": 2.76926589012146, + "learning_rate": 4.0990422199064103e-07, + "loss": 0.6036837100982666, + "step": 2814 + }, + { + "epoch": 2.3703703703703702, + "grad_norm": 4.845790386199951, + "learning_rate": 4.0883304448203477e-07, + "loss": 0.484286904335022, + "step": 2816 + }, + { + "epoch": 2.372053872053872, + "grad_norm": 3.267883777618408, + "learning_rate": 4.077642483344044e-07, + "loss": 0.5557587146759033, + "step": 2818 + }, + { + "epoch": 2.3737373737373737, + "grad_norm": 5.12905216217041, + "learning_rate": 4.066978372302025e-07, + "loss": 0.6941782236099243, + "step": 2820 + }, + { + "epoch": 2.3754208754208754, + "grad_norm": 3.630934953689575, + "learning_rate": 4.056338148436643e-07, + "loss": 0.4251060485839844, + "step": 2822 + }, + { + "epoch": 2.377104377104377, + "grad_norm": 5.501477241516113, + "learning_rate": 4.0457218484079414e-07, + "loss": 0.9760651588439941, + "step": 2824 + }, + { + "epoch": 2.378787878787879, + "grad_norm": 3.194762945175171, + "learning_rate": 4.035129508793542e-07, + "loss": 0.8394796848297119, + "step": 2826 + }, + { + "epoch": 2.3804713804713806, + "grad_norm": 689.3011474609375, + "learning_rate": 4.024561166088516e-07, + "loss": 0.4385402798652649, + "step": 2828 + }, + { + "epoch": 2.3821548821548824, + "grad_norm": 8.300933837890625, + "learning_rate": 4.0140168567052447e-07, + "loss": 0.932929277420044, + "step": 2830 + }, + { + "epoch": 2.3838383838383836, + "grad_norm": 20.601125717163086, + "learning_rate": 4.003496616973312e-07, + "loss": 0.6770232915878296, + "step": 2832 + }, + { + "epoch": 2.3855218855218854, + "grad_norm": 7.719077110290527, + "learning_rate": 3.9930004831393757e-07, + "loss": 0.5193581581115723, + "step": 2834 + }, + { + "epoch": 2.387205387205387, + "grad_norm": 3.433854341506958, + "learning_rate": 3.982528491367025e-07, + "loss": 0.5733506679534912, + "step": 2836 + }, + { + "epoch": 2.388888888888889, + "grad_norm": 5.136038780212402, + "learning_rate": 3.9720806777366817e-07, + "loss": 0.47218313813209534, + "step": 2838 + }, + { + "epoch": 2.3905723905723906, + "grad_norm": 1.433040976524353, + "learning_rate": 3.961657078245462e-07, + "loss": 0.8041648864746094, + "step": 2840 + }, + { + "epoch": 2.3922558922558923, + "grad_norm": 0.9403243660926819, + "learning_rate": 3.9512577288070487e-07, + "loss": 0.3452025055885315, + "step": 2842 + }, + { + "epoch": 2.393939393939394, + "grad_norm": 2.0302951335906982, + "learning_rate": 3.940882665251576e-07, + "loss": 0.9638313055038452, + "step": 2844 + }, + { + "epoch": 2.3956228956228958, + "grad_norm": 2.591130495071411, + "learning_rate": 3.930531923325506e-07, + "loss": 0.7442007064819336, + "step": 2846 + }, + { + "epoch": 2.3973063973063975, + "grad_norm": 4.4280548095703125, + "learning_rate": 3.920205538691497e-07, + "loss": 0.953087329864502, + "step": 2848 + }, + { + "epoch": 2.398989898989899, + "grad_norm": 2.4256279468536377, + "learning_rate": 3.9099035469282906e-07, + "loss": 0.7336077094078064, + "step": 2850 + }, + { + "epoch": 2.4006734006734005, + "grad_norm": 8.586638450622559, + "learning_rate": 3.8996259835305835e-07, + "loss": 0.390910804271698, + "step": 2852 + }, + { + "epoch": 2.4023569023569022, + "grad_norm": 32.83812713623047, + "learning_rate": 3.8893728839089035e-07, + "loss": 0.609326958656311, + "step": 2854 + }, + { + "epoch": 2.404040404040404, + "grad_norm": 4.8817458152771, + "learning_rate": 3.879144283389495e-07, + "loss": 0.5054650902748108, + "step": 2856 + }, + { + "epoch": 2.4057239057239057, + "grad_norm": 6.203306198120117, + "learning_rate": 3.8689402172141915e-07, + "loss": 0.6514500975608826, + "step": 2858 + }, + { + "epoch": 2.4074074074074074, + "grad_norm": 5.882429122924805, + "learning_rate": 3.8587607205402916e-07, + "loss": 0.41622331738471985, + "step": 2860 + }, + { + "epoch": 2.409090909090909, + "grad_norm": 2.390727996826172, + "learning_rate": 3.848605828440444e-07, + "loss": 0.7136590480804443, + "step": 2862 + }, + { + "epoch": 2.410774410774411, + "grad_norm": 6.754751682281494, + "learning_rate": 3.8384755759025313e-07, + "loss": 0.4541894793510437, + "step": 2864 + }, + { + "epoch": 2.4124579124579126, + "grad_norm": 3.0260815620422363, + "learning_rate": 3.828369997829528e-07, + "loss": 0.6994350552558899, + "step": 2866 + }, + { + "epoch": 2.4141414141414144, + "grad_norm": 2.372957706451416, + "learning_rate": 3.818289129039405e-07, + "loss": 0.8106458187103271, + "step": 2868 + }, + { + "epoch": 2.4158249158249157, + "grad_norm": 2.801581621170044, + "learning_rate": 3.808233004264997e-07, + "loss": 0.5665256977081299, + "step": 2870 + }, + { + "epoch": 2.4175084175084174, + "grad_norm": 3.397507905960083, + "learning_rate": 3.79820165815389e-07, + "loss": 0.44936102628707886, + "step": 2872 + }, + { + "epoch": 2.419191919191919, + "grad_norm": 2.3020706176757812, + "learning_rate": 3.788195125268284e-07, + "loss": 0.8391485214233398, + "step": 2874 + }, + { + "epoch": 2.420875420875421, + "grad_norm": 3.2758114337921143, + "learning_rate": 3.7782134400848995e-07, + "loss": 0.7489950656890869, + "step": 2876 + }, + { + "epoch": 2.4225589225589226, + "grad_norm": 5.947027206420898, + "learning_rate": 3.768256636994843e-07, + "loss": 0.4590849280357361, + "step": 2878 + }, + { + "epoch": 2.4242424242424243, + "grad_norm": 2.203789234161377, + "learning_rate": 3.7583247503034864e-07, + "loss": 0.7745201587677002, + "step": 2880 + }, + { + "epoch": 2.425925925925926, + "grad_norm": 3.3688504695892334, + "learning_rate": 3.7484178142303625e-07, + "loss": 0.5334046483039856, + "step": 2882 + }, + { + "epoch": 2.4276094276094278, + "grad_norm": 6.785653114318848, + "learning_rate": 3.738535862909031e-07, + "loss": 0.5028021335601807, + "step": 2884 + }, + { + "epoch": 2.429292929292929, + "grad_norm": 2.8243677616119385, + "learning_rate": 3.7286789303869735e-07, + "loss": 0.5118685960769653, + "step": 2886 + }, + { + "epoch": 2.430976430976431, + "grad_norm": 9.112323760986328, + "learning_rate": 3.7188470506254744e-07, + "loss": 0.5720535516738892, + "step": 2888 + }, + { + "epoch": 2.4326599326599325, + "grad_norm": 2.4455068111419678, + "learning_rate": 3.7090402574994885e-07, + "loss": 0.5391176342964172, + "step": 2890 + }, + { + "epoch": 2.4343434343434343, + "grad_norm": 5.355926990509033, + "learning_rate": 3.699258584797548e-07, + "loss": 0.6294881105422974, + "step": 2892 + }, + { + "epoch": 2.436026936026936, + "grad_norm": 2.457951545715332, + "learning_rate": 3.6895020662216326e-07, + "loss": 0.9022385478019714, + "step": 2894 + }, + { + "epoch": 2.4377104377104377, + "grad_norm": 7.03529167175293, + "learning_rate": 3.679770735387052e-07, + "loss": 0.720146656036377, + "step": 2896 + }, + { + "epoch": 2.4393939393939394, + "grad_norm": 10.114142417907715, + "learning_rate": 3.6700646258223343e-07, + "loss": 0.6195645332336426, + "step": 2898 + }, + { + "epoch": 2.441077441077441, + "grad_norm": 5.667145729064941, + "learning_rate": 3.6603837709691153e-07, + "loss": 0.43182432651519775, + "step": 2900 + }, + { + "epoch": 2.442760942760943, + "grad_norm": 13.144913673400879, + "learning_rate": 3.6507282041820085e-07, + "loss": 0.7789742350578308, + "step": 2902 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 9.248213768005371, + "learning_rate": 3.641097958728506e-07, + "loss": 0.48242291808128357, + "step": 2904 + }, + { + "epoch": 2.4461279461279464, + "grad_norm": 2.1247684955596924, + "learning_rate": 3.631493067788858e-07, + "loss": 0.3829724192619324, + "step": 2906 + }, + { + "epoch": 2.4478114478114477, + "grad_norm": 5.711479663848877, + "learning_rate": 3.6219135644559506e-07, + "loss": 0.5261117815971375, + "step": 2908 + }, + { + "epoch": 2.4494949494949494, + "grad_norm": 9.852108001708984, + "learning_rate": 3.6123594817352046e-07, + "loss": 0.6702965497970581, + "step": 2910 + }, + { + "epoch": 2.451178451178451, + "grad_norm": 6.790271282196045, + "learning_rate": 3.602830852544458e-07, + "loss": 0.4730827212333679, + "step": 2912 + }, + { + "epoch": 2.452861952861953, + "grad_norm": 8.912752151489258, + "learning_rate": 3.593327709713844e-07, + "loss": 0.7823283076286316, + "step": 2914 + }, + { + "epoch": 2.4545454545454546, + "grad_norm": 4.171782970428467, + "learning_rate": 3.5838500859856893e-07, + "loss": 0.6686667203903198, + "step": 2916 + }, + { + "epoch": 2.4562289562289563, + "grad_norm": 2.5204222202301025, + "learning_rate": 3.5743980140143975e-07, + "loss": 0.3113139867782593, + "step": 2918 + }, + { + "epoch": 2.457912457912458, + "grad_norm": 3.9417402744293213, + "learning_rate": 3.5649715263663297e-07, + "loss": 0.7965060472488403, + "step": 2920 + }, + { + "epoch": 2.45959595959596, + "grad_norm": 98.92294311523438, + "learning_rate": 3.5555706555197043e-07, + "loss": 0.43743637204170227, + "step": 2922 + }, + { + "epoch": 2.461279461279461, + "grad_norm": 3.686532974243164, + "learning_rate": 3.5461954338644795e-07, + "loss": 0.30664563179016113, + "step": 2924 + }, + { + "epoch": 2.462962962962963, + "grad_norm": 2.410140037536621, + "learning_rate": 3.536845893702234e-07, + "loss": 0.5530849695205688, + "step": 2926 + }, + { + "epoch": 2.4646464646464645, + "grad_norm": 24.317949295043945, + "learning_rate": 3.527522067246068e-07, + "loss": 0.5903668403625488, + "step": 2928 + }, + { + "epoch": 2.4663299663299663, + "grad_norm": 3.0360710620880127, + "learning_rate": 3.518223986620491e-07, + "loss": 0.24971121549606323, + "step": 2930 + }, + { + "epoch": 2.468013468013468, + "grad_norm": 5.305819511413574, + "learning_rate": 3.5089516838612986e-07, + "loss": 0.654639482498169, + "step": 2932 + }, + { + "epoch": 2.4696969696969697, + "grad_norm": 6.428488254547119, + "learning_rate": 3.499705190915476e-07, + "loss": 0.6544331312179565, + "step": 2934 + }, + { + "epoch": 2.4713804713804715, + "grad_norm": 5.150181293487549, + "learning_rate": 3.4904845396410854e-07, + "loss": 0.4527553915977478, + "step": 2936 + }, + { + "epoch": 2.473063973063973, + "grad_norm": 9.783395767211914, + "learning_rate": 3.4812897618071445e-07, + "loss": 0.5435815453529358, + "step": 2938 + }, + { + "epoch": 2.474747474747475, + "grad_norm": 5.587001800537109, + "learning_rate": 3.472120889093536e-07, + "loss": 0.4773102402687073, + "step": 2940 + }, + { + "epoch": 2.4764309764309766, + "grad_norm": 4.579451084136963, + "learning_rate": 3.462977953090884e-07, + "loss": 0.40418028831481934, + "step": 2942 + }, + { + "epoch": 2.478114478114478, + "grad_norm": 8.405234336853027, + "learning_rate": 3.453860985300446e-07, + "loss": 0.43912988901138306, + "step": 2944 + }, + { + "epoch": 2.4797979797979797, + "grad_norm": 2.54058837890625, + "learning_rate": 3.4447700171340164e-07, + "loss": 0.9208707213401794, + "step": 2946 + }, + { + "epoch": 2.4814814814814814, + "grad_norm": 2.506683588027954, + "learning_rate": 3.4357050799138053e-07, + "loss": 0.9445154666900635, + "step": 2948 + }, + { + "epoch": 2.483164983164983, + "grad_norm": 2.4092612266540527, + "learning_rate": 3.4266662048723337e-07, + "loss": 0.9850308895111084, + "step": 2950 + }, + { + "epoch": 2.484848484848485, + "grad_norm": 10.964947700500488, + "learning_rate": 3.417653423152329e-07, + "loss": 0.8890873193740845, + "step": 2952 + }, + { + "epoch": 2.4865319865319866, + "grad_norm": 3.6544744968414307, + "learning_rate": 3.4086667658066186e-07, + "loss": 0.5936705470085144, + "step": 2954 + }, + { + "epoch": 2.4882154882154883, + "grad_norm": 6.769886016845703, + "learning_rate": 3.3997062637980167e-07, + "loss": 0.8404591083526611, + "step": 2956 + }, + { + "epoch": 2.48989898989899, + "grad_norm": 6.549720764160156, + "learning_rate": 3.390771947999224e-07, + "loss": 0.5225011110305786, + "step": 2958 + }, + { + "epoch": 2.4915824915824913, + "grad_norm": 3.255201816558838, + "learning_rate": 3.381863849192718e-07, + "loss": 0.8342874050140381, + "step": 2960 + }, + { + "epoch": 2.493265993265993, + "grad_norm": 4.254117488861084, + "learning_rate": 3.3729819980706444e-07, + "loss": 0.5838370323181152, + "step": 2962 + }, + { + "epoch": 2.494949494949495, + "grad_norm": 2.933912992477417, + "learning_rate": 3.364126425234719e-07, + "loss": 0.7112206220626831, + "step": 2964 + }, + { + "epoch": 2.4966329966329965, + "grad_norm": 5.019345760345459, + "learning_rate": 3.3552971611961187e-07, + "loss": 0.5937138199806213, + "step": 2966 + }, + { + "epoch": 2.4983164983164983, + "grad_norm": 3.7426111698150635, + "learning_rate": 3.34649423637537e-07, + "loss": 0.81259685754776, + "step": 2968 + }, + { + "epoch": 2.5, + "grad_norm": 14.945383071899414, + "learning_rate": 3.337717681102253e-07, + "loss": 0.8419524431228638, + "step": 2970 + }, + { + "epoch": 2.5016835016835017, + "grad_norm": 3.5432753562927246, + "learning_rate": 3.328967525615697e-07, + "loss": 0.36146029829978943, + "step": 2972 + }, + { + "epoch": 2.5033670033670035, + "grad_norm": 31.251523971557617, + "learning_rate": 3.3202438000636634e-07, + "loss": 0.5271892547607422, + "step": 2974 + }, + { + "epoch": 2.505050505050505, + "grad_norm": 4.31404447555542, + "learning_rate": 3.311546534503061e-07, + "loss": 0.6813575029373169, + "step": 2976 + }, + { + "epoch": 2.506734006734007, + "grad_norm": 10.586312294006348, + "learning_rate": 3.3028757588996303e-07, + "loss": 0.3660055994987488, + "step": 2978 + }, + { + "epoch": 2.5084175084175087, + "grad_norm": 3.4156813621520996, + "learning_rate": 3.294231503127839e-07, + "loss": 0.7575110197067261, + "step": 2980 + }, + { + "epoch": 2.51010101010101, + "grad_norm": 8.647886276245117, + "learning_rate": 3.2856137969707847e-07, + "loss": 0.788750171661377, + "step": 2982 + }, + { + "epoch": 2.5117845117845117, + "grad_norm": 3.4446113109588623, + "learning_rate": 3.277022670120095e-07, + "loss": 0.4518158435821533, + "step": 2984 + }, + { + "epoch": 2.5134680134680134, + "grad_norm": 15.611486434936523, + "learning_rate": 3.268458152175813e-07, + "loss": 0.7932558059692383, + "step": 2986 + }, + { + "epoch": 2.515151515151515, + "grad_norm": 27.114980697631836, + "learning_rate": 3.2599202726463084e-07, + "loss": 0.61873459815979, + "step": 2988 + }, + { + "epoch": 2.516835016835017, + "grad_norm": 2.904008626937866, + "learning_rate": 3.2514090609481683e-07, + "loss": 0.10597741603851318, + "step": 2990 + }, + { + "epoch": 2.5185185185185186, + "grad_norm": 4.048925399780273, + "learning_rate": 3.2429245464060965e-07, + "loss": 0.8708055019378662, + "step": 2992 + }, + { + "epoch": 2.5202020202020203, + "grad_norm": 8.804458618164062, + "learning_rate": 3.234466758252818e-07, + "loss": 0.5630843043327332, + "step": 2994 + }, + { + "epoch": 2.5218855218855216, + "grad_norm": 2.408494234085083, + "learning_rate": 3.2260357256289715e-07, + "loss": 0.6830452084541321, + "step": 2996 + }, + { + "epoch": 2.5235690235690234, + "grad_norm": 4.321279525756836, + "learning_rate": 3.217631477583009e-07, + "loss": 0.5143815875053406, + "step": 2998 + }, + { + "epoch": 2.525252525252525, + "grad_norm": 1.794520378112793, + "learning_rate": 3.2092540430711044e-07, + "loss": 0.5180540084838867, + "step": 3000 + }, + { + "epoch": 2.526936026936027, + "grad_norm": 3.5048828125, + "learning_rate": 3.200903450957044e-07, + "loss": 0.49375149607658386, + "step": 3002 + }, + { + "epoch": 2.5286195286195285, + "grad_norm": 3.251695156097412, + "learning_rate": 3.192579730012129e-07, + "loss": 0.9845426082611084, + "step": 3004 + }, + { + "epoch": 2.5303030303030303, + "grad_norm": 6.4302263259887695, + "learning_rate": 3.184282908915081e-07, + "loss": 0.7751657962799072, + "step": 3006 + }, + { + "epoch": 2.531986531986532, + "grad_norm": 2.9614450931549072, + "learning_rate": 3.1760130162519427e-07, + "loss": 0.6437252759933472, + "step": 3008 + }, + { + "epoch": 2.5336700336700337, + "grad_norm": 3.641021728515625, + "learning_rate": 3.16777008051597e-07, + "loss": 0.33099907636642456, + "step": 3010 + }, + { + "epoch": 2.5353535353535355, + "grad_norm": 6.20613431930542, + "learning_rate": 3.159554130107546e-07, + "loss": 0.7693390846252441, + "step": 3012 + }, + { + "epoch": 2.537037037037037, + "grad_norm": 2.9264049530029297, + "learning_rate": 3.1513651933340797e-07, + "loss": 0.6058576107025146, + "step": 3014 + }, + { + "epoch": 2.538720538720539, + "grad_norm": 4.105390548706055, + "learning_rate": 3.143203298409899e-07, + "loss": 0.5138027667999268, + "step": 3016 + }, + { + "epoch": 2.5404040404040407, + "grad_norm": 13.755269050598145, + "learning_rate": 3.1350684734561676e-07, + "loss": 0.8655276298522949, + "step": 3018 + }, + { + "epoch": 2.542087542087542, + "grad_norm": 2.1755192279815674, + "learning_rate": 3.126960746500784e-07, + "loss": 0.7289071083068848, + "step": 3020 + }, + { + "epoch": 2.5437710437710437, + "grad_norm": 12.643874168395996, + "learning_rate": 3.118880145478274e-07, + "loss": 0.8041051030158997, + "step": 3022 + }, + { + "epoch": 2.5454545454545454, + "grad_norm": 3.0522072315216064, + "learning_rate": 3.110826698229711e-07, + "loss": 0.978661835193634, + "step": 3024 + }, + { + "epoch": 2.547138047138047, + "grad_norm": 10.360844612121582, + "learning_rate": 3.102800432502607e-07, + "loss": 0.2467118501663208, + "step": 3026 + }, + { + "epoch": 2.548821548821549, + "grad_norm": 4.895616054534912, + "learning_rate": 3.0948013759508274e-07, + "loss": 0.522205114364624, + "step": 3028 + }, + { + "epoch": 2.5505050505050506, + "grad_norm": 8.892946243286133, + "learning_rate": 3.0868295561344874e-07, + "loss": 0.4860239624977112, + "step": 3030 + }, + { + "epoch": 2.5521885521885523, + "grad_norm": 2.0342283248901367, + "learning_rate": 3.078885000519858e-07, + "loss": 0.4318680763244629, + "step": 3032 + }, + { + "epoch": 2.5538720538720536, + "grad_norm": 3.473409414291382, + "learning_rate": 3.0709677364792767e-07, + "loss": 0.8540394306182861, + "step": 3034 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 10.30406665802002, + "learning_rate": 3.0630777912910533e-07, + "loss": 0.9184716939926147, + "step": 3036 + }, + { + "epoch": 2.557239057239057, + "grad_norm": 6.738753795623779, + "learning_rate": 3.0552151921393633e-07, + "loss": 0.6098148822784424, + "step": 3038 + }, + { + "epoch": 2.558922558922559, + "grad_norm": 2.9204185009002686, + "learning_rate": 3.0473799661141707e-07, + "loss": 0.9494307041168213, + "step": 3040 + }, + { + "epoch": 2.5606060606060606, + "grad_norm": 5.460939407348633, + "learning_rate": 3.0395721402111286e-07, + "loss": 0.6524157524108887, + "step": 3042 + }, + { + "epoch": 2.5622895622895623, + "grad_norm": 4.9505109786987305, + "learning_rate": 3.031791741331478e-07, + "loss": 0.8453473448753357, + "step": 3044 + }, + { + "epoch": 2.563973063973064, + "grad_norm": 12.800024032592773, + "learning_rate": 3.0240387962819695e-07, + "loss": 0.6964143514633179, + "step": 3046 + }, + { + "epoch": 2.5656565656565657, + "grad_norm": 2.980398654937744, + "learning_rate": 3.016313331774762e-07, + "loss": 0.8597656488418579, + "step": 3048 + }, + { + "epoch": 2.5673400673400675, + "grad_norm": 5.009873867034912, + "learning_rate": 3.008615374427329e-07, + "loss": 0.3663683533668518, + "step": 3050 + }, + { + "epoch": 2.569023569023569, + "grad_norm": 3.2331385612487793, + "learning_rate": 3.000944950762373e-07, + "loss": 0.9516968131065369, + "step": 3052 + }, + { + "epoch": 2.570707070707071, + "grad_norm": 3.4293010234832764, + "learning_rate": 2.993302087207732e-07, + "loss": 0.07853099703788757, + "step": 3054 + }, + { + "epoch": 2.5723905723905722, + "grad_norm": 7.385575771331787, + "learning_rate": 2.985686810096285e-07, + "loss": 0.5600473284721375, + "step": 3056 + }, + { + "epoch": 2.574074074074074, + "grad_norm": 6.306962490081787, + "learning_rate": 2.978099145665867e-07, + "loss": 0.3351885974407196, + "step": 3058 + }, + { + "epoch": 2.5757575757575757, + "grad_norm": 4.720430850982666, + "learning_rate": 2.970539120059174e-07, + "loss": 0.6371778249740601, + "step": 3060 + }, + { + "epoch": 2.5774410774410774, + "grad_norm": 6.102284908294678, + "learning_rate": 2.963006759323676e-07, + "loss": 0.5941987037658691, + "step": 3062 + }, + { + "epoch": 2.579124579124579, + "grad_norm": 5.050604820251465, + "learning_rate": 2.955502089411523e-07, + "loss": 0.424297571182251, + "step": 3064 + }, + { + "epoch": 2.580808080808081, + "grad_norm": 5.28799295425415, + "learning_rate": 2.9480251361794656e-07, + "loss": 0.5996015667915344, + "step": 3066 + }, + { + "epoch": 2.5824915824915826, + "grad_norm": 9.331116676330566, + "learning_rate": 2.940575925388746e-07, + "loss": 0.3746086657047272, + "step": 3068 + }, + { + "epoch": 2.584175084175084, + "grad_norm": 13.008201599121094, + "learning_rate": 2.933154482705035e-07, + "loss": 0.17353637516498566, + "step": 3070 + }, + { + "epoch": 2.5858585858585856, + "grad_norm": 5.598928928375244, + "learning_rate": 2.925760833698327e-07, + "loss": 0.43435174226760864, + "step": 3072 + }, + { + "epoch": 2.5875420875420874, + "grad_norm": 4.106137752532959, + "learning_rate": 2.9183950038428475e-07, + "loss": 0.8951042890548706, + "step": 3074 + }, + { + "epoch": 2.589225589225589, + "grad_norm": 7.533908843994141, + "learning_rate": 2.9110570185169834e-07, + "loss": 0.35531511902809143, + "step": 3076 + }, + { + "epoch": 2.590909090909091, + "grad_norm": 2.466156482696533, + "learning_rate": 2.903746903003184e-07, + "loss": 0.8299113512039185, + "step": 3078 + }, + { + "epoch": 2.5925925925925926, + "grad_norm": 4.047122478485107, + "learning_rate": 2.896464682487866e-07, + "loss": 0.6478674411773682, + "step": 3080 + }, + { + "epoch": 2.5942760942760943, + "grad_norm": 2.4090776443481445, + "learning_rate": 2.8892103820613487e-07, + "loss": 0.9649114012718201, + "step": 3082 + }, + { + "epoch": 2.595959595959596, + "grad_norm": 3.08392071723938, + "learning_rate": 2.88198402671775e-07, + "loss": 0.5619069337844849, + "step": 3084 + }, + { + "epoch": 2.5976430976430978, + "grad_norm": 3.889181137084961, + "learning_rate": 2.874785641354901e-07, + "loss": 0.5941061973571777, + "step": 3086 + }, + { + "epoch": 2.5993265993265995, + "grad_norm": 4.151243209838867, + "learning_rate": 2.867615250774269e-07, + "loss": 0.7975903153419495, + "step": 3088 + }, + { + "epoch": 2.601010101010101, + "grad_norm": 6.307215690612793, + "learning_rate": 2.860472879680869e-07, + "loss": 0.8723431825637817, + "step": 3090 + }, + { + "epoch": 2.602693602693603, + "grad_norm": 4.979188442230225, + "learning_rate": 2.8533585526831726e-07, + "loss": 0.6906735897064209, + "step": 3092 + }, + { + "epoch": 2.6043771043771042, + "grad_norm": 5.310150623321533, + "learning_rate": 2.8462722942930286e-07, + "loss": 0.5048916339874268, + "step": 3094 + }, + { + "epoch": 2.606060606060606, + "grad_norm": 5.775015830993652, + "learning_rate": 2.8392141289255806e-07, + "loss": 0.660202145576477, + "step": 3096 + }, + { + "epoch": 2.6077441077441077, + "grad_norm": 12.841134071350098, + "learning_rate": 2.8321840808991775e-07, + "loss": 0.5634772777557373, + "step": 3098 + }, + { + "epoch": 2.6094276094276094, + "grad_norm": 6.739739418029785, + "learning_rate": 2.8251821744352933e-07, + "loss": 0.5956814289093018, + "step": 3100 + }, + { + "epoch": 2.611111111111111, + "grad_norm": 2.563978433609009, + "learning_rate": 2.8182084336584423e-07, + "loss": 0.5830974578857422, + "step": 3102 + }, + { + "epoch": 2.612794612794613, + "grad_norm": 4.95272970199585, + "learning_rate": 2.8112628825960926e-07, + "loss": 0.8090439438819885, + "step": 3104 + }, + { + "epoch": 2.6144781144781146, + "grad_norm": 3.6197354793548584, + "learning_rate": 2.804345545178594e-07, + "loss": 0.7719713449478149, + "step": 3106 + }, + { + "epoch": 2.616161616161616, + "grad_norm": 12.875308990478516, + "learning_rate": 2.7974564452390833e-07, + "loss": 0.18324008584022522, + "step": 3108 + }, + { + "epoch": 2.6178451178451176, + "grad_norm": 3.717010498046875, + "learning_rate": 2.790595606513406e-07, + "loss": 0.7723451852798462, + "step": 3110 + }, + { + "epoch": 2.6195286195286194, + "grad_norm": 2.814573287963867, + "learning_rate": 2.78376305264004e-07, + "loss": 0.39754652976989746, + "step": 3112 + }, + { + "epoch": 2.621212121212121, + "grad_norm": 3.2848994731903076, + "learning_rate": 2.776958807160011e-07, + "loss": 0.4727073609828949, + "step": 3114 + }, + { + "epoch": 2.622895622895623, + "grad_norm": 3.7905068397521973, + "learning_rate": 2.7701828935168026e-07, + "loss": 0.8447589874267578, + "step": 3116 + }, + { + "epoch": 2.6245791245791246, + "grad_norm": 2.8799266815185547, + "learning_rate": 2.763435335056291e-07, + "loss": 1.0325953960418701, + "step": 3118 + }, + { + "epoch": 2.6262626262626263, + "grad_norm": 3.1782491207122803, + "learning_rate": 2.756716155026656e-07, + "loss": 0.5554063320159912, + "step": 3120 + }, + { + "epoch": 2.627946127946128, + "grad_norm": 2.897000551223755, + "learning_rate": 2.750025376578295e-07, + "loss": 0.9207072854042053, + "step": 3122 + }, + { + "epoch": 2.6296296296296298, + "grad_norm": 2.4364206790924072, + "learning_rate": 2.743363022763758e-07, + "loss": 0.8367090225219727, + "step": 3124 + }, + { + "epoch": 2.6313131313131315, + "grad_norm": 4.580779075622559, + "learning_rate": 2.7367291165376593e-07, + "loss": 0.6048181056976318, + "step": 3126 + }, + { + "epoch": 2.6329966329966332, + "grad_norm": 6.332035064697266, + "learning_rate": 2.7301236807565925e-07, + "loss": 0.808570384979248, + "step": 3128 + }, + { + "epoch": 2.634680134680135, + "grad_norm": 7.100130081176758, + "learning_rate": 2.7235467381790654e-07, + "loss": 0.49354591965675354, + "step": 3130 + }, + { + "epoch": 2.6363636363636362, + "grad_norm": 2.4457104206085205, + "learning_rate": 2.716998311465415e-07, + "loss": 0.2983268201351166, + "step": 3132 + }, + { + "epoch": 2.638047138047138, + "grad_norm": 4.332514762878418, + "learning_rate": 2.710478423177722e-07, + "loss": 0.8370668888092041, + "step": 3134 + }, + { + "epoch": 2.6397306397306397, + "grad_norm": 4.5044684410095215, + "learning_rate": 2.7039870957797464e-07, + "loss": 0.7652538418769836, + "step": 3136 + }, + { + "epoch": 2.6414141414141414, + "grad_norm": 4.017055511474609, + "learning_rate": 2.697524351636844e-07, + "loss": 0.4114927649497986, + "step": 3138 + }, + { + "epoch": 2.643097643097643, + "grad_norm": 3.3894689083099365, + "learning_rate": 2.691090213015886e-07, + "loss": 0.8686310052871704, + "step": 3140 + }, + { + "epoch": 2.644781144781145, + "grad_norm": 2.7027831077575684, + "learning_rate": 2.6846847020851884e-07, + "loss": 0.5540004372596741, + "step": 3142 + }, + { + "epoch": 2.6464646464646466, + "grad_norm": 3.608794927597046, + "learning_rate": 2.678307840914431e-07, + "loss": 0.8333272933959961, + "step": 3144 + }, + { + "epoch": 2.648148148148148, + "grad_norm": 3.318763494491577, + "learning_rate": 2.6719596514745826e-07, + "loss": 0.9629621505737305, + "step": 3146 + }, + { + "epoch": 2.6498316498316496, + "grad_norm": 3.6985297203063965, + "learning_rate": 2.665640155637828e-07, + "loss": 0.5129526853561401, + "step": 3148 + }, + { + "epoch": 2.6515151515151514, + "grad_norm": 2.535443067550659, + "learning_rate": 2.659349375177489e-07, + "loss": 0.8636926412582397, + "step": 3150 + }, + { + "epoch": 2.653198653198653, + "grad_norm": 2.768599510192871, + "learning_rate": 2.6530873317679515e-07, + "loss": 0.20498168468475342, + "step": 3152 + }, + { + "epoch": 2.654882154882155, + "grad_norm": 34.86625671386719, + "learning_rate": 2.6468540469845895e-07, + "loss": 0.9441362619400024, + "step": 3154 + }, + { + "epoch": 2.6565656565656566, + "grad_norm": 9.064558982849121, + "learning_rate": 2.640649542303693e-07, + "loss": 0.5518494844436646, + "step": 3156 + }, + { + "epoch": 2.6582491582491583, + "grad_norm": 3.18203067779541, + "learning_rate": 2.634473839102389e-07, + "loss": 0.35931962728500366, + "step": 3158 + }, + { + "epoch": 2.65993265993266, + "grad_norm": 7.7922282218933105, + "learning_rate": 2.6283269586585737e-07, + "loss": 0.44168537855148315, + "step": 3160 + }, + { + "epoch": 2.6616161616161618, + "grad_norm": 4.682225227355957, + "learning_rate": 2.6222089221508404e-07, + "loss": 0.6104831695556641, + "step": 3162 + }, + { + "epoch": 2.6632996632996635, + "grad_norm": 2.9735536575317383, + "learning_rate": 2.6161197506583944e-07, + "loss": 0.8378016352653503, + "step": 3164 + }, + { + "epoch": 2.6649831649831652, + "grad_norm": 6.616426467895508, + "learning_rate": 2.610059465160995e-07, + "loss": 0.6439419984817505, + "step": 3166 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 3.3657751083374023, + "learning_rate": 2.6040280865388773e-07, + "loss": 0.7727220058441162, + "step": 3168 + }, + { + "epoch": 2.6683501683501682, + "grad_norm": 3.285837173461914, + "learning_rate": 2.5980256355726744e-07, + "loss": 0.6320611834526062, + "step": 3170 + }, + { + "epoch": 2.67003367003367, + "grad_norm": 4.853776931762695, + "learning_rate": 2.5920521329433606e-07, + "loss": 1.043792963027954, + "step": 3172 + }, + { + "epoch": 2.6717171717171717, + "grad_norm": 2.360769271850586, + "learning_rate": 2.586107599232164e-07, + "loss": 0.9384379386901855, + "step": 3174 + }, + { + "epoch": 2.6734006734006734, + "grad_norm": 14.25788402557373, + "learning_rate": 2.5801920549205023e-07, + "loss": 0.4818713068962097, + "step": 3176 + }, + { + "epoch": 2.675084175084175, + "grad_norm": 2.0616092681884766, + "learning_rate": 2.5743055203899167e-07, + "loss": 0.9861509799957275, + "step": 3178 + }, + { + "epoch": 2.676767676767677, + "grad_norm": 4.687266826629639, + "learning_rate": 2.568448015921996e-07, + "loss": 0.6932214498519897, + "step": 3180 + }, + { + "epoch": 2.678451178451178, + "grad_norm": 2.3194851875305176, + "learning_rate": 2.562619561698306e-07, + "loss": 0.7709292769432068, + "step": 3182 + }, + { + "epoch": 2.68013468013468, + "grad_norm": 2.256274461746216, + "learning_rate": 2.556820177800324e-07, + "loss": 0.8786018490791321, + "step": 3184 + }, + { + "epoch": 2.6818181818181817, + "grad_norm": 1.7933223247528076, + "learning_rate": 2.551049884209371e-07, + "loss": 0.7843552827835083, + "step": 3186 + }, + { + "epoch": 2.6835016835016834, + "grad_norm": 3.6488430500030518, + "learning_rate": 2.5453087008065307e-07, + "loss": 0.7388215661048889, + "step": 3188 + }, + { + "epoch": 2.685185185185185, + "grad_norm": 1.1536720991134644, + "learning_rate": 2.5395966473725994e-07, + "loss": 0.552982747554779, + "step": 3190 + }, + { + "epoch": 2.686868686868687, + "grad_norm": 3.049055814743042, + "learning_rate": 2.5339137435880043e-07, + "loss": 0.617717981338501, + "step": 3192 + }, + { + "epoch": 2.6885521885521886, + "grad_norm": 2.4993679523468018, + "learning_rate": 2.5282600090327383e-07, + "loss": 0.7265998125076294, + "step": 3194 + }, + { + "epoch": 2.6902356902356903, + "grad_norm": 12.052529335021973, + "learning_rate": 2.5226354631862966e-07, + "loss": 0.6202006340026855, + "step": 3196 + }, + { + "epoch": 2.691919191919192, + "grad_norm": 2.131632089614868, + "learning_rate": 2.517040125427608e-07, + "loss": 0.741972804069519, + "step": 3198 + }, + { + "epoch": 2.6936026936026938, + "grad_norm": 2.2996838092803955, + "learning_rate": 2.511474015034964e-07, + "loss": 0.8759193420410156, + "step": 3200 + }, + { + "epoch": 2.6952861952861955, + "grad_norm": 6.061952590942383, + "learning_rate": 2.5059371511859557e-07, + "loss": 0.6976549625396729, + "step": 3202 + }, + { + "epoch": 2.6969696969696972, + "grad_norm": 3.891650915145874, + "learning_rate": 2.50042955295741e-07, + "loss": 0.6694223880767822, + "step": 3204 + }, + { + "epoch": 2.6986531986531985, + "grad_norm": 5.893383026123047, + "learning_rate": 2.494951239325321e-07, + "loss": 0.7830284833908081, + "step": 3206 + }, + { + "epoch": 2.7003367003367003, + "grad_norm": 4.715972423553467, + "learning_rate": 2.489502229164781e-07, + "loss": 0.5429476499557495, + "step": 3208 + }, + { + "epoch": 2.702020202020202, + "grad_norm": 3.343920946121216, + "learning_rate": 2.4840825412499274e-07, + "loss": 0.8423386812210083, + "step": 3210 + }, + { + "epoch": 2.7037037037037037, + "grad_norm": 2.458588123321533, + "learning_rate": 2.478692194253861e-07, + "loss": 0.4965520203113556, + "step": 3212 + }, + { + "epoch": 2.7053872053872055, + "grad_norm": 2.6822140216827393, + "learning_rate": 2.473331206748597e-07, + "loss": 0.6127833127975464, + "step": 3214 + }, + { + "epoch": 2.707070707070707, + "grad_norm": 3.830547571182251, + "learning_rate": 2.467999597204996e-07, + "loss": 0.2938854694366455, + "step": 3216 + }, + { + "epoch": 2.708754208754209, + "grad_norm": 3.668973684310913, + "learning_rate": 2.462697383992691e-07, + "loss": 0.7545672655105591, + "step": 3218 + }, + { + "epoch": 2.71043771043771, + "grad_norm": 4.789590358734131, + "learning_rate": 2.457424585380041e-07, + "loss": 0.3368055820465088, + "step": 3220 + }, + { + "epoch": 2.712121212121212, + "grad_norm": 4.607179641723633, + "learning_rate": 2.4521812195340544e-07, + "loss": 0.7228003144264221, + "step": 3222 + }, + { + "epoch": 2.7138047138047137, + "grad_norm": 3.7761380672454834, + "learning_rate": 2.4469673045203333e-07, + "loss": 0.39306753873825073, + "step": 3224 + }, + { + "epoch": 2.7154882154882154, + "grad_norm": 3.8872487545013428, + "learning_rate": 2.441782858303007e-07, + "loss": 0.388794481754303, + "step": 3226 + }, + { + "epoch": 2.717171717171717, + "grad_norm": 3.936227560043335, + "learning_rate": 2.436627898744678e-07, + "loss": 0.7990210056304932, + "step": 3228 + }, + { + "epoch": 2.718855218855219, + "grad_norm": 10.530872344970703, + "learning_rate": 2.4315024436063464e-07, + "loss": 0.3864361643791199, + "step": 3230 + }, + { + "epoch": 2.7205387205387206, + "grad_norm": 8.344436645507812, + "learning_rate": 2.4264065105473637e-07, + "loss": 0.8147022724151611, + "step": 3232 + }, + { + "epoch": 2.7222222222222223, + "grad_norm": 1.8948400020599365, + "learning_rate": 2.4213401171253656e-07, + "loss": 0.6463346481323242, + "step": 3234 + }, + { + "epoch": 2.723905723905724, + "grad_norm": 2.3045897483825684, + "learning_rate": 2.416303280796206e-07, + "loss": 0.7769128084182739, + "step": 3236 + }, + { + "epoch": 2.725589225589226, + "grad_norm": 10.252862930297852, + "learning_rate": 2.411296018913907e-07, + "loss": 0.7157000303268433, + "step": 3238 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 12.489968299865723, + "learning_rate": 2.406318348730592e-07, + "loss": 0.7306414842605591, + "step": 3240 + }, + { + "epoch": 2.728956228956229, + "grad_norm": 3.00982666015625, + "learning_rate": 2.401370287396428e-07, + "loss": 0.8304033279418945, + "step": 3242 + }, + { + "epoch": 2.7306397306397305, + "grad_norm": 4.058210849761963, + "learning_rate": 2.396451851959571e-07, + "loss": 0.5530973672866821, + "step": 3244 + }, + { + "epoch": 2.7323232323232323, + "grad_norm": 4.974558353424072, + "learning_rate": 2.391563059366099e-07, + "loss": 0.7806906700134277, + "step": 3246 + }, + { + "epoch": 2.734006734006734, + "grad_norm": 10.766674995422363, + "learning_rate": 2.3867039264599587e-07, + "loss": 0.805009126663208, + "step": 3248 + }, + { + "epoch": 2.7356902356902357, + "grad_norm": 4.717216491699219, + "learning_rate": 2.3818744699829105e-07, + "loss": 0.6719311475753784, + "step": 3250 + }, + { + "epoch": 2.7373737373737375, + "grad_norm": 4.689093112945557, + "learning_rate": 2.3770747065744594e-07, + "loss": 0.37460649013519287, + "step": 3252 + }, + { + "epoch": 2.739057239057239, + "grad_norm": 3.905974864959717, + "learning_rate": 2.3723046527718137e-07, + "loss": 0.528462290763855, + "step": 3254 + }, + { + "epoch": 2.7407407407407405, + "grad_norm": 3.8697361946105957, + "learning_rate": 2.367564325009815e-07, + "loss": 0.4876176714897156, + "step": 3256 + }, + { + "epoch": 2.742424242424242, + "grad_norm": 2.9344778060913086, + "learning_rate": 2.362853739620885e-07, + "loss": 0.6226130723953247, + "step": 3258 + }, + { + "epoch": 2.744107744107744, + "grad_norm": 4.8839497566223145, + "learning_rate": 2.3581729128349745e-07, + "loss": 0.4137502908706665, + "step": 3260 + }, + { + "epoch": 2.7457912457912457, + "grad_norm": 2.9513931274414062, + "learning_rate": 2.3535218607795013e-07, + "loss": 0.6418605446815491, + "step": 3262 + }, + { + "epoch": 2.7474747474747474, + "grad_norm": 3.3043465614318848, + "learning_rate": 2.3489005994792948e-07, + "loss": 0.857982337474823, + "step": 3264 + }, + { + "epoch": 2.749158249158249, + "grad_norm": 5.111167907714844, + "learning_rate": 2.3443091448565454e-07, + "loss": 0.958759605884552, + "step": 3266 + }, + { + "epoch": 2.750841750841751, + "grad_norm": 74.0482406616211, + "learning_rate": 2.339747512730749e-07, + "loss": 0.4375573396682739, + "step": 3268 + }, + { + "epoch": 2.7525252525252526, + "grad_norm": 3.2530107498168945, + "learning_rate": 2.3352157188186424e-07, + "loss": 0.9555472135543823, + "step": 3270 + }, + { + "epoch": 2.7542087542087543, + "grad_norm": 44.573936462402344, + "learning_rate": 2.3307137787341667e-07, + "loss": 0.5092712044715881, + "step": 3272 + }, + { + "epoch": 2.755892255892256, + "grad_norm": 2.355350971221924, + "learning_rate": 2.3262417079883986e-07, + "loss": 0.7026905417442322, + "step": 3274 + }, + { + "epoch": 2.757575757575758, + "grad_norm": 13.51882553100586, + "learning_rate": 2.3217995219895016e-07, + "loss": 0.3385421633720398, + "step": 3276 + }, + { + "epoch": 2.7592592592592595, + "grad_norm": 1.3492799997329712, + "learning_rate": 2.317387236042678e-07, + "loss": 0.03149527311325073, + "step": 3278 + }, + { + "epoch": 2.760942760942761, + "grad_norm": 24.129674911499023, + "learning_rate": 2.313004865350109e-07, + "loss": 1.0571789741516113, + "step": 3280 + }, + { + "epoch": 2.7626262626262625, + "grad_norm": 2.9202077388763428, + "learning_rate": 2.3086524250109045e-07, + "loss": 1.0254530906677246, + "step": 3282 + }, + { + "epoch": 2.7643097643097643, + "grad_norm": 10.319761276245117, + "learning_rate": 2.3043299300210528e-07, + "loss": 0.2718232274055481, + "step": 3284 + }, + { + "epoch": 2.765993265993266, + "grad_norm": 4.364471435546875, + "learning_rate": 2.30003739527337e-07, + "loss": 0.7651864290237427, + "step": 3286 + }, + { + "epoch": 2.7676767676767677, + "grad_norm": 5.035273551940918, + "learning_rate": 2.2957748355574408e-07, + "loss": 0.7020351886749268, + "step": 3288 + }, + { + "epoch": 2.7693602693602695, + "grad_norm": 11.138975143432617, + "learning_rate": 2.2915422655595795e-07, + "loss": 0.20551855862140656, + "step": 3290 + }, + { + "epoch": 2.771043771043771, + "grad_norm": 5.818138599395752, + "learning_rate": 2.287339699862771e-07, + "loss": 0.9749652147293091, + "step": 3292 + }, + { + "epoch": 2.7727272727272725, + "grad_norm": 5.8484063148498535, + "learning_rate": 2.2831671529466205e-07, + "loss": 0.7997506260871887, + "step": 3294 + }, + { + "epoch": 2.774410774410774, + "grad_norm": 3.476667642593384, + "learning_rate": 2.2790246391873086e-07, + "loss": 0.8032985925674438, + "step": 3296 + }, + { + "epoch": 2.776094276094276, + "grad_norm": 4.120417594909668, + "learning_rate": 2.2749121728575393e-07, + "loss": 0.23050040006637573, + "step": 3298 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 3.6002514362335205, + "learning_rate": 2.2708297681264874e-07, + "loss": 0.45907649397850037, + "step": 3300 + }, + { + "epoch": 2.7794612794612794, + "grad_norm": 2.618075370788574, + "learning_rate": 2.2667774390597562e-07, + "loss": 0.4696184992790222, + "step": 3302 + }, + { + "epoch": 2.781144781144781, + "grad_norm": 6.530674457550049, + "learning_rate": 2.2627551996193247e-07, + "loss": 0.47576916217803955, + "step": 3304 + }, + { + "epoch": 2.782828282828283, + "grad_norm": 18.45606231689453, + "learning_rate": 2.2587630636634985e-07, + "loss": 0.6657184362411499, + "step": 3306 + }, + { + "epoch": 2.7845117845117846, + "grad_norm": 11.66965389251709, + "learning_rate": 2.2548010449468676e-07, + "loss": 0.48266786336898804, + "step": 3308 + }, + { + "epoch": 2.7861952861952863, + "grad_norm": 2.84804368019104, + "learning_rate": 2.2508691571202528e-07, + "loss": 0.6634323596954346, + "step": 3310 + }, + { + "epoch": 2.787878787878788, + "grad_norm": 3.701871395111084, + "learning_rate": 2.2469674137306627e-07, + "loss": 0.4185872972011566, + "step": 3312 + }, + { + "epoch": 2.78956228956229, + "grad_norm": 2.2695560455322266, + "learning_rate": 2.2430958282212414e-07, + "loss": 0.6932981014251709, + "step": 3314 + }, + { + "epoch": 2.791245791245791, + "grad_norm": 3.9276177883148193, + "learning_rate": 2.239254413931236e-07, + "loss": 0.9720036387443542, + "step": 3316 + }, + { + "epoch": 2.792929292929293, + "grad_norm": 3.183957099914551, + "learning_rate": 2.2354431840959307e-07, + "loss": 0.7453635334968567, + "step": 3318 + }, + { + "epoch": 2.7946127946127945, + "grad_norm": 4.194116115570068, + "learning_rate": 2.2316621518466167e-07, + "loss": 0.3255777359008789, + "step": 3320 + }, + { + "epoch": 2.7962962962962963, + "grad_norm": 5.5670366287231445, + "learning_rate": 2.227911330210542e-07, + "loss": 0.6090131998062134, + "step": 3322 + }, + { + "epoch": 2.797979797979798, + "grad_norm": 2.372026205062866, + "learning_rate": 2.2241907321108638e-07, + "loss": 0.6710550785064697, + "step": 3324 + }, + { + "epoch": 2.7996632996632997, + "grad_norm": 3.636491060256958, + "learning_rate": 2.22050037036661e-07, + "loss": 0.30255502462387085, + "step": 3326 + }, + { + "epoch": 2.8013468013468015, + "grad_norm": 3.7633321285247803, + "learning_rate": 2.216840257692628e-07, + "loss": 0.723252534866333, + "step": 3328 + }, + { + "epoch": 2.8030303030303028, + "grad_norm": 2.568369150161743, + "learning_rate": 2.213210406699547e-07, + "loss": 0.78731769323349, + "step": 3330 + }, + { + "epoch": 2.8047138047138045, + "grad_norm": 3.9559519290924072, + "learning_rate": 2.209610829893729e-07, + "loss": 0.5705679655075073, + "step": 3332 + }, + { + "epoch": 2.8063973063973062, + "grad_norm": 5.107378005981445, + "learning_rate": 2.2060415396772337e-07, + "loss": 0.4503876864910126, + "step": 3334 + }, + { + "epoch": 2.808080808080808, + "grad_norm": 3.7301788330078125, + "learning_rate": 2.2025025483477654e-07, + "loss": 0.5614144802093506, + "step": 3336 + }, + { + "epoch": 2.8097643097643097, + "grad_norm": 3.425426959991455, + "learning_rate": 2.1989938680986382e-07, + "loss": 0.27632904052734375, + "step": 3338 + }, + { + "epoch": 2.8114478114478114, + "grad_norm": 11.55947208404541, + "learning_rate": 2.1955155110187344e-07, + "loss": 0.6297179460525513, + "step": 3340 + }, + { + "epoch": 2.813131313131313, + "grad_norm": 5.041746139526367, + "learning_rate": 2.1920674890924545e-07, + "loss": 0.7801995873451233, + "step": 3342 + }, + { + "epoch": 2.814814814814815, + "grad_norm": 1.9846611022949219, + "learning_rate": 2.1886498141996858e-07, + "loss": 0.3154934346675873, + "step": 3344 + }, + { + "epoch": 2.8164983164983166, + "grad_norm": 3.4041101932525635, + "learning_rate": 2.185262498115759e-07, + "loss": 0.7565585374832153, + "step": 3346 + }, + { + "epoch": 2.8181818181818183, + "grad_norm": 5.533918380737305, + "learning_rate": 2.1819055525113995e-07, + "loss": 0.5513463020324707, + "step": 3348 + }, + { + "epoch": 2.81986531986532, + "grad_norm": 3.816920042037964, + "learning_rate": 2.178578988952698e-07, + "loss": 0.8172674179077148, + "step": 3350 + }, + { + "epoch": 2.821548821548822, + "grad_norm": 4.7206573486328125, + "learning_rate": 2.1752828189010677e-07, + "loss": 0.7926508188247681, + "step": 3352 + }, + { + "epoch": 2.823232323232323, + "grad_norm": 2.8711562156677246, + "learning_rate": 2.1720170537132003e-07, + "loss": 0.7785905599594116, + "step": 3354 + }, + { + "epoch": 2.824915824915825, + "grad_norm": 7.083092212677002, + "learning_rate": 2.16878170464103e-07, + "loss": 0.8117780685424805, + "step": 3356 + }, + { + "epoch": 2.8265993265993266, + "grad_norm": 6.3713178634643555, + "learning_rate": 2.1655767828316967e-07, + "loss": 0.4899190068244934, + "step": 3358 + }, + { + "epoch": 2.8282828282828283, + "grad_norm": 8.093062400817871, + "learning_rate": 2.1624022993275042e-07, + "loss": 0.481950581073761, + "step": 3360 + }, + { + "epoch": 2.82996632996633, + "grad_norm": 3.7031800746917725, + "learning_rate": 2.1592582650658838e-07, + "loss": 0.6889939308166504, + "step": 3362 + }, + { + "epoch": 2.8316498316498318, + "grad_norm": 8.515325546264648, + "learning_rate": 2.1561446908793575e-07, + "loss": 0.5986655950546265, + "step": 3364 + }, + { + "epoch": 2.8333333333333335, + "grad_norm": 2.5616695880889893, + "learning_rate": 2.1530615874954978e-07, + "loss": 0.4613681137561798, + "step": 3366 + }, + { + "epoch": 2.8350168350168348, + "grad_norm": 6.432313919067383, + "learning_rate": 2.1500089655368913e-07, + "loss": 0.35357874631881714, + "step": 3368 + }, + { + "epoch": 2.8367003367003365, + "grad_norm": 5.070071220397949, + "learning_rate": 2.146986835521108e-07, + "loss": 0.815057635307312, + "step": 3370 + }, + { + "epoch": 2.8383838383838382, + "grad_norm": 1.3125436305999756, + "learning_rate": 2.143995207860655e-07, + "loss": 0.6456162929534912, + "step": 3372 + }, + { + "epoch": 2.84006734006734, + "grad_norm": 50.76771545410156, + "learning_rate": 2.1410340928629483e-07, + "loss": 0.29310160875320435, + "step": 3374 + }, + { + "epoch": 2.8417508417508417, + "grad_norm": 2.078246831893921, + "learning_rate": 2.138103500730278e-07, + "loss": 0.851909875869751, + "step": 3376 + }, + { + "epoch": 2.8434343434343434, + "grad_norm": 2.2148220539093018, + "learning_rate": 2.1352034415597635e-07, + "loss": 0.7448092699050903, + "step": 3378 + }, + { + "epoch": 2.845117845117845, + "grad_norm": 2.512826919555664, + "learning_rate": 2.1323339253433309e-07, + "loss": 0.5352383255958557, + "step": 3380 + }, + { + "epoch": 2.846801346801347, + "grad_norm": 5.046896934509277, + "learning_rate": 2.1294949619676717e-07, + "loss": 0.522847056388855, + "step": 3382 + }, + { + "epoch": 2.8484848484848486, + "grad_norm": 4.314877033233643, + "learning_rate": 2.1266865612142064e-07, + "loss": 0.5352615118026733, + "step": 3384 + }, + { + "epoch": 2.8501683501683504, + "grad_norm": 3.3411834239959717, + "learning_rate": 2.1239087327590582e-07, + "loss": 0.7238250970840454, + "step": 3386 + }, + { + "epoch": 2.851851851851852, + "grad_norm": 0.83232581615448, + "learning_rate": 2.121161486173017e-07, + "loss": 0.6121417284011841, + "step": 3388 + }, + { + "epoch": 2.8535353535353534, + "grad_norm": 8.091914176940918, + "learning_rate": 2.1184448309215015e-07, + "loss": 0.4724659025669098, + "step": 3390 + }, + { + "epoch": 2.855218855218855, + "grad_norm": 3.3312911987304688, + "learning_rate": 2.1157587763645322e-07, + "loss": 0.5098093748092651, + "step": 3392 + }, + { + "epoch": 2.856902356902357, + "grad_norm": 5.780312538146973, + "learning_rate": 2.113103331756698e-07, + "loss": 0.9295372366905212, + "step": 3394 + }, + { + "epoch": 2.8585858585858586, + "grad_norm": 2.5686521530151367, + "learning_rate": 2.110478506247122e-07, + "loss": 0.9365147948265076, + "step": 3396 + }, + { + "epoch": 2.8602693602693603, + "grad_norm": 2.75380277633667, + "learning_rate": 2.1078843088794325e-07, + "loss": 0.4805770516395569, + "step": 3398 + }, + { + "epoch": 2.861952861952862, + "grad_norm": 14.623507499694824, + "learning_rate": 2.105320748591732e-07, + "loss": 0.38062724471092224, + "step": 3400 + }, + { + "epoch": 2.8636363636363638, + "grad_norm": 47.26361846923828, + "learning_rate": 2.1027878342165624e-07, + "loss": 0.4569489359855652, + "step": 3402 + }, + { + "epoch": 2.865319865319865, + "grad_norm": 2.116769313812256, + "learning_rate": 2.1002855744808815e-07, + "loss": 0.34320202469825745, + "step": 3404 + }, + { + "epoch": 2.8670033670033668, + "grad_norm": 4.610642910003662, + "learning_rate": 2.0978139780060257e-07, + "loss": 0.7092417478561401, + "step": 3406 + }, + { + "epoch": 2.8686868686868685, + "grad_norm": 4.693014144897461, + "learning_rate": 2.0953730533076862e-07, + "loss": 0.29190459847450256, + "step": 3408 + }, + { + "epoch": 2.8703703703703702, + "grad_norm": 3.3123207092285156, + "learning_rate": 2.0929628087958734e-07, + "loss": 0.7917627692222595, + "step": 3410 + }, + { + "epoch": 2.872053872053872, + "grad_norm": 1.7922461032867432, + "learning_rate": 2.0905832527748953e-07, + "loss": 0.43554821610450745, + "step": 3412 + }, + { + "epoch": 2.8737373737373737, + "grad_norm": 4.745511054992676, + "learning_rate": 2.0882343934433236e-07, + "loss": 0.5983174443244934, + "step": 3414 + }, + { + "epoch": 2.8754208754208754, + "grad_norm": 6.916215896606445, + "learning_rate": 2.085916238893966e-07, + "loss": 0.17676572501659393, + "step": 3416 + }, + { + "epoch": 2.877104377104377, + "grad_norm": 4.048447132110596, + "learning_rate": 2.0836287971138418e-07, + "loss": 0.6077107191085815, + "step": 3418 + }, + { + "epoch": 2.878787878787879, + "grad_norm": 2.5704290866851807, + "learning_rate": 2.0813720759841492e-07, + "loss": 0.4146248400211334, + "step": 3420 + }, + { + "epoch": 2.8804713804713806, + "grad_norm": 5.706145286560059, + "learning_rate": 2.0791460832802423e-07, + "loss": 0.7497705221176147, + "step": 3422 + }, + { + "epoch": 2.8821548821548824, + "grad_norm": 1.7757506370544434, + "learning_rate": 2.0769508266716027e-07, + "loss": 0.5505831241607666, + "step": 3424 + }, + { + "epoch": 2.883838383838384, + "grad_norm": 7.052734851837158, + "learning_rate": 2.0747863137218126e-07, + "loss": 0.6165893077850342, + "step": 3426 + }, + { + "epoch": 2.8855218855218854, + "grad_norm": 5.826257705688477, + "learning_rate": 2.0726525518885308e-07, + "loss": 0.5343178510665894, + "step": 3428 + }, + { + "epoch": 2.887205387205387, + "grad_norm": 8.041903495788574, + "learning_rate": 2.0705495485234653e-07, + "loss": 0.3310260772705078, + "step": 3430 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 15.362848281860352, + "learning_rate": 2.0684773108723455e-07, + "loss": 0.5320956707000732, + "step": 3432 + }, + { + "epoch": 2.8905723905723906, + "grad_norm": 7.592126369476318, + "learning_rate": 2.0664358460749018e-07, + "loss": 0.29516857862472534, + "step": 3434 + }, + { + "epoch": 2.8922558922558923, + "grad_norm": 1.8380248546600342, + "learning_rate": 2.064425161164842e-07, + "loss": 0.9136509895324707, + "step": 3436 + }, + { + "epoch": 2.893939393939394, + "grad_norm": 4.288794994354248, + "learning_rate": 2.0624452630698195e-07, + "loss": 0.8272508382797241, + "step": 3438 + }, + { + "epoch": 2.8956228956228958, + "grad_norm": 3.879866600036621, + "learning_rate": 2.0604961586114163e-07, + "loss": 0.744123101234436, + "step": 3440 + }, + { + "epoch": 2.897306397306397, + "grad_norm": 3.288698196411133, + "learning_rate": 2.0585778545051195e-07, + "loss": 0.8894016742706299, + "step": 3442 + }, + { + "epoch": 2.898989898989899, + "grad_norm": 15.847039222717285, + "learning_rate": 2.0566903573602913e-07, + "loss": 0.2585524320602417, + "step": 3444 + }, + { + "epoch": 2.9006734006734005, + "grad_norm": 4.235921859741211, + "learning_rate": 2.0548336736801548e-07, + "loss": 0.5225664377212524, + "step": 3446 + }, + { + "epoch": 2.9023569023569022, + "grad_norm": 5.334314346313477, + "learning_rate": 2.0530078098617668e-07, + "loss": 1.000659704208374, + "step": 3448 + }, + { + "epoch": 2.904040404040404, + "grad_norm": 13.81791877746582, + "learning_rate": 2.0512127721959954e-07, + "loss": 0.2958747446537018, + "step": 3450 + }, + { + "epoch": 2.9057239057239057, + "grad_norm": 2.8504996299743652, + "learning_rate": 2.0494485668675003e-07, + "loss": 0.5946668386459351, + "step": 3452 + }, + { + "epoch": 2.9074074074074074, + "grad_norm": 30.945682525634766, + "learning_rate": 2.0477151999547137e-07, + "loss": 0.6222255229949951, + "step": 3454 + }, + { + "epoch": 2.909090909090909, + "grad_norm": 2.8661885261535645, + "learning_rate": 2.0460126774298115e-07, + "loss": 0.9090818166732788, + "step": 3456 + }, + { + "epoch": 2.910774410774411, + "grad_norm": 3.6362955570220947, + "learning_rate": 2.044341005158701e-07, + "loss": 0.6454827785491943, + "step": 3458 + }, + { + "epoch": 2.9124579124579126, + "grad_norm": 5.509945392608643, + "learning_rate": 2.042700188900996e-07, + "loss": 0.8902723789215088, + "step": 3460 + }, + { + "epoch": 2.9141414141414144, + "grad_norm": 4.623058795928955, + "learning_rate": 2.0410902343099998e-07, + "loss": 0.9835023283958435, + "step": 3462 + }, + { + "epoch": 2.915824915824916, + "grad_norm": 5.559566020965576, + "learning_rate": 2.039511146932683e-07, + "loss": 0.725146472454071, + "step": 3464 + }, + { + "epoch": 2.9175084175084174, + "grad_norm": 2.3381059169769287, + "learning_rate": 2.0379629322096658e-07, + "loss": 0.8742655515670776, + "step": 3466 + }, + { + "epoch": 2.919191919191919, + "grad_norm": 3.1581509113311768, + "learning_rate": 2.036445595475199e-07, + "loss": 0.5896962881088257, + "step": 3468 + }, + { + "epoch": 2.920875420875421, + "grad_norm": 2.895928382873535, + "learning_rate": 2.0349591419571473e-07, + "loss": 0.08913551270961761, + "step": 3470 + }, + { + "epoch": 2.9225589225589226, + "grad_norm": 3.939779758453369, + "learning_rate": 2.0335035767769674e-07, + "loss": 0.5938529968261719, + "step": 3472 + }, + { + "epoch": 2.9242424242424243, + "grad_norm": 2.6540651321411133, + "learning_rate": 2.032078904949694e-07, + "loss": 0.607816755771637, + "step": 3474 + }, + { + "epoch": 2.925925925925926, + "grad_norm": 11.374692916870117, + "learning_rate": 2.0306851313839217e-07, + "loss": 0.26831308007240295, + "step": 3476 + }, + { + "epoch": 2.9276094276094278, + "grad_norm": 4.051253318786621, + "learning_rate": 2.0293222608817862e-07, + "loss": 0.776150107383728, + "step": 3478 + }, + { + "epoch": 2.929292929292929, + "grad_norm": 6.790820121765137, + "learning_rate": 2.0279902981389491e-07, + "loss": 0.44397690892219543, + "step": 3480 + }, + { + "epoch": 2.930976430976431, + "grad_norm": 4.825781345367432, + "learning_rate": 2.026689247744584e-07, + "loss": 0.7775415182113647, + "step": 3482 + }, + { + "epoch": 2.9326599326599325, + "grad_norm": 3.1354546546936035, + "learning_rate": 2.0254191141813563e-07, + "loss": 0.5349434614181519, + "step": 3484 + }, + { + "epoch": 2.9343434343434343, + "grad_norm": 3.595128059387207, + "learning_rate": 2.0241799018254102e-07, + "loss": 0.6211014986038208, + "step": 3486 + }, + { + "epoch": 2.936026936026936, + "grad_norm": 4.181585311889648, + "learning_rate": 2.0229716149463543e-07, + "loss": 0.6584489345550537, + "step": 3488 + }, + { + "epoch": 2.9377104377104377, + "grad_norm": 5.394354343414307, + "learning_rate": 2.0217942577072447e-07, + "loss": 0.5959441661834717, + "step": 3490 + }, + { + "epoch": 2.9393939393939394, + "grad_norm": 13.857940673828125, + "learning_rate": 2.0206478341645734e-07, + "loss": 0.8532196283340454, + "step": 3492 + }, + { + "epoch": 2.941077441077441, + "grad_norm": 6.366513252258301, + "learning_rate": 2.0195323482682508e-07, + "loss": 0.3821958899497986, + "step": 3494 + }, + { + "epoch": 2.942760942760943, + "grad_norm": 2.0421321392059326, + "learning_rate": 2.0184478038615948e-07, + "loss": 0.7394722700119019, + "step": 3496 + }, + { + "epoch": 2.9444444444444446, + "grad_norm": 4.313158988952637, + "learning_rate": 2.0173942046813191e-07, + "loss": 0.2922773063182831, + "step": 3498 + }, + { + "epoch": 2.9461279461279464, + "grad_norm": 5.628312110900879, + "learning_rate": 2.016371554357515e-07, + "loss": 0.608026385307312, + "step": 3500 + }, + { + "epoch": 2.9478114478114477, + "grad_norm": 10.177474975585938, + "learning_rate": 2.015379856413643e-07, + "loss": 0.684483528137207, + "step": 3502 + }, + { + "epoch": 2.9494949494949494, + "grad_norm": 9.977062225341797, + "learning_rate": 2.01441911426652e-07, + "loss": 0.36152565479278564, + "step": 3504 + }, + { + "epoch": 2.951178451178451, + "grad_norm": 1.5593669414520264, + "learning_rate": 2.013489331226307e-07, + "loss": 0.6608873009681702, + "step": 3506 + }, + { + "epoch": 2.952861952861953, + "grad_norm": 3.423954486846924, + "learning_rate": 2.0125905104964978e-07, + "loss": 0.8101043701171875, + "step": 3508 + }, + { + "epoch": 2.9545454545454546, + "grad_norm": 4.263778209686279, + "learning_rate": 2.0117226551739068e-07, + "loss": 0.7046741247177124, + "step": 3510 + }, + { + "epoch": 2.9562289562289563, + "grad_norm": 3.3937125205993652, + "learning_rate": 2.0108857682486629e-07, + "loss": 0.7705718874931335, + "step": 3512 + }, + { + "epoch": 2.957912457912458, + "grad_norm": 10.03588581085205, + "learning_rate": 2.0100798526041927e-07, + "loss": 0.31763288378715515, + "step": 3514 + }, + { + "epoch": 2.9595959595959593, + "grad_norm": 3.6547443866729736, + "learning_rate": 2.009304911017215e-07, + "loss": 0.8195918202400208, + "step": 3516 + }, + { + "epoch": 2.961279461279461, + "grad_norm": 2.8320508003234863, + "learning_rate": 2.0085609461577295e-07, + "loss": 0.871679425239563, + "step": 3518 + }, + { + "epoch": 2.962962962962963, + "grad_norm": 5.754692554473877, + "learning_rate": 2.0078479605890064e-07, + "loss": 0.3950427770614624, + "step": 3520 + }, + { + "epoch": 2.9646464646464645, + "grad_norm": 3.0160629749298096, + "learning_rate": 2.007165956767584e-07, + "loss": 0.65765380859375, + "step": 3522 + }, + { + "epoch": 2.9663299663299663, + "grad_norm": 5.943231105804443, + "learning_rate": 2.00651493704325e-07, + "loss": 0.2477177381515503, + "step": 3524 + }, + { + "epoch": 2.968013468013468, + "grad_norm": 6.068716049194336, + "learning_rate": 2.0058949036590426e-07, + "loss": 0.8671658039093018, + "step": 3526 + }, + { + "epoch": 2.9696969696969697, + "grad_norm": 2.297165632247925, + "learning_rate": 2.0053058587512378e-07, + "loss": 0.7299938201904297, + "step": 3528 + }, + { + "epoch": 2.9713804713804715, + "grad_norm": 3.451326847076416, + "learning_rate": 2.0047478043493418e-07, + "loss": 0.7638918161392212, + "step": 3530 + }, + { + "epoch": 2.973063973063973, + "grad_norm": 5.721773147583008, + "learning_rate": 2.004220742376088e-07, + "loss": 0.6010457873344421, + "step": 3532 + }, + { + "epoch": 2.974747474747475, + "grad_norm": 11.908121109008789, + "learning_rate": 2.0037246746474277e-07, + "loss": 0.21666747331619263, + "step": 3534 + }, + { + "epoch": 2.9764309764309766, + "grad_norm": 2.7472894191741943, + "learning_rate": 2.0032596028725204e-07, + "loss": 0.828637421131134, + "step": 3536 + }, + { + "epoch": 2.9781144781144784, + "grad_norm": 7.899786949157715, + "learning_rate": 2.0028255286537355e-07, + "loss": 0.4242842197418213, + "step": 3538 + }, + { + "epoch": 2.9797979797979797, + "grad_norm": 2.2358016967773438, + "learning_rate": 2.0024224534866408e-07, + "loss": 0.9581695795059204, + "step": 3540 + }, + { + "epoch": 2.9814814814814814, + "grad_norm": 4.023903846740723, + "learning_rate": 2.0020503787599998e-07, + "loss": 0.8976711630821228, + "step": 3542 + }, + { + "epoch": 2.983164983164983, + "grad_norm": 5.354180812835693, + "learning_rate": 2.001709305755767e-07, + "loss": 0.47080734372138977, + "step": 3544 + }, + { + "epoch": 2.984848484848485, + "grad_norm": 6.203042507171631, + "learning_rate": 2.0013992356490827e-07, + "loss": 0.799166202545166, + "step": 3546 + }, + { + "epoch": 2.9865319865319866, + "grad_norm": 6.4163031578063965, + "learning_rate": 2.0011201695082687e-07, + "loss": 0.30166110396385193, + "step": 3548 + }, + { + "epoch": 2.9882154882154883, + "grad_norm": 9.541460037231445, + "learning_rate": 2.0008721082948243e-07, + "loss": 0.3377661108970642, + "step": 3550 + }, + { + "epoch": 2.98989898989899, + "grad_norm": 12.612906455993652, + "learning_rate": 2.0006550528634258e-07, + "loss": 0.4944566488265991, + "step": 3552 + }, + { + "epoch": 2.9915824915824913, + "grad_norm": 1.854871153831482, + "learning_rate": 2.00046900396192e-07, + "loss": 0.9397309422492981, + "step": 3554 + }, + { + "epoch": 2.993265993265993, + "grad_norm": 2.197124719619751, + "learning_rate": 2.0003139622313241e-07, + "loss": 0.7814288139343262, + "step": 3556 + }, + { + "epoch": 2.994949494949495, + "grad_norm": 2.3128502368927, + "learning_rate": 2.0001899282058216e-07, + "loss": 0.6661207675933838, + "step": 3558 + }, + { + "epoch": 2.9966329966329965, + "grad_norm": 12.201488494873047, + "learning_rate": 2.000096902312762e-07, + "loss": 0.40893661975860596, + "step": 3560 + }, + { + "epoch": 2.9983164983164983, + "grad_norm": 4.00324821472168, + "learning_rate": 2.0000348848726586e-07, + "loss": 0.5416642427444458, + "step": 3562 + }, + { + "epoch": 3.0, + "grad_norm": 11.186657905578613, + "learning_rate": 2.0000038760991877e-07, + "loss": 0.361904501914978, + "step": 3564 + }, + { + "epoch": 3.0, + "step": 3564, + "total_flos": 4.2988160857187287e+18, + "train_loss": 0.7978645538875685, + "train_runtime": 6311.8591, + "train_samples_per_second": 9.034, + "train_steps_per_second": 0.565 + } + ], + "logging_steps": 2, + "max_steps": 3564, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 99999, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.2988160857187287e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}