diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5776 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 1638, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.003663003663003663, + "grad_norm": 2.6953821182250977, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.6832668781280518, + "step": 2 + }, + { + "epoch": 0.007326007326007326, + "grad_norm": 0.3516251742839813, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.6667841672897339, + "step": 4 + }, + { + "epoch": 0.01098901098901099, + "grad_norm": 0.37944722175598145, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.8819491863250732, + "step": 6 + }, + { + "epoch": 0.014652014652014652, + "grad_norm": 0.41177836060523987, + "learning_rate": 2.8000000000000003e-06, + "loss": 2.0589606761932373, + "step": 8 + }, + { + "epoch": 0.018315018315018316, + "grad_norm": 0.226902037858963, + "learning_rate": 3.6000000000000003e-06, + "loss": 2.2131433486938477, + "step": 10 + }, + { + "epoch": 0.02197802197802198, + "grad_norm": 0.6165574193000793, + "learning_rate": 4.4e-06, + "loss": 2.016043186187744, + "step": 12 + }, + { + "epoch": 0.02564102564102564, + "grad_norm": 0.41696810722351074, + "learning_rate": 5.2e-06, + "loss": 1.7563908100128174, + "step": 14 + }, + { + "epoch": 0.029304029304029304, + "grad_norm": 0.20804435014724731, + "learning_rate": 6e-06, + "loss": 1.8174551725387573, + "step": 16 + }, + { + "epoch": 0.03296703296703297, + "grad_norm": 0.6122412085533142, + "learning_rate": 6.800000000000001e-06, + "loss": 1.750252366065979, + "step": 18 + }, + { + "epoch": 0.03663003663003663, + "grad_norm": 0.1671883761882782, + "learning_rate": 7.600000000000001e-06, + "loss": 1.7453185319900513, + "step": 20 + }, + { + "epoch": 0.040293040293040296, + "grad_norm": 0.6752980947494507, + "learning_rate": 8.400000000000001e-06, + "loss": 1.116586685180664, + "step": 22 + }, + { + "epoch": 0.04395604395604396, + "grad_norm": 0.3505832552909851, + "learning_rate": 9.200000000000002e-06, + "loss": 1.2873910665512085, + "step": 24 + }, + { + "epoch": 0.047619047619047616, + "grad_norm": 0.13198299705982208, + "learning_rate": 1e-05, + "loss": 1.4626007080078125, + "step": 26 + }, + { + "epoch": 0.05128205128205128, + "grad_norm": 1.5736981630325317, + "learning_rate": 1.0800000000000002e-05, + "loss": 1.4963958263397217, + "step": 28 + }, + { + "epoch": 0.054945054945054944, + "grad_norm": 0.40237972140312195, + "learning_rate": 1.16e-05, + "loss": 1.1777423620224, + "step": 30 + }, + { + "epoch": 0.05860805860805861, + "grad_norm": 0.6054655909538269, + "learning_rate": 1.2400000000000002e-05, + "loss": 1.109989047050476, + "step": 32 + }, + { + "epoch": 0.06227106227106227, + "grad_norm": 0.07327596098184586, + "learning_rate": 1.3200000000000002e-05, + "loss": 1.2445521354675293, + "step": 34 + }, + { + "epoch": 0.06593406593406594, + "grad_norm": 0.13201776146888733, + "learning_rate": 1.4e-05, + "loss": 1.6131020784378052, + "step": 36 + }, + { + "epoch": 0.0695970695970696, + "grad_norm": 0.24574972689151764, + "learning_rate": 1.48e-05, + "loss": 1.3740808963775635, + "step": 38 + }, + { + "epoch": 0.07326007326007326, + "grad_norm": 0.20254714787006378, + "learning_rate": 1.5600000000000003e-05, + "loss": 1.4367315769195557, + "step": 40 + }, + { + "epoch": 0.07692307692307693, + "grad_norm": 0.1146855428814888, + "learning_rate": 1.64e-05, + "loss": 1.1236861944198608, + "step": 42 + }, + { + "epoch": 0.08058608058608059, + "grad_norm": 0.44093406200408936, + "learning_rate": 1.72e-05, + "loss": 1.5332335233688354, + "step": 44 + }, + { + "epoch": 0.08424908424908426, + "grad_norm": 0.11041630059480667, + "learning_rate": 1.8e-05, + "loss": 1.6215733289718628, + "step": 46 + }, + { + "epoch": 0.08791208791208792, + "grad_norm": 0.6309869289398193, + "learning_rate": 1.88e-05, + "loss": 1.0613853931427002, + "step": 48 + }, + { + "epoch": 0.09157509157509157, + "grad_norm": 0.07759591192007065, + "learning_rate": 1.9600000000000002e-05, + "loss": 1.5116233825683594, + "step": 50 + }, + { + "epoch": 0.09523809523809523, + "grad_norm": 0.2021571397781372, + "learning_rate": 1.999998238790087e-05, + "loss": 0.7548582553863525, + "step": 52 + }, + { + "epoch": 0.0989010989010989, + "grad_norm": 0.448813259601593, + "learning_rate": 1.999984149152137e-05, + "loss": 0.9764630198478699, + "step": 54 + }, + { + "epoch": 0.10256410256410256, + "grad_norm": 1.4041872024536133, + "learning_rate": 1.999955970096814e-05, + "loss": 1.34801185131073, + "step": 56 + }, + { + "epoch": 0.10622710622710622, + "grad_norm": 0.12317265570163727, + "learning_rate": 1.9999137020652663e-05, + "loss": 1.1960068941116333, + "step": 58 + }, + { + "epoch": 0.10989010989010989, + "grad_norm": 0.10297997295856476, + "learning_rate": 1.999857345719207e-05, + "loss": 1.411942481994629, + "step": 60 + }, + { + "epoch": 0.11355311355311355, + "grad_norm": 0.21834629774093628, + "learning_rate": 1.9997869019409047e-05, + "loss": 1.4279382228851318, + "step": 62 + }, + { + "epoch": 0.11721611721611722, + "grad_norm": 0.12077513337135315, + "learning_rate": 1.9997023718331707e-05, + "loss": 1.3897336721420288, + "step": 64 + }, + { + "epoch": 0.12087912087912088, + "grad_norm": 0.26337432861328125, + "learning_rate": 1.9996037567193388e-05, + "loss": 1.3532465696334839, + "step": 66 + }, + { + "epoch": 0.12454212454212454, + "grad_norm": 0.1620335876941681, + "learning_rate": 1.9994910581432466e-05, + "loss": 1.3219798803329468, + "step": 68 + }, + { + "epoch": 0.1282051282051282, + "grad_norm": 0.4770932197570801, + "learning_rate": 1.9993642778692116e-05, + "loss": 1.0979112386703491, + "step": 70 + }, + { + "epoch": 0.13186813186813187, + "grad_norm": 0.11338527500629425, + "learning_rate": 1.999223417882002e-05, + "loss": 1.4318253993988037, + "step": 72 + }, + { + "epoch": 0.13553113553113552, + "grad_norm": 0.1759554147720337, + "learning_rate": 1.9990684803868068e-05, + "loss": 1.5916600227355957, + "step": 74 + }, + { + "epoch": 0.1391941391941392, + "grad_norm": 0.06032824143767357, + "learning_rate": 1.9988994678092007e-05, + "loss": 1.0843533277511597, + "step": 76 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 0.23505143821239471, + "learning_rate": 1.9987163827951077e-05, + "loss": 1.4368324279785156, + "step": 78 + }, + { + "epoch": 0.14652014652014653, + "grad_norm": 0.2501824200153351, + "learning_rate": 1.998519228210756e-05, + "loss": 1.5794371366500854, + "step": 80 + }, + { + "epoch": 0.15018315018315018, + "grad_norm": 0.16015635430812836, + "learning_rate": 1.998308007142638e-05, + "loss": 1.2044681310653687, + "step": 82 + }, + { + "epoch": 0.15384615384615385, + "grad_norm": 0.10342324525117874, + "learning_rate": 1.9980827228974575e-05, + "loss": 1.3625484704971313, + "step": 84 + }, + { + "epoch": 0.1575091575091575, + "grad_norm": 0.3542490005493164, + "learning_rate": 1.997843379002081e-05, + "loss": 1.497490406036377, + "step": 86 + }, + { + "epoch": 0.16117216117216118, + "grad_norm": 0.24796238541603088, + "learning_rate": 1.9975899792034824e-05, + "loss": 0.7302997708320618, + "step": 88 + }, + { + "epoch": 0.16483516483516483, + "grad_norm": 0.12096573412418365, + "learning_rate": 1.9973225274686804e-05, + "loss": 0.8656339645385742, + "step": 90 + }, + { + "epoch": 0.1684981684981685, + "grad_norm": 0.12484420835971832, + "learning_rate": 1.9970410279846816e-05, + "loss": 1.3151899576187134, + "step": 92 + }, + { + "epoch": 0.17216117216117216, + "grad_norm": 0.23127628862857819, + "learning_rate": 1.9967454851584132e-05, + "loss": 1.341341495513916, + "step": 94 + }, + { + "epoch": 0.17582417582417584, + "grad_norm": 0.12265295535326004, + "learning_rate": 1.996435903616651e-05, + "loss": 1.2242141962051392, + "step": 96 + }, + { + "epoch": 0.1794871794871795, + "grad_norm": 0.11659730225801468, + "learning_rate": 1.9961122882059523e-05, + "loss": 1.3279354572296143, + "step": 98 + }, + { + "epoch": 0.18315018315018314, + "grad_norm": 0.1897869110107422, + "learning_rate": 1.9957746439925748e-05, + "loss": 1.177512764930725, + "step": 100 + }, + { + "epoch": 0.18681318681318682, + "grad_norm": 0.1538306474685669, + "learning_rate": 1.9954229762624016e-05, + "loss": 1.2517194747924805, + "step": 102 + }, + { + "epoch": 0.19047619047619047, + "grad_norm": 0.1801384538412094, + "learning_rate": 1.995057290520855e-05, + "loss": 0.9078481793403625, + "step": 104 + }, + { + "epoch": 0.19413919413919414, + "grad_norm": 0.2679578959941864, + "learning_rate": 1.9946775924928132e-05, + "loss": 1.091381311416626, + "step": 106 + }, + { + "epoch": 0.1978021978021978, + "grad_norm": 0.08728894591331482, + "learning_rate": 1.9942838881225183e-05, + "loss": 1.313234806060791, + "step": 108 + }, + { + "epoch": 0.20146520146520147, + "grad_norm": 0.17695370316505432, + "learning_rate": 1.9938761835734842e-05, + "loss": 1.167786717414856, + "step": 110 + }, + { + "epoch": 0.20512820512820512, + "grad_norm": 0.14857837557792664, + "learning_rate": 1.9934544852284013e-05, + "loss": 1.225075602531433, + "step": 112 + }, + { + "epoch": 0.2087912087912088, + "grad_norm": 0.3321186602115631, + "learning_rate": 1.9930187996890347e-05, + "loss": 0.6301332712173462, + "step": 114 + }, + { + "epoch": 0.21245421245421245, + "grad_norm": 0.23063647747039795, + "learning_rate": 1.992569133776121e-05, + "loss": 1.2985056638717651, + "step": 116 + }, + { + "epoch": 0.21611721611721613, + "grad_norm": 0.2007586658000946, + "learning_rate": 1.992105494529264e-05, + "loss": 1.29304039478302, + "step": 118 + }, + { + "epoch": 0.21978021978021978, + "grad_norm": 0.18116247653961182, + "learning_rate": 1.99162788920682e-05, + "loss": 1.363271713256836, + "step": 120 + }, + { + "epoch": 0.22344322344322345, + "grad_norm": 0.5378410220146179, + "learning_rate": 1.9911363252857887e-05, + "loss": 1.2927430868148804, + "step": 122 + }, + { + "epoch": 0.2271062271062271, + "grad_norm": 0.4190959632396698, + "learning_rate": 1.990630810461694e-05, + "loss": 1.0202268362045288, + "step": 124 + }, + { + "epoch": 0.23076923076923078, + "grad_norm": 0.3055667281150818, + "learning_rate": 1.990111352648463e-05, + "loss": 0.894250750541687, + "step": 126 + }, + { + "epoch": 0.23443223443223443, + "grad_norm": 0.6129741668701172, + "learning_rate": 1.9895779599783033e-05, + "loss": 1.1953431367874146, + "step": 128 + }, + { + "epoch": 0.23809523809523808, + "grad_norm": 0.4913721978664398, + "learning_rate": 1.989030640801576e-05, + "loss": 1.2949491739273071, + "step": 130 + }, + { + "epoch": 0.24175824175824176, + "grad_norm": 0.1244189590215683, + "learning_rate": 1.9884694036866624e-05, + "loss": 1.394888997077942, + "step": 132 + }, + { + "epoch": 0.2454212454212454, + "grad_norm": 0.2685690224170685, + "learning_rate": 1.9878942574198334e-05, + "loss": 1.289100170135498, + "step": 134 + }, + { + "epoch": 0.2490842490842491, + "grad_norm": 0.6625173091888428, + "learning_rate": 1.9873052110051094e-05, + "loss": 1.2693977355957031, + "step": 136 + }, + { + "epoch": 0.25274725274725274, + "grad_norm": 0.18069927394390106, + "learning_rate": 1.9867022736641205e-05, + "loss": 1.0891296863555908, + "step": 138 + }, + { + "epoch": 0.2564102564102564, + "grad_norm": 0.17798098921775818, + "learning_rate": 1.9860854548359615e-05, + "loss": 1.2750816345214844, + "step": 140 + }, + { + "epoch": 0.2600732600732601, + "grad_norm": 0.3319156765937805, + "learning_rate": 1.9854547641770446e-05, + "loss": 1.287410855293274, + "step": 142 + }, + { + "epoch": 0.26373626373626374, + "grad_norm": 0.4369660019874573, + "learning_rate": 1.9848102115609483e-05, + "loss": 1.2549277544021606, + "step": 144 + }, + { + "epoch": 0.2673992673992674, + "grad_norm": 0.2410343736410141, + "learning_rate": 1.9841518070782615e-05, + "loss": 1.4124380350112915, + "step": 146 + }, + { + "epoch": 0.27106227106227104, + "grad_norm": 0.15733124315738678, + "learning_rate": 1.983479561036429e-05, + "loss": 1.3137692213058472, + "step": 148 + }, + { + "epoch": 0.27472527472527475, + "grad_norm": 0.07597765326499939, + "learning_rate": 1.982793483959585e-05, + "loss": 0.9285884499549866, + "step": 150 + }, + { + "epoch": 0.2783882783882784, + "grad_norm": 0.07245276868343353, + "learning_rate": 1.9820935865883924e-05, + "loss": 0.6362338662147522, + "step": 152 + }, + { + "epoch": 0.28205128205128205, + "grad_norm": 0.2137291133403778, + "learning_rate": 1.981379879879874e-05, + "loss": 1.0962413549423218, + "step": 154 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.11381687223911285, + "learning_rate": 1.9806523750072385e-05, + "loss": 1.3381097316741943, + "step": 156 + }, + { + "epoch": 0.2893772893772894, + "grad_norm": 0.12019956111907959, + "learning_rate": 1.9799110833597093e-05, + "loss": 1.2722944021224976, + "step": 158 + }, + { + "epoch": 0.29304029304029305, + "grad_norm": 0.10780342668294907, + "learning_rate": 1.9791560165423433e-05, + "loss": 0.9358208179473877, + "step": 160 + }, + { + "epoch": 0.2967032967032967, + "grad_norm": 0.23072299361228943, + "learning_rate": 1.9783871863758503e-05, + "loss": 1.5278652906417847, + "step": 162 + }, + { + "epoch": 0.30036630036630035, + "grad_norm": 0.1233779788017273, + "learning_rate": 1.9776046048964082e-05, + "loss": 1.045573115348816, + "step": 164 + }, + { + "epoch": 0.304029304029304, + "grad_norm": 1.4242370128631592, + "learning_rate": 1.9768082843554737e-05, + "loss": 1.3891277313232422, + "step": 166 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 0.1399042010307312, + "learning_rate": 1.9759982372195918e-05, + "loss": 1.1296571493148804, + "step": 168 + }, + { + "epoch": 0.31135531135531136, + "grad_norm": 0.1742742955684662, + "learning_rate": 1.9751744761701984e-05, + "loss": 1.2520029544830322, + "step": 170 + }, + { + "epoch": 0.315018315018315, + "grad_norm": 0.28852102160453796, + "learning_rate": 1.9743370141034248e-05, + "loss": 1.0042704343795776, + "step": 172 + }, + { + "epoch": 0.31868131868131866, + "grad_norm": 0.1639695167541504, + "learning_rate": 1.973485864129894e-05, + "loss": 0.8751227855682373, + "step": 174 + }, + { + "epoch": 0.32234432234432236, + "grad_norm": 0.12699364125728607, + "learning_rate": 1.9726210395745148e-05, + "loss": 1.3895294666290283, + "step": 176 + }, + { + "epoch": 0.326007326007326, + "grad_norm": 0.23989446461200714, + "learning_rate": 1.971742553976275e-05, + "loss": 0.924419105052948, + "step": 178 + }, + { + "epoch": 0.32967032967032966, + "grad_norm": 0.19326151907444, + "learning_rate": 1.9708504210880284e-05, + "loss": 1.484243631362915, + "step": 180 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.314147412776947, + "learning_rate": 1.969944654876279e-05, + "loss": 0.9775104522705078, + "step": 182 + }, + { + "epoch": 0.336996336996337, + "grad_norm": 0.2284594476222992, + "learning_rate": 1.9690252695209636e-05, + "loss": 1.2366245985031128, + "step": 184 + }, + { + "epoch": 0.34065934065934067, + "grad_norm": 0.9983108639717102, + "learning_rate": 1.9680922794152294e-05, + "loss": 1.3184081315994263, + "step": 186 + }, + { + "epoch": 0.3443223443223443, + "grad_norm": 0.1190100610256195, + "learning_rate": 1.9671456991652072e-05, + "loss": 1.1691346168518066, + "step": 188 + }, + { + "epoch": 0.34798534798534797, + "grad_norm": 0.16565515100955963, + "learning_rate": 1.9661855435897858e-05, + "loss": 1.2617851495742798, + "step": 190 + }, + { + "epoch": 0.3516483516483517, + "grad_norm": 0.35118862986564636, + "learning_rate": 1.9652118277203767e-05, + "loss": 1.1071832180023193, + "step": 192 + }, + { + "epoch": 0.3553113553113553, + "grad_norm": 0.1040225550532341, + "learning_rate": 1.9642245668006814e-05, + "loss": 1.2461092472076416, + "step": 194 + }, + { + "epoch": 0.358974358974359, + "grad_norm": 0.1812414824962616, + "learning_rate": 1.963223776286451e-05, + "loss": 1.2525054216384888, + "step": 196 + }, + { + "epoch": 0.3626373626373626, + "grad_norm": 0.19630086421966553, + "learning_rate": 1.9622094718452448e-05, + "loss": 0.8609032034873962, + "step": 198 + }, + { + "epoch": 0.3663003663003663, + "grad_norm": 0.2004232257604599, + "learning_rate": 1.9611816693561858e-05, + "loss": 1.0185203552246094, + "step": 200 + }, + { + "epoch": 0.36996336996337, + "grad_norm": 0.2183266133069992, + "learning_rate": 1.96014038490971e-05, + "loss": 1.4265942573547363, + "step": 202 + }, + { + "epoch": 0.37362637362637363, + "grad_norm": 0.47130584716796875, + "learning_rate": 1.9590856348073182e-05, + "loss": 1.204400897026062, + "step": 204 + }, + { + "epoch": 0.3772893772893773, + "grad_norm": 0.5094867944717407, + "learning_rate": 1.9580174355613168e-05, + "loss": 0.7685510516166687, + "step": 206 + }, + { + "epoch": 0.38095238095238093, + "grad_norm": 0.18307189643383026, + "learning_rate": 1.9569358038945617e-05, + "loss": 1.135070562362671, + "step": 208 + }, + { + "epoch": 0.38461538461538464, + "grad_norm": 0.3276787996292114, + "learning_rate": 1.9558407567401945e-05, + "loss": 1.409148097038269, + "step": 210 + }, + { + "epoch": 0.3882783882783883, + "grad_norm": 0.2551312744617462, + "learning_rate": 1.9547323112413806e-05, + "loss": 1.0703577995300293, + "step": 212 + }, + { + "epoch": 0.39194139194139194, + "grad_norm": 0.4214024841785431, + "learning_rate": 1.9536104847510384e-05, + "loss": 1.1370527744293213, + "step": 214 + }, + { + "epoch": 0.3956043956043956, + "grad_norm": 0.18482302129268646, + "learning_rate": 1.9524752948315677e-05, + "loss": 1.2194429636001587, + "step": 216 + }, + { + "epoch": 0.3992673992673993, + "grad_norm": 0.1274556815624237, + "learning_rate": 1.9513267592545752e-05, + "loss": 1.2507331371307373, + "step": 218 + }, + { + "epoch": 0.40293040293040294, + "grad_norm": 0.11079567670822144, + "learning_rate": 1.9501648960005964e-05, + "loss": 0.6284658908843994, + "step": 220 + }, + { + "epoch": 0.4065934065934066, + "grad_norm": 0.1624489277601242, + "learning_rate": 1.948989723258815e-05, + "loss": 1.3324811458587646, + "step": 222 + }, + { + "epoch": 0.41025641025641024, + "grad_norm": 0.2038378268480301, + "learning_rate": 1.9478012594267757e-05, + "loss": 1.0770931243896484, + "step": 224 + }, + { + "epoch": 0.4139194139194139, + "grad_norm": 0.18007361888885498, + "learning_rate": 1.946599523110099e-05, + "loss": 1.2338424921035767, + "step": 226 + }, + { + "epoch": 0.4175824175824176, + "grad_norm": 0.24560625851154327, + "learning_rate": 1.945384533122187e-05, + "loss": 1.3044496774673462, + "step": 228 + }, + { + "epoch": 0.42124542124542125, + "grad_norm": 0.2072787582874298, + "learning_rate": 1.9441563084839324e-05, + "loss": 1.233733057975769, + "step": 230 + }, + { + "epoch": 0.4249084249084249, + "grad_norm": 0.39441704750061035, + "learning_rate": 1.942914868423417e-05, + "loss": 0.997710645198822, + "step": 232 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 0.30978092551231384, + "learning_rate": 1.941660232375614e-05, + "loss": 1.4850120544433594, + "step": 234 + }, + { + "epoch": 0.43223443223443225, + "grad_norm": 0.4644772708415985, + "learning_rate": 1.9403924199820813e-05, + "loss": 1.0170940160751343, + "step": 236 + }, + { + "epoch": 0.4358974358974359, + "grad_norm": 0.147545725107193, + "learning_rate": 1.9391114510906546e-05, + "loss": 1.0712804794311523, + "step": 238 + }, + { + "epoch": 0.43956043956043955, + "grad_norm": 0.23715418577194214, + "learning_rate": 1.937817345755138e-05, + "loss": 0.9054774641990662, + "step": 240 + }, + { + "epoch": 0.4432234432234432, + "grad_norm": 0.20748640596866608, + "learning_rate": 1.9365101242349883e-05, + "loss": 0.8780495524406433, + "step": 242 + }, + { + "epoch": 0.4468864468864469, + "grad_norm": 0.1142963096499443, + "learning_rate": 1.9351898069949985e-05, + "loss": 0.5809678435325623, + "step": 244 + }, + { + "epoch": 0.45054945054945056, + "grad_norm": 0.16483286023139954, + "learning_rate": 1.9338564147049785e-05, + "loss": 1.2516872882843018, + "step": 246 + }, + { + "epoch": 0.4542124542124542, + "grad_norm": 0.31650102138519287, + "learning_rate": 1.9325099682394296e-05, + "loss": 0.8757045269012451, + "step": 248 + }, + { + "epoch": 0.45787545787545786, + "grad_norm": 0.5607954263687134, + "learning_rate": 1.9311504886772183e-05, + "loss": 1.2788811922073364, + "step": 250 + }, + { + "epoch": 0.46153846153846156, + "grad_norm": 0.23190627992153168, + "learning_rate": 1.929777997301248e-05, + "loss": 1.1750297546386719, + "step": 252 + }, + { + "epoch": 0.4652014652014652, + "grad_norm": 0.18342120945453644, + "learning_rate": 1.9283925155981228e-05, + "loss": 0.9642306566238403, + "step": 254 + }, + { + "epoch": 0.46886446886446886, + "grad_norm": 0.20935122668743134, + "learning_rate": 1.9269940652578143e-05, + "loss": 1.2557034492492676, + "step": 256 + }, + { + "epoch": 0.4725274725274725, + "grad_norm": 0.26344409584999084, + "learning_rate": 1.9255826681733194e-05, + "loss": 1.2821006774902344, + "step": 258 + }, + { + "epoch": 0.47619047619047616, + "grad_norm": 0.08390780538320541, + "learning_rate": 1.924158346440319e-05, + "loss": 0.7510356307029724, + "step": 260 + }, + { + "epoch": 0.47985347985347987, + "grad_norm": 0.13334789872169495, + "learning_rate": 1.9227211223568317e-05, + "loss": 1.1425384283065796, + "step": 262 + }, + { + "epoch": 0.4835164835164835, + "grad_norm": 0.292468786239624, + "learning_rate": 1.9212710184228654e-05, + "loss": 1.2262020111083984, + "step": 264 + }, + { + "epoch": 0.48717948717948717, + "grad_norm": 0.25990208983421326, + "learning_rate": 1.9198080573400634e-05, + "loss": 1.493544578552246, + "step": 266 + }, + { + "epoch": 0.4908424908424908, + "grad_norm": 1.0916262865066528, + "learning_rate": 1.9183322620113505e-05, + "loss": 0.8011191487312317, + "step": 268 + }, + { + "epoch": 0.4945054945054945, + "grad_norm": 0.18405258655548096, + "learning_rate": 1.916843655540574e-05, + "loss": 1.1977633237838745, + "step": 270 + }, + { + "epoch": 0.4981684981684982, + "grad_norm": 0.808682918548584, + "learning_rate": 1.915342261232142e-05, + "loss": 0.8853522539138794, + "step": 272 + }, + { + "epoch": 0.5018315018315018, + "grad_norm": 0.27525171637535095, + "learning_rate": 1.913828102590659e-05, + "loss": 1.246770977973938, + "step": 274 + }, + { + "epoch": 0.5054945054945055, + "grad_norm": 0.2584117352962494, + "learning_rate": 1.9123012033205564e-05, + "loss": 0.8163619041442871, + "step": 276 + }, + { + "epoch": 0.5091575091575091, + "grad_norm": 0.24883966147899628, + "learning_rate": 1.9107615873257234e-05, + "loss": 0.8779458403587341, + "step": 278 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 0.3886781930923462, + "learning_rate": 1.909209278709131e-05, + "loss": 1.2527726888656616, + "step": 280 + }, + { + "epoch": 0.5164835164835165, + "grad_norm": 0.29291361570358276, + "learning_rate": 1.9076443017724568e-05, + "loss": 1.2405891418457031, + "step": 282 + }, + { + "epoch": 0.5201465201465202, + "grad_norm": 0.21082301437854767, + "learning_rate": 1.9060666810157025e-05, + "loss": 1.2477757930755615, + "step": 284 + }, + { + "epoch": 0.5238095238095238, + "grad_norm": 0.9164182543754578, + "learning_rate": 1.9044764411368106e-05, + "loss": 1.0075207948684692, + "step": 286 + }, + { + "epoch": 0.5274725274725275, + "grad_norm": 0.19336040318012238, + "learning_rate": 1.9028736070312796e-05, + "loss": 1.2511680126190186, + "step": 288 + }, + { + "epoch": 0.5311355311355311, + "grad_norm": 0.3687734305858612, + "learning_rate": 1.9012582037917713e-05, + "loss": 1.2166516780853271, + "step": 290 + }, + { + "epoch": 0.5347985347985348, + "grad_norm": 0.13847005367279053, + "learning_rate": 1.8996302567077217e-05, + "loss": 0.7319579720497131, + "step": 292 + }, + { + "epoch": 0.5384615384615384, + "grad_norm": 0.03533535450696945, + "learning_rate": 1.897989791264941e-05, + "loss": 0.947640061378479, + "step": 294 + }, + { + "epoch": 0.5421245421245421, + "grad_norm": 0.15593798458576202, + "learning_rate": 1.8963368331452172e-05, + "loss": 1.0241395235061646, + "step": 296 + }, + { + "epoch": 0.5457875457875457, + "grad_norm": 0.6071200370788574, + "learning_rate": 1.8946714082259145e-05, + "loss": 1.2927855253219604, + "step": 298 + }, + { + "epoch": 0.5494505494505495, + "grad_norm": 0.22545884549617767, + "learning_rate": 1.8929935425795655e-05, + "loss": 1.2249903678894043, + "step": 300 + }, + { + "epoch": 0.5531135531135531, + "grad_norm": 0.2020682394504547, + "learning_rate": 1.8913032624734657e-05, + "loss": 1.1927720308303833, + "step": 302 + }, + { + "epoch": 0.5567765567765568, + "grad_norm": 0.15021179616451263, + "learning_rate": 1.8896005943692614e-05, + "loss": 0.9835414886474609, + "step": 304 + }, + { + "epoch": 0.5604395604395604, + "grad_norm": 0.20427605509757996, + "learning_rate": 1.8878855649225346e-05, + "loss": 0.9726413488388062, + "step": 306 + }, + { + "epoch": 0.5641025641025641, + "grad_norm": 0.3385741412639618, + "learning_rate": 1.8861582009823868e-05, + "loss": 1.4145900011062622, + "step": 308 + }, + { + "epoch": 0.5677655677655677, + "grad_norm": 1.1410894393920898, + "learning_rate": 1.884418529591018e-05, + "loss": 0.9968710541725159, + "step": 310 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.13543498516082764, + "learning_rate": 1.882666577983304e-05, + "loss": 1.2110992670059204, + "step": 312 + }, + { + "epoch": 0.575091575091575, + "grad_norm": 0.8556830883026123, + "learning_rate": 1.8809023735863693e-05, + "loss": 1.139275074005127, + "step": 314 + }, + { + "epoch": 0.5787545787545788, + "grad_norm": 0.20495963096618652, + "learning_rate": 1.879125944019158e-05, + "loss": 1.282318353652954, + "step": 316 + }, + { + "epoch": 0.5824175824175825, + "grad_norm": 0.3021373748779297, + "learning_rate": 1.8773373170920022e-05, + "loss": 1.1240531206130981, + "step": 318 + }, + { + "epoch": 0.5860805860805861, + "grad_norm": 0.15304189920425415, + "learning_rate": 1.875536520806185e-05, + "loss": 1.3365286588668823, + "step": 320 + }, + { + "epoch": 0.5897435897435898, + "grad_norm": 0.35537189245224, + "learning_rate": 1.8737235833535033e-05, + "loss": 1.522202491760254, + "step": 322 + }, + { + "epoch": 0.5934065934065934, + "grad_norm": 0.18663744628429413, + "learning_rate": 1.871898533115827e-05, + "loss": 1.2638378143310547, + "step": 324 + }, + { + "epoch": 0.5970695970695971, + "grad_norm": 0.38809436559677124, + "learning_rate": 1.870061398664653e-05, + "loss": 1.3586820363998413, + "step": 326 + }, + { + "epoch": 0.6007326007326007, + "grad_norm": 0.17811237275600433, + "learning_rate": 1.868212208760658e-05, + "loss": 1.2239891290664673, + "step": 328 + }, + { + "epoch": 0.6043956043956044, + "grad_norm": 0.21789351105690002, + "learning_rate": 1.8663509923532514e-05, + "loss": 1.1124786138534546, + "step": 330 + }, + { + "epoch": 0.608058608058608, + "grad_norm": 0.1282075196504593, + "learning_rate": 1.8644777785801175e-05, + "loss": 1.1825826168060303, + "step": 332 + }, + { + "epoch": 0.6117216117216118, + "grad_norm": 0.13316233456134796, + "learning_rate": 1.862592596766763e-05, + "loss": 1.2802841663360596, + "step": 334 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 0.8452423214912415, + "learning_rate": 1.8606954764260556e-05, + "loss": 0.9047472476959229, + "step": 336 + }, + { + "epoch": 0.6190476190476191, + "grad_norm": 1.1467108726501465, + "learning_rate": 1.8587864472577632e-05, + "loss": 1.2362874746322632, + "step": 338 + }, + { + "epoch": 0.6227106227106227, + "grad_norm": 0.36012572050094604, + "learning_rate": 1.8568655391480882e-05, + "loss": 1.2298355102539062, + "step": 340 + }, + { + "epoch": 0.6263736263736264, + "grad_norm": 0.402875155210495, + "learning_rate": 1.8549327821692008e-05, + "loss": 0.5815396904945374, + "step": 342 + }, + { + "epoch": 0.63003663003663, + "grad_norm": 0.15672507882118225, + "learning_rate": 1.852988206578767e-05, + "loss": 1.4353272914886475, + "step": 344 + }, + { + "epoch": 0.6336996336996337, + "grad_norm": 0.48711514472961426, + "learning_rate": 1.851031842819475e-05, + "loss": 0.6951726675033569, + "step": 346 + }, + { + "epoch": 0.6373626373626373, + "grad_norm": 0.18968786299228668, + "learning_rate": 1.849063721518559e-05, + "loss": 1.1487343311309814, + "step": 348 + }, + { + "epoch": 0.6410256410256411, + "grad_norm": 0.28493592143058777, + "learning_rate": 1.8470838734873205e-05, + "loss": 0.8871045112609863, + "step": 350 + }, + { + "epoch": 0.6446886446886447, + "grad_norm": 0.1983412355184555, + "learning_rate": 1.8450923297206446e-05, + "loss": 0.9253445267677307, + "step": 352 + }, + { + "epoch": 0.6483516483516484, + "grad_norm": 0.37879106402397156, + "learning_rate": 1.8430891213965146e-05, + "loss": 0.9548962712287903, + "step": 354 + }, + { + "epoch": 0.652014652014652, + "grad_norm": 0.3444487750530243, + "learning_rate": 1.8410742798755255e-05, + "loss": 1.1662979125976562, + "step": 356 + }, + { + "epoch": 0.6556776556776557, + "grad_norm": 0.3182324171066284, + "learning_rate": 1.8390478367003922e-05, + "loss": 1.1501445770263672, + "step": 358 + }, + { + "epoch": 0.6593406593406593, + "grad_norm": 0.3570864796638489, + "learning_rate": 1.8370098235954553e-05, + "loss": 0.6915228366851807, + "step": 360 + }, + { + "epoch": 0.663003663003663, + "grad_norm": 0.25218188762664795, + "learning_rate": 1.834960272466184e-05, + "loss": 0.9469904899597168, + "step": 362 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.2885875105857849, + "learning_rate": 1.832899215398679e-05, + "loss": 0.9307279586791992, + "step": 364 + }, + { + "epoch": 0.6703296703296703, + "grad_norm": 0.3014400005340576, + "learning_rate": 1.8308266846591673e-05, + "loss": 1.1860599517822266, + "step": 366 + }, + { + "epoch": 0.673992673992674, + "grad_norm": 0.22383049130439758, + "learning_rate": 1.828742712693499e-05, + "loss": 1.05683434009552, + "step": 368 + }, + { + "epoch": 0.6776556776556777, + "grad_norm": 0.26446428894996643, + "learning_rate": 1.8266473321266385e-05, + "loss": 1.0885175466537476, + "step": 370 + }, + { + "epoch": 0.6813186813186813, + "grad_norm": 0.511090874671936, + "learning_rate": 1.824540575762154e-05, + "loss": 1.189785122871399, + "step": 372 + }, + { + "epoch": 0.684981684981685, + "grad_norm": 0.3425454795360565, + "learning_rate": 1.8224224765817033e-05, + "loss": 1.1994352340698242, + "step": 374 + }, + { + "epoch": 0.6886446886446886, + "grad_norm": 0.2739914655685425, + "learning_rate": 1.820293067744519e-05, + "loss": 0.9016158580780029, + "step": 376 + }, + { + "epoch": 0.6923076923076923, + "grad_norm": 0.42973825335502625, + "learning_rate": 1.8181523825868882e-05, + "loss": 0.8308103084564209, + "step": 378 + }, + { + "epoch": 0.6959706959706959, + "grad_norm": 0.6218528747558594, + "learning_rate": 1.816000454621631e-05, + "loss": 1.0427895784378052, + "step": 380 + }, + { + "epoch": 0.6996336996336996, + "grad_norm": 0.14869005978107452, + "learning_rate": 1.8138373175375744e-05, + "loss": 0.9768989086151123, + "step": 382 + }, + { + "epoch": 0.7032967032967034, + "grad_norm": 0.23566092550754547, + "learning_rate": 1.8116630051990283e-05, + "loss": 1.1809189319610596, + "step": 384 + }, + { + "epoch": 0.706959706959707, + "grad_norm": 1.1532787084579468, + "learning_rate": 1.8094775516452522e-05, + "loss": 1.0936170816421509, + "step": 386 + }, + { + "epoch": 0.7106227106227107, + "grad_norm": 0.19187268614768982, + "learning_rate": 1.807280991089923e-05, + "loss": 0.8833699226379395, + "step": 388 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 0.15410183370113373, + "learning_rate": 1.8050733579206005e-05, + "loss": 1.0913102626800537, + "step": 390 + }, + { + "epoch": 0.717948717948718, + "grad_norm": 0.42997273802757263, + "learning_rate": 1.8028546866981875e-05, + "loss": 1.175827980041504, + "step": 392 + }, + { + "epoch": 0.7216117216117216, + "grad_norm": 0.49263566732406616, + "learning_rate": 1.8006250121563903e-05, + "loss": 1.118180513381958, + "step": 394 + }, + { + "epoch": 0.7252747252747253, + "grad_norm": 0.24720455706119537, + "learning_rate": 1.798384369201174e-05, + "loss": 1.2381089925765991, + "step": 396 + }, + { + "epoch": 0.7289377289377289, + "grad_norm": 0.2054165154695511, + "learning_rate": 1.796132792910216e-05, + "loss": 0.9162780046463013, + "step": 398 + }, + { + "epoch": 0.7326007326007326, + "grad_norm": 0.2705683410167694, + "learning_rate": 1.7938703185323575e-05, + "loss": 0.8435118198394775, + "step": 400 + }, + { + "epoch": 0.7362637362637363, + "grad_norm": 0.18964652717113495, + "learning_rate": 1.7915969814870508e-05, + "loss": 1.2435356378555298, + "step": 402 + }, + { + "epoch": 0.73992673992674, + "grad_norm": 0.10768619179725647, + "learning_rate": 1.789312817363805e-05, + "loss": 0.8427106738090515, + "step": 404 + }, + { + "epoch": 0.7435897435897436, + "grad_norm": 0.6611467599868774, + "learning_rate": 1.7870178619216304e-05, + "loss": 1.0203189849853516, + "step": 406 + }, + { + "epoch": 0.7472527472527473, + "grad_norm": 0.6479913592338562, + "learning_rate": 1.784712151088476e-05, + "loss": 1.0223674774169922, + "step": 408 + }, + { + "epoch": 0.7509157509157509, + "grad_norm": 0.5998079776763916, + "learning_rate": 1.782395720960669e-05, + "loss": 0.8774959444999695, + "step": 410 + }, + { + "epoch": 0.7545787545787546, + "grad_norm": 0.31854361295700073, + "learning_rate": 1.780068607802349e-05, + "loss": 1.1714166402816772, + "step": 412 + }, + { + "epoch": 0.7582417582417582, + "grad_norm": 0.4224836528301239, + "learning_rate": 1.7777308480449006e-05, + "loss": 1.010443091392517, + "step": 414 + }, + { + "epoch": 0.7619047619047619, + "grad_norm": 0.16301847994327545, + "learning_rate": 1.7753824782863827e-05, + "loss": 1.2712624073028564, + "step": 416 + }, + { + "epoch": 0.7655677655677655, + "grad_norm": 0.1754055619239807, + "learning_rate": 1.773023535290956e-05, + "loss": 0.6752095222473145, + "step": 418 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 0.53148353099823, + "learning_rate": 1.7706540559883066e-05, + "loss": 1.2383452653884888, + "step": 420 + }, + { + "epoch": 0.7728937728937729, + "grad_norm": 0.21067863702774048, + "learning_rate": 1.7682740774730688e-05, + "loss": 0.9969592094421387, + "step": 422 + }, + { + "epoch": 0.7765567765567766, + "grad_norm": 0.286111444234848, + "learning_rate": 1.7658836370042443e-05, + "loss": 0.49340909719467163, + "step": 424 + }, + { + "epoch": 0.7802197802197802, + "grad_norm": 0.48605793714523315, + "learning_rate": 1.7634827720046178e-05, + "loss": 0.8091546893119812, + "step": 426 + }, + { + "epoch": 0.7838827838827839, + "grad_norm": 0.5086367130279541, + "learning_rate": 1.7610715200601727e-05, + "loss": 1.0666477680206299, + "step": 428 + }, + { + "epoch": 0.7875457875457875, + "grad_norm": 0.3185122311115265, + "learning_rate": 1.7586499189195016e-05, + "loss": 1.2098157405853271, + "step": 430 + }, + { + "epoch": 0.7912087912087912, + "grad_norm": 0.23432797193527222, + "learning_rate": 1.7562180064932158e-05, + "loss": 1.291565179824829, + "step": 432 + }, + { + "epoch": 0.7948717948717948, + "grad_norm": 0.19452722370624542, + "learning_rate": 1.7537758208533516e-05, + "loss": 0.8848323225975037, + "step": 434 + }, + { + "epoch": 0.7985347985347986, + "grad_norm": 0.24718429148197174, + "learning_rate": 1.7513234002327738e-05, + "loss": 0.9670489430427551, + "step": 436 + }, + { + "epoch": 0.8021978021978022, + "grad_norm": 1.831937313079834, + "learning_rate": 1.748860783024579e-05, + "loss": 0.8808807134628296, + "step": 438 + }, + { + "epoch": 0.8058608058608059, + "grad_norm": 0.23156145215034485, + "learning_rate": 1.746388007781492e-05, + "loss": 1.3112839460372925, + "step": 440 + }, + { + "epoch": 0.8095238095238095, + "grad_norm": 0.1498938649892807, + "learning_rate": 1.7439051132152644e-05, + "loss": 1.2047144174575806, + "step": 442 + }, + { + "epoch": 0.8131868131868132, + "grad_norm": 0.21056711673736572, + "learning_rate": 1.741412138196067e-05, + "loss": 1.2016336917877197, + "step": 444 + }, + { + "epoch": 0.8168498168498168, + "grad_norm": 0.5614007115364075, + "learning_rate": 1.738909121751882e-05, + "loss": 1.227624773979187, + "step": 446 + }, + { + "epoch": 0.8205128205128205, + "grad_norm": 0.5933297872543335, + "learning_rate": 1.736396103067893e-05, + "loss": 1.2266818284988403, + "step": 448 + }, + { + "epoch": 0.8241758241758241, + "grad_norm": 0.2088741809129715, + "learning_rate": 1.7338731214858688e-05, + "loss": 1.3620444536209106, + "step": 450 + }, + { + "epoch": 0.8278388278388278, + "grad_norm": 0.30190014839172363, + "learning_rate": 1.7313402165035504e-05, + "loss": 0.9975396990776062, + "step": 452 + }, + { + "epoch": 0.8315018315018315, + "grad_norm": 0.07316415756940842, + "learning_rate": 1.728797427774031e-05, + "loss": 0.49027442932128906, + "step": 454 + }, + { + "epoch": 0.8351648351648352, + "grad_norm": 0.08831637352705002, + "learning_rate": 1.7262447951051366e-05, + "loss": 0.9005019664764404, + "step": 456 + }, + { + "epoch": 0.8388278388278388, + "grad_norm": 0.23266173899173737, + "learning_rate": 1.7236823584587995e-05, + "loss": 0.8408886790275574, + "step": 458 + }, + { + "epoch": 0.8424908424908425, + "grad_norm": 0.7009871006011963, + "learning_rate": 1.7211101579504382e-05, + "loss": 1.0286332368850708, + "step": 460 + }, + { + "epoch": 0.8461538461538461, + "grad_norm": 0.4962588846683502, + "learning_rate": 1.7185282338483243e-05, + "loss": 1.2270429134368896, + "step": 462 + }, + { + "epoch": 0.8498168498168498, + "grad_norm": 0.3354407548904419, + "learning_rate": 1.7159366265729537e-05, + "loss": 1.1797465085983276, + "step": 464 + }, + { + "epoch": 0.8534798534798534, + "grad_norm": 0.07982466369867325, + "learning_rate": 1.713335376696416e-05, + "loss": 1.201880693435669, + "step": 466 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.6047555208206177, + "learning_rate": 1.7107245249417556e-05, + "loss": 0.8821382522583008, + "step": 468 + }, + { + "epoch": 0.8608058608058609, + "grad_norm": 0.7960102558135986, + "learning_rate": 1.7081041121823375e-05, + "loss": 0.9151496887207031, + "step": 470 + }, + { + "epoch": 0.8644688644688645, + "grad_norm": 0.1812516152858734, + "learning_rate": 1.705474179441205e-05, + "loss": 1.1692742109298706, + "step": 472 + }, + { + "epoch": 0.8681318681318682, + "grad_norm": 0.09980528056621552, + "learning_rate": 1.7028347678904388e-05, + "loss": 0.8694528937339783, + "step": 474 + }, + { + "epoch": 0.8717948717948718, + "grad_norm": 0.34480732679367065, + "learning_rate": 1.700185918850512e-05, + "loss": 1.0994765758514404, + "step": 476 + }, + { + "epoch": 0.8754578754578755, + "grad_norm": 0.35163024067878723, + "learning_rate": 1.6975276737896443e-05, + "loss": 1.0502619743347168, + "step": 478 + }, + { + "epoch": 0.8791208791208791, + "grad_norm": 0.42012107372283936, + "learning_rate": 1.69486007432315e-05, + "loss": 1.0646753311157227, + "step": 480 + }, + { + "epoch": 0.8827838827838828, + "grad_norm": 0.39207834005355835, + "learning_rate": 1.6921831622127905e-05, + "loss": 1.1882476806640625, + "step": 482 + }, + { + "epoch": 0.8864468864468864, + "grad_norm": 0.34004685282707214, + "learning_rate": 1.6894969793661163e-05, + "loss": 1.263951063156128, + "step": 484 + }, + { + "epoch": 0.8901098901098901, + "grad_norm": 0.34993913769721985, + "learning_rate": 1.686801567835814e-05, + "loss": 0.902937650680542, + "step": 486 + }, + { + "epoch": 0.8937728937728938, + "grad_norm": 0.5401197075843811, + "learning_rate": 1.6840969698190467e-05, + "loss": 1.1535415649414062, + "step": 488 + }, + { + "epoch": 0.8974358974358975, + "grad_norm": 0.23730972409248352, + "learning_rate": 1.6813832276567942e-05, + "loss": 1.1057976484298706, + "step": 490 + }, + { + "epoch": 0.9010989010989011, + "grad_norm": 0.32215645909309387, + "learning_rate": 1.6786603838331894e-05, + "loss": 1.048466444015503, + "step": 492 + }, + { + "epoch": 0.9047619047619048, + "grad_norm": 0.22649593651294708, + "learning_rate": 1.6759284809748522e-05, + "loss": 0.5831862092018127, + "step": 494 + }, + { + "epoch": 0.9084249084249084, + "grad_norm": 0.2719734311103821, + "learning_rate": 1.673187561850225e-05, + "loss": 1.2790480852127075, + "step": 496 + }, + { + "epoch": 0.9120879120879121, + "grad_norm": 0.11034264415502548, + "learning_rate": 1.6704376693689003e-05, + "loss": 1.1298277378082275, + "step": 498 + }, + { + "epoch": 0.9157509157509157, + "grad_norm": 0.21742600202560425, + "learning_rate": 1.6676788465809506e-05, + "loss": 0.8114557862281799, + "step": 500 + }, + { + "epoch": 0.9194139194139194, + "grad_norm": 0.05262337997555733, + "learning_rate": 1.6649111366762552e-05, + "loss": 0.8611850738525391, + "step": 502 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 0.3120154142379761, + "learning_rate": 1.66213458298382e-05, + "loss": 0.9393185973167419, + "step": 504 + }, + { + "epoch": 0.9267399267399268, + "grad_norm": 0.21198345720767975, + "learning_rate": 1.659349228971105e-05, + "loss": 0.8434366583824158, + "step": 506 + }, + { + "epoch": 0.9304029304029304, + "grad_norm": 0.2922159731388092, + "learning_rate": 1.6565551182433382e-05, + "loss": 1.1591928005218506, + "step": 508 + }, + { + "epoch": 0.9340659340659341, + "grad_norm": 0.273187518119812, + "learning_rate": 1.6537522945428386e-05, + "loss": 1.2172907590866089, + "step": 510 + }, + { + "epoch": 0.9377289377289377, + "grad_norm": 0.3839268088340759, + "learning_rate": 1.6509408017483258e-05, + "loss": 1.1717743873596191, + "step": 512 + }, + { + "epoch": 0.9413919413919414, + "grad_norm": 0.1463819146156311, + "learning_rate": 1.6481206838742362e-05, + "loss": 0.9574457406997681, + "step": 514 + }, + { + "epoch": 0.945054945054945, + "grad_norm": 0.18058021366596222, + "learning_rate": 1.645291985070034e-05, + "loss": 1.1745246648788452, + "step": 516 + }, + { + "epoch": 0.9487179487179487, + "grad_norm": 0.13750861585140228, + "learning_rate": 1.64245474961952e-05, + "loss": 1.2134760618209839, + "step": 518 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.22304584085941315, + "learning_rate": 1.639609021940136e-05, + "loss": 1.2141666412353516, + "step": 520 + }, + { + "epoch": 0.9560439560439561, + "grad_norm": 0.17950168251991272, + "learning_rate": 1.6367548465822723e-05, + "loss": 0.8784098625183105, + "step": 522 + }, + { + "epoch": 0.9597069597069597, + "grad_norm": 0.3971985876560211, + "learning_rate": 1.6338922682285697e-05, + "loss": 1.0205121040344238, + "step": 524 + }, + { + "epoch": 0.9633699633699634, + "grad_norm": 0.21712158620357513, + "learning_rate": 1.6310213316932187e-05, + "loss": 0.9632675051689148, + "step": 526 + }, + { + "epoch": 0.967032967032967, + "grad_norm": 0.28583937883377075, + "learning_rate": 1.6281420819212578e-05, + "loss": 0.647495448589325, + "step": 528 + }, + { + "epoch": 0.9706959706959707, + "grad_norm": 0.16611649096012115, + "learning_rate": 1.6252545639878728e-05, + "loss": 0.9034907817840576, + "step": 530 + }, + { + "epoch": 0.9743589743589743, + "grad_norm": 0.5935060977935791, + "learning_rate": 1.6223588230976874e-05, + "loss": 1.3511052131652832, + "step": 532 + }, + { + "epoch": 0.978021978021978, + "grad_norm": 0.13882240653038025, + "learning_rate": 1.6194549045840582e-05, + "loss": 0.6054647564888, + "step": 534 + }, + { + "epoch": 0.9816849816849816, + "grad_norm": 0.41965755820274353, + "learning_rate": 1.616542853908363e-05, + "loss": 0.8549352288246155, + "step": 536 + }, + { + "epoch": 0.9853479853479854, + "grad_norm": 0.10902281850576401, + "learning_rate": 1.6136227166592912e-05, + "loss": 0.7982230186462402, + "step": 538 + }, + { + "epoch": 0.989010989010989, + "grad_norm": 0.26821738481521606, + "learning_rate": 1.6106945385521286e-05, + "loss": 1.1226874589920044, + "step": 540 + }, + { + "epoch": 0.9926739926739927, + "grad_norm": 0.6362835764884949, + "learning_rate": 1.6077583654280416e-05, + "loss": 1.169198989868164, + "step": 542 + }, + { + "epoch": 0.9963369963369964, + "grad_norm": 0.5359551906585693, + "learning_rate": 1.60481424325336e-05, + "loss": 1.181970238685608, + "step": 544 + }, + { + "epoch": 1.0, + "grad_norm": 0.19026517868041992, + "learning_rate": 1.6018622181188594e-05, + "loss": 1.3597557544708252, + "step": 546 + }, + { + "epoch": 1.0036630036630036, + "grad_norm": 0.2745984196662903, + "learning_rate": 1.598902336239035e-05, + "loss": 0.9415228366851807, + "step": 548 + }, + { + "epoch": 1.0073260073260073, + "grad_norm": 0.5006294846534729, + "learning_rate": 1.595934643951382e-05, + "loss": 1.180120825767517, + "step": 550 + }, + { + "epoch": 1.010989010989011, + "grad_norm": 0.08472568541765213, + "learning_rate": 1.5929591877156694e-05, + "loss": 0.6712950468063354, + "step": 552 + }, + { + "epoch": 1.0146520146520146, + "grad_norm": 0.1828070878982544, + "learning_rate": 1.5899760141132115e-05, + "loss": 1.1309884786605835, + "step": 554 + }, + { + "epoch": 1.0183150183150182, + "grad_norm": 0.20885473489761353, + "learning_rate": 1.58698516984614e-05, + "loss": 1.025935411453247, + "step": 556 + }, + { + "epoch": 1.021978021978022, + "grad_norm": 0.15055039525032043, + "learning_rate": 1.583986701736672e-05, + "loss": 1.2158845663070679, + "step": 558 + }, + { + "epoch": 1.0256410256410255, + "grad_norm": 0.10766030102968216, + "learning_rate": 1.5809806567263767e-05, + "loss": 0.9240056276321411, + "step": 560 + }, + { + "epoch": 1.0293040293040292, + "grad_norm": 0.19933700561523438, + "learning_rate": 1.577967081875442e-05, + "loss": 1.1643272638320923, + "step": 562 + }, + { + "epoch": 1.032967032967033, + "grad_norm": 0.5722149014472961, + "learning_rate": 1.574946024361936e-05, + "loss": 1.150964617729187, + "step": 564 + }, + { + "epoch": 1.0366300366300367, + "grad_norm": 0.2388359010219574, + "learning_rate": 1.5719175314810706e-05, + "loss": 1.0588456392288208, + "step": 566 + }, + { + "epoch": 1.0402930402930404, + "grad_norm": 0.2551656663417816, + "learning_rate": 1.568881650644458e-05, + "loss": 1.045701265335083, + "step": 568 + }, + { + "epoch": 1.043956043956044, + "grad_norm": 0.25093504786491394, + "learning_rate": 1.565838429379371e-05, + "loss": 1.1056462526321411, + "step": 570 + }, + { + "epoch": 1.0476190476190477, + "grad_norm": 0.18721139430999756, + "learning_rate": 1.5627879153279986e-05, + "loss": 1.2383893728256226, + "step": 572 + }, + { + "epoch": 1.0512820512820513, + "grad_norm": 0.2834416627883911, + "learning_rate": 1.559730156246699e-05, + "loss": 1.0736546516418457, + "step": 574 + }, + { + "epoch": 1.054945054945055, + "grad_norm": 0.2321881204843521, + "learning_rate": 1.5566652000052533e-05, + "loss": 1.24500572681427, + "step": 576 + }, + { + "epoch": 1.0586080586080586, + "grad_norm": 0.23740430176258087, + "learning_rate": 1.553593094586115e-05, + "loss": 1.220283031463623, + "step": 578 + }, + { + "epoch": 1.0622710622710623, + "grad_norm": 0.24411214888095856, + "learning_rate": 1.5505138880836595e-05, + "loss": 1.2158234119415283, + "step": 580 + }, + { + "epoch": 1.065934065934066, + "grad_norm": 0.26935407519340515, + "learning_rate": 1.5474276287034305e-05, + "loss": 0.9861366152763367, + "step": 582 + }, + { + "epoch": 1.0695970695970696, + "grad_norm": 0.34071117639541626, + "learning_rate": 1.544334364761387e-05, + "loss": 1.146247386932373, + "step": 584 + }, + { + "epoch": 1.0732600732600732, + "grad_norm": 0.1204061508178711, + "learning_rate": 1.541234144683144e-05, + "loss": 1.0828087329864502, + "step": 586 + }, + { + "epoch": 1.0769230769230769, + "grad_norm": 0.24158774316310883, + "learning_rate": 1.5381270170032173e-05, + "loss": 0.8217217922210693, + "step": 588 + }, + { + "epoch": 1.0805860805860805, + "grad_norm": 0.3989861309528351, + "learning_rate": 1.5350130303642625e-05, + "loss": 1.1990596055984497, + "step": 590 + }, + { + "epoch": 1.0842490842490842, + "grad_norm": 0.09942612051963806, + "learning_rate": 1.5318922335163128e-05, + "loss": 1.0216456651687622, + "step": 592 + }, + { + "epoch": 1.0879120879120878, + "grad_norm": 0.2642340362071991, + "learning_rate": 1.5287646753160174e-05, + "loss": 0.9809384942054749, + "step": 594 + }, + { + "epoch": 1.0915750915750915, + "grad_norm": 0.33687108755111694, + "learning_rate": 1.5256304047258739e-05, + "loss": 1.0079659223556519, + "step": 596 + }, + { + "epoch": 1.0952380952380953, + "grad_norm": 0.44691038131713867, + "learning_rate": 1.522489470813466e-05, + "loss": 1.1001455783843994, + "step": 598 + }, + { + "epoch": 1.098901098901099, + "grad_norm": 1.3902361392974854, + "learning_rate": 1.5193419227506913e-05, + "loss": 1.1559927463531494, + "step": 600 + }, + { + "epoch": 1.1025641025641026, + "grad_norm": 0.5698482394218445, + "learning_rate": 1.5161878098129937e-05, + "loss": 0.8155554533004761, + "step": 602 + }, + { + "epoch": 1.1062271062271063, + "grad_norm": 0.33973053097724915, + "learning_rate": 1.5130271813785908e-05, + "loss": 0.9548834562301636, + "step": 604 + }, + { + "epoch": 1.10989010989011, + "grad_norm": 0.21200524270534515, + "learning_rate": 1.509860086927703e-05, + "loss": 0.8916766047477722, + "step": 606 + }, + { + "epoch": 1.1135531135531136, + "grad_norm": 0.9633941650390625, + "learning_rate": 1.5066865760417757e-05, + "loss": 1.2829712629318237, + "step": 608 + }, + { + "epoch": 1.1172161172161172, + "grad_norm": 0.3105233907699585, + "learning_rate": 1.5035066984027053e-05, + "loss": 0.7248431444168091, + "step": 610 + }, + { + "epoch": 1.120879120879121, + "grad_norm": 0.8827745318412781, + "learning_rate": 1.5003205037920616e-05, + "loss": 1.1643881797790527, + "step": 612 + }, + { + "epoch": 1.1245421245421245, + "grad_norm": 0.19762255251407623, + "learning_rate": 1.497128042090307e-05, + "loss": 0.9884153604507446, + "step": 614 + }, + { + "epoch": 1.1282051282051282, + "grad_norm": 0.11015181988477707, + "learning_rate": 1.493929363276017e-05, + "loss": 1.1548609733581543, + "step": 616 + }, + { + "epoch": 1.1318681318681318, + "grad_norm": 0.43714115023612976, + "learning_rate": 1.4907245174250957e-05, + "loss": 0.3965158462524414, + "step": 618 + }, + { + "epoch": 1.1355311355311355, + "grad_norm": 0.4128834307193756, + "learning_rate": 1.4875135547099953e-05, + "loss": 1.0857808589935303, + "step": 620 + }, + { + "epoch": 1.1391941391941391, + "grad_norm": 0.36989882588386536, + "learning_rate": 1.484296525398927e-05, + "loss": 0.7546265125274658, + "step": 622 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 0.32148852944374084, + "learning_rate": 1.4810734798550769e-05, + "loss": 1.0892126560211182, + "step": 624 + }, + { + "epoch": 1.1465201465201464, + "grad_norm": 0.3738671541213989, + "learning_rate": 1.4778444685358147e-05, + "loss": 1.3703358173370361, + "step": 626 + }, + { + "epoch": 1.15018315018315, + "grad_norm": 0.4861363172531128, + "learning_rate": 1.4746095419919075e-05, + "loss": 0.4386288821697235, + "step": 628 + }, + { + "epoch": 1.1538461538461537, + "grad_norm": 0.04527602717280388, + "learning_rate": 1.4713687508667251e-05, + "loss": 1.1120142936706543, + "step": 630 + }, + { + "epoch": 1.1575091575091574, + "grad_norm": 0.2540167570114136, + "learning_rate": 1.4681221458954484e-05, + "loss": 1.0848397016525269, + "step": 632 + }, + { + "epoch": 1.1611721611721613, + "grad_norm": 0.40409815311431885, + "learning_rate": 1.4648697779042754e-05, + "loss": 0.852532148361206, + "step": 634 + }, + { + "epoch": 1.164835164835165, + "grad_norm": 0.35430774092674255, + "learning_rate": 1.461611697809625e-05, + "loss": 0.9712074995040894, + "step": 636 + }, + { + "epoch": 1.1684981684981686, + "grad_norm": 0.3518703281879425, + "learning_rate": 1.4583479566173401e-05, + "loss": 1.1836236715316772, + "step": 638 + }, + { + "epoch": 1.1721611721611722, + "grad_norm": 0.08637535572052002, + "learning_rate": 1.4550786054218902e-05, + "loss": 0.7413387894630432, + "step": 640 + }, + { + "epoch": 1.1758241758241759, + "grad_norm": 0.25933629274368286, + "learning_rate": 1.4518036954055685e-05, + "loss": 1.1189838647842407, + "step": 642 + }, + { + "epoch": 1.1794871794871795, + "grad_norm": 0.32789698243141174, + "learning_rate": 1.4485232778376945e-05, + "loss": 1.0686718225479126, + "step": 644 + }, + { + "epoch": 1.1831501831501832, + "grad_norm": 0.17057186365127563, + "learning_rate": 1.4452374040738078e-05, + "loss": 0.7557857632637024, + "step": 646 + }, + { + "epoch": 1.1868131868131868, + "grad_norm": 0.3353375494480133, + "learning_rate": 1.4419461255548666e-05, + "loss": 0.8876537084579468, + "step": 648 + }, + { + "epoch": 1.1904761904761905, + "grad_norm": 0.22652913630008698, + "learning_rate": 1.4386494938064417e-05, + "loss": 1.1409285068511963, + "step": 650 + }, + { + "epoch": 1.1941391941391941, + "grad_norm": 0.1531185358762741, + "learning_rate": 1.4353475604379093e-05, + "loss": 0.671623170375824, + "step": 652 + }, + { + "epoch": 1.1978021978021978, + "grad_norm": 0.4093690812587738, + "learning_rate": 1.4320403771416438e-05, + "loss": 1.3042771816253662, + "step": 654 + }, + { + "epoch": 1.2014652014652014, + "grad_norm": 1.0563490390777588, + "learning_rate": 1.4287279956922076e-05, + "loss": 0.8223202228546143, + "step": 656 + }, + { + "epoch": 1.205128205128205, + "grad_norm": 0.4100596606731415, + "learning_rate": 1.4254104679455416e-05, + "loss": 0.74940425157547, + "step": 658 + }, + { + "epoch": 1.2087912087912087, + "grad_norm": 0.6579332947731018, + "learning_rate": 1.4220878458381523e-05, + "loss": 1.1517488956451416, + "step": 660 + }, + { + "epoch": 1.2124542124542124, + "grad_norm": 0.9731626510620117, + "learning_rate": 1.418760181386301e-05, + "loss": 1.026989459991455, + "step": 662 + }, + { + "epoch": 1.2161172161172162, + "grad_norm": 0.30989789962768555, + "learning_rate": 1.4154275266851856e-05, + "loss": 0.8894543051719666, + "step": 664 + }, + { + "epoch": 1.2197802197802199, + "grad_norm": 0.23721382021903992, + "learning_rate": 1.4120899339081291e-05, + "loss": 0.8070454001426697, + "step": 666 + }, + { + "epoch": 1.2234432234432235, + "grad_norm": 0.2819421887397766, + "learning_rate": 1.4087474553057599e-05, + "loss": 0.8887211084365845, + "step": 668 + }, + { + "epoch": 1.2271062271062272, + "grad_norm": 0.27606913447380066, + "learning_rate": 1.405400143205195e-05, + "loss": 0.675251305103302, + "step": 670 + }, + { + "epoch": 1.2307692307692308, + "grad_norm": 0.23686662316322327, + "learning_rate": 1.4020480500092217e-05, + "loss": 1.177452802658081, + "step": 672 + }, + { + "epoch": 1.2344322344322345, + "grad_norm": 0.32350587844848633, + "learning_rate": 1.3986912281954745e-05, + "loss": 1.1530712842941284, + "step": 674 + }, + { + "epoch": 1.2380952380952381, + "grad_norm": 0.1811443716287613, + "learning_rate": 1.3953297303156174e-05, + "loss": 1.1759916543960571, + "step": 676 + }, + { + "epoch": 1.2417582417582418, + "grad_norm": 0.19516615569591522, + "learning_rate": 1.391963608994517e-05, + "loss": 0.7716677784919739, + "step": 678 + }, + { + "epoch": 1.2454212454212454, + "grad_norm": 0.19349494576454163, + "learning_rate": 1.3885929169294218e-05, + "loss": 0.8281112313270569, + "step": 680 + }, + { + "epoch": 1.249084249084249, + "grad_norm": 0.9416590929031372, + "learning_rate": 1.3852177068891364e-05, + "loss": 1.1924660205841064, + "step": 682 + }, + { + "epoch": 1.2527472527472527, + "grad_norm": 0.26466265320777893, + "learning_rate": 1.3818380317131946e-05, + "loss": 1.1589021682739258, + "step": 684 + }, + { + "epoch": 1.2564102564102564, + "grad_norm": 0.24540755152702332, + "learning_rate": 1.3784539443110323e-05, + "loss": 0.8185974359512329, + "step": 686 + }, + { + "epoch": 1.26007326007326, + "grad_norm": 0.23551802337169647, + "learning_rate": 1.375065497661161e-05, + "loss": 1.111383318901062, + "step": 688 + }, + { + "epoch": 1.2637362637362637, + "grad_norm": 0.6621056199073792, + "learning_rate": 1.3716727448103356e-05, + "loss": 1.1371651887893677, + "step": 690 + }, + { + "epoch": 1.2673992673992673, + "grad_norm": 0.16852295398712158, + "learning_rate": 1.3682757388727261e-05, + "loss": 1.1790460348129272, + "step": 692 + }, + { + "epoch": 1.271062271062271, + "grad_norm": 0.7090398073196411, + "learning_rate": 1.3648745330290848e-05, + "loss": 0.9768311381340027, + "step": 694 + }, + { + "epoch": 1.2747252747252746, + "grad_norm": 0.5497918128967285, + "learning_rate": 1.361469180525916e-05, + "loss": 1.0126756429672241, + "step": 696 + }, + { + "epoch": 1.2783882783882783, + "grad_norm": 0.31008267402648926, + "learning_rate": 1.358059734674638e-05, + "loss": 0.9194446802139282, + "step": 698 + }, + { + "epoch": 1.282051282051282, + "grad_norm": 0.10667116194963455, + "learning_rate": 1.3546462488507532e-05, + "loss": 0.6427603960037231, + "step": 700 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 0.2069520652294159, + "learning_rate": 1.3512287764930102e-05, + "loss": 0.5255513787269592, + "step": 702 + }, + { + "epoch": 1.2893772893772895, + "grad_norm": 0.4747946262359619, + "learning_rate": 1.347807371102567e-05, + "loss": 1.2734118700027466, + "step": 704 + }, + { + "epoch": 1.293040293040293, + "grad_norm": 1.2546977996826172, + "learning_rate": 1.3443820862421542e-05, + "loss": 1.0028886795043945, + "step": 706 + }, + { + "epoch": 1.2967032967032968, + "grad_norm": 0.09716126322746277, + "learning_rate": 1.3409529755352361e-05, + "loss": 0.9534561634063721, + "step": 708 + }, + { + "epoch": 1.3003663003663004, + "grad_norm": 0.16677714884281158, + "learning_rate": 1.3375200926651719e-05, + "loss": 0.5677421689033508, + "step": 710 + }, + { + "epoch": 1.304029304029304, + "grad_norm": 0.5648819208145142, + "learning_rate": 1.3340834913743742e-05, + "loss": 0.8859728574752808, + "step": 712 + }, + { + "epoch": 1.3076923076923077, + "grad_norm": 0.3466564118862152, + "learning_rate": 1.3306432254634676e-05, + "loss": 1.2071309089660645, + "step": 714 + }, + { + "epoch": 1.3113553113553114, + "grad_norm": 0.34816622734069824, + "learning_rate": 1.3271993487904485e-05, + "loss": 1.1303997039794922, + "step": 716 + }, + { + "epoch": 1.315018315018315, + "grad_norm": 0.17138248682022095, + "learning_rate": 1.3237519152698392e-05, + "loss": 0.9440135359764099, + "step": 718 + }, + { + "epoch": 1.3186813186813187, + "grad_norm": 0.2666979432106018, + "learning_rate": 1.3203009788718454e-05, + "loss": 0.9132227301597595, + "step": 720 + }, + { + "epoch": 1.3223443223443223, + "grad_norm": 0.3015995919704437, + "learning_rate": 1.3168465936215114e-05, + "loss": 0.9173396229743958, + "step": 722 + }, + { + "epoch": 1.326007326007326, + "grad_norm": 0.3288915157318115, + "learning_rate": 1.3133888135978733e-05, + "loss": 1.2012741565704346, + "step": 724 + }, + { + "epoch": 1.3296703296703296, + "grad_norm": 0.1542222797870636, + "learning_rate": 1.3099276929331132e-05, + "loss": 1.10904061794281, + "step": 726 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.22464217245578766, + "learning_rate": 1.3064632858117123e-05, + "loss": 1.1415292024612427, + "step": 728 + }, + { + "epoch": 1.3369963369963371, + "grad_norm": 0.5490389466285706, + "learning_rate": 1.3029956464696006e-05, + "loss": 0.7423273324966431, + "step": 730 + }, + { + "epoch": 1.3406593406593408, + "grad_norm": 0.4176447093486786, + "learning_rate": 1.2995248291933099e-05, + "loss": 0.8297606110572815, + "step": 732 + }, + { + "epoch": 1.3443223443223444, + "grad_norm": 0.05192068964242935, + "learning_rate": 1.296050888319123e-05, + "loss": 0.6149548292160034, + "step": 734 + }, + { + "epoch": 1.347985347985348, + "grad_norm": 0.8712805509567261, + "learning_rate": 1.2925738782322232e-05, + "loss": 1.1270866394042969, + "step": 736 + }, + { + "epoch": 1.3516483516483517, + "grad_norm": 0.20594623684883118, + "learning_rate": 1.2890938533658429e-05, + "loss": 0.7438766360282898, + "step": 738 + }, + { + "epoch": 1.3553113553113554, + "grad_norm": 0.23629705607891083, + "learning_rate": 1.2856108682004116e-05, + "loss": 1.00837242603302, + "step": 740 + }, + { + "epoch": 1.358974358974359, + "grad_norm": 1.347622036933899, + "learning_rate": 1.282124977262702e-05, + "loss": 0.6100921630859375, + "step": 742 + }, + { + "epoch": 1.3626373626373627, + "grad_norm": 0.1946464329957962, + "learning_rate": 1.2786362351249785e-05, + "loss": 1.2331260442733765, + "step": 744 + }, + { + "epoch": 1.3663003663003663, + "grad_norm": 0.2741944193840027, + "learning_rate": 1.2751446964041405e-05, + "loss": 1.0520378351211548, + "step": 746 + }, + { + "epoch": 1.36996336996337, + "grad_norm": 0.18286797404289246, + "learning_rate": 1.2716504157608693e-05, + "loss": 1.065680742263794, + "step": 748 + }, + { + "epoch": 1.3736263736263736, + "grad_norm": 0.47365298867225647, + "learning_rate": 1.2681534478987703e-05, + "loss": 0.7774507403373718, + "step": 750 + }, + { + "epoch": 1.3772893772893773, + "grad_norm": 0.19642506539821625, + "learning_rate": 1.264653847563519e-05, + "loss": 1.1186778545379639, + "step": 752 + }, + { + "epoch": 1.380952380952381, + "grad_norm": 0.12549656629562378, + "learning_rate": 1.2611516695420023e-05, + "loss": 0.963532567024231, + "step": 754 + }, + { + "epoch": 1.3846153846153846, + "grad_norm": 0.2738596796989441, + "learning_rate": 1.2576469686614608e-05, + "loss": 1.3037950992584229, + "step": 756 + }, + { + "epoch": 1.3882783882783882, + "grad_norm": 0.265229731798172, + "learning_rate": 1.2541397997886317e-05, + "loss": 1.2012255191802979, + "step": 758 + }, + { + "epoch": 1.3919413919413919, + "grad_norm": 0.11555524170398712, + "learning_rate": 1.2506302178288887e-05, + "loss": 1.145890712738037, + "step": 760 + }, + { + "epoch": 1.3956043956043955, + "grad_norm": 0.33266228437423706, + "learning_rate": 1.2471182777253832e-05, + "loss": 1.140316367149353, + "step": 762 + }, + { + "epoch": 1.3992673992673992, + "grad_norm": 0.371504545211792, + "learning_rate": 1.2436040344581824e-05, + "loss": 0.6883344650268555, + "step": 764 + }, + { + "epoch": 1.4029304029304028, + "grad_norm": 0.26427412033081055, + "learning_rate": 1.2400875430434119e-05, + "loss": 0.8899670243263245, + "step": 766 + }, + { + "epoch": 1.4065934065934065, + "grad_norm": 1.1619846820831299, + "learning_rate": 1.236568858532391e-05, + "loss": 0.8952260613441467, + "step": 768 + }, + { + "epoch": 1.4102564102564101, + "grad_norm": 0.22171390056610107, + "learning_rate": 1.2330480360107728e-05, + "loss": 1.182429552078247, + "step": 770 + }, + { + "epoch": 1.4139194139194138, + "grad_norm": 0.25736895203590393, + "learning_rate": 1.2295251305976818e-05, + "loss": 1.2108275890350342, + "step": 772 + }, + { + "epoch": 1.4175824175824177, + "grad_norm": 0.40875253081321716, + "learning_rate": 1.2260001974448504e-05, + "loss": 1.021303415298462, + "step": 774 + }, + { + "epoch": 1.4212454212454213, + "grad_norm": 0.13845305144786835, + "learning_rate": 1.222473291735754e-05, + "loss": 1.154242753982544, + "step": 776 + }, + { + "epoch": 1.424908424908425, + "grad_norm": 0.18540990352630615, + "learning_rate": 1.218944468684752e-05, + "loss": 0.8286201357841492, + "step": 778 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.18653152883052826, + "learning_rate": 1.215413783536217e-05, + "loss": 1.3623868227005005, + "step": 780 + }, + { + "epoch": 1.4322344322344323, + "grad_norm": 0.531924307346344, + "learning_rate": 1.2118812915636744e-05, + "loss": 1.2334803342819214, + "step": 782 + }, + { + "epoch": 1.435897435897436, + "grad_norm": 3.9369657039642334, + "learning_rate": 1.2083470480689363e-05, + "loss": 1.1573415994644165, + "step": 784 + }, + { + "epoch": 1.4395604395604396, + "grad_norm": 0.710831880569458, + "learning_rate": 1.2048111083812342e-05, + "loss": 0.9501171112060547, + "step": 786 + }, + { + "epoch": 1.4432234432234432, + "grad_norm": 0.5587278008460999, + "learning_rate": 1.2012735278563546e-05, + "loss": 1.135019063949585, + "step": 788 + }, + { + "epoch": 1.4468864468864469, + "grad_norm": 0.06678500026464462, + "learning_rate": 1.1977343618757702e-05, + "loss": 0.7230968475341797, + "step": 790 + }, + { + "epoch": 1.4505494505494505, + "grad_norm": 0.3107512295246124, + "learning_rate": 1.1941936658457769e-05, + "loss": 0.9185922741889954, + "step": 792 + }, + { + "epoch": 1.4542124542124542, + "grad_norm": 0.6749280691146851, + "learning_rate": 1.1906514951966208e-05, + "loss": 0.8022876977920532, + "step": 794 + }, + { + "epoch": 1.4578754578754578, + "grad_norm": 0.17742304503917694, + "learning_rate": 1.1871079053816357e-05, + "loss": 1.1445965766906738, + "step": 796 + }, + { + "epoch": 1.4615384615384617, + "grad_norm": 0.25644049048423767, + "learning_rate": 1.1835629518763714e-05, + "loss": 0.792290985584259, + "step": 798 + }, + { + "epoch": 1.4652014652014653, + "grad_norm": 0.275553822517395, + "learning_rate": 1.1800166901777272e-05, + "loss": 1.024118423461914, + "step": 800 + }, + { + "epoch": 1.468864468864469, + "grad_norm": 0.7942085266113281, + "learning_rate": 1.1764691758030825e-05, + "loss": 1.5601838827133179, + "step": 802 + }, + { + "epoch": 1.4725274725274726, + "grad_norm": 0.20729446411132812, + "learning_rate": 1.1729204642894265e-05, + "loss": 1.0184231996536255, + "step": 804 + }, + { + "epoch": 1.4761904761904763, + "grad_norm": 0.6443558931350708, + "learning_rate": 1.1693706111924912e-05, + "loss": 1.1878175735473633, + "step": 806 + }, + { + "epoch": 1.47985347985348, + "grad_norm": 0.24344100058078766, + "learning_rate": 1.1658196720858794e-05, + "loss": 1.1712795495986938, + "step": 808 + }, + { + "epoch": 1.4835164835164836, + "grad_norm": 0.5350912809371948, + "learning_rate": 1.1622677025601966e-05, + "loss": 0.998128354549408, + "step": 810 + }, + { + "epoch": 1.4871794871794872, + "grad_norm": 0.22139771282672882, + "learning_rate": 1.1587147582221776e-05, + "loss": 0.8508384823799133, + "step": 812 + }, + { + "epoch": 1.4908424908424909, + "grad_norm": 0.23050732910633087, + "learning_rate": 1.1551608946938208e-05, + "loss": 1.2045676708221436, + "step": 814 + }, + { + "epoch": 1.4945054945054945, + "grad_norm": 0.22233106195926666, + "learning_rate": 1.1516061676115124e-05, + "loss": 1.1350584030151367, + "step": 816 + }, + { + "epoch": 1.4981684981684982, + "grad_norm": 0.09867780655622482, + "learning_rate": 1.1480506326251595e-05, + "loss": 0.4009166657924652, + "step": 818 + }, + { + "epoch": 1.5018315018315018, + "grad_norm": 0.19808466732501984, + "learning_rate": 1.1444943453973155e-05, + "loss": 1.1480183601379395, + "step": 820 + }, + { + "epoch": 1.5054945054945055, + "grad_norm": 0.2406286746263504, + "learning_rate": 1.1409373616023111e-05, + "loss": 0.9088360071182251, + "step": 822 + }, + { + "epoch": 1.5091575091575091, + "grad_norm": 0.45124879479408264, + "learning_rate": 1.1373797369253818e-05, + "loss": 0.8226873874664307, + "step": 824 + }, + { + "epoch": 1.5128205128205128, + "grad_norm": 0.2533392310142517, + "learning_rate": 1.1338215270617967e-05, + "loss": 1.046976089477539, + "step": 826 + }, + { + "epoch": 1.5164835164835164, + "grad_norm": 0.9905543327331543, + "learning_rate": 1.130262787715985e-05, + "loss": 1.0114374160766602, + "step": 828 + }, + { + "epoch": 1.52014652014652, + "grad_norm": 0.423899382352829, + "learning_rate": 1.1267035746006658e-05, + "loss": 1.003442406654358, + "step": 830 + }, + { + "epoch": 1.5238095238095237, + "grad_norm": 0.5581493377685547, + "learning_rate": 1.1231439434359755e-05, + "loss": 1.2469056844711304, + "step": 832 + }, + { + "epoch": 1.5274725274725274, + "grad_norm": 0.25180450081825256, + "learning_rate": 1.119583949948594e-05, + "loss": 0.8143158555030823, + "step": 834 + }, + { + "epoch": 1.531135531135531, + "grad_norm": 0.8289534449577332, + "learning_rate": 1.1160236498708742e-05, + "loss": 1.1500300168991089, + "step": 836 + }, + { + "epoch": 1.5347985347985347, + "grad_norm": 0.6642610430717468, + "learning_rate": 1.112463098939969e-05, + "loss": 1.2071179151535034, + "step": 838 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 0.41747331619262695, + "learning_rate": 1.1089023528969576e-05, + "loss": 1.0496968030929565, + "step": 840 + }, + { + "epoch": 1.542124542124542, + "grad_norm": 0.20446696877479553, + "learning_rate": 1.1053414674859741e-05, + "loss": 0.8783029913902283, + "step": 842 + }, + { + "epoch": 1.5457875457875456, + "grad_norm": 0.3679927885532379, + "learning_rate": 1.1017804984533351e-05, + "loss": 1.0342862606048584, + "step": 844 + }, + { + "epoch": 1.5494505494505495, + "grad_norm": 0.16902367770671844, + "learning_rate": 1.0982195015466652e-05, + "loss": 1.1826444864273071, + "step": 846 + }, + { + "epoch": 1.5531135531135531, + "grad_norm": 1.2264890670776367, + "learning_rate": 1.0946585325140261e-05, + "loss": 0.5550611615180969, + "step": 848 + }, + { + "epoch": 1.5567765567765568, + "grad_norm": 0.5985056757926941, + "learning_rate": 1.0910976471030428e-05, + "loss": 1.0341655015945435, + "step": 850 + }, + { + "epoch": 1.5604395604395604, + "grad_norm": 0.19666093587875366, + "learning_rate": 1.0875369010600317e-05, + "loss": 1.1766899824142456, + "step": 852 + }, + { + "epoch": 1.564102564102564, + "grad_norm": 0.6608942151069641, + "learning_rate": 1.083976350129126e-05, + "loss": 0.8616961240768433, + "step": 854 + }, + { + "epoch": 1.5677655677655677, + "grad_norm": 0.24679680168628693, + "learning_rate": 1.0804160500514062e-05, + "loss": 0.809736430644989, + "step": 856 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 0.9515239596366882, + "learning_rate": 1.0768560565640252e-05, + "loss": 0.9475131630897522, + "step": 858 + }, + { + "epoch": 1.575091575091575, + "grad_norm": 0.8901296257972717, + "learning_rate": 1.0732964253993343e-05, + "loss": 0.7505902051925659, + "step": 860 + }, + { + "epoch": 1.578754578754579, + "grad_norm": 0.2663470506668091, + "learning_rate": 1.0697372122840156e-05, + "loss": 1.1575169563293457, + "step": 862 + }, + { + "epoch": 1.5824175824175826, + "grad_norm": 1.596577525138855, + "learning_rate": 1.0661784729382036e-05, + "loss": 0.9081068634986877, + "step": 864 + }, + { + "epoch": 1.5860805860805862, + "grad_norm": 0.19321048259735107, + "learning_rate": 1.0626202630746183e-05, + "loss": 1.1882286071777344, + "step": 866 + }, + { + "epoch": 1.5897435897435899, + "grad_norm": 0.2287406027317047, + "learning_rate": 1.0590626383976894e-05, + "loss": 1.1828335523605347, + "step": 868 + }, + { + "epoch": 1.5934065934065935, + "grad_norm": 0.8435899019241333, + "learning_rate": 1.055505654602685e-05, + "loss": 0.587890625, + "step": 870 + }, + { + "epoch": 1.5970695970695972, + "grad_norm": 0.290306955575943, + "learning_rate": 1.0519493673748406e-05, + "loss": 1.1838449239730835, + "step": 872 + }, + { + "epoch": 1.6007326007326008, + "grad_norm": 0.3988237679004669, + "learning_rate": 1.0483938323884879e-05, + "loss": 1.066159963607788, + "step": 874 + }, + { + "epoch": 1.6043956043956045, + "grad_norm": 0.3236004114151001, + "learning_rate": 1.0448391053061795e-05, + "loss": 0.7672710418701172, + "step": 876 + }, + { + "epoch": 1.6080586080586081, + "grad_norm": 0.41797590255737305, + "learning_rate": 1.0412852417778225e-05, + "loss": 0.994400143623352, + "step": 878 + }, + { + "epoch": 1.6117216117216118, + "grad_norm": 0.3434767425060272, + "learning_rate": 1.037732297439804e-05, + "loss": 0.9163567423820496, + "step": 880 + }, + { + "epoch": 1.6153846153846154, + "grad_norm": 0.4138431251049042, + "learning_rate": 1.034180327914121e-05, + "loss": 1.1349215507507324, + "step": 882 + }, + { + "epoch": 1.619047619047619, + "grad_norm": 1.8183345794677734, + "learning_rate": 1.030629388807509e-05, + "loss": 0.945977509021759, + "step": 884 + }, + { + "epoch": 1.6227106227106227, + "grad_norm": 0.15837231278419495, + "learning_rate": 1.0270795357105738e-05, + "loss": 1.1418633460998535, + "step": 886 + }, + { + "epoch": 1.6263736263736264, + "grad_norm": 0.18444959819316864, + "learning_rate": 1.023530824196918e-05, + "loss": 0.8405629396438599, + "step": 888 + }, + { + "epoch": 1.63003663003663, + "grad_norm": 0.7059314250946045, + "learning_rate": 1.019983309822273e-05, + "loss": 1.2206584215164185, + "step": 890 + }, + { + "epoch": 1.6336996336996337, + "grad_norm": 5.118255615234375, + "learning_rate": 1.0164370481236292e-05, + "loss": 0.9084610939025879, + "step": 892 + }, + { + "epoch": 1.6373626373626373, + "grad_norm": 0.31705963611602783, + "learning_rate": 1.0128920946183646e-05, + "loss": 1.1889030933380127, + "step": 894 + }, + { + "epoch": 1.641025641025641, + "grad_norm": 0.30195608735084534, + "learning_rate": 1.0093485048033798e-05, + "loss": 0.7536079287528992, + "step": 896 + }, + { + "epoch": 1.6446886446886446, + "grad_norm": 0.5159720778465271, + "learning_rate": 1.0058063341542238e-05, + "loss": 1.1412334442138672, + "step": 898 + }, + { + "epoch": 1.6483516483516483, + "grad_norm": 0.37968581914901733, + "learning_rate": 1.0022656381242297e-05, + "loss": 0.8290563821792603, + "step": 900 + }, + { + "epoch": 1.652014652014652, + "grad_norm": 0.21596157550811768, + "learning_rate": 9.98726472143646e-06, + "loss": 0.8873069286346436, + "step": 902 + }, + { + "epoch": 1.6556776556776556, + "grad_norm": 0.17106634378433228, + "learning_rate": 9.951888916187662e-06, + "loss": 1.0214476585388184, + "step": 904 + }, + { + "epoch": 1.6593406593406592, + "grad_norm": 0.31868913769721985, + "learning_rate": 9.916529519310638e-06, + "loss": 1.139350414276123, + "step": 906 + }, + { + "epoch": 1.6630036630036629, + "grad_norm": 0.19221165776252747, + "learning_rate": 9.881187084363257e-06, + "loss": 0.7617915868759155, + "step": 908 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 12.817173957824707, + "learning_rate": 9.845862164637834e-06, + "loss": 1.206006646156311, + "step": 910 + }, + { + "epoch": 1.6703296703296702, + "grad_norm": 0.4153216481208801, + "learning_rate": 9.810555313152486e-06, + "loss": 1.2200528383255005, + "step": 912 + }, + { + "epoch": 1.673992673992674, + "grad_norm": 0.33920755982398987, + "learning_rate": 9.775267082642461e-06, + "loss": 1.0979692935943604, + "step": 914 + }, + { + "epoch": 1.6776556776556777, + "grad_norm": 0.4049103260040283, + "learning_rate": 9.7399980255515e-06, + "loss": 1.2116343975067139, + "step": 916 + }, + { + "epoch": 1.6813186813186813, + "grad_norm": 0.2901177704334259, + "learning_rate": 9.704748694023183e-06, + "loss": 0.8435489535331726, + "step": 918 + }, + { + "epoch": 1.684981684981685, + "grad_norm": 0.29318225383758545, + "learning_rate": 9.669519639892275e-06, + "loss": 1.2188955545425415, + "step": 920 + }, + { + "epoch": 1.6886446886446886, + "grad_norm": 0.13588818907737732, + "learning_rate": 9.634311414676096e-06, + "loss": 1.0209497213363647, + "step": 922 + }, + { + "epoch": 1.6923076923076923, + "grad_norm": 0.16762422025203705, + "learning_rate": 9.599124569565887e-06, + "loss": 0.851481556892395, + "step": 924 + }, + { + "epoch": 1.695970695970696, + "grad_norm": 0.2845093309879303, + "learning_rate": 9.56395965541818e-06, + "loss": 0.9773625135421753, + "step": 926 + }, + { + "epoch": 1.6996336996336996, + "grad_norm": 0.2550683617591858, + "learning_rate": 9.528817222746171e-06, + "loss": 0.8091166615486145, + "step": 928 + }, + { + "epoch": 1.7032967032967035, + "grad_norm": 0.030724653974175453, + "learning_rate": 9.493697821711116e-06, + "loss": 0.8957905173301697, + "step": 930 + }, + { + "epoch": 1.7069597069597071, + "grad_norm": 0.6097201704978943, + "learning_rate": 9.458602002113684e-06, + "loss": 0.936105489730835, + "step": 932 + }, + { + "epoch": 1.7106227106227108, + "grad_norm": 0.16642826795578003, + "learning_rate": 9.423530313385395e-06, + "loss": 1.3963714838027954, + "step": 934 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 0.23555639386177063, + "learning_rate": 9.388483304579983e-06, + "loss": 1.195056676864624, + "step": 936 + }, + { + "epoch": 1.717948717948718, + "grad_norm": 0.2510616183280945, + "learning_rate": 9.353461524364814e-06, + "loss": 0.48157253861427307, + "step": 938 + }, + { + "epoch": 1.7216117216117217, + "grad_norm": 0.1841297298669815, + "learning_rate": 9.318465521012298e-06, + "loss": 0.5561771988868713, + "step": 940 + }, + { + "epoch": 1.7252747252747254, + "grad_norm": 0.24431364238262177, + "learning_rate": 9.283495842391313e-06, + "loss": 1.127332329750061, + "step": 942 + }, + { + "epoch": 1.728937728937729, + "grad_norm": 0.34080448746681213, + "learning_rate": 9.248553035958596e-06, + "loss": 0.931099534034729, + "step": 944 + }, + { + "epoch": 1.7326007326007327, + "grad_norm": 0.25586462020874023, + "learning_rate": 9.213637648750217e-06, + "loss": 1.1583143472671509, + "step": 946 + }, + { + "epoch": 1.7362637362637363, + "grad_norm": 0.17189502716064453, + "learning_rate": 9.178750227372983e-06, + "loss": 1.1040928363800049, + "step": 948 + }, + { + "epoch": 1.73992673992674, + "grad_norm": 0.21041589975357056, + "learning_rate": 9.143891317995888e-06, + "loss": 0.9976288676261902, + "step": 950 + }, + { + "epoch": 1.7435897435897436, + "grad_norm": 0.2621840238571167, + "learning_rate": 9.109061466341576e-06, + "loss": 0.9492763876914978, + "step": 952 + }, + { + "epoch": 1.7472527472527473, + "grad_norm": 1.197185754776001, + "learning_rate": 9.074261217677771e-06, + "loss": 1.2083392143249512, + "step": 954 + }, + { + "epoch": 1.750915750915751, + "grad_norm": 0.19799767434597015, + "learning_rate": 9.039491116808773e-06, + "loss": 0.7902243137359619, + "step": 956 + }, + { + "epoch": 1.7545787545787546, + "grad_norm": 0.4546964466571808, + "learning_rate": 9.004751708066906e-06, + "loss": 1.2029194831848145, + "step": 958 + }, + { + "epoch": 1.7582417582417582, + "grad_norm": 0.13221463561058044, + "learning_rate": 8.970043535303999e-06, + "loss": 0.5475955009460449, + "step": 960 + }, + { + "epoch": 1.7619047619047619, + "grad_norm": 0.378535658121109, + "learning_rate": 8.93536714188288e-06, + "loss": 0.9881755709648132, + "step": 962 + }, + { + "epoch": 1.7655677655677655, + "grad_norm": 0.2644976079463959, + "learning_rate": 8.900723070668869e-06, + "loss": 1.0287450551986694, + "step": 964 + }, + { + "epoch": 1.7692307692307692, + "grad_norm": 0.6430861353874207, + "learning_rate": 8.86611186402127e-06, + "loss": 1.1860679388046265, + "step": 966 + }, + { + "epoch": 1.7728937728937728, + "grad_norm": 0.4337344467639923, + "learning_rate": 8.831534063784891e-06, + "loss": 0.5775477886199951, + "step": 968 + }, + { + "epoch": 1.7765567765567765, + "grad_norm": 0.27584108710289, + "learning_rate": 8.796990211281549e-06, + "loss": 0.8390186429023743, + "step": 970 + }, + { + "epoch": 1.7802197802197801, + "grad_norm": 0.7176196575164795, + "learning_rate": 8.76248084730161e-06, + "loss": 0.9147000908851624, + "step": 972 + }, + { + "epoch": 1.7838827838827838, + "grad_norm": 0.29480162262916565, + "learning_rate": 8.728006512095517e-06, + "loss": 1.221879243850708, + "step": 974 + }, + { + "epoch": 1.7875457875457874, + "grad_norm": 0.2008034586906433, + "learning_rate": 8.693567745365325e-06, + "loss": 1.1880409717559814, + "step": 976 + }, + { + "epoch": 1.791208791208791, + "grad_norm": 0.2755773961544037, + "learning_rate": 8.659165086256263e-06, + "loss": 0.9174471497535706, + "step": 978 + }, + { + "epoch": 1.7948717948717947, + "grad_norm": 0.23839768767356873, + "learning_rate": 8.624799073348282e-06, + "loss": 0.9608420729637146, + "step": 980 + }, + { + "epoch": 1.7985347985347986, + "grad_norm": 0.33076974749565125, + "learning_rate": 8.590470244647643e-06, + "loss": 1.1425825357437134, + "step": 982 + }, + { + "epoch": 1.8021978021978022, + "grad_norm": 1.4269599914550781, + "learning_rate": 8.556179137578461e-06, + "loss": 1.1003776788711548, + "step": 984 + }, + { + "epoch": 1.8058608058608059, + "grad_norm": 0.10697717219591141, + "learning_rate": 8.521926288974336e-06, + "loss": 0.6553662419319153, + "step": 986 + }, + { + "epoch": 1.8095238095238095, + "grad_norm": 0.325648695230484, + "learning_rate": 8.487712235069901e-06, + "loss": 0.8113435506820679, + "step": 988 + }, + { + "epoch": 1.8131868131868132, + "grad_norm": 0.7604925632476807, + "learning_rate": 8.453537511492469e-06, + "loss": 0.7590834498405457, + "step": 990 + }, + { + "epoch": 1.8168498168498168, + "grad_norm": 0.5670424699783325, + "learning_rate": 8.419402653253623e-06, + "loss": 0.7830116748809814, + "step": 992 + }, + { + "epoch": 1.8205128205128205, + "grad_norm": 0.2287733256816864, + "learning_rate": 8.385308194740846e-06, + "loss": 0.8508743047714233, + "step": 994 + }, + { + "epoch": 1.8241758241758241, + "grad_norm": 0.3089318573474884, + "learning_rate": 8.35125466970915e-06, + "loss": 1.1281416416168213, + "step": 996 + }, + { + "epoch": 1.8278388278388278, + "grad_norm": 0.19302226603031158, + "learning_rate": 8.317242611272745e-06, + "loss": 0.8560003042221069, + "step": 998 + }, + { + "epoch": 1.8315018315018317, + "grad_norm": 0.21388359367847443, + "learning_rate": 8.283272551896649e-06, + "loss": 1.1172125339508057, + "step": 1000 + }, + { + "epoch": 1.8351648351648353, + "grad_norm": 0.4454153776168823, + "learning_rate": 8.249345023388393e-06, + "loss": 1.1452471017837524, + "step": 1002 + }, + { + "epoch": 1.838827838827839, + "grad_norm": 0.4535501301288605, + "learning_rate": 8.21546055688968e-06, + "loss": 1.1414506435394287, + "step": 1004 + }, + { + "epoch": 1.8424908424908426, + "grad_norm": 0.3102174699306488, + "learning_rate": 8.181619682868059e-06, + "loss": 1.1409966945648193, + "step": 1006 + }, + { + "epoch": 1.8461538461538463, + "grad_norm": 0.29411637783050537, + "learning_rate": 8.147822931108638e-06, + "loss": 0.7928659915924072, + "step": 1008 + }, + { + "epoch": 1.84981684981685, + "grad_norm": 0.39852795004844666, + "learning_rate": 8.114070830705785e-06, + "loss": 1.103369116783142, + "step": 1010 + }, + { + "epoch": 1.8534798534798536, + "grad_norm": 0.10598917305469513, + "learning_rate": 8.080363910054833e-06, + "loss": 0.765097439289093, + "step": 1012 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 0.25210511684417725, + "learning_rate": 8.04670269684383e-06, + "loss": 1.1578898429870605, + "step": 1014 + }, + { + "epoch": 1.8608058608058609, + "grad_norm": 0.25960707664489746, + "learning_rate": 8.013087718045256e-06, + "loss": 1.1436736583709717, + "step": 1016 + }, + { + "epoch": 1.8644688644688645, + "grad_norm": 0.8823764324188232, + "learning_rate": 7.979519499907786e-06, + "loss": 1.224177598953247, + "step": 1018 + }, + { + "epoch": 1.8681318681318682, + "grad_norm": 0.031707342714071274, + "learning_rate": 7.945998567948052e-06, + "loss": 0.9655585289001465, + "step": 1020 + }, + { + "epoch": 1.8717948717948718, + "grad_norm": 0.4221316874027252, + "learning_rate": 7.912525446942406e-06, + "loss": 1.1163830757141113, + "step": 1022 + }, + { + "epoch": 1.8754578754578755, + "grad_norm": 0.4291134774684906, + "learning_rate": 7.879100660918713e-06, + "loss": 0.5574125051498413, + "step": 1024 + }, + { + "epoch": 1.879120879120879, + "grad_norm": 0.27034541964530945, + "learning_rate": 7.845724733148149e-06, + "loss": 1.154983401298523, + "step": 1026 + }, + { + "epoch": 1.8827838827838828, + "grad_norm": 0.6794618964195251, + "learning_rate": 7.812398186136994e-06, + "loss": 0.9258843064308167, + "step": 1028 + }, + { + "epoch": 1.8864468864468864, + "grad_norm": 0.2529967129230499, + "learning_rate": 7.779121541618478e-06, + "loss": 1.1542246341705322, + "step": 1030 + }, + { + "epoch": 1.89010989010989, + "grad_norm": 0.2132919579744339, + "learning_rate": 7.74589532054459e-06, + "loss": 0.9495857954025269, + "step": 1032 + }, + { + "epoch": 1.8937728937728937, + "grad_norm": 1.0942879915237427, + "learning_rate": 7.712720043077929e-06, + "loss": 0.7942955493927002, + "step": 1034 + }, + { + "epoch": 1.8974358974358974, + "grad_norm": 0.38435038924217224, + "learning_rate": 7.679596228583563e-06, + "loss": 1.1871325969696045, + "step": 1036 + }, + { + "epoch": 1.901098901098901, + "grad_norm": 0.26236408948898315, + "learning_rate": 7.646524395620908e-06, + "loss": 1.1567392349243164, + "step": 1038 + }, + { + "epoch": 1.9047619047619047, + "grad_norm": 0.7066712975502014, + "learning_rate": 7.613505061935584e-06, + "loss": 1.227746844291687, + "step": 1040 + }, + { + "epoch": 1.9084249084249083, + "grad_norm": 0.363916277885437, + "learning_rate": 7.580538744451336e-06, + "loss": 0.6182924509048462, + "step": 1042 + }, + { + "epoch": 1.912087912087912, + "grad_norm": 0.4058935046195984, + "learning_rate": 7.547625959261928e-06, + "loss": 0.8744593858718872, + "step": 1044 + }, + { + "epoch": 1.9157509157509156, + "grad_norm": 0.15468518435955048, + "learning_rate": 7.5147672216230605e-06, + "loss": 1.1123764514923096, + "step": 1046 + }, + { + "epoch": 1.9194139194139193, + "grad_norm": 0.3142243027687073, + "learning_rate": 7.481963045944318e-06, + "loss": 0.4596446752548218, + "step": 1048 + }, + { + "epoch": 1.9230769230769231, + "grad_norm": 1.057568907737732, + "learning_rate": 7.449213945781102e-06, + "loss": 0.8759109973907471, + "step": 1050 + }, + { + "epoch": 1.9267399267399268, + "grad_norm": 0.2798309028148651, + "learning_rate": 7.416520433826599e-06, + "loss": 0.8125253319740295, + "step": 1052 + }, + { + "epoch": 1.9304029304029304, + "grad_norm": 0.3273449242115021, + "learning_rate": 7.383883021903755e-06, + "loss": 1.1237201690673828, + "step": 1054 + }, + { + "epoch": 1.934065934065934, + "grad_norm": 1.727766990661621, + "learning_rate": 7.351302220957251e-06, + "loss": 0.7471660375595093, + "step": 1056 + }, + { + "epoch": 1.9377289377289377, + "grad_norm": 0.21788397431373596, + "learning_rate": 7.318778541045517e-06, + "loss": 0.9874610304832458, + "step": 1058 + }, + { + "epoch": 1.9413919413919414, + "grad_norm": 0.3261561393737793, + "learning_rate": 7.286312491332754e-06, + "loss": 1.0792856216430664, + "step": 1060 + }, + { + "epoch": 1.945054945054945, + "grad_norm": 0.19054535031318665, + "learning_rate": 7.253904580080926e-06, + "loss": 0.8402732014656067, + "step": 1062 + }, + { + "epoch": 1.9487179487179487, + "grad_norm": 0.28284960985183716, + "learning_rate": 7.221555314641853e-06, + "loss": 0.8096812963485718, + "step": 1064 + }, + { + "epoch": 1.9523809523809523, + "grad_norm": 0.4529983699321747, + "learning_rate": 7.18926520144924e-06, + "loss": 1.15970778465271, + "step": 1066 + }, + { + "epoch": 1.9560439560439562, + "grad_norm": 0.2033073455095291, + "learning_rate": 7.1570347460107335e-06, + "loss": 1.1832486391067505, + "step": 1068 + }, + { + "epoch": 1.9597069597069599, + "grad_norm": 0.09281979501247406, + "learning_rate": 7.124864452900049e-06, + "loss": 0.7031117081642151, + "step": 1070 + }, + { + "epoch": 1.9633699633699635, + "grad_norm": 0.20894154906272888, + "learning_rate": 7.0927548257490465e-06, + "loss": 0.8534321188926697, + "step": 1072 + }, + { + "epoch": 1.9670329670329672, + "grad_norm": 0.292833149433136, + "learning_rate": 7.060706367239836e-06, + "loss": 1.1420583724975586, + "step": 1074 + }, + { + "epoch": 1.9706959706959708, + "grad_norm": 0.16723449528217316, + "learning_rate": 7.028719579096932e-06, + "loss": 1.12359619140625, + "step": 1076 + }, + { + "epoch": 1.9743589743589745, + "grad_norm": 0.36365193128585815, + "learning_rate": 6.9967949620793854e-06, + "loss": 0.999005138874054, + "step": 1078 + }, + { + "epoch": 1.978021978021978, + "grad_norm": 0.2431362122297287, + "learning_rate": 6.964933015972947e-06, + "loss": 1.0200340747833252, + "step": 1080 + }, + { + "epoch": 1.9816849816849818, + "grad_norm": 0.23444706201553345, + "learning_rate": 6.933134239582246e-06, + "loss": 1.1037390232086182, + "step": 1082 + }, + { + "epoch": 1.9853479853479854, + "grad_norm": 0.13071784377098083, + "learning_rate": 6.9013991307229745e-06, + "loss": 0.7239289879798889, + "step": 1084 + }, + { + "epoch": 1.989010989010989, + "grad_norm": 1.6581590175628662, + "learning_rate": 6.869728186214093e-06, + "loss": 0.9764227271080017, + "step": 1086 + }, + { + "epoch": 1.9926739926739927, + "grad_norm": 0.40288543701171875, + "learning_rate": 6.8381219018700675e-06, + "loss": 0.976794421672821, + "step": 1088 + }, + { + "epoch": 1.9963369963369964, + "grad_norm": 0.14982423186302185, + "learning_rate": 6.806580772493088e-06, + "loss": 0.9331335425376892, + "step": 1090 + }, + { + "epoch": 2.0, + "grad_norm": 0.9287575483322144, + "learning_rate": 6.775105291865343e-06, + "loss": 1.042527198791504, + "step": 1092 + }, + { + "epoch": 2.0036630036630036, + "grad_norm": 0.3157676160335541, + "learning_rate": 6.743695952741265e-06, + "loss": 1.0824307203292847, + "step": 1094 + }, + { + "epoch": 2.0073260073260073, + "grad_norm": 0.2867039144039154, + "learning_rate": 6.71235324683983e-06, + "loss": 1.1392470598220825, + "step": 1096 + }, + { + "epoch": 2.010989010989011, + "grad_norm": 0.24429775774478912, + "learning_rate": 6.681077664836872e-06, + "loss": 1.0826945304870605, + "step": 1098 + }, + { + "epoch": 2.0146520146520146, + "grad_norm": 0.49506935477256775, + "learning_rate": 6.649869696357381e-06, + "loss": 1.162758469581604, + "step": 1100 + }, + { + "epoch": 2.0183150183150182, + "grad_norm": 0.2504410743713379, + "learning_rate": 6.6187298299678295e-06, + "loss": 0.8383739590644836, + "step": 1102 + }, + { + "epoch": 2.021978021978022, + "grad_norm": 0.2852114140987396, + "learning_rate": 6.587658553168563e-06, + "loss": 1.142665147781372, + "step": 1104 + }, + { + "epoch": 2.0256410256410255, + "grad_norm": 0.21823060512542725, + "learning_rate": 6.556656352386135e-06, + "loss": 0.7671647667884827, + "step": 1106 + }, + { + "epoch": 2.029304029304029, + "grad_norm": 0.23471537232398987, + "learning_rate": 6.525723712965698e-06, + "loss": 1.1795700788497925, + "step": 1108 + }, + { + "epoch": 2.032967032967033, + "grad_norm": 0.2844531536102295, + "learning_rate": 6.494861119163412e-06, + "loss": 0.8791682720184326, + "step": 1110 + }, + { + "epoch": 2.0366300366300365, + "grad_norm": 0.09012774378061295, + "learning_rate": 6.464069054138853e-06, + "loss": 0.6179495453834534, + "step": 1112 + }, + { + "epoch": 2.04029304029304, + "grad_norm": 0.4640715718269348, + "learning_rate": 6.433347999947468e-06, + "loss": 0.8358191251754761, + "step": 1114 + }, + { + "epoch": 2.043956043956044, + "grad_norm": 0.4230431318283081, + "learning_rate": 6.402698437533012e-06, + "loss": 1.157958984375, + "step": 1116 + }, + { + "epoch": 2.0476190476190474, + "grad_norm": 0.2907547950744629, + "learning_rate": 6.372120846720018e-06, + "loss": 1.117270827293396, + "step": 1118 + }, + { + "epoch": 2.051282051282051, + "grad_norm": 0.406525582075119, + "learning_rate": 6.341615706206292e-06, + "loss": 0.840583324432373, + "step": 1120 + }, + { + "epoch": 2.0549450549450547, + "grad_norm": 3.0575461387634277, + "learning_rate": 6.311183493555426e-06, + "loss": 1.3155573606491089, + "step": 1122 + }, + { + "epoch": 2.0586080586080584, + "grad_norm": 0.2396925538778305, + "learning_rate": 6.280824685189296e-06, + "loss": 1.136577844619751, + "step": 1124 + }, + { + "epoch": 2.062271062271062, + "grad_norm": 0.2372613549232483, + "learning_rate": 6.25053975638064e-06, + "loss": 0.761581540107727, + "step": 1126 + }, + { + "epoch": 2.065934065934066, + "grad_norm": 0.1678687483072281, + "learning_rate": 6.220329181245585e-06, + "loss": 1.104315161705017, + "step": 1128 + }, + { + "epoch": 2.06959706959707, + "grad_norm": 0.24123162031173706, + "learning_rate": 6.1901934327362355e-06, + "loss": 1.0985783338546753, + "step": 1130 + }, + { + "epoch": 2.0732600732600734, + "grad_norm": 0.2733916640281677, + "learning_rate": 6.16013298263328e-06, + "loss": 1.13059401512146, + "step": 1132 + }, + { + "epoch": 2.076923076923077, + "grad_norm": 0.2685484290122986, + "learning_rate": 6.130148301538601e-06, + "loss": 1.1149473190307617, + "step": 1134 + }, + { + "epoch": 2.0805860805860807, + "grad_norm": 0.2145625352859497, + "learning_rate": 6.100239858867887e-06, + "loss": 0.7262951731681824, + "step": 1136 + }, + { + "epoch": 2.0842490842490844, + "grad_norm": 0.21125853061676025, + "learning_rate": 6.070408122843311e-06, + "loss": 1.1171889305114746, + "step": 1138 + }, + { + "epoch": 2.087912087912088, + "grad_norm": 0.6227608323097229, + "learning_rate": 6.040653560486183e-06, + "loss": 1.0216211080551147, + "step": 1140 + }, + { + "epoch": 2.0915750915750917, + "grad_norm": 1.5662765502929688, + "learning_rate": 6.010976637609653e-06, + "loss": 1.1097993850708008, + "step": 1142 + }, + { + "epoch": 2.0952380952380953, + "grad_norm": 0.17495885491371155, + "learning_rate": 5.9813778188114125e-06, + "loss": 0.38371744751930237, + "step": 1144 + }, + { + "epoch": 2.098901098901099, + "grad_norm": 0.5771632790565491, + "learning_rate": 5.951857567466401e-06, + "loss": 0.9106112718582153, + "step": 1146 + }, + { + "epoch": 2.1025641025641026, + "grad_norm": 0.33344414830207825, + "learning_rate": 5.922416345719588e-06, + "loss": 0.8066908717155457, + "step": 1148 + }, + { + "epoch": 2.1062271062271063, + "grad_norm": 0.22791945934295654, + "learning_rate": 5.893054614478718e-06, + "loss": 0.8162147998809814, + "step": 1150 + }, + { + "epoch": 2.10989010989011, + "grad_norm": 0.2723308801651001, + "learning_rate": 5.8637728334070905e-06, + "loss": 0.9493057131767273, + "step": 1152 + }, + { + "epoch": 2.1135531135531136, + "grad_norm": 0.5839288830757141, + "learning_rate": 5.834571460916371e-06, + "loss": 0.7895359992980957, + "step": 1154 + }, + { + "epoch": 2.1172161172161172, + "grad_norm": 0.22610190510749817, + "learning_rate": 5.805450954159422e-06, + "loss": 1.0748239755630493, + "step": 1156 + }, + { + "epoch": 2.120879120879121, + "grad_norm": 0.5260972380638123, + "learning_rate": 5.776411769023127e-06, + "loss": 1.0262991189956665, + "step": 1158 + }, + { + "epoch": 2.1245421245421245, + "grad_norm": 1.0613799095153809, + "learning_rate": 5.747454360121274e-06, + "loss": 0.8698439002037048, + "step": 1160 + }, + { + "epoch": 2.128205128205128, + "grad_norm": 0.7695472240447998, + "learning_rate": 5.718579180787425e-06, + "loss": 0.7719033360481262, + "step": 1162 + }, + { + "epoch": 2.131868131868132, + "grad_norm": 0.3090405762195587, + "learning_rate": 5.689786683067817e-06, + "loss": 0.9184219837188721, + "step": 1164 + }, + { + "epoch": 2.1355311355311355, + "grad_norm": 0.3468276262283325, + "learning_rate": 5.661077317714303e-06, + "loss": 0.4223445951938629, + "step": 1166 + }, + { + "epoch": 2.139194139194139, + "grad_norm": 0.24201928079128265, + "learning_rate": 5.632451534177276e-06, + "loss": 0.4232040047645569, + "step": 1168 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 0.6462178826332092, + "learning_rate": 5.603909780598644e-06, + "loss": 0.8980230689048767, + "step": 1170 + }, + { + "epoch": 2.1465201465201464, + "grad_norm": 0.6586897969245911, + "learning_rate": 5.575452503804805e-06, + "loss": 1.1271127462387085, + "step": 1172 + }, + { + "epoch": 2.15018315018315, + "grad_norm": 0.5097822546958923, + "learning_rate": 5.5470801492996605e-06, + "loss": 1.2748925685882568, + "step": 1174 + }, + { + "epoch": 2.1538461538461537, + "grad_norm": 0.3521871864795685, + "learning_rate": 5.518793161257641e-06, + "loss": 0.7676212191581726, + "step": 1176 + }, + { + "epoch": 2.1575091575091574, + "grad_norm": 0.15629425644874573, + "learning_rate": 5.490591982516749e-06, + "loss": 1.1415168046951294, + "step": 1178 + }, + { + "epoch": 2.161172161172161, + "grad_norm": 0.1941349059343338, + "learning_rate": 5.462477054571617e-06, + "loss": 1.12305748462677, + "step": 1180 + }, + { + "epoch": 2.1648351648351647, + "grad_norm": 0.22985997796058655, + "learning_rate": 5.4344488175666154e-06, + "loss": 1.1313296556472778, + "step": 1182 + }, + { + "epoch": 2.1684981684981683, + "grad_norm": 0.15692557394504547, + "learning_rate": 5.406507710288955e-06, + "loss": 1.1444305181503296, + "step": 1184 + }, + { + "epoch": 2.172161172161172, + "grad_norm": 0.247311532497406, + "learning_rate": 5.378654170161805e-06, + "loss": 0.31945711374282837, + "step": 1186 + }, + { + "epoch": 2.1758241758241756, + "grad_norm": 1.0448226928710938, + "learning_rate": 5.3508886332374534e-06, + "loss": 1.2239130735397339, + "step": 1188 + }, + { + "epoch": 2.1794871794871793, + "grad_norm": 1.5233474969863892, + "learning_rate": 5.323211534190496e-06, + "loss": 0.7491469979286194, + "step": 1190 + }, + { + "epoch": 2.183150183150183, + "grad_norm": 0.4820270538330078, + "learning_rate": 5.295623306310999e-06, + "loss": 0.9774181246757507, + "step": 1192 + }, + { + "epoch": 2.186813186813187, + "grad_norm": 0.19126620888710022, + "learning_rate": 5.268124381497755e-06, + "loss": 0.8192515969276428, + "step": 1194 + }, + { + "epoch": 2.1904761904761907, + "grad_norm": 0.45134642720222473, + "learning_rate": 5.240715190251484e-06, + "loss": 0.8925463557243347, + "step": 1196 + }, + { + "epoch": 2.1941391941391943, + "grad_norm": 0.4911046624183655, + "learning_rate": 5.213396161668111e-06, + "loss": 1.1126856803894043, + "step": 1198 + }, + { + "epoch": 2.197802197802198, + "grad_norm": 3.0808489322662354, + "learning_rate": 5.186167723432061e-06, + "loss": 0.7509762048721313, + "step": 1200 + }, + { + "epoch": 2.2014652014652016, + "grad_norm": 0.27347448468208313, + "learning_rate": 5.159030301809534e-06, + "loss": 1.1802153587341309, + "step": 1202 + }, + { + "epoch": 2.2051282051282053, + "grad_norm": 0.2847099006175995, + "learning_rate": 5.131984321641865e-06, + "loss": 1.129975438117981, + "step": 1204 + }, + { + "epoch": 2.208791208791209, + "grad_norm": 0.4671761393547058, + "learning_rate": 5.105030206338843e-06, + "loss": 0.4967349171638489, + "step": 1206 + }, + { + "epoch": 2.2124542124542126, + "grad_norm": 0.3511367738246918, + "learning_rate": 5.0781683778720965e-06, + "loss": 0.7303721308708191, + "step": 1208 + }, + { + "epoch": 2.2161172161172162, + "grad_norm": 0.5738789439201355, + "learning_rate": 5.051399256768498e-06, + "loss": 0.6005207896232605, + "step": 1210 + }, + { + "epoch": 2.21978021978022, + "grad_norm": 0.14039315283298492, + "learning_rate": 5.024723262103559e-06, + "loss": 1.0828591585159302, + "step": 1212 + }, + { + "epoch": 2.2234432234432235, + "grad_norm": 0.6400618553161621, + "learning_rate": 4.998140811494881e-06, + "loss": 0.8194615840911865, + "step": 1214 + }, + { + "epoch": 2.227106227106227, + "grad_norm": 0.4215098023414612, + "learning_rate": 4.971652321095614e-06, + "loss": 0.8995423913002014, + "step": 1216 + }, + { + "epoch": 2.230769230769231, + "grad_norm": 0.35582196712493896, + "learning_rate": 4.945258205587955e-06, + "loss": 1.1104472875595093, + "step": 1218 + }, + { + "epoch": 2.2344322344322345, + "grad_norm": 0.3917670249938965, + "learning_rate": 4.918958878176628e-06, + "loss": 1.117361068725586, + "step": 1220 + }, + { + "epoch": 2.238095238095238, + "grad_norm": 0.25310394167900085, + "learning_rate": 4.8927547505824465e-06, + "loss": 1.1276395320892334, + "step": 1222 + }, + { + "epoch": 2.241758241758242, + "grad_norm": 0.3766920566558838, + "learning_rate": 4.866646233035845e-06, + "loss": 0.5073704123497009, + "step": 1224 + }, + { + "epoch": 2.2454212454212454, + "grad_norm": 0.36118075251579285, + "learning_rate": 4.840633734270464e-06, + "loss": 1.083837866783142, + "step": 1226 + }, + { + "epoch": 2.249084249084249, + "grad_norm": 0.9737128019332886, + "learning_rate": 4.814717661516762e-06, + "loss": 0.6141456365585327, + "step": 1228 + }, + { + "epoch": 2.2527472527472527, + "grad_norm": 0.060193657875061035, + "learning_rate": 4.788898420495622e-06, + "loss": 0.5997035503387451, + "step": 1230 + }, + { + "epoch": 2.2564102564102564, + "grad_norm": 0.1481347531080246, + "learning_rate": 4.763176415412006e-06, + "loss": 0.6170958876609802, + "step": 1232 + }, + { + "epoch": 2.26007326007326, + "grad_norm": 2.355868339538574, + "learning_rate": 4.7375520489486395e-06, + "loss": 1.1160298585891724, + "step": 1234 + }, + { + "epoch": 2.2637362637362637, + "grad_norm": 0.468891978263855, + "learning_rate": 4.71202572225969e-06, + "loss": 0.7722483277320862, + "step": 1236 + }, + { + "epoch": 2.2673992673992673, + "grad_norm": 0.17015230655670166, + "learning_rate": 4.686597834964499e-06, + "loss": 0.9469270706176758, + "step": 1238 + }, + { + "epoch": 2.271062271062271, + "grad_norm": 0.5269258618354797, + "learning_rate": 4.661268785141316e-06, + "loss": 0.8888347744941711, + "step": 1240 + }, + { + "epoch": 2.2747252747252746, + "grad_norm": 0.799193799495697, + "learning_rate": 4.636038969321073e-06, + "loss": 0.9122021198272705, + "step": 1242 + }, + { + "epoch": 2.2783882783882783, + "grad_norm": 0.239344522356987, + "learning_rate": 4.610908782481179e-06, + "loss": 1.1070375442504883, + "step": 1244 + }, + { + "epoch": 2.282051282051282, + "grad_norm": 0.10058314353227615, + "learning_rate": 4.5858786180393326e-06, + "loss": 0.7249715328216553, + "step": 1246 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.4845520853996277, + "learning_rate": 4.560948867847359e-06, + "loss": 0.7078754305839539, + "step": 1248 + }, + { + "epoch": 2.2893772893772892, + "grad_norm": 0.3414648175239563, + "learning_rate": 4.536119922185082e-06, + "loss": 1.100352168083191, + "step": 1250 + }, + { + "epoch": 2.293040293040293, + "grad_norm": 0.4015507400035858, + "learning_rate": 4.511392169754214e-06, + "loss": 0.8295157551765442, + "step": 1252 + }, + { + "epoch": 2.2967032967032965, + "grad_norm": 1.4144065380096436, + "learning_rate": 4.486765997672263e-06, + "loss": 1.1649739742279053, + "step": 1254 + }, + { + "epoch": 2.3003663003663, + "grad_norm": 0.19826774299144745, + "learning_rate": 4.46224179146649e-06, + "loss": 1.1785088777542114, + "step": 1256 + }, + { + "epoch": 2.304029304029304, + "grad_norm": 0.8159498572349548, + "learning_rate": 4.437819935067847e-06, + "loss": 0.7735372185707092, + "step": 1258 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 0.6313501596450806, + "learning_rate": 4.413500810804986e-06, + "loss": 1.1706268787384033, + "step": 1260 + }, + { + "epoch": 2.311355311355311, + "grad_norm": 0.29691141843795776, + "learning_rate": 4.389284799398276e-06, + "loss": 1.0271168947219849, + "step": 1262 + }, + { + "epoch": 2.315018315018315, + "grad_norm": 0.36938998103141785, + "learning_rate": 4.365172279953825e-06, + "loss": 0.8776956796646118, + "step": 1264 + }, + { + "epoch": 2.3186813186813184, + "grad_norm": 0.44165417551994324, + "learning_rate": 4.34116362995756e-06, + "loss": 0.7929419279098511, + "step": 1266 + }, + { + "epoch": 2.3223443223443225, + "grad_norm": 0.5072553157806396, + "learning_rate": 4.317259225269313e-06, + "loss": 0.8222364187240601, + "step": 1268 + }, + { + "epoch": 2.326007326007326, + "grad_norm": 0.37858933210372925, + "learning_rate": 4.293459440116935e-06, + "loss": 1.1856492757797241, + "step": 1270 + }, + { + "epoch": 2.32967032967033, + "grad_norm": 0.27582108974456787, + "learning_rate": 4.269764647090442e-06, + "loss": 1.0090421438217163, + "step": 1272 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.26256465911865234, + "learning_rate": 4.246175217136176e-06, + "loss": 0.6174538135528564, + "step": 1274 + }, + { + "epoch": 2.336996336996337, + "grad_norm": 0.2537238597869873, + "learning_rate": 4.2226915195509954e-06, + "loss": 1.0155659914016724, + "step": 1276 + }, + { + "epoch": 2.340659340659341, + "grad_norm": 0.8025590777397156, + "learning_rate": 4.199313921976511e-06, + "loss": 0.8254562616348267, + "step": 1278 + }, + { + "epoch": 2.3443223443223444, + "grad_norm": 0.2071162611246109, + "learning_rate": 4.176042790393313e-06, + "loss": 0.752740204334259, + "step": 1280 + }, + { + "epoch": 2.347985347985348, + "grad_norm": 0.381971538066864, + "learning_rate": 4.152878489115244e-06, + "loss": 0.6165816783905029, + "step": 1282 + }, + { + "epoch": 2.3516483516483517, + "grad_norm": 0.16895699501037598, + "learning_rate": 4.129821380783698e-06, + "loss": 1.092519760131836, + "step": 1284 + }, + { + "epoch": 2.3553113553113554, + "grad_norm": 2.325608491897583, + "learning_rate": 4.106871826361952e-06, + "loss": 0.7248493432998657, + "step": 1286 + }, + { + "epoch": 2.358974358974359, + "grad_norm": 0.22204965353012085, + "learning_rate": 4.084030185129495e-06, + "loss": 0.924081027507782, + "step": 1288 + }, + { + "epoch": 2.3626373626373627, + "grad_norm": 0.368304044008255, + "learning_rate": 4.061296814676429e-06, + "loss": 0.921589195728302, + "step": 1290 + }, + { + "epoch": 2.3663003663003663, + "grad_norm": 0.35086876153945923, + "learning_rate": 4.038672070897844e-06, + "loss": 0.7640153765678406, + "step": 1292 + }, + { + "epoch": 2.36996336996337, + "grad_norm": 0.29217588901519775, + "learning_rate": 4.016156307988262e-06, + "loss": 0.8899466395378113, + "step": 1294 + }, + { + "epoch": 2.3736263736263736, + "grad_norm": 0.3964160978794098, + "learning_rate": 3.9937498784361e-06, + "loss": 1.2294063568115234, + "step": 1296 + }, + { + "epoch": 2.3772893772893773, + "grad_norm": 0.282614141702652, + "learning_rate": 3.9714531330181275e-06, + "loss": 1.0998694896697998, + "step": 1298 + }, + { + "epoch": 2.380952380952381, + "grad_norm": 0.35640949010849, + "learning_rate": 3.949266420793999e-06, + "loss": 1.1518428325653076, + "step": 1300 + }, + { + "epoch": 2.3846153846153846, + "grad_norm": 0.8390282988548279, + "learning_rate": 3.9271900891007734e-06, + "loss": 0.8655717968940735, + "step": 1302 + }, + { + "epoch": 2.3882783882783882, + "grad_norm": 0.27162355184555054, + "learning_rate": 3.905224483547479e-06, + "loss": 0.4701066017150879, + "step": 1304 + }, + { + "epoch": 2.391941391941392, + "grad_norm": 0.5130133628845215, + "learning_rate": 3.883369948009714e-06, + "loss": 0.9348631501197815, + "step": 1306 + }, + { + "epoch": 2.3956043956043955, + "grad_norm": 0.19122500717639923, + "learning_rate": 3.861626824624258e-06, + "loss": 1.1146955490112305, + "step": 1308 + }, + { + "epoch": 2.399267399267399, + "grad_norm": 0.7361530065536499, + "learning_rate": 3.839995453783694e-06, + "loss": 0.514556348323822, + "step": 1310 + }, + { + "epoch": 2.402930402930403, + "grad_norm": 0.4885084629058838, + "learning_rate": 3.818476174131118e-06, + "loss": 1.121952772140503, + "step": 1312 + }, + { + "epoch": 2.4065934065934065, + "grad_norm": 0.2934611439704895, + "learning_rate": 3.7970693225548116e-06, + "loss": 0.8278881311416626, + "step": 1314 + }, + { + "epoch": 2.41025641025641, + "grad_norm": 0.24372442066669464, + "learning_rate": 3.7757752341829723e-06, + "loss": 1.1141726970672607, + "step": 1316 + }, + { + "epoch": 2.413919413919414, + "grad_norm": 0.7190600633621216, + "learning_rate": 3.754594242378466e-06, + "loss": 0.7716094255447388, + "step": 1318 + }, + { + "epoch": 2.4175824175824174, + "grad_norm": 0.14406920969486237, + "learning_rate": 3.7335266787336194e-06, + "loss": 0.7671050429344177, + "step": 1320 + }, + { + "epoch": 2.421245421245421, + "grad_norm": 0.32376912236213684, + "learning_rate": 3.712572873065012e-06, + "loss": 0.6145520210266113, + "step": 1322 + }, + { + "epoch": 2.4249084249084247, + "grad_norm": 0.44639283418655396, + "learning_rate": 3.69173315340833e-06, + "loss": 0.7646421194076538, + "step": 1324 + }, + { + "epoch": 2.4285714285714284, + "grad_norm": 0.21105606853961945, + "learning_rate": 3.6710078460132137e-06, + "loss": 0.7964082956314087, + "step": 1326 + }, + { + "epoch": 2.4322344322344325, + "grad_norm": 0.0819050669670105, + "learning_rate": 3.650397275338161e-06, + "loss": 0.6835886240005493, + "step": 1328 + }, + { + "epoch": 2.435897435897436, + "grad_norm": 0.21763509511947632, + "learning_rate": 3.6299017640454516e-06, + "loss": 0.7992286682128906, + "step": 1330 + }, + { + "epoch": 2.4395604395604398, + "grad_norm": 0.2989668846130371, + "learning_rate": 3.6095216329960786e-06, + "loss": 1.0545233488082886, + "step": 1332 + }, + { + "epoch": 2.4432234432234434, + "grad_norm": 0.5623311400413513, + "learning_rate": 3.5892572012447457e-06, + "loss": 0.7113739848136902, + "step": 1334 + }, + { + "epoch": 2.446886446886447, + "grad_norm": 0.372221440076828, + "learning_rate": 3.5691087860348577e-06, + "loss": 0.6705981492996216, + "step": 1336 + }, + { + "epoch": 2.4505494505494507, + "grad_norm": 0.29782795906066895, + "learning_rate": 3.549076702793557e-06, + "loss": 1.0251377820968628, + "step": 1338 + }, + { + "epoch": 2.4542124542124544, + "grad_norm": 0.12661497294902802, + "learning_rate": 3.529161265126795e-06, + "loss": 0.8721184730529785, + "step": 1340 + }, + { + "epoch": 2.457875457875458, + "grad_norm": 0.20114953815937042, + "learning_rate": 3.5093627848144128e-06, + "loss": 0.6586589813232422, + "step": 1342 + }, + { + "epoch": 2.4615384615384617, + "grad_norm": 0.22184857726097107, + "learning_rate": 3.4896815718052534e-06, + "loss": 0.8093105554580688, + "step": 1344 + }, + { + "epoch": 2.4652014652014653, + "grad_norm": 0.3594045639038086, + "learning_rate": 3.4701179342123313e-06, + "loss": 1.0655725002288818, + "step": 1346 + }, + { + "epoch": 2.468864468864469, + "grad_norm": 0.19054588675498962, + "learning_rate": 3.4506721783079925e-06, + "loss": 1.1080653667449951, + "step": 1348 + }, + { + "epoch": 2.4725274725274726, + "grad_norm": 0.2145404815673828, + "learning_rate": 3.4313446085191203e-06, + "loss": 0.7264165878295898, + "step": 1350 + }, + { + "epoch": 2.4761904761904763, + "grad_norm": 0.21821582317352295, + "learning_rate": 3.4121355274223727e-06, + "loss": 0.8345311284065247, + "step": 1352 + }, + { + "epoch": 2.47985347985348, + "grad_norm": 0.7348686456680298, + "learning_rate": 3.3930452357394473e-06, + "loss": 0.9193353056907654, + "step": 1354 + }, + { + "epoch": 2.4835164835164836, + "grad_norm": 4.1631364822387695, + "learning_rate": 3.3740740323323705e-06, + "loss": 0.9116562604904175, + "step": 1356 + }, + { + "epoch": 2.4871794871794872, + "grad_norm": 0.33455634117126465, + "learning_rate": 3.3552222141988257e-06, + "loss": 1.0841375589370728, + "step": 1358 + }, + { + "epoch": 2.490842490842491, + "grad_norm": 0.42869114875793457, + "learning_rate": 3.336490076467489e-06, + "loss": 0.8795775175094604, + "step": 1360 + }, + { + "epoch": 2.4945054945054945, + "grad_norm": 0.1992548406124115, + "learning_rate": 3.31787791239342e-06, + "loss": 0.8005448579788208, + "step": 1362 + }, + { + "epoch": 2.498168498168498, + "grad_norm": 0.4712922275066376, + "learning_rate": 3.2993860133534763e-06, + "loss": 0.78562992811203, + "step": 1364 + }, + { + "epoch": 2.501831501831502, + "grad_norm": 0.33841851353645325, + "learning_rate": 3.2810146688417304e-06, + "loss": 1.1035329103469849, + "step": 1366 + }, + { + "epoch": 2.5054945054945055, + "grad_norm": 0.44614377617836, + "learning_rate": 3.2627641664649666e-06, + "loss": 0.8065237998962402, + "step": 1368 + }, + { + "epoch": 2.509157509157509, + "grad_norm": 0.30702492594718933, + "learning_rate": 3.2446347919381533e-06, + "loss": 0.9479190707206726, + "step": 1370 + }, + { + "epoch": 2.5128205128205128, + "grad_norm": 0.13606101274490356, + "learning_rate": 3.226626829079979e-06, + "loss": 0.6367249488830566, + "step": 1372 + }, + { + "epoch": 2.5164835164835164, + "grad_norm": 0.3242248296737671, + "learning_rate": 3.2087405598084194e-06, + "loss": 0.7743354439735413, + "step": 1374 + }, + { + "epoch": 2.52014652014652, + "grad_norm": 0.9609295129776001, + "learning_rate": 3.1909762641363083e-06, + "loss": 0.9668674468994141, + "step": 1376 + }, + { + "epoch": 2.5238095238095237, + "grad_norm": 0.35177093744277954, + "learning_rate": 3.173334220166962e-06, + "loss": 0.7824784517288208, + "step": 1378 + }, + { + "epoch": 2.5274725274725274, + "grad_norm": 0.26057368516921997, + "learning_rate": 3.155814704089823e-06, + "loss": 0.8903582692146301, + "step": 1380 + }, + { + "epoch": 2.531135531135531, + "grad_norm": 0.06582798063755035, + "learning_rate": 3.1384179901761343e-06, + "loss": 0.8639101982116699, + "step": 1382 + }, + { + "epoch": 2.5347985347985347, + "grad_norm": 0.33872169256210327, + "learning_rate": 3.1211443507746546e-06, + "loss": 0.7889021039009094, + "step": 1384 + }, + { + "epoch": 2.5384615384615383, + "grad_norm": 0.21574373543262482, + "learning_rate": 3.1039940563073894e-06, + "loss": 1.0973457098007202, + "step": 1386 + }, + { + "epoch": 2.542124542124542, + "grad_norm": 0.23947708308696747, + "learning_rate": 3.0869673752653447e-06, + "loss": 0.755636990070343, + "step": 1388 + }, + { + "epoch": 2.5457875457875456, + "grad_norm": 0.2152792364358902, + "learning_rate": 3.0700645742043476e-06, + "loss": 0.8107748627662659, + "step": 1390 + }, + { + "epoch": 2.5494505494505493, + "grad_norm": 0.23220431804656982, + "learning_rate": 3.0532859177408587e-06, + "loss": 0.9791698455810547, + "step": 1392 + }, + { + "epoch": 2.553113553113553, + "grad_norm": 0.3370932638645172, + "learning_rate": 3.03663166854783e-06, + "loss": 0.7153705358505249, + "step": 1394 + }, + { + "epoch": 2.5567765567765566, + "grad_norm": 0.1939450353384018, + "learning_rate": 3.020102087350594e-06, + "loss": 0.3394162356853485, + "step": 1396 + }, + { + "epoch": 2.5604395604395602, + "grad_norm": 0.34136688709259033, + "learning_rate": 3.0036974329227862e-06, + "loss": 1.1137323379516602, + "step": 1398 + }, + { + "epoch": 2.564102564102564, + "grad_norm": 0.5828881859779358, + "learning_rate": 2.9874179620822856e-06, + "loss": 1.1080697774887085, + "step": 1400 + }, + { + "epoch": 2.5677655677655675, + "grad_norm": 0.3508465886116028, + "learning_rate": 2.971263929687207e-06, + "loss": 0.892469048500061, + "step": 1402 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 0.27003780007362366, + "learning_rate": 2.9552355886318968e-06, + "loss": 1.1012053489685059, + "step": 1404 + }, + { + "epoch": 2.575091575091575, + "grad_norm": 4.884339809417725, + "learning_rate": 2.9393331898429777e-06, + "loss": 0.46851903200149536, + "step": 1406 + }, + { + "epoch": 2.578754578754579, + "grad_norm": 0.24890904128551483, + "learning_rate": 2.9235569822754317e-06, + "loss": 1.1480822563171387, + "step": 1408 + }, + { + "epoch": 2.5824175824175826, + "grad_norm": 0.41308149695396423, + "learning_rate": 2.9079072129086906e-06, + "loss": 0.8952036499977112, + "step": 1410 + }, + { + "epoch": 2.586080586080586, + "grad_norm": 0.7369239330291748, + "learning_rate": 2.89238412674277e-06, + "loss": 0.8408064246177673, + "step": 1412 + }, + { + "epoch": 2.58974358974359, + "grad_norm": 0.5877048969268799, + "learning_rate": 2.8769879667944393e-06, + "loss": 0.9505341053009033, + "step": 1414 + }, + { + "epoch": 2.5934065934065935, + "grad_norm": 0.2738566994667053, + "learning_rate": 2.8617189740934113e-06, + "loss": 1.0852067470550537, + "step": 1416 + }, + { + "epoch": 2.597069597069597, + "grad_norm": 0.7545573711395264, + "learning_rate": 2.8465773876785786e-06, + "loss": 0.7618016600608826, + "step": 1418 + }, + { + "epoch": 2.600732600732601, + "grad_norm": 0.4518103301525116, + "learning_rate": 2.8315634445942623e-06, + "loss": 1.0812193155288696, + "step": 1420 + }, + { + "epoch": 2.6043956043956045, + "grad_norm": 0.3092360496520996, + "learning_rate": 2.8166773798864978e-06, + "loss": 1.0327297449111938, + "step": 1422 + }, + { + "epoch": 2.608058608058608, + "grad_norm": 0.30283570289611816, + "learning_rate": 2.8019194265993683e-06, + "loss": 1.1834499835968018, + "step": 1424 + }, + { + "epoch": 2.6117216117216118, + "grad_norm": 0.4102247357368469, + "learning_rate": 2.787289815771348e-06, + "loss": 1.272240161895752, + "step": 1426 + }, + { + "epoch": 2.6153846153846154, + "grad_norm": 0.2674598693847656, + "learning_rate": 2.7727887764316835e-06, + "loss": 1.137374997138977, + "step": 1428 + }, + { + "epoch": 2.619047619047619, + "grad_norm": 0.8950290083885193, + "learning_rate": 2.758416535596812e-06, + "loss": 1.0797327756881714, + "step": 1430 + }, + { + "epoch": 2.6227106227106227, + "grad_norm": 0.3816538155078888, + "learning_rate": 2.744173318266809e-06, + "loss": 0.9714991450309753, + "step": 1432 + }, + { + "epoch": 2.6263736263736264, + "grad_norm": 0.43699485063552856, + "learning_rate": 2.7300593474218583e-06, + "loss": 0.9439188838005066, + "step": 1434 + }, + { + "epoch": 2.63003663003663, + "grad_norm": 0.805819571018219, + "learning_rate": 2.7160748440187736e-06, + "loss": 1.3011527061462402, + "step": 1436 + }, + { + "epoch": 2.6336996336996337, + "grad_norm": 2.7278735637664795, + "learning_rate": 2.702220026987525e-06, + "loss": 1.1346534490585327, + "step": 1438 + }, + { + "epoch": 2.6373626373626373, + "grad_norm": 0.21300676465034485, + "learning_rate": 2.6884951132278185e-06, + "loss": 1.095414161682129, + "step": 1440 + }, + { + "epoch": 2.641025641025641, + "grad_norm": 0.16259470582008362, + "learning_rate": 2.6749003176057092e-06, + "loss": 1.1217632293701172, + "step": 1442 + }, + { + "epoch": 2.6446886446886446, + "grad_norm": 0.4592875838279724, + "learning_rate": 2.6614358529502165e-06, + "loss": 1.0697576999664307, + "step": 1444 + }, + { + "epoch": 2.6483516483516483, + "grad_norm": 0.3081722557544708, + "learning_rate": 2.6481019300500166e-06, + "loss": 0.7924908995628357, + "step": 1446 + }, + { + "epoch": 2.652014652014652, + "grad_norm": 0.12425895035266876, + "learning_rate": 2.634898757650121e-06, + "loss": 0.8344003558158875, + "step": 1448 + }, + { + "epoch": 2.6556776556776556, + "grad_norm": 0.213938370347023, + "learning_rate": 2.6218265424486233e-06, + "loss": 1.090872049331665, + "step": 1450 + }, + { + "epoch": 2.659340659340659, + "grad_norm": 0.21575380861759186, + "learning_rate": 2.608885489093455e-06, + "loss": 0.7010395526885986, + "step": 1452 + }, + { + "epoch": 2.663003663003663, + "grad_norm": 0.07869178056716919, + "learning_rate": 2.5960758001791893e-06, + "loss": 0.6901416182518005, + "step": 1454 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.32918277382850647, + "learning_rate": 2.5833976762438605e-06, + "loss": 1.0698477029800415, + "step": 1456 + }, + { + "epoch": 2.67032967032967, + "grad_norm": 0.39367684721946716, + "learning_rate": 2.5708513157658295e-06, + "loss": 0.7514879107475281, + "step": 1458 + }, + { + "epoch": 2.6739926739926743, + "grad_norm": 0.07575741410255432, + "learning_rate": 2.5584369151606785e-06, + "loss": 0.7730086445808411, + "step": 1460 + }, + { + "epoch": 2.677655677655678, + "grad_norm": 0.2596700191497803, + "learning_rate": 2.5461546687781325e-06, + "loss": 0.7882156372070312, + "step": 1462 + }, + { + "epoch": 2.6813186813186816, + "grad_norm": 0.3725094199180603, + "learning_rate": 2.5340047688990142e-06, + "loss": 1.0619717836380005, + "step": 1464 + }, + { + "epoch": 2.684981684981685, + "grad_norm": 0.4336903989315033, + "learning_rate": 2.5219874057322453e-06, + "loss": 0.8744177222251892, + "step": 1466 + }, + { + "epoch": 2.688644688644689, + "grad_norm": 0.31862640380859375, + "learning_rate": 2.5101027674118523e-06, + "loss": 1.03647780418396, + "step": 1468 + }, + { + "epoch": 2.6923076923076925, + "grad_norm": 0.636555552482605, + "learning_rate": 2.4983510399940377e-06, + "loss": 0.6445450186729431, + "step": 1470 + }, + { + "epoch": 2.695970695970696, + "grad_norm": 0.5853780508041382, + "learning_rate": 2.4867324074542525e-06, + "loss": 0.7737827301025391, + "step": 1472 + }, + { + "epoch": 2.6996336996337, + "grad_norm": 0.12945802509784698, + "learning_rate": 2.4752470516843257e-06, + "loss": 0.43008971214294434, + "step": 1474 + }, + { + "epoch": 2.7032967032967035, + "grad_norm": 0.24216750264167786, + "learning_rate": 2.463895152489617e-06, + "loss": 1.1383622884750366, + "step": 1476 + }, + { + "epoch": 2.706959706959707, + "grad_norm": 0.29354915022850037, + "learning_rate": 2.4526768875861938e-06, + "loss": 0.6484386324882507, + "step": 1478 + }, + { + "epoch": 2.7106227106227108, + "grad_norm": 0.2161005586385727, + "learning_rate": 2.4415924325980575e-06, + "loss": 1.0878047943115234, + "step": 1480 + }, + { + "epoch": 2.7142857142857144, + "grad_norm": 0.2699625492095947, + "learning_rate": 2.4306419610543885e-06, + "loss": 1.0988802909851074, + "step": 1482 + }, + { + "epoch": 2.717948717948718, + "grad_norm": 0.20100824534893036, + "learning_rate": 2.4198256443868327e-06, + "loss": 1.0038988590240479, + "step": 1484 + }, + { + "epoch": 2.7216117216117217, + "grad_norm": 0.37425243854522705, + "learning_rate": 2.4091436519268167e-06, + "loss": 1.1373145580291748, + "step": 1486 + }, + { + "epoch": 2.7252747252747254, + "grad_norm": 0.8878154158592224, + "learning_rate": 2.3985961509028994e-06, + "loss": 1.1287609338760376, + "step": 1488 + }, + { + "epoch": 2.728937728937729, + "grad_norm": 0.36207541823387146, + "learning_rate": 2.3881833064381478e-06, + "loss": 0.5098522901535034, + "step": 1490 + }, + { + "epoch": 2.7326007326007327, + "grad_norm": 0.2365618795156479, + "learning_rate": 2.3779052815475553e-06, + "loss": 1.1206612586975098, + "step": 1492 + }, + { + "epoch": 2.7362637362637363, + "grad_norm": 0.5252121090888977, + "learning_rate": 2.3677622371354932e-06, + "loss": 0.6413739919662476, + "step": 1494 + }, + { + "epoch": 2.73992673992674, + "grad_norm": 0.575518012046814, + "learning_rate": 2.357754331993187e-06, + "loss": 1.2001720666885376, + "step": 1496 + }, + { + "epoch": 2.7435897435897436, + "grad_norm": 0.3244689404964447, + "learning_rate": 2.347881722796234e-06, + "loss": 1.0752826929092407, + "step": 1498 + }, + { + "epoch": 2.7472527472527473, + "grad_norm": 0.33936455845832825, + "learning_rate": 2.3381445641021445e-06, + "loss": 0.7569327354431152, + "step": 1500 + }, + { + "epoch": 2.750915750915751, + "grad_norm": 0.28682929277420044, + "learning_rate": 2.328543008347928e-06, + "loss": 1.0332515239715576, + "step": 1502 + }, + { + "epoch": 2.7545787545787546, + "grad_norm": 0.23232409358024597, + "learning_rate": 2.31907720584771e-06, + "loss": 1.1103003025054932, + "step": 1504 + }, + { + "epoch": 2.758241758241758, + "grad_norm": 0.2504388988018036, + "learning_rate": 2.3097473047903645e-06, + "loss": 1.1210606098175049, + "step": 1506 + }, + { + "epoch": 2.761904761904762, + "grad_norm": 0.05474567040801048, + "learning_rate": 2.3005534512372106e-06, + "loss": 0.9155768752098083, + "step": 1508 + }, + { + "epoch": 2.7655677655677655, + "grad_norm": 0.4082033932209015, + "learning_rate": 2.2914957891197182e-06, + "loss": 0.9890879988670349, + "step": 1510 + }, + { + "epoch": 2.769230769230769, + "grad_norm": 0.2753402292728424, + "learning_rate": 2.2825744602372506e-06, + "loss": 0.9224788546562195, + "step": 1512 + }, + { + "epoch": 2.772893772893773, + "grad_norm": 0.3398146629333496, + "learning_rate": 2.2737896042548537e-06, + "loss": 1.0887320041656494, + "step": 1514 + }, + { + "epoch": 2.7765567765567765, + "grad_norm": 0.31702274084091187, + "learning_rate": 2.2651413587010634e-06, + "loss": 1.0131077766418457, + "step": 1516 + }, + { + "epoch": 2.78021978021978, + "grad_norm": 0.06860560923814774, + "learning_rate": 2.2566298589657546e-06, + "loss": 0.9779605865478516, + "step": 1518 + }, + { + "epoch": 2.7838827838827838, + "grad_norm": 0.3455958664417267, + "learning_rate": 2.2482552382980194e-06, + "loss": 0.5144427418708801, + "step": 1520 + }, + { + "epoch": 2.7875457875457874, + "grad_norm": 0.2682342231273651, + "learning_rate": 2.240017627804088e-06, + "loss": 0.7746493816375732, + "step": 1522 + }, + { + "epoch": 2.791208791208791, + "grad_norm": 0.24560944736003876, + "learning_rate": 2.231917156445265e-06, + "loss": 0.825928807258606, + "step": 1524 + }, + { + "epoch": 2.7948717948717947, + "grad_norm": 0.5874441862106323, + "learning_rate": 2.223953951035919e-06, + "loss": 1.140251874923706, + "step": 1526 + }, + { + "epoch": 2.7985347985347984, + "grad_norm": 0.2171962410211563, + "learning_rate": 2.216128136241497e-06, + "loss": 1.0953679084777832, + "step": 1528 + }, + { + "epoch": 2.802197802197802, + "grad_norm": 0.5042185187339783, + "learning_rate": 2.208439834576568e-06, + "loss": 1.093937635421753, + "step": 1530 + }, + { + "epoch": 2.8058608058608057, + "grad_norm": 0.2920451760292053, + "learning_rate": 2.200889166402908e-06, + "loss": 0.9886079430580139, + "step": 1532 + }, + { + "epoch": 2.8095238095238093, + "grad_norm": 0.5010365843772888, + "learning_rate": 2.193476249927617e-06, + "loss": 0.7637280821800232, + "step": 1534 + }, + { + "epoch": 2.813186813186813, + "grad_norm": 0.313466340303421, + "learning_rate": 2.1862012012012647e-06, + "loss": 1.2877520322799683, + "step": 1536 + }, + { + "epoch": 2.8168498168498166, + "grad_norm": 0.2607482373714447, + "learning_rate": 2.179064134116078e-06, + "loss": 0.9093769192695618, + "step": 1538 + }, + { + "epoch": 2.8205128205128203, + "grad_norm": 0.9507859349250793, + "learning_rate": 2.1720651604041543e-06, + "loss": 0.7832710146903992, + "step": 1540 + }, + { + "epoch": 2.824175824175824, + "grad_norm": 0.5445396900177002, + "learning_rate": 2.1652043896357132e-06, + "loss": 0.8669724464416504, + "step": 1542 + }, + { + "epoch": 2.8278388278388276, + "grad_norm": 0.2889232933521271, + "learning_rate": 2.1584819292173844e-06, + "loss": 1.0738338232040405, + "step": 1544 + }, + { + "epoch": 2.8315018315018317, + "grad_norm": 0.3314497172832489, + "learning_rate": 2.1518978843905204e-06, + "loss": 1.120241641998291, + "step": 1546 + }, + { + "epoch": 2.8351648351648353, + "grad_norm": 0.43042734265327454, + "learning_rate": 2.1454523582295567e-06, + "loss": 1.1897751092910767, + "step": 1548 + }, + { + "epoch": 2.838827838827839, + "grad_norm": 0.41529417037963867, + "learning_rate": 2.1391454516403876e-06, + "loss": 0.8347918391227722, + "step": 1550 + }, + { + "epoch": 2.8424908424908426, + "grad_norm": 0.06782124936580658, + "learning_rate": 2.1329772633587976e-06, + "loss": 0.5578287839889526, + "step": 1552 + }, + { + "epoch": 2.8461538461538463, + "grad_norm": 0.28071868419647217, + "learning_rate": 2.1269478899489068e-06, + "loss": 1.1331698894500732, + "step": 1554 + }, + { + "epoch": 2.84981684981685, + "grad_norm": 4.985344409942627, + "learning_rate": 2.1210574258016675e-06, + "loss": 0.910840630531311, + "step": 1556 + }, + { + "epoch": 2.8534798534798536, + "grad_norm": 5.596924781799316, + "learning_rate": 2.1153059631333785e-06, + "loss": 1.0615748167037964, + "step": 1558 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.5638831853866577, + "learning_rate": 2.1096935919842434e-06, + "loss": 0.5931094288825989, + "step": 1560 + }, + { + "epoch": 2.860805860805861, + "grad_norm": 0.3857717514038086, + "learning_rate": 2.104220400216967e-06, + "loss": 0.7401585578918457, + "step": 1562 + }, + { + "epoch": 2.8644688644688645, + "grad_norm": 0.32050344347953796, + "learning_rate": 2.0988864735153724e-06, + "loss": 0.8747660517692566, + "step": 1564 + }, + { + "epoch": 2.868131868131868, + "grad_norm": 1.977245569229126, + "learning_rate": 2.0936918953830633e-06, + "loss": 0.6961140036582947, + "step": 1566 + }, + { + "epoch": 2.871794871794872, + "grad_norm": 0.33985045552253723, + "learning_rate": 2.088636747142114e-06, + "loss": 0.7156071662902832, + "step": 1568 + }, + { + "epoch": 2.8754578754578755, + "grad_norm": 0.1318127065896988, + "learning_rate": 2.083721107931803e-06, + "loss": 0.7040363550186157, + "step": 1570 + }, + { + "epoch": 2.879120879120879, + "grad_norm": 0.434373676776886, + "learning_rate": 2.0789450547073634e-06, + "loss": 0.6764128804206848, + "step": 1572 + }, + { + "epoch": 2.8827838827838828, + "grad_norm": 0.7847844958305359, + "learning_rate": 2.074308662238789e-06, + "loss": 0.9994201064109802, + "step": 1574 + }, + { + "epoch": 2.8864468864468864, + "grad_norm": 5.1912055015563965, + "learning_rate": 2.069812003109654e-06, + "loss": 0.9506621956825256, + "step": 1576 + }, + { + "epoch": 2.89010989010989, + "grad_norm": 0.22814835608005524, + "learning_rate": 2.0654551477159868e-06, + "loss": 0.8836541175842285, + "step": 1578 + }, + { + "epoch": 2.8937728937728937, + "grad_norm": 1.1357293128967285, + "learning_rate": 2.0612381642651584e-06, + "loss": 1.1715425252914429, + "step": 1580 + }, + { + "epoch": 2.8974358974358974, + "grad_norm": 0.6360121369361877, + "learning_rate": 2.057161118774821e-06, + "loss": 0.8418415784835815, + "step": 1582 + }, + { + "epoch": 2.901098901098901, + "grad_norm": 0.03927984833717346, + "learning_rate": 2.05322407507187e-06, + "loss": 0.7738023996353149, + "step": 1584 + }, + { + "epoch": 2.9047619047619047, + "grad_norm": 0.14784355461597443, + "learning_rate": 2.0494270947914507e-06, + "loss": 0.785838782787323, + "step": 1586 + }, + { + "epoch": 2.9084249084249083, + "grad_norm": 0.17373321950435638, + "learning_rate": 2.0457702373759864e-06, + "loss": 1.1310051679611206, + "step": 1588 + }, + { + "epoch": 2.912087912087912, + "grad_norm": 0.2693270444869995, + "learning_rate": 2.0422535600742526e-06, + "loss": 1.1932168006896973, + "step": 1590 + }, + { + "epoch": 2.9157509157509156, + "grad_norm": 0.29240870475769043, + "learning_rate": 2.03887711794048e-06, + "loss": 0.5143145322799683, + "step": 1592 + }, + { + "epoch": 2.9194139194139193, + "grad_norm": 0.2930733859539032, + "learning_rate": 2.0356409638334902e-06, + "loss": 1.1435730457305908, + "step": 1594 + }, + { + "epoch": 2.9230769230769234, + "grad_norm": 0.4619033634662628, + "learning_rate": 2.032545148415871e-06, + "loss": 0.7462179064750671, + "step": 1596 + }, + { + "epoch": 2.926739926739927, + "grad_norm": 0.5609884858131409, + "learning_rate": 2.0295897201531838e-06, + "loss": 1.1510316133499146, + "step": 1598 + }, + { + "epoch": 2.9304029304029307, + "grad_norm": 0.1557919830083847, + "learning_rate": 2.026774725313199e-06, + "loss": 0.7830919623374939, + "step": 1600 + }, + { + "epoch": 2.9340659340659343, + "grad_norm": 0.5669822692871094, + "learning_rate": 2.0241002079651803e-06, + "loss": 1.1249396800994873, + "step": 1602 + }, + { + "epoch": 2.937728937728938, + "grad_norm": 0.2051316201686859, + "learning_rate": 2.0215662099791874e-06, + "loss": 0.8561792969703674, + "step": 1604 + }, + { + "epoch": 2.9413919413919416, + "grad_norm": 0.621616005897522, + "learning_rate": 2.019172771025426e-06, + "loss": 1.1106517314910889, + "step": 1606 + }, + { + "epoch": 2.9450549450549453, + "grad_norm": 0.3406311869621277, + "learning_rate": 2.0169199285736234e-06, + "loss": 0.6978309154510498, + "step": 1608 + }, + { + "epoch": 2.948717948717949, + "grad_norm": 0.1934393346309662, + "learning_rate": 2.0148077178924412e-06, + "loss": 1.0018409490585327, + "step": 1610 + }, + { + "epoch": 2.9523809523809526, + "grad_norm": 0.5513986349105835, + "learning_rate": 2.0128361720489263e-06, + "loss": 0.868675708770752, + "step": 1612 + }, + { + "epoch": 2.956043956043956, + "grad_norm": 0.5031806826591492, + "learning_rate": 2.0110053219079927e-06, + "loss": 0.6836384534835815, + "step": 1614 + }, + { + "epoch": 2.95970695970696, + "grad_norm": 1.7091178894042969, + "learning_rate": 2.009315196131934e-06, + "loss": 0.8271127343177795, + "step": 1616 + }, + { + "epoch": 2.9633699633699635, + "grad_norm": 0.2841132581233978, + "learning_rate": 2.0077658211799823e-06, + "loss": 1.3688499927520752, + "step": 1618 + }, + { + "epoch": 2.967032967032967, + "grad_norm": 0.20573776960372925, + "learning_rate": 2.0063572213078856e-06, + "loss": 1.2264219522476196, + "step": 1620 + }, + { + "epoch": 2.970695970695971, + "grad_norm": 0.29216548800468445, + "learning_rate": 2.0050894185675354e-06, + "loss": 0.9183750152587891, + "step": 1622 + }, + { + "epoch": 2.9743589743589745, + "grad_norm": 0.47401827573776245, + "learning_rate": 2.0039624328066154e-06, + "loss": 0.7726737260818481, + "step": 1624 + }, + { + "epoch": 2.978021978021978, + "grad_norm": 0.17664581537246704, + "learning_rate": 2.0029762816682963e-06, + "loss": 0.8954548239707947, + "step": 1626 + }, + { + "epoch": 2.9816849816849818, + "grad_norm": 0.2111157476902008, + "learning_rate": 2.0021309805909546e-06, + "loss": 0.9459641575813293, + "step": 1628 + }, + { + "epoch": 2.9853479853479854, + "grad_norm": 0.19905273616313934, + "learning_rate": 2.001426542807935e-06, + "loss": 1.3968180418014526, + "step": 1630 + }, + { + "epoch": 2.989010989010989, + "grad_norm": 0.42986905574798584, + "learning_rate": 2.000862979347339e-06, + "loss": 1.0010343790054321, + "step": 1632 + }, + { + "epoch": 2.9926739926739927, + "grad_norm": 0.26831236481666565, + "learning_rate": 2.0004402990318574e-06, + "loss": 0.7905582189559937, + "step": 1634 + }, + { + "epoch": 2.9963369963369964, + "grad_norm": 0.35902222990989685, + "learning_rate": 2.000158508478629e-06, + "loss": 1.2652593851089478, + "step": 1636 + }, + { + "epoch": 3.0, + "grad_norm": 0.369983971118927, + "learning_rate": 2.0000176120991345e-06, + "loss": 1.0647433996200562, + "step": 1638 + }, + { + "epoch": 3.0, + "step": 1638, + "total_flos": 8.4482141520606e+18, + "train_loss": 1.0226961683586548, + "train_runtime": 55136.4168, + "train_samples_per_second": 0.713, + "train_steps_per_second": 0.03 + } + ], + "logging_steps": 2, + "max_steps": 1638, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 99999, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.4482141520606e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}