diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13578 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 19351, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000516769159216578, + "grad_norm": 2.2854151725769043, + "learning_rate": 2.58384579608289e-05, + "loss": 8.3208, + "step": 10 + }, + { + "epoch": 0.001033538318433156, + "grad_norm": 1.7825466394424438, + "learning_rate": 5.16769159216578e-05, + "loss": 7.8091, + "step": 20 + }, + { + "epoch": 0.001550307477649734, + "grad_norm": 1.5718376636505127, + "learning_rate": 7.751537388248669e-05, + "loss": 7.23, + "step": 30 + }, + { + "epoch": 0.002067076636866312, + "grad_norm": 0.9627140164375305, + "learning_rate": 0.0001033538318433156, + "loss": 6.6397, + "step": 40 + }, + { + "epoch": 0.0025838457960828897, + "grad_norm": 0.7302472591400146, + "learning_rate": 0.0001291922898041445, + "loss": 6.2658, + "step": 50 + }, + { + "epoch": 0.003100614955299468, + "grad_norm": 0.6241187453269958, + "learning_rate": 0.00015503074776497338, + "loss": 5.9946, + "step": 60 + }, + { + "epoch": 0.0036173841145160455, + "grad_norm": 1.97952139377594, + "learning_rate": 0.0001808692057258023, + "loss": 5.7746, + "step": 70 + }, + { + "epoch": 0.004134153273732624, + "grad_norm": 0.8876756429672241, + "learning_rate": 0.0002067076636866312, + "loss": 5.6108, + "step": 80 + }, + { + "epoch": 0.004650922432949202, + "grad_norm": 0.9505946040153503, + "learning_rate": 0.00023254612164746007, + "loss": 5.4343, + "step": 90 + }, + { + "epoch": 0.005167691592165779, + "grad_norm": 0.8270168304443359, + "learning_rate": 0.000258384579608289, + "loss": 5.1914, + "step": 100 + }, + { + "epoch": 0.005684460751382358, + "grad_norm": 1.4167860746383667, + "learning_rate": 0.0002842230375691179, + "loss": 4.9526, + "step": 110 + }, + { + "epoch": 0.006201229910598936, + "grad_norm": 0.8067450523376465, + "learning_rate": 0.00031006149552994676, + "loss": 4.7455, + "step": 120 + }, + { + "epoch": 0.006717999069815513, + "grad_norm": 1.0726768970489502, + "learning_rate": 0.0003358999534907757, + "loss": 4.5333, + "step": 130 + }, + { + "epoch": 0.007234768229032091, + "grad_norm": 0.9126196503639221, + "learning_rate": 0.0003617384114516046, + "loss": 4.3803, + "step": 140 + }, + { + "epoch": 0.0077515373882486695, + "grad_norm": 1.0933046340942383, + "learning_rate": 0.0003875768694124335, + "loss": 4.2341, + "step": 150 + }, + { + "epoch": 0.008268306547465248, + "grad_norm": 1.2751045227050781, + "learning_rate": 0.0004134153273732624, + "loss": 4.1177, + "step": 160 + }, + { + "epoch": 0.008785075706681826, + "grad_norm": 1.0147113800048828, + "learning_rate": 0.0004392537853340913, + "loss": 4.006, + "step": 170 + }, + { + "epoch": 0.009301844865898403, + "grad_norm": 1.2339316606521606, + "learning_rate": 0.00046509224329492014, + "loss": 3.9218, + "step": 180 + }, + { + "epoch": 0.009818614025114981, + "grad_norm": 1.2789371013641357, + "learning_rate": 0.0004909307012557491, + "loss": 3.8193, + "step": 190 + }, + { + "epoch": 0.010335383184331559, + "grad_norm": 1.337181568145752, + "learning_rate": 0.0005, + "loss": 3.729, + "step": 200 + }, + { + "epoch": 0.010852152343548136, + "grad_norm": 1.29481840133667, + "learning_rate": 0.0005, + "loss": 3.6638, + "step": 210 + }, + { + "epoch": 0.011368921502764716, + "grad_norm": 1.2725201845169067, + "learning_rate": 0.0005, + "loss": 3.6127, + "step": 220 + }, + { + "epoch": 0.011885690661981294, + "grad_norm": 1.4627310037612915, + "learning_rate": 0.0005, + "loss": 3.5382, + "step": 230 + }, + { + "epoch": 0.012402459821197871, + "grad_norm": 1.0869964361190796, + "learning_rate": 0.0005, + "loss": 3.5016, + "step": 240 + }, + { + "epoch": 0.012919228980414449, + "grad_norm": 1.142592191696167, + "learning_rate": 0.0005, + "loss": 3.4392, + "step": 250 + }, + { + "epoch": 0.013435998139631027, + "grad_norm": 1.0130606889724731, + "learning_rate": 0.0005, + "loss": 3.3923, + "step": 260 + }, + { + "epoch": 0.013952767298847604, + "grad_norm": 1.2244436740875244, + "learning_rate": 0.0005, + "loss": 3.3526, + "step": 270 + }, + { + "epoch": 0.014469536458064182, + "grad_norm": 1.164255976676941, + "learning_rate": 0.0005, + "loss": 3.2819, + "step": 280 + }, + { + "epoch": 0.014986305617280761, + "grad_norm": 0.9863433241844177, + "learning_rate": 0.0005, + "loss": 3.2612, + "step": 290 + }, + { + "epoch": 0.015503074776497339, + "grad_norm": 1.1939715147018433, + "learning_rate": 0.0005, + "loss": 3.2136, + "step": 300 + }, + { + "epoch": 0.016019843935713915, + "grad_norm": 1.0233927965164185, + "learning_rate": 0.0005, + "loss": 3.1914, + "step": 310 + }, + { + "epoch": 0.016536613094930496, + "grad_norm": 1.3167331218719482, + "learning_rate": 0.0005, + "loss": 3.1614, + "step": 320 + }, + { + "epoch": 0.017053382254147074, + "grad_norm": 1.10019850730896, + "learning_rate": 0.0005, + "loss": 3.1133, + "step": 330 + }, + { + "epoch": 0.01757015141336365, + "grad_norm": 1.267260193824768, + "learning_rate": 0.0005, + "loss": 3.0851, + "step": 340 + }, + { + "epoch": 0.01808692057258023, + "grad_norm": 1.134069800376892, + "learning_rate": 0.0005, + "loss": 3.0513, + "step": 350 + }, + { + "epoch": 0.018603689731796807, + "grad_norm": 1.3286008834838867, + "learning_rate": 0.0005, + "loss": 3.0269, + "step": 360 + }, + { + "epoch": 0.019120458891013385, + "grad_norm": 1.3457223176956177, + "learning_rate": 0.0005, + "loss": 2.9893, + "step": 370 + }, + { + "epoch": 0.019637228050229962, + "grad_norm": 1.254515290260315, + "learning_rate": 0.0005, + "loss": 2.9698, + "step": 380 + }, + { + "epoch": 0.02015399720944654, + "grad_norm": 1.1265138387680054, + "learning_rate": 0.0005, + "loss": 2.9409, + "step": 390 + }, + { + "epoch": 0.020670766368663118, + "grad_norm": 1.1716142892837524, + "learning_rate": 0.0005, + "loss": 2.9289, + "step": 400 + }, + { + "epoch": 0.021187535527879695, + "grad_norm": 1.7391645908355713, + "learning_rate": 0.0005, + "loss": 2.9168, + "step": 410 + }, + { + "epoch": 0.021704304687096273, + "grad_norm": 1.2796969413757324, + "learning_rate": 0.0005, + "loss": 2.8871, + "step": 420 + }, + { + "epoch": 0.02222107384631285, + "grad_norm": 1.192845106124878, + "learning_rate": 0.0005, + "loss": 2.85, + "step": 430 + }, + { + "epoch": 0.02273784300552943, + "grad_norm": 1.2399319410324097, + "learning_rate": 0.0005, + "loss": 2.8469, + "step": 440 + }, + { + "epoch": 0.02325461216474601, + "grad_norm": 2.0635640621185303, + "learning_rate": 0.0005, + "loss": 2.8174, + "step": 450 + }, + { + "epoch": 0.023771381323962587, + "grad_norm": 1.2288042306900024, + "learning_rate": 0.0005, + "loss": 2.8065, + "step": 460 + }, + { + "epoch": 0.024288150483179165, + "grad_norm": 1.1476280689239502, + "learning_rate": 0.0005, + "loss": 2.7702, + "step": 470 + }, + { + "epoch": 0.024804919642395742, + "grad_norm": 1.0889836549758911, + "learning_rate": 0.0005, + "loss": 2.7561, + "step": 480 + }, + { + "epoch": 0.02532168880161232, + "grad_norm": 1.2561166286468506, + "learning_rate": 0.0005, + "loss": 2.7407, + "step": 490 + }, + { + "epoch": 0.025838457960828898, + "grad_norm": 1.105919599533081, + "learning_rate": 0.0005, + "loss": 2.7356, + "step": 500 + }, + { + "epoch": 0.026355227120045475, + "grad_norm": 1.2789875268936157, + "learning_rate": 0.0005, + "loss": 2.6941, + "step": 510 + }, + { + "epoch": 0.026871996279262053, + "grad_norm": 1.2486238479614258, + "learning_rate": 0.0005, + "loss": 2.7009, + "step": 520 + }, + { + "epoch": 0.02738876543847863, + "grad_norm": 1.3023815155029297, + "learning_rate": 0.0005, + "loss": 2.6761, + "step": 530 + }, + { + "epoch": 0.02790553459769521, + "grad_norm": 1.3703725337982178, + "learning_rate": 0.0005, + "loss": 2.659, + "step": 540 + }, + { + "epoch": 0.028422303756911786, + "grad_norm": 1.1285632848739624, + "learning_rate": 0.0005, + "loss": 2.6444, + "step": 550 + }, + { + "epoch": 0.028939072916128364, + "grad_norm": 1.6290286779403687, + "learning_rate": 0.0005, + "loss": 2.607, + "step": 560 + }, + { + "epoch": 0.029455842075344945, + "grad_norm": 1.2040055990219116, + "learning_rate": 0.0005, + "loss": 2.6061, + "step": 570 + }, + { + "epoch": 0.029972611234561523, + "grad_norm": 1.13448166847229, + "learning_rate": 0.0005, + "loss": 2.5709, + "step": 580 + }, + { + "epoch": 0.0304893803937781, + "grad_norm": 1.9924914836883545, + "learning_rate": 0.0005, + "loss": 2.5914, + "step": 590 + }, + { + "epoch": 0.031006149552994678, + "grad_norm": 1.6680738925933838, + "learning_rate": 0.0005, + "loss": 2.5523, + "step": 600 + }, + { + "epoch": 0.03152291871221125, + "grad_norm": 1.5603681802749634, + "learning_rate": 0.0005, + "loss": 2.5157, + "step": 610 + }, + { + "epoch": 0.03203968787142783, + "grad_norm": 1.2942813634872437, + "learning_rate": 0.0005, + "loss": 2.4997, + "step": 620 + }, + { + "epoch": 0.032556457030644415, + "grad_norm": 1.4313350915908813, + "learning_rate": 0.0005, + "loss": 2.4648, + "step": 630 + }, + { + "epoch": 0.03307322618986099, + "grad_norm": 1.41900634765625, + "learning_rate": 0.0005, + "loss": 2.4899, + "step": 640 + }, + { + "epoch": 0.03358999534907757, + "grad_norm": 2.0101678371429443, + "learning_rate": 0.0005, + "loss": 2.4705, + "step": 650 + }, + { + "epoch": 0.03410676450829415, + "grad_norm": 1.567264437675476, + "learning_rate": 0.0005, + "loss": 2.4655, + "step": 660 + }, + { + "epoch": 0.034623533667510725, + "grad_norm": 1.504845142364502, + "learning_rate": 0.0005, + "loss": 2.4357, + "step": 670 + }, + { + "epoch": 0.0351403028267273, + "grad_norm": 1.3784253597259521, + "learning_rate": 0.0005, + "loss": 2.4484, + "step": 680 + }, + { + "epoch": 0.03565707198594388, + "grad_norm": 1.2612375020980835, + "learning_rate": 0.0005, + "loss": 2.4108, + "step": 690 + }, + { + "epoch": 0.03617384114516046, + "grad_norm": 1.2809542417526245, + "learning_rate": 0.0005, + "loss": 2.429, + "step": 700 + }, + { + "epoch": 0.036690610304377036, + "grad_norm": 1.282116174697876, + "learning_rate": 0.0005, + "loss": 2.4136, + "step": 710 + }, + { + "epoch": 0.037207379463593614, + "grad_norm": 2.6279282569885254, + "learning_rate": 0.0005, + "loss": 2.4043, + "step": 720 + }, + { + "epoch": 0.03772414862281019, + "grad_norm": 1.158665657043457, + "learning_rate": 0.0005, + "loss": 2.3666, + "step": 730 + }, + { + "epoch": 0.03824091778202677, + "grad_norm": 1.0581611394882202, + "learning_rate": 0.0005, + "loss": 2.3588, + "step": 740 + }, + { + "epoch": 0.03875768694124335, + "grad_norm": 1.225664496421814, + "learning_rate": 0.0005, + "loss": 2.3499, + "step": 750 + }, + { + "epoch": 0.039274456100459924, + "grad_norm": 1.9708060026168823, + "learning_rate": 0.0005, + "loss": 2.3462, + "step": 760 + }, + { + "epoch": 0.0397912252596765, + "grad_norm": 1.856858730316162, + "learning_rate": 0.0005, + "loss": 2.3443, + "step": 770 + }, + { + "epoch": 0.04030799441889308, + "grad_norm": 1.4437602758407593, + "learning_rate": 0.0005, + "loss": 2.289, + "step": 780 + }, + { + "epoch": 0.04082476357810966, + "grad_norm": 1.289876937866211, + "learning_rate": 0.0005, + "loss": 2.2881, + "step": 790 + }, + { + "epoch": 0.041341532737326235, + "grad_norm": 1.2440109252929688, + "learning_rate": 0.0005, + "loss": 2.2639, + "step": 800 + }, + { + "epoch": 0.04185830189654281, + "grad_norm": 1.2062422037124634, + "learning_rate": 0.0005, + "loss": 2.263, + "step": 810 + }, + { + "epoch": 0.04237507105575939, + "grad_norm": 1.171238660812378, + "learning_rate": 0.0005, + "loss": 2.259, + "step": 820 + }, + { + "epoch": 0.04289184021497597, + "grad_norm": 1.1757316589355469, + "learning_rate": 0.0005, + "loss": 2.2527, + "step": 830 + }, + { + "epoch": 0.043408609374192546, + "grad_norm": 1.3125736713409424, + "learning_rate": 0.0005, + "loss": 2.2429, + "step": 840 + }, + { + "epoch": 0.04392537853340912, + "grad_norm": 1.2145166397094727, + "learning_rate": 0.0005, + "loss": 2.2615, + "step": 850 + }, + { + "epoch": 0.0444421476926257, + "grad_norm": 1.3434226512908936, + "learning_rate": 0.0005, + "loss": 2.2398, + "step": 860 + }, + { + "epoch": 0.04495891685184228, + "grad_norm": 1.5729234218597412, + "learning_rate": 0.0005, + "loss": 2.2263, + "step": 870 + }, + { + "epoch": 0.04547568601105886, + "grad_norm": 1.4341133832931519, + "learning_rate": 0.0005, + "loss": 2.193, + "step": 880 + }, + { + "epoch": 0.04599245517027544, + "grad_norm": 1.1859130859375, + "learning_rate": 0.0005, + "loss": 2.1928, + "step": 890 + }, + { + "epoch": 0.04650922432949202, + "grad_norm": 1.8495599031448364, + "learning_rate": 0.0005, + "loss": 2.182, + "step": 900 + }, + { + "epoch": 0.047025993488708596, + "grad_norm": 1.416601538658142, + "learning_rate": 0.0005, + "loss": 2.1748, + "step": 910 + }, + { + "epoch": 0.047542762647925174, + "grad_norm": 1.2966763973236084, + "learning_rate": 0.0005, + "loss": 2.1481, + "step": 920 + }, + { + "epoch": 0.04805953180714175, + "grad_norm": 1.499526858329773, + "learning_rate": 0.0005, + "loss": 2.1442, + "step": 930 + }, + { + "epoch": 0.04857630096635833, + "grad_norm": 1.2409976720809937, + "learning_rate": 0.0005, + "loss": 2.1788, + "step": 940 + }, + { + "epoch": 0.04909307012557491, + "grad_norm": 1.2645729780197144, + "learning_rate": 0.0005, + "loss": 2.1577, + "step": 950 + }, + { + "epoch": 0.049609839284791485, + "grad_norm": 1.297904372215271, + "learning_rate": 0.0005, + "loss": 2.1412, + "step": 960 + }, + { + "epoch": 0.05012660844400806, + "grad_norm": 1.3194257020950317, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 970 + }, + { + "epoch": 0.05064337760322464, + "grad_norm": 1.4184504747390747, + "learning_rate": 0.0005, + "loss": 2.1414, + "step": 980 + }, + { + "epoch": 0.05116014676244122, + "grad_norm": 2.1643896102905273, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 990 + }, + { + "epoch": 0.051676915921657796, + "grad_norm": 2.3233911991119385, + "learning_rate": 0.0005, + "loss": 2.0972, + "step": 1000 + }, + { + "epoch": 0.05219368508087437, + "grad_norm": 2.2393977642059326, + "learning_rate": 0.0005, + "loss": 2.0906, + "step": 1010 + }, + { + "epoch": 0.05271045424009095, + "grad_norm": 1.298572063446045, + "learning_rate": 0.0005, + "loss": 2.0664, + "step": 1020 + }, + { + "epoch": 0.05322722339930753, + "grad_norm": 1.8283194303512573, + "learning_rate": 0.0005, + "loss": 2.0726, + "step": 1030 + }, + { + "epoch": 0.053743992558524106, + "grad_norm": 1.4193987846374512, + "learning_rate": 0.0005, + "loss": 2.0643, + "step": 1040 + }, + { + "epoch": 0.054260761717740684, + "grad_norm": 1.3053640127182007, + "learning_rate": 0.0005, + "loss": 2.0124, + "step": 1050 + }, + { + "epoch": 0.05477753087695726, + "grad_norm": 1.598849892616272, + "learning_rate": 0.0005, + "loss": 2.0588, + "step": 1060 + }, + { + "epoch": 0.05529430003617384, + "grad_norm": 1.3975911140441895, + "learning_rate": 0.0005, + "loss": 2.0349, + "step": 1070 + }, + { + "epoch": 0.05581106919539042, + "grad_norm": 1.1616796255111694, + "learning_rate": 0.0005, + "loss": 2.0226, + "step": 1080 + }, + { + "epoch": 0.056327838354606995, + "grad_norm": 1.3756109476089478, + "learning_rate": 0.0005, + "loss": 2.0305, + "step": 1090 + }, + { + "epoch": 0.05684460751382357, + "grad_norm": 1.4646899700164795, + "learning_rate": 0.0005, + "loss": 2.0329, + "step": 1100 + }, + { + "epoch": 0.05736137667304015, + "grad_norm": 1.215995192527771, + "learning_rate": 0.0005, + "loss": 2.039, + "step": 1110 + }, + { + "epoch": 0.05787814583225673, + "grad_norm": 1.6965135335922241, + "learning_rate": 0.0005, + "loss": 2.0142, + "step": 1120 + }, + { + "epoch": 0.05839491499147331, + "grad_norm": 1.466676950454712, + "learning_rate": 0.0005, + "loss": 2.0231, + "step": 1130 + }, + { + "epoch": 0.05891168415068989, + "grad_norm": 1.5624445676803589, + "learning_rate": 0.0005, + "loss": 1.9893, + "step": 1140 + }, + { + "epoch": 0.05942845330990647, + "grad_norm": 1.3193562030792236, + "learning_rate": 0.0005, + "loss": 1.9744, + "step": 1150 + }, + { + "epoch": 0.059945222469123045, + "grad_norm": 1.1906757354736328, + "learning_rate": 0.0005, + "loss": 1.9866, + "step": 1160 + }, + { + "epoch": 0.06046199162833962, + "grad_norm": 2.6814322471618652, + "learning_rate": 0.0005, + "loss": 1.9743, + "step": 1170 + }, + { + "epoch": 0.0609787607875562, + "grad_norm": 1.7349072694778442, + "learning_rate": 0.0005, + "loss": 1.9815, + "step": 1180 + }, + { + "epoch": 0.06149552994677278, + "grad_norm": 2.099928617477417, + "learning_rate": 0.0005, + "loss": 1.9469, + "step": 1190 + }, + { + "epoch": 0.062012299105989356, + "grad_norm": 1.141414999961853, + "learning_rate": 0.0005, + "loss": 1.9422, + "step": 1200 + }, + { + "epoch": 0.06252906826520593, + "grad_norm": 1.1726713180541992, + "learning_rate": 0.0005, + "loss": 1.956, + "step": 1210 + }, + { + "epoch": 0.0630458374244225, + "grad_norm": 1.7521125078201294, + "learning_rate": 0.0005, + "loss": 1.9423, + "step": 1220 + }, + { + "epoch": 0.06356260658363909, + "grad_norm": 1.1364134550094604, + "learning_rate": 0.0005, + "loss": 1.9614, + "step": 1230 + }, + { + "epoch": 0.06407937574285566, + "grad_norm": 1.3168714046478271, + "learning_rate": 0.0005, + "loss": 1.9571, + "step": 1240 + }, + { + "epoch": 0.06459614490207224, + "grad_norm": 1.787176251411438, + "learning_rate": 0.0005, + "loss": 1.9566, + "step": 1250 + }, + { + "epoch": 0.06511291406128883, + "grad_norm": 1.2864240407943726, + "learning_rate": 0.0005, + "loss": 1.9349, + "step": 1260 + }, + { + "epoch": 0.0656296832205054, + "grad_norm": 1.5742415189743042, + "learning_rate": 0.0005, + "loss": 1.9006, + "step": 1270 + }, + { + "epoch": 0.06614645237972198, + "grad_norm": 1.349160075187683, + "learning_rate": 0.0005, + "loss": 1.9083, + "step": 1280 + }, + { + "epoch": 0.06666322153893856, + "grad_norm": 2.4660980701446533, + "learning_rate": 0.0005, + "loss": 1.9296, + "step": 1290 + }, + { + "epoch": 0.06717999069815514, + "grad_norm": 2.0999865531921387, + "learning_rate": 0.0005, + "loss": 1.876, + "step": 1300 + }, + { + "epoch": 0.06769675985737171, + "grad_norm": 1.3416152000427246, + "learning_rate": 0.0005, + "loss": 1.8874, + "step": 1310 + }, + { + "epoch": 0.0682135290165883, + "grad_norm": 1.2447683811187744, + "learning_rate": 0.0005, + "loss": 1.8892, + "step": 1320 + }, + { + "epoch": 0.06873029817580487, + "grad_norm": 1.1316670179367065, + "learning_rate": 0.0005, + "loss": 1.8754, + "step": 1330 + }, + { + "epoch": 0.06924706733502145, + "grad_norm": 1.354366421699524, + "learning_rate": 0.0005, + "loss": 1.8778, + "step": 1340 + }, + { + "epoch": 0.06976383649423802, + "grad_norm": 1.7485758066177368, + "learning_rate": 0.0005, + "loss": 1.8982, + "step": 1350 + }, + { + "epoch": 0.0702806056534546, + "grad_norm": 1.5367459058761597, + "learning_rate": 0.0005, + "loss": 1.8862, + "step": 1360 + }, + { + "epoch": 0.07079737481267118, + "grad_norm": 1.112423062324524, + "learning_rate": 0.0005, + "loss": 1.8748, + "step": 1370 + }, + { + "epoch": 0.07131414397188776, + "grad_norm": 1.269370436668396, + "learning_rate": 0.0005, + "loss": 1.8389, + "step": 1380 + }, + { + "epoch": 0.07183091313110433, + "grad_norm": 1.6045186519622803, + "learning_rate": 0.0005, + "loss": 1.8814, + "step": 1390 + }, + { + "epoch": 0.07234768229032092, + "grad_norm": 2.199096918106079, + "learning_rate": 0.0005, + "loss": 1.8576, + "step": 1400 + }, + { + "epoch": 0.07286445144953749, + "grad_norm": 1.2949317693710327, + "learning_rate": 0.0005, + "loss": 1.837, + "step": 1410 + }, + { + "epoch": 0.07338122060875407, + "grad_norm": 1.2082808017730713, + "learning_rate": 0.0005, + "loss": 1.8191, + "step": 1420 + }, + { + "epoch": 0.07389798976797064, + "grad_norm": 1.632419466972351, + "learning_rate": 0.0005, + "loss": 1.8192, + "step": 1430 + }, + { + "epoch": 0.07441475892718723, + "grad_norm": 1.1036083698272705, + "learning_rate": 0.0005, + "loss": 1.8046, + "step": 1440 + }, + { + "epoch": 0.0749315280864038, + "grad_norm": 1.6769006252288818, + "learning_rate": 0.0005, + "loss": 1.8174, + "step": 1450 + }, + { + "epoch": 0.07544829724562038, + "grad_norm": 1.587368130683899, + "learning_rate": 0.0005, + "loss": 1.8311, + "step": 1460 + }, + { + "epoch": 0.07596506640483695, + "grad_norm": 1.063362956047058, + "learning_rate": 0.0005, + "loss": 1.7886, + "step": 1470 + }, + { + "epoch": 0.07648183556405354, + "grad_norm": 1.258238673210144, + "learning_rate": 0.0005, + "loss": 1.8063, + "step": 1480 + }, + { + "epoch": 0.07699860472327011, + "grad_norm": 1.3020492792129517, + "learning_rate": 0.0005, + "loss": 1.8289, + "step": 1490 + }, + { + "epoch": 0.0775153738824867, + "grad_norm": 2.0609872341156006, + "learning_rate": 0.0005, + "loss": 1.7893, + "step": 1500 + }, + { + "epoch": 0.07803214304170328, + "grad_norm": 1.1070424318313599, + "learning_rate": 0.0005, + "loss": 1.768, + "step": 1510 + }, + { + "epoch": 0.07854891220091985, + "grad_norm": 2.055048704147339, + "learning_rate": 0.0005, + "loss": 1.7597, + "step": 1520 + }, + { + "epoch": 0.07906568136013643, + "grad_norm": 1.4444563388824463, + "learning_rate": 0.0005, + "loss": 1.7827, + "step": 1530 + }, + { + "epoch": 0.079582450519353, + "grad_norm": 1.388077735900879, + "learning_rate": 0.0005, + "loss": 1.7753, + "step": 1540 + }, + { + "epoch": 0.08009921967856959, + "grad_norm": 1.2297486066818237, + "learning_rate": 0.0005, + "loss": 1.7371, + "step": 1550 + }, + { + "epoch": 0.08061598883778616, + "grad_norm": 1.1055219173431396, + "learning_rate": 0.0005, + "loss": 1.7616, + "step": 1560 + }, + { + "epoch": 0.08113275799700274, + "grad_norm": 1.330352783203125, + "learning_rate": 0.0005, + "loss": 1.7753, + "step": 1570 + }, + { + "epoch": 0.08164952715621931, + "grad_norm": 1.0750646591186523, + "learning_rate": 0.0005, + "loss": 1.7551, + "step": 1580 + }, + { + "epoch": 0.0821662963154359, + "grad_norm": 1.1137466430664062, + "learning_rate": 0.0005, + "loss": 1.7686, + "step": 1590 + }, + { + "epoch": 0.08268306547465247, + "grad_norm": 1.2276798486709595, + "learning_rate": 0.0005, + "loss": 1.7617, + "step": 1600 + }, + { + "epoch": 0.08319983463386905, + "grad_norm": 1.0940239429473877, + "learning_rate": 0.0005, + "loss": 1.7269, + "step": 1610 + }, + { + "epoch": 0.08371660379308563, + "grad_norm": 1.1361453533172607, + "learning_rate": 0.0005, + "loss": 1.7481, + "step": 1620 + }, + { + "epoch": 0.08423337295230221, + "grad_norm": 1.482571005821228, + "learning_rate": 0.0005, + "loss": 1.7265, + "step": 1630 + }, + { + "epoch": 0.08475014211151878, + "grad_norm": 1.2309211492538452, + "learning_rate": 0.0005, + "loss": 1.7087, + "step": 1640 + }, + { + "epoch": 0.08526691127073537, + "grad_norm": 1.162300705909729, + "learning_rate": 0.0005, + "loss": 1.708, + "step": 1650 + }, + { + "epoch": 0.08578368042995194, + "grad_norm": 1.1956666707992554, + "learning_rate": 0.0005, + "loss": 1.73, + "step": 1660 + }, + { + "epoch": 0.08630044958916852, + "grad_norm": 1.5038352012634277, + "learning_rate": 0.0005, + "loss": 1.7213, + "step": 1670 + }, + { + "epoch": 0.08681721874838509, + "grad_norm": 1.2151919603347778, + "learning_rate": 0.0005, + "loss": 1.7224, + "step": 1680 + }, + { + "epoch": 0.08733398790760168, + "grad_norm": 1.0433135032653809, + "learning_rate": 0.0005, + "loss": 1.7049, + "step": 1690 + }, + { + "epoch": 0.08785075706681825, + "grad_norm": 1.8113486766815186, + "learning_rate": 0.0005, + "loss": 1.7132, + "step": 1700 + }, + { + "epoch": 0.08836752622603483, + "grad_norm": 0.9753373861312866, + "learning_rate": 0.0005, + "loss": 1.7109, + "step": 1710 + }, + { + "epoch": 0.0888842953852514, + "grad_norm": 1.355560064315796, + "learning_rate": 0.0005, + "loss": 1.7041, + "step": 1720 + }, + { + "epoch": 0.08940106454446799, + "grad_norm": 1.1716082096099854, + "learning_rate": 0.0005, + "loss": 1.7114, + "step": 1730 + }, + { + "epoch": 0.08991783370368456, + "grad_norm": 1.10747492313385, + "learning_rate": 0.0005, + "loss": 1.7068, + "step": 1740 + }, + { + "epoch": 0.09043460286290114, + "grad_norm": 1.0477211475372314, + "learning_rate": 0.0005, + "loss": 1.6935, + "step": 1750 + }, + { + "epoch": 0.09095137202211773, + "grad_norm": 1.1489983797073364, + "learning_rate": 0.0005, + "loss": 1.6976, + "step": 1760 + }, + { + "epoch": 0.0914681411813343, + "grad_norm": 1.2262177467346191, + "learning_rate": 0.0005, + "loss": 1.689, + "step": 1770 + }, + { + "epoch": 0.09198491034055088, + "grad_norm": 1.111374020576477, + "learning_rate": 0.0005, + "loss": 1.6811, + "step": 1780 + }, + { + "epoch": 0.09250167949976745, + "grad_norm": 1.0549476146697998, + "learning_rate": 0.0005, + "loss": 1.6539, + "step": 1790 + }, + { + "epoch": 0.09301844865898404, + "grad_norm": 1.2341543436050415, + "learning_rate": 0.0005, + "loss": 1.6643, + "step": 1800 + }, + { + "epoch": 0.09353521781820061, + "grad_norm": 1.6305192708969116, + "learning_rate": 0.0005, + "loss": 1.6553, + "step": 1810 + }, + { + "epoch": 0.09405198697741719, + "grad_norm": 1.0614426136016846, + "learning_rate": 0.0005, + "loss": 1.6474, + "step": 1820 + }, + { + "epoch": 0.09456875613663376, + "grad_norm": 1.1092963218688965, + "learning_rate": 0.0005, + "loss": 1.6686, + "step": 1830 + }, + { + "epoch": 0.09508552529585035, + "grad_norm": 1.521255373954773, + "learning_rate": 0.0005, + "loss": 1.6586, + "step": 1840 + }, + { + "epoch": 0.09560229445506692, + "grad_norm": 1.353458285331726, + "learning_rate": 0.0005, + "loss": 1.6632, + "step": 1850 + }, + { + "epoch": 0.0961190636142835, + "grad_norm": 1.0622385740280151, + "learning_rate": 0.0005, + "loss": 1.6417, + "step": 1860 + }, + { + "epoch": 0.09663583277350007, + "grad_norm": 1.1304274797439575, + "learning_rate": 0.0005, + "loss": 1.6374, + "step": 1870 + }, + { + "epoch": 0.09715260193271666, + "grad_norm": 1.6776567697525024, + "learning_rate": 0.0005, + "loss": 1.648, + "step": 1880 + }, + { + "epoch": 0.09766937109193323, + "grad_norm": 1.2316774129867554, + "learning_rate": 0.0005, + "loss": 1.6055, + "step": 1890 + }, + { + "epoch": 0.09818614025114981, + "grad_norm": 1.1291395425796509, + "learning_rate": 0.0005, + "loss": 1.6199, + "step": 1900 + }, + { + "epoch": 0.09870290941036639, + "grad_norm": 1.2423152923583984, + "learning_rate": 0.0005, + "loss": 1.6343, + "step": 1910 + }, + { + "epoch": 0.09921967856958297, + "grad_norm": 1.6953014135360718, + "learning_rate": 0.0005, + "loss": 1.6011, + "step": 1920 + }, + { + "epoch": 0.09973644772879954, + "grad_norm": 1.078352451324463, + "learning_rate": 0.0005, + "loss": 1.5773, + "step": 1930 + }, + { + "epoch": 0.10025321688801613, + "grad_norm": 1.1383408308029175, + "learning_rate": 0.0005, + "loss": 1.6175, + "step": 1940 + }, + { + "epoch": 0.1007699860472327, + "grad_norm": 0.998919665813446, + "learning_rate": 0.0005, + "loss": 1.6388, + "step": 1950 + }, + { + "epoch": 0.10128675520644928, + "grad_norm": 1.6332008838653564, + "learning_rate": 0.0005, + "loss": 1.5987, + "step": 1960 + }, + { + "epoch": 0.10180352436566585, + "grad_norm": 1.041397213935852, + "learning_rate": 0.0005, + "loss": 1.6043, + "step": 1970 + }, + { + "epoch": 0.10232029352488244, + "grad_norm": 1.1090408563613892, + "learning_rate": 0.0005, + "loss": 1.5859, + "step": 1980 + }, + { + "epoch": 0.102837062684099, + "grad_norm": 1.0914579629898071, + "learning_rate": 0.0005, + "loss": 1.6069, + "step": 1990 + }, + { + "epoch": 0.10335383184331559, + "grad_norm": 1.5213651657104492, + "learning_rate": 0.0005, + "loss": 1.5897, + "step": 2000 + }, + { + "epoch": 0.10387060100253218, + "grad_norm": 1.5415380001068115, + "learning_rate": 0.0005, + "loss": 1.6035, + "step": 2010 + }, + { + "epoch": 0.10438737016174875, + "grad_norm": 1.1095470190048218, + "learning_rate": 0.0005, + "loss": 1.5697, + "step": 2020 + }, + { + "epoch": 0.10490413932096533, + "grad_norm": 1.3773058652877808, + "learning_rate": 0.0005, + "loss": 1.5827, + "step": 2030 + }, + { + "epoch": 0.1054209084801819, + "grad_norm": 0.9746466279029846, + "learning_rate": 0.0005, + "loss": 1.5612, + "step": 2040 + }, + { + "epoch": 0.10593767763939849, + "grad_norm": 1.047061800956726, + "learning_rate": 0.0005, + "loss": 1.5707, + "step": 2050 + }, + { + "epoch": 0.10645444679861506, + "grad_norm": 0.9137332439422607, + "learning_rate": 0.0005, + "loss": 1.5646, + "step": 2060 + }, + { + "epoch": 0.10697121595783164, + "grad_norm": 0.9967837929725647, + "learning_rate": 0.0005, + "loss": 1.5659, + "step": 2070 + }, + { + "epoch": 0.10748798511704821, + "grad_norm": 1.2617110013961792, + "learning_rate": 0.0005, + "loss": 1.5673, + "step": 2080 + }, + { + "epoch": 0.1080047542762648, + "grad_norm": 0.9831250309944153, + "learning_rate": 0.0005, + "loss": 1.5742, + "step": 2090 + }, + { + "epoch": 0.10852152343548137, + "grad_norm": 1.1735457181930542, + "learning_rate": 0.0005, + "loss": 1.5811, + "step": 2100 + }, + { + "epoch": 0.10903829259469795, + "grad_norm": 1.1183675527572632, + "learning_rate": 0.0005, + "loss": 1.5546, + "step": 2110 + }, + { + "epoch": 0.10955506175391452, + "grad_norm": 1.3536667823791504, + "learning_rate": 0.0005, + "loss": 1.5551, + "step": 2120 + }, + { + "epoch": 0.11007183091313111, + "grad_norm": 0.9417304396629333, + "learning_rate": 0.0005, + "loss": 1.562, + "step": 2130 + }, + { + "epoch": 0.11058860007234768, + "grad_norm": 0.9261025786399841, + "learning_rate": 0.0005, + "loss": 1.5736, + "step": 2140 + }, + { + "epoch": 0.11110536923156426, + "grad_norm": 1.1396183967590332, + "learning_rate": 0.0005, + "loss": 1.5417, + "step": 2150 + }, + { + "epoch": 0.11162213839078083, + "grad_norm": 0.9720540642738342, + "learning_rate": 0.0005, + "loss": 1.5231, + "step": 2160 + }, + { + "epoch": 0.11213890754999742, + "grad_norm": 0.9784930348396301, + "learning_rate": 0.0005, + "loss": 1.5428, + "step": 2170 + }, + { + "epoch": 0.11265567670921399, + "grad_norm": 1.037022590637207, + "learning_rate": 0.0005, + "loss": 1.5562, + "step": 2180 + }, + { + "epoch": 0.11317244586843057, + "grad_norm": 1.3437378406524658, + "learning_rate": 0.0005, + "loss": 1.5452, + "step": 2190 + }, + { + "epoch": 0.11368921502764714, + "grad_norm": 1.2525360584259033, + "learning_rate": 0.0005, + "loss": 1.5372, + "step": 2200 + }, + { + "epoch": 0.11420598418686373, + "grad_norm": 1.0389316082000732, + "learning_rate": 0.0005, + "loss": 1.5273, + "step": 2210 + }, + { + "epoch": 0.1147227533460803, + "grad_norm": 1.2379904985427856, + "learning_rate": 0.0005, + "loss": 1.5281, + "step": 2220 + }, + { + "epoch": 0.11523952250529688, + "grad_norm": 1.0728790760040283, + "learning_rate": 0.0005, + "loss": 1.5228, + "step": 2230 + }, + { + "epoch": 0.11575629166451346, + "grad_norm": 1.54011070728302, + "learning_rate": 0.0005, + "loss": 1.5257, + "step": 2240 + }, + { + "epoch": 0.11627306082373004, + "grad_norm": 1.4011873006820679, + "learning_rate": 0.0005, + "loss": 1.5258, + "step": 2250 + }, + { + "epoch": 0.11678982998294662, + "grad_norm": 1.2126344442367554, + "learning_rate": 0.0005, + "loss": 1.5249, + "step": 2260 + }, + { + "epoch": 0.1173065991421632, + "grad_norm": 1.1125898361206055, + "learning_rate": 0.0005, + "loss": 1.5034, + "step": 2270 + }, + { + "epoch": 0.11782336830137978, + "grad_norm": 1.0404047966003418, + "learning_rate": 0.0005, + "loss": 1.5243, + "step": 2280 + }, + { + "epoch": 0.11834013746059635, + "grad_norm": 0.9504315257072449, + "learning_rate": 0.0005, + "loss": 1.501, + "step": 2290 + }, + { + "epoch": 0.11885690661981294, + "grad_norm": 1.0554097890853882, + "learning_rate": 0.0005, + "loss": 1.5115, + "step": 2300 + }, + { + "epoch": 0.1193736757790295, + "grad_norm": 0.9352626204490662, + "learning_rate": 0.0005, + "loss": 1.5038, + "step": 2310 + }, + { + "epoch": 0.11989044493824609, + "grad_norm": 0.9765718579292297, + "learning_rate": 0.0005, + "loss": 1.5019, + "step": 2320 + }, + { + "epoch": 0.12040721409746266, + "grad_norm": 1.2419780492782593, + "learning_rate": 0.0005, + "loss": 1.4993, + "step": 2330 + }, + { + "epoch": 0.12092398325667925, + "grad_norm": 1.0337820053100586, + "learning_rate": 0.0005, + "loss": 1.4843, + "step": 2340 + }, + { + "epoch": 0.12144075241589582, + "grad_norm": 1.0803256034851074, + "learning_rate": 0.0005, + "loss": 1.4902, + "step": 2350 + }, + { + "epoch": 0.1219575215751124, + "grad_norm": 0.9424406886100769, + "learning_rate": 0.0005, + "loss": 1.5031, + "step": 2360 + }, + { + "epoch": 0.12247429073432897, + "grad_norm": 0.9924182891845703, + "learning_rate": 0.0005, + "loss": 1.489, + "step": 2370 + }, + { + "epoch": 0.12299105989354556, + "grad_norm": 1.0602052211761475, + "learning_rate": 0.0005, + "loss": 1.4801, + "step": 2380 + }, + { + "epoch": 0.12350782905276213, + "grad_norm": 0.9463520646095276, + "learning_rate": 0.0005, + "loss": 1.49, + "step": 2390 + }, + { + "epoch": 0.12402459821197871, + "grad_norm": 0.9301887154579163, + "learning_rate": 0.0005, + "loss": 1.4923, + "step": 2400 + }, + { + "epoch": 0.12454136737119528, + "grad_norm": 0.9018756151199341, + "learning_rate": 0.0005, + "loss": 1.457, + "step": 2410 + }, + { + "epoch": 0.12505813653041187, + "grad_norm": 0.9669187068939209, + "learning_rate": 0.0005, + "loss": 1.4691, + "step": 2420 + }, + { + "epoch": 0.12557490568962845, + "grad_norm": 0.9768301248550415, + "learning_rate": 0.0005, + "loss": 1.4448, + "step": 2430 + }, + { + "epoch": 0.126091674848845, + "grad_norm": 0.9736414551734924, + "learning_rate": 0.0005, + "loss": 1.4671, + "step": 2440 + }, + { + "epoch": 0.1266084440080616, + "grad_norm": 1.3117995262145996, + "learning_rate": 0.0005, + "loss": 1.4577, + "step": 2450 + }, + { + "epoch": 0.12712521316727818, + "grad_norm": 0.976732075214386, + "learning_rate": 0.0005, + "loss": 1.4624, + "step": 2460 + }, + { + "epoch": 0.12764198232649476, + "grad_norm": 1.1756422519683838, + "learning_rate": 0.0005, + "loss": 1.4675, + "step": 2470 + }, + { + "epoch": 0.12815875148571132, + "grad_norm": 0.9411507844924927, + "learning_rate": 0.0005, + "loss": 1.4634, + "step": 2480 + }, + { + "epoch": 0.1286755206449279, + "grad_norm": 1.6214072704315186, + "learning_rate": 0.0005, + "loss": 1.4685, + "step": 2490 + }, + { + "epoch": 0.1291922898041445, + "grad_norm": 1.0801911354064941, + "learning_rate": 0.0005, + "loss": 1.4468, + "step": 2500 + }, + { + "epoch": 0.12970905896336107, + "grad_norm": 0.9756599068641663, + "learning_rate": 0.0005, + "loss": 1.4438, + "step": 2510 + }, + { + "epoch": 0.13022582812257766, + "grad_norm": 1.1823363304138184, + "learning_rate": 0.0005, + "loss": 1.4522, + "step": 2520 + }, + { + "epoch": 0.13074259728179422, + "grad_norm": 1.0005122423171997, + "learning_rate": 0.0005, + "loss": 1.436, + "step": 2530 + }, + { + "epoch": 0.1312593664410108, + "grad_norm": 1.4303867816925049, + "learning_rate": 0.0005, + "loss": 1.4411, + "step": 2540 + }, + { + "epoch": 0.13177613560022738, + "grad_norm": 0.867132842540741, + "learning_rate": 0.0005, + "loss": 1.4558, + "step": 2550 + }, + { + "epoch": 0.13229290475944397, + "grad_norm": 0.9243984222412109, + "learning_rate": 0.0005, + "loss": 1.4282, + "step": 2560 + }, + { + "epoch": 0.13280967391866053, + "grad_norm": 1.1926263570785522, + "learning_rate": 0.0005, + "loss": 1.4187, + "step": 2570 + }, + { + "epoch": 0.1333264430778771, + "grad_norm": 1.1110721826553345, + "learning_rate": 0.0005, + "loss": 1.4302, + "step": 2580 + }, + { + "epoch": 0.1338432122370937, + "grad_norm": 0.9598495960235596, + "learning_rate": 0.0005, + "loss": 1.4459, + "step": 2590 + }, + { + "epoch": 0.13435998139631028, + "grad_norm": 0.9147258996963501, + "learning_rate": 0.0005, + "loss": 1.4174, + "step": 2600 + }, + { + "epoch": 0.13487675055552684, + "grad_norm": 0.8530228734016418, + "learning_rate": 0.0005, + "loss": 1.4348, + "step": 2610 + }, + { + "epoch": 0.13539351971474342, + "grad_norm": 1.0487037897109985, + "learning_rate": 0.0005, + "loss": 1.4302, + "step": 2620 + }, + { + "epoch": 0.13591028887396, + "grad_norm": 1.0711545944213867, + "learning_rate": 0.0005, + "loss": 1.425, + "step": 2630 + }, + { + "epoch": 0.1364270580331766, + "grad_norm": 1.0053889751434326, + "learning_rate": 0.0005, + "loss": 1.4099, + "step": 2640 + }, + { + "epoch": 0.13694382719239315, + "grad_norm": 0.8895754814147949, + "learning_rate": 0.0005, + "loss": 1.4101, + "step": 2650 + }, + { + "epoch": 0.13746059635160973, + "grad_norm": 1.1464654207229614, + "learning_rate": 0.0005, + "loss": 1.409, + "step": 2660 + }, + { + "epoch": 0.13797736551082632, + "grad_norm": 1.4213604927062988, + "learning_rate": 0.0005, + "loss": 1.4333, + "step": 2670 + }, + { + "epoch": 0.1384941346700429, + "grad_norm": 0.8963467478752136, + "learning_rate": 0.0005, + "loss": 1.4047, + "step": 2680 + }, + { + "epoch": 0.13901090382925946, + "grad_norm": 0.9514134526252747, + "learning_rate": 0.0005, + "loss": 1.3923, + "step": 2690 + }, + { + "epoch": 0.13952767298847604, + "grad_norm": 0.8818897604942322, + "learning_rate": 0.0005, + "loss": 1.4031, + "step": 2700 + }, + { + "epoch": 0.14004444214769263, + "grad_norm": 0.8554843664169312, + "learning_rate": 0.0005, + "loss": 1.4005, + "step": 2710 + }, + { + "epoch": 0.1405612113069092, + "grad_norm": 0.9477766752243042, + "learning_rate": 0.0005, + "loss": 1.3871, + "step": 2720 + }, + { + "epoch": 0.14107798046612577, + "grad_norm": 0.9560056924819946, + "learning_rate": 0.0005, + "loss": 1.388, + "step": 2730 + }, + { + "epoch": 0.14159474962534235, + "grad_norm": 1.325939655303955, + "learning_rate": 0.0005, + "loss": 1.372, + "step": 2740 + }, + { + "epoch": 0.14211151878455894, + "grad_norm": 0.9184489846229553, + "learning_rate": 0.0005, + "loss": 1.3901, + "step": 2750 + }, + { + "epoch": 0.14262828794377552, + "grad_norm": 0.905005693435669, + "learning_rate": 0.0005, + "loss": 1.3652, + "step": 2760 + }, + { + "epoch": 0.1431450571029921, + "grad_norm": 0.9112023115158081, + "learning_rate": 0.0005, + "loss": 1.3805, + "step": 2770 + }, + { + "epoch": 0.14366182626220866, + "grad_norm": 0.909542977809906, + "learning_rate": 0.0005, + "loss": 1.3851, + "step": 2780 + }, + { + "epoch": 0.14417859542142525, + "grad_norm": 0.8679105639457703, + "learning_rate": 0.0005, + "loss": 1.3776, + "step": 2790 + }, + { + "epoch": 0.14469536458064183, + "grad_norm": 0.884416401386261, + "learning_rate": 0.0005, + "loss": 1.3787, + "step": 2800 + }, + { + "epoch": 0.14521213373985842, + "grad_norm": 0.8939566612243652, + "learning_rate": 0.0005, + "loss": 1.3695, + "step": 2810 + }, + { + "epoch": 0.14572890289907497, + "grad_norm": 1.2388486862182617, + "learning_rate": 0.0005, + "loss": 1.3926, + "step": 2820 + }, + { + "epoch": 0.14624567205829156, + "grad_norm": 1.2662867307662964, + "learning_rate": 0.0005, + "loss": 1.3804, + "step": 2830 + }, + { + "epoch": 0.14676244121750814, + "grad_norm": 0.8967621326446533, + "learning_rate": 0.0005, + "loss": 1.3513, + "step": 2840 + }, + { + "epoch": 0.14727921037672473, + "grad_norm": 0.8640676736831665, + "learning_rate": 0.0005, + "loss": 1.3546, + "step": 2850 + }, + { + "epoch": 0.14779597953594129, + "grad_norm": 1.0147978067398071, + "learning_rate": 0.0005, + "loss": 1.3699, + "step": 2860 + }, + { + "epoch": 0.14831274869515787, + "grad_norm": 0.8949346542358398, + "learning_rate": 0.0005, + "loss": 1.345, + "step": 2870 + }, + { + "epoch": 0.14882951785437445, + "grad_norm": 0.8535652756690979, + "learning_rate": 0.0005, + "loss": 1.3724, + "step": 2880 + }, + { + "epoch": 0.14934628701359104, + "grad_norm": 0.840876042842865, + "learning_rate": 0.0005, + "loss": 1.3692, + "step": 2890 + }, + { + "epoch": 0.1498630561728076, + "grad_norm": 0.8421388864517212, + "learning_rate": 0.0005, + "loss": 1.3639, + "step": 2900 + }, + { + "epoch": 0.15037982533202418, + "grad_norm": 0.8401720523834229, + "learning_rate": 0.0005, + "loss": 1.348, + "step": 2910 + }, + { + "epoch": 0.15089659449124077, + "grad_norm": 0.8139095306396484, + "learning_rate": 0.0005, + "loss": 1.365, + "step": 2920 + }, + { + "epoch": 0.15141336365045735, + "grad_norm": 0.8704052567481995, + "learning_rate": 0.0005, + "loss": 1.3482, + "step": 2930 + }, + { + "epoch": 0.1519301328096739, + "grad_norm": 0.8963611125946045, + "learning_rate": 0.0005, + "loss": 1.3336, + "step": 2940 + }, + { + "epoch": 0.1524469019688905, + "grad_norm": 0.8725153207778931, + "learning_rate": 0.0005, + "loss": 1.3724, + "step": 2950 + }, + { + "epoch": 0.15296367112810708, + "grad_norm": 0.9125774502754211, + "learning_rate": 0.0005, + "loss": 1.3377, + "step": 2960 + }, + { + "epoch": 0.15348044028732366, + "grad_norm": 1.1160928010940552, + "learning_rate": 0.0005, + "loss": 1.3582, + "step": 2970 + }, + { + "epoch": 0.15399720944654022, + "grad_norm": 0.8732350468635559, + "learning_rate": 0.0005, + "loss": 1.3471, + "step": 2980 + }, + { + "epoch": 0.1545139786057568, + "grad_norm": 0.8881607055664062, + "learning_rate": 0.0005, + "loss": 1.3552, + "step": 2990 + }, + { + "epoch": 0.1550307477649734, + "grad_norm": 1.0814484357833862, + "learning_rate": 0.0005, + "loss": 1.3628, + "step": 3000 + }, + { + "epoch": 0.15554751692418997, + "grad_norm": 0.81389319896698, + "learning_rate": 0.0005, + "loss": 1.3249, + "step": 3010 + }, + { + "epoch": 0.15606428608340656, + "grad_norm": 0.8424196839332581, + "learning_rate": 0.0005, + "loss": 1.323, + "step": 3020 + }, + { + "epoch": 0.1565810552426231, + "grad_norm": 0.8028131127357483, + "learning_rate": 0.0005, + "loss": 1.3302, + "step": 3030 + }, + { + "epoch": 0.1570978244018397, + "grad_norm": 0.8348473906517029, + "learning_rate": 0.0005, + "loss": 1.3314, + "step": 3040 + }, + { + "epoch": 0.15761459356105628, + "grad_norm": 1.2074034214019775, + "learning_rate": 0.0005, + "loss": 1.3355, + "step": 3050 + }, + { + "epoch": 0.15813136272027287, + "grad_norm": 0.8177675604820251, + "learning_rate": 0.0005, + "loss": 1.3427, + "step": 3060 + }, + { + "epoch": 0.15864813187948942, + "grad_norm": 0.796273410320282, + "learning_rate": 0.0005, + "loss": 1.3088, + "step": 3070 + }, + { + "epoch": 0.159164901038706, + "grad_norm": 1.0104438066482544, + "learning_rate": 0.0005, + "loss": 1.3255, + "step": 3080 + }, + { + "epoch": 0.1596816701979226, + "grad_norm": 0.9192485809326172, + "learning_rate": 0.0005, + "loss": 1.3347, + "step": 3090 + }, + { + "epoch": 0.16019843935713918, + "grad_norm": 0.912550151348114, + "learning_rate": 0.0005, + "loss": 1.3157, + "step": 3100 + }, + { + "epoch": 0.16071520851635573, + "grad_norm": 0.9644028544425964, + "learning_rate": 0.0005, + "loss": 1.3242, + "step": 3110 + }, + { + "epoch": 0.16123197767557232, + "grad_norm": 0.9894726872444153, + "learning_rate": 0.0005, + "loss": 1.2968, + "step": 3120 + }, + { + "epoch": 0.1617487468347889, + "grad_norm": 0.9292682409286499, + "learning_rate": 0.0005, + "loss": 1.3342, + "step": 3130 + }, + { + "epoch": 0.1622655159940055, + "grad_norm": 0.9219216704368591, + "learning_rate": 0.0005, + "loss": 1.3242, + "step": 3140 + }, + { + "epoch": 0.16278228515322204, + "grad_norm": 1.1059894561767578, + "learning_rate": 0.0005, + "loss": 1.3238, + "step": 3150 + }, + { + "epoch": 0.16329905431243863, + "grad_norm": 0.8726058602333069, + "learning_rate": 0.0005, + "loss": 1.315, + "step": 3160 + }, + { + "epoch": 0.16381582347165521, + "grad_norm": 0.8204345107078552, + "learning_rate": 0.0005, + "loss": 1.3085, + "step": 3170 + }, + { + "epoch": 0.1643325926308718, + "grad_norm": 0.9515188932418823, + "learning_rate": 0.0005, + "loss": 1.2986, + "step": 3180 + }, + { + "epoch": 0.16484936179008836, + "grad_norm": 0.8825114369392395, + "learning_rate": 0.0005, + "loss": 1.2921, + "step": 3190 + }, + { + "epoch": 0.16536613094930494, + "grad_norm": 0.8144583702087402, + "learning_rate": 0.0005, + "loss": 1.2991, + "step": 3200 + }, + { + "epoch": 0.16588290010852152, + "grad_norm": 0.8747395873069763, + "learning_rate": 0.0005, + "loss": 1.2936, + "step": 3210 + }, + { + "epoch": 0.1663996692677381, + "grad_norm": 0.9829278588294983, + "learning_rate": 0.0005, + "loss": 1.2898, + "step": 3220 + }, + { + "epoch": 0.16691643842695467, + "grad_norm": 0.917072594165802, + "learning_rate": 0.0005, + "loss": 1.3056, + "step": 3230 + }, + { + "epoch": 0.16743320758617125, + "grad_norm": 0.893224835395813, + "learning_rate": 0.0005, + "loss": 1.2958, + "step": 3240 + }, + { + "epoch": 0.16794997674538784, + "grad_norm": 0.8513831496238708, + "learning_rate": 0.0005, + "loss": 1.3073, + "step": 3250 + }, + { + "epoch": 0.16846674590460442, + "grad_norm": 0.7902063727378845, + "learning_rate": 0.0005, + "loss": 1.2962, + "step": 3260 + }, + { + "epoch": 0.168983515063821, + "grad_norm": 0.8533388376235962, + "learning_rate": 0.0005, + "loss": 1.3034, + "step": 3270 + }, + { + "epoch": 0.16950028422303756, + "grad_norm": 0.89384526014328, + "learning_rate": 0.0005, + "loss": 1.306, + "step": 3280 + }, + { + "epoch": 0.17001705338225415, + "grad_norm": 1.1740915775299072, + "learning_rate": 0.0005, + "loss": 1.2861, + "step": 3290 + }, + { + "epoch": 0.17053382254147073, + "grad_norm": 0.7941210269927979, + "learning_rate": 0.0005, + "loss": 1.29, + "step": 3300 + }, + { + "epoch": 0.17105059170068732, + "grad_norm": 0.82374107837677, + "learning_rate": 0.0005, + "loss": 1.2715, + "step": 3310 + }, + { + "epoch": 0.17156736085990387, + "grad_norm": 0.9856778979301453, + "learning_rate": 0.0005, + "loss": 1.2908, + "step": 3320 + }, + { + "epoch": 0.17208413001912046, + "grad_norm": 0.777244508266449, + "learning_rate": 0.0005, + "loss": 1.2891, + "step": 3330 + }, + { + "epoch": 0.17260089917833704, + "grad_norm": 0.8938208222389221, + "learning_rate": 0.0005, + "loss": 1.285, + "step": 3340 + }, + { + "epoch": 0.17311766833755363, + "grad_norm": 0.8124037384986877, + "learning_rate": 0.0005, + "loss": 1.2908, + "step": 3350 + }, + { + "epoch": 0.17363443749677018, + "grad_norm": 0.9345457553863525, + "learning_rate": 0.0005, + "loss": 1.2964, + "step": 3360 + }, + { + "epoch": 0.17415120665598677, + "grad_norm": 0.7821003794670105, + "learning_rate": 0.0005, + "loss": 1.2767, + "step": 3370 + }, + { + "epoch": 0.17466797581520335, + "grad_norm": 0.8330212831497192, + "learning_rate": 0.0005, + "loss": 1.2779, + "step": 3380 + }, + { + "epoch": 0.17518474497441994, + "grad_norm": 0.764042854309082, + "learning_rate": 0.0005, + "loss": 1.2698, + "step": 3390 + }, + { + "epoch": 0.1757015141336365, + "grad_norm": 0.9339214563369751, + "learning_rate": 0.0005, + "loss": 1.2777, + "step": 3400 + }, + { + "epoch": 0.17621828329285308, + "grad_norm": 0.8121135830879211, + "learning_rate": 0.0005, + "loss": 1.2869, + "step": 3410 + }, + { + "epoch": 0.17673505245206966, + "grad_norm": 0.8460163474082947, + "learning_rate": 0.0005, + "loss": 1.2913, + "step": 3420 + }, + { + "epoch": 0.17725182161128625, + "grad_norm": 1.3961695432662964, + "learning_rate": 0.0005, + "loss": 1.2971, + "step": 3430 + }, + { + "epoch": 0.1777685907705028, + "grad_norm": 0.8089907765388489, + "learning_rate": 0.0005, + "loss": 1.2612, + "step": 3440 + }, + { + "epoch": 0.1782853599297194, + "grad_norm": 0.8770979046821594, + "learning_rate": 0.0005, + "loss": 1.2739, + "step": 3450 + }, + { + "epoch": 0.17880212908893597, + "grad_norm": 0.8448237776756287, + "learning_rate": 0.0005, + "loss": 1.2735, + "step": 3460 + }, + { + "epoch": 0.17931889824815256, + "grad_norm": 0.9335261583328247, + "learning_rate": 0.0005, + "loss": 1.2671, + "step": 3470 + }, + { + "epoch": 0.17983566740736912, + "grad_norm": 0.7510360479354858, + "learning_rate": 0.0005, + "loss": 1.2691, + "step": 3480 + }, + { + "epoch": 0.1803524365665857, + "grad_norm": 0.7871717810630798, + "learning_rate": 0.0005, + "loss": 1.2642, + "step": 3490 + }, + { + "epoch": 0.18086920572580228, + "grad_norm": 1.1407464742660522, + "learning_rate": 0.0005, + "loss": 1.248, + "step": 3500 + }, + { + "epoch": 0.18138597488501887, + "grad_norm": 0.8027787208557129, + "learning_rate": 0.0005, + "loss": 1.2557, + "step": 3510 + }, + { + "epoch": 0.18190274404423545, + "grad_norm": 0.8517947793006897, + "learning_rate": 0.0005, + "loss": 1.2529, + "step": 3520 + }, + { + "epoch": 0.182419513203452, + "grad_norm": 0.9083014726638794, + "learning_rate": 0.0005, + "loss": 1.2489, + "step": 3530 + }, + { + "epoch": 0.1829362823626686, + "grad_norm": 1.0628485679626465, + "learning_rate": 0.0005, + "loss": 1.2669, + "step": 3540 + }, + { + "epoch": 0.18345305152188518, + "grad_norm": 1.0175726413726807, + "learning_rate": 0.0005, + "loss": 1.2473, + "step": 3550 + }, + { + "epoch": 0.18396982068110176, + "grad_norm": 0.7979172468185425, + "learning_rate": 0.0005, + "loss": 1.2471, + "step": 3560 + }, + { + "epoch": 0.18448658984031832, + "grad_norm": 0.7472112774848938, + "learning_rate": 0.0005, + "loss": 1.2413, + "step": 3570 + }, + { + "epoch": 0.1850033589995349, + "grad_norm": 0.8240432739257812, + "learning_rate": 0.0005, + "loss": 1.2521, + "step": 3580 + }, + { + "epoch": 0.1855201281587515, + "grad_norm": 0.8023159503936768, + "learning_rate": 0.0005, + "loss": 1.2471, + "step": 3590 + }, + { + "epoch": 0.18603689731796808, + "grad_norm": 0.7950299978256226, + "learning_rate": 0.0005, + "loss": 1.2327, + "step": 3600 + }, + { + "epoch": 0.18655366647718463, + "grad_norm": 0.7718859314918518, + "learning_rate": 0.0005, + "loss": 1.2417, + "step": 3610 + }, + { + "epoch": 0.18707043563640122, + "grad_norm": 0.8416433334350586, + "learning_rate": 0.0005, + "loss": 1.2531, + "step": 3620 + }, + { + "epoch": 0.1875872047956178, + "grad_norm": 0.7842203974723816, + "learning_rate": 0.0005, + "loss": 1.2435, + "step": 3630 + }, + { + "epoch": 0.18810397395483439, + "grad_norm": 0.8708809614181519, + "learning_rate": 0.0005, + "loss": 1.245, + "step": 3640 + }, + { + "epoch": 0.18862074311405094, + "grad_norm": 0.8131195902824402, + "learning_rate": 0.0005, + "loss": 1.244, + "step": 3650 + }, + { + "epoch": 0.18913751227326753, + "grad_norm": 0.8010774254798889, + "learning_rate": 0.0005, + "loss": 1.245, + "step": 3660 + }, + { + "epoch": 0.1896542814324841, + "grad_norm": 0.7978084087371826, + "learning_rate": 0.0005, + "loss": 1.2475, + "step": 3670 + }, + { + "epoch": 0.1901710505917007, + "grad_norm": 0.7844563722610474, + "learning_rate": 0.0005, + "loss": 1.2325, + "step": 3680 + }, + { + "epoch": 0.19068781975091725, + "grad_norm": 0.8755462765693665, + "learning_rate": 0.0005, + "loss": 1.2243, + "step": 3690 + }, + { + "epoch": 0.19120458891013384, + "grad_norm": 0.7727536559104919, + "learning_rate": 0.0005, + "loss": 1.2447, + "step": 3700 + }, + { + "epoch": 0.19172135806935042, + "grad_norm": 0.7509860396385193, + "learning_rate": 0.0005, + "loss": 1.2324, + "step": 3710 + }, + { + "epoch": 0.192238127228567, + "grad_norm": 0.9001826047897339, + "learning_rate": 0.0005, + "loss": 1.2175, + "step": 3720 + }, + { + "epoch": 0.19275489638778356, + "grad_norm": 0.7595515847206116, + "learning_rate": 0.0005, + "loss": 1.2536, + "step": 3730 + }, + { + "epoch": 0.19327166554700015, + "grad_norm": 0.746465802192688, + "learning_rate": 0.0005, + "loss": 1.2439, + "step": 3740 + }, + { + "epoch": 0.19378843470621673, + "grad_norm": 0.8454607725143433, + "learning_rate": 0.0005, + "loss": 1.2319, + "step": 3750 + }, + { + "epoch": 0.19430520386543332, + "grad_norm": 0.7905994057655334, + "learning_rate": 0.0005, + "loss": 1.2335, + "step": 3760 + }, + { + "epoch": 0.1948219730246499, + "grad_norm": 1.1130495071411133, + "learning_rate": 0.0005, + "loss": 1.2444, + "step": 3770 + }, + { + "epoch": 0.19533874218386646, + "grad_norm": 0.9213355183601379, + "learning_rate": 0.0005, + "loss": 1.2188, + "step": 3780 + }, + { + "epoch": 0.19585551134308304, + "grad_norm": 0.8003748655319214, + "learning_rate": 0.0005, + "loss": 1.2478, + "step": 3790 + }, + { + "epoch": 0.19637228050229963, + "grad_norm": 0.7667946815490723, + "learning_rate": 0.0005, + "loss": 1.2286, + "step": 3800 + }, + { + "epoch": 0.1968890496615162, + "grad_norm": 0.7806205153465271, + "learning_rate": 0.0005, + "loss": 1.2152, + "step": 3810 + }, + { + "epoch": 0.19740581882073277, + "grad_norm": 1.1093833446502686, + "learning_rate": 0.0005, + "loss": 1.2281, + "step": 3820 + }, + { + "epoch": 0.19792258797994935, + "grad_norm": 0.8750317692756653, + "learning_rate": 0.0005, + "loss": 1.2418, + "step": 3830 + }, + { + "epoch": 0.19843935713916594, + "grad_norm": 0.9322946071624756, + "learning_rate": 0.0005, + "loss": 1.2168, + "step": 3840 + }, + { + "epoch": 0.19895612629838252, + "grad_norm": 0.9042627215385437, + "learning_rate": 0.0005, + "loss": 1.229, + "step": 3850 + }, + { + "epoch": 0.19947289545759908, + "grad_norm": 0.8162991404533386, + "learning_rate": 0.0005, + "loss": 1.2044, + "step": 3860 + }, + { + "epoch": 0.19998966461681567, + "grad_norm": 0.7078894972801208, + "learning_rate": 0.0005, + "loss": 1.2077, + "step": 3870 + }, + { + "epoch": 0.20050643377603225, + "grad_norm": 0.8144243955612183, + "learning_rate": 0.0005, + "loss": 1.1932, + "step": 3880 + }, + { + "epoch": 0.20102320293524883, + "grad_norm": 0.7456822991371155, + "learning_rate": 0.0005, + "loss": 1.2187, + "step": 3890 + }, + { + "epoch": 0.2015399720944654, + "grad_norm": 0.7855635285377502, + "learning_rate": 0.0005, + "loss": 1.2096, + "step": 3900 + }, + { + "epoch": 0.20205674125368198, + "grad_norm": 0.7501581311225891, + "learning_rate": 0.0005, + "loss": 1.2083, + "step": 3910 + }, + { + "epoch": 0.20257351041289856, + "grad_norm": 0.7569208145141602, + "learning_rate": 0.0005, + "loss": 1.2208, + "step": 3920 + }, + { + "epoch": 0.20309027957211515, + "grad_norm": 0.7520230412483215, + "learning_rate": 0.0005, + "loss": 1.2031, + "step": 3930 + }, + { + "epoch": 0.2036070487313317, + "grad_norm": 0.9110859632492065, + "learning_rate": 0.0005, + "loss": 1.2135, + "step": 3940 + }, + { + "epoch": 0.2041238178905483, + "grad_norm": 0.738043487071991, + "learning_rate": 0.0005, + "loss": 1.2066, + "step": 3950 + }, + { + "epoch": 0.20464058704976487, + "grad_norm": 0.7910060286521912, + "learning_rate": 0.0005, + "loss": 1.2089, + "step": 3960 + }, + { + "epoch": 0.20515735620898146, + "grad_norm": 0.7672162652015686, + "learning_rate": 0.0005, + "loss": 1.216, + "step": 3970 + }, + { + "epoch": 0.205674125368198, + "grad_norm": 0.7567201852798462, + "learning_rate": 0.0005, + "loss": 1.1915, + "step": 3980 + }, + { + "epoch": 0.2061908945274146, + "grad_norm": 0.759067714214325, + "learning_rate": 0.0005, + "loss": 1.2111, + "step": 3990 + }, + { + "epoch": 0.20670766368663118, + "grad_norm": 0.7911349534988403, + "learning_rate": 0.0005, + "loss": 1.211, + "step": 4000 + }, + { + "epoch": 0.20722443284584777, + "grad_norm": 1.0086050033569336, + "learning_rate": 0.0005, + "loss": 1.2122, + "step": 4010 + }, + { + "epoch": 0.20774120200506435, + "grad_norm": 1.1961076259613037, + "learning_rate": 0.0005, + "loss": 1.1972, + "step": 4020 + }, + { + "epoch": 0.2082579711642809, + "grad_norm": 0.8429704308509827, + "learning_rate": 0.0005, + "loss": 1.2038, + "step": 4030 + }, + { + "epoch": 0.2087747403234975, + "grad_norm": 1.0080244541168213, + "learning_rate": 0.0005, + "loss": 1.1981, + "step": 4040 + }, + { + "epoch": 0.20929150948271408, + "grad_norm": 0.7220394611358643, + "learning_rate": 0.0005, + "loss": 1.2083, + "step": 4050 + }, + { + "epoch": 0.20980827864193066, + "grad_norm": 0.7594371438026428, + "learning_rate": 0.0005, + "loss": 1.1976, + "step": 4060 + }, + { + "epoch": 0.21032504780114722, + "grad_norm": 0.7990491986274719, + "learning_rate": 0.0005, + "loss": 1.1938, + "step": 4070 + }, + { + "epoch": 0.2108418169603638, + "grad_norm": 1.0034983158111572, + "learning_rate": 0.0005, + "loss": 1.1769, + "step": 4080 + }, + { + "epoch": 0.2113585861195804, + "grad_norm": 0.8476843237876892, + "learning_rate": 0.0005, + "loss": 1.1914, + "step": 4090 + }, + { + "epoch": 0.21187535527879697, + "grad_norm": 0.7301702499389648, + "learning_rate": 0.0005, + "loss": 1.2054, + "step": 4100 + }, + { + "epoch": 0.21239212443801353, + "grad_norm": 0.7379107475280762, + "learning_rate": 0.0005, + "loss": 1.1945, + "step": 4110 + }, + { + "epoch": 0.21290889359723011, + "grad_norm": 0.7332804203033447, + "learning_rate": 0.0005, + "loss": 1.1921, + "step": 4120 + }, + { + "epoch": 0.2134256627564467, + "grad_norm": 0.7600969672203064, + "learning_rate": 0.0005, + "loss": 1.1957, + "step": 4130 + }, + { + "epoch": 0.21394243191566328, + "grad_norm": 0.9124670028686523, + "learning_rate": 0.0005, + "loss": 1.199, + "step": 4140 + }, + { + "epoch": 0.21445920107487984, + "grad_norm": 0.7995319962501526, + "learning_rate": 0.0005, + "loss": 1.1806, + "step": 4150 + }, + { + "epoch": 0.21497597023409643, + "grad_norm": 0.7137150168418884, + "learning_rate": 0.0005, + "loss": 1.1944, + "step": 4160 + }, + { + "epoch": 0.215492739393313, + "grad_norm": 0.8427070379257202, + "learning_rate": 0.0005, + "loss": 1.204, + "step": 4170 + }, + { + "epoch": 0.2160095085525296, + "grad_norm": 0.6893758177757263, + "learning_rate": 0.0005, + "loss": 1.2056, + "step": 4180 + }, + { + "epoch": 0.21652627771174615, + "grad_norm": 0.777153730392456, + "learning_rate": 0.0005, + "loss": 1.1834, + "step": 4190 + }, + { + "epoch": 0.21704304687096274, + "grad_norm": 0.7304201126098633, + "learning_rate": 0.0005, + "loss": 1.1918, + "step": 4200 + }, + { + "epoch": 0.21755981603017932, + "grad_norm": 0.7642196416854858, + "learning_rate": 0.0005, + "loss": 1.2043, + "step": 4210 + }, + { + "epoch": 0.2180765851893959, + "grad_norm": 0.703868567943573, + "learning_rate": 0.0005, + "loss": 1.1717, + "step": 4220 + }, + { + "epoch": 0.21859335434861246, + "grad_norm": 0.751356840133667, + "learning_rate": 0.0005, + "loss": 1.1975, + "step": 4230 + }, + { + "epoch": 0.21911012350782905, + "grad_norm": 0.8302937150001526, + "learning_rate": 0.0005, + "loss": 1.1981, + "step": 4240 + }, + { + "epoch": 0.21962689266704563, + "grad_norm": 0.8335602879524231, + "learning_rate": 0.0005, + "loss": 1.1863, + "step": 4250 + }, + { + "epoch": 0.22014366182626222, + "grad_norm": 0.7479858994483948, + "learning_rate": 0.0005, + "loss": 1.1788, + "step": 4260 + }, + { + "epoch": 0.2206604309854788, + "grad_norm": 0.9171736836433411, + "learning_rate": 0.0005, + "loss": 1.1773, + "step": 4270 + }, + { + "epoch": 0.22117720014469536, + "grad_norm": 0.7626177668571472, + "learning_rate": 0.0005, + "loss": 1.1869, + "step": 4280 + }, + { + "epoch": 0.22169396930391194, + "grad_norm": 0.7428616881370544, + "learning_rate": 0.0005, + "loss": 1.1698, + "step": 4290 + }, + { + "epoch": 0.22221073846312853, + "grad_norm": 0.8029087781906128, + "learning_rate": 0.0005, + "loss": 1.1884, + "step": 4300 + }, + { + "epoch": 0.2227275076223451, + "grad_norm": 0.7876361608505249, + "learning_rate": 0.0005, + "loss": 1.1843, + "step": 4310 + }, + { + "epoch": 0.22324427678156167, + "grad_norm": 0.6730009913444519, + "learning_rate": 0.0005, + "loss": 1.1703, + "step": 4320 + }, + { + "epoch": 0.22376104594077825, + "grad_norm": 0.7202760577201843, + "learning_rate": 0.0005, + "loss": 1.1753, + "step": 4330 + }, + { + "epoch": 0.22427781509999484, + "grad_norm": 0.7547861337661743, + "learning_rate": 0.0005, + "loss": 1.1755, + "step": 4340 + }, + { + "epoch": 0.22479458425921142, + "grad_norm": 0.7263453602790833, + "learning_rate": 0.0005, + "loss": 1.1783, + "step": 4350 + }, + { + "epoch": 0.22531135341842798, + "grad_norm": 0.7226181030273438, + "learning_rate": 0.0005, + "loss": 1.1829, + "step": 4360 + }, + { + "epoch": 0.22582812257764456, + "grad_norm": 0.7433076500892639, + "learning_rate": 0.0005, + "loss": 1.1821, + "step": 4370 + }, + { + "epoch": 0.22634489173686115, + "grad_norm": 0.8025347590446472, + "learning_rate": 0.0005, + "loss": 1.1548, + "step": 4380 + }, + { + "epoch": 0.22686166089607773, + "grad_norm": 0.8330517411231995, + "learning_rate": 0.0005, + "loss": 1.1757, + "step": 4390 + }, + { + "epoch": 0.2273784300552943, + "grad_norm": 0.7150396704673767, + "learning_rate": 0.0005, + "loss": 1.1592, + "step": 4400 + }, + { + "epoch": 0.22789519921451087, + "grad_norm": 0.8366827368736267, + "learning_rate": 0.0005, + "loss": 1.1614, + "step": 4410 + }, + { + "epoch": 0.22841196837372746, + "grad_norm": 0.8655450344085693, + "learning_rate": 0.0005, + "loss": 1.1553, + "step": 4420 + }, + { + "epoch": 0.22892873753294404, + "grad_norm": 0.6938055753707886, + "learning_rate": 0.0005, + "loss": 1.1657, + "step": 4430 + }, + { + "epoch": 0.2294455066921606, + "grad_norm": 0.7177290320396423, + "learning_rate": 0.0005, + "loss": 1.1728, + "step": 4440 + }, + { + "epoch": 0.22996227585137718, + "grad_norm": 0.7082594037055969, + "learning_rate": 0.0005, + "loss": 1.1659, + "step": 4450 + }, + { + "epoch": 0.23047904501059377, + "grad_norm": 0.7543273568153381, + "learning_rate": 0.0005, + "loss": 1.1517, + "step": 4460 + }, + { + "epoch": 0.23099581416981035, + "grad_norm": 0.722029983997345, + "learning_rate": 0.0005, + "loss": 1.1593, + "step": 4470 + }, + { + "epoch": 0.2315125833290269, + "grad_norm": 0.7107385396957397, + "learning_rate": 0.0005, + "loss": 1.1499, + "step": 4480 + }, + { + "epoch": 0.2320293524882435, + "grad_norm": 0.8118393421173096, + "learning_rate": 0.0005, + "loss": 1.1614, + "step": 4490 + }, + { + "epoch": 0.23254612164746008, + "grad_norm": 0.7901565432548523, + "learning_rate": 0.0005, + "loss": 1.1627, + "step": 4500 + }, + { + "epoch": 0.23306289080667666, + "grad_norm": 0.6997384428977966, + "learning_rate": 0.0005, + "loss": 1.1694, + "step": 4510 + }, + { + "epoch": 0.23357965996589325, + "grad_norm": 0.7574887871742249, + "learning_rate": 0.0005, + "loss": 1.1772, + "step": 4520 + }, + { + "epoch": 0.2340964291251098, + "grad_norm": 0.709123432636261, + "learning_rate": 0.0005, + "loss": 1.1793, + "step": 4530 + }, + { + "epoch": 0.2346131982843264, + "grad_norm": 0.7011120915412903, + "learning_rate": 0.0005, + "loss": 1.1569, + "step": 4540 + }, + { + "epoch": 0.23512996744354298, + "grad_norm": 0.7826752662658691, + "learning_rate": 0.0005, + "loss": 1.1551, + "step": 4550 + }, + { + "epoch": 0.23564673660275956, + "grad_norm": 0.7468019723892212, + "learning_rate": 0.0005, + "loss": 1.177, + "step": 4560 + }, + { + "epoch": 0.23616350576197612, + "grad_norm": 0.8336277604103088, + "learning_rate": 0.0005, + "loss": 1.1437, + "step": 4570 + }, + { + "epoch": 0.2366802749211927, + "grad_norm": 0.7412180304527283, + "learning_rate": 0.0005, + "loss": 1.1371, + "step": 4580 + }, + { + "epoch": 0.23719704408040929, + "grad_norm": 0.7702532410621643, + "learning_rate": 0.0005, + "loss": 1.1539, + "step": 4590 + }, + { + "epoch": 0.23771381323962587, + "grad_norm": 0.7170100808143616, + "learning_rate": 0.0005, + "loss": 1.1493, + "step": 4600 + }, + { + "epoch": 0.23823058239884243, + "grad_norm": 0.6973877549171448, + "learning_rate": 0.0005, + "loss": 1.1686, + "step": 4610 + }, + { + "epoch": 0.238747351558059, + "grad_norm": 0.7682148218154907, + "learning_rate": 0.0005, + "loss": 1.1374, + "step": 4620 + }, + { + "epoch": 0.2392641207172756, + "grad_norm": 0.7360324263572693, + "learning_rate": 0.0005, + "loss": 1.1461, + "step": 4630 + }, + { + "epoch": 0.23978088987649218, + "grad_norm": 0.6636998057365417, + "learning_rate": 0.0005, + "loss": 1.1468, + "step": 4640 + }, + { + "epoch": 0.24029765903570874, + "grad_norm": 0.9023354053497314, + "learning_rate": 0.0005, + "loss": 1.1523, + "step": 4650 + }, + { + "epoch": 0.24081442819492532, + "grad_norm": 0.6802653074264526, + "learning_rate": 0.0005, + "loss": 1.1354, + "step": 4660 + }, + { + "epoch": 0.2413311973541419, + "grad_norm": 0.917087972164154, + "learning_rate": 0.0005, + "loss": 1.1402, + "step": 4670 + }, + { + "epoch": 0.2418479665133585, + "grad_norm": 0.8304193019866943, + "learning_rate": 0.0005, + "loss": 1.1526, + "step": 4680 + }, + { + "epoch": 0.24236473567257505, + "grad_norm": 0.833188533782959, + "learning_rate": 0.0005, + "loss": 1.165, + "step": 4690 + }, + { + "epoch": 0.24288150483179163, + "grad_norm": 0.7147198915481567, + "learning_rate": 0.0005, + "loss": 1.1431, + "step": 4700 + }, + { + "epoch": 0.24339827399100822, + "grad_norm": 0.6784700155258179, + "learning_rate": 0.0005, + "loss": 1.138, + "step": 4710 + }, + { + "epoch": 0.2439150431502248, + "grad_norm": 0.6933045983314514, + "learning_rate": 0.0005, + "loss": 1.1173, + "step": 4720 + }, + { + "epoch": 0.24443181230944136, + "grad_norm": 0.7840824127197266, + "learning_rate": 0.0005, + "loss": 1.1384, + "step": 4730 + }, + { + "epoch": 0.24494858146865794, + "grad_norm": 0.8129291534423828, + "learning_rate": 0.0005, + "loss": 1.151, + "step": 4740 + }, + { + "epoch": 0.24546535062787453, + "grad_norm": 0.7420192360877991, + "learning_rate": 0.0005, + "loss": 1.1218, + "step": 4750 + }, + { + "epoch": 0.2459821197870911, + "grad_norm": 0.6665251851081848, + "learning_rate": 0.0005, + "loss": 1.1278, + "step": 4760 + }, + { + "epoch": 0.2464988889463077, + "grad_norm": 0.7529242038726807, + "learning_rate": 0.0005, + "loss": 1.1417, + "step": 4770 + }, + { + "epoch": 0.24701565810552426, + "grad_norm": 0.6908478140830994, + "learning_rate": 0.0005, + "loss": 1.1353, + "step": 4780 + }, + { + "epoch": 0.24753242726474084, + "grad_norm": 0.6860882043838501, + "learning_rate": 0.0005, + "loss": 1.1278, + "step": 4790 + }, + { + "epoch": 0.24804919642395742, + "grad_norm": 0.7322950959205627, + "learning_rate": 0.0005, + "loss": 1.1447, + "step": 4800 + }, + { + "epoch": 0.248565965583174, + "grad_norm": 0.679210364818573, + "learning_rate": 0.0005, + "loss": 1.146, + "step": 4810 + }, + { + "epoch": 0.24908273474239057, + "grad_norm": 0.7133141756057739, + "learning_rate": 0.0005, + "loss": 1.1389, + "step": 4820 + }, + { + "epoch": 0.24959950390160715, + "grad_norm": 0.6991278529167175, + "learning_rate": 0.0005, + "loss": 1.1324, + "step": 4830 + }, + { + "epoch": 0.25011627306082374, + "grad_norm": 0.7213752865791321, + "learning_rate": 0.0005, + "loss": 1.1303, + "step": 4840 + }, + { + "epoch": 0.2506330422200403, + "grad_norm": 0.6555566191673279, + "learning_rate": 0.0005, + "loss": 1.1277, + "step": 4850 + }, + { + "epoch": 0.2511498113792569, + "grad_norm": 0.7012516260147095, + "learning_rate": 0.0005, + "loss": 1.1267, + "step": 4860 + }, + { + "epoch": 0.25166658053847346, + "grad_norm": 0.74920654296875, + "learning_rate": 0.0005, + "loss": 1.1432, + "step": 4870 + }, + { + "epoch": 0.25218334969769, + "grad_norm": 0.721111536026001, + "learning_rate": 0.0005, + "loss": 1.1393, + "step": 4880 + }, + { + "epoch": 0.25270011885690663, + "grad_norm": 0.7633620500564575, + "learning_rate": 0.0005, + "loss": 1.135, + "step": 4890 + }, + { + "epoch": 0.2532168880161232, + "grad_norm": 0.7658079266548157, + "learning_rate": 0.0005, + "loss": 1.1223, + "step": 4900 + }, + { + "epoch": 0.2537336571753398, + "grad_norm": 0.6615222692489624, + "learning_rate": 0.0005, + "loss": 1.1476, + "step": 4910 + }, + { + "epoch": 0.25425042633455636, + "grad_norm": 0.6398602724075317, + "learning_rate": 0.0005, + "loss": 1.1044, + "step": 4920 + }, + { + "epoch": 0.2547671954937729, + "grad_norm": 0.7086970210075378, + "learning_rate": 0.0005, + "loss": 1.1253, + "step": 4930 + }, + { + "epoch": 0.2552839646529895, + "grad_norm": 0.6913731694221497, + "learning_rate": 0.0005, + "loss": 1.1356, + "step": 4940 + }, + { + "epoch": 0.2558007338122061, + "grad_norm": 0.7111396789550781, + "learning_rate": 0.0005, + "loss": 1.1219, + "step": 4950 + }, + { + "epoch": 0.25631750297142264, + "grad_norm": 0.699747622013092, + "learning_rate": 0.0005, + "loss": 1.1198, + "step": 4960 + }, + { + "epoch": 0.25683427213063925, + "grad_norm": 0.6903569102287292, + "learning_rate": 0.0005, + "loss": 1.1384, + "step": 4970 + }, + { + "epoch": 0.2573510412898558, + "grad_norm": 0.7051145434379578, + "learning_rate": 0.0005, + "loss": 1.1439, + "step": 4980 + }, + { + "epoch": 0.2578678104490724, + "grad_norm": 0.7983745336532593, + "learning_rate": 0.0005, + "loss": 1.1171, + "step": 4990 + }, + { + "epoch": 0.258384579608289, + "grad_norm": 0.7234880924224854, + "learning_rate": 0.0005, + "loss": 1.1083, + "step": 5000 + }, + { + "epoch": 0.25890134876750553, + "grad_norm": 0.740550696849823, + "learning_rate": 0.0005, + "loss": 1.1211, + "step": 5010 + }, + { + "epoch": 0.25941811792672215, + "grad_norm": 0.7128597497940063, + "learning_rate": 0.0005, + "loss": 1.1432, + "step": 5020 + }, + { + "epoch": 0.2599348870859387, + "grad_norm": 0.6916446089744568, + "learning_rate": 0.0005, + "loss": 1.1358, + "step": 5030 + }, + { + "epoch": 0.2604516562451553, + "grad_norm": 0.776382327079773, + "learning_rate": 0.0005, + "loss": 1.125, + "step": 5040 + }, + { + "epoch": 0.2609684254043719, + "grad_norm": 0.720817506313324, + "learning_rate": 0.0005, + "loss": 1.11, + "step": 5050 + }, + { + "epoch": 0.26148519456358843, + "grad_norm": 0.6699787378311157, + "learning_rate": 0.0005, + "loss": 1.1143, + "step": 5060 + }, + { + "epoch": 0.26200196372280504, + "grad_norm": 0.7283949851989746, + "learning_rate": 0.0005, + "loss": 1.1094, + "step": 5070 + }, + { + "epoch": 0.2625187328820216, + "grad_norm": 0.6964280009269714, + "learning_rate": 0.0005, + "loss": 1.1332, + "step": 5080 + }, + { + "epoch": 0.26303550204123816, + "grad_norm": 0.7906248569488525, + "learning_rate": 0.0005, + "loss": 1.1242, + "step": 5090 + }, + { + "epoch": 0.26355227120045477, + "grad_norm": 0.7149584889411926, + "learning_rate": 0.0005, + "loss": 1.1215, + "step": 5100 + }, + { + "epoch": 0.2640690403596713, + "grad_norm": 0.6400547027587891, + "learning_rate": 0.0005, + "loss": 1.1319, + "step": 5110 + }, + { + "epoch": 0.26458580951888794, + "grad_norm": 0.6504139304161072, + "learning_rate": 0.0005, + "loss": 1.1145, + "step": 5120 + }, + { + "epoch": 0.2651025786781045, + "grad_norm": 0.724251389503479, + "learning_rate": 0.0005, + "loss": 1.1185, + "step": 5130 + }, + { + "epoch": 0.26561934783732105, + "grad_norm": 0.7142144441604614, + "learning_rate": 0.0005, + "loss": 1.1296, + "step": 5140 + }, + { + "epoch": 0.26613611699653766, + "grad_norm": 0.7482824325561523, + "learning_rate": 0.0005, + "loss": 1.1035, + "step": 5150 + }, + { + "epoch": 0.2666528861557542, + "grad_norm": 0.7604995369911194, + "learning_rate": 0.0005, + "loss": 1.113, + "step": 5160 + }, + { + "epoch": 0.2671696553149708, + "grad_norm": 0.7642651200294495, + "learning_rate": 0.0005, + "loss": 1.0964, + "step": 5170 + }, + { + "epoch": 0.2676864244741874, + "grad_norm": 0.9142786860466003, + "learning_rate": 0.0005, + "loss": 1.101, + "step": 5180 + }, + { + "epoch": 0.26820319363340395, + "grad_norm": 0.6688016057014465, + "learning_rate": 0.0005, + "loss": 1.1125, + "step": 5190 + }, + { + "epoch": 0.26871996279262056, + "grad_norm": 0.7352325916290283, + "learning_rate": 0.0005, + "loss": 1.1081, + "step": 5200 + }, + { + "epoch": 0.2692367319518371, + "grad_norm": 0.696356475353241, + "learning_rate": 0.0005, + "loss": 1.0972, + "step": 5210 + }, + { + "epoch": 0.2697535011110537, + "grad_norm": 0.6730584502220154, + "learning_rate": 0.0005, + "loss": 1.1173, + "step": 5220 + }, + { + "epoch": 0.2702702702702703, + "grad_norm": 0.6800664067268372, + "learning_rate": 0.0005, + "loss": 1.0942, + "step": 5230 + }, + { + "epoch": 0.27078703942948684, + "grad_norm": 0.6622713208198547, + "learning_rate": 0.0005, + "loss": 1.1297, + "step": 5240 + }, + { + "epoch": 0.2713038085887034, + "grad_norm": 0.7148898839950562, + "learning_rate": 0.0005, + "loss": 1.0997, + "step": 5250 + }, + { + "epoch": 0.27182057774792, + "grad_norm": 0.6884311437606812, + "learning_rate": 0.0005, + "loss": 1.1031, + "step": 5260 + }, + { + "epoch": 0.27233734690713657, + "grad_norm": 0.6427676677703857, + "learning_rate": 0.0005, + "loss": 1.1102, + "step": 5270 + }, + { + "epoch": 0.2728541160663532, + "grad_norm": 0.6422214508056641, + "learning_rate": 0.0005, + "loss": 1.1116, + "step": 5280 + }, + { + "epoch": 0.27337088522556974, + "grad_norm": 0.6933507919311523, + "learning_rate": 0.0005, + "loss": 1.1179, + "step": 5290 + }, + { + "epoch": 0.2738876543847863, + "grad_norm": 0.6655607223510742, + "learning_rate": 0.0005, + "loss": 1.0943, + "step": 5300 + }, + { + "epoch": 0.2744044235440029, + "grad_norm": 0.7125523686408997, + "learning_rate": 0.0005, + "loss": 1.1065, + "step": 5310 + }, + { + "epoch": 0.27492119270321946, + "grad_norm": 0.8208178281784058, + "learning_rate": 0.0005, + "loss": 1.1193, + "step": 5320 + }, + { + "epoch": 0.2754379618624361, + "grad_norm": 0.715416669845581, + "learning_rate": 0.0005, + "loss": 1.1064, + "step": 5330 + }, + { + "epoch": 0.27595473102165263, + "grad_norm": 0.7992897629737854, + "learning_rate": 0.0005, + "loss": 1.1008, + "step": 5340 + }, + { + "epoch": 0.2764715001808692, + "grad_norm": 0.6610242128372192, + "learning_rate": 0.0005, + "loss": 1.1132, + "step": 5350 + }, + { + "epoch": 0.2769882693400858, + "grad_norm": 0.7205715775489807, + "learning_rate": 0.0005, + "loss": 1.0994, + "step": 5360 + }, + { + "epoch": 0.27750503849930236, + "grad_norm": 0.6824073791503906, + "learning_rate": 0.0005, + "loss": 1.0882, + "step": 5370 + }, + { + "epoch": 0.2780218076585189, + "grad_norm": 0.7015029191970825, + "learning_rate": 0.0005, + "loss": 1.1182, + "step": 5380 + }, + { + "epoch": 0.27853857681773553, + "grad_norm": 0.6447197794914246, + "learning_rate": 0.0005, + "loss": 1.105, + "step": 5390 + }, + { + "epoch": 0.2790553459769521, + "grad_norm": 0.7455316781997681, + "learning_rate": 0.0005, + "loss": 1.1069, + "step": 5400 + }, + { + "epoch": 0.2795721151361687, + "grad_norm": 0.8284129500389099, + "learning_rate": 0.0005, + "loss": 1.108, + "step": 5410 + }, + { + "epoch": 0.28008888429538525, + "grad_norm": 0.6697763204574585, + "learning_rate": 0.0005, + "loss": 1.1079, + "step": 5420 + }, + { + "epoch": 0.2806056534546018, + "grad_norm": 0.6729034781455994, + "learning_rate": 0.0005, + "loss": 1.1004, + "step": 5430 + }, + { + "epoch": 0.2811224226138184, + "grad_norm": 0.6567364931106567, + "learning_rate": 0.0005, + "loss": 1.0876, + "step": 5440 + }, + { + "epoch": 0.281639191773035, + "grad_norm": 0.6983076333999634, + "learning_rate": 0.0005, + "loss": 1.0979, + "step": 5450 + }, + { + "epoch": 0.28215596093225154, + "grad_norm": 0.6503905057907104, + "learning_rate": 0.0005, + "loss": 1.0884, + "step": 5460 + }, + { + "epoch": 0.28267273009146815, + "grad_norm": 0.6191208362579346, + "learning_rate": 0.0005, + "loss": 1.1057, + "step": 5470 + }, + { + "epoch": 0.2831894992506847, + "grad_norm": 0.7421597838401794, + "learning_rate": 0.0005, + "loss": 1.0992, + "step": 5480 + }, + { + "epoch": 0.2837062684099013, + "grad_norm": 0.6919003129005432, + "learning_rate": 0.0005, + "loss": 1.0961, + "step": 5490 + }, + { + "epoch": 0.2842230375691179, + "grad_norm": 0.6625383496284485, + "learning_rate": 0.0005, + "loss": 1.1108, + "step": 5500 + }, + { + "epoch": 0.28473980672833443, + "grad_norm": 0.6479719877243042, + "learning_rate": 0.0005, + "loss": 1.0969, + "step": 5510 + }, + { + "epoch": 0.28525657588755104, + "grad_norm": 0.765210747718811, + "learning_rate": 0.0005, + "loss": 1.0857, + "step": 5520 + }, + { + "epoch": 0.2857733450467676, + "grad_norm": 0.6934791803359985, + "learning_rate": 0.0005, + "loss": 1.0945, + "step": 5530 + }, + { + "epoch": 0.2862901142059842, + "grad_norm": 0.6789985299110413, + "learning_rate": 0.0005, + "loss": 1.1165, + "step": 5540 + }, + { + "epoch": 0.28680688336520077, + "grad_norm": 0.6476292014122009, + "learning_rate": 0.0005, + "loss": 1.0886, + "step": 5550 + }, + { + "epoch": 0.28732365252441733, + "grad_norm": 0.8015202283859253, + "learning_rate": 0.0005, + "loss": 1.09, + "step": 5560 + }, + { + "epoch": 0.28784042168363394, + "grad_norm": 0.8759499192237854, + "learning_rate": 0.0005, + "loss": 1.0962, + "step": 5570 + }, + { + "epoch": 0.2883571908428505, + "grad_norm": 0.6740782856941223, + "learning_rate": 0.0005, + "loss": 1.0803, + "step": 5580 + }, + { + "epoch": 0.28887396000206705, + "grad_norm": 0.6475633978843689, + "learning_rate": 0.0005, + "loss": 1.0825, + "step": 5590 + }, + { + "epoch": 0.28939072916128367, + "grad_norm": 0.7087163329124451, + "learning_rate": 0.0005, + "loss": 1.0982, + "step": 5600 + }, + { + "epoch": 0.2899074983205002, + "grad_norm": 0.6702967882156372, + "learning_rate": 0.0005, + "loss": 1.1146, + "step": 5610 + }, + { + "epoch": 0.29042426747971684, + "grad_norm": 0.6150313019752502, + "learning_rate": 0.0005, + "loss": 1.0919, + "step": 5620 + }, + { + "epoch": 0.2909410366389334, + "grad_norm": 0.6218642592430115, + "learning_rate": 0.0005, + "loss": 1.0874, + "step": 5630 + }, + { + "epoch": 0.29145780579814995, + "grad_norm": 0.670069694519043, + "learning_rate": 0.0005, + "loss": 1.0764, + "step": 5640 + }, + { + "epoch": 0.29197457495736656, + "grad_norm": 0.7384163737297058, + "learning_rate": 0.0005, + "loss": 1.0888, + "step": 5650 + }, + { + "epoch": 0.2924913441165831, + "grad_norm": 0.6525676250457764, + "learning_rate": 0.0005, + "loss": 1.0955, + "step": 5660 + }, + { + "epoch": 0.2930081132757997, + "grad_norm": 0.6424722075462341, + "learning_rate": 0.0005, + "loss": 1.077, + "step": 5670 + }, + { + "epoch": 0.2935248824350163, + "grad_norm": 0.6522981524467468, + "learning_rate": 0.0005, + "loss": 1.0996, + "step": 5680 + }, + { + "epoch": 0.29404165159423284, + "grad_norm": 0.686553955078125, + "learning_rate": 0.0005, + "loss": 1.0776, + "step": 5690 + }, + { + "epoch": 0.29455842075344946, + "grad_norm": 0.6501746773719788, + "learning_rate": 0.0005, + "loss": 1.09, + "step": 5700 + }, + { + "epoch": 0.295075189912666, + "grad_norm": 0.661805272102356, + "learning_rate": 0.0005, + "loss": 1.0987, + "step": 5710 + }, + { + "epoch": 0.29559195907188257, + "grad_norm": 0.6171291470527649, + "learning_rate": 0.0005, + "loss": 1.0896, + "step": 5720 + }, + { + "epoch": 0.2961087282310992, + "grad_norm": 0.6660189032554626, + "learning_rate": 0.0005, + "loss": 1.0795, + "step": 5730 + }, + { + "epoch": 0.29662549739031574, + "grad_norm": 0.7182852625846863, + "learning_rate": 0.0005, + "loss": 1.0888, + "step": 5740 + }, + { + "epoch": 0.2971422665495323, + "grad_norm": 0.6748793125152588, + "learning_rate": 0.0005, + "loss": 1.1066, + "step": 5750 + }, + { + "epoch": 0.2976590357087489, + "grad_norm": 0.9658355712890625, + "learning_rate": 0.0005, + "loss": 1.0788, + "step": 5760 + }, + { + "epoch": 0.29817580486796547, + "grad_norm": 0.7361212968826294, + "learning_rate": 0.0005, + "loss": 1.0963, + "step": 5770 + }, + { + "epoch": 0.2986925740271821, + "grad_norm": 0.6640811562538147, + "learning_rate": 0.0005, + "loss": 1.0872, + "step": 5780 + }, + { + "epoch": 0.29920934318639864, + "grad_norm": 0.6937102675437927, + "learning_rate": 0.0005, + "loss": 1.0777, + "step": 5790 + }, + { + "epoch": 0.2997261123456152, + "grad_norm": 0.7803467512130737, + "learning_rate": 0.0005, + "loss": 1.0986, + "step": 5800 + }, + { + "epoch": 0.3002428815048318, + "grad_norm": 0.8593279719352722, + "learning_rate": 0.0005, + "loss": 1.0796, + "step": 5810 + }, + { + "epoch": 0.30075965066404836, + "grad_norm": 0.6236810088157654, + "learning_rate": 0.0005, + "loss": 1.0932, + "step": 5820 + }, + { + "epoch": 0.301276419823265, + "grad_norm": 0.6399732828140259, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 5830 + }, + { + "epoch": 0.30179318898248153, + "grad_norm": 0.6762784123420715, + "learning_rate": 0.0005, + "loss": 1.0763, + "step": 5840 + }, + { + "epoch": 0.3023099581416981, + "grad_norm": 0.7428263425827026, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 5850 + }, + { + "epoch": 0.3028267273009147, + "grad_norm": 0.6435476541519165, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 5860 + }, + { + "epoch": 0.30334349646013126, + "grad_norm": 0.6325916647911072, + "learning_rate": 0.0005, + "loss": 1.0858, + "step": 5870 + }, + { + "epoch": 0.3038602656193478, + "grad_norm": 0.6759895086288452, + "learning_rate": 0.0005, + "loss": 1.082, + "step": 5880 + }, + { + "epoch": 0.3043770347785644, + "grad_norm": 0.705319881439209, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 5890 + }, + { + "epoch": 0.304893803937781, + "grad_norm": 0.6924307346343994, + "learning_rate": 0.0005, + "loss": 1.0756, + "step": 5900 + }, + { + "epoch": 0.3054105730969976, + "grad_norm": 0.6262795925140381, + "learning_rate": 0.0005, + "loss": 1.0875, + "step": 5910 + }, + { + "epoch": 0.30592734225621415, + "grad_norm": 0.6304033398628235, + "learning_rate": 0.0005, + "loss": 1.0889, + "step": 5920 + }, + { + "epoch": 0.3064441114154307, + "grad_norm": 0.6266285181045532, + "learning_rate": 0.0005, + "loss": 1.0734, + "step": 5930 + }, + { + "epoch": 0.3069608805746473, + "grad_norm": 0.66020268201828, + "learning_rate": 0.0005, + "loss": 1.0756, + "step": 5940 + }, + { + "epoch": 0.3074776497338639, + "grad_norm": 0.6455373764038086, + "learning_rate": 0.0005, + "loss": 1.088, + "step": 5950 + }, + { + "epoch": 0.30799441889308043, + "grad_norm": 0.6743224263191223, + "learning_rate": 0.0005, + "loss": 1.0777, + "step": 5960 + }, + { + "epoch": 0.30851118805229705, + "grad_norm": 0.6214370131492615, + "learning_rate": 0.0005, + "loss": 1.069, + "step": 5970 + }, + { + "epoch": 0.3090279572115136, + "grad_norm": 0.6882118582725525, + "learning_rate": 0.0005, + "loss": 1.0713, + "step": 5980 + }, + { + "epoch": 0.3095447263707302, + "grad_norm": 0.6656840443611145, + "learning_rate": 0.0005, + "loss": 1.0783, + "step": 5990 + }, + { + "epoch": 0.3100614955299468, + "grad_norm": 0.7134031653404236, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 6000 + }, + { + "epoch": 0.31057826468916333, + "grad_norm": 0.7211028933525085, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 6010 + }, + { + "epoch": 0.31109503384837994, + "grad_norm": 0.6382066607475281, + "learning_rate": 0.0005, + "loss": 1.0771, + "step": 6020 + }, + { + "epoch": 0.3116118030075965, + "grad_norm": 0.7246118187904358, + "learning_rate": 0.0005, + "loss": 1.0877, + "step": 6030 + }, + { + "epoch": 0.3121285721668131, + "grad_norm": 0.6753916144371033, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 6040 + }, + { + "epoch": 0.31264534132602967, + "grad_norm": 0.6585648655891418, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6050 + }, + { + "epoch": 0.3131621104852462, + "grad_norm": 0.6378208994865417, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 6060 + }, + { + "epoch": 0.31367887964446284, + "grad_norm": 0.6496950387954712, + "learning_rate": 0.0005, + "loss": 1.0743, + "step": 6070 + }, + { + "epoch": 0.3141956488036794, + "grad_norm": 0.6112158298492432, + "learning_rate": 0.0005, + "loss": 1.076, + "step": 6080 + }, + { + "epoch": 0.31471241796289595, + "grad_norm": 0.6267996430397034, + "learning_rate": 0.0005, + "loss": 1.0882, + "step": 6090 + }, + { + "epoch": 0.31522918712211256, + "grad_norm": 0.6258119940757751, + "learning_rate": 0.0005, + "loss": 1.0747, + "step": 6100 + }, + { + "epoch": 0.3157459562813291, + "grad_norm": 0.6293036341667175, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 6110 + }, + { + "epoch": 0.31626272544054573, + "grad_norm": 0.6443596482276917, + "learning_rate": 0.0005, + "loss": 1.0898, + "step": 6120 + }, + { + "epoch": 0.3167794945997623, + "grad_norm": 0.6488006711006165, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 6130 + }, + { + "epoch": 0.31729626375897885, + "grad_norm": 0.6419286131858826, + "learning_rate": 0.0005, + "loss": 1.0755, + "step": 6140 + }, + { + "epoch": 0.31781303291819546, + "grad_norm": 0.6659611463546753, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 6150 + }, + { + "epoch": 0.318329802077412, + "grad_norm": 0.6645331382751465, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 6160 + }, + { + "epoch": 0.3188465712366286, + "grad_norm": 0.7420417070388794, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 6170 + }, + { + "epoch": 0.3193633403958452, + "grad_norm": 0.6399688720703125, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 6180 + }, + { + "epoch": 0.31988010955506174, + "grad_norm": 0.6128381490707397, + "learning_rate": 0.0005, + "loss": 1.0692, + "step": 6190 + }, + { + "epoch": 0.32039687871427835, + "grad_norm": 0.6373854279518127, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 6200 + }, + { + "epoch": 0.3209136478734949, + "grad_norm": 0.8587968349456787, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 6210 + }, + { + "epoch": 0.32143041703271147, + "grad_norm": 0.6043888926506042, + "learning_rate": 0.0005, + "loss": 1.0748, + "step": 6220 + }, + { + "epoch": 0.3219471861919281, + "grad_norm": 0.6279845237731934, + "learning_rate": 0.0005, + "loss": 1.0746, + "step": 6230 + }, + { + "epoch": 0.32246395535114464, + "grad_norm": 0.6751164793968201, + "learning_rate": 0.0005, + "loss": 1.0715, + "step": 6240 + }, + { + "epoch": 0.3229807245103612, + "grad_norm": 0.5915717482566833, + "learning_rate": 0.0005, + "loss": 1.0705, + "step": 6250 + }, + { + "epoch": 0.3234974936695778, + "grad_norm": 0.6816694140434265, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 6260 + }, + { + "epoch": 0.32401426282879436, + "grad_norm": 0.7093113660812378, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 6270 + }, + { + "epoch": 0.324531031988011, + "grad_norm": 0.6673592925071716, + "learning_rate": 0.0005, + "loss": 1.0767, + "step": 6280 + }, + { + "epoch": 0.32504780114722753, + "grad_norm": 0.5884393453598022, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 6290 + }, + { + "epoch": 0.3255645703064441, + "grad_norm": 0.6808472871780396, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 6300 + }, + { + "epoch": 0.3260813394656607, + "grad_norm": 0.6658387184143066, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 6310 + }, + { + "epoch": 0.32659810862487726, + "grad_norm": 0.6469089388847351, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 6320 + }, + { + "epoch": 0.32711487778409387, + "grad_norm": 0.6215671896934509, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 6330 + }, + { + "epoch": 0.32763164694331043, + "grad_norm": 0.6409225463867188, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 6340 + }, + { + "epoch": 0.328148416102527, + "grad_norm": 0.6427381038665771, + "learning_rate": 0.0005, + "loss": 1.0696, + "step": 6350 + }, + { + "epoch": 0.3286651852617436, + "grad_norm": 0.5856565833091736, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 6360 + }, + { + "epoch": 0.32918195442096015, + "grad_norm": 0.6217045187950134, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 6370 + }, + { + "epoch": 0.3296987235801767, + "grad_norm": 0.7256447672843933, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 6380 + }, + { + "epoch": 0.3302154927393933, + "grad_norm": 0.6222741007804871, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 6390 + }, + { + "epoch": 0.3307322618986099, + "grad_norm": 0.6448323726654053, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6400 + }, + { + "epoch": 0.3312490310578265, + "grad_norm": 0.6215245723724365, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 6410 + }, + { + "epoch": 0.33176580021704305, + "grad_norm": 0.6422061920166016, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 6420 + }, + { + "epoch": 0.3322825693762596, + "grad_norm": 0.8208865523338318, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 6430 + }, + { + "epoch": 0.3327993385354762, + "grad_norm": 1.1319376230239868, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 6440 + }, + { + "epoch": 0.3333161076946928, + "grad_norm": 0.63709956407547, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 6450 + }, + { + "epoch": 0.33383287685390933, + "grad_norm": 0.6338751912117004, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 6460 + }, + { + "epoch": 0.33434964601312595, + "grad_norm": 0.6264437437057495, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6470 + }, + { + "epoch": 0.3348664151723425, + "grad_norm": 0.6507226228713989, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 6480 + }, + { + "epoch": 0.3353831843315591, + "grad_norm": 0.6316462755203247, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 6490 + }, + { + "epoch": 0.33589995349077567, + "grad_norm": 0.8337516188621521, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 6500 + }, + { + "epoch": 0.33641672264999223, + "grad_norm": 0.9597588777542114, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 6510 + }, + { + "epoch": 0.33693349180920884, + "grad_norm": 0.6857469081878662, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 6520 + }, + { + "epoch": 0.3374502609684254, + "grad_norm": 0.6196707487106323, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 6530 + }, + { + "epoch": 0.337967030127642, + "grad_norm": 0.6072001457214355, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 6540 + }, + { + "epoch": 0.33848379928685857, + "grad_norm": 0.6677159667015076, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 6550 + }, + { + "epoch": 0.3390005684460751, + "grad_norm": 0.6435421109199524, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 6560 + }, + { + "epoch": 0.33951733760529174, + "grad_norm": 0.5953618288040161, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 6570 + }, + { + "epoch": 0.3400341067645083, + "grad_norm": 0.6292535066604614, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 6580 + }, + { + "epoch": 0.34055087592372485, + "grad_norm": 0.7501185536384583, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 6590 + }, + { + "epoch": 0.34106764508294146, + "grad_norm": 0.58536696434021, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 6600 + }, + { + "epoch": 0.341584414242158, + "grad_norm": 0.6455935835838318, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 6610 + }, + { + "epoch": 0.34210118340137463, + "grad_norm": 0.6323394179344177, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 6620 + }, + { + "epoch": 0.3426179525605912, + "grad_norm": 0.6140257120132446, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 6630 + }, + { + "epoch": 0.34313472171980774, + "grad_norm": 0.6486880779266357, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 6640 + }, + { + "epoch": 0.34365149087902436, + "grad_norm": 0.6136801838874817, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 6650 + }, + { + "epoch": 0.3441682600382409, + "grad_norm": 0.78439861536026, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6660 + }, + { + "epoch": 0.34468502919745747, + "grad_norm": 0.6717984080314636, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 6670 + }, + { + "epoch": 0.3452017983566741, + "grad_norm": 0.632985532283783, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 6680 + }, + { + "epoch": 0.34571856751589064, + "grad_norm": 0.6086390018463135, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 6690 + }, + { + "epoch": 0.34623533667510725, + "grad_norm": 0.7206865549087524, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 6700 + }, + { + "epoch": 0.3467521058343238, + "grad_norm": 0.6115614771842957, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 6710 + }, + { + "epoch": 0.34726887499354037, + "grad_norm": 0.6737103462219238, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 6720 + }, + { + "epoch": 0.347785644152757, + "grad_norm": 0.632331132888794, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 6730 + }, + { + "epoch": 0.34830241331197354, + "grad_norm": 0.7133494019508362, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 6740 + }, + { + "epoch": 0.3488191824711901, + "grad_norm": 0.5726544260978699, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 6750 + }, + { + "epoch": 0.3493359516304067, + "grad_norm": 0.719832181930542, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 6760 + }, + { + "epoch": 0.34985272078962326, + "grad_norm": 0.7709729671478271, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 6770 + }, + { + "epoch": 0.3503694899488399, + "grad_norm": 0.6043444275856018, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 6780 + }, + { + "epoch": 0.35088625910805643, + "grad_norm": 0.5770915746688843, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 6790 + }, + { + "epoch": 0.351403028267273, + "grad_norm": 0.6332295536994934, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 6800 + }, + { + "epoch": 0.3519197974264896, + "grad_norm": 0.6505199670791626, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 6810 + }, + { + "epoch": 0.35243656658570616, + "grad_norm": 0.6215615272521973, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6820 + }, + { + "epoch": 0.35295333574492277, + "grad_norm": 0.6917248368263245, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 6830 + }, + { + "epoch": 0.3534701049041393, + "grad_norm": 0.6240680813789368, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 6840 + }, + { + "epoch": 0.3539868740633559, + "grad_norm": 0.6082044243812561, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 6850 + }, + { + "epoch": 0.3545036432225725, + "grad_norm": 0.6314426064491272, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 6860 + }, + { + "epoch": 0.35502041238178905, + "grad_norm": 0.6714574694633484, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 6870 + }, + { + "epoch": 0.3555371815410056, + "grad_norm": 0.6438120603561401, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 6880 + }, + { + "epoch": 0.3560539507002222, + "grad_norm": 0.7354781031608582, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 6890 + }, + { + "epoch": 0.3565707198594388, + "grad_norm": 0.6491745114326477, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 6900 + }, + { + "epoch": 0.3570874890186554, + "grad_norm": 0.5888579487800598, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 6910 + }, + { + "epoch": 0.35760425817787195, + "grad_norm": 0.6474457383155823, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 6920 + }, + { + "epoch": 0.3581210273370885, + "grad_norm": 0.6235959529876709, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 6930 + }, + { + "epoch": 0.3586377964963051, + "grad_norm": 0.6418899297714233, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 6940 + }, + { + "epoch": 0.3591545656555217, + "grad_norm": 0.671491801738739, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 6950 + }, + { + "epoch": 0.35967133481473823, + "grad_norm": 0.6662471890449524, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 6960 + }, + { + "epoch": 0.36018810397395484, + "grad_norm": 0.6041388511657715, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 6970 + }, + { + "epoch": 0.3607048731331714, + "grad_norm": 0.6014126539230347, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 6980 + }, + { + "epoch": 0.361221642292388, + "grad_norm": 0.611056387424469, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 6990 + }, + { + "epoch": 0.36173841145160457, + "grad_norm": 0.605475127696991, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 7000 + }, + { + "epoch": 0.3622551806108211, + "grad_norm": 0.5799763798713684, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 7010 + }, + { + "epoch": 0.36277194977003774, + "grad_norm": 0.5857988595962524, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 7020 + }, + { + "epoch": 0.3632887189292543, + "grad_norm": 0.6305558085441589, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 7030 + }, + { + "epoch": 0.3638054880884709, + "grad_norm": 0.5987147688865662, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 7040 + }, + { + "epoch": 0.36432225724768746, + "grad_norm": 0.5889327526092529, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 7050 + }, + { + "epoch": 0.364839026406904, + "grad_norm": 0.5972746014595032, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 7060 + }, + { + "epoch": 0.36535579556612063, + "grad_norm": 0.6437240839004517, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 7070 + }, + { + "epoch": 0.3658725647253372, + "grad_norm": 0.6240195631980896, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 7080 + }, + { + "epoch": 0.36638933388455375, + "grad_norm": 0.6170317530632019, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 7090 + }, + { + "epoch": 0.36690610304377036, + "grad_norm": 0.661592423915863, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 7100 + }, + { + "epoch": 0.3674228722029869, + "grad_norm": 0.6611010432243347, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 7110 + }, + { + "epoch": 0.36793964136220353, + "grad_norm": 0.6037949323654175, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 7120 + }, + { + "epoch": 0.3684564105214201, + "grad_norm": 0.6260375380516052, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 7130 + }, + { + "epoch": 0.36897317968063664, + "grad_norm": 0.7400781512260437, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 7140 + }, + { + "epoch": 0.36948994883985325, + "grad_norm": 0.8144364356994629, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 7150 + }, + { + "epoch": 0.3700067179990698, + "grad_norm": 0.6299716830253601, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 7160 + }, + { + "epoch": 0.37052348715828637, + "grad_norm": 0.605995774269104, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 7170 + }, + { + "epoch": 0.371040256317503, + "grad_norm": 0.5977038145065308, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 7180 + }, + { + "epoch": 0.37155702547671954, + "grad_norm": 0.6418441534042358, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 7190 + }, + { + "epoch": 0.37207379463593615, + "grad_norm": 0.6550008654594421, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 7200 + }, + { + "epoch": 0.3725905637951527, + "grad_norm": 0.6023372411727905, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 7210 + }, + { + "epoch": 0.37310733295436926, + "grad_norm": 0.6071696877479553, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 7220 + }, + { + "epoch": 0.3736241021135859, + "grad_norm": 0.6096029877662659, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 7230 + }, + { + "epoch": 0.37414087127280243, + "grad_norm": 0.5897752642631531, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 7240 + }, + { + "epoch": 0.374657640432019, + "grad_norm": 0.6093285083770752, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 7250 + }, + { + "epoch": 0.3751744095912356, + "grad_norm": 0.6444416046142578, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 7260 + }, + { + "epoch": 0.37569117875045216, + "grad_norm": 0.6363521814346313, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 7270 + }, + { + "epoch": 0.37620794790966877, + "grad_norm": 0.6633175611495972, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 7280 + }, + { + "epoch": 0.37672471706888533, + "grad_norm": 0.5611307621002197, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 7290 + }, + { + "epoch": 0.3772414862281019, + "grad_norm": 0.5733465552330017, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 7300 + }, + { + "epoch": 0.3777582553873185, + "grad_norm": 0.6538148522377014, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 7310 + }, + { + "epoch": 0.37827502454653505, + "grad_norm": 0.6904069781303406, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 7320 + }, + { + "epoch": 0.37879179370575167, + "grad_norm": 0.6486346125602722, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 7330 + }, + { + "epoch": 0.3793085628649682, + "grad_norm": 0.5600974559783936, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 7340 + }, + { + "epoch": 0.3798253320241848, + "grad_norm": 0.5800735354423523, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 7350 + }, + { + "epoch": 0.3803421011834014, + "grad_norm": 0.6365842819213867, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 7360 + }, + { + "epoch": 0.38085887034261795, + "grad_norm": 0.6074081659317017, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 7370 + }, + { + "epoch": 0.3813756395018345, + "grad_norm": 0.5998241901397705, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 7380 + }, + { + "epoch": 0.3818924086610511, + "grad_norm": 0.6576969623565674, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 7390 + }, + { + "epoch": 0.3824091778202677, + "grad_norm": 0.6602439284324646, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 7400 + }, + { + "epoch": 0.3829259469794843, + "grad_norm": 0.6058171987533569, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 7410 + }, + { + "epoch": 0.38344271613870085, + "grad_norm": 0.7188865542411804, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 7420 + }, + { + "epoch": 0.3839594852979174, + "grad_norm": 0.6025785803794861, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 7430 + }, + { + "epoch": 0.384476254457134, + "grad_norm": 0.6643381118774414, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 7440 + }, + { + "epoch": 0.38499302361635057, + "grad_norm": 0.6015246510505676, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 7450 + }, + { + "epoch": 0.38550979277556713, + "grad_norm": 0.6102477312088013, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 7460 + }, + { + "epoch": 0.38602656193478374, + "grad_norm": 0.6054964661598206, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 7470 + }, + { + "epoch": 0.3865433310940003, + "grad_norm": 0.5937122106552124, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 7480 + }, + { + "epoch": 0.3870601002532169, + "grad_norm": 0.5697932839393616, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 7490 + }, + { + "epoch": 0.38757686941243347, + "grad_norm": 0.6040372848510742, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 7500 + }, + { + "epoch": 0.38809363857165, + "grad_norm": 0.666986346244812, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 7510 + }, + { + "epoch": 0.38861040773086664, + "grad_norm": 0.5957795977592468, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 7520 + }, + { + "epoch": 0.3891271768900832, + "grad_norm": 0.7224922776222229, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 7530 + }, + { + "epoch": 0.3896439460492998, + "grad_norm": 0.6356753706932068, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 7540 + }, + { + "epoch": 0.39016071520851636, + "grad_norm": 0.6179920434951782, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 7550 + }, + { + "epoch": 0.3906774843677329, + "grad_norm": 0.7617205381393433, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 7560 + }, + { + "epoch": 0.39119425352694953, + "grad_norm": 0.6080652475357056, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 7570 + }, + { + "epoch": 0.3917110226861661, + "grad_norm": 0.6190568804740906, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 7580 + }, + { + "epoch": 0.39222779184538265, + "grad_norm": 0.584118127822876, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 7590 + }, + { + "epoch": 0.39274456100459926, + "grad_norm": 0.5745325088500977, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 7600 + }, + { + "epoch": 0.3932613301638158, + "grad_norm": 0.586669385433197, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 7610 + }, + { + "epoch": 0.3937780993230324, + "grad_norm": 0.6320251822471619, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 7620 + }, + { + "epoch": 0.394294868482249, + "grad_norm": 0.6066457033157349, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 7630 + }, + { + "epoch": 0.39481163764146554, + "grad_norm": 0.5506545305252075, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 7640 + }, + { + "epoch": 0.39532840680068215, + "grad_norm": 0.6136749982833862, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 7650 + }, + { + "epoch": 0.3958451759598987, + "grad_norm": 0.7134038805961609, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 7660 + }, + { + "epoch": 0.39636194511911527, + "grad_norm": 0.6053097248077393, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 7670 + }, + { + "epoch": 0.3968787142783319, + "grad_norm": 0.5632675290107727, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 7680 + }, + { + "epoch": 0.39739548343754844, + "grad_norm": 0.6165273189544678, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 7690 + }, + { + "epoch": 0.39791225259676505, + "grad_norm": 0.6279580593109131, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 7700 + }, + { + "epoch": 0.3984290217559816, + "grad_norm": 0.6073136329650879, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 7710 + }, + { + "epoch": 0.39894579091519816, + "grad_norm": 0.5953530073165894, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 7720 + }, + { + "epoch": 0.3994625600744148, + "grad_norm": 0.5744448900222778, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 7730 + }, + { + "epoch": 0.39997932923363133, + "grad_norm": 0.5618404746055603, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 7740 + }, + { + "epoch": 0.4004960983928479, + "grad_norm": 0.567597508430481, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 7750 + }, + { + "epoch": 0.4010128675520645, + "grad_norm": 0.5764487981796265, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 7760 + }, + { + "epoch": 0.40152963671128106, + "grad_norm": 0.6651884913444519, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 7770 + }, + { + "epoch": 0.40204640587049767, + "grad_norm": 0.7175072431564331, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 7780 + }, + { + "epoch": 0.4025631750297142, + "grad_norm": 0.591261625289917, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 7790 + }, + { + "epoch": 0.4030799441889308, + "grad_norm": 0.5823299884796143, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 7800 + }, + { + "epoch": 0.4035967133481474, + "grad_norm": 0.5339162945747375, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 7810 + }, + { + "epoch": 0.40411348250736395, + "grad_norm": 0.6042317748069763, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 7820 + }, + { + "epoch": 0.40463025166658056, + "grad_norm": 0.6178877353668213, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 7830 + }, + { + "epoch": 0.4051470208257971, + "grad_norm": 0.6470639705657959, + "learning_rate": 0.0005, + "loss": 0.9899, + "step": 7840 + }, + { + "epoch": 0.4056637899850137, + "grad_norm": 0.5468031167984009, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 7850 + }, + { + "epoch": 0.4061805591442303, + "grad_norm": 0.566137433052063, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 7860 + }, + { + "epoch": 0.40669732830344685, + "grad_norm": 0.6000310182571411, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 7870 + }, + { + "epoch": 0.4072140974626634, + "grad_norm": 0.5763528943061829, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 7880 + }, + { + "epoch": 0.40773086662188, + "grad_norm": 0.5767903327941895, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 7890 + }, + { + "epoch": 0.4082476357810966, + "grad_norm": 0.6295961737632751, + "learning_rate": 0.0005, + "loss": 0.9885, + "step": 7900 + }, + { + "epoch": 0.4087644049403132, + "grad_norm": 0.6416009068489075, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 7910 + }, + { + "epoch": 0.40928117409952974, + "grad_norm": 0.6039779186248779, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 7920 + }, + { + "epoch": 0.4097979432587463, + "grad_norm": 0.6459826827049255, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 7930 + }, + { + "epoch": 0.4103147124179629, + "grad_norm": 0.597352147102356, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 7940 + }, + { + "epoch": 0.41083148157717947, + "grad_norm": 0.5876639485359192, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 7950 + }, + { + "epoch": 0.411348250736396, + "grad_norm": 0.5862469673156738, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 7960 + }, + { + "epoch": 0.41186501989561264, + "grad_norm": 0.5829436779022217, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 7970 + }, + { + "epoch": 0.4123817890548292, + "grad_norm": 0.5912736058235168, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 7980 + }, + { + "epoch": 0.4128985582140458, + "grad_norm": 0.5810758471488953, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 7990 + }, + { + "epoch": 0.41341532737326236, + "grad_norm": 0.5771864056587219, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 8000 + }, + { + "epoch": 0.4139320965324789, + "grad_norm": 0.5928204655647278, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 8010 + }, + { + "epoch": 0.41444886569169553, + "grad_norm": 0.603636622428894, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 8020 + }, + { + "epoch": 0.4149656348509121, + "grad_norm": 0.5715627670288086, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 8030 + }, + { + "epoch": 0.4154824040101287, + "grad_norm": 0.5580553412437439, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 8040 + }, + { + "epoch": 0.41599917316934526, + "grad_norm": 0.5680859088897705, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 8050 + }, + { + "epoch": 0.4165159423285618, + "grad_norm": 0.5446572303771973, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 8060 + }, + { + "epoch": 0.41703271148777843, + "grad_norm": 0.5877604484558105, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 8070 + }, + { + "epoch": 0.417549480646995, + "grad_norm": 0.5905182361602783, + "learning_rate": 0.0005, + "loss": 0.9889, + "step": 8080 + }, + { + "epoch": 0.41806624980621154, + "grad_norm": 0.6025214195251465, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 8090 + }, + { + "epoch": 0.41858301896542816, + "grad_norm": 0.5762201547622681, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 8100 + }, + { + "epoch": 0.4190997881246447, + "grad_norm": 0.5564827919006348, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 8110 + }, + { + "epoch": 0.4196165572838613, + "grad_norm": 0.5517228841781616, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 8120 + }, + { + "epoch": 0.4201333264430779, + "grad_norm": 0.6581810712814331, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 8130 + }, + { + "epoch": 0.42065009560229444, + "grad_norm": 0.5902772545814514, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 8140 + }, + { + "epoch": 0.42116686476151105, + "grad_norm": 0.5903311967849731, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 8150 + }, + { + "epoch": 0.4216836339207276, + "grad_norm": 0.5883710980415344, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 8160 + }, + { + "epoch": 0.42220040307994416, + "grad_norm": 0.5694506764411926, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 8170 + }, + { + "epoch": 0.4227171722391608, + "grad_norm": 0.5448591113090515, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 8180 + }, + { + "epoch": 0.42323394139837733, + "grad_norm": 0.5763291120529175, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 8190 + }, + { + "epoch": 0.42375071055759395, + "grad_norm": 0.5763616561889648, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 8200 + }, + { + "epoch": 0.4242674797168105, + "grad_norm": 0.5575286149978638, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 8210 + }, + { + "epoch": 0.42478424887602706, + "grad_norm": 0.5435507297515869, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 8220 + }, + { + "epoch": 0.42530101803524367, + "grad_norm": 0.6307750344276428, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 8230 + }, + { + "epoch": 0.42581778719446023, + "grad_norm": 0.5419248342514038, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 8240 + }, + { + "epoch": 0.4263345563536768, + "grad_norm": 0.5558311343193054, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 8250 + }, + { + "epoch": 0.4268513255128934, + "grad_norm": 0.5593147277832031, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 8260 + }, + { + "epoch": 0.42736809467210995, + "grad_norm": 0.5839881896972656, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 8270 + }, + { + "epoch": 0.42788486383132657, + "grad_norm": 0.5981064438819885, + "learning_rate": 0.0005, + "loss": 0.9952, + "step": 8280 + }, + { + "epoch": 0.4284016329905431, + "grad_norm": 0.6945583820343018, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 8290 + }, + { + "epoch": 0.4289184021497597, + "grad_norm": 0.5536506772041321, + "learning_rate": 0.0005, + "loss": 0.9929, + "step": 8300 + }, + { + "epoch": 0.4294351713089763, + "grad_norm": 0.557338297367096, + "learning_rate": 0.0005, + "loss": 0.9833, + "step": 8310 + }, + { + "epoch": 0.42995194046819285, + "grad_norm": 0.5480133295059204, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 8320 + }, + { + "epoch": 0.43046870962740946, + "grad_norm": 0.5495566129684448, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 8330 + }, + { + "epoch": 0.430985478786626, + "grad_norm": 0.5759509801864624, + "learning_rate": 0.0005, + "loss": 0.9866, + "step": 8340 + }, + { + "epoch": 0.4315022479458426, + "grad_norm": 0.5602892637252808, + "learning_rate": 0.0005, + "loss": 0.9893, + "step": 8350 + }, + { + "epoch": 0.4320190171050592, + "grad_norm": 0.560892641544342, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 8360 + }, + { + "epoch": 0.43253578626427575, + "grad_norm": 0.582815408706665, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 8370 + }, + { + "epoch": 0.4330525554234923, + "grad_norm": 0.6133496165275574, + "learning_rate": 0.0005, + "loss": 0.9928, + "step": 8380 + }, + { + "epoch": 0.4335693245827089, + "grad_norm": 0.5611013174057007, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 8390 + }, + { + "epoch": 0.43408609374192547, + "grad_norm": 0.5589267611503601, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 8400 + }, + { + "epoch": 0.4346028629011421, + "grad_norm": 0.5508078932762146, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 8410 + }, + { + "epoch": 0.43511963206035864, + "grad_norm": 0.5803013443946838, + "learning_rate": 0.0005, + "loss": 0.9891, + "step": 8420 + }, + { + "epoch": 0.4356364012195752, + "grad_norm": 0.532085120677948, + "learning_rate": 0.0005, + "loss": 0.9935, + "step": 8430 + }, + { + "epoch": 0.4361531703787918, + "grad_norm": 0.6158758401870728, + "learning_rate": 0.0005, + "loss": 0.9927, + "step": 8440 + }, + { + "epoch": 0.43666993953800837, + "grad_norm": 0.5444722771644592, + "learning_rate": 0.0005, + "loss": 0.9754, + "step": 8450 + }, + { + "epoch": 0.4371867086972249, + "grad_norm": 0.5872038006782532, + "learning_rate": 0.0005, + "loss": 0.9803, + "step": 8460 + }, + { + "epoch": 0.43770347785644154, + "grad_norm": 0.5382379293441772, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 8470 + }, + { + "epoch": 0.4382202470156581, + "grad_norm": 0.5538324117660522, + "learning_rate": 0.0005, + "loss": 0.9764, + "step": 8480 + }, + { + "epoch": 0.4387370161748747, + "grad_norm": 0.5917341709136963, + "learning_rate": 0.0005, + "loss": 0.9732, + "step": 8490 + }, + { + "epoch": 0.43925378533409126, + "grad_norm": 0.5395458340644836, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 8500 + }, + { + "epoch": 0.4397705544933078, + "grad_norm": 0.5973149538040161, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 8510 + }, + { + "epoch": 0.44028732365252443, + "grad_norm": 0.579712450504303, + "learning_rate": 0.0005, + "loss": 0.9836, + "step": 8520 + }, + { + "epoch": 0.440804092811741, + "grad_norm": 0.5590643882751465, + "learning_rate": 0.0005, + "loss": 0.9896, + "step": 8530 + }, + { + "epoch": 0.4413208619709576, + "grad_norm": 0.5443204045295715, + "learning_rate": 0.0005, + "loss": 0.9868, + "step": 8540 + }, + { + "epoch": 0.44183763113017416, + "grad_norm": 0.5973614454269409, + "learning_rate": 0.0005, + "loss": 0.9881, + "step": 8550 + }, + { + "epoch": 0.4423544002893907, + "grad_norm": 0.6157576441764832, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 8560 + }, + { + "epoch": 0.4428711694486073, + "grad_norm": 0.5678598880767822, + "learning_rate": 0.0005, + "loss": 0.9878, + "step": 8570 + }, + { + "epoch": 0.4433879386078239, + "grad_norm": 0.5606565475463867, + "learning_rate": 0.0005, + "loss": 0.9899, + "step": 8580 + }, + { + "epoch": 0.44390470776704044, + "grad_norm": 0.651261031627655, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 8590 + }, + { + "epoch": 0.44442147692625705, + "grad_norm": 0.6717237830162048, + "learning_rate": 0.0005, + "loss": 0.9783, + "step": 8600 + }, + { + "epoch": 0.4449382460854736, + "grad_norm": 0.5981956720352173, + "learning_rate": 0.0005, + "loss": 0.9846, + "step": 8610 + }, + { + "epoch": 0.4454550152446902, + "grad_norm": 0.6338360905647278, + "learning_rate": 0.0005, + "loss": 0.9903, + "step": 8620 + }, + { + "epoch": 0.4459717844039068, + "grad_norm": 0.6431187987327576, + "learning_rate": 0.0005, + "loss": 0.9967, + "step": 8630 + }, + { + "epoch": 0.44648855356312334, + "grad_norm": 0.6032900810241699, + "learning_rate": 0.0005, + "loss": 0.9814, + "step": 8640 + }, + { + "epoch": 0.44700532272233995, + "grad_norm": 0.5607067942619324, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 8650 + }, + { + "epoch": 0.4475220918815565, + "grad_norm": 0.5442407727241516, + "learning_rate": 0.0005, + "loss": 0.9911, + "step": 8660 + }, + { + "epoch": 0.44803886104077306, + "grad_norm": 0.5274026989936829, + "learning_rate": 0.0005, + "loss": 0.9798, + "step": 8670 + }, + { + "epoch": 0.4485556301999897, + "grad_norm": 0.5678251385688782, + "learning_rate": 0.0005, + "loss": 0.9869, + "step": 8680 + }, + { + "epoch": 0.44907239935920623, + "grad_norm": 0.5528420805931091, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 8690 + }, + { + "epoch": 0.44958916851842284, + "grad_norm": 0.5485315918922424, + "learning_rate": 0.0005, + "loss": 0.974, + "step": 8700 + }, + { + "epoch": 0.4501059376776394, + "grad_norm": 0.566852331161499, + "learning_rate": 0.0005, + "loss": 0.9891, + "step": 8710 + }, + { + "epoch": 0.45062270683685596, + "grad_norm": 0.5270015597343445, + "learning_rate": 0.0005, + "loss": 0.979, + "step": 8720 + }, + { + "epoch": 0.45113947599607257, + "grad_norm": 0.5595947504043579, + "learning_rate": 0.0005, + "loss": 0.9632, + "step": 8730 + }, + { + "epoch": 0.4516562451552891, + "grad_norm": 0.5901986360549927, + "learning_rate": 0.0005, + "loss": 0.9952, + "step": 8740 + }, + { + "epoch": 0.4521730143145057, + "grad_norm": 0.5500153303146362, + "learning_rate": 0.0005, + "loss": 0.9826, + "step": 8750 + }, + { + "epoch": 0.4526897834737223, + "grad_norm": 0.6439850926399231, + "learning_rate": 0.0005, + "loss": 0.9783, + "step": 8760 + }, + { + "epoch": 0.45320655263293885, + "grad_norm": 0.6534972190856934, + "learning_rate": 0.0005, + "loss": 0.9905, + "step": 8770 + }, + { + "epoch": 0.45372332179215547, + "grad_norm": 0.6489924192428589, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 8780 + }, + { + "epoch": 0.454240090951372, + "grad_norm": 0.5541792511940002, + "learning_rate": 0.0005, + "loss": 0.989, + "step": 8790 + }, + { + "epoch": 0.4547568601105886, + "grad_norm": 0.5128721594810486, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 8800 + }, + { + "epoch": 0.4552736292698052, + "grad_norm": 0.5839647054672241, + "learning_rate": 0.0005, + "loss": 0.9824, + "step": 8810 + }, + { + "epoch": 0.45579039842902175, + "grad_norm": 0.6303303241729736, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 8820 + }, + { + "epoch": 0.45630716758823836, + "grad_norm": 0.5413320064544678, + "learning_rate": 0.0005, + "loss": 0.973, + "step": 8830 + }, + { + "epoch": 0.4568239367474549, + "grad_norm": 0.5503526926040649, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 8840 + }, + { + "epoch": 0.4573407059066715, + "grad_norm": 0.5337091684341431, + "learning_rate": 0.0005, + "loss": 0.9785, + "step": 8850 + }, + { + "epoch": 0.4578574750658881, + "grad_norm": 0.5215671062469482, + "learning_rate": 0.0005, + "loss": 0.988, + "step": 8860 + }, + { + "epoch": 0.45837424422510464, + "grad_norm": 0.5596259236335754, + "learning_rate": 0.0005, + "loss": 0.9619, + "step": 8870 + }, + { + "epoch": 0.4588910133843212, + "grad_norm": 0.644656777381897, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 8880 + }, + { + "epoch": 0.4594077825435378, + "grad_norm": 0.546576976776123, + "learning_rate": 0.0005, + "loss": 0.9895, + "step": 8890 + }, + { + "epoch": 0.45992455170275437, + "grad_norm": 0.5912691354751587, + "learning_rate": 0.0005, + "loss": 0.9769, + "step": 8900 + }, + { + "epoch": 0.460441320861971, + "grad_norm": 0.5670520663261414, + "learning_rate": 0.0005, + "loss": 0.9841, + "step": 8910 + }, + { + "epoch": 0.46095809002118754, + "grad_norm": 0.5410053730010986, + "learning_rate": 0.0005, + "loss": 0.9842, + "step": 8920 + }, + { + "epoch": 0.4614748591804041, + "grad_norm": 0.5501711964607239, + "learning_rate": 0.0005, + "loss": 0.9833, + "step": 8930 + }, + { + "epoch": 0.4619916283396207, + "grad_norm": 0.5702757835388184, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 8940 + }, + { + "epoch": 0.46250839749883726, + "grad_norm": 0.5536521077156067, + "learning_rate": 0.0005, + "loss": 0.9808, + "step": 8950 + }, + { + "epoch": 0.4630251666580538, + "grad_norm": 0.5470142364501953, + "learning_rate": 0.0005, + "loss": 0.9701, + "step": 8960 + }, + { + "epoch": 0.46354193581727043, + "grad_norm": 0.5773063898086548, + "learning_rate": 0.0005, + "loss": 0.9648, + "step": 8970 + }, + { + "epoch": 0.464058704976487, + "grad_norm": 0.5552759170532227, + "learning_rate": 0.0005, + "loss": 0.9801, + "step": 8980 + }, + { + "epoch": 0.4645754741357036, + "grad_norm": 0.5589256882667542, + "learning_rate": 0.0005, + "loss": 0.9762, + "step": 8990 + }, + { + "epoch": 0.46509224329492016, + "grad_norm": 0.5548306703567505, + "learning_rate": 0.0005, + "loss": 0.9536, + "step": 9000 + }, + { + "epoch": 0.4656090124541367, + "grad_norm": 0.5578811168670654, + "learning_rate": 0.0005, + "loss": 0.9758, + "step": 9010 + }, + { + "epoch": 0.46612578161335333, + "grad_norm": 0.542353630065918, + "learning_rate": 0.0005, + "loss": 0.9754, + "step": 9020 + }, + { + "epoch": 0.4666425507725699, + "grad_norm": 0.5240308046340942, + "learning_rate": 0.0005, + "loss": 0.9527, + "step": 9030 + }, + { + "epoch": 0.4671593199317865, + "grad_norm": 0.5662107467651367, + "learning_rate": 0.0005, + "loss": 0.9812, + "step": 9040 + }, + { + "epoch": 0.46767608909100306, + "grad_norm": 0.5549916625022888, + "learning_rate": 0.0005, + "loss": 0.9881, + "step": 9050 + }, + { + "epoch": 0.4681928582502196, + "grad_norm": 0.5178738832473755, + "learning_rate": 0.0005, + "loss": 0.9641, + "step": 9060 + }, + { + "epoch": 0.4687096274094362, + "grad_norm": 0.52500981092453, + "learning_rate": 0.0005, + "loss": 0.969, + "step": 9070 + }, + { + "epoch": 0.4692263965686528, + "grad_norm": 0.5403527617454529, + "learning_rate": 0.0005, + "loss": 0.9853, + "step": 9080 + }, + { + "epoch": 0.46974316572786934, + "grad_norm": 0.6338274478912354, + "learning_rate": 0.0005, + "loss": 0.9762, + "step": 9090 + }, + { + "epoch": 0.47025993488708595, + "grad_norm": 0.5694402456283569, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 9100 + }, + { + "epoch": 0.4707767040463025, + "grad_norm": 0.5308618545532227, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 9110 + }, + { + "epoch": 0.4712934732055191, + "grad_norm": 0.5705435872077942, + "learning_rate": 0.0005, + "loss": 0.979, + "step": 9120 + }, + { + "epoch": 0.4718102423647357, + "grad_norm": 0.5150364637374878, + "learning_rate": 0.0005, + "loss": 0.9907, + "step": 9130 + }, + { + "epoch": 0.47232701152395223, + "grad_norm": 0.6099853515625, + "learning_rate": 0.0005, + "loss": 0.9834, + "step": 9140 + }, + { + "epoch": 0.47284378068316885, + "grad_norm": 0.5578297972679138, + "learning_rate": 0.0005, + "loss": 0.9758, + "step": 9150 + }, + { + "epoch": 0.4733605498423854, + "grad_norm": 0.5842065811157227, + "learning_rate": 0.0005, + "loss": 0.9831, + "step": 9160 + }, + { + "epoch": 0.47387731900160196, + "grad_norm": 0.54753577709198, + "learning_rate": 0.0005, + "loss": 0.9767, + "step": 9170 + }, + { + "epoch": 0.47439408816081857, + "grad_norm": 0.5472375750541687, + "learning_rate": 0.0005, + "loss": 0.9844, + "step": 9180 + }, + { + "epoch": 0.47491085732003513, + "grad_norm": 0.6289487481117249, + "learning_rate": 0.0005, + "loss": 0.9806, + "step": 9190 + }, + { + "epoch": 0.47542762647925174, + "grad_norm": 0.5702399015426636, + "learning_rate": 0.0005, + "loss": 0.973, + "step": 9200 + }, + { + "epoch": 0.4759443956384683, + "grad_norm": 0.5393164753913879, + "learning_rate": 0.0005, + "loss": 0.9862, + "step": 9210 + }, + { + "epoch": 0.47646116479768486, + "grad_norm": 0.5307340621948242, + "learning_rate": 0.0005, + "loss": 0.9949, + "step": 9220 + }, + { + "epoch": 0.47697793395690147, + "grad_norm": 0.6061729788780212, + "learning_rate": 0.0005, + "loss": 0.9869, + "step": 9230 + }, + { + "epoch": 0.477494703116118, + "grad_norm": 0.5458270311355591, + "learning_rate": 0.0005, + "loss": 0.9782, + "step": 9240 + }, + { + "epoch": 0.4780114722753346, + "grad_norm": 0.5837684869766235, + "learning_rate": 0.0005, + "loss": 0.9794, + "step": 9250 + }, + { + "epoch": 0.4785282414345512, + "grad_norm": 0.557824432849884, + "learning_rate": 0.0005, + "loss": 0.9723, + "step": 9260 + }, + { + "epoch": 0.47904501059376775, + "grad_norm": 0.57038414478302, + "learning_rate": 0.0005, + "loss": 0.9782, + "step": 9270 + }, + { + "epoch": 0.47956177975298436, + "grad_norm": 0.5163660645484924, + "learning_rate": 0.0005, + "loss": 0.9615, + "step": 9280 + }, + { + "epoch": 0.4800785489122009, + "grad_norm": 0.5604984760284424, + "learning_rate": 0.0005, + "loss": 0.9806, + "step": 9290 + }, + { + "epoch": 0.4805953180714175, + "grad_norm": 0.5169503092765808, + "learning_rate": 0.0005, + "loss": 0.9594, + "step": 9300 + }, + { + "epoch": 0.4811120872306341, + "grad_norm": 0.547803521156311, + "learning_rate": 0.0005, + "loss": 0.9795, + "step": 9310 + }, + { + "epoch": 0.48162885638985065, + "grad_norm": 0.5462937951087952, + "learning_rate": 0.0005, + "loss": 0.9756, + "step": 9320 + }, + { + "epoch": 0.48214562554906726, + "grad_norm": 0.5670326352119446, + "learning_rate": 0.0005, + "loss": 0.9726, + "step": 9330 + }, + { + "epoch": 0.4826623947082838, + "grad_norm": 0.5633768439292908, + "learning_rate": 0.0005, + "loss": 0.958, + "step": 9340 + }, + { + "epoch": 0.48317916386750037, + "grad_norm": 0.5781881213188171, + "learning_rate": 0.0005, + "loss": 0.9531, + "step": 9350 + }, + { + "epoch": 0.483695933026717, + "grad_norm": 0.6162354350090027, + "learning_rate": 0.0005, + "loss": 0.9584, + "step": 9360 + }, + { + "epoch": 0.48421270218593354, + "grad_norm": 0.5659033060073853, + "learning_rate": 0.0005, + "loss": 0.9691, + "step": 9370 + }, + { + "epoch": 0.4847294713451501, + "grad_norm": 0.5409724116325378, + "learning_rate": 0.0005, + "loss": 0.9654, + "step": 9380 + }, + { + "epoch": 0.4852462405043667, + "grad_norm": 0.5185449719429016, + "learning_rate": 0.0005, + "loss": 0.9767, + "step": 9390 + }, + { + "epoch": 0.48576300966358327, + "grad_norm": 0.5317234992980957, + "learning_rate": 0.0005, + "loss": 0.9797, + "step": 9400 + }, + { + "epoch": 0.4862797788227999, + "grad_norm": 0.5362582802772522, + "learning_rate": 0.0005, + "loss": 0.9691, + "step": 9410 + }, + { + "epoch": 0.48679654798201644, + "grad_norm": 0.5296323895454407, + "learning_rate": 0.0005, + "loss": 0.9714, + "step": 9420 + }, + { + "epoch": 0.487313317141233, + "grad_norm": 0.5387376546859741, + "learning_rate": 0.0005, + "loss": 0.9857, + "step": 9430 + }, + { + "epoch": 0.4878300863004496, + "grad_norm": 0.5592471957206726, + "learning_rate": 0.0005, + "loss": 0.9687, + "step": 9440 + }, + { + "epoch": 0.48834685545966616, + "grad_norm": 0.5368979573249817, + "learning_rate": 0.0005, + "loss": 0.9624, + "step": 9450 + }, + { + "epoch": 0.4888636246188827, + "grad_norm": 0.559069037437439, + "learning_rate": 0.0005, + "loss": 0.9713, + "step": 9460 + }, + { + "epoch": 0.48938039377809933, + "grad_norm": 0.5417030453681946, + "learning_rate": 0.0005, + "loss": 0.9749, + "step": 9470 + }, + { + "epoch": 0.4898971629373159, + "grad_norm": 0.6302499771118164, + "learning_rate": 0.0005, + "loss": 0.976, + "step": 9480 + }, + { + "epoch": 0.4904139320965325, + "grad_norm": 0.5580116510391235, + "learning_rate": 0.0005, + "loss": 0.9696, + "step": 9490 + }, + { + "epoch": 0.49093070125574906, + "grad_norm": 0.5281049013137817, + "learning_rate": 0.0005, + "loss": 0.9626, + "step": 9500 + }, + { + "epoch": 0.4914474704149656, + "grad_norm": 0.6579439043998718, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 9510 + }, + { + "epoch": 0.4919642395741822, + "grad_norm": 0.6327407956123352, + "learning_rate": 0.0005, + "loss": 0.976, + "step": 9520 + }, + { + "epoch": 0.4924810087333988, + "grad_norm": 0.5917522311210632, + "learning_rate": 0.0005, + "loss": 0.9698, + "step": 9530 + }, + { + "epoch": 0.4929977778926154, + "grad_norm": 0.5556752681732178, + "learning_rate": 0.0005, + "loss": 0.974, + "step": 9540 + }, + { + "epoch": 0.49351454705183195, + "grad_norm": 0.6051674485206604, + "learning_rate": 0.0005, + "loss": 0.9673, + "step": 9550 + }, + { + "epoch": 0.4940313162110485, + "grad_norm": 0.6255143880844116, + "learning_rate": 0.0005, + "loss": 0.9741, + "step": 9560 + }, + { + "epoch": 0.4945480853702651, + "grad_norm": 0.5358819961547852, + "learning_rate": 0.0005, + "loss": 0.965, + "step": 9570 + }, + { + "epoch": 0.4950648545294817, + "grad_norm": 0.5503594279289246, + "learning_rate": 0.0005, + "loss": 0.9668, + "step": 9580 + }, + { + "epoch": 0.49558162368869824, + "grad_norm": 0.510237455368042, + "learning_rate": 0.0005, + "loss": 0.9685, + "step": 9590 + }, + { + "epoch": 0.49609839284791485, + "grad_norm": 0.5995839238166809, + "learning_rate": 0.0005, + "loss": 0.9709, + "step": 9600 + }, + { + "epoch": 0.4966151620071314, + "grad_norm": 0.5354804992675781, + "learning_rate": 0.0005, + "loss": 0.9618, + "step": 9610 + }, + { + "epoch": 0.497131931166348, + "grad_norm": 0.5301372408866882, + "learning_rate": 0.0005, + "loss": 0.9644, + "step": 9620 + }, + { + "epoch": 0.4976487003255646, + "grad_norm": 0.6010123491287231, + "learning_rate": 0.0005, + "loss": 0.9834, + "step": 9630 + }, + { + "epoch": 0.49816546948478113, + "grad_norm": 0.5131679177284241, + "learning_rate": 0.0005, + "loss": 0.9695, + "step": 9640 + }, + { + "epoch": 0.49868223864399774, + "grad_norm": 0.5364587903022766, + "learning_rate": 0.0005, + "loss": 0.9572, + "step": 9650 + }, + { + "epoch": 0.4991990078032143, + "grad_norm": 0.5561274290084839, + "learning_rate": 0.0005, + "loss": 0.9739, + "step": 9660 + }, + { + "epoch": 0.49971577696243086, + "grad_norm": 0.5267083048820496, + "learning_rate": 0.0005, + "loss": 0.9659, + "step": 9670 + }, + { + "epoch": 0.5002325461216475, + "grad_norm": 0.5306525230407715, + "learning_rate": 0.0005, + "loss": 0.9698, + "step": 9680 + }, + { + "epoch": 0.5007493152808641, + "grad_norm": 0.6048880219459534, + "learning_rate": 0.0005, + "loss": 0.9702, + "step": 9690 + }, + { + "epoch": 0.5012660844400806, + "grad_norm": 0.5528176426887512, + "learning_rate": 0.0005, + "loss": 0.981, + "step": 9700 + }, + { + "epoch": 0.5017828535992972, + "grad_norm": 0.5247277021408081, + "learning_rate": 0.0005, + "loss": 0.9587, + "step": 9710 + }, + { + "epoch": 0.5022996227585138, + "grad_norm": 0.5636876225471497, + "learning_rate": 0.0005, + "loss": 0.9627, + "step": 9720 + }, + { + "epoch": 0.5028163919177303, + "grad_norm": 0.5214900970458984, + "learning_rate": 0.0005, + "loss": 0.9644, + "step": 9730 + }, + { + "epoch": 0.5033331610769469, + "grad_norm": 0.5302378535270691, + "learning_rate": 0.0005, + "loss": 0.9612, + "step": 9740 + }, + { + "epoch": 0.5038499302361635, + "grad_norm": 0.5830851197242737, + "learning_rate": 0.0005, + "loss": 0.9563, + "step": 9750 + }, + { + "epoch": 0.50436669939538, + "grad_norm": 0.5303472876548767, + "learning_rate": 0.0005, + "loss": 0.9542, + "step": 9760 + }, + { + "epoch": 0.5048834685545966, + "grad_norm": 0.5632893443107605, + "learning_rate": 0.0005, + "loss": 0.9828, + "step": 9770 + }, + { + "epoch": 0.5054002377138133, + "grad_norm": 0.5968844890594482, + "learning_rate": 0.0005, + "loss": 0.9855, + "step": 9780 + }, + { + "epoch": 0.5059170068730299, + "grad_norm": 0.580721378326416, + "learning_rate": 0.0005, + "loss": 0.9903, + "step": 9790 + }, + { + "epoch": 0.5064337760322464, + "grad_norm": 0.5187913179397583, + "learning_rate": 0.0005, + "loss": 0.9512, + "step": 9800 + }, + { + "epoch": 0.506950545191463, + "grad_norm": 0.5946047902107239, + "learning_rate": 0.0005, + "loss": 0.9661, + "step": 9810 + }, + { + "epoch": 0.5074673143506796, + "grad_norm": 0.5428043603897095, + "learning_rate": 0.0005, + "loss": 0.9669, + "step": 9820 + }, + { + "epoch": 0.5079840835098961, + "grad_norm": 0.562601625919342, + "learning_rate": 0.0005, + "loss": 0.958, + "step": 9830 + }, + { + "epoch": 0.5085008526691127, + "grad_norm": 0.5812455415725708, + "learning_rate": 0.0005, + "loss": 0.969, + "step": 9840 + }, + { + "epoch": 0.5090176218283293, + "grad_norm": 0.6318747997283936, + "learning_rate": 0.0005, + "loss": 0.9512, + "step": 9850 + }, + { + "epoch": 0.5095343909875458, + "grad_norm": 0.6214849352836609, + "learning_rate": 0.0005, + "loss": 0.9727, + "step": 9860 + }, + { + "epoch": 0.5100511601467624, + "grad_norm": 0.5631205439567566, + "learning_rate": 0.0005, + "loss": 0.9564, + "step": 9870 + }, + { + "epoch": 0.510567929305979, + "grad_norm": 0.626625657081604, + "learning_rate": 0.0005, + "loss": 0.9597, + "step": 9880 + }, + { + "epoch": 0.5110846984651956, + "grad_norm": 0.4959418475627899, + "learning_rate": 0.0005, + "loss": 0.9591, + "step": 9890 + }, + { + "epoch": 0.5116014676244122, + "grad_norm": 0.5196536779403687, + "learning_rate": 0.0005, + "loss": 0.9771, + "step": 9900 + }, + { + "epoch": 0.5121182367836288, + "grad_norm": 0.6234534382820129, + "learning_rate": 0.0005, + "loss": 0.9609, + "step": 9910 + }, + { + "epoch": 0.5126350059428453, + "grad_norm": 0.5823763012886047, + "learning_rate": 0.0005, + "loss": 0.9757, + "step": 9920 + }, + { + "epoch": 0.5131517751020619, + "grad_norm": 0.5576559901237488, + "learning_rate": 0.0005, + "loss": 0.9502, + "step": 9930 + }, + { + "epoch": 0.5136685442612785, + "grad_norm": 0.5374221801757812, + "learning_rate": 0.0005, + "loss": 0.9513, + "step": 9940 + }, + { + "epoch": 0.5141853134204951, + "grad_norm": 0.5272248387336731, + "learning_rate": 0.0005, + "loss": 0.9586, + "step": 9950 + }, + { + "epoch": 0.5147020825797116, + "grad_norm": 0.5568712949752808, + "learning_rate": 0.0005, + "loss": 0.957, + "step": 9960 + }, + { + "epoch": 0.5152188517389282, + "grad_norm": 0.5274987816810608, + "learning_rate": 0.0005, + "loss": 0.9432, + "step": 9970 + }, + { + "epoch": 0.5157356208981448, + "grad_norm": 0.5364307165145874, + "learning_rate": 0.0005, + "loss": 0.9548, + "step": 9980 + }, + { + "epoch": 0.5162523900573613, + "grad_norm": 0.5436477065086365, + "learning_rate": 0.0005, + "loss": 0.9572, + "step": 9990 + }, + { + "epoch": 0.516769159216578, + "grad_norm": 0.5213954448699951, + "learning_rate": 0.0005, + "loss": 0.9589, + "step": 10000 + }, + { + "epoch": 0.5172859283757946, + "grad_norm": 0.5076503157615662, + "learning_rate": 0.0005, + "loss": 0.9498, + "step": 10010 + }, + { + "epoch": 0.5178026975350111, + "grad_norm": 0.5266632437705994, + "learning_rate": 0.0005, + "loss": 0.9641, + "step": 10020 + }, + { + "epoch": 0.5183194666942277, + "grad_norm": 0.5237132906913757, + "learning_rate": 0.0005, + "loss": 0.9717, + "step": 10030 + }, + { + "epoch": 0.5188362358534443, + "grad_norm": 0.5496323704719543, + "learning_rate": 0.0005, + "loss": 0.952, + "step": 10040 + }, + { + "epoch": 0.5193530050126608, + "grad_norm": 0.5751678347587585, + "learning_rate": 0.0005, + "loss": 0.9508, + "step": 10050 + }, + { + "epoch": 0.5198697741718774, + "grad_norm": 0.5333780646324158, + "learning_rate": 0.0005, + "loss": 0.9442, + "step": 10060 + }, + { + "epoch": 0.520386543331094, + "grad_norm": 0.5529361367225647, + "learning_rate": 0.0005, + "loss": 0.9662, + "step": 10070 + }, + { + "epoch": 0.5209033124903106, + "grad_norm": 0.5695346593856812, + "learning_rate": 0.0005, + "loss": 0.9648, + "step": 10080 + }, + { + "epoch": 0.5214200816495271, + "grad_norm": 0.528101921081543, + "learning_rate": 0.0005, + "loss": 0.9581, + "step": 10090 + }, + { + "epoch": 0.5219368508087437, + "grad_norm": 0.5323454141616821, + "learning_rate": 0.0005, + "loss": 0.9842, + "step": 10100 + }, + { + "epoch": 0.5224536199679604, + "grad_norm": 0.5791360139846802, + "learning_rate": 0.0005, + "loss": 0.9755, + "step": 10110 + }, + { + "epoch": 0.5229703891271769, + "grad_norm": 0.5297543406486511, + "learning_rate": 0.0005, + "loss": 0.9706, + "step": 10120 + }, + { + "epoch": 0.5234871582863935, + "grad_norm": 0.5344191789627075, + "learning_rate": 0.0005, + "loss": 0.9557, + "step": 10130 + }, + { + "epoch": 0.5240039274456101, + "grad_norm": 0.5307314395904541, + "learning_rate": 0.0005, + "loss": 0.9561, + "step": 10140 + }, + { + "epoch": 0.5245206966048266, + "grad_norm": 0.5625677108764648, + "learning_rate": 0.0005, + "loss": 0.9639, + "step": 10150 + }, + { + "epoch": 0.5250374657640432, + "grad_norm": 0.5287933945655823, + "learning_rate": 0.0005, + "loss": 0.9458, + "step": 10160 + }, + { + "epoch": 0.5255542349232598, + "grad_norm": 0.4987037777900696, + "learning_rate": 0.0005, + "loss": 0.9542, + "step": 10170 + }, + { + "epoch": 0.5260710040824763, + "grad_norm": 0.5192455053329468, + "learning_rate": 0.0005, + "loss": 0.9534, + "step": 10180 + }, + { + "epoch": 0.5265877732416929, + "grad_norm": 0.5038531422615051, + "learning_rate": 0.0005, + "loss": 0.9534, + "step": 10190 + }, + { + "epoch": 0.5271045424009095, + "grad_norm": 0.5356433391571045, + "learning_rate": 0.0005, + "loss": 0.9657, + "step": 10200 + }, + { + "epoch": 0.527621311560126, + "grad_norm": 0.5290383696556091, + "learning_rate": 0.0005, + "loss": 0.9405, + "step": 10210 + }, + { + "epoch": 0.5281380807193427, + "grad_norm": 0.5376208424568176, + "learning_rate": 0.0005, + "loss": 0.9581, + "step": 10220 + }, + { + "epoch": 0.5286548498785593, + "grad_norm": 0.5011909604072571, + "learning_rate": 0.0005, + "loss": 0.9383, + "step": 10230 + }, + { + "epoch": 0.5291716190377759, + "grad_norm": 0.503073513507843, + "learning_rate": 0.0005, + "loss": 0.9525, + "step": 10240 + }, + { + "epoch": 0.5296883881969924, + "grad_norm": 0.5255160927772522, + "learning_rate": 0.0005, + "loss": 0.9525, + "step": 10250 + }, + { + "epoch": 0.530205157356209, + "grad_norm": 0.5147885084152222, + "learning_rate": 0.0005, + "loss": 0.9465, + "step": 10260 + }, + { + "epoch": 0.5307219265154256, + "grad_norm": 0.5343205332756042, + "learning_rate": 0.0005, + "loss": 0.9441, + "step": 10270 + }, + { + "epoch": 0.5312386956746421, + "grad_norm": 0.5480389595031738, + "learning_rate": 0.0005, + "loss": 0.9551, + "step": 10280 + }, + { + "epoch": 0.5317554648338587, + "grad_norm": 0.5425328612327576, + "learning_rate": 0.0005, + "loss": 0.951, + "step": 10290 + }, + { + "epoch": 0.5322722339930753, + "grad_norm": 0.6197424530982971, + "learning_rate": 0.0005, + "loss": 0.9467, + "step": 10300 + }, + { + "epoch": 0.5327890031522918, + "grad_norm": 0.5289689898490906, + "learning_rate": 0.0005, + "loss": 0.9615, + "step": 10310 + }, + { + "epoch": 0.5333057723115084, + "grad_norm": 0.5715579986572266, + "learning_rate": 0.0005, + "loss": 0.9572, + "step": 10320 + }, + { + "epoch": 0.533822541470725, + "grad_norm": 0.5315567851066589, + "learning_rate": 0.0005, + "loss": 0.961, + "step": 10330 + }, + { + "epoch": 0.5343393106299416, + "grad_norm": 0.5441263318061829, + "learning_rate": 0.0005, + "loss": 0.9581, + "step": 10340 + }, + { + "epoch": 0.5348560797891582, + "grad_norm": 0.5785178542137146, + "learning_rate": 0.0005, + "loss": 0.9479, + "step": 10350 + }, + { + "epoch": 0.5353728489483748, + "grad_norm": 0.5260955691337585, + "learning_rate": 0.0005, + "loss": 0.9729, + "step": 10360 + }, + { + "epoch": 0.5358896181075914, + "grad_norm": 0.5125389099121094, + "learning_rate": 0.0005, + "loss": 0.9568, + "step": 10370 + }, + { + "epoch": 0.5364063872668079, + "grad_norm": 0.5203437209129333, + "learning_rate": 0.0005, + "loss": 0.9603, + "step": 10380 + }, + { + "epoch": 0.5369231564260245, + "grad_norm": 0.5585212707519531, + "learning_rate": 0.0005, + "loss": 0.9599, + "step": 10390 + }, + { + "epoch": 0.5374399255852411, + "grad_norm": 0.48404642939567566, + "learning_rate": 0.0005, + "loss": 0.9494, + "step": 10400 + }, + { + "epoch": 0.5379566947444576, + "grad_norm": 0.65147465467453, + "learning_rate": 0.0005, + "loss": 0.9469, + "step": 10410 + }, + { + "epoch": 0.5384734639036742, + "grad_norm": 0.5233981013298035, + "learning_rate": 0.0005, + "loss": 0.9564, + "step": 10420 + }, + { + "epoch": 0.5389902330628908, + "grad_norm": 0.5470656156539917, + "learning_rate": 0.0005, + "loss": 0.9377, + "step": 10430 + }, + { + "epoch": 0.5395070022221073, + "grad_norm": 0.522283673286438, + "learning_rate": 0.0005, + "loss": 0.9431, + "step": 10440 + }, + { + "epoch": 0.540023771381324, + "grad_norm": 0.5491459965705872, + "learning_rate": 0.0005, + "loss": 0.9565, + "step": 10450 + }, + { + "epoch": 0.5405405405405406, + "grad_norm": 0.5251693725585938, + "learning_rate": 0.0005, + "loss": 0.9485, + "step": 10460 + }, + { + "epoch": 0.5410573096997571, + "grad_norm": 0.5080156922340393, + "learning_rate": 0.0005, + "loss": 0.9577, + "step": 10470 + }, + { + "epoch": 0.5415740788589737, + "grad_norm": 0.5703207850456238, + "learning_rate": 0.0005, + "loss": 0.969, + "step": 10480 + }, + { + "epoch": 0.5420908480181903, + "grad_norm": 0.5768096446990967, + "learning_rate": 0.0005, + "loss": 0.9481, + "step": 10490 + }, + { + "epoch": 0.5426076171774068, + "grad_norm": 0.5119413733482361, + "learning_rate": 0.0005, + "loss": 0.9491, + "step": 10500 + }, + { + "epoch": 0.5431243863366234, + "grad_norm": 0.5329270958900452, + "learning_rate": 0.0005, + "loss": 0.9625, + "step": 10510 + }, + { + "epoch": 0.54364115549584, + "grad_norm": 0.528266966342926, + "learning_rate": 0.0005, + "loss": 0.9477, + "step": 10520 + }, + { + "epoch": 0.5441579246550566, + "grad_norm": 0.5584282279014587, + "learning_rate": 0.0005, + "loss": 0.9555, + "step": 10530 + }, + { + "epoch": 0.5446746938142731, + "grad_norm": 0.5280376672744751, + "learning_rate": 0.0005, + "loss": 0.9404, + "step": 10540 + }, + { + "epoch": 0.5451914629734897, + "grad_norm": 0.512711763381958, + "learning_rate": 0.0005, + "loss": 0.9551, + "step": 10550 + }, + { + "epoch": 0.5457082321327064, + "grad_norm": 0.5412839651107788, + "learning_rate": 0.0005, + "loss": 0.946, + "step": 10560 + }, + { + "epoch": 0.5462250012919229, + "grad_norm": 0.5105991363525391, + "learning_rate": 0.0005, + "loss": 0.9507, + "step": 10570 + }, + { + "epoch": 0.5467417704511395, + "grad_norm": 0.5690359473228455, + "learning_rate": 0.0005, + "loss": 0.9532, + "step": 10580 + }, + { + "epoch": 0.5472585396103561, + "grad_norm": 0.5333488583564758, + "learning_rate": 0.0005, + "loss": 0.9634, + "step": 10590 + }, + { + "epoch": 0.5477753087695726, + "grad_norm": 0.5984283089637756, + "learning_rate": 0.0005, + "loss": 0.9624, + "step": 10600 + }, + { + "epoch": 0.5482920779287892, + "grad_norm": 0.5076044201850891, + "learning_rate": 0.0005, + "loss": 0.9426, + "step": 10610 + }, + { + "epoch": 0.5488088470880058, + "grad_norm": 0.5287521481513977, + "learning_rate": 0.0005, + "loss": 0.9571, + "step": 10620 + }, + { + "epoch": 0.5493256162472223, + "grad_norm": 0.5479470491409302, + "learning_rate": 0.0005, + "loss": 0.9424, + "step": 10630 + }, + { + "epoch": 0.5498423854064389, + "grad_norm": 0.5120390057563782, + "learning_rate": 0.0005, + "loss": 0.9471, + "step": 10640 + }, + { + "epoch": 0.5503591545656555, + "grad_norm": 0.5130133032798767, + "learning_rate": 0.0005, + "loss": 0.9605, + "step": 10650 + }, + { + "epoch": 0.5508759237248722, + "grad_norm": 0.5507628917694092, + "learning_rate": 0.0005, + "loss": 0.9545, + "step": 10660 + }, + { + "epoch": 0.5513926928840887, + "grad_norm": 0.4929947555065155, + "learning_rate": 0.0005, + "loss": 0.9415, + "step": 10670 + }, + { + "epoch": 0.5519094620433053, + "grad_norm": 0.5119226574897766, + "learning_rate": 0.0005, + "loss": 0.9564, + "step": 10680 + }, + { + "epoch": 0.5524262312025219, + "grad_norm": 0.5126231908798218, + "learning_rate": 0.0005, + "loss": 0.9467, + "step": 10690 + }, + { + "epoch": 0.5529430003617384, + "grad_norm": 0.5123251676559448, + "learning_rate": 0.0005, + "loss": 0.9412, + "step": 10700 + }, + { + "epoch": 0.553459769520955, + "grad_norm": 0.5106756687164307, + "learning_rate": 0.0005, + "loss": 0.9583, + "step": 10710 + }, + { + "epoch": 0.5539765386801716, + "grad_norm": 0.520325243473053, + "learning_rate": 0.0005, + "loss": 0.9593, + "step": 10720 + }, + { + "epoch": 0.5544933078393881, + "grad_norm": 0.6005384922027588, + "learning_rate": 0.0005, + "loss": 0.9617, + "step": 10730 + }, + { + "epoch": 0.5550100769986047, + "grad_norm": 0.49362891912460327, + "learning_rate": 0.0005, + "loss": 0.9476, + "step": 10740 + }, + { + "epoch": 0.5555268461578213, + "grad_norm": 0.5586000084877014, + "learning_rate": 0.0005, + "loss": 0.9594, + "step": 10750 + }, + { + "epoch": 0.5560436153170378, + "grad_norm": 0.5586140155792236, + "learning_rate": 0.0005, + "loss": 0.9343, + "step": 10760 + }, + { + "epoch": 0.5565603844762544, + "grad_norm": 0.5251288414001465, + "learning_rate": 0.0005, + "loss": 0.945, + "step": 10770 + }, + { + "epoch": 0.5570771536354711, + "grad_norm": 0.5328302383422852, + "learning_rate": 0.0005, + "loss": 0.9479, + "step": 10780 + }, + { + "epoch": 0.5575939227946876, + "grad_norm": 0.49472010135650635, + "learning_rate": 0.0005, + "loss": 0.9668, + "step": 10790 + }, + { + "epoch": 0.5581106919539042, + "grad_norm": 0.5159969925880432, + "learning_rate": 0.0005, + "loss": 0.9559, + "step": 10800 + }, + { + "epoch": 0.5586274611131208, + "grad_norm": 0.5159046649932861, + "learning_rate": 0.0005, + "loss": 0.9397, + "step": 10810 + }, + { + "epoch": 0.5591442302723374, + "grad_norm": 0.5191036462783813, + "learning_rate": 0.0005, + "loss": 0.9466, + "step": 10820 + }, + { + "epoch": 0.5596609994315539, + "grad_norm": 0.5178474187850952, + "learning_rate": 0.0005, + "loss": 0.9439, + "step": 10830 + }, + { + "epoch": 0.5601777685907705, + "grad_norm": 0.5447880625724792, + "learning_rate": 0.0005, + "loss": 0.9576, + "step": 10840 + }, + { + "epoch": 0.5606945377499871, + "grad_norm": 0.5056577920913696, + "learning_rate": 0.0005, + "loss": 0.9514, + "step": 10850 + }, + { + "epoch": 0.5612113069092036, + "grad_norm": 0.5639669299125671, + "learning_rate": 0.0005, + "loss": 0.9482, + "step": 10860 + }, + { + "epoch": 0.5617280760684202, + "grad_norm": 0.570584774017334, + "learning_rate": 0.0005, + "loss": 0.9602, + "step": 10870 + }, + { + "epoch": 0.5622448452276368, + "grad_norm": 0.5161934494972229, + "learning_rate": 0.0005, + "loss": 0.9366, + "step": 10880 + }, + { + "epoch": 0.5627616143868533, + "grad_norm": 0.5521616339683533, + "learning_rate": 0.0005, + "loss": 0.9535, + "step": 10890 + }, + { + "epoch": 0.56327838354607, + "grad_norm": 0.5411272644996643, + "learning_rate": 0.0005, + "loss": 0.9324, + "step": 10900 + }, + { + "epoch": 0.5637951527052866, + "grad_norm": 0.5098778605461121, + "learning_rate": 0.0005, + "loss": 0.9408, + "step": 10910 + }, + { + "epoch": 0.5643119218645031, + "grad_norm": 0.4730329215526581, + "learning_rate": 0.0005, + "loss": 0.946, + "step": 10920 + }, + { + "epoch": 0.5648286910237197, + "grad_norm": 0.5085341334342957, + "learning_rate": 0.0005, + "loss": 0.9469, + "step": 10930 + }, + { + "epoch": 0.5653454601829363, + "grad_norm": 0.5201531052589417, + "learning_rate": 0.0005, + "loss": 0.9583, + "step": 10940 + }, + { + "epoch": 0.5658622293421529, + "grad_norm": 0.4958653748035431, + "learning_rate": 0.0005, + "loss": 0.9542, + "step": 10950 + }, + { + "epoch": 0.5663789985013694, + "grad_norm": 0.5279732942581177, + "learning_rate": 0.0005, + "loss": 0.9503, + "step": 10960 + }, + { + "epoch": 0.566895767660586, + "grad_norm": 0.5014291405677795, + "learning_rate": 0.0005, + "loss": 0.9562, + "step": 10970 + }, + { + "epoch": 0.5674125368198026, + "grad_norm": 0.5004532337188721, + "learning_rate": 0.0005, + "loss": 0.937, + "step": 10980 + }, + { + "epoch": 0.5679293059790191, + "grad_norm": 0.5091339349746704, + "learning_rate": 0.0005, + "loss": 0.9442, + "step": 10990 + }, + { + "epoch": 0.5684460751382358, + "grad_norm": 0.5625014901161194, + "learning_rate": 0.0005, + "loss": 0.9426, + "step": 11000 + }, + { + "epoch": 0.5689628442974524, + "grad_norm": 0.5026536583900452, + "learning_rate": 0.0005, + "loss": 0.952, + "step": 11010 + }, + { + "epoch": 0.5694796134566689, + "grad_norm": 0.4980801045894623, + "learning_rate": 0.0005, + "loss": 0.9347, + "step": 11020 + }, + { + "epoch": 0.5699963826158855, + "grad_norm": 0.4974989593029022, + "learning_rate": 0.0005, + "loss": 0.9442, + "step": 11030 + }, + { + "epoch": 0.5705131517751021, + "grad_norm": 0.5242035388946533, + "learning_rate": 0.0005, + "loss": 0.9464, + "step": 11040 + }, + { + "epoch": 0.5710299209343186, + "grad_norm": 0.5066283941268921, + "learning_rate": 0.0005, + "loss": 0.9276, + "step": 11050 + }, + { + "epoch": 0.5715466900935352, + "grad_norm": 0.508834958076477, + "learning_rate": 0.0005, + "loss": 0.9402, + "step": 11060 + }, + { + "epoch": 0.5720634592527518, + "grad_norm": 0.5046612024307251, + "learning_rate": 0.0005, + "loss": 0.9487, + "step": 11070 + }, + { + "epoch": 0.5725802284119684, + "grad_norm": 0.5268915891647339, + "learning_rate": 0.0005, + "loss": 0.9415, + "step": 11080 + }, + { + "epoch": 0.5730969975711849, + "grad_norm": 0.5040035247802734, + "learning_rate": 0.0005, + "loss": 0.9326, + "step": 11090 + }, + { + "epoch": 0.5736137667304015, + "grad_norm": 0.500636100769043, + "learning_rate": 0.0005, + "loss": 0.9422, + "step": 11100 + }, + { + "epoch": 0.5741305358896182, + "grad_norm": 0.5215865969657898, + "learning_rate": 0.0005, + "loss": 0.9414, + "step": 11110 + }, + { + "epoch": 0.5746473050488347, + "grad_norm": 0.5058110356330872, + "learning_rate": 0.0005, + "loss": 0.9522, + "step": 11120 + }, + { + "epoch": 0.5751640742080513, + "grad_norm": 0.5117678046226501, + "learning_rate": 0.0005, + "loss": 0.9518, + "step": 11130 + }, + { + "epoch": 0.5756808433672679, + "grad_norm": 0.5039757490158081, + "learning_rate": 0.0005, + "loss": 0.9418, + "step": 11140 + }, + { + "epoch": 0.5761976125264844, + "grad_norm": 0.5518759489059448, + "learning_rate": 0.0005, + "loss": 0.9407, + "step": 11150 + }, + { + "epoch": 0.576714381685701, + "grad_norm": 0.5106251239776611, + "learning_rate": 0.0005, + "loss": 0.9367, + "step": 11160 + }, + { + "epoch": 0.5772311508449176, + "grad_norm": 0.5682827830314636, + "learning_rate": 0.0005, + "loss": 0.945, + "step": 11170 + }, + { + "epoch": 0.5777479200041341, + "grad_norm": 0.521513044834137, + "learning_rate": 0.0005, + "loss": 0.9453, + "step": 11180 + }, + { + "epoch": 0.5782646891633507, + "grad_norm": 0.5230028629302979, + "learning_rate": 0.0005, + "loss": 0.9544, + "step": 11190 + }, + { + "epoch": 0.5787814583225673, + "grad_norm": 0.5285042524337769, + "learning_rate": 0.0005, + "loss": 0.9459, + "step": 11200 + }, + { + "epoch": 0.5792982274817838, + "grad_norm": 0.5230273604393005, + "learning_rate": 0.0005, + "loss": 0.9354, + "step": 11210 + }, + { + "epoch": 0.5798149966410004, + "grad_norm": 0.5298386216163635, + "learning_rate": 0.0005, + "loss": 0.9578, + "step": 11220 + }, + { + "epoch": 0.5803317658002171, + "grad_norm": 0.5199642181396484, + "learning_rate": 0.0005, + "loss": 0.9559, + "step": 11230 + }, + { + "epoch": 0.5808485349594337, + "grad_norm": 0.5283148884773254, + "learning_rate": 0.0005, + "loss": 0.9315, + "step": 11240 + }, + { + "epoch": 0.5813653041186502, + "grad_norm": 0.5081456303596497, + "learning_rate": 0.0005, + "loss": 0.936, + "step": 11250 + }, + { + "epoch": 0.5818820732778668, + "grad_norm": 0.4844646751880646, + "learning_rate": 0.0005, + "loss": 0.9478, + "step": 11260 + }, + { + "epoch": 0.5823988424370834, + "grad_norm": 0.5176190733909607, + "learning_rate": 0.0005, + "loss": 0.918, + "step": 11270 + }, + { + "epoch": 0.5829156115962999, + "grad_norm": 0.5267295241355896, + "learning_rate": 0.0005, + "loss": 0.9462, + "step": 11280 + }, + { + "epoch": 0.5834323807555165, + "grad_norm": 0.5780160427093506, + "learning_rate": 0.0005, + "loss": 0.9302, + "step": 11290 + }, + { + "epoch": 0.5839491499147331, + "grad_norm": 0.47616294026374817, + "learning_rate": 0.0005, + "loss": 0.945, + "step": 11300 + }, + { + "epoch": 0.5844659190739496, + "grad_norm": 0.556125283241272, + "learning_rate": 0.0005, + "loss": 0.9306, + "step": 11310 + }, + { + "epoch": 0.5849826882331662, + "grad_norm": 0.5071564316749573, + "learning_rate": 0.0005, + "loss": 0.9611, + "step": 11320 + }, + { + "epoch": 0.5854994573923828, + "grad_norm": 0.5186158418655396, + "learning_rate": 0.0005, + "loss": 0.9311, + "step": 11330 + }, + { + "epoch": 0.5860162265515994, + "grad_norm": 0.48720046877861023, + "learning_rate": 0.0005, + "loss": 0.9609, + "step": 11340 + }, + { + "epoch": 0.586532995710816, + "grad_norm": 0.49717170000076294, + "learning_rate": 0.0005, + "loss": 0.957, + "step": 11350 + }, + { + "epoch": 0.5870497648700326, + "grad_norm": 0.534752368927002, + "learning_rate": 0.0005, + "loss": 0.94, + "step": 11360 + }, + { + "epoch": 0.5875665340292492, + "grad_norm": 0.523997962474823, + "learning_rate": 0.0005, + "loss": 0.9373, + "step": 11370 + }, + { + "epoch": 0.5880833031884657, + "grad_norm": 0.49437177181243896, + "learning_rate": 0.0005, + "loss": 0.9327, + "step": 11380 + }, + { + "epoch": 0.5886000723476823, + "grad_norm": 0.4986345171928406, + "learning_rate": 0.0005, + "loss": 0.9353, + "step": 11390 + }, + { + "epoch": 0.5891168415068989, + "grad_norm": 0.49254122376441956, + "learning_rate": 0.0005, + "loss": 0.9451, + "step": 11400 + }, + { + "epoch": 0.5896336106661154, + "grad_norm": 0.5066004991531372, + "learning_rate": 0.0005, + "loss": 0.9307, + "step": 11410 + }, + { + "epoch": 0.590150379825332, + "grad_norm": 0.4954734444618225, + "learning_rate": 0.0005, + "loss": 0.9345, + "step": 11420 + }, + { + "epoch": 0.5906671489845486, + "grad_norm": 0.4814952313899994, + "learning_rate": 0.0005, + "loss": 0.9383, + "step": 11430 + }, + { + "epoch": 0.5911839181437651, + "grad_norm": 0.48946642875671387, + "learning_rate": 0.0005, + "loss": 0.9314, + "step": 11440 + }, + { + "epoch": 0.5917006873029818, + "grad_norm": 0.5009201765060425, + "learning_rate": 0.0005, + "loss": 0.9532, + "step": 11450 + }, + { + "epoch": 0.5922174564621984, + "grad_norm": 0.5228848457336426, + "learning_rate": 0.0005, + "loss": 0.9346, + "step": 11460 + }, + { + "epoch": 0.5927342256214149, + "grad_norm": 0.5121431350708008, + "learning_rate": 0.0005, + "loss": 0.9367, + "step": 11470 + }, + { + "epoch": 0.5932509947806315, + "grad_norm": 0.49431100487709045, + "learning_rate": 0.0005, + "loss": 0.9261, + "step": 11480 + }, + { + "epoch": 0.5937677639398481, + "grad_norm": 0.516291081905365, + "learning_rate": 0.0005, + "loss": 0.9452, + "step": 11490 + }, + { + "epoch": 0.5942845330990646, + "grad_norm": 0.5128830671310425, + "learning_rate": 0.0005, + "loss": 0.9446, + "step": 11500 + }, + { + "epoch": 0.5948013022582812, + "grad_norm": 0.5089874267578125, + "learning_rate": 0.0005, + "loss": 0.9321, + "step": 11510 + }, + { + "epoch": 0.5953180714174978, + "grad_norm": 0.5457943677902222, + "learning_rate": 0.0005, + "loss": 0.9349, + "step": 11520 + }, + { + "epoch": 0.5958348405767144, + "grad_norm": 0.5342771410942078, + "learning_rate": 0.0005, + "loss": 0.9326, + "step": 11530 + }, + { + "epoch": 0.5963516097359309, + "grad_norm": 0.511667788028717, + "learning_rate": 0.0005, + "loss": 0.933, + "step": 11540 + }, + { + "epoch": 0.5968683788951475, + "grad_norm": 0.5304045677185059, + "learning_rate": 0.0005, + "loss": 0.9278, + "step": 11550 + }, + { + "epoch": 0.5973851480543642, + "grad_norm": 0.5285548567771912, + "learning_rate": 0.0005, + "loss": 0.9451, + "step": 11560 + }, + { + "epoch": 0.5979019172135807, + "grad_norm": 0.5200523734092712, + "learning_rate": 0.0005, + "loss": 0.9256, + "step": 11570 + }, + { + "epoch": 0.5984186863727973, + "grad_norm": 0.49133771657943726, + "learning_rate": 0.0005, + "loss": 0.9107, + "step": 11580 + }, + { + "epoch": 0.5989354555320139, + "grad_norm": 0.5477631092071533, + "learning_rate": 0.0005, + "loss": 0.9273, + "step": 11590 + }, + { + "epoch": 0.5994522246912304, + "grad_norm": 0.5735862255096436, + "learning_rate": 0.0005, + "loss": 0.9358, + "step": 11600 + }, + { + "epoch": 0.599968993850447, + "grad_norm": 0.48721542954444885, + "learning_rate": 0.0005, + "loss": 0.9273, + "step": 11610 + }, + { + "epoch": 0.6004857630096636, + "grad_norm": 0.5106229186058044, + "learning_rate": 0.0005, + "loss": 0.9283, + "step": 11620 + }, + { + "epoch": 0.6010025321688801, + "grad_norm": 0.4914691746234894, + "learning_rate": 0.0005, + "loss": 0.9303, + "step": 11630 + }, + { + "epoch": 0.6015193013280967, + "grad_norm": 0.5924090147018433, + "learning_rate": 0.0005, + "loss": 0.9199, + "step": 11640 + }, + { + "epoch": 0.6020360704873133, + "grad_norm": 0.4983723759651184, + "learning_rate": 0.0005, + "loss": 0.9384, + "step": 11650 + }, + { + "epoch": 0.60255283964653, + "grad_norm": 0.52519690990448, + "learning_rate": 0.0005, + "loss": 0.934, + "step": 11660 + }, + { + "epoch": 0.6030696088057464, + "grad_norm": 0.5365654826164246, + "learning_rate": 0.0005, + "loss": 0.9342, + "step": 11670 + }, + { + "epoch": 0.6035863779649631, + "grad_norm": 0.4914066195487976, + "learning_rate": 0.0005, + "loss": 0.9453, + "step": 11680 + }, + { + "epoch": 0.6041031471241797, + "grad_norm": 0.4888913929462433, + "learning_rate": 0.0005, + "loss": 0.9322, + "step": 11690 + }, + { + "epoch": 0.6046199162833962, + "grad_norm": 0.4911440908908844, + "learning_rate": 0.0005, + "loss": 0.9327, + "step": 11700 + }, + { + "epoch": 0.6051366854426128, + "grad_norm": 0.5005333423614502, + "learning_rate": 0.0005, + "loss": 0.9467, + "step": 11710 + }, + { + "epoch": 0.6056534546018294, + "grad_norm": 0.5367693901062012, + "learning_rate": 0.0005, + "loss": 0.9384, + "step": 11720 + }, + { + "epoch": 0.6061702237610459, + "grad_norm": 0.48554107546806335, + "learning_rate": 0.0005, + "loss": 0.9446, + "step": 11730 + }, + { + "epoch": 0.6066869929202625, + "grad_norm": 0.514530599117279, + "learning_rate": 0.0005, + "loss": 0.914, + "step": 11740 + }, + { + "epoch": 0.6072037620794791, + "grad_norm": 0.5004679560661316, + "learning_rate": 0.0005, + "loss": 0.9342, + "step": 11750 + }, + { + "epoch": 0.6077205312386956, + "grad_norm": 0.516576886177063, + "learning_rate": 0.0005, + "loss": 0.9325, + "step": 11760 + }, + { + "epoch": 0.6082373003979122, + "grad_norm": 0.5298195481300354, + "learning_rate": 0.0005, + "loss": 0.9324, + "step": 11770 + }, + { + "epoch": 0.6087540695571289, + "grad_norm": 0.4899151921272278, + "learning_rate": 0.0005, + "loss": 0.9161, + "step": 11780 + }, + { + "epoch": 0.6092708387163454, + "grad_norm": 0.5261816382408142, + "learning_rate": 0.0005, + "loss": 0.9393, + "step": 11790 + }, + { + "epoch": 0.609787607875562, + "grad_norm": 0.5143525004386902, + "learning_rate": 0.0005, + "loss": 0.9393, + "step": 11800 + }, + { + "epoch": 0.6103043770347786, + "grad_norm": 0.521551251411438, + "learning_rate": 0.0005, + "loss": 0.9291, + "step": 11810 + }, + { + "epoch": 0.6108211461939952, + "grad_norm": 0.4708675444126129, + "learning_rate": 0.0005, + "loss": 0.9462, + "step": 11820 + }, + { + "epoch": 0.6113379153532117, + "grad_norm": 0.47985512018203735, + "learning_rate": 0.0005, + "loss": 0.9355, + "step": 11830 + }, + { + "epoch": 0.6118546845124283, + "grad_norm": 0.5093055367469788, + "learning_rate": 0.0005, + "loss": 0.9301, + "step": 11840 + }, + { + "epoch": 0.6123714536716449, + "grad_norm": 0.5011575222015381, + "learning_rate": 0.0005, + "loss": 0.9382, + "step": 11850 + }, + { + "epoch": 0.6128882228308614, + "grad_norm": 0.5071706771850586, + "learning_rate": 0.0005, + "loss": 0.9425, + "step": 11860 + }, + { + "epoch": 0.613404991990078, + "grad_norm": 0.49520188570022583, + "learning_rate": 0.0005, + "loss": 0.9402, + "step": 11870 + }, + { + "epoch": 0.6139217611492946, + "grad_norm": 0.46812620759010315, + "learning_rate": 0.0005, + "loss": 0.9325, + "step": 11880 + }, + { + "epoch": 0.6144385303085111, + "grad_norm": 0.524341344833374, + "learning_rate": 0.0005, + "loss": 0.9267, + "step": 11890 + }, + { + "epoch": 0.6149552994677278, + "grad_norm": 0.48518240451812744, + "learning_rate": 0.0005, + "loss": 0.938, + "step": 11900 + }, + { + "epoch": 0.6154720686269444, + "grad_norm": 0.5080456137657166, + "learning_rate": 0.0005, + "loss": 0.9341, + "step": 11910 + }, + { + "epoch": 0.6159888377861609, + "grad_norm": 0.5626226663589478, + "learning_rate": 0.0005, + "loss": 0.9258, + "step": 11920 + }, + { + "epoch": 0.6165056069453775, + "grad_norm": 0.47337082028388977, + "learning_rate": 0.0005, + "loss": 0.9421, + "step": 11930 + }, + { + "epoch": 0.6170223761045941, + "grad_norm": 0.4747110903263092, + "learning_rate": 0.0005, + "loss": 0.9339, + "step": 11940 + }, + { + "epoch": 0.6175391452638107, + "grad_norm": 0.5242559909820557, + "learning_rate": 0.0005, + "loss": 0.942, + "step": 11950 + }, + { + "epoch": 0.6180559144230272, + "grad_norm": 0.5247402191162109, + "learning_rate": 0.0005, + "loss": 0.9269, + "step": 11960 + }, + { + "epoch": 0.6185726835822438, + "grad_norm": 0.5551696419715881, + "learning_rate": 0.0005, + "loss": 0.9268, + "step": 11970 + }, + { + "epoch": 0.6190894527414604, + "grad_norm": 0.5222793817520142, + "learning_rate": 0.0005, + "loss": 0.9331, + "step": 11980 + }, + { + "epoch": 0.6196062219006769, + "grad_norm": 0.49412423372268677, + "learning_rate": 0.0005, + "loss": 0.9292, + "step": 11990 + }, + { + "epoch": 0.6201229910598935, + "grad_norm": 0.49935638904571533, + "learning_rate": 0.0005, + "loss": 0.9168, + "step": 12000 + }, + { + "epoch": 0.6206397602191102, + "grad_norm": 0.5514285564422607, + "learning_rate": 0.0005, + "loss": 0.9289, + "step": 12010 + }, + { + "epoch": 0.6211565293783267, + "grad_norm": 0.5182361602783203, + "learning_rate": 0.0005, + "loss": 0.9359, + "step": 12020 + }, + { + "epoch": 0.6216732985375433, + "grad_norm": 0.5162422060966492, + "learning_rate": 0.0005, + "loss": 0.9257, + "step": 12030 + }, + { + "epoch": 0.6221900676967599, + "grad_norm": 0.4926648437976837, + "learning_rate": 0.0005, + "loss": 0.935, + "step": 12040 + }, + { + "epoch": 0.6227068368559764, + "grad_norm": 0.5213857293128967, + "learning_rate": 0.0005, + "loss": 0.9353, + "step": 12050 + }, + { + "epoch": 0.623223606015193, + "grad_norm": 0.5043472051620483, + "learning_rate": 0.0005, + "loss": 0.9499, + "step": 12060 + }, + { + "epoch": 0.6237403751744096, + "grad_norm": 0.48353925347328186, + "learning_rate": 0.0005, + "loss": 0.9319, + "step": 12070 + }, + { + "epoch": 0.6242571443336262, + "grad_norm": 0.5488812923431396, + "learning_rate": 0.0005, + "loss": 0.9262, + "step": 12080 + }, + { + "epoch": 0.6247739134928427, + "grad_norm": 0.5349071621894836, + "learning_rate": 0.0005, + "loss": 0.9317, + "step": 12090 + }, + { + "epoch": 0.6252906826520593, + "grad_norm": 0.5111981630325317, + "learning_rate": 0.0005, + "loss": 0.9128, + "step": 12100 + }, + { + "epoch": 0.625807451811276, + "grad_norm": 0.525330126285553, + "learning_rate": 0.0005, + "loss": 0.9212, + "step": 12110 + }, + { + "epoch": 0.6263242209704925, + "grad_norm": 0.5191537141799927, + "learning_rate": 0.0005, + "loss": 0.9313, + "step": 12120 + }, + { + "epoch": 0.6268409901297091, + "grad_norm": 0.49418073892593384, + "learning_rate": 0.0005, + "loss": 0.9408, + "step": 12130 + }, + { + "epoch": 0.6273577592889257, + "grad_norm": 0.49373695254325867, + "learning_rate": 0.0005, + "loss": 0.9226, + "step": 12140 + }, + { + "epoch": 0.6278745284481422, + "grad_norm": 0.488068550825119, + "learning_rate": 0.0005, + "loss": 0.9407, + "step": 12150 + }, + { + "epoch": 0.6283912976073588, + "grad_norm": 0.5186513662338257, + "learning_rate": 0.0005, + "loss": 0.9351, + "step": 12160 + }, + { + "epoch": 0.6289080667665754, + "grad_norm": 0.532514750957489, + "learning_rate": 0.0005, + "loss": 0.9323, + "step": 12170 + }, + { + "epoch": 0.6294248359257919, + "grad_norm": 0.4832149147987366, + "learning_rate": 0.0005, + "loss": 0.9303, + "step": 12180 + }, + { + "epoch": 0.6299416050850085, + "grad_norm": 0.5020478963851929, + "learning_rate": 0.0005, + "loss": 0.9278, + "step": 12190 + }, + { + "epoch": 0.6304583742442251, + "grad_norm": 0.45874807238578796, + "learning_rate": 0.0005, + "loss": 0.9205, + "step": 12200 + }, + { + "epoch": 0.6309751434034416, + "grad_norm": 0.5273077487945557, + "learning_rate": 0.0005, + "loss": 0.9133, + "step": 12210 + }, + { + "epoch": 0.6314919125626582, + "grad_norm": 0.49270930886268616, + "learning_rate": 0.0005, + "loss": 0.9228, + "step": 12220 + }, + { + "epoch": 0.6320086817218749, + "grad_norm": 0.47435376048088074, + "learning_rate": 0.0005, + "loss": 0.937, + "step": 12230 + }, + { + "epoch": 0.6325254508810915, + "grad_norm": 0.49013498425483704, + "learning_rate": 0.0005, + "loss": 0.925, + "step": 12240 + }, + { + "epoch": 0.633042220040308, + "grad_norm": 0.481581449508667, + "learning_rate": 0.0005, + "loss": 0.9209, + "step": 12250 + }, + { + "epoch": 0.6335589891995246, + "grad_norm": 0.5189198851585388, + "learning_rate": 0.0005, + "loss": 0.9206, + "step": 12260 + }, + { + "epoch": 0.6340757583587412, + "grad_norm": 0.47871729731559753, + "learning_rate": 0.0005, + "loss": 0.9279, + "step": 12270 + }, + { + "epoch": 0.6345925275179577, + "grad_norm": 0.4953111410140991, + "learning_rate": 0.0005, + "loss": 0.93, + "step": 12280 + }, + { + "epoch": 0.6351092966771743, + "grad_norm": 0.5199342370033264, + "learning_rate": 0.0005, + "loss": 0.9246, + "step": 12290 + }, + { + "epoch": 0.6356260658363909, + "grad_norm": 0.48852893710136414, + "learning_rate": 0.0005, + "loss": 0.9222, + "step": 12300 + }, + { + "epoch": 0.6361428349956074, + "grad_norm": 0.5054774284362793, + "learning_rate": 0.0005, + "loss": 0.9346, + "step": 12310 + }, + { + "epoch": 0.636659604154824, + "grad_norm": 0.5030813813209534, + "learning_rate": 0.0005, + "loss": 0.9238, + "step": 12320 + }, + { + "epoch": 0.6371763733140406, + "grad_norm": 0.47299617528915405, + "learning_rate": 0.0005, + "loss": 0.9317, + "step": 12330 + }, + { + "epoch": 0.6376931424732571, + "grad_norm": 0.5473576784133911, + "learning_rate": 0.0005, + "loss": 0.9206, + "step": 12340 + }, + { + "epoch": 0.6382099116324738, + "grad_norm": 0.4999616742134094, + "learning_rate": 0.0005, + "loss": 0.9449, + "step": 12350 + }, + { + "epoch": 0.6387266807916904, + "grad_norm": 0.5508975982666016, + "learning_rate": 0.0005, + "loss": 0.921, + "step": 12360 + }, + { + "epoch": 0.639243449950907, + "grad_norm": 0.5574737191200256, + "learning_rate": 0.0005, + "loss": 0.935, + "step": 12370 + }, + { + "epoch": 0.6397602191101235, + "grad_norm": 0.5615907907485962, + "learning_rate": 0.0005, + "loss": 0.9263, + "step": 12380 + }, + { + "epoch": 0.6402769882693401, + "grad_norm": 0.5180084109306335, + "learning_rate": 0.0005, + "loss": 0.9235, + "step": 12390 + }, + { + "epoch": 0.6407937574285567, + "grad_norm": 0.46675363183021545, + "learning_rate": 0.0005, + "loss": 0.9237, + "step": 12400 + }, + { + "epoch": 0.6413105265877732, + "grad_norm": 0.4773077070713043, + "learning_rate": 0.0005, + "loss": 0.9098, + "step": 12410 + }, + { + "epoch": 0.6418272957469898, + "grad_norm": 0.5147991180419922, + "learning_rate": 0.0005, + "loss": 0.9215, + "step": 12420 + }, + { + "epoch": 0.6423440649062064, + "grad_norm": 0.47254249453544617, + "learning_rate": 0.0005, + "loss": 0.925, + "step": 12430 + }, + { + "epoch": 0.6428608340654229, + "grad_norm": 0.48444342613220215, + "learning_rate": 0.0005, + "loss": 0.9138, + "step": 12440 + }, + { + "epoch": 0.6433776032246395, + "grad_norm": 0.4626687169075012, + "learning_rate": 0.0005, + "loss": 0.9239, + "step": 12450 + }, + { + "epoch": 0.6438943723838562, + "grad_norm": 0.48663684725761414, + "learning_rate": 0.0005, + "loss": 0.9365, + "step": 12460 + }, + { + "epoch": 0.6444111415430727, + "grad_norm": 0.5721457600593567, + "learning_rate": 0.0005, + "loss": 0.9228, + "step": 12470 + }, + { + "epoch": 0.6449279107022893, + "grad_norm": 0.4997864067554474, + "learning_rate": 0.0005, + "loss": 0.9203, + "step": 12480 + }, + { + "epoch": 0.6454446798615059, + "grad_norm": 0.4961699843406677, + "learning_rate": 0.0005, + "loss": 0.909, + "step": 12490 + }, + { + "epoch": 0.6459614490207224, + "grad_norm": 0.49018388986587524, + "learning_rate": 0.0005, + "loss": 0.9242, + "step": 12500 + }, + { + "epoch": 0.646478218179939, + "grad_norm": 0.5205206871032715, + "learning_rate": 0.0005, + "loss": 0.923, + "step": 12510 + }, + { + "epoch": 0.6469949873391556, + "grad_norm": 0.527740478515625, + "learning_rate": 0.0005, + "loss": 0.9267, + "step": 12520 + }, + { + "epoch": 0.6475117564983722, + "grad_norm": 0.4962241053581238, + "learning_rate": 0.0005, + "loss": 0.9206, + "step": 12530 + }, + { + "epoch": 0.6480285256575887, + "grad_norm": 0.47836676239967346, + "learning_rate": 0.0005, + "loss": 0.9134, + "step": 12540 + }, + { + "epoch": 0.6485452948168053, + "grad_norm": 0.48245546221733093, + "learning_rate": 0.0005, + "loss": 0.9326, + "step": 12550 + }, + { + "epoch": 0.649062063976022, + "grad_norm": 0.503021240234375, + "learning_rate": 0.0005, + "loss": 0.9361, + "step": 12560 + }, + { + "epoch": 0.6495788331352385, + "grad_norm": 0.5059377551078796, + "learning_rate": 0.0005, + "loss": 0.8998, + "step": 12570 + }, + { + "epoch": 0.6500956022944551, + "grad_norm": 0.49928557872772217, + "learning_rate": 0.0004994267553729553, + "loss": 0.929, + "step": 12580 + }, + { + "epoch": 0.6506123714536717, + "grad_norm": 0.4804401099681854, + "learning_rate": 0.0004963394943411699, + "loss": 0.9173, + "step": 12590 + }, + { + "epoch": 0.6511291406128882, + "grad_norm": 0.4649386405944824, + "learning_rate": 0.0004932713175506187, + "loss": 0.9256, + "step": 12600 + }, + { + "epoch": 0.6516459097721048, + "grad_norm": 0.47866883873939514, + "learning_rate": 0.0004902221070299804, + "loss": 0.9185, + "step": 12610 + }, + { + "epoch": 0.6521626789313214, + "grad_norm": 0.4801424443721771, + "learning_rate": 0.00048719174553718596, + "loss": 0.9276, + "step": 12620 + }, + { + "epoch": 0.6526794480905379, + "grad_norm": 0.4797857105731964, + "learning_rate": 0.0004841801165549115, + "loss": 0.9262, + "step": 12630 + }, + { + "epoch": 0.6531962172497545, + "grad_norm": 0.4703647494316101, + "learning_rate": 0.0004811871042860973, + "loss": 0.9113, + "step": 12640 + }, + { + "epoch": 0.6537129864089711, + "grad_norm": 0.4952949583530426, + "learning_rate": 0.00047821259364949593, + "loss": 0.9372, + "step": 12650 + }, + { + "epoch": 0.6542297555681877, + "grad_norm": 0.48347562551498413, + "learning_rate": 0.0004752564702752473, + "loss": 0.9224, + "step": 12660 + }, + { + "epoch": 0.6547465247274042, + "grad_norm": 0.4917808473110199, + "learning_rate": 0.0004723186205004811, + "loss": 0.91, + "step": 12670 + }, + { + "epoch": 0.6552632938866209, + "grad_norm": 0.5070691704750061, + "learning_rate": 0.00046939893136494626, + "loss": 0.9147, + "step": 12680 + }, + { + "epoch": 0.6557800630458375, + "grad_norm": 0.49811315536499023, + "learning_rate": 0.0004664972906066682, + "loss": 0.903, + "step": 12690 + }, + { + "epoch": 0.656296832205054, + "grad_norm": 0.5315011739730835, + "learning_rate": 0.0004636135866576317, + "loss": 0.9087, + "step": 12700 + }, + { + "epoch": 0.6568136013642706, + "grad_norm": 0.4951007068157196, + "learning_rate": 0.00046074770863949155, + "loss": 0.9282, + "step": 12710 + }, + { + "epoch": 0.6573303705234872, + "grad_norm": 0.49288272857666016, + "learning_rate": 0.00045789954635930914, + "loss": 0.9279, + "step": 12720 + }, + { + "epoch": 0.6578471396827037, + "grad_norm": 0.4682476222515106, + "learning_rate": 0.00045506899030531544, + "loss": 0.9122, + "step": 12730 + }, + { + "epoch": 0.6583639088419203, + "grad_norm": 0.5064340233802795, + "learning_rate": 0.0004522559316427005, + "loss": 0.9114, + "step": 12740 + }, + { + "epoch": 0.6588806780011369, + "grad_norm": 0.4566449224948883, + "learning_rate": 0.00044946026220942865, + "loss": 0.9133, + "step": 12750 + }, + { + "epoch": 0.6593974471603534, + "grad_norm": 0.4679611623287201, + "learning_rate": 0.00044668187451207944, + "loss": 0.8991, + "step": 12760 + }, + { + "epoch": 0.65991421631957, + "grad_norm": 0.48330655694007874, + "learning_rate": 0.00044392066172171496, + "loss": 0.9103, + "step": 12770 + }, + { + "epoch": 0.6604309854787866, + "grad_norm": 0.5204933285713196, + "learning_rate": 0.00044117651766977195, + "loss": 0.9149, + "step": 12780 + }, + { + "epoch": 0.6609477546380031, + "grad_norm": 0.48776623606681824, + "learning_rate": 0.00043844933684397984, + "loss": 0.9185, + "step": 12790 + }, + { + "epoch": 0.6614645237972198, + "grad_norm": 0.4869120419025421, + "learning_rate": 0.0004357390143843035, + "loss": 0.9096, + "step": 12800 + }, + { + "epoch": 0.6619812929564364, + "grad_norm": 0.4783307611942291, + "learning_rate": 0.0004330454460789117, + "loss": 0.8977, + "step": 12810 + }, + { + "epoch": 0.662498062115653, + "grad_norm": 0.4555026888847351, + "learning_rate": 0.00043036852836016994, + "loss": 0.9039, + "step": 12820 + }, + { + "epoch": 0.6630148312748695, + "grad_norm": 0.47510290145874023, + "learning_rate": 0.00042770815830065834, + "loss": 0.9051, + "step": 12830 + }, + { + "epoch": 0.6635316004340861, + "grad_norm": 0.4920065999031067, + "learning_rate": 0.0004250642336092143, + "loss": 0.9138, + "step": 12840 + }, + { + "epoch": 0.6640483695933027, + "grad_norm": 0.47680869698524475, + "learning_rate": 0.000422436652626999, + "loss": 0.9131, + "step": 12850 + }, + { + "epoch": 0.6645651387525192, + "grad_norm": 0.5098276138305664, + "learning_rate": 0.00041982531432358883, + "loss": 0.9158, + "step": 12860 + }, + { + "epoch": 0.6650819079117358, + "grad_norm": 0.4745832085609436, + "learning_rate": 0.000417230118293091, + "loss": 0.9019, + "step": 12870 + }, + { + "epoch": 0.6655986770709524, + "grad_norm": 0.456750750541687, + "learning_rate": 0.00041465096475028256, + "loss": 0.8881, + "step": 12880 + }, + { + "epoch": 0.6661154462301689, + "grad_norm": 0.49757450819015503, + "learning_rate": 0.00041208775452677374, + "loss": 0.8971, + "step": 12890 + }, + { + "epoch": 0.6666322153893856, + "grad_norm": 0.4721812605857849, + "learning_rate": 0.0004095403890671951, + "loss": 0.8896, + "step": 12900 + }, + { + "epoch": 0.6671489845486022, + "grad_norm": 0.4674829840660095, + "learning_rate": 0.00040700877042540803, + "loss": 0.8978, + "step": 12910 + }, + { + "epoch": 0.6676657537078187, + "grad_norm": 0.45353659987449646, + "learning_rate": 0.0004044928012607386, + "loss": 0.9012, + "step": 12920 + }, + { + "epoch": 0.6681825228670353, + "grad_norm": 0.44594326615333557, + "learning_rate": 0.0004019923848342348, + "loss": 0.8864, + "step": 12930 + }, + { + "epoch": 0.6686992920262519, + "grad_norm": 0.4606136083602905, + "learning_rate": 0.0003995074250049472, + "loss": 0.9042, + "step": 12940 + }, + { + "epoch": 0.6692160611854685, + "grad_norm": 0.4778830111026764, + "learning_rate": 0.000397037826226232, + "loss": 0.8883, + "step": 12950 + }, + { + "epoch": 0.669732830344685, + "grad_norm": 0.4795719385147095, + "learning_rate": 0.00039458349354207754, + "loss": 0.8943, + "step": 12960 + }, + { + "epoch": 0.6702495995039016, + "grad_norm": 0.46150490641593933, + "learning_rate": 0.000392144332583453, + "loss": 0.8986, + "step": 12970 + }, + { + "epoch": 0.6707663686631182, + "grad_norm": 0.4591388404369354, + "learning_rate": 0.00038972024956468015, + "loss": 0.8973, + "step": 12980 + }, + { + "epoch": 0.6712831378223347, + "grad_norm": 0.447889506816864, + "learning_rate": 0.00038731115127982704, + "loss": 0.8982, + "step": 12990 + }, + { + "epoch": 0.6717999069815513, + "grad_norm": 0.4567711651325226, + "learning_rate": 0.00038491694509912446, + "loss": 0.8946, + "step": 13000 + }, + { + "epoch": 0.672316676140768, + "grad_norm": 0.4653710424900055, + "learning_rate": 0.00038253753896540417, + "loss": 0.8805, + "step": 13010 + }, + { + "epoch": 0.6728334452999845, + "grad_norm": 0.47622108459472656, + "learning_rate": 0.00038017284139055935, + "loss": 0.8971, + "step": 13020 + }, + { + "epoch": 0.6733502144592011, + "grad_norm": 0.46596968173980713, + "learning_rate": 0.0003778227614520272, + "loss": 0.8872, + "step": 13030 + }, + { + "epoch": 0.6738669836184177, + "grad_norm": 0.47842490673065186, + "learning_rate": 0.0003754872087892921, + "loss": 0.8844, + "step": 13040 + }, + { + "epoch": 0.6743837527776342, + "grad_norm": 0.5763306617736816, + "learning_rate": 0.00037316609360041244, + "loss": 0.884, + "step": 13050 + }, + { + "epoch": 0.6749005219368508, + "grad_norm": 0.4681786298751831, + "learning_rate": 0.00037085932663856664, + "loss": 0.8957, + "step": 13060 + }, + { + "epoch": 0.6754172910960674, + "grad_norm": 0.4536014199256897, + "learning_rate": 0.0003685668192086224, + "loss": 0.8962, + "step": 13070 + }, + { + "epoch": 0.675934060255284, + "grad_norm": 0.4593828320503235, + "learning_rate": 0.0003662884831637259, + "loss": 0.8792, + "step": 13080 + }, + { + "epoch": 0.6764508294145005, + "grad_norm": 0.4837941527366638, + "learning_rate": 0.00036402423090191283, + "loss": 0.8928, + "step": 13090 + }, + { + "epoch": 0.6769675985737171, + "grad_norm": 0.47275635600090027, + "learning_rate": 0.0003617739753627399, + "loss": 0.8885, + "step": 13100 + }, + { + "epoch": 0.6774843677329337, + "grad_norm": 0.465971976518631, + "learning_rate": 0.00035953763002393753, + "loss": 0.8859, + "step": 13110 + }, + { + "epoch": 0.6780011368921502, + "grad_norm": 0.46785497665405273, + "learning_rate": 0.00035731510889808296, + "loss": 0.8829, + "step": 13120 + }, + { + "epoch": 0.6785179060513669, + "grad_norm": 0.44653069972991943, + "learning_rate": 0.0003551063265292941, + "loss": 0.8694, + "step": 13130 + }, + { + "epoch": 0.6790346752105835, + "grad_norm": 0.46585527062416077, + "learning_rate": 0.0003529111979899436, + "loss": 0.8871, + "step": 13140 + }, + { + "epoch": 0.6795514443698, + "grad_norm": 0.5283601880073547, + "learning_rate": 0.00035072963887739373, + "loss": 0.8863, + "step": 13150 + }, + { + "epoch": 0.6800682135290166, + "grad_norm": 0.4678700864315033, + "learning_rate": 0.0003485615653107508, + "loss": 0.8859, + "step": 13160 + }, + { + "epoch": 0.6805849826882332, + "grad_norm": 0.4804142713546753, + "learning_rate": 0.0003464068939276399, + "loss": 0.8994, + "step": 13170 + }, + { + "epoch": 0.6811017518474497, + "grad_norm": 0.450847864151001, + "learning_rate": 0.0003442655418809999, + "loss": 0.8894, + "step": 13180 + }, + { + "epoch": 0.6816185210066663, + "grad_norm": 0.46586012840270996, + "learning_rate": 0.00034213742683589774, + "loss": 0.8768, + "step": 13190 + }, + { + "epoch": 0.6821352901658829, + "grad_norm": 0.439656525850296, + "learning_rate": 0.0003400224669663629, + "loss": 0.8855, + "step": 13200 + }, + { + "epoch": 0.6826520593250994, + "grad_norm": 0.4356318712234497, + "learning_rate": 0.00033792058095224076, + "loss": 0.8772, + "step": 13210 + }, + { + "epoch": 0.683168828484316, + "grad_norm": 0.460469514131546, + "learning_rate": 0.0003358316879760663, + "loss": 0.8681, + "step": 13220 + }, + { + "epoch": 0.6836855976435326, + "grad_norm": 0.43120890855789185, + "learning_rate": 0.0003337557077199565, + "loss": 0.8611, + "step": 13230 + }, + { + "epoch": 0.6842023668027493, + "grad_norm": 0.45166271924972534, + "learning_rate": 0.000331692560362522, + "loss": 0.8771, + "step": 13240 + }, + { + "epoch": 0.6847191359619658, + "grad_norm": 0.44746896624565125, + "learning_rate": 0.0003296421665757981, + "loss": 0.8781, + "step": 13250 + }, + { + "epoch": 0.6852359051211824, + "grad_norm": 0.4466201663017273, + "learning_rate": 0.0003276044475221947, + "loss": 0.8647, + "step": 13260 + }, + { + "epoch": 0.685752674280399, + "grad_norm": 0.48084691166877747, + "learning_rate": 0.00032557932485146473, + "loss": 0.9078, + "step": 13270 + }, + { + "epoch": 0.6862694434396155, + "grad_norm": 0.46723824739456177, + "learning_rate": 0.0003235667206976918, + "loss": 0.8802, + "step": 13280 + }, + { + "epoch": 0.6867862125988321, + "grad_norm": 0.4841623902320862, + "learning_rate": 0.00032156655767629616, + "loss": 0.8721, + "step": 13290 + }, + { + "epoch": 0.6873029817580487, + "grad_norm": 0.4535221755504608, + "learning_rate": 0.0003195787588810593, + "loss": 0.8609, + "step": 13300 + }, + { + "epoch": 0.6878197509172652, + "grad_norm": 0.47944900393486023, + "learning_rate": 0.00031760324788116683, + "loss": 0.8803, + "step": 13310 + }, + { + "epoch": 0.6883365200764818, + "grad_norm": 0.4466581344604492, + "learning_rate": 0.00031563994871826995, + "loss": 0.867, + "step": 13320 + }, + { + "epoch": 0.6888532892356984, + "grad_norm": 0.4529067277908325, + "learning_rate": 0.00031368878590356457, + "loss": 0.8861, + "step": 13330 + }, + { + "epoch": 0.6893700583949149, + "grad_norm": 0.45706498622894287, + "learning_rate": 0.00031174968441488886, + "loss": 0.8754, + "step": 13340 + }, + { + "epoch": 0.6898868275541316, + "grad_norm": 0.46450352668762207, + "learning_rate": 0.00030982256969383883, + "loss": 0.8669, + "step": 13350 + }, + { + "epoch": 0.6904035967133482, + "grad_norm": 0.45960313081741333, + "learning_rate": 0.0003079073676429011, + "loss": 0.8669, + "step": 13360 + }, + { + "epoch": 0.6909203658725648, + "grad_norm": 0.4698009192943573, + "learning_rate": 0.00030600400462260457, + "loss": 0.8697, + "step": 13370 + }, + { + "epoch": 0.6914371350317813, + "grad_norm": 0.4546875059604645, + "learning_rate": 0.0003041124074486883, + "loss": 0.863, + "step": 13380 + }, + { + "epoch": 0.6919539041909979, + "grad_norm": 0.4646720588207245, + "learning_rate": 0.00030223250338928787, + "loss": 0.8664, + "step": 13390 + }, + { + "epoch": 0.6924706733502145, + "grad_norm": 0.6140843629837036, + "learning_rate": 0.0003003642201621389, + "loss": 0.8636, + "step": 13400 + }, + { + "epoch": 0.692987442509431, + "grad_norm": 0.46629661321640015, + "learning_rate": 0.0002985074859317977, + "loss": 0.8776, + "step": 13410 + }, + { + "epoch": 0.6935042116686476, + "grad_norm": 0.4489153027534485, + "learning_rate": 0.00029666222930687926, + "loss": 0.8663, + "step": 13420 + }, + { + "epoch": 0.6940209808278642, + "grad_norm": 0.45471352338790894, + "learning_rate": 0.00029482837933731207, + "loss": 0.8514, + "step": 13430 + }, + { + "epoch": 0.6945377499870807, + "grad_norm": 0.4706459045410156, + "learning_rate": 0.00029300586551161034, + "loss": 0.866, + "step": 13440 + }, + { + "epoch": 0.6950545191462973, + "grad_norm": 0.44388100504875183, + "learning_rate": 0.00029119461775416286, + "loss": 0.862, + "step": 13450 + }, + { + "epoch": 0.695571288305514, + "grad_norm": 0.5106334090232849, + "learning_rate": 0.0002893945664225381, + "loss": 0.8563, + "step": 13460 + }, + { + "epoch": 0.6960880574647305, + "grad_norm": 0.4586535096168518, + "learning_rate": 0.00028760564230480724, + "loss": 0.8564, + "step": 13470 + }, + { + "epoch": 0.6966048266239471, + "grad_norm": 0.5277544856071472, + "learning_rate": 0.0002858277766168823, + "loss": 0.8685, + "step": 13480 + }, + { + "epoch": 0.6971215957831637, + "grad_norm": 0.48058634996414185, + "learning_rate": 0.0002840609009998717, + "loss": 0.8645, + "step": 13490 + }, + { + "epoch": 0.6976383649423802, + "grad_norm": 0.4804344177246094, + "learning_rate": 0.0002823049475174519, + "loss": 0.8754, + "step": 13500 + }, + { + "epoch": 0.6981551341015968, + "grad_norm": 0.4439767003059387, + "learning_rate": 0.00028055984865325503, + "loss": 0.8514, + "step": 13510 + }, + { + "epoch": 0.6986719032608134, + "grad_norm": 0.4501279294490814, + "learning_rate": 0.0002788255373082731, + "loss": 0.856, + "step": 13520 + }, + { + "epoch": 0.69918867242003, + "grad_norm": 0.5022059679031372, + "learning_rate": 0.000277101946798278, + "loss": 0.8647, + "step": 13530 + }, + { + "epoch": 0.6997054415792465, + "grad_norm": 0.45433667302131653, + "learning_rate": 0.00027538901085125735, + "loss": 0.8719, + "step": 13540 + }, + { + "epoch": 0.7002222107384631, + "grad_norm": 0.46493837237358093, + "learning_rate": 0.0002736866636048666, + "loss": 0.8599, + "step": 13550 + }, + { + "epoch": 0.7007389798976797, + "grad_norm": 0.45873501896858215, + "learning_rate": 0.0002719948396038963, + "loss": 0.8648, + "step": 13560 + }, + { + "epoch": 0.7012557490568962, + "grad_norm": 0.4426117539405823, + "learning_rate": 0.0002703134737977557, + "loss": 0.8574, + "step": 13570 + }, + { + "epoch": 0.7017725182161129, + "grad_norm": 0.44519364833831787, + "learning_rate": 0.0002686425015379712, + "loss": 0.854, + "step": 13580 + }, + { + "epoch": 0.7022892873753295, + "grad_norm": 0.47185274958610535, + "learning_rate": 0.00026698185857570094, + "loss": 0.8565, + "step": 13590 + }, + { + "epoch": 0.702806056534546, + "grad_norm": 0.43223652243614197, + "learning_rate": 0.00026533148105926436, + "loss": 0.8721, + "step": 13600 + }, + { + "epoch": 0.7033228256937626, + "grad_norm": 0.4602532386779785, + "learning_rate": 0.0002636913055316868, + "loss": 0.8518, + "step": 13610 + }, + { + "epoch": 0.7038395948529792, + "grad_norm": 0.45018014311790466, + "learning_rate": 0.00026206126892826, + "loss": 0.8685, + "step": 13620 + }, + { + "epoch": 0.7043563640121957, + "grad_norm": 0.49739015102386475, + "learning_rate": 0.000260441308574117, + "loss": 0.8483, + "step": 13630 + }, + { + "epoch": 0.7048731331714123, + "grad_norm": 0.4658418595790863, + "learning_rate": 0.00025883136218182235, + "loss": 0.8545, + "step": 13640 + }, + { + "epoch": 0.7053899023306289, + "grad_norm": 0.4808160066604614, + "learning_rate": 0.0002572313678489773, + "loss": 0.8622, + "step": 13650 + }, + { + "epoch": 0.7059066714898455, + "grad_norm": 0.4521915316581726, + "learning_rate": 0.0002556412640558396, + "loss": 0.8632, + "step": 13660 + }, + { + "epoch": 0.706423440649062, + "grad_norm": 0.456153005361557, + "learning_rate": 0.0002540609896629577, + "loss": 0.861, + "step": 13670 + }, + { + "epoch": 0.7069402098082787, + "grad_norm": 0.43279728293418884, + "learning_rate": 0.00025249048390882053, + "loss": 0.8593, + "step": 13680 + }, + { + "epoch": 0.7074569789674953, + "grad_norm": 0.4601012170314789, + "learning_rate": 0.0002509296864075207, + "loss": 0.8629, + "step": 13690 + }, + { + "epoch": 0.7079737481267118, + "grad_norm": 0.47351303696632385, + "learning_rate": 0.0002493785371464332, + "loss": 0.8622, + "step": 13700 + }, + { + "epoch": 0.7084905172859284, + "grad_norm": 0.4869425594806671, + "learning_rate": 0.0002478369764839074, + "loss": 0.8546, + "step": 13710 + }, + { + "epoch": 0.709007286445145, + "grad_norm": 0.4412122964859009, + "learning_rate": 0.0002463049451469741, + "loss": 0.8444, + "step": 13720 + }, + { + "epoch": 0.7095240556043615, + "grad_norm": 0.4480939209461212, + "learning_rate": 0.0002447823842290664, + "loss": 0.848, + "step": 13730 + }, + { + "epoch": 0.7100408247635781, + "grad_norm": 0.4651864767074585, + "learning_rate": 0.00024326923518775486, + "loss": 0.8455, + "step": 13740 + }, + { + "epoch": 0.7105575939227947, + "grad_norm": 0.4487757384777069, + "learning_rate": 0.0002417654398424963, + "loss": 0.841, + "step": 13750 + }, + { + "epoch": 0.7110743630820112, + "grad_norm": 0.44667768478393555, + "learning_rate": 0.00024027094037239717, + "loss": 0.8454, + "step": 13760 + }, + { + "epoch": 0.7115911322412278, + "grad_norm": 0.44757676124572754, + "learning_rate": 0.0002387856793139899, + "loss": 0.8438, + "step": 13770 + }, + { + "epoch": 0.7121079014004444, + "grad_norm": 0.47068849205970764, + "learning_rate": 0.00023730959955902366, + "loss": 0.8434, + "step": 13780 + }, + { + "epoch": 0.7126246705596609, + "grad_norm": 0.4390396773815155, + "learning_rate": 0.00023584264435226848, + "loss": 0.8461, + "step": 13790 + }, + { + "epoch": 0.7131414397188776, + "grad_norm": 0.4566657543182373, + "learning_rate": 0.00023438475728933318, + "loss": 0.8473, + "step": 13800 + }, + { + "epoch": 0.7136582088780942, + "grad_norm": 0.49407103657722473, + "learning_rate": 0.0002329358823144963, + "loss": 0.8431, + "step": 13810 + }, + { + "epoch": 0.7141749780373108, + "grad_norm": 0.47513094544410706, + "learning_rate": 0.00023149596371855103, + "loss": 0.8425, + "step": 13820 + }, + { + "epoch": 0.7146917471965273, + "grad_norm": 0.4418255686759949, + "learning_rate": 0.00023006494613666317, + "loss": 0.8394, + "step": 13830 + }, + { + "epoch": 0.7152085163557439, + "grad_norm": 0.45882540941238403, + "learning_rate": 0.0002286427745462422, + "loss": 0.844, + "step": 13840 + }, + { + "epoch": 0.7157252855149605, + "grad_norm": 0.44126296043395996, + "learning_rate": 0.00022722939426482577, + "loss": 0.8438, + "step": 13850 + }, + { + "epoch": 0.716242054674177, + "grad_norm": 0.44302189350128174, + "learning_rate": 0.00022582475094797713, + "loss": 0.8597, + "step": 13860 + }, + { + "epoch": 0.7167588238333936, + "grad_norm": 0.46645456552505493, + "learning_rate": 0.00022442879058719568, + "loss": 0.8218, + "step": 13870 + }, + { + "epoch": 0.7172755929926102, + "grad_norm": 0.4451071619987488, + "learning_rate": 0.00022304145950784017, + "loss": 0.852, + "step": 13880 + }, + { + "epoch": 0.7177923621518267, + "grad_norm": 0.47982582449913025, + "learning_rate": 0.00022166270436706502, + "loss": 0.8408, + "step": 13890 + }, + { + "epoch": 0.7183091313110433, + "grad_norm": 0.4596095085144043, + "learning_rate": 0.00022029247215176934, + "loss": 0.8333, + "step": 13900 + }, + { + "epoch": 0.71882590047026, + "grad_norm": 0.4595165550708771, + "learning_rate": 0.00021893071017655845, + "loss": 0.8426, + "step": 13910 + }, + { + "epoch": 0.7193426696294765, + "grad_norm": 0.4321739375591278, + "learning_rate": 0.00021757736608171818, + "loss": 0.8419, + "step": 13920 + }, + { + "epoch": 0.7198594387886931, + "grad_norm": 0.4603961706161499, + "learning_rate": 0.00021623238783120176, + "loss": 0.8471, + "step": 13930 + }, + { + "epoch": 0.7203762079479097, + "grad_norm": 0.47230657935142517, + "learning_rate": 0.00021489572371062883, + "loss": 0.8326, + "step": 13940 + }, + { + "epoch": 0.7208929771071263, + "grad_norm": 0.45762136578559875, + "learning_rate": 0.0002135673223252971, + "loss": 0.8425, + "step": 13950 + }, + { + "epoch": 0.7214097462663428, + "grad_norm": 0.4551469385623932, + "learning_rate": 0.00021224713259820633, + "loss": 0.8335, + "step": 13960 + }, + { + "epoch": 0.7219265154255594, + "grad_norm": 0.4409978985786438, + "learning_rate": 0.00021093510376809428, + "loss": 0.8388, + "step": 13970 + }, + { + "epoch": 0.722443284584776, + "grad_norm": 0.444934219121933, + "learning_rate": 0.00020963118538748493, + "loss": 0.8313, + "step": 13980 + }, + { + "epoch": 0.7229600537439925, + "grad_norm": 0.4529027044773102, + "learning_rate": 0.00020833532732074907, + "loss": 0.8298, + "step": 13990 + }, + { + "epoch": 0.7234768229032091, + "grad_norm": 0.44308820366859436, + "learning_rate": 0.00020704747974217608, + "loss": 0.8132, + "step": 14000 + }, + { + "epoch": 0.7239935920624258, + "grad_norm": 0.451187402009964, + "learning_rate": 0.0002057675931340586, + "loss": 0.8465, + "step": 14010 + }, + { + "epoch": 0.7245103612216423, + "grad_norm": 0.4436304569244385, + "learning_rate": 0.00020449561828478832, + "loss": 0.8502, + "step": 14020 + }, + { + "epoch": 0.7250271303808589, + "grad_norm": 0.4516158401966095, + "learning_rate": 0.00020323150628696383, + "loss": 0.8323, + "step": 14030 + }, + { + "epoch": 0.7255438995400755, + "grad_norm": 0.4490114450454712, + "learning_rate": 0.00020197520853551025, + "loss": 0.8366, + "step": 14040 + }, + { + "epoch": 0.726060668699292, + "grad_norm": 0.4692043364048004, + "learning_rate": 0.00020072667672581016, + "loss": 0.8537, + "step": 14050 + }, + { + "epoch": 0.7265774378585086, + "grad_norm": 0.47233638167381287, + "learning_rate": 0.00019948586285184656, + "loss": 0.8387, + "step": 14060 + }, + { + "epoch": 0.7270942070177252, + "grad_norm": 0.43632131814956665, + "learning_rate": 0.00019825271920435674, + "loss": 0.836, + "step": 14070 + }, + { + "epoch": 0.7276109761769418, + "grad_norm": 0.4420956075191498, + "learning_rate": 0.00019702719836899813, + "loss": 0.8381, + "step": 14080 + }, + { + "epoch": 0.7281277453361583, + "grad_norm": 0.4486638009548187, + "learning_rate": 0.00019580925322452495, + "loss": 0.8382, + "step": 14090 + }, + { + "epoch": 0.7286445144953749, + "grad_norm": 0.45652589201927185, + "learning_rate": 0.0001945988369409767, + "loss": 0.8538, + "step": 14100 + }, + { + "epoch": 0.7291612836545915, + "grad_norm": 0.4422604739665985, + "learning_rate": 0.00019339590297787735, + "loss": 0.8321, + "step": 14110 + }, + { + "epoch": 0.729678052813808, + "grad_norm": 0.4418606758117676, + "learning_rate": 0.00019220040508244581, + "loss": 0.8362, + "step": 14120 + }, + { + "epoch": 0.7301948219730247, + "grad_norm": 0.43576526641845703, + "learning_rate": 0.00019101229728781774, + "loss": 0.8131, + "step": 14130 + }, + { + "epoch": 0.7307115911322413, + "grad_norm": 0.4448246657848358, + "learning_rate": 0.0001898315339112779, + "loss": 0.8425, + "step": 14140 + }, + { + "epoch": 0.7312283602914578, + "grad_norm": 0.43587714433670044, + "learning_rate": 0.0001886580695525038, + "loss": 0.8283, + "step": 14150 + }, + { + "epoch": 0.7317451294506744, + "grad_norm": 0.4598979353904724, + "learning_rate": 0.00018749185909182, + "loss": 0.8441, + "step": 14160 + }, + { + "epoch": 0.732261898609891, + "grad_norm": 0.5122143626213074, + "learning_rate": 0.0001863328576884632, + "loss": 0.8497, + "step": 14170 + }, + { + "epoch": 0.7327786677691075, + "grad_norm": 0.45913758873939514, + "learning_rate": 0.00018518102077885824, + "loss": 0.8324, + "step": 14180 + }, + { + "epoch": 0.7332954369283241, + "grad_norm": 0.46700534224510193, + "learning_rate": 0.00018403630407490455, + "loss": 0.8165, + "step": 14190 + }, + { + "epoch": 0.7338122060875407, + "grad_norm": 0.4529505670070648, + "learning_rate": 0.0001828986635622732, + "loss": 0.8345, + "step": 14200 + }, + { + "epoch": 0.7343289752467572, + "grad_norm": 0.4726906716823578, + "learning_rate": 0.0001817680554987149, + "loss": 0.8283, + "step": 14210 + }, + { + "epoch": 0.7348457444059738, + "grad_norm": 0.4485037326812744, + "learning_rate": 0.00018064443641237752, + "loss": 0.8403, + "step": 14220 + }, + { + "epoch": 0.7353625135651904, + "grad_norm": 0.46243423223495483, + "learning_rate": 0.00017952776310013513, + "loss": 0.8292, + "step": 14230 + }, + { + "epoch": 0.7358792827244071, + "grad_norm": 0.45175400376319885, + "learning_rate": 0.00017841799262592663, + "loss": 0.837, + "step": 14240 + }, + { + "epoch": 0.7363960518836236, + "grad_norm": 0.4575372040271759, + "learning_rate": 0.0001773150823191048, + "loss": 0.8224, + "step": 14250 + }, + { + "epoch": 0.7369128210428402, + "grad_norm": 0.4672216773033142, + "learning_rate": 0.00017621898977279577, + "loss": 0.8351, + "step": 14260 + }, + { + "epoch": 0.7374295902020568, + "grad_norm": 0.45373353362083435, + "learning_rate": 0.0001751296728422683, + "loss": 0.8334, + "step": 14270 + }, + { + "epoch": 0.7379463593612733, + "grad_norm": 0.472469687461853, + "learning_rate": 0.0001740470896433135, + "loss": 0.8346, + "step": 14280 + }, + { + "epoch": 0.7384631285204899, + "grad_norm": 0.4568733274936676, + "learning_rate": 0.00017297119855063422, + "loss": 0.8223, + "step": 14290 + }, + { + "epoch": 0.7389798976797065, + "grad_norm": 0.4490255117416382, + "learning_rate": 0.00017190195819624467, + "loss": 0.8298, + "step": 14300 + }, + { + "epoch": 0.739496666838923, + "grad_norm": 0.4388444125652313, + "learning_rate": 0.0001708393274678798, + "loss": 0.8301, + "step": 14310 + }, + { + "epoch": 0.7400134359981396, + "grad_norm": 0.4393922686576843, + "learning_rate": 0.00016978326550741443, + "loss": 0.8379, + "step": 14320 + }, + { + "epoch": 0.7405302051573562, + "grad_norm": 0.44879150390625, + "learning_rate": 0.00016873373170929243, + "loss": 0.8205, + "step": 14330 + }, + { + "epoch": 0.7410469743165727, + "grad_norm": 0.4404836595058441, + "learning_rate": 0.00016769068571896532, + "loss": 0.8197, + "step": 14340 + }, + { + "epoch": 0.7415637434757893, + "grad_norm": 0.47884973883628845, + "learning_rate": 0.00016665408743134062, + "loss": 0.8433, + "step": 14350 + }, + { + "epoch": 0.742080512635006, + "grad_norm": 0.4363346993923187, + "learning_rate": 0.00016562389698924, + "loss": 0.8255, + "step": 14360 + }, + { + "epoch": 0.7425972817942226, + "grad_norm": 0.4692130982875824, + "learning_rate": 0.00016460007478186648, + "loss": 0.8146, + "step": 14370 + }, + { + "epoch": 0.7431140509534391, + "grad_norm": 0.45265311002731323, + "learning_rate": 0.00016358258144328163, + "loss": 0.8166, + "step": 14380 + }, + { + "epoch": 0.7436308201126557, + "grad_norm": 0.46352484822273254, + "learning_rate": 0.00016257137785089182, + "loss": 0.8262, + "step": 14390 + }, + { + "epoch": 0.7441475892718723, + "grad_norm": 0.4382546842098236, + "learning_rate": 0.00016156642512394405, + "loss": 0.8118, + "step": 14400 + }, + { + "epoch": 0.7446643584310888, + "grad_norm": 0.4326501190662384, + "learning_rate": 0.0001605676846220309, + "loss": 0.832, + "step": 14410 + }, + { + "epoch": 0.7451811275903054, + "grad_norm": 0.4683341979980469, + "learning_rate": 0.0001595751179436049, + "loss": 0.8202, + "step": 14420 + }, + { + "epoch": 0.745697896749522, + "grad_norm": 0.4519064426422119, + "learning_rate": 0.0001585886869245019, + "loss": 0.8055, + "step": 14430 + }, + { + "epoch": 0.7462146659087385, + "grad_norm": 0.45761948823928833, + "learning_rate": 0.00015760835363647367, + "loss": 0.8128, + "step": 14440 + }, + { + "epoch": 0.7467314350679551, + "grad_norm": 0.4355948269367218, + "learning_rate": 0.00015663408038572963, + "loss": 0.816, + "step": 14450 + }, + { + "epoch": 0.7472482042271718, + "grad_norm": 0.4464154839515686, + "learning_rate": 0.00015566582971148748, + "loss": 0.8211, + "step": 14460 + }, + { + "epoch": 0.7477649733863883, + "grad_norm": 0.4529094696044922, + "learning_rate": 0.0001547035643845329, + "loss": 0.8124, + "step": 14470 + }, + { + "epoch": 0.7482817425456049, + "grad_norm": 0.48181021213531494, + "learning_rate": 0.00015374724740578792, + "loss": 0.8092, + "step": 14480 + }, + { + "epoch": 0.7487985117048215, + "grad_norm": 0.46071046590805054, + "learning_rate": 0.0001527968420048884, + "loss": 0.7989, + "step": 14490 + }, + { + "epoch": 0.749315280864038, + "grad_norm": 0.4348960220813751, + "learning_rate": 0.00015185231163877035, + "loss": 0.834, + "step": 14500 + }, + { + "epoch": 0.7498320500232546, + "grad_norm": 0.42849427461624146, + "learning_rate": 0.00015091361999026458, + "loss": 0.7947, + "step": 14510 + }, + { + "epoch": 0.7503488191824712, + "grad_norm": 0.42904916405677795, + "learning_rate": 0.00014998073096670058, + "loss": 0.8235, + "step": 14520 + }, + { + "epoch": 0.7508655883416878, + "grad_norm": 0.4777064919471741, + "learning_rate": 0.0001490536086985185, + "loss": 0.8273, + "step": 14530 + }, + { + "epoch": 0.7513823575009043, + "grad_norm": 0.44165903329849243, + "learning_rate": 0.00014813221753789016, + "loss": 0.825, + "step": 14540 + }, + { + "epoch": 0.7518991266601209, + "grad_norm": 0.4439583122730255, + "learning_rate": 0.00014721652205734831, + "loss": 0.827, + "step": 14550 + }, + { + "epoch": 0.7524158958193375, + "grad_norm": 0.455435186624527, + "learning_rate": 0.00014630648704842445, + "loss": 0.8198, + "step": 14560 + }, + { + "epoch": 0.752932664978554, + "grad_norm": 0.4566732347011566, + "learning_rate": 0.00014540207752029508, + "loss": 0.8284, + "step": 14570 + }, + { + "epoch": 0.7534494341377707, + "grad_norm": 0.44228848814964294, + "learning_rate": 0.00014450325869843633, + "loss": 0.8191, + "step": 14580 + }, + { + "epoch": 0.7539662032969873, + "grad_norm": 0.445332795381546, + "learning_rate": 0.0001436099960232868, + "loss": 0.8131, + "step": 14590 + }, + { + "epoch": 0.7544829724562038, + "grad_norm": 0.4628824293613434, + "learning_rate": 0.0001427222551489188, + "loss": 0.8257, + "step": 14600 + }, + { + "epoch": 0.7549997416154204, + "grad_norm": 0.46374180912971497, + "learning_rate": 0.00014184000194171777, + "loss": 0.8334, + "step": 14610 + }, + { + "epoch": 0.755516510774637, + "grad_norm": 0.4505828320980072, + "learning_rate": 0.00014096320247906978, + "loss": 0.8203, + "step": 14620 + }, + { + "epoch": 0.7560332799338535, + "grad_norm": 0.4418148100376129, + "learning_rate": 0.00014009182304805726, + "loss": 0.8071, + "step": 14630 + }, + { + "epoch": 0.7565500490930701, + "grad_norm": 0.43000486493110657, + "learning_rate": 0.0001392258301441627, + "loss": 0.8223, + "step": 14640 + }, + { + "epoch": 0.7570668182522867, + "grad_norm": 0.4482291340827942, + "learning_rate": 0.0001383651904699805, + "loss": 0.8106, + "step": 14650 + }, + { + "epoch": 0.7575835874115033, + "grad_norm": 0.4472900629043579, + "learning_rate": 0.00013750987093393656, + "loss": 0.8196, + "step": 14660 + }, + { + "epoch": 0.7581003565707198, + "grad_norm": 0.45943567156791687, + "learning_rate": 0.00013665983864901587, + "loss": 0.8197, + "step": 14670 + }, + { + "epoch": 0.7586171257299364, + "grad_norm": 0.43818199634552, + "learning_rate": 0.00013581506093149825, + "loss": 0.8003, + "step": 14680 + }, + { + "epoch": 0.7591338948891531, + "grad_norm": 0.43463850021362305, + "learning_rate": 0.0001349755052997014, + "loss": 0.8086, + "step": 14690 + }, + { + "epoch": 0.7596506640483696, + "grad_norm": 0.4578488767147064, + "learning_rate": 0.00013414113947273217, + "loss": 0.8011, + "step": 14700 + }, + { + "epoch": 0.7601674332075862, + "grad_norm": 0.44629108905792236, + "learning_rate": 0.00013331193136924515, + "loss": 0.8086, + "step": 14710 + }, + { + "epoch": 0.7606842023668028, + "grad_norm": 0.4482209384441376, + "learning_rate": 0.00013248784910620945, + "loss": 0.7996, + "step": 14720 + }, + { + "epoch": 0.7612009715260193, + "grad_norm": 0.4447433650493622, + "learning_rate": 0.00013166886099768245, + "loss": 0.8162, + "step": 14730 + }, + { + "epoch": 0.7617177406852359, + "grad_norm": 0.44065767526626587, + "learning_rate": 0.00013085493555359173, + "loss": 0.826, + "step": 14740 + }, + { + "epoch": 0.7622345098444525, + "grad_norm": 0.47181805968284607, + "learning_rate": 0.00013004604147852416, + "loss": 0.8074, + "step": 14750 + }, + { + "epoch": 0.762751279003669, + "grad_norm": 0.44598037004470825, + "learning_rate": 0.00012924214767052268, + "loss": 0.8047, + "step": 14760 + }, + { + "epoch": 0.7632680481628856, + "grad_norm": 0.4688059091567993, + "learning_rate": 0.00012844322321989025, + "loss": 0.8076, + "step": 14770 + }, + { + "epoch": 0.7637848173221022, + "grad_norm": 0.47695672512054443, + "learning_rate": 0.00012764923740800162, + "loss": 0.7913, + "step": 14780 + }, + { + "epoch": 0.7643015864813187, + "grad_norm": 0.4601481556892395, + "learning_rate": 0.00012686015970612207, + "loss": 0.8122, + "step": 14790 + }, + { + "epoch": 0.7648183556405354, + "grad_norm": 0.46827730536460876, + "learning_rate": 0.0001260759597742335, + "loss": 0.8136, + "step": 14800 + }, + { + "epoch": 0.765335124799752, + "grad_norm": 0.43789979815483093, + "learning_rate": 0.00012529660745986808, + "loss": 0.8131, + "step": 14810 + }, + { + "epoch": 0.7658518939589686, + "grad_norm": 0.44412630796432495, + "learning_rate": 0.00012452207279694858, + "loss": 0.7994, + "step": 14820 + }, + { + "epoch": 0.7663686631181851, + "grad_norm": 0.44957849383354187, + "learning_rate": 0.00012375232600463646, + "loss": 0.801, + "step": 14830 + }, + { + "epoch": 0.7668854322774017, + "grad_norm": 0.4659784436225891, + "learning_rate": 0.0001229873374861867, + "loss": 0.8011, + "step": 14840 + }, + { + "epoch": 0.7674022014366183, + "grad_norm": 0.4447031617164612, + "learning_rate": 0.00012222707782780977, + "loss": 0.8132, + "step": 14850 + }, + { + "epoch": 0.7679189705958348, + "grad_norm": 0.45082828402519226, + "learning_rate": 0.00012147151779754062, + "loss": 0.8067, + "step": 14860 + }, + { + "epoch": 0.7684357397550514, + "grad_norm": 0.42726126313209534, + "learning_rate": 0.00012072062834411491, + "loss": 0.81, + "step": 14870 + }, + { + "epoch": 0.768952508914268, + "grad_norm": 0.46154364943504333, + "learning_rate": 0.00011997438059585174, + "loss": 0.8063, + "step": 14880 + }, + { + "epoch": 0.7694692780734845, + "grad_norm": 0.45202165842056274, + "learning_rate": 0.00011923274585954376, + "loss": 0.8066, + "step": 14890 + }, + { + "epoch": 0.7699860472327011, + "grad_norm": 0.43574896454811096, + "learning_rate": 0.00011849569561935377, + "loss": 0.8024, + "step": 14900 + }, + { + "epoch": 0.7705028163919178, + "grad_norm": 0.4647500514984131, + "learning_rate": 0.00011776320153571831, + "loss": 0.8047, + "step": 14910 + }, + { + "epoch": 0.7710195855511343, + "grad_norm": 0.4715510308742523, + "learning_rate": 0.00011703523544425804, + "loss": 0.8242, + "step": 14920 + }, + { + "epoch": 0.7715363547103509, + "grad_norm": 0.48043355345726013, + "learning_rate": 0.00011631176935469487, + "loss": 0.8014, + "step": 14930 + }, + { + "epoch": 0.7720531238695675, + "grad_norm": 0.45127764344215393, + "learning_rate": 0.00011559277544977559, + "loss": 0.8143, + "step": 14940 + }, + { + "epoch": 0.7725698930287841, + "grad_norm": 0.447942852973938, + "learning_rate": 0.0001148782260842024, + "loss": 0.815, + "step": 14950 + }, + { + "epoch": 0.7730866621880006, + "grad_norm": 0.4494159519672394, + "learning_rate": 0.00011416809378356995, + "loss": 0.8193, + "step": 14960 + }, + { + "epoch": 0.7736034313472172, + "grad_norm": 0.4411426782608032, + "learning_rate": 0.00011346235124330891, + "loss": 0.7971, + "step": 14970 + }, + { + "epoch": 0.7741202005064338, + "grad_norm": 0.4652232229709625, + "learning_rate": 0.0001127609713276361, + "loss": 0.8108, + "step": 14980 + }, + { + "epoch": 0.7746369696656503, + "grad_norm": 0.48985597491264343, + "learning_rate": 0.00011206392706851122, + "loss": 0.8061, + "step": 14990 + }, + { + "epoch": 0.7751537388248669, + "grad_norm": 0.4511886239051819, + "learning_rate": 0.00011137119166459977, + "loss": 0.8046, + "step": 15000 + }, + { + "epoch": 0.7756705079840835, + "grad_norm": 0.4621480405330658, + "learning_rate": 0.00011068273848024272, + "loss": 0.8116, + "step": 15010 + }, + { + "epoch": 0.7761872771433, + "grad_norm": 0.45318228006362915, + "learning_rate": 0.00010999854104443217, + "loss": 0.7992, + "step": 15020 + }, + { + "epoch": 0.7767040463025167, + "grad_norm": 0.46225494146347046, + "learning_rate": 0.00010931857304979372, + "loss": 0.8055, + "step": 15030 + }, + { + "epoch": 0.7772208154617333, + "grad_norm": 0.4576970934867859, + "learning_rate": 0.00010864280835157488, + "loss": 0.7918, + "step": 15040 + }, + { + "epoch": 0.7777375846209498, + "grad_norm": 0.43827998638153076, + "learning_rate": 0.00010797122096663975, + "loss": 0.8124, + "step": 15050 + }, + { + "epoch": 0.7782543537801664, + "grad_norm": 0.4270840883255005, + "learning_rate": 0.00010730378507247009, + "loss": 0.8027, + "step": 15060 + }, + { + "epoch": 0.778771122939383, + "grad_norm": 0.4645536243915558, + "learning_rate": 0.00010664047500617232, + "loss": 0.8103, + "step": 15070 + }, + { + "epoch": 0.7792878920985996, + "grad_norm": 0.4405182898044586, + "learning_rate": 0.00010598126526349083, + "loss": 0.7886, + "step": 15080 + }, + { + "epoch": 0.7798046612578161, + "grad_norm": 0.4572370648384094, + "learning_rate": 0.00010532613049782744, + "loss": 0.8021, + "step": 15090 + }, + { + "epoch": 0.7803214304170327, + "grad_norm": 0.4464896321296692, + "learning_rate": 0.00010467504551926664, + "loss": 0.7897, + "step": 15100 + }, + { + "epoch": 0.7808381995762493, + "grad_norm": 0.470245897769928, + "learning_rate": 0.00010402798529360717, + "loss": 0.8053, + "step": 15110 + }, + { + "epoch": 0.7813549687354658, + "grad_norm": 0.4271971583366394, + "learning_rate": 0.00010338492494139942, + "loss": 0.8144, + "step": 15120 + }, + { + "epoch": 0.7818717378946825, + "grad_norm": 0.45670023560523987, + "learning_rate": 0.00010274583973698883, + "loss": 0.8012, + "step": 15130 + }, + { + "epoch": 0.7823885070538991, + "grad_norm": 0.4224714934825897, + "learning_rate": 0.0001021107051075651, + "loss": 0.785, + "step": 15140 + }, + { + "epoch": 0.7829052762131156, + "grad_norm": 0.43493083119392395, + "learning_rate": 0.00010147949663221759, + "loss": 0.8028, + "step": 15150 + }, + { + "epoch": 0.7834220453723322, + "grad_norm": 0.4562802016735077, + "learning_rate": 0.00010085219004099603, + "loss": 0.8052, + "step": 15160 + }, + { + "epoch": 0.7839388145315488, + "grad_norm": 0.44530564546585083, + "learning_rate": 0.00010022876121397758, + "loss": 0.8073, + "step": 15170 + }, + { + "epoch": 0.7844555836907653, + "grad_norm": 0.5228975415229797, + "learning_rate": 9.960918618033934e-05, + "loss": 0.8089, + "step": 15180 + }, + { + "epoch": 0.7849723528499819, + "grad_norm": 0.44067102670669556, + "learning_rate": 9.899344111743661e-05, + "loss": 0.7955, + "step": 15190 + }, + { + "epoch": 0.7854891220091985, + "grad_norm": 0.474118173122406, + "learning_rate": 9.838150234988704e-05, + "loss": 0.7932, + "step": 15200 + }, + { + "epoch": 0.786005891168415, + "grad_norm": 0.4493066668510437, + "learning_rate": 9.777334634866019e-05, + "loss": 0.7938, + "step": 15210 + }, + { + "epoch": 0.7865226603276316, + "grad_norm": 0.44325533509254456, + "learning_rate": 9.716894973017291e-05, + "loss": 0.8098, + "step": 15220 + }, + { + "epoch": 0.7870394294868482, + "grad_norm": 0.44017842411994934, + "learning_rate": 9.656828925539026e-05, + "loss": 0.7872, + "step": 15230 + }, + { + "epoch": 0.7875561986460649, + "grad_norm": 0.4537578225135803, + "learning_rate": 9.597134182893185e-05, + "loss": 0.8046, + "step": 15240 + }, + { + "epoch": 0.7880729678052814, + "grad_norm": 0.43279150128364563, + "learning_rate": 9.5378084498184e-05, + "loss": 0.8155, + "step": 15250 + }, + { + "epoch": 0.788589736964498, + "grad_norm": 0.45793530344963074, + "learning_rate": 9.478849445241703e-05, + "loss": 0.8033, + "step": 15260 + }, + { + "epoch": 0.7891065061237146, + "grad_norm": 0.45037081837654114, + "learning_rate": 9.420254902190833e-05, + "loss": 0.7985, + "step": 15270 + }, + { + "epoch": 0.7896232752829311, + "grad_norm": 0.4623776972293854, + "learning_rate": 9.362022567707067e-05, + "loss": 0.8197, + "step": 15280 + }, + { + "epoch": 0.7901400444421477, + "grad_norm": 0.4537854790687561, + "learning_rate": 9.30415020275859e-05, + "loss": 0.7926, + "step": 15290 + }, + { + "epoch": 0.7906568136013643, + "grad_norm": 0.4492059648036957, + "learning_rate": 9.246635582154403e-05, + "loss": 0.7938, + "step": 15300 + }, + { + "epoch": 0.7911735827605808, + "grad_norm": 0.4396090805530548, + "learning_rate": 9.189476494458775e-05, + "loss": 0.7999, + "step": 15310 + }, + { + "epoch": 0.7916903519197974, + "grad_norm": 0.43469393253326416, + "learning_rate": 9.132670741906201e-05, + "loss": 0.7994, + "step": 15320 + }, + { + "epoch": 0.792207121079014, + "grad_norm": 0.44428810477256775, + "learning_rate": 9.076216140316906e-05, + "loss": 0.8043, + "step": 15330 + }, + { + "epoch": 0.7927238902382305, + "grad_norm": 0.4329991638660431, + "learning_rate": 9.02011051901286e-05, + "loss": 0.7877, + "step": 15340 + }, + { + "epoch": 0.7932406593974471, + "grad_norm": 0.4495084583759308, + "learning_rate": 8.964351720734322e-05, + "loss": 0.7969, + "step": 15350 + }, + { + "epoch": 0.7937574285566638, + "grad_norm": 0.4632558822631836, + "learning_rate": 8.908937601556875e-05, + "loss": 0.7895, + "step": 15360 + }, + { + "epoch": 0.7942741977158804, + "grad_norm": 0.44832077622413635, + "learning_rate": 8.853866030809016e-05, + "loss": 0.7928, + "step": 15370 + }, + { + "epoch": 0.7947909668750969, + "grad_norm": 0.4608152210712433, + "learning_rate": 8.799134890990218e-05, + "loss": 0.8033, + "step": 15380 + }, + { + "epoch": 0.7953077360343135, + "grad_norm": 0.45813852548599243, + "learning_rate": 8.744742077689513e-05, + "loss": 0.8127, + "step": 15390 + }, + { + "epoch": 0.7958245051935301, + "grad_norm": 0.4426814317703247, + "learning_rate": 8.69068549950458e-05, + "loss": 0.7939, + "step": 15400 + }, + { + "epoch": 0.7963412743527466, + "grad_norm": 0.4528840482234955, + "learning_rate": 8.636963077961332e-05, + "loss": 0.7889, + "step": 15410 + }, + { + "epoch": 0.7968580435119632, + "grad_norm": 0.4318794310092926, + "learning_rate": 8.583572747433989e-05, + "loss": 0.79, + "step": 15420 + }, + { + "epoch": 0.7973748126711798, + "grad_norm": 0.4563692808151245, + "learning_rate": 8.530512455065673e-05, + "loss": 0.7922, + "step": 15430 + }, + { + "epoch": 0.7978915818303963, + "grad_norm": 0.44473403692245483, + "learning_rate": 8.477780160689458e-05, + "loss": 0.7999, + "step": 15440 + }, + { + "epoch": 0.7984083509896129, + "grad_norm": 0.45080122351646423, + "learning_rate": 8.425373836749934e-05, + "loss": 0.7854, + "step": 15450 + }, + { + "epoch": 0.7989251201488295, + "grad_norm": 0.4660671055316925, + "learning_rate": 8.373291468225247e-05, + "loss": 0.8033, + "step": 15460 + }, + { + "epoch": 0.799441889308046, + "grad_norm": 0.43612638115882874, + "learning_rate": 8.321531052549621e-05, + "loss": 0.7975, + "step": 15470 + }, + { + "epoch": 0.7999586584672627, + "grad_norm": 0.44829973578453064, + "learning_rate": 8.270090599536357e-05, + "loss": 0.7865, + "step": 15480 + }, + { + "epoch": 0.8004754276264793, + "grad_norm": 0.4527774751186371, + "learning_rate": 8.218968131301314e-05, + "loss": 0.7994, + "step": 15490 + }, + { + "epoch": 0.8009921967856958, + "grad_norm": 0.46482163667678833, + "learning_rate": 8.16816168218686e-05, + "loss": 0.7949, + "step": 15500 + }, + { + "epoch": 0.8015089659449124, + "grad_norm": 0.4425605535507202, + "learning_rate": 8.117669298686285e-05, + "loss": 0.7708, + "step": 15510 + }, + { + "epoch": 0.802025735104129, + "grad_norm": 0.4287862777709961, + "learning_rate": 8.0674890393687e-05, + "loss": 0.801, + "step": 15520 + }, + { + "epoch": 0.8025425042633456, + "grad_norm": 0.4485211670398712, + "learning_rate": 8.017618974804377e-05, + "loss": 0.7876, + "step": 15530 + }, + { + "epoch": 0.8030592734225621, + "grad_norm": 0.43715623021125793, + "learning_rate": 7.968057187490574e-05, + "loss": 0.7984, + "step": 15540 + }, + { + "epoch": 0.8035760425817787, + "grad_norm": 0.4431898891925812, + "learning_rate": 7.918801771777797e-05, + "loss": 0.787, + "step": 15550 + }, + { + "epoch": 0.8040928117409953, + "grad_norm": 0.4634036421775818, + "learning_rate": 7.869850833796537e-05, + "loss": 0.8002, + "step": 15560 + }, + { + "epoch": 0.8046095809002118, + "grad_norm": 0.4434111416339874, + "learning_rate": 7.821202491384445e-05, + "loss": 0.7827, + "step": 15570 + }, + { + "epoch": 0.8051263500594285, + "grad_norm": 0.4345285892486572, + "learning_rate": 7.77285487401396e-05, + "loss": 0.7983, + "step": 15580 + }, + { + "epoch": 0.8056431192186451, + "grad_norm": 0.4299919605255127, + "learning_rate": 7.724806122720396e-05, + "loss": 0.7777, + "step": 15590 + }, + { + "epoch": 0.8061598883778616, + "grad_norm": 0.44167646765708923, + "learning_rate": 7.677054390030455e-05, + "loss": 0.7967, + "step": 15600 + }, + { + "epoch": 0.8066766575370782, + "grad_norm": 0.4805566370487213, + "learning_rate": 7.629597839891209e-05, + "loss": 0.809, + "step": 15610 + }, + { + "epoch": 0.8071934266962948, + "grad_norm": 0.4554888606071472, + "learning_rate": 7.582434647599476e-05, + "loss": 0.792, + "step": 15620 + }, + { + "epoch": 0.8077101958555113, + "grad_norm": 0.4604235887527466, + "learning_rate": 7.535562999731686e-05, + "loss": 0.7825, + "step": 15630 + }, + { + "epoch": 0.8082269650147279, + "grad_norm": 0.47276201844215393, + "learning_rate": 7.488981094074143e-05, + "loss": 0.7981, + "step": 15640 + }, + { + "epoch": 0.8087437341739445, + "grad_norm": 0.46937987208366394, + "learning_rate": 7.442687139553729e-05, + "loss": 0.7825, + "step": 15650 + }, + { + "epoch": 0.8092605033331611, + "grad_norm": 0.44667670130729675, + "learning_rate": 7.396679356169044e-05, + "loss": 0.7788, + "step": 15660 + }, + { + "epoch": 0.8097772724923776, + "grad_norm": 0.4452296197414398, + "learning_rate": 7.35095597492196e-05, + "loss": 0.7962, + "step": 15670 + }, + { + "epoch": 0.8102940416515942, + "grad_norm": 0.47155633568763733, + "learning_rate": 7.3055152377496e-05, + "loss": 0.7937, + "step": 15680 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 0.4572817087173462, + "learning_rate": 7.260355397456748e-05, + "loss": 0.7911, + "step": 15690 + }, + { + "epoch": 0.8113275799700274, + "grad_norm": 0.4582803547382355, + "learning_rate": 7.21547471764867e-05, + "loss": 0.7832, + "step": 15700 + }, + { + "epoch": 0.811844349129244, + "grad_norm": 0.45184165239334106, + "learning_rate": 7.170871472664335e-05, + "loss": 0.7896, + "step": 15710 + }, + { + "epoch": 0.8123611182884606, + "grad_norm": 0.462866872549057, + "learning_rate": 7.126543947510089e-05, + "loss": 0.8053, + "step": 15720 + }, + { + "epoch": 0.8128778874476771, + "grad_norm": 0.4350687265396118, + "learning_rate": 7.082490437793685e-05, + "loss": 0.7901, + "step": 15730 + }, + { + "epoch": 0.8133946566068937, + "grad_norm": 0.48868757486343384, + "learning_rate": 7.03870924965877e-05, + "loss": 0.7932, + "step": 15740 + }, + { + "epoch": 0.8139114257661103, + "grad_norm": 0.4378123879432678, + "learning_rate": 6.995198699719745e-05, + "loss": 0.8041, + "step": 15750 + }, + { + "epoch": 0.8144281949253268, + "grad_norm": 0.43519341945648193, + "learning_rate": 6.95195711499705e-05, + "loss": 0.7868, + "step": 15760 + }, + { + "epoch": 0.8149449640845434, + "grad_norm": 0.434491366147995, + "learning_rate": 6.908982832852821e-05, + "loss": 0.7872, + "step": 15770 + }, + { + "epoch": 0.81546173324376, + "grad_norm": 0.44694221019744873, + "learning_rate": 6.86627420092698e-05, + "loss": 0.7804, + "step": 15780 + }, + { + "epoch": 0.8159785024029765, + "grad_norm": 0.4496343731880188, + "learning_rate": 6.823829577073686e-05, + "loss": 0.7805, + "step": 15790 + }, + { + "epoch": 0.8164952715621931, + "grad_norm": 0.4403352737426758, + "learning_rate": 6.781647329298209e-05, + "loss": 0.7783, + "step": 15800 + }, + { + "epoch": 0.8170120407214098, + "grad_norm": 0.43307387828826904, + "learning_rate": 6.739725835694167e-05, + "loss": 0.7883, + "step": 15810 + }, + { + "epoch": 0.8175288098806264, + "grad_norm": 0.4405989646911621, + "learning_rate": 6.698063484381174e-05, + "loss": 0.7945, + "step": 15820 + }, + { + "epoch": 0.8180455790398429, + "grad_norm": 0.46816104650497437, + "learning_rate": 6.656658673442854e-05, + "loss": 0.7719, + "step": 15830 + }, + { + "epoch": 0.8185623481990595, + "grad_norm": 0.4712413251399994, + "learning_rate": 6.615509810865257e-05, + "loss": 0.8033, + "step": 15840 + }, + { + "epoch": 0.8190791173582761, + "grad_norm": 0.45156368613243103, + "learning_rate": 6.574615314475637e-05, + "loss": 0.7981, + "step": 15850 + }, + { + "epoch": 0.8195958865174926, + "grad_norm": 0.44122111797332764, + "learning_rate": 6.533973611881624e-05, + "loss": 0.7945, + "step": 15860 + }, + { + "epoch": 0.8201126556767092, + "grad_norm": 0.4496499001979828, + "learning_rate": 6.493583140410763e-05, + "loss": 0.7858, + "step": 15870 + }, + { + "epoch": 0.8206294248359258, + "grad_norm": 0.4501078128814697, + "learning_rate": 6.453442347050426e-05, + "loss": 0.7928, + "step": 15880 + }, + { + "epoch": 0.8211461939951423, + "grad_norm": 0.4360281825065613, + "learning_rate": 6.413549688388107e-05, + "loss": 0.787, + "step": 15890 + }, + { + "epoch": 0.8216629631543589, + "grad_norm": 0.4398462176322937, + "learning_rate": 6.37390363055207e-05, + "loss": 0.7736, + "step": 15900 + }, + { + "epoch": 0.8221797323135756, + "grad_norm": 0.44592639803886414, + "learning_rate": 6.334502649152376e-05, + "loss": 0.7869, + "step": 15910 + }, + { + "epoch": 0.822696501472792, + "grad_norm": 0.44563406705856323, + "learning_rate": 6.295345229222268e-05, + "loss": 0.7859, + "step": 15920 + }, + { + "epoch": 0.8232132706320087, + "grad_norm": 0.46638575196266174, + "learning_rate": 6.256429865159924e-05, + "loss": 0.7921, + "step": 15930 + }, + { + "epoch": 0.8237300397912253, + "grad_norm": 0.458056777715683, + "learning_rate": 6.217755060670557e-05, + "loss": 0.7799, + "step": 15940 + }, + { + "epoch": 0.8242468089504419, + "grad_norm": 0.4988017976284027, + "learning_rate": 6.1793193287089e-05, + "loss": 0.7771, + "step": 15950 + }, + { + "epoch": 0.8247635781096584, + "grad_norm": 0.44715121388435364, + "learning_rate": 6.141121191422011e-05, + "loss": 0.7974, + "step": 15960 + }, + { + "epoch": 0.825280347268875, + "grad_norm": 0.45090383291244507, + "learning_rate": 6.1031591800924596e-05, + "loss": 0.7683, + "step": 15970 + }, + { + "epoch": 0.8257971164280916, + "grad_norm": 0.43011826276779175, + "learning_rate": 6.0654318350818545e-05, + "loss": 0.7791, + "step": 15980 + }, + { + "epoch": 0.8263138855873081, + "grad_norm": 0.4606122672557831, + "learning_rate": 6.027937705774713e-05, + "loss": 0.7998, + "step": 15990 + }, + { + "epoch": 0.8268306547465247, + "grad_norm": 0.4207383096218109, + "learning_rate": 5.9906753505226956e-05, + "loss": 0.7785, + "step": 16000 + }, + { + "epoch": 0.8273474239057413, + "grad_norm": 0.4336974620819092, + "learning_rate": 5.953643336589173e-05, + "loss": 0.7834, + "step": 16010 + }, + { + "epoch": 0.8278641930649578, + "grad_norm": 0.4548156261444092, + "learning_rate": 5.916840240094121e-05, + "loss": 0.7922, + "step": 16020 + }, + { + "epoch": 0.8283809622241745, + "grad_norm": 0.43436485528945923, + "learning_rate": 5.880264645959399e-05, + "loss": 0.7804, + "step": 16030 + }, + { + "epoch": 0.8288977313833911, + "grad_norm": 0.4377012252807617, + "learning_rate": 5.843915147854316e-05, + "loss": 0.7718, + "step": 16040 + }, + { + "epoch": 0.8294145005426076, + "grad_norm": 0.46145206689834595, + "learning_rate": 5.807790348141579e-05, + "loss": 0.7888, + "step": 16050 + }, + { + "epoch": 0.8299312697018242, + "grad_norm": 0.444749116897583, + "learning_rate": 5.771888857823527e-05, + "loss": 0.7978, + "step": 16060 + }, + { + "epoch": 0.8304480388610408, + "grad_norm": 0.4541518986225128, + "learning_rate": 5.736209296488757e-05, + "loss": 0.7849, + "step": 16070 + }, + { + "epoch": 0.8309648080202574, + "grad_norm": 0.43136441707611084, + "learning_rate": 5.7007502922590154e-05, + "loss": 0.7924, + "step": 16080 + }, + { + "epoch": 0.8314815771794739, + "grad_norm": 0.4634501338005066, + "learning_rate": 5.665510481736475e-05, + "loss": 0.7966, + "step": 16090 + }, + { + "epoch": 0.8319983463386905, + "grad_norm": 0.45138517022132874, + "learning_rate": 5.63048850995129e-05, + "loss": 0.783, + "step": 16100 + }, + { + "epoch": 0.8325151154979071, + "grad_norm": 0.45926496386528015, + "learning_rate": 5.59568303030952e-05, + "loss": 0.7903, + "step": 16110 + }, + { + "epoch": 0.8330318846571236, + "grad_norm": 0.4217846691608429, + "learning_rate": 5.561092704541337e-05, + "loss": 0.765, + "step": 16120 + }, + { + "epoch": 0.8335486538163402, + "grad_norm": 0.46820348501205444, + "learning_rate": 5.526716202649569e-05, + "loss": 0.7917, + "step": 16130 + }, + { + "epoch": 0.8340654229755569, + "grad_norm": 0.45810696482658386, + "learning_rate": 5.492552202858579e-05, + "loss": 0.7771, + "step": 16140 + }, + { + "epoch": 0.8345821921347734, + "grad_norm": 0.45739495754241943, + "learning_rate": 5.458599391563416e-05, + "loss": 0.7949, + "step": 16150 + }, + { + "epoch": 0.83509896129399, + "grad_norm": 0.45775654911994934, + "learning_rate": 5.4248564632793354e-05, + "loss": 0.7748, + "step": 16160 + }, + { + "epoch": 0.8356157304532066, + "grad_norm": 0.471780925989151, + "learning_rate": 5.3913221205915764e-05, + "loss": 0.7908, + "step": 16170 + }, + { + "epoch": 0.8361324996124231, + "grad_norm": 0.4380318522453308, + "learning_rate": 5.3579950741055e-05, + "loss": 0.7871, + "step": 16180 + }, + { + "epoch": 0.8366492687716397, + "grad_norm": 0.45614588260650635, + "learning_rate": 5.324874042396992e-05, + "loss": 0.7717, + "step": 16190 + }, + { + "epoch": 0.8371660379308563, + "grad_norm": 0.42838895320892334, + "learning_rate": 5.29195775196321e-05, + "loss": 0.7816, + "step": 16200 + }, + { + "epoch": 0.8376828070900728, + "grad_norm": 0.47133561968803406, + "learning_rate": 5.259244937173599e-05, + "loss": 0.7732, + "step": 16210 + }, + { + "epoch": 0.8381995762492894, + "grad_norm": 0.42173993587493896, + "learning_rate": 5.226734340221249e-05, + "loss": 0.7687, + "step": 16220 + }, + { + "epoch": 0.838716345408506, + "grad_norm": 0.42915183305740356, + "learning_rate": 5.194424711074507e-05, + "loss": 0.7866, + "step": 16230 + }, + { + "epoch": 0.8392331145677226, + "grad_norm": 0.4370039999485016, + "learning_rate": 5.1623148074289386e-05, + "loss": 0.7855, + "step": 16240 + }, + { + "epoch": 0.8397498837269392, + "grad_norm": 0.4343273937702179, + "learning_rate": 5.130403394659548e-05, + "loss": 0.7871, + "step": 16250 + }, + { + "epoch": 0.8402666528861558, + "grad_norm": 0.4628264009952545, + "learning_rate": 5.0986892457733016e-05, + "loss": 0.7929, + "step": 16260 + }, + { + "epoch": 0.8407834220453724, + "grad_norm": 0.4544295072555542, + "learning_rate": 5.067171141361967e-05, + "loss": 0.7823, + "step": 16270 + }, + { + "epoch": 0.8413001912045889, + "grad_norm": 0.46135464310646057, + "learning_rate": 5.035847869555207e-05, + "loss": 0.7747, + "step": 16280 + }, + { + "epoch": 0.8418169603638055, + "grad_norm": 0.44259122014045715, + "learning_rate": 5.004718225974004e-05, + "loss": 0.7836, + "step": 16290 + }, + { + "epoch": 0.8423337295230221, + "grad_norm": 0.44478118419647217, + "learning_rate": 4.9737810136843286e-05, + "loss": 0.7664, + "step": 16300 + }, + { + "epoch": 0.8428504986822386, + "grad_norm": 0.44629231095314026, + "learning_rate": 4.943035043151143e-05, + "loss": 0.7906, + "step": 16310 + }, + { + "epoch": 0.8433672678414552, + "grad_norm": 0.4398927092552185, + "learning_rate": 4.912479132192638e-05, + "loss": 0.7835, + "step": 16320 + }, + { + "epoch": 0.8438840370006718, + "grad_norm": 0.4557620882987976, + "learning_rate": 4.882112105934801e-05, + "loss": 0.7727, + "step": 16330 + }, + { + "epoch": 0.8444008061598883, + "grad_norm": 0.45272544026374817, + "learning_rate": 4.851932796766221e-05, + "loss": 0.781, + "step": 16340 + }, + { + "epoch": 0.8449175753191049, + "grad_norm": 0.44196563959121704, + "learning_rate": 4.821940044293212e-05, + "loss": 0.7867, + "step": 16350 + }, + { + "epoch": 0.8454343444783216, + "grad_norm": 0.44495323300361633, + "learning_rate": 4.79213269529519e-05, + "loss": 0.7791, + "step": 16360 + }, + { + "epoch": 0.8459511136375382, + "grad_norm": 0.4298705756664276, + "learning_rate": 4.76250960368032e-05, + "loss": 0.7924, + "step": 16370 + }, + { + "epoch": 0.8464678827967547, + "grad_norm": 0.4538145065307617, + "learning_rate": 4.7330696304414696e-05, + "loss": 0.801, + "step": 16380 + }, + { + "epoch": 0.8469846519559713, + "grad_norm": 0.437732458114624, + "learning_rate": 4.703811643612394e-05, + "loss": 0.7953, + "step": 16390 + }, + { + "epoch": 0.8475014211151879, + "grad_norm": 0.441617876291275, + "learning_rate": 4.674734518224231e-05, + "loss": 0.772, + "step": 16400 + }, + { + "epoch": 0.8480181902744044, + "grad_norm": 0.42918652296066284, + "learning_rate": 4.645837136262228e-05, + "loss": 0.7839, + "step": 16410 + }, + { + "epoch": 0.848534959433621, + "grad_norm": 0.44365042448043823, + "learning_rate": 4.617118386622768e-05, + "loss": 0.7774, + "step": 16420 + }, + { + "epoch": 0.8490517285928376, + "grad_norm": 0.43790024518966675, + "learning_rate": 4.588577165070638e-05, + "loss": 0.7821, + "step": 16430 + }, + { + "epoch": 0.8495684977520541, + "grad_norm": 0.4523584246635437, + "learning_rate": 4.5602123741965806e-05, + "loss": 0.7689, + "step": 16440 + }, + { + "epoch": 0.8500852669112707, + "grad_norm": 0.438987135887146, + "learning_rate": 4.5320229233750884e-05, + "loss": 0.7774, + "step": 16450 + }, + { + "epoch": 0.8506020360704873, + "grad_norm": 0.4385901987552643, + "learning_rate": 4.504007728722478e-05, + "loss": 0.7767, + "step": 16460 + }, + { + "epoch": 0.8511188052297038, + "grad_norm": 0.44286254048347473, + "learning_rate": 4.4761657130552136e-05, + "loss": 0.7893, + "step": 16470 + }, + { + "epoch": 0.8516355743889205, + "grad_norm": 0.43227192759513855, + "learning_rate": 4.448495805848479e-05, + "loss": 0.7632, + "step": 16480 + }, + { + "epoch": 0.8521523435481371, + "grad_norm": 0.4544907510280609, + "learning_rate": 4.420996943195034e-05, + "loss": 0.7812, + "step": 16490 + }, + { + "epoch": 0.8526691127073536, + "grad_norm": 0.46841660141944885, + "learning_rate": 4.393668067764288e-05, + "loss": 0.7712, + "step": 16500 + }, + { + "epoch": 0.8531858818665702, + "grad_norm": 0.45919257402420044, + "learning_rate": 4.3665081287616635e-05, + "loss": 0.7757, + "step": 16510 + }, + { + "epoch": 0.8537026510257868, + "grad_norm": 0.44672319293022156, + "learning_rate": 4.339516081888175e-05, + "loss": 0.7787, + "step": 16520 + }, + { + "epoch": 0.8542194201850034, + "grad_norm": 0.445287823677063, + "learning_rate": 4.312690889300296e-05, + "loss": 0.7787, + "step": 16530 + }, + { + "epoch": 0.8547361893442199, + "grad_norm": 0.46268194913864136, + "learning_rate": 4.286031519570033e-05, + "loss": 0.7757, + "step": 16540 + }, + { + "epoch": 0.8552529585034365, + "grad_norm": 0.434190034866333, + "learning_rate": 4.2595369476452845e-05, + "loss": 0.7733, + "step": 16550 + }, + { + "epoch": 0.8557697276626531, + "grad_norm": 0.4440845549106598, + "learning_rate": 4.233206154810416e-05, + "loss": 0.7667, + "step": 16560 + }, + { + "epoch": 0.8562864968218696, + "grad_norm": 0.43531450629234314, + "learning_rate": 4.2070381286470965e-05, + "loss": 0.7712, + "step": 16570 + }, + { + "epoch": 0.8568032659810862, + "grad_norm": 0.46447721123695374, + "learning_rate": 4.181031862995373e-05, + "loss": 0.7679, + "step": 16580 + }, + { + "epoch": 0.8573200351403029, + "grad_norm": 0.43442919850349426, + "learning_rate": 4.155186357914973e-05, + "loss": 0.7959, + "step": 16590 + }, + { + "epoch": 0.8578368042995194, + "grad_norm": 0.4343065917491913, + "learning_rate": 4.129500619646871e-05, + "loss": 0.7829, + "step": 16600 + }, + { + "epoch": 0.858353573458736, + "grad_norm": 0.4585905969142914, + "learning_rate": 4.103973660575065e-05, + "loss": 0.7786, + "step": 16610 + }, + { + "epoch": 0.8588703426179526, + "grad_norm": 0.43392133712768555, + "learning_rate": 4.078604499188617e-05, + "loss": 0.7773, + "step": 16620 + }, + { + "epoch": 0.8593871117771691, + "grad_norm": 0.43312516808509827, + "learning_rate": 4.053392160043896e-05, + "loss": 0.7678, + "step": 16630 + }, + { + "epoch": 0.8599038809363857, + "grad_norm": 0.4381249248981476, + "learning_rate": 4.028335673727093e-05, + "loss": 0.7724, + "step": 16640 + }, + { + "epoch": 0.8604206500956023, + "grad_norm": 0.4337814152240753, + "learning_rate": 4.0034340768169274e-05, + "loss": 0.7823, + "step": 16650 + }, + { + "epoch": 0.8609374192548189, + "grad_norm": 0.4437348246574402, + "learning_rate": 3.978686411847619e-05, + "loss": 0.7926, + "step": 16660 + }, + { + "epoch": 0.8614541884140354, + "grad_norm": 0.4341773986816406, + "learning_rate": 3.954091727272062e-05, + "loss": 0.7826, + "step": 16670 + }, + { + "epoch": 0.861970957573252, + "grad_norm": 0.44881367683410645, + "learning_rate": 3.929649077425246e-05, + "loss": 0.7704, + "step": 16680 + }, + { + "epoch": 0.8624877267324687, + "grad_norm": 0.4502032697200775, + "learning_rate": 3.9053575224878926e-05, + "loss": 0.7816, + "step": 16690 + }, + { + "epoch": 0.8630044958916852, + "grad_norm": 0.47224000096321106, + "learning_rate": 3.881216128450315e-05, + "loss": 0.7736, + "step": 16700 + }, + { + "epoch": 0.8635212650509018, + "grad_norm": 0.4375690221786499, + "learning_rate": 3.857223967076515e-05, + "loss": 0.7812, + "step": 16710 + }, + { + "epoch": 0.8640380342101184, + "grad_norm": 0.4506520926952362, + "learning_rate": 3.833380115868479e-05, + "loss": 0.7993, + "step": 16720 + }, + { + "epoch": 0.8645548033693349, + "grad_norm": 0.45109614729881287, + "learning_rate": 3.809683658030725e-05, + "loss": 0.7912, + "step": 16730 + }, + { + "epoch": 0.8650715725285515, + "grad_norm": 0.4557834267616272, + "learning_rate": 3.7861336824350335e-05, + "loss": 0.7775, + "step": 16740 + }, + { + "epoch": 0.8655883416877681, + "grad_norm": 0.43183183670043945, + "learning_rate": 3.7627292835854304e-05, + "loss": 0.7656, + "step": 16750 + }, + { + "epoch": 0.8661051108469846, + "grad_norm": 0.43516460061073303, + "learning_rate": 3.7394695615833586e-05, + "loss": 0.7817, + "step": 16760 + }, + { + "epoch": 0.8666218800062012, + "grad_norm": 0.45719340443611145, + "learning_rate": 3.7163536220930875e-05, + "loss": 0.7886, + "step": 16770 + }, + { + "epoch": 0.8671386491654178, + "grad_norm": 0.45268991589546204, + "learning_rate": 3.693380576307314e-05, + "loss": 0.7874, + "step": 16780 + }, + { + "epoch": 0.8676554183246343, + "grad_norm": 0.43802937865257263, + "learning_rate": 3.6705495409130015e-05, + "loss": 0.7802, + "step": 16790 + }, + { + "epoch": 0.8681721874838509, + "grad_norm": 0.44268324971199036, + "learning_rate": 3.647859638057403e-05, + "loss": 0.7695, + "step": 16800 + }, + { + "epoch": 0.8686889566430676, + "grad_norm": 0.4444487988948822, + "learning_rate": 3.625309995314319e-05, + "loss": 0.7831, + "step": 16810 + }, + { + "epoch": 0.8692057258022842, + "grad_norm": 0.4433843493461609, + "learning_rate": 3.602899745650546e-05, + "loss": 0.7795, + "step": 16820 + }, + { + "epoch": 0.8697224949615007, + "grad_norm": 0.45644548535346985, + "learning_rate": 3.580628027392539e-05, + "loss": 0.7705, + "step": 16830 + }, + { + "epoch": 0.8702392641207173, + "grad_norm": 0.4484211802482605, + "learning_rate": 3.558493984193286e-05, + "loss": 0.7708, + "step": 16840 + }, + { + "epoch": 0.8707560332799339, + "grad_norm": 0.46782976388931274, + "learning_rate": 3.536496764999374e-05, + "loss": 0.7723, + "step": 16850 + }, + { + "epoch": 0.8712728024391504, + "grad_norm": 0.4290997087955475, + "learning_rate": 3.5146355240182734e-05, + "loss": 0.7832, + "step": 16860 + }, + { + "epoch": 0.871789571598367, + "grad_norm": 0.449011892080307, + "learning_rate": 3.492909420685807e-05, + "loss": 0.786, + "step": 16870 + }, + { + "epoch": 0.8723063407575836, + "grad_norm": 0.4471029043197632, + "learning_rate": 3.471317619633846e-05, + "loss": 0.7797, + "step": 16880 + }, + { + "epoch": 0.8728231099168001, + "grad_norm": 0.41699501872062683, + "learning_rate": 3.449859290658173e-05, + "loss": 0.7732, + "step": 16890 + }, + { + "epoch": 0.8733398790760167, + "grad_norm": 0.42831024527549744, + "learning_rate": 3.428533608686573e-05, + "loss": 0.7711, + "step": 16900 + }, + { + "epoch": 0.8738566482352333, + "grad_norm": 0.44072601199150085, + "learning_rate": 3.407339753747102e-05, + "loss": 0.7796, + "step": 16910 + }, + { + "epoch": 0.8743734173944498, + "grad_norm": 0.43595975637435913, + "learning_rate": 3.386276910936564e-05, + "loss": 0.7583, + "step": 16920 + }, + { + "epoch": 0.8748901865536665, + "grad_norm": 0.47178915143013, + "learning_rate": 3.365344270389179e-05, + "loss": 0.7815, + "step": 16930 + }, + { + "epoch": 0.8754069557128831, + "grad_norm": 0.4295157790184021, + "learning_rate": 3.344541027245434e-05, + "loss": 0.7664, + "step": 16940 + }, + { + "epoch": 0.8759237248720997, + "grad_norm": 0.43913745880126953, + "learning_rate": 3.323866381621149e-05, + "loss": 0.767, + "step": 16950 + }, + { + "epoch": 0.8764404940313162, + "grad_norm": 0.4710383415222168, + "learning_rate": 3.3033195385767116e-05, + "loss": 0.7841, + "step": 16960 + }, + { + "epoch": 0.8769572631905328, + "grad_norm": 0.4420885443687439, + "learning_rate": 3.282899708086518e-05, + "loss": 0.7809, + "step": 16970 + }, + { + "epoch": 0.8774740323497494, + "grad_norm": 0.4158540666103363, + "learning_rate": 3.262606105008591e-05, + "loss": 0.7677, + "step": 16980 + }, + { + "epoch": 0.8779908015089659, + "grad_norm": 0.4570242464542389, + "learning_rate": 3.242437949054398e-05, + "loss": 0.7651, + "step": 16990 + }, + { + "epoch": 0.8785075706681825, + "grad_norm": 0.4389027953147888, + "learning_rate": 3.2223944647588423e-05, + "loss": 0.7688, + "step": 17000 + }, + { + "epoch": 0.8790243398273991, + "grad_norm": 0.4603040814399719, + "learning_rate": 3.202474881450452e-05, + "loss": 0.7836, + "step": 17010 + }, + { + "epoch": 0.8795411089866156, + "grad_norm": 0.43595853447914124, + "learning_rate": 3.18267843322174e-05, + "loss": 0.7632, + "step": 17020 + }, + { + "epoch": 0.8800578781458323, + "grad_norm": 0.45017024874687195, + "learning_rate": 3.163004358899766e-05, + "loss": 0.7783, + "step": 17030 + }, + { + "epoch": 0.8805746473050489, + "grad_norm": 0.4486757516860962, + "learning_rate": 3.143451902016862e-05, + "loss": 0.7764, + "step": 17040 + }, + { + "epoch": 0.8810914164642654, + "grad_norm": 0.44407910108566284, + "learning_rate": 3.124020310781543e-05, + "loss": 0.768, + "step": 17050 + }, + { + "epoch": 0.881608185623482, + "grad_norm": 0.43660351634025574, + "learning_rate": 3.1047088380496114e-05, + "loss": 0.7758, + "step": 17060 + }, + { + "epoch": 0.8821249547826986, + "grad_norm": 0.4449329674243927, + "learning_rate": 3.0855167412954175e-05, + "loss": 0.7875, + "step": 17070 + }, + { + "epoch": 0.8826417239419152, + "grad_norm": 0.43863120675086975, + "learning_rate": 3.066443282583321e-05, + "loss": 0.7723, + "step": 17080 + }, + { + "epoch": 0.8831584931011317, + "grad_norm": 0.4402186870574951, + "learning_rate": 3.0474877285393036e-05, + "loss": 0.7713, + "step": 17090 + }, + { + "epoch": 0.8836752622603483, + "grad_norm": 0.47123128175735474, + "learning_rate": 3.028649350322787e-05, + "loss": 0.7822, + "step": 17100 + }, + { + "epoch": 0.8841920314195649, + "grad_norm": 0.44672438502311707, + "learning_rate": 3.0099274235985934e-05, + "loss": 0.7716, + "step": 17110 + }, + { + "epoch": 0.8847088005787814, + "grad_norm": 0.4311140179634094, + "learning_rate": 2.9913212285091083e-05, + "loss": 0.7735, + "step": 17120 + }, + { + "epoch": 0.885225569737998, + "grad_norm": 0.42859673500061035, + "learning_rate": 2.9728300496465886e-05, + "loss": 0.768, + "step": 17130 + }, + { + "epoch": 0.8857423388972147, + "grad_norm": 0.4675106406211853, + "learning_rate": 2.954453176025668e-05, + "loss": 0.7915, + "step": 17140 + }, + { + "epoch": 0.8862591080564312, + "grad_norm": 0.44611257314682007, + "learning_rate": 2.936189901056014e-05, + "loss": 0.7661, + "step": 17150 + }, + { + "epoch": 0.8867758772156478, + "grad_norm": 0.4537068009376526, + "learning_rate": 2.918039522515154e-05, + "loss": 0.7732, + "step": 17160 + }, + { + "epoch": 0.8872926463748644, + "grad_norm": 0.451235830783844, + "learning_rate": 2.900001342521487e-05, + "loss": 0.7765, + "step": 17170 + }, + { + "epoch": 0.8878094155340809, + "grad_norm": 0.42030608654022217, + "learning_rate": 2.882074667507437e-05, + "loss": 0.764, + "step": 17180 + }, + { + "epoch": 0.8883261846932975, + "grad_norm": 0.4544169306755066, + "learning_rate": 2.8642588081927974e-05, + "loss": 0.7751, + "step": 17190 + }, + { + "epoch": 0.8888429538525141, + "grad_norm": 0.4388182759284973, + "learning_rate": 2.8465530795582176e-05, + "loss": 0.7677, + "step": 17200 + }, + { + "epoch": 0.8893597230117306, + "grad_norm": 0.4463309645652771, + "learning_rate": 2.8289568008188735e-05, + "loss": 0.7847, + "step": 17210 + }, + { + "epoch": 0.8898764921709472, + "grad_norm": 0.42829135060310364, + "learning_rate": 2.8114692953982826e-05, + "loss": 0.7622, + "step": 17220 + }, + { + "epoch": 0.8903932613301638, + "grad_norm": 0.4384378492832184, + "learning_rate": 2.7940898909022972e-05, + "loss": 0.7695, + "step": 17230 + }, + { + "epoch": 0.8909100304893804, + "grad_norm": 0.4420071542263031, + "learning_rate": 2.7768179190932436e-05, + "loss": 0.7716, + "step": 17240 + }, + { + "epoch": 0.891426799648597, + "grad_norm": 0.4406958818435669, + "learning_rate": 2.7596527158642362e-05, + "loss": 0.772, + "step": 17250 + }, + { + "epoch": 0.8919435688078136, + "grad_norm": 0.46476542949676514, + "learning_rate": 2.7425936212136382e-05, + "loss": 0.7747, + "step": 17260 + }, + { + "epoch": 0.8924603379670302, + "grad_norm": 0.44601190090179443, + "learning_rate": 2.7256399792196816e-05, + "loss": 0.7739, + "step": 17270 + }, + { + "epoch": 0.8929771071262467, + "grad_norm": 0.4409795105457306, + "learning_rate": 2.7087911380152546e-05, + "loss": 0.7703, + "step": 17280 + }, + { + "epoch": 0.8934938762854633, + "grad_norm": 0.4447353780269623, + "learning_rate": 2.6920464497628288e-05, + "loss": 0.7713, + "step": 17290 + }, + { + "epoch": 0.8940106454446799, + "grad_norm": 0.42424049973487854, + "learning_rate": 2.6754052706295595e-05, + "loss": 0.7662, + "step": 17300 + }, + { + "epoch": 0.8945274146038964, + "grad_norm": 0.4320373237133026, + "learning_rate": 2.6588669607625194e-05, + "loss": 0.764, + "step": 17310 + }, + { + "epoch": 0.895044183763113, + "grad_norm": 0.4584170877933502, + "learning_rate": 2.6424308842641074e-05, + "loss": 0.7697, + "step": 17320 + }, + { + "epoch": 0.8955609529223296, + "grad_norm": 0.4255240261554718, + "learning_rate": 2.6260964091675873e-05, + "loss": 0.7638, + "step": 17330 + }, + { + "epoch": 0.8960777220815461, + "grad_norm": 0.4410153329372406, + "learning_rate": 2.6098629074128e-05, + "loss": 0.7722, + "step": 17340 + }, + { + "epoch": 0.8965944912407627, + "grad_norm": 0.4603617787361145, + "learning_rate": 2.593729754822004e-05, + "loss": 0.7764, + "step": 17350 + }, + { + "epoch": 0.8971112603999793, + "grad_norm": 0.4616399109363556, + "learning_rate": 2.5776963310758847e-05, + "loss": 0.7828, + "step": 17360 + }, + { + "epoch": 0.897628029559196, + "grad_norm": 0.4478990435600281, + "learning_rate": 2.5617620196896944e-05, + "loss": 0.7677, + "step": 17370 + }, + { + "epoch": 0.8981447987184125, + "grad_norm": 0.4245089292526245, + "learning_rate": 2.545926207989558e-05, + "loss": 0.7751, + "step": 17380 + }, + { + "epoch": 0.8986615678776291, + "grad_norm": 0.4588530957698822, + "learning_rate": 2.530188287088909e-05, + "loss": 0.7735, + "step": 17390 + }, + { + "epoch": 0.8991783370368457, + "grad_norm": 0.4587204158306122, + "learning_rate": 2.5145476518650782e-05, + "loss": 0.7804, + "step": 17400 + }, + { + "epoch": 0.8996951061960622, + "grad_norm": 0.4349258244037628, + "learning_rate": 2.499003700936031e-05, + "loss": 0.78, + "step": 17410 + }, + { + "epoch": 0.9002118753552788, + "grad_norm": 0.46240687370300293, + "learning_rate": 2.4835558366372383e-05, + "loss": 0.7741, + "step": 17420 + }, + { + "epoch": 0.9007286445144954, + "grad_norm": 0.43434906005859375, + "learning_rate": 2.4682034649987037e-05, + "loss": 0.7757, + "step": 17430 + }, + { + "epoch": 0.9012454136737119, + "grad_norm": 0.45485690236091614, + "learning_rate": 2.4529459957221164e-05, + "loss": 0.7614, + "step": 17440 + }, + { + "epoch": 0.9017621828329285, + "grad_norm": 0.451511025428772, + "learning_rate": 2.4377828421581636e-05, + "loss": 0.775, + "step": 17450 + }, + { + "epoch": 0.9022789519921451, + "grad_norm": 0.44211798906326294, + "learning_rate": 2.422713421283965e-05, + "loss": 0.7715, + "step": 17460 + }, + { + "epoch": 0.9027957211513616, + "grad_norm": 0.43941619992256165, + "learning_rate": 2.4077371536806647e-05, + "loss": 0.7762, + "step": 17470 + }, + { + "epoch": 0.9033124903105783, + "grad_norm": 0.4729272723197937, + "learning_rate": 2.392853463511143e-05, + "loss": 0.7889, + "step": 17480 + }, + { + "epoch": 0.9038292594697949, + "grad_norm": 0.45001113414764404, + "learning_rate": 2.3780617784978833e-05, + "loss": 0.7644, + "step": 17490 + }, + { + "epoch": 0.9043460286290114, + "grad_norm": 0.44931286573410034, + "learning_rate": 2.3633615299009652e-05, + "loss": 0.7628, + "step": 17500 + }, + { + "epoch": 0.904862797788228, + "grad_norm": 0.43167644739151, + "learning_rate": 2.348752152496193e-05, + "loss": 0.7707, + "step": 17510 + }, + { + "epoch": 0.9053795669474446, + "grad_norm": 0.4542749226093292, + "learning_rate": 2.33423308455337e-05, + "loss": 0.7687, + "step": 17520 + }, + { + "epoch": 0.9058963361066612, + "grad_norm": 0.4356542229652405, + "learning_rate": 2.319803767814693e-05, + "loss": 0.7656, + "step": 17530 + }, + { + "epoch": 0.9064131052658777, + "grad_norm": 0.4345816373825073, + "learning_rate": 2.305463647473293e-05, + "loss": 0.7564, + "step": 17540 + }, + { + "epoch": 0.9069298744250943, + "grad_norm": 0.4554193317890167, + "learning_rate": 2.291212172151897e-05, + "loss": 0.7659, + "step": 17550 + }, + { + "epoch": 0.9074466435843109, + "grad_norm": 0.4463479518890381, + "learning_rate": 2.2770487938816346e-05, + "loss": 0.7608, + "step": 17560 + }, + { + "epoch": 0.9079634127435274, + "grad_norm": 0.4824206829071045, + "learning_rate": 2.262972968080962e-05, + "loss": 0.7768, + "step": 17570 + }, + { + "epoch": 0.908480181902744, + "grad_norm": 0.4427326023578644, + "learning_rate": 2.248984153534727e-05, + "loss": 0.7791, + "step": 17580 + }, + { + "epoch": 0.9089969510619607, + "grad_norm": 0.4576285779476166, + "learning_rate": 2.2350818123733565e-05, + "loss": 0.7788, + "step": 17590 + }, + { + "epoch": 0.9095137202211772, + "grad_norm": 0.40807288885116577, + "learning_rate": 2.2212654100521793e-05, + "loss": 0.7733, + "step": 17600 + }, + { + "epoch": 0.9100304893803938, + "grad_norm": 0.4429195821285248, + "learning_rate": 2.20753441533087e-05, + "loss": 0.796, + "step": 17610 + }, + { + "epoch": 0.9105472585396104, + "grad_norm": 0.4344060719013214, + "learning_rate": 2.19388830025302e-05, + "loss": 0.7661, + "step": 17620 + }, + { + "epoch": 0.9110640276988269, + "grad_norm": 0.4657835364341736, + "learning_rate": 2.180326540125846e-05, + "loss": 0.7738, + "step": 17630 + }, + { + "epoch": 0.9115807968580435, + "grad_norm": 0.4533781111240387, + "learning_rate": 2.166848613500005e-05, + "loss": 0.7719, + "step": 17640 + }, + { + "epoch": 0.9120975660172601, + "grad_norm": 0.43933114409446716, + "learning_rate": 2.1534540021495556e-05, + "loss": 0.769, + "step": 17650 + }, + { + "epoch": 0.9126143351764767, + "grad_norm": 0.439761221408844, + "learning_rate": 2.140142191052022e-05, + "loss": 0.7698, + "step": 17660 + }, + { + "epoch": 0.9131311043356932, + "grad_norm": 0.471292644739151, + "learning_rate": 2.1269126683685998e-05, + "loss": 0.7586, + "step": 17670 + }, + { + "epoch": 0.9136478734949098, + "grad_norm": 0.45629554986953735, + "learning_rate": 2.1137649254244677e-05, + "loss": 0.794, + "step": 17680 + }, + { + "epoch": 0.9141646426541264, + "grad_norm": 0.4637652039527893, + "learning_rate": 2.1006984566892386e-05, + "loss": 0.7757, + "step": 17690 + }, + { + "epoch": 0.914681411813343, + "grad_norm": 0.4626142382621765, + "learning_rate": 2.087712759757512e-05, + "loss": 0.7778, + "step": 17700 + }, + { + "epoch": 0.9151981809725596, + "grad_norm": 0.4568713903427124, + "learning_rate": 2.074807335329564e-05, + "loss": 0.7972, + "step": 17710 + }, + { + "epoch": 0.9157149501317762, + "grad_norm": 0.43964695930480957, + "learning_rate": 2.061981687192147e-05, + "loss": 0.7651, + "step": 17720 + }, + { + "epoch": 0.9162317192909927, + "grad_norm": 0.45957452058792114, + "learning_rate": 2.0492353221994066e-05, + "loss": 0.7744, + "step": 17730 + }, + { + "epoch": 0.9167484884502093, + "grad_norm": 0.42849215865135193, + "learning_rate": 2.0365677502539268e-05, + "loss": 0.7602, + "step": 17740 + }, + { + "epoch": 0.9172652576094259, + "grad_norm": 0.4392319619655609, + "learning_rate": 2.0239784842878798e-05, + "loss": 0.7822, + "step": 17750 + }, + { + "epoch": 0.9177820267686424, + "grad_norm": 0.43897444009780884, + "learning_rate": 2.011467040244303e-05, + "loss": 0.7793, + "step": 17760 + }, + { + "epoch": 0.918298795927859, + "grad_norm": 0.4271240532398224, + "learning_rate": 1.9990329370584816e-05, + "loss": 0.7727, + "step": 17770 + }, + { + "epoch": 0.9188155650870756, + "grad_norm": 0.43358883261680603, + "learning_rate": 1.9866756966394584e-05, + "loss": 0.7884, + "step": 17780 + }, + { + "epoch": 0.9193323342462921, + "grad_norm": 0.4576852023601532, + "learning_rate": 1.9743948438516452e-05, + "loss": 0.7845, + "step": 17790 + }, + { + "epoch": 0.9198491034055087, + "grad_norm": 0.4521750211715698, + "learning_rate": 1.962189906496559e-05, + "loss": 0.7652, + "step": 17800 + }, + { + "epoch": 0.9203658725647254, + "grad_norm": 0.4462205469608307, + "learning_rate": 1.9500604152946586e-05, + "loss": 0.7748, + "step": 17810 + }, + { + "epoch": 0.920882641723942, + "grad_norm": 0.4531271457672119, + "learning_rate": 1.9380059038673104e-05, + "loss": 0.7843, + "step": 17820 + }, + { + "epoch": 0.9213994108831585, + "grad_norm": 0.4446341097354889, + "learning_rate": 1.9260259087188497e-05, + "loss": 0.7529, + "step": 17830 + }, + { + "epoch": 0.9219161800423751, + "grad_norm": 0.4507541060447693, + "learning_rate": 1.9141199692187586e-05, + "loss": 0.7641, + "step": 17840 + }, + { + "epoch": 0.9224329492015917, + "grad_norm": 0.4495556056499481, + "learning_rate": 1.9022876275839615e-05, + "loss": 0.7679, + "step": 17850 + }, + { + "epoch": 0.9229497183608082, + "grad_norm": 0.448811799287796, + "learning_rate": 1.890528428861213e-05, + "loss": 0.7744, + "step": 17860 + }, + { + "epoch": 0.9234664875200248, + "grad_norm": 0.45697128772735596, + "learning_rate": 1.8788419209096178e-05, + "loss": 0.7723, + "step": 17870 + }, + { + "epoch": 0.9239832566792414, + "grad_norm": 0.43319204449653625, + "learning_rate": 1.8672276543832325e-05, + "loss": 0.7901, + "step": 17880 + }, + { + "epoch": 0.9245000258384579, + "grad_norm": 0.4573897123336792, + "learning_rate": 1.855685182713799e-05, + "loss": 0.7739, + "step": 17890 + }, + { + "epoch": 0.9250167949976745, + "grad_norm": 0.4467730224132538, + "learning_rate": 1.8442140620935673e-05, + "loss": 0.7709, + "step": 17900 + }, + { + "epoch": 0.9255335641568911, + "grad_norm": 0.4632819592952728, + "learning_rate": 1.8328138514582353e-05, + "loss": 0.7597, + "step": 17910 + }, + { + "epoch": 0.9260503333161076, + "grad_norm": 0.45948299765586853, + "learning_rate": 1.821484112469986e-05, + "loss": 0.7795, + "step": 17920 + }, + { + "epoch": 0.9265671024753243, + "grad_norm": 0.464005708694458, + "learning_rate": 1.810224409500637e-05, + "loss": 0.7693, + "step": 17930 + }, + { + "epoch": 0.9270838716345409, + "grad_norm": 0.4494501054286957, + "learning_rate": 1.79903430961489e-05, + "loss": 0.7754, + "step": 17940 + }, + { + "epoch": 0.9276006407937575, + "grad_norm": 0.4453310966491699, + "learning_rate": 1.7879133825536803e-05, + "loss": 0.7703, + "step": 17950 + }, + { + "epoch": 0.928117409952974, + "grad_norm": 0.4534304141998291, + "learning_rate": 1.7768612007176403e-05, + "loss": 0.7694, + "step": 17960 + }, + { + "epoch": 0.9286341791121906, + "grad_norm": 0.42768940329551697, + "learning_rate": 1.7658773391506503e-05, + "loss": 0.7753, + "step": 17970 + }, + { + "epoch": 0.9291509482714072, + "grad_norm": 0.4579961597919464, + "learning_rate": 1.754961375523509e-05, + "loss": 0.7756, + "step": 17980 + }, + { + "epoch": 0.9296677174306237, + "grad_norm": 0.43378955125808716, + "learning_rate": 1.744112890117683e-05, + "loss": 0.7584, + "step": 17990 + }, + { + "epoch": 0.9301844865898403, + "grad_norm": 0.4437185823917389, + "learning_rate": 1.7333314658091796e-05, + "loss": 0.7636, + "step": 18000 + }, + { + "epoch": 0.9307012557490569, + "grad_norm": 0.4335078299045563, + "learning_rate": 1.7226166880525008e-05, + "loss": 0.7676, + "step": 18010 + }, + { + "epoch": 0.9312180249082734, + "grad_norm": 0.4542897343635559, + "learning_rate": 1.711968144864709e-05, + "loss": 0.7743, + "step": 18020 + }, + { + "epoch": 0.93173479406749, + "grad_norm": 0.46580132842063904, + "learning_rate": 1.7013854268095815e-05, + "loss": 0.7722, + "step": 18030 + }, + { + "epoch": 0.9322515632267067, + "grad_norm": 0.4515324532985687, + "learning_rate": 1.6908681269818735e-05, + "loss": 0.7711, + "step": 18040 + }, + { + "epoch": 0.9327683323859232, + "grad_norm": 0.4366278350353241, + "learning_rate": 1.6804158409916664e-05, + "loss": 0.7707, + "step": 18050 + }, + { + "epoch": 0.9332851015451398, + "grad_norm": 0.45202723145484924, + "learning_rate": 1.6700281669488236e-05, + "loss": 0.7733, + "step": 18060 + }, + { + "epoch": 0.9338018707043564, + "grad_norm": 0.4829843044281006, + "learning_rate": 1.6597047054475375e-05, + "loss": 0.7772, + "step": 18070 + }, + { + "epoch": 0.934318639863573, + "grad_norm": 0.45102638006210327, + "learning_rate": 1.6494450595509677e-05, + "loss": 0.7736, + "step": 18080 + }, + { + "epoch": 0.9348354090227895, + "grad_norm": 0.43405377864837646, + "learning_rate": 1.639248834775986e-05, + "loss": 0.7655, + "step": 18090 + }, + { + "epoch": 0.9353521781820061, + "grad_norm": 0.44487160444259644, + "learning_rate": 1.6291156390780006e-05, + "loss": 0.7617, + "step": 18100 + }, + { + "epoch": 0.9358689473412227, + "grad_norm": 0.4330504238605499, + "learning_rate": 1.6190450828358913e-05, + "loss": 0.7771, + "step": 18110 + }, + { + "epoch": 0.9363857165004392, + "grad_norm": 0.44895511865615845, + "learning_rate": 1.6090367788370184e-05, + "loss": 0.7787, + "step": 18120 + }, + { + "epoch": 0.9369024856596558, + "grad_norm": 0.4521077275276184, + "learning_rate": 1.599090342262343e-05, + "loss": 0.7599, + "step": 18130 + }, + { + "epoch": 0.9374192548188724, + "grad_norm": 0.4501364529132843, + "learning_rate": 1.589205390671625e-05, + "loss": 0.7611, + "step": 18140 + }, + { + "epoch": 0.937936023978089, + "grad_norm": 0.45777976512908936, + "learning_rate": 1.5793815439887217e-05, + "loss": 0.7609, + "step": 18150 + }, + { + "epoch": 0.9384527931373056, + "grad_norm": 0.4469406306743622, + "learning_rate": 1.569618424486971e-05, + "loss": 0.7669, + "step": 18160 + }, + { + "epoch": 0.9389695622965222, + "grad_norm": 0.44795021414756775, + "learning_rate": 1.5599156567746714e-05, + "loss": 0.7748, + "step": 18170 + }, + { + "epoch": 0.9394863314557387, + "grad_norm": 0.46077170968055725, + "learning_rate": 1.5502728677806457e-05, + "loss": 0.7829, + "step": 18180 + }, + { + "epoch": 0.9400031006149553, + "grad_norm": 0.4519754946231842, + "learning_rate": 1.5406896867398952e-05, + "loss": 0.7608, + "step": 18190 + }, + { + "epoch": 0.9405198697741719, + "grad_norm": 0.43412908911705017, + "learning_rate": 1.5311657451793483e-05, + "loss": 0.7739, + "step": 18200 + }, + { + "epoch": 0.9410366389333884, + "grad_norm": 0.44264018535614014, + "learning_rate": 1.5217006769036868e-05, + "loss": 0.7754, + "step": 18210 + }, + { + "epoch": 0.941553408092605, + "grad_norm": 0.42187464237213135, + "learning_rate": 1.5122941179812719e-05, + "loss": 0.7649, + "step": 18220 + }, + { + "epoch": 0.9420701772518216, + "grad_norm": 0.44390153884887695, + "learning_rate": 1.5029457067301455e-05, + "loss": 0.759, + "step": 18230 + }, + { + "epoch": 0.9425869464110382, + "grad_norm": 0.43942004442214966, + "learning_rate": 1.4936550837041282e-05, + "loss": 0.7693, + "step": 18240 + }, + { + "epoch": 0.9431037155702547, + "grad_norm": 0.44910815358161926, + "learning_rate": 1.4844218916789941e-05, + "loss": 0.7672, + "step": 18250 + }, + { + "epoch": 0.9436204847294714, + "grad_norm": 0.4458234906196594, + "learning_rate": 1.4752457756387405e-05, + "loss": 0.7841, + "step": 18260 + }, + { + "epoch": 0.944137253888688, + "grad_norm": 0.42799797654151917, + "learning_rate": 1.4661263827619318e-05, + "loss": 0.7717, + "step": 18270 + }, + { + "epoch": 0.9446540230479045, + "grad_norm": 0.4394701421260834, + "learning_rate": 1.4570633624081393e-05, + "loss": 0.7702, + "step": 18280 + }, + { + "epoch": 0.9451707922071211, + "grad_norm": 0.44984373450279236, + "learning_rate": 1.4480563661044558e-05, + "loss": 0.7719, + "step": 18290 + }, + { + "epoch": 0.9456875613663377, + "grad_norm": 0.446482390165329, + "learning_rate": 1.4391050475320961e-05, + "loss": 0.7572, + "step": 18300 + }, + { + "epoch": 0.9462043305255542, + "grad_norm": 0.4424509108066559, + "learning_rate": 1.4302090625130843e-05, + "loss": 0.7773, + "step": 18310 + }, + { + "epoch": 0.9467210996847708, + "grad_norm": 0.4587627649307251, + "learning_rate": 1.4213680689970162e-05, + "loss": 0.7723, + "step": 18320 + }, + { + "epoch": 0.9472378688439874, + "grad_norm": 0.4332590699195862, + "learning_rate": 1.4125817270479119e-05, + "loss": 0.7649, + "step": 18330 + }, + { + "epoch": 0.9477546380032039, + "grad_norm": 0.4457739591598511, + "learning_rate": 1.4038496988311402e-05, + "loss": 0.7722, + "step": 18340 + }, + { + "epoch": 0.9482714071624205, + "grad_norm": 0.4352693557739258, + "learning_rate": 1.3951716486004345e-05, + "loss": 0.7592, + "step": 18350 + }, + { + "epoch": 0.9487881763216371, + "grad_norm": 0.44573667645454407, + "learning_rate": 1.3865472426849772e-05, + "loss": 0.7637, + "step": 18360 + }, + { + "epoch": 0.9493049454808538, + "grad_norm": 0.4508999288082123, + "learning_rate": 1.3779761494765763e-05, + "loss": 0.7627, + "step": 18370 + }, + { + "epoch": 0.9498217146400703, + "grad_norm": 0.46261972188949585, + "learning_rate": 1.3694580394169099e-05, + "loss": 0.7798, + "step": 18380 + }, + { + "epoch": 0.9503384837992869, + "grad_norm": 0.446575790643692, + "learning_rate": 1.360992584984858e-05, + "loss": 0.7636, + "step": 18390 + }, + { + "epoch": 0.9508552529585035, + "grad_norm": 0.4478476941585541, + "learning_rate": 1.3525794606839085e-05, + "loss": 0.7757, + "step": 18400 + }, + { + "epoch": 0.95137202211772, + "grad_norm": 0.4484612047672272, + "learning_rate": 1.3442183430296398e-05, + "loss": 0.7695, + "step": 18410 + }, + { + "epoch": 0.9518887912769366, + "grad_norm": 0.45452138781547546, + "learning_rate": 1.3359089105372866e-05, + "loss": 0.7659, + "step": 18420 + }, + { + "epoch": 0.9524055604361532, + "grad_norm": 0.4534998834133148, + "learning_rate": 1.3276508437093752e-05, + "loss": 0.763, + "step": 18430 + }, + { + "epoch": 0.9529223295953697, + "grad_norm": 0.43683722615242004, + "learning_rate": 1.3194438250234418e-05, + "loss": 0.7744, + "step": 18440 + }, + { + "epoch": 0.9534390987545863, + "grad_norm": 0.4494810998439789, + "learning_rate": 1.3112875389198208e-05, + "loss": 0.7645, + "step": 18450 + }, + { + "epoch": 0.9539558679138029, + "grad_norm": 0.449897825717926, + "learning_rate": 1.3031816717895151e-05, + "loss": 0.7641, + "step": 18460 + }, + { + "epoch": 0.9544726370730194, + "grad_norm": 0.4382020831108093, + "learning_rate": 1.2951259119621336e-05, + "loss": 0.7748, + "step": 18470 + }, + { + "epoch": 0.954989406232236, + "grad_norm": 0.46431413292884827, + "learning_rate": 1.2871199496939121e-05, + "loss": 0.7683, + "step": 18480 + }, + { + "epoch": 0.9555061753914527, + "grad_norm": 0.4337891936302185, + "learning_rate": 1.2791634771557991e-05, + "loss": 0.7561, + "step": 18490 + }, + { + "epoch": 0.9560229445506692, + "grad_norm": 0.46482157707214355, + "learning_rate": 1.2712561884216234e-05, + "loss": 0.7601, + "step": 18500 + }, + { + "epoch": 0.9565397137098858, + "grad_norm": 0.4410005211830139, + "learning_rate": 1.2633977794563303e-05, + "loss": 0.773, + "step": 18510 + }, + { + "epoch": 0.9570564828691024, + "grad_norm": 0.46581384539604187, + "learning_rate": 1.2555879481042893e-05, + "loss": 0.7753, + "step": 18520 + }, + { + "epoch": 0.957573252028319, + "grad_norm": 0.45101165771484375, + "learning_rate": 1.2478263940776792e-05, + "loss": 0.7647, + "step": 18530 + }, + { + "epoch": 0.9580900211875355, + "grad_norm": 0.44979819655418396, + "learning_rate": 1.2401128189449399e-05, + "loss": 0.775, + "step": 18540 + }, + { + "epoch": 0.9586067903467521, + "grad_norm": 0.4470668435096741, + "learning_rate": 1.2324469261193e-05, + "loss": 0.7579, + "step": 18550 + }, + { + "epoch": 0.9591235595059687, + "grad_norm": 0.4402695596218109, + "learning_rate": 1.2248284208473693e-05, + "loss": 0.7793, + "step": 18560 + }, + { + "epoch": 0.9596403286651852, + "grad_norm": 0.4400414526462555, + "learning_rate": 1.2172570101978107e-05, + "loss": 0.7725, + "step": 18570 + }, + { + "epoch": 0.9601570978244018, + "grad_norm": 0.43797457218170166, + "learning_rate": 1.2097324030500717e-05, + "loss": 0.7474, + "step": 18580 + }, + { + "epoch": 0.9606738669836185, + "grad_norm": 0.47379326820373535, + "learning_rate": 1.2022543100831949e-05, + "loss": 0.7644, + "step": 18590 + }, + { + "epoch": 0.961190636142835, + "grad_norm": 0.4277331829071045, + "learning_rate": 1.1948224437646907e-05, + "loss": 0.7698, + "step": 18600 + }, + { + "epoch": 0.9617074053020516, + "grad_norm": 0.46481338143348694, + "learning_rate": 1.1874365183394848e-05, + "loss": 0.7575, + "step": 18610 + }, + { + "epoch": 0.9622241744612682, + "grad_norm": 0.4436621367931366, + "learning_rate": 1.1800962498189266e-05, + "loss": 0.7714, + "step": 18620 + }, + { + "epoch": 0.9627409436204847, + "grad_norm": 0.44922277331352234, + "learning_rate": 1.1728013559698744e-05, + "loss": 0.7711, + "step": 18630 + }, + { + "epoch": 0.9632577127797013, + "grad_norm": 0.4406448006629944, + "learning_rate": 1.1655515563038412e-05, + "loss": 0.7645, + "step": 18640 + }, + { + "epoch": 0.9637744819389179, + "grad_norm": 0.4575316607952118, + "learning_rate": 1.1583465720662092e-05, + "loss": 0.7774, + "step": 18650 + }, + { + "epoch": 0.9642912510981345, + "grad_norm": 0.44259268045425415, + "learning_rate": 1.1511861262255142e-05, + "loss": 0.7791, + "step": 18660 + }, + { + "epoch": 0.964808020257351, + "grad_norm": 0.43396565318107605, + "learning_rate": 1.14406994346279e-05, + "loss": 0.7552, + "step": 18670 + }, + { + "epoch": 0.9653247894165676, + "grad_norm": 0.4611850082874298, + "learning_rate": 1.1369977501609877e-05, + "loss": 0.7747, + "step": 18680 + }, + { + "epoch": 0.9658415585757842, + "grad_norm": 0.4555375277996063, + "learning_rate": 1.129969274394449e-05, + "loss": 0.7726, + "step": 18690 + }, + { + "epoch": 0.9663583277350007, + "grad_norm": 0.4663475453853607, + "learning_rate": 1.1229842459184562e-05, + "loss": 0.7596, + "step": 18700 + }, + { + "epoch": 0.9668750968942174, + "grad_norm": 0.45513424277305603, + "learning_rate": 1.1160423961588368e-05, + "loss": 0.7813, + "step": 18710 + }, + { + "epoch": 0.967391866053434, + "grad_norm": 0.4629857838153839, + "learning_rate": 1.1091434582016413e-05, + "loss": 0.7668, + "step": 18720 + }, + { + "epoch": 0.9679086352126505, + "grad_norm": 0.45282307267189026, + "learning_rate": 1.1022871667828753e-05, + "loss": 0.7543, + "step": 18730 + }, + { + "epoch": 0.9684254043718671, + "grad_norm": 0.4608106315135956, + "learning_rate": 1.0954732582783043e-05, + "loss": 0.7588, + "step": 18740 + }, + { + "epoch": 0.9689421735310837, + "grad_norm": 0.44871219992637634, + "learning_rate": 1.088701470693316e-05, + "loss": 0.7681, + "step": 18750 + }, + { + "epoch": 0.9694589426903002, + "grad_norm": 0.4576722979545593, + "learning_rate": 1.081971543652845e-05, + "loss": 0.7618, + "step": 18760 + }, + { + "epoch": 0.9699757118495168, + "grad_norm": 0.4332127571105957, + "learning_rate": 1.0752832183913647e-05, + "loss": 0.7586, + "step": 18770 + }, + { + "epoch": 0.9704924810087334, + "grad_norm": 0.44485628604888916, + "learning_rate": 1.0686362377429339e-05, + "loss": 0.7737, + "step": 18780 + }, + { + "epoch": 0.9710092501679499, + "grad_norm": 0.45990100502967834, + "learning_rate": 1.0620303461313126e-05, + "loss": 0.7679, + "step": 18790 + }, + { + "epoch": 0.9715260193271665, + "grad_norm": 0.4547218084335327, + "learning_rate": 1.0554652895601313e-05, + "loss": 0.7559, + "step": 18800 + }, + { + "epoch": 0.9720427884863831, + "grad_norm": 0.43457552790641785, + "learning_rate": 1.0489408156031289e-05, + "loss": 0.7512, + "step": 18810 + }, + { + "epoch": 0.9725595576455998, + "grad_norm": 0.44039562344551086, + "learning_rate": 1.0424566733944429e-05, + "loss": 0.7791, + "step": 18820 + }, + { + "epoch": 0.9730763268048163, + "grad_norm": 0.4435688257217407, + "learning_rate": 1.0360126136189671e-05, + "loss": 0.7738, + "step": 18830 + }, + { + "epoch": 0.9735930959640329, + "grad_norm": 0.4358065128326416, + "learning_rate": 1.0296083885027623e-05, + "loss": 0.7595, + "step": 18840 + }, + { + "epoch": 0.9741098651232495, + "grad_norm": 0.4542253613471985, + "learning_rate": 1.0232437518035322e-05, + "loss": 0.7802, + "step": 18850 + }, + { + "epoch": 0.974626634282466, + "grad_norm": 0.4499568045139313, + "learning_rate": 1.0169184588011541e-05, + "loss": 0.7556, + "step": 18860 + }, + { + "epoch": 0.9751434034416826, + "grad_norm": 0.42469751834869385, + "learning_rate": 1.0106322662882686e-05, + "loss": 0.7747, + "step": 18870 + }, + { + "epoch": 0.9756601726008992, + "grad_norm": 0.45162233710289, + "learning_rate": 1.00438493256093e-05, + "loss": 0.7716, + "step": 18880 + }, + { + "epoch": 0.9761769417601157, + "grad_norm": 0.45597076416015625, + "learning_rate": 9.981762174093112e-06, + "loss": 0.7779, + "step": 18890 + }, + { + "epoch": 0.9766937109193323, + "grad_norm": 0.4463193714618683, + "learning_rate": 9.920058821084695e-06, + "loss": 0.7686, + "step": 18900 + }, + { + "epoch": 0.9772104800785489, + "grad_norm": 0.4148988425731659, + "learning_rate": 9.858736894091644e-06, + "loss": 0.753, + "step": 18910 + }, + { + "epoch": 0.9777272492377654, + "grad_norm": 0.4257926940917969, + "learning_rate": 9.797794035287406e-06, + "loss": 0.7675, + "step": 18920 + }, + { + "epoch": 0.978244018396982, + "grad_norm": 0.4566889703273773, + "learning_rate": 9.737227901420558e-06, + "loss": 0.7674, + "step": 18930 + }, + { + "epoch": 0.9787607875561987, + "grad_norm": 0.46036675572395325, + "learning_rate": 9.677036163724766e-06, + "loss": 0.7701, + "step": 18940 + }, + { + "epoch": 0.9792775567154153, + "grad_norm": 0.4719618260860443, + "learning_rate": 9.617216507829204e-06, + "loss": 0.7577, + "step": 18950 + }, + { + "epoch": 0.9797943258746318, + "grad_norm": 0.45223793387413025, + "learning_rate": 9.557766633669592e-06, + "loss": 0.7618, + "step": 18960 + }, + { + "epoch": 0.9803110950338484, + "grad_norm": 0.44620633125305176, + "learning_rate": 9.498684255399747e-06, + "loss": 0.7623, + "step": 18970 + }, + { + "epoch": 0.980827864193065, + "grad_norm": 0.4350356459617615, + "learning_rate": 9.439967101303683e-06, + "loss": 0.7659, + "step": 18980 + }, + { + "epoch": 0.9813446333522815, + "grad_norm": 0.434857040643692, + "learning_rate": 9.381612913708292e-06, + "loss": 0.7637, + "step": 18990 + }, + { + "epoch": 0.9818614025114981, + "grad_norm": 0.44825971126556396, + "learning_rate": 9.323619448896502e-06, + "loss": 0.766, + "step": 19000 + }, + { + "epoch": 0.9823781716707147, + "grad_norm": 0.4420020282268524, + "learning_rate": 9.26598447702104e-06, + "loss": 0.7644, + "step": 19010 + }, + { + "epoch": 0.9828949408299312, + "grad_norm": 0.44582831859588623, + "learning_rate": 9.208705782018656e-06, + "loss": 0.7606, + "step": 19020 + }, + { + "epoch": 0.9834117099891478, + "grad_norm": 0.4383075535297394, + "learning_rate": 9.151781161524964e-06, + "loss": 0.7662, + "step": 19030 + }, + { + "epoch": 0.9839284791483645, + "grad_norm": 0.4672369062900543, + "learning_rate": 9.095208426789703e-06, + "loss": 0.7623, + "step": 19040 + }, + { + "epoch": 0.984445248307581, + "grad_norm": 0.4448625445365906, + "learning_rate": 9.03898540259264e-06, + "loss": 0.7767, + "step": 19050 + }, + { + "epoch": 0.9849620174667976, + "grad_norm": 0.45743006467819214, + "learning_rate": 8.983109927159886e-06, + "loss": 0.7655, + "step": 19060 + }, + { + "epoch": 0.9854787866260142, + "grad_norm": 0.4571949243545532, + "learning_rate": 8.927579852080794e-06, + "loss": 0.7569, + "step": 19070 + }, + { + "epoch": 0.9859955557852308, + "grad_norm": 0.4542441666126251, + "learning_rate": 8.872393042225366e-06, + "loss": 0.7726, + "step": 19080 + }, + { + "epoch": 0.9865123249444473, + "grad_norm": 0.4544001817703247, + "learning_rate": 8.817547375662121e-06, + "loss": 0.7624, + "step": 19090 + }, + { + "epoch": 0.9870290941036639, + "grad_norm": 0.44613394141197205, + "learning_rate": 8.763040743576555e-06, + "loss": 0.7729, + "step": 19100 + }, + { + "epoch": 0.9875458632628805, + "grad_norm": 0.4503871202468872, + "learning_rate": 8.708871050190002e-06, + "loss": 0.7619, + "step": 19110 + }, + { + "epoch": 0.988062632422097, + "grad_norm": 0.45252034068107605, + "learning_rate": 8.65503621267911e-06, + "loss": 0.7617, + "step": 19120 + }, + { + "epoch": 0.9885794015813136, + "grad_norm": 0.4656429886817932, + "learning_rate": 8.601534161095704e-06, + "loss": 0.7733, + "step": 19130 + }, + { + "epoch": 0.9890961707405302, + "grad_norm": 0.44941556453704834, + "learning_rate": 8.548362838287236e-06, + "loss": 0.765, + "step": 19140 + }, + { + "epoch": 0.9896129398997467, + "grad_norm": 0.4554784893989563, + "learning_rate": 8.495520199817657e-06, + "loss": 0.7708, + "step": 19150 + }, + { + "epoch": 0.9901297090589634, + "grad_norm": 0.44851189851760864, + "learning_rate": 8.443004213888836e-06, + "loss": 0.7548, + "step": 19160 + }, + { + "epoch": 0.99064647821818, + "grad_norm": 0.43213942646980286, + "learning_rate": 8.390812861262414e-06, + "loss": 0.7583, + "step": 19170 + }, + { + "epoch": 0.9911632473773965, + "grad_norm": 0.4359610676765442, + "learning_rate": 8.33894413518218e-06, + "loss": 0.7451, + "step": 19180 + }, + { + "epoch": 0.9916800165366131, + "grad_norm": 0.4492233693599701, + "learning_rate": 8.287396041296902e-06, + "loss": 0.7648, + "step": 19190 + }, + { + "epoch": 0.9921967856958297, + "grad_norm": 0.45256808400154114, + "learning_rate": 8.236166597583653e-06, + "loss": 0.781, + "step": 19200 + }, + { + "epoch": 0.9927135548550462, + "grad_norm": 0.45061782002449036, + "learning_rate": 8.185253834271597e-06, + "loss": 0.7828, + "step": 19210 + }, + { + "epoch": 0.9932303240142628, + "grad_norm": 0.43763041496276855, + "learning_rate": 8.134655793766237e-06, + "loss": 0.7523, + "step": 19220 + }, + { + "epoch": 0.9937470931734794, + "grad_norm": 0.4337799847126007, + "learning_rate": 8.084370530574186e-06, + "loss": 0.7738, + "step": 19230 + }, + { + "epoch": 0.994263862332696, + "grad_norm": 0.45650362968444824, + "learning_rate": 8.034396111228312e-06, + "loss": 0.7676, + "step": 19240 + }, + { + "epoch": 0.9947806314919125, + "grad_norm": 0.458556205034256, + "learning_rate": 7.98473061421344e-06, + "loss": 0.7812, + "step": 19250 + }, + { + "epoch": 0.9952974006511291, + "grad_norm": 0.4379122853279114, + "learning_rate": 7.935372129892435e-06, + "loss": 0.7653, + "step": 19260 + }, + { + "epoch": 0.9958141698103458, + "grad_norm": 0.453417032957077, + "learning_rate": 7.886318760432809e-06, + "loss": 0.7701, + "step": 19270 + }, + { + "epoch": 0.9963309389695623, + "grad_norm": 0.4366815388202667, + "learning_rate": 7.837568619733714e-06, + "loss": 0.7665, + "step": 19280 + }, + { + "epoch": 0.9968477081287789, + "grad_norm": 0.4635095000267029, + "learning_rate": 7.78911983335346e-06, + "loss": 0.7694, + "step": 19290 + }, + { + "epoch": 0.9973644772879955, + "grad_norm": 0.4435023069381714, + "learning_rate": 7.740970538437405e-06, + "loss": 0.7689, + "step": 19300 + }, + { + "epoch": 0.997881246447212, + "grad_norm": 0.432817667722702, + "learning_rate": 7.693118883646362e-06, + "loss": 0.7592, + "step": 19310 + }, + { + "epoch": 0.9983980156064286, + "grad_norm": 0.45705628395080566, + "learning_rate": 7.64556302908539e-06, + "loss": 0.77, + "step": 19320 + }, + { + "epoch": 0.9989147847656452, + "grad_norm": 0.45206621289253235, + "learning_rate": 7.598301146233062e-06, + "loss": 0.7665, + "step": 19330 + }, + { + "epoch": 0.9994315539248617, + "grad_norm": 0.42955172061920166, + "learning_rate": 7.551331417871156e-06, + "loss": 0.7619, + "step": 19340 + }, + { + "epoch": 0.9999483230840783, + "grad_norm": 0.436574250459671, + "learning_rate": 7.50465203801478e-06, + "loss": 0.7581, + "step": 19350 + } + ], + "logging_steps": 10, + "max_steps": 19351, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.2816606598299008e+17, + "train_batch_size": 512, + "trial_name": null, + "trial_params": null +}