{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1500.0, "eval_steps": 500, "global_step": 43500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.7241379310344827, "grad_norm": 2.0060818195343018, "learning_rate": 4.9955162106231316e-05, "loss": 10.4539, "step": 50 }, { "epoch": 3.4482758620689653, "grad_norm": 3.088568925857544, "learning_rate": 4.98976776270407e-05, "loss": 10.299, "step": 100 }, { "epoch": 5.172413793103448, "grad_norm": 2.993016481399536, "learning_rate": 4.984019314785008e-05, "loss": 9.8799, "step": 150 }, { "epoch": 6.896551724137931, "grad_norm": 4.346968173980713, "learning_rate": 4.9782708668659466e-05, "loss": 9.5311, "step": 200 }, { "epoch": 8.620689655172415, "grad_norm": 8.983997344970703, "learning_rate": 4.9725224189468845e-05, "loss": 9.0023, "step": 250 }, { "epoch": 10.344827586206897, "grad_norm": 7.128100872039795, "learning_rate": 4.966773971027823e-05, "loss": 8.1923, "step": 300 }, { "epoch": 12.068965517241379, "grad_norm": 5.1490936279296875, "learning_rate": 4.961025523108761e-05, "loss": 7.2506, "step": 350 }, { "epoch": 13.793103448275861, "grad_norm": 10.747542381286621, "learning_rate": 4.955277075189699e-05, "loss": 6.2387, "step": 400 }, { "epoch": 15.517241379310345, "grad_norm": 6.818395137786865, "learning_rate": 4.949528627270637e-05, "loss": 5.6948, "step": 450 }, { "epoch": 17.24137931034483, "grad_norm": 5.117465496063232, "learning_rate": 4.943780179351575e-05, "loss": 5.2608, "step": 500 }, { "epoch": 18.96551724137931, "grad_norm": 6.664745807647705, "learning_rate": 4.9380317314325137e-05, "loss": 4.8891, "step": 550 }, { "epoch": 20.689655172413794, "grad_norm": 4.525327205657959, "learning_rate": 4.9322832835134515e-05, "loss": 4.4924, "step": 600 }, { "epoch": 22.413793103448278, "grad_norm": 6.5916218757629395, "learning_rate": 4.9265348355943894e-05, "loss": 4.3038, "step": 650 }, { "epoch": 24.137931034482758, "grad_norm": 5.0870513916015625, "learning_rate": 4.920786387675328e-05, "loss": 4.1693, "step": 700 }, { "epoch": 25.862068965517242, "grad_norm": 5.148299694061279, "learning_rate": 4.9150379397562665e-05, "loss": 3.9635, "step": 750 }, { "epoch": 27.586206896551722, "grad_norm": 12.492316246032715, "learning_rate": 4.909289491837204e-05, "loss": 3.8218, "step": 800 }, { "epoch": 29.310344827586206, "grad_norm": 6.042776107788086, "learning_rate": 4.903541043918142e-05, "loss": 3.7588, "step": 850 }, { "epoch": 31.03448275862069, "grad_norm": 8.02228832244873, "learning_rate": 4.897792595999081e-05, "loss": 3.5988, "step": 900 }, { "epoch": 32.758620689655174, "grad_norm": 3.3461897373199463, "learning_rate": 4.8920441480800186e-05, "loss": 3.5428, "step": 950 }, { "epoch": 34.48275862068966, "grad_norm": 3.7409300804138184, "learning_rate": 4.8862957001609564e-05, "loss": 3.3834, "step": 1000 }, { "epoch": 36.206896551724135, "grad_norm": 4.3111395835876465, "learning_rate": 4.880547252241895e-05, "loss": 3.3318, "step": 1050 }, { "epoch": 37.93103448275862, "grad_norm": 3.6318342685699463, "learning_rate": 4.8747988043228335e-05, "loss": 3.3532, "step": 1100 }, { "epoch": 39.6551724137931, "grad_norm": 3.8173635005950928, "learning_rate": 4.869050356403771e-05, "loss": 3.2379, "step": 1150 }, { "epoch": 41.37931034482759, "grad_norm": 4.711099147796631, "learning_rate": 4.863301908484709e-05, "loss": 3.1646, "step": 1200 }, { "epoch": 43.10344827586207, "grad_norm": 7.3676347732543945, "learning_rate": 4.857553460565648e-05, "loss": 3.2186, "step": 1250 }, { "epoch": 44.827586206896555, "grad_norm": 5.0018486976623535, "learning_rate": 4.8518050126465857e-05, "loss": 3.1036, "step": 1300 }, { "epoch": 46.55172413793103, "grad_norm": 3.642249822616577, "learning_rate": 4.8460565647275235e-05, "loss": 3.0257, "step": 1350 }, { "epoch": 48.275862068965516, "grad_norm": 5.078407287597656, "learning_rate": 4.840308116808462e-05, "loss": 3.0314, "step": 1400 }, { "epoch": 50.0, "grad_norm": 6.419682025909424, "learning_rate": 4.8345596688894e-05, "loss": 3.0169, "step": 1450 }, { "epoch": 51.724137931034484, "grad_norm": 4.125535488128662, "learning_rate": 4.8288112209703385e-05, "loss": 2.9953, "step": 1500 }, { "epoch": 53.44827586206897, "grad_norm": 5.300748825073242, "learning_rate": 4.823062773051276e-05, "loss": 2.8801, "step": 1550 }, { "epoch": 55.172413793103445, "grad_norm": 4.446888446807861, "learning_rate": 4.817314325132214e-05, "loss": 2.9408, "step": 1600 }, { "epoch": 56.89655172413793, "grad_norm": 4.873482704162598, "learning_rate": 4.811565877213153e-05, "loss": 2.8675, "step": 1650 }, { "epoch": 58.62068965517241, "grad_norm": 5.3052802085876465, "learning_rate": 4.805817429294091e-05, "loss": 2.8077, "step": 1700 }, { "epoch": 60.3448275862069, "grad_norm": 4.846802711486816, "learning_rate": 4.800068981375029e-05, "loss": 2.8166, "step": 1750 }, { "epoch": 62.06896551724138, "grad_norm": 4.726503372192383, "learning_rate": 4.794320533455967e-05, "loss": 2.7935, "step": 1800 }, { "epoch": 63.793103448275865, "grad_norm": 4.825628280639648, "learning_rate": 4.7885720855369055e-05, "loss": 2.7445, "step": 1850 }, { "epoch": 65.51724137931035, "grad_norm": 5.352137565612793, "learning_rate": 4.7828236376178434e-05, "loss": 2.7607, "step": 1900 }, { "epoch": 67.24137931034483, "grad_norm": 4.513598442077637, "learning_rate": 4.777075189698781e-05, "loss": 2.6762, "step": 1950 }, { "epoch": 68.96551724137932, "grad_norm": 5.330913543701172, "learning_rate": 4.77132674177972e-05, "loss": 2.6841, "step": 2000 }, { "epoch": 70.6896551724138, "grad_norm": 7.6014933586120605, "learning_rate": 4.765578293860658e-05, "loss": 2.6781, "step": 2050 }, { "epoch": 72.41379310344827, "grad_norm": 5.973616600036621, "learning_rate": 4.7598298459415955e-05, "loss": 2.6609, "step": 2100 }, { "epoch": 74.13793103448276, "grad_norm": 3.9735934734344482, "learning_rate": 4.754081398022534e-05, "loss": 2.5562, "step": 2150 }, { "epoch": 75.86206896551724, "grad_norm": 4.1465744972229, "learning_rate": 4.7483329501034726e-05, "loss": 2.5748, "step": 2200 }, { "epoch": 77.58620689655173, "grad_norm": 5.349466800689697, "learning_rate": 4.7425845021844104e-05, "loss": 2.5821, "step": 2250 }, { "epoch": 79.3103448275862, "grad_norm": 5.090352535247803, "learning_rate": 4.736836054265348e-05, "loss": 2.5716, "step": 2300 }, { "epoch": 81.03448275862068, "grad_norm": 4.871942043304443, "learning_rate": 4.731087606346287e-05, "loss": 2.5584, "step": 2350 }, { "epoch": 82.75862068965517, "grad_norm": 5.257153511047363, "learning_rate": 4.725339158427225e-05, "loss": 2.4672, "step": 2400 }, { "epoch": 84.48275862068965, "grad_norm": 4.021986484527588, "learning_rate": 4.719590710508163e-05, "loss": 2.4606, "step": 2450 }, { "epoch": 86.20689655172414, "grad_norm": 4.317248821258545, "learning_rate": 4.713842262589101e-05, "loss": 2.4418, "step": 2500 }, { "epoch": 87.93103448275862, "grad_norm": 5.078919887542725, "learning_rate": 4.708093814670039e-05, "loss": 2.4362, "step": 2550 }, { "epoch": 89.65517241379311, "grad_norm": 5.768272876739502, "learning_rate": 4.7023453667509775e-05, "loss": 2.4501, "step": 2600 }, { "epoch": 91.37931034482759, "grad_norm": 4.903597354888916, "learning_rate": 4.696596918831916e-05, "loss": 2.3619, "step": 2650 }, { "epoch": 93.10344827586206, "grad_norm": 6.515978813171387, "learning_rate": 4.690848470912854e-05, "loss": 2.3734, "step": 2700 }, { "epoch": 94.82758620689656, "grad_norm": 10.460058212280273, "learning_rate": 4.685100022993792e-05, "loss": 2.364, "step": 2750 }, { "epoch": 96.55172413793103, "grad_norm": 7.598303318023682, "learning_rate": 4.67935157507473e-05, "loss": 2.3303, "step": 2800 }, { "epoch": 98.27586206896552, "grad_norm": 7.762308597564697, "learning_rate": 4.673603127155668e-05, "loss": 2.3892, "step": 2850 }, { "epoch": 100.0, "grad_norm": 5.2604899406433105, "learning_rate": 4.667854679236606e-05, "loss": 2.3131, "step": 2900 }, { "epoch": 101.72413793103448, "grad_norm": 6.001845836639404, "learning_rate": 4.6621062313175446e-05, "loss": 2.3053, "step": 2950 }, { "epoch": 103.44827586206897, "grad_norm": 5.902701377868652, "learning_rate": 4.656357783398483e-05, "loss": 2.284, "step": 3000 }, { "epoch": 105.17241379310344, "grad_norm": 5.38364315032959, "learning_rate": 4.65060933547942e-05, "loss": 2.2305, "step": 3050 }, { "epoch": 106.89655172413794, "grad_norm": 6.552408695220947, "learning_rate": 4.644860887560359e-05, "loss": 2.2879, "step": 3100 }, { "epoch": 108.62068965517241, "grad_norm": 6.03168249130249, "learning_rate": 4.6391124396412974e-05, "loss": 2.3032, "step": 3150 }, { "epoch": 110.34482758620689, "grad_norm": 5.571142673492432, "learning_rate": 4.633363991722235e-05, "loss": 2.1709, "step": 3200 }, { "epoch": 112.06896551724138, "grad_norm": 5.904757976531982, "learning_rate": 4.627615543803173e-05, "loss": 2.2252, "step": 3250 }, { "epoch": 113.79310344827586, "grad_norm": 7.679347515106201, "learning_rate": 4.6218670958841116e-05, "loss": 2.2069, "step": 3300 }, { "epoch": 115.51724137931035, "grad_norm": 5.991060733795166, "learning_rate": 4.6161186479650495e-05, "loss": 2.1673, "step": 3350 }, { "epoch": 117.24137931034483, "grad_norm": 5.203935146331787, "learning_rate": 4.610370200045988e-05, "loss": 2.1409, "step": 3400 }, { "epoch": 118.96551724137932, "grad_norm": 5.779735088348389, "learning_rate": 4.604621752126926e-05, "loss": 2.1658, "step": 3450 }, { "epoch": 120.6896551724138, "grad_norm": 6.381357669830322, "learning_rate": 4.598873304207864e-05, "loss": 2.1707, "step": 3500 }, { "epoch": 122.41379310344827, "grad_norm": 6.57841682434082, "learning_rate": 4.593124856288802e-05, "loss": 2.1486, "step": 3550 }, { "epoch": 124.13793103448276, "grad_norm": 5.283041477203369, "learning_rate": 4.58737640836974e-05, "loss": 2.1096, "step": 3600 }, { "epoch": 125.86206896551724, "grad_norm": 6.344391822814941, "learning_rate": 4.581627960450679e-05, "loss": 2.0739, "step": 3650 }, { "epoch": 127.58620689655173, "grad_norm": 8.262744903564453, "learning_rate": 4.5758795125316166e-05, "loss": 2.0606, "step": 3700 }, { "epoch": 129.31034482758622, "grad_norm": 6.685667037963867, "learning_rate": 4.570131064612555e-05, "loss": 2.1267, "step": 3750 }, { "epoch": 131.0344827586207, "grad_norm": 5.96486234664917, "learning_rate": 4.564382616693493e-05, "loss": 2.011, "step": 3800 }, { "epoch": 132.75862068965517, "grad_norm": 5.791350364685059, "learning_rate": 4.558634168774431e-05, "loss": 2.0566, "step": 3850 }, { "epoch": 134.48275862068965, "grad_norm": 6.358405590057373, "learning_rate": 4.5528857208553694e-05, "loss": 1.9882, "step": 3900 }, { "epoch": 136.20689655172413, "grad_norm": 7.524918079376221, "learning_rate": 4.547137272936308e-05, "loss": 2.0044, "step": 3950 }, { "epoch": 137.93103448275863, "grad_norm": 12.758820533752441, "learning_rate": 4.541388825017245e-05, "loss": 2.0085, "step": 4000 }, { "epoch": 139.6551724137931, "grad_norm": 5.038642883300781, "learning_rate": 4.5356403770981836e-05, "loss": 1.9535, "step": 4050 }, { "epoch": 141.3793103448276, "grad_norm": 5.867166519165039, "learning_rate": 4.529891929179122e-05, "loss": 1.9702, "step": 4100 }, { "epoch": 143.10344827586206, "grad_norm": 8.320228576660156, "learning_rate": 4.52414348126006e-05, "loss": 1.9882, "step": 4150 }, { "epoch": 144.82758620689654, "grad_norm": 11.666614532470703, "learning_rate": 4.518395033340998e-05, "loss": 1.9514, "step": 4200 }, { "epoch": 146.55172413793105, "grad_norm": 8.031473159790039, "learning_rate": 4.5126465854219364e-05, "loss": 1.9452, "step": 4250 }, { "epoch": 148.27586206896552, "grad_norm": 7.733284950256348, "learning_rate": 4.506898137502874e-05, "loss": 1.9922, "step": 4300 }, { "epoch": 150.0, "grad_norm": 7.646688938140869, "learning_rate": 4.501149689583812e-05, "loss": 1.9233, "step": 4350 }, { "epoch": 151.72413793103448, "grad_norm": 7.040040016174316, "learning_rate": 4.495401241664751e-05, "loss": 1.9004, "step": 4400 }, { "epoch": 153.44827586206895, "grad_norm": 7.619781494140625, "learning_rate": 4.489652793745689e-05, "loss": 1.9199, "step": 4450 }, { "epoch": 155.17241379310346, "grad_norm": 6.080178737640381, "learning_rate": 4.483904345826627e-05, "loss": 1.8664, "step": 4500 }, { "epoch": 156.89655172413794, "grad_norm": 7.275068759918213, "learning_rate": 4.478155897907565e-05, "loss": 1.8476, "step": 4550 }, { "epoch": 158.6206896551724, "grad_norm": 7.104513168334961, "learning_rate": 4.4724074499885035e-05, "loss": 1.912, "step": 4600 }, { "epoch": 160.3448275862069, "grad_norm": 7.298790454864502, "learning_rate": 4.4666590020694414e-05, "loss": 1.8348, "step": 4650 }, { "epoch": 162.06896551724137, "grad_norm": 7.384130954742432, "learning_rate": 4.46091055415038e-05, "loss": 1.8569, "step": 4700 }, { "epoch": 163.79310344827587, "grad_norm": 7.758162498474121, "learning_rate": 4.455162106231318e-05, "loss": 1.7858, "step": 4750 }, { "epoch": 165.51724137931035, "grad_norm": 7.239351749420166, "learning_rate": 4.4494136583122556e-05, "loss": 1.8161, "step": 4800 }, { "epoch": 167.24137931034483, "grad_norm": 6.6620001792907715, "learning_rate": 4.443665210393194e-05, "loss": 1.8066, "step": 4850 }, { "epoch": 168.9655172413793, "grad_norm": 10.143238067626953, "learning_rate": 4.437916762474133e-05, "loss": 1.7736, "step": 4900 }, { "epoch": 170.68965517241378, "grad_norm": 7.6075439453125, "learning_rate": 4.43216831455507e-05, "loss": 1.7622, "step": 4950 }, { "epoch": 172.41379310344828, "grad_norm": 6.34699821472168, "learning_rate": 4.4264198666360084e-05, "loss": 1.7531, "step": 5000 }, { "epoch": 174.13793103448276, "grad_norm": 8.876733779907227, "learning_rate": 4.420671418716947e-05, "loss": 1.779, "step": 5050 }, { "epoch": 175.86206896551724, "grad_norm": 8.35562515258789, "learning_rate": 4.414922970797885e-05, "loss": 1.7321, "step": 5100 }, { "epoch": 177.58620689655172, "grad_norm": 10.794508934020996, "learning_rate": 4.409174522878823e-05, "loss": 1.7324, "step": 5150 }, { "epoch": 179.31034482758622, "grad_norm": 8.842728614807129, "learning_rate": 4.403426074959761e-05, "loss": 1.7285, "step": 5200 }, { "epoch": 181.0344827586207, "grad_norm": 7.502677917480469, "learning_rate": 4.397677627040699e-05, "loss": 1.768, "step": 5250 }, { "epoch": 182.75862068965517, "grad_norm": 8.119267463684082, "learning_rate": 4.391929179121637e-05, "loss": 1.7321, "step": 5300 }, { "epoch": 184.48275862068965, "grad_norm": 8.223883628845215, "learning_rate": 4.3861807312025755e-05, "loss": 1.6645, "step": 5350 }, { "epoch": 186.20689655172413, "grad_norm": 9.334975242614746, "learning_rate": 4.380432283283514e-05, "loss": 1.6582, "step": 5400 }, { "epoch": 187.93103448275863, "grad_norm": 7.834394931793213, "learning_rate": 4.374683835364452e-05, "loss": 1.734, "step": 5450 }, { "epoch": 189.6551724137931, "grad_norm": 8.89294147491455, "learning_rate": 4.36893538744539e-05, "loss": 1.6894, "step": 5500 }, { "epoch": 191.3793103448276, "grad_norm": 8.902423858642578, "learning_rate": 4.363186939526328e-05, "loss": 1.6576, "step": 5550 }, { "epoch": 193.10344827586206, "grad_norm": 12.103850364685059, "learning_rate": 4.357438491607266e-05, "loss": 1.7019, "step": 5600 }, { "epoch": 194.82758620689654, "grad_norm": 8.310213088989258, "learning_rate": 4.351690043688205e-05, "loss": 1.642, "step": 5650 }, { "epoch": 196.55172413793105, "grad_norm": 8.348377227783203, "learning_rate": 4.3459415957691426e-05, "loss": 1.6779, "step": 5700 }, { "epoch": 198.27586206896552, "grad_norm": 9.531669616699219, "learning_rate": 4.3401931478500804e-05, "loss": 1.6265, "step": 5750 }, { "epoch": 200.0, "grad_norm": 9.664142608642578, "learning_rate": 4.334444699931019e-05, "loss": 1.6541, "step": 5800 }, { "epoch": 201.72413793103448, "grad_norm": 9.6816987991333, "learning_rate": 4.328696252011957e-05, "loss": 1.609, "step": 5850 }, { "epoch": 203.44827586206895, "grad_norm": 6.469832420349121, "learning_rate": 4.322947804092895e-05, "loss": 1.7454, "step": 5900 }, { "epoch": 205.17241379310346, "grad_norm": 7.778339862823486, "learning_rate": 4.317199356173833e-05, "loss": 1.614, "step": 5950 }, { "epoch": 206.89655172413794, "grad_norm": 7.958249092102051, "learning_rate": 4.311450908254772e-05, "loss": 1.5781, "step": 6000 }, { "epoch": 208.6206896551724, "grad_norm": 11.570052146911621, "learning_rate": 4.3057024603357096e-05, "loss": 1.5946, "step": 6050 }, { "epoch": 210.3448275862069, "grad_norm": 6.685848712921143, "learning_rate": 4.2999540124166475e-05, "loss": 1.5809, "step": 6100 }, { "epoch": 212.06896551724137, "grad_norm": 8.674286842346191, "learning_rate": 4.294205564497586e-05, "loss": 1.5305, "step": 6150 }, { "epoch": 213.79310344827587, "grad_norm": 9.578246116638184, "learning_rate": 4.288457116578524e-05, "loss": 1.5231, "step": 6200 }, { "epoch": 215.51724137931035, "grad_norm": 8.25148868560791, "learning_rate": 4.282708668659462e-05, "loss": 1.5513, "step": 6250 }, { "epoch": 217.24137931034483, "grad_norm": 9.481741905212402, "learning_rate": 4.2769602207404e-05, "loss": 1.5018, "step": 6300 }, { "epoch": 218.9655172413793, "grad_norm": 15.537293434143066, "learning_rate": 4.271211772821339e-05, "loss": 1.5322, "step": 6350 }, { "epoch": 220.68965517241378, "grad_norm": 9.168423652648926, "learning_rate": 4.265463324902277e-05, "loss": 1.5052, "step": 6400 }, { "epoch": 222.41379310344828, "grad_norm": 10.939312934875488, "learning_rate": 4.2597148769832146e-05, "loss": 1.5532, "step": 6450 }, { "epoch": 224.13793103448276, "grad_norm": 11.567139625549316, "learning_rate": 4.253966429064153e-05, "loss": 1.4844, "step": 6500 }, { "epoch": 225.86206896551724, "grad_norm": 8.810647010803223, "learning_rate": 4.248217981145091e-05, "loss": 1.4638, "step": 6550 }, { "epoch": 227.58620689655172, "grad_norm": 10.871058464050293, "learning_rate": 4.242469533226029e-05, "loss": 1.5207, "step": 6600 }, { "epoch": 229.31034482758622, "grad_norm": 10.270922660827637, "learning_rate": 4.2367210853069674e-05, "loss": 1.4472, "step": 6650 }, { "epoch": 231.0344827586207, "grad_norm": 7.7902727127075195, "learning_rate": 4.230972637387905e-05, "loss": 1.5099, "step": 6700 }, { "epoch": 232.75862068965517, "grad_norm": 9.806976318359375, "learning_rate": 4.225224189468844e-05, "loss": 1.6083, "step": 6750 }, { "epoch": 234.48275862068965, "grad_norm": 6.84442663192749, "learning_rate": 4.2194757415497816e-05, "loss": 1.489, "step": 6800 }, { "epoch": 236.20689655172413, "grad_norm": 10.46094799041748, "learning_rate": 4.2137272936307195e-05, "loss": 1.3763, "step": 6850 }, { "epoch": 237.93103448275863, "grad_norm": 9.683454513549805, "learning_rate": 4.207978845711658e-05, "loss": 1.4695, "step": 6900 }, { "epoch": 239.6551724137931, "grad_norm": 7.90580940246582, "learning_rate": 4.2022303977925966e-05, "loss": 1.3974, "step": 6950 }, { "epoch": 241.3793103448276, "grad_norm": 9.816710472106934, "learning_rate": 4.1964819498735344e-05, "loss": 1.4501, "step": 7000 }, { "epoch": 243.10344827586206, "grad_norm": 8.528003692626953, "learning_rate": 4.190733501954472e-05, "loss": 1.4518, "step": 7050 }, { "epoch": 244.82758620689654, "grad_norm": 9.941585540771484, "learning_rate": 4.184985054035411e-05, "loss": 1.4164, "step": 7100 }, { "epoch": 246.55172413793105, "grad_norm": 8.947132110595703, "learning_rate": 4.1792366061163494e-05, "loss": 1.5914, "step": 7150 }, { "epoch": 248.27586206896552, "grad_norm": 10.514062881469727, "learning_rate": 4.1734881581972866e-05, "loss": 1.4042, "step": 7200 }, { "epoch": 250.0, "grad_norm": 8.969085693359375, "learning_rate": 4.167739710278225e-05, "loss": 1.4252, "step": 7250 }, { "epoch": 251.72413793103448, "grad_norm": 10.117509841918945, "learning_rate": 4.1619912623591636e-05, "loss": 1.3672, "step": 7300 }, { "epoch": 253.44827586206895, "grad_norm": 11.695943832397461, "learning_rate": 4.156242814440101e-05, "loss": 1.4913, "step": 7350 }, { "epoch": 255.17241379310346, "grad_norm": 11.890349388122559, "learning_rate": 4.1504943665210394e-05, "loss": 1.4323, "step": 7400 }, { "epoch": 256.8965517241379, "grad_norm": 10.415454864501953, "learning_rate": 4.144745918601978e-05, "loss": 1.406, "step": 7450 }, { "epoch": 258.62068965517244, "grad_norm": 9.543240547180176, "learning_rate": 4.138997470682916e-05, "loss": 1.5744, "step": 7500 }, { "epoch": 260.3448275862069, "grad_norm": 11.878031730651855, "learning_rate": 4.1332490227638536e-05, "loss": 1.3432, "step": 7550 }, { "epoch": 262.0689655172414, "grad_norm": 12.324782371520996, "learning_rate": 4.127500574844792e-05, "loss": 1.5018, "step": 7600 }, { "epoch": 263.7931034482759, "grad_norm": 21.42174530029297, "learning_rate": 4.12175212692573e-05, "loss": 1.3119, "step": 7650 }, { "epoch": 265.51724137931035, "grad_norm": 17.67474937438965, "learning_rate": 4.1160036790066686e-05, "loss": 1.3519, "step": 7700 }, { "epoch": 267.2413793103448, "grad_norm": 10.17497730255127, "learning_rate": 4.1102552310876064e-05, "loss": 1.2818, "step": 7750 }, { "epoch": 268.9655172413793, "grad_norm": 14.742813110351562, "learning_rate": 4.104506783168545e-05, "loss": 1.319, "step": 7800 }, { "epoch": 270.6896551724138, "grad_norm": 10.701860427856445, "learning_rate": 4.098758335249483e-05, "loss": 1.4599, "step": 7850 }, { "epoch": 272.41379310344826, "grad_norm": 173.8832550048828, "learning_rate": 4.0930098873304214e-05, "loss": 1.3641, "step": 7900 }, { "epoch": 274.13793103448273, "grad_norm": 15.688765525817871, "learning_rate": 4.087261439411359e-05, "loss": 1.6463, "step": 7950 }, { "epoch": 275.86206896551727, "grad_norm": 10.024360656738281, "learning_rate": 4.081512991492297e-05, "loss": 1.2864, "step": 8000 }, { "epoch": 277.58620689655174, "grad_norm": 8.830585479736328, "learning_rate": 4.0757645435732356e-05, "loss": 1.2916, "step": 8050 }, { "epoch": 279.3103448275862, "grad_norm": 9.77102279663086, "learning_rate": 4.0700160956541735e-05, "loss": 1.2564, "step": 8100 }, { "epoch": 281.0344827586207, "grad_norm": 9.754477500915527, "learning_rate": 4.0642676477351114e-05, "loss": 1.2283, "step": 8150 }, { "epoch": 282.7586206896552, "grad_norm": 11.53702163696289, "learning_rate": 4.05851919981605e-05, "loss": 1.449, "step": 8200 }, { "epoch": 284.48275862068965, "grad_norm": 13.12060260772705, "learning_rate": 4.0527707518969884e-05, "loss": 1.1929, "step": 8250 }, { "epoch": 286.2068965517241, "grad_norm": 17.75617027282715, "learning_rate": 4.0470223039779256e-05, "loss": 1.117, "step": 8300 }, { "epoch": 287.9310344827586, "grad_norm": 11.70126724243164, "learning_rate": 4.041273856058864e-05, "loss": 1.1659, "step": 8350 }, { "epoch": 289.6551724137931, "grad_norm": 9.401458740234375, "learning_rate": 4.035525408139803e-05, "loss": 1.1066, "step": 8400 }, { "epoch": 291.37931034482756, "grad_norm": 11.178382873535156, "learning_rate": 4.0297769602207406e-05, "loss": 1.146, "step": 8450 }, { "epoch": 293.1034482758621, "grad_norm": 11.62197208404541, "learning_rate": 4.0240285123016784e-05, "loss": 1.142, "step": 8500 }, { "epoch": 294.82758620689657, "grad_norm": 13.076742172241211, "learning_rate": 4.018280064382617e-05, "loss": 1.0669, "step": 8550 }, { "epoch": 296.55172413793105, "grad_norm": 14.376165390014648, "learning_rate": 4.012531616463555e-05, "loss": 1.0488, "step": 8600 }, { "epoch": 298.2758620689655, "grad_norm": 8.637730598449707, "learning_rate": 4.0067831685444934e-05, "loss": 1.0768, "step": 8650 }, { "epoch": 300.0, "grad_norm": 20.18622589111328, "learning_rate": 4.001034720625431e-05, "loss": 1.1276, "step": 8700 }, { "epoch": 301.7241379310345, "grad_norm": 14.751632690429688, "learning_rate": 3.99528627270637e-05, "loss": 1.1125, "step": 8750 }, { "epoch": 303.44827586206895, "grad_norm": 13.578910827636719, "learning_rate": 3.9895378247873076e-05, "loss": 1.0394, "step": 8800 }, { "epoch": 305.17241379310343, "grad_norm": 12.440682411193848, "learning_rate": 3.9837893768682455e-05, "loss": 1.0411, "step": 8850 }, { "epoch": 306.8965517241379, "grad_norm": 12.8425931930542, "learning_rate": 3.978040928949184e-05, "loss": 1.0436, "step": 8900 }, { "epoch": 308.62068965517244, "grad_norm": 11.793864250183105, "learning_rate": 3.972292481030122e-05, "loss": 1.0232, "step": 8950 }, { "epoch": 310.3448275862069, "grad_norm": 10.135506629943848, "learning_rate": 3.9665440331110604e-05, "loss": 1.1059, "step": 9000 }, { "epoch": 312.0689655172414, "grad_norm": 12.296060562133789, "learning_rate": 3.960795585191998e-05, "loss": 1.0182, "step": 9050 }, { "epoch": 313.7931034482759, "grad_norm": 10.98312759399414, "learning_rate": 3.955047137272936e-05, "loss": 1.0216, "step": 9100 }, { "epoch": 315.51724137931035, "grad_norm": 11.333012580871582, "learning_rate": 3.949298689353875e-05, "loss": 0.9825, "step": 9150 }, { "epoch": 317.2413793103448, "grad_norm": 9.772345542907715, "learning_rate": 3.943550241434813e-05, "loss": 0.9759, "step": 9200 }, { "epoch": 318.9655172413793, "grad_norm": 7.699130535125732, "learning_rate": 3.9378017935157504e-05, "loss": 0.9963, "step": 9250 }, { "epoch": 320.6896551724138, "grad_norm": 11.26038646697998, "learning_rate": 3.932053345596689e-05, "loss": 0.9921, "step": 9300 }, { "epoch": 322.41379310344826, "grad_norm": 9.768702507019043, "learning_rate": 3.9263048976776275e-05, "loss": 1.0894, "step": 9350 }, { "epoch": 324.13793103448273, "grad_norm": 10.358709335327148, "learning_rate": 3.9205564497585654e-05, "loss": 0.9916, "step": 9400 }, { "epoch": 325.86206896551727, "grad_norm": 13.890983581542969, "learning_rate": 3.914808001839503e-05, "loss": 1.0137, "step": 9450 }, { "epoch": 327.58620689655174, "grad_norm": 9.678529739379883, "learning_rate": 3.909059553920442e-05, "loss": 0.9861, "step": 9500 }, { "epoch": 329.3103448275862, "grad_norm": 11.910717010498047, "learning_rate": 3.9033111060013796e-05, "loss": 0.9545, "step": 9550 }, { "epoch": 331.0344827586207, "grad_norm": 7.422963619232178, "learning_rate": 3.8975626580823175e-05, "loss": 0.9594, "step": 9600 }, { "epoch": 332.7586206896552, "grad_norm": 9.682363510131836, "learning_rate": 3.891814210163256e-05, "loss": 0.9851, "step": 9650 }, { "epoch": 334.48275862068965, "grad_norm": 9.991140365600586, "learning_rate": 3.8860657622441946e-05, "loss": 0.9766, "step": 9700 }, { "epoch": 336.2068965517241, "grad_norm": 16.067174911499023, "learning_rate": 3.8803173143251324e-05, "loss": 0.9885, "step": 9750 }, { "epoch": 337.9310344827586, "grad_norm": 15.498530387878418, "learning_rate": 3.87456886640607e-05, "loss": 0.967, "step": 9800 }, { "epoch": 339.6551724137931, "grad_norm": 12.12011432647705, "learning_rate": 3.868820418487009e-05, "loss": 0.8778, "step": 9850 }, { "epoch": 341.37931034482756, "grad_norm": 10.55134105682373, "learning_rate": 3.863071970567947e-05, "loss": 0.9546, "step": 9900 }, { "epoch": 343.1034482758621, "grad_norm": 17.042943954467773, "learning_rate": 3.857323522648885e-05, "loss": 0.9069, "step": 9950 }, { "epoch": 344.82758620689657, "grad_norm": 10.811591148376465, "learning_rate": 3.851575074729823e-05, "loss": 0.9093, "step": 10000 }, { "epoch": 346.55172413793105, "grad_norm": 12.173806190490723, "learning_rate": 3.845826626810761e-05, "loss": 0.9254, "step": 10050 }, { "epoch": 348.2758620689655, "grad_norm": 9.049908638000488, "learning_rate": 3.8400781788916995e-05, "loss": 0.9094, "step": 10100 }, { "epoch": 350.0, "grad_norm": 18.905593872070312, "learning_rate": 3.834329730972638e-05, "loss": 0.9224, "step": 10150 }, { "epoch": 351.7241379310345, "grad_norm": 13.81609058380127, "learning_rate": 3.828581283053575e-05, "loss": 0.911, "step": 10200 }, { "epoch": 353.44827586206895, "grad_norm": 12.098998069763184, "learning_rate": 3.822832835134514e-05, "loss": 0.9006, "step": 10250 }, { "epoch": 355.17241379310343, "grad_norm": 11.600626945495605, "learning_rate": 3.817084387215452e-05, "loss": 0.9531, "step": 10300 }, { "epoch": 356.8965517241379, "grad_norm": 12.164990425109863, "learning_rate": 3.81133593929639e-05, "loss": 0.8933, "step": 10350 }, { "epoch": 358.62068965517244, "grad_norm": 15.047524452209473, "learning_rate": 3.805587491377328e-05, "loss": 0.8679, "step": 10400 }, { "epoch": 360.3448275862069, "grad_norm": 9.286539077758789, "learning_rate": 3.7998390434582666e-05, "loss": 0.8657, "step": 10450 }, { "epoch": 362.0689655172414, "grad_norm": 9.160314559936523, "learning_rate": 3.794090595539205e-05, "loss": 0.8485, "step": 10500 }, { "epoch": 363.7931034482759, "grad_norm": 9.045252799987793, "learning_rate": 3.788342147620142e-05, "loss": 0.8985, "step": 10550 }, { "epoch": 365.51724137931035, "grad_norm": 14.365066528320312, "learning_rate": 3.782593699701081e-05, "loss": 0.8442, "step": 10600 }, { "epoch": 367.2413793103448, "grad_norm": 11.80059814453125, "learning_rate": 3.7768452517820194e-05, "loss": 0.8944, "step": 10650 }, { "epoch": 368.9655172413793, "grad_norm": 12.03311538696289, "learning_rate": 3.771096803862957e-05, "loss": 0.8249, "step": 10700 }, { "epoch": 370.6896551724138, "grad_norm": 11.342610359191895, "learning_rate": 3.765348355943895e-05, "loss": 0.862, "step": 10750 }, { "epoch": 372.41379310344826, "grad_norm": 12.84242057800293, "learning_rate": 3.7595999080248336e-05, "loss": 0.8232, "step": 10800 }, { "epoch": 374.13793103448273, "grad_norm": 12.429469108581543, "learning_rate": 3.7538514601057715e-05, "loss": 0.8271, "step": 10850 }, { "epoch": 375.86206896551727, "grad_norm": 14.474736213684082, "learning_rate": 3.74810301218671e-05, "loss": 0.8412, "step": 10900 }, { "epoch": 377.58620689655174, "grad_norm": 12.938905715942383, "learning_rate": 3.742354564267648e-05, "loss": 0.9458, "step": 10950 }, { "epoch": 379.3103448275862, "grad_norm": 13.770180702209473, "learning_rate": 3.736606116348586e-05, "loss": 0.8541, "step": 11000 }, { "epoch": 381.0344827586207, "grad_norm": 12.744041442871094, "learning_rate": 3.730857668429524e-05, "loss": 0.8079, "step": 11050 }, { "epoch": 382.7586206896552, "grad_norm": 11.498470306396484, "learning_rate": 3.725109220510463e-05, "loss": 0.8148, "step": 11100 }, { "epoch": 384.48275862068965, "grad_norm": 12.428725242614746, "learning_rate": 3.719360772591401e-05, "loss": 0.7774, "step": 11150 }, { "epoch": 386.2068965517241, "grad_norm": 14.606568336486816, "learning_rate": 3.7136123246723386e-05, "loss": 0.8475, "step": 11200 }, { "epoch": 387.9310344827586, "grad_norm": 8.876829147338867, "learning_rate": 3.707863876753277e-05, "loss": 0.8082, "step": 11250 }, { "epoch": 389.6551724137931, "grad_norm": 10.602686882019043, "learning_rate": 3.702115428834215e-05, "loss": 0.7823, "step": 11300 }, { "epoch": 391.37931034482756, "grad_norm": 16.462186813354492, "learning_rate": 3.696366980915153e-05, "loss": 0.8117, "step": 11350 }, { "epoch": 393.1034482758621, "grad_norm": 9.348710060119629, "learning_rate": 3.6906185329960914e-05, "loss": 0.8005, "step": 11400 }, { "epoch": 394.82758620689657, "grad_norm": 12.182049751281738, "learning_rate": 3.68487008507703e-05, "loss": 0.7708, "step": 11450 }, { "epoch": 396.55172413793105, "grad_norm": 10.518006324768066, "learning_rate": 3.679121637157967e-05, "loss": 0.8198, "step": 11500 }, { "epoch": 398.2758620689655, "grad_norm": 16.45381736755371, "learning_rate": 3.6733731892389056e-05, "loss": 0.8599, "step": 11550 }, { "epoch": 400.0, "grad_norm": 14.395490646362305, "learning_rate": 3.667624741319844e-05, "loss": 0.7781, "step": 11600 }, { "epoch": 401.7241379310345, "grad_norm": 10.24387264251709, "learning_rate": 3.661876293400782e-05, "loss": 0.8065, "step": 11650 }, { "epoch": 403.44827586206895, "grad_norm": 19.994789123535156, "learning_rate": 3.65612784548172e-05, "loss": 0.7893, "step": 11700 }, { "epoch": 405.17241379310343, "grad_norm": 16.570329666137695, "learning_rate": 3.6503793975626584e-05, "loss": 0.8214, "step": 11750 }, { "epoch": 406.8965517241379, "grad_norm": 9.087477684020996, "learning_rate": 3.644630949643596e-05, "loss": 0.802, "step": 11800 }, { "epoch": 408.62068965517244, "grad_norm": 14.778762817382812, "learning_rate": 3.638882501724535e-05, "loss": 0.8271, "step": 11850 }, { "epoch": 410.3448275862069, "grad_norm": 9.180350303649902, "learning_rate": 3.633134053805473e-05, "loss": 0.8113, "step": 11900 }, { "epoch": 412.0689655172414, "grad_norm": 9.583569526672363, "learning_rate": 3.6273856058864105e-05, "loss": 0.7981, "step": 11950 }, { "epoch": 413.7931034482759, "grad_norm": 9.231134414672852, "learning_rate": 3.621637157967349e-05, "loss": 0.7608, "step": 12000 }, { "epoch": 415.51724137931035, "grad_norm": 12.140012741088867, "learning_rate": 3.615888710048287e-05, "loss": 0.7939, "step": 12050 }, { "epoch": 417.2413793103448, "grad_norm": 10.847981452941895, "learning_rate": 3.6101402621292255e-05, "loss": 0.7875, "step": 12100 }, { "epoch": 418.9655172413793, "grad_norm": 12.358875274658203, "learning_rate": 3.6043918142101634e-05, "loss": 0.7377, "step": 12150 }, { "epoch": 420.6896551724138, "grad_norm": 10.942302703857422, "learning_rate": 3.598643366291102e-05, "loss": 0.7476, "step": 12200 }, { "epoch": 422.41379310344826, "grad_norm": 8.106229782104492, "learning_rate": 3.59289491837204e-05, "loss": 0.7749, "step": 12250 }, { "epoch": 424.13793103448273, "grad_norm": 21.18960189819336, "learning_rate": 3.5871464704529776e-05, "loss": 0.7272, "step": 12300 }, { "epoch": 425.86206896551727, "grad_norm": 13.392748832702637, "learning_rate": 3.581398022533916e-05, "loss": 0.746, "step": 12350 }, { "epoch": 427.58620689655174, "grad_norm": 12.348884582519531, "learning_rate": 3.575649574614855e-05, "loss": 0.7171, "step": 12400 }, { "epoch": 429.3103448275862, "grad_norm": 13.730042457580566, "learning_rate": 3.569901126695792e-05, "loss": 0.7706, "step": 12450 }, { "epoch": 431.0344827586207, "grad_norm": 12.708958625793457, "learning_rate": 3.5641526787767304e-05, "loss": 0.7415, "step": 12500 }, { "epoch": 432.7586206896552, "grad_norm": 12.088113784790039, "learning_rate": 3.558404230857669e-05, "loss": 0.7616, "step": 12550 }, { "epoch": 434.48275862068965, "grad_norm": 11.354323387145996, "learning_rate": 3.552655782938607e-05, "loss": 0.7229, "step": 12600 }, { "epoch": 436.2068965517241, "grad_norm": 12.352569580078125, "learning_rate": 3.546907335019545e-05, "loss": 0.7204, "step": 12650 }, { "epoch": 437.9310344827586, "grad_norm": 14.099604606628418, "learning_rate": 3.541158887100483e-05, "loss": 0.7759, "step": 12700 }, { "epoch": 439.6551724137931, "grad_norm": 13.68437385559082, "learning_rate": 3.535410439181421e-05, "loss": 0.7255, "step": 12750 }, { "epoch": 441.37931034482756, "grad_norm": 11.179208755493164, "learning_rate": 3.529661991262359e-05, "loss": 0.7546, "step": 12800 }, { "epoch": 443.1034482758621, "grad_norm": 7.51722526550293, "learning_rate": 3.5239135433432975e-05, "loss": 0.6994, "step": 12850 }, { "epoch": 444.82758620689657, "grad_norm": 14.899781227111816, "learning_rate": 3.5181650954242353e-05, "loss": 0.7195, "step": 12900 }, { "epoch": 446.55172413793105, "grad_norm": 9.282757759094238, "learning_rate": 3.512416647505174e-05, "loss": 0.7282, "step": 12950 }, { "epoch": 448.2758620689655, "grad_norm": 7.849394798278809, "learning_rate": 3.506668199586112e-05, "loss": 0.7103, "step": 13000 }, { "epoch": 450.0, "grad_norm": 16.93404769897461, "learning_rate": 3.50091975166705e-05, "loss": 0.7365, "step": 13050 }, { "epoch": 451.7241379310345, "grad_norm": 16.34079933166504, "learning_rate": 3.495171303747988e-05, "loss": 0.6811, "step": 13100 }, { "epoch": 453.44827586206895, "grad_norm": 9.280820846557617, "learning_rate": 3.489422855828927e-05, "loss": 0.8149, "step": 13150 }, { "epoch": 455.17241379310343, "grad_norm": 12.158914566040039, "learning_rate": 3.4836744079098645e-05, "loss": 0.6775, "step": 13200 }, { "epoch": 456.8965517241379, "grad_norm": 11.688444137573242, "learning_rate": 3.4779259599908024e-05, "loss": 0.6975, "step": 13250 }, { "epoch": 458.62068965517244, "grad_norm": 12.36756706237793, "learning_rate": 3.472177512071741e-05, "loss": 0.7026, "step": 13300 }, { "epoch": 460.3448275862069, "grad_norm": 12.185371398925781, "learning_rate": 3.4664290641526795e-05, "loss": 0.6919, "step": 13350 }, { "epoch": 462.0689655172414, "grad_norm": 8.0114107131958, "learning_rate": 3.460680616233617e-05, "loss": 0.7031, "step": 13400 }, { "epoch": 463.7931034482759, "grad_norm": 14.150402069091797, "learning_rate": 3.454932168314555e-05, "loss": 0.7309, "step": 13450 }, { "epoch": 465.51724137931035, "grad_norm": 18.634939193725586, "learning_rate": 3.449183720395494e-05, "loss": 0.6818, "step": 13500 }, { "epoch": 467.2413793103448, "grad_norm": 13.656394958496094, "learning_rate": 3.443435272476431e-05, "loss": 0.688, "step": 13550 }, { "epoch": 468.9655172413793, "grad_norm": 8.369678497314453, "learning_rate": 3.4376868245573695e-05, "loss": 0.6834, "step": 13600 }, { "epoch": 470.6896551724138, "grad_norm": 8.357674598693848, "learning_rate": 3.431938376638308e-05, "loss": 0.6693, "step": 13650 }, { "epoch": 472.41379310344826, "grad_norm": 26.532268524169922, "learning_rate": 3.426189928719246e-05, "loss": 0.6756, "step": 13700 }, { "epoch": 474.13793103448273, "grad_norm": 12.204858779907227, "learning_rate": 3.420441480800184e-05, "loss": 0.6426, "step": 13750 }, { "epoch": 475.86206896551727, "grad_norm": 8.887661933898926, "learning_rate": 3.414693032881122e-05, "loss": 0.6552, "step": 13800 }, { "epoch": 477.58620689655174, "grad_norm": 12.329022407531738, "learning_rate": 3.408944584962061e-05, "loss": 0.6569, "step": 13850 }, { "epoch": 479.3103448275862, "grad_norm": 11.519402503967285, "learning_rate": 3.403196137042999e-05, "loss": 0.6791, "step": 13900 }, { "epoch": 481.0344827586207, "grad_norm": 9.974365234375, "learning_rate": 3.3974476891239365e-05, "loss": 0.7103, "step": 13950 }, { "epoch": 482.7586206896552, "grad_norm": 15.297700881958008, "learning_rate": 3.391699241204875e-05, "loss": 0.6867, "step": 14000 }, { "epoch": 484.48275862068965, "grad_norm": 8.194988250732422, "learning_rate": 3.385950793285813e-05, "loss": 0.6377, "step": 14050 }, { "epoch": 486.2068965517241, "grad_norm": 14.536419868469238, "learning_rate": 3.3802023453667515e-05, "loss": 0.6888, "step": 14100 }, { "epoch": 487.9310344827586, "grad_norm": 8.925668716430664, "learning_rate": 3.3744538974476893e-05, "loss": 0.6544, "step": 14150 }, { "epoch": 489.6551724137931, "grad_norm": 16.66129493713379, "learning_rate": 3.368705449528627e-05, "loss": 0.6428, "step": 14200 }, { "epoch": 491.37931034482756, "grad_norm": 10.90417766571045, "learning_rate": 3.362957001609566e-05, "loss": 0.6716, "step": 14250 }, { "epoch": 493.1034482758621, "grad_norm": 17.86897850036621, "learning_rate": 3.3572085536905036e-05, "loss": 0.6873, "step": 14300 }, { "epoch": 494.82758620689657, "grad_norm": 9.528550148010254, "learning_rate": 3.3514601057714415e-05, "loss": 0.644, "step": 14350 }, { "epoch": 496.55172413793105, "grad_norm": 10.691713333129883, "learning_rate": 3.34571165785238e-05, "loss": 0.6387, "step": 14400 }, { "epoch": 498.2758620689655, "grad_norm": 9.92000961303711, "learning_rate": 3.3399632099333186e-05, "loss": 0.6569, "step": 14450 }, { "epoch": 500.0, "grad_norm": 22.76459312438965, "learning_rate": 3.3342147620142564e-05, "loss": 0.6338, "step": 14500 }, { "epoch": 501.7241379310345, "grad_norm": 12.7587251663208, "learning_rate": 3.328466314095194e-05, "loss": 0.6318, "step": 14550 }, { "epoch": 503.44827586206895, "grad_norm": 8.93956470489502, "learning_rate": 3.322717866176133e-05, "loss": 0.6235, "step": 14600 }, { "epoch": 505.17241379310343, "grad_norm": 11.803091049194336, "learning_rate": 3.316969418257071e-05, "loss": 0.6379, "step": 14650 }, { "epoch": 506.8965517241379, "grad_norm": 12.328641891479492, "learning_rate": 3.3112209703380085e-05, "loss": 0.6191, "step": 14700 }, { "epoch": 508.62068965517244, "grad_norm": 13.680608749389648, "learning_rate": 3.305472522418947e-05, "loss": 0.6303, "step": 14750 }, { "epoch": 510.3448275862069, "grad_norm": 16.989259719848633, "learning_rate": 3.2997240744998856e-05, "loss": 0.6248, "step": 14800 }, { "epoch": 512.0689655172414, "grad_norm": 15.2227201461792, "learning_rate": 3.2939756265808235e-05, "loss": 0.6212, "step": 14850 }, { "epoch": 513.7931034482758, "grad_norm": 10.81242847442627, "learning_rate": 3.2882271786617613e-05, "loss": 0.7043, "step": 14900 }, { "epoch": 515.5172413793103, "grad_norm": 10.451784133911133, "learning_rate": 3.2824787307427e-05, "loss": 0.6325, "step": 14950 }, { "epoch": 517.2413793103449, "grad_norm": 9.947224617004395, "learning_rate": 3.276730282823638e-05, "loss": 0.6145, "step": 15000 }, { "epoch": 518.9655172413793, "grad_norm": 13.941021919250488, "learning_rate": 3.2709818349045756e-05, "loss": 0.6287, "step": 15050 }, { "epoch": 520.6896551724138, "grad_norm": 10.099722862243652, "learning_rate": 3.265233386985514e-05, "loss": 0.6113, "step": 15100 }, { "epoch": 522.4137931034483, "grad_norm": 10.102798461914062, "learning_rate": 3.259484939066452e-05, "loss": 0.5946, "step": 15150 }, { "epoch": 524.1379310344828, "grad_norm": 10.795266151428223, "learning_rate": 3.2537364911473905e-05, "loss": 0.6297, "step": 15200 }, { "epoch": 525.8620689655172, "grad_norm": 13.499300003051758, "learning_rate": 3.2479880432283284e-05, "loss": 0.6155, "step": 15250 }, { "epoch": 527.5862068965517, "grad_norm": 12.059775352478027, "learning_rate": 3.242239595309266e-05, "loss": 0.6437, "step": 15300 }, { "epoch": 529.3103448275862, "grad_norm": 9.141693115234375, "learning_rate": 3.236491147390205e-05, "loss": 0.6, "step": 15350 }, { "epoch": 531.0344827586207, "grad_norm": 11.43187141418457, "learning_rate": 3.2307426994711433e-05, "loss": 0.582, "step": 15400 }, { "epoch": 532.7586206896551, "grad_norm": 13.33201789855957, "learning_rate": 3.224994251552081e-05, "loss": 0.6107, "step": 15450 }, { "epoch": 534.4827586206897, "grad_norm": 9.781421661376953, "learning_rate": 3.219245803633019e-05, "loss": 0.5985, "step": 15500 }, { "epoch": 536.2068965517242, "grad_norm": 9.903817176818848, "learning_rate": 3.2134973557139576e-05, "loss": 0.6138, "step": 15550 }, { "epoch": 537.9310344827586, "grad_norm": 10.612358093261719, "learning_rate": 3.2077489077948955e-05, "loss": 0.644, "step": 15600 }, { "epoch": 539.6551724137931, "grad_norm": 14.504125595092773, "learning_rate": 3.202000459875833e-05, "loss": 0.5982, "step": 15650 }, { "epoch": 541.3793103448276, "grad_norm": 17.734477996826172, "learning_rate": 3.196252011956772e-05, "loss": 0.5866, "step": 15700 }, { "epoch": 543.1034482758621, "grad_norm": 10.594564437866211, "learning_rate": 3.1905035640377104e-05, "loss": 0.5716, "step": 15750 }, { "epoch": 544.8275862068965, "grad_norm": 9.165364265441895, "learning_rate": 3.1847551161186476e-05, "loss": 0.6101, "step": 15800 }, { "epoch": 546.551724137931, "grad_norm": 14.510986328125, "learning_rate": 3.179006668199586e-05, "loss": 0.6006, "step": 15850 }, { "epoch": 548.2758620689655, "grad_norm": 15.641995429992676, "learning_rate": 3.173258220280525e-05, "loss": 0.6281, "step": 15900 }, { "epoch": 550.0, "grad_norm": 20.696884155273438, "learning_rate": 3.1675097723614625e-05, "loss": 0.5741, "step": 15950 }, { "epoch": 551.7241379310345, "grad_norm": 7.9419708251953125, "learning_rate": 3.1617613244424004e-05, "loss": 0.6172, "step": 16000 }, { "epoch": 553.448275862069, "grad_norm": 9.974145889282227, "learning_rate": 3.156012876523339e-05, "loss": 0.5883, "step": 16050 }, { "epoch": 555.1724137931035, "grad_norm": 9.277832984924316, "learning_rate": 3.150264428604277e-05, "loss": 0.5961, "step": 16100 }, { "epoch": 556.8965517241379, "grad_norm": 24.026142120361328, "learning_rate": 3.1445159806852153e-05, "loss": 0.5875, "step": 16150 }, { "epoch": 558.6206896551724, "grad_norm": 11.789554595947266, "learning_rate": 3.138767532766153e-05, "loss": 0.5904, "step": 16200 }, { "epoch": 560.3448275862069, "grad_norm": 8.030414581298828, "learning_rate": 3.133019084847091e-05, "loss": 0.6161, "step": 16250 }, { "epoch": 562.0689655172414, "grad_norm": 10.811291694641113, "learning_rate": 3.1272706369280296e-05, "loss": 0.5665, "step": 16300 }, { "epoch": 563.7931034482758, "grad_norm": 8.506341934204102, "learning_rate": 3.121522189008968e-05, "loss": 0.5799, "step": 16350 }, { "epoch": 565.5172413793103, "grad_norm": 13.202093124389648, "learning_rate": 3.115773741089906e-05, "loss": 0.5788, "step": 16400 }, { "epoch": 567.2413793103449, "grad_norm": 18.485050201416016, "learning_rate": 3.110025293170844e-05, "loss": 0.5878, "step": 16450 }, { "epoch": 568.9655172413793, "grad_norm": 16.01526641845703, "learning_rate": 3.1042768452517824e-05, "loss": 0.5674, "step": 16500 }, { "epoch": 570.6896551724138, "grad_norm": 11.409168243408203, "learning_rate": 3.09852839733272e-05, "loss": 0.585, "step": 16550 }, { "epoch": 572.4137931034483, "grad_norm": 14.99613094329834, "learning_rate": 3.092779949413658e-05, "loss": 0.6, "step": 16600 }, { "epoch": 574.1379310344828, "grad_norm": 9.071861267089844, "learning_rate": 3.087031501494597e-05, "loss": 0.6105, "step": 16650 }, { "epoch": 575.8620689655172, "grad_norm": 12.799962043762207, "learning_rate": 3.081283053575535e-05, "loss": 0.6003, "step": 16700 }, { "epoch": 577.5862068965517, "grad_norm": 16.909852981567383, "learning_rate": 3.0755346056564724e-05, "loss": 0.5666, "step": 16750 }, { "epoch": 579.3103448275862, "grad_norm": 6.847039699554443, "learning_rate": 3.069786157737411e-05, "loss": 0.5707, "step": 16800 }, { "epoch": 581.0344827586207, "grad_norm": 10.43884563446045, "learning_rate": 3.0640377098183495e-05, "loss": 0.557, "step": 16850 }, { "epoch": 582.7586206896551, "grad_norm": 10.08051872253418, "learning_rate": 3.058289261899287e-05, "loss": 0.5633, "step": 16900 }, { "epoch": 584.4827586206897, "grad_norm": 10.643617630004883, "learning_rate": 3.052540813980225e-05, "loss": 0.5572, "step": 16950 }, { "epoch": 586.2068965517242, "grad_norm": 10.702605247497559, "learning_rate": 3.0467923660611637e-05, "loss": 0.571, "step": 17000 }, { "epoch": 587.9310344827586, "grad_norm": 12.869645118713379, "learning_rate": 3.041043918142102e-05, "loss": 0.5632, "step": 17050 }, { "epoch": 589.6551724137931, "grad_norm": 13.873844146728516, "learning_rate": 3.03529547022304e-05, "loss": 0.5834, "step": 17100 }, { "epoch": 591.3793103448276, "grad_norm": 6.18317174911499, "learning_rate": 3.029547022303978e-05, "loss": 0.5213, "step": 17150 }, { "epoch": 593.1034482758621, "grad_norm": 9.573638916015625, "learning_rate": 3.0237985743849162e-05, "loss": 0.5435, "step": 17200 }, { "epoch": 594.8275862068965, "grad_norm": 8.570283889770508, "learning_rate": 3.0180501264658544e-05, "loss": 0.5404, "step": 17250 }, { "epoch": 596.551724137931, "grad_norm": 14.95202350616455, "learning_rate": 3.0123016785467923e-05, "loss": 0.556, "step": 17300 }, { "epoch": 598.2758620689655, "grad_norm": 8.30759048461914, "learning_rate": 3.0065532306277305e-05, "loss": 0.5619, "step": 17350 }, { "epoch": 600.0, "grad_norm": 9.298309326171875, "learning_rate": 3.0008047827086687e-05, "loss": 0.5656, "step": 17400 }, { "epoch": 601.7241379310345, "grad_norm": 8.85986614227295, "learning_rate": 2.9950563347896072e-05, "loss": 0.5332, "step": 17450 }, { "epoch": 603.448275862069, "grad_norm": 10.735071182250977, "learning_rate": 2.9893078868705447e-05, "loss": 0.5226, "step": 17500 }, { "epoch": 605.1724137931035, "grad_norm": 8.513005256652832, "learning_rate": 2.9835594389514833e-05, "loss": 0.5476, "step": 17550 }, { "epoch": 606.8965517241379, "grad_norm": 15.68436050415039, "learning_rate": 2.9778109910324215e-05, "loss": 0.5487, "step": 17600 }, { "epoch": 608.6206896551724, "grad_norm": 8.542695999145508, "learning_rate": 2.9720625431133597e-05, "loss": 0.537, "step": 17650 }, { "epoch": 610.3448275862069, "grad_norm": 12.077261924743652, "learning_rate": 2.9663140951942975e-05, "loss": 0.5163, "step": 17700 }, { "epoch": 612.0689655172414, "grad_norm": 10.408947944641113, "learning_rate": 2.9605656472752357e-05, "loss": 0.5268, "step": 17750 }, { "epoch": 613.7931034482758, "grad_norm": 13.493642807006836, "learning_rate": 2.954817199356174e-05, "loss": 0.5378, "step": 17800 }, { "epoch": 615.5172413793103, "grad_norm": 6.730158805847168, "learning_rate": 2.9490687514371125e-05, "loss": 0.5226, "step": 17850 }, { "epoch": 617.2413793103449, "grad_norm": 7.061563491821289, "learning_rate": 2.94332030351805e-05, "loss": 0.4977, "step": 17900 }, { "epoch": 618.9655172413793, "grad_norm": 12.697237968444824, "learning_rate": 2.9375718555989885e-05, "loss": 0.5486, "step": 17950 }, { "epoch": 620.6896551724138, "grad_norm": 11.321379661560059, "learning_rate": 2.9318234076799267e-05, "loss": 0.5267, "step": 18000 }, { "epoch": 622.4137931034483, "grad_norm": 9.749813079833984, "learning_rate": 2.9260749597608646e-05, "loss": 0.5198, "step": 18050 }, { "epoch": 624.1379310344828, "grad_norm": 11.106800079345703, "learning_rate": 2.9203265118418028e-05, "loss": 0.5298, "step": 18100 }, { "epoch": 625.8620689655172, "grad_norm": 9.838105201721191, "learning_rate": 2.914578063922741e-05, "loss": 0.5305, "step": 18150 }, { "epoch": 627.5862068965517, "grad_norm": 14.010879516601562, "learning_rate": 2.9088296160036792e-05, "loss": 0.4954, "step": 18200 }, { "epoch": 629.3103448275862, "grad_norm": 8.32265853881836, "learning_rate": 2.903081168084617e-05, "loss": 0.5003, "step": 18250 }, { "epoch": 631.0344827586207, "grad_norm": 9.698287010192871, "learning_rate": 2.8973327201655553e-05, "loss": 0.5141, "step": 18300 }, { "epoch": 632.7586206896551, "grad_norm": 15.5342435836792, "learning_rate": 2.8915842722464938e-05, "loss": 0.4884, "step": 18350 }, { "epoch": 634.4827586206897, "grad_norm": 7.024100303649902, "learning_rate": 2.885835824327432e-05, "loss": 0.5096, "step": 18400 }, { "epoch": 636.2068965517242, "grad_norm": 9.926061630249023, "learning_rate": 2.8800873764083695e-05, "loss": 0.5345, "step": 18450 }, { "epoch": 637.9310344827586, "grad_norm": 6.2884111404418945, "learning_rate": 2.874338928489308e-05, "loss": 0.5186, "step": 18500 }, { "epoch": 639.6551724137931, "grad_norm": 8.967951774597168, "learning_rate": 2.8685904805702463e-05, "loss": 0.5079, "step": 18550 }, { "epoch": 641.3793103448276, "grad_norm": 12.651328086853027, "learning_rate": 2.8628420326511845e-05, "loss": 0.5414, "step": 18600 }, { "epoch": 643.1034482758621, "grad_norm": 6.481184005737305, "learning_rate": 2.8570935847321223e-05, "loss": 0.5155, "step": 18650 }, { "epoch": 644.8275862068965, "grad_norm": 11.71776008605957, "learning_rate": 2.8513451368130605e-05, "loss": 0.5151, "step": 18700 }, { "epoch": 646.551724137931, "grad_norm": 14.073832511901855, "learning_rate": 2.8455966888939987e-05, "loss": 0.541, "step": 18750 }, { "epoch": 648.2758620689655, "grad_norm": 13.915197372436523, "learning_rate": 2.8398482409749373e-05, "loss": 0.5277, "step": 18800 }, { "epoch": 650.0, "grad_norm": 16.9539852142334, "learning_rate": 2.8340997930558748e-05, "loss": 0.5225, "step": 18850 }, { "epoch": 651.7241379310345, "grad_norm": 9.616743087768555, "learning_rate": 2.8283513451368133e-05, "loss": 0.4941, "step": 18900 }, { "epoch": 653.448275862069, "grad_norm": 12.190633773803711, "learning_rate": 2.8226028972177515e-05, "loss": 0.5134, "step": 18950 }, { "epoch": 655.1724137931035, "grad_norm": 9.082640647888184, "learning_rate": 2.8168544492986894e-05, "loss": 0.5099, "step": 19000 }, { "epoch": 656.8965517241379, "grad_norm": 10.562954902648926, "learning_rate": 2.8111060013796276e-05, "loss": 0.496, "step": 19050 }, { "epoch": 658.6206896551724, "grad_norm": 10.206488609313965, "learning_rate": 2.8053575534605658e-05, "loss": 0.4915, "step": 19100 }, { "epoch": 660.3448275862069, "grad_norm": 14.335347175598145, "learning_rate": 2.799609105541504e-05, "loss": 0.5051, "step": 19150 }, { "epoch": 662.0689655172414, "grad_norm": 9.336698532104492, "learning_rate": 2.793860657622442e-05, "loss": 0.4982, "step": 19200 }, { "epoch": 663.7931034482758, "grad_norm": 13.002731323242188, "learning_rate": 2.78811220970338e-05, "loss": 0.4912, "step": 19250 }, { "epoch": 665.5172413793103, "grad_norm": 11.235210418701172, "learning_rate": 2.7823637617843186e-05, "loss": 0.4788, "step": 19300 }, { "epoch": 667.2413793103449, "grad_norm": 7.61260986328125, "learning_rate": 2.7766153138652568e-05, "loss": 0.5087, "step": 19350 }, { "epoch": 668.9655172413793, "grad_norm": 10.798530578613281, "learning_rate": 2.7708668659461943e-05, "loss": 0.5296, "step": 19400 }, { "epoch": 670.6896551724138, "grad_norm": 15.151008605957031, "learning_rate": 2.765118418027133e-05, "loss": 0.5101, "step": 19450 }, { "epoch": 672.4137931034483, "grad_norm": 13.777518272399902, "learning_rate": 2.759369970108071e-05, "loss": 0.4999, "step": 19500 }, { "epoch": 674.1379310344828, "grad_norm": 15.031306266784668, "learning_rate": 2.7536215221890093e-05, "loss": 0.5042, "step": 19550 }, { "epoch": 675.8620689655172, "grad_norm": 15.911823272705078, "learning_rate": 2.747873074269947e-05, "loss": 0.5026, "step": 19600 }, { "epoch": 677.5862068965517, "grad_norm": 10.616887092590332, "learning_rate": 2.7421246263508853e-05, "loss": 0.484, "step": 19650 }, { "epoch": 679.3103448275862, "grad_norm": 13.158663749694824, "learning_rate": 2.736376178431824e-05, "loss": 0.4784, "step": 19700 }, { "epoch": 681.0344827586207, "grad_norm": 10.053190231323242, "learning_rate": 2.7306277305127614e-05, "loss": 0.5015, "step": 19750 }, { "epoch": 682.7586206896551, "grad_norm": 9.748899459838867, "learning_rate": 2.7248792825936996e-05, "loss": 0.4934, "step": 19800 }, { "epoch": 684.4827586206897, "grad_norm": 7.784812927246094, "learning_rate": 2.719130834674638e-05, "loss": 0.5095, "step": 19850 }, { "epoch": 686.2068965517242, "grad_norm": 12.4750394821167, "learning_rate": 2.7133823867555763e-05, "loss": 0.4474, "step": 19900 }, { "epoch": 687.9310344827586, "grad_norm": 10.724108695983887, "learning_rate": 2.7076339388365142e-05, "loss": 0.5131, "step": 19950 }, { "epoch": 689.6551724137931, "grad_norm": 12.447121620178223, "learning_rate": 2.7018854909174524e-05, "loss": 0.4914, "step": 20000 }, { "epoch": 691.3793103448276, "grad_norm": 11.685606956481934, "learning_rate": 2.6961370429983906e-05, "loss": 0.4877, "step": 20050 }, { "epoch": 693.1034482758621, "grad_norm": 17.459991455078125, "learning_rate": 2.6903885950793288e-05, "loss": 0.5206, "step": 20100 }, { "epoch": 694.8275862068965, "grad_norm": 6.652616500854492, "learning_rate": 2.6846401471602667e-05, "loss": 0.4631, "step": 20150 }, { "epoch": 696.551724137931, "grad_norm": 10.871857643127441, "learning_rate": 2.678891699241205e-05, "loss": 0.5033, "step": 20200 }, { "epoch": 698.2758620689655, "grad_norm": 11.187660217285156, "learning_rate": 2.6731432513221434e-05, "loss": 0.4908, "step": 20250 }, { "epoch": 700.0, "grad_norm": 8.734025001525879, "learning_rate": 2.6673948034030816e-05, "loss": 0.4921, "step": 20300 }, { "epoch": 701.7241379310345, "grad_norm": 11.996895790100098, "learning_rate": 2.6616463554840195e-05, "loss": 0.4651, "step": 20350 }, { "epoch": 703.448275862069, "grad_norm": 9.585247993469238, "learning_rate": 2.6558979075649577e-05, "loss": 0.4862, "step": 20400 }, { "epoch": 705.1724137931035, "grad_norm": 11.085442543029785, "learning_rate": 2.650149459645896e-05, "loss": 0.4882, "step": 20450 }, { "epoch": 706.8965517241379, "grad_norm": 10.932337760925293, "learning_rate": 2.6444010117268337e-05, "loss": 0.4679, "step": 20500 }, { "epoch": 708.6206896551724, "grad_norm": 7.5849456787109375, "learning_rate": 2.638652563807772e-05, "loss": 0.472, "step": 20550 }, { "epoch": 710.3448275862069, "grad_norm": 9.400444030761719, "learning_rate": 2.63290411588871e-05, "loss": 0.4908, "step": 20600 }, { "epoch": 712.0689655172414, "grad_norm": 9.45374584197998, "learning_rate": 2.6271556679696487e-05, "loss": 0.4725, "step": 20650 }, { "epoch": 713.7931034482758, "grad_norm": 8.45836353302002, "learning_rate": 2.6214072200505862e-05, "loss": 0.4473, "step": 20700 }, { "epoch": 715.5172413793103, "grad_norm": 6.153685569763184, "learning_rate": 2.6156587721315244e-05, "loss": 0.4938, "step": 20750 }, { "epoch": 717.2413793103449, "grad_norm": 7.458461761474609, "learning_rate": 2.609910324212463e-05, "loss": 0.4613, "step": 20800 }, { "epoch": 718.9655172413793, "grad_norm": 9.330913543701172, "learning_rate": 2.604161876293401e-05, "loss": 0.4667, "step": 20850 }, { "epoch": 720.6896551724138, "grad_norm": 13.142263412475586, "learning_rate": 2.598413428374339e-05, "loss": 0.4685, "step": 20900 }, { "epoch": 722.4137931034483, "grad_norm": 7.8271403312683105, "learning_rate": 2.5926649804552772e-05, "loss": 0.476, "step": 20950 }, { "epoch": 724.1379310344828, "grad_norm": 11.601402282714844, "learning_rate": 2.5869165325362154e-05, "loss": 0.4825, "step": 21000 }, { "epoch": 725.8620689655172, "grad_norm": 8.06040096282959, "learning_rate": 2.5811680846171536e-05, "loss": 0.4792, "step": 21050 }, { "epoch": 727.5862068965517, "grad_norm": 10.636070251464844, "learning_rate": 2.5754196366980915e-05, "loss": 0.4499, "step": 21100 }, { "epoch": 729.3103448275862, "grad_norm": 16.24286651611328, "learning_rate": 2.5696711887790297e-05, "loss": 0.4772, "step": 21150 }, { "epoch": 731.0344827586207, "grad_norm": 7.462520599365234, "learning_rate": 2.5639227408599682e-05, "loss": 0.4538, "step": 21200 }, { "epoch": 732.7586206896551, "grad_norm": 15.53167724609375, "learning_rate": 2.5581742929409057e-05, "loss": 0.4963, "step": 21250 }, { "epoch": 734.4827586206897, "grad_norm": 12.614178657531738, "learning_rate": 2.5524258450218443e-05, "loss": 0.5034, "step": 21300 }, { "epoch": 736.2068965517242, "grad_norm": 8.404461860656738, "learning_rate": 2.5466773971027825e-05, "loss": 0.4886, "step": 21350 }, { "epoch": 737.9310344827586, "grad_norm": 8.928647994995117, "learning_rate": 2.5409289491837207e-05, "loss": 0.4491, "step": 21400 }, { "epoch": 739.6551724137931, "grad_norm": 16.93730354309082, "learning_rate": 2.5351805012646585e-05, "loss": 0.4428, "step": 21450 }, { "epoch": 741.3793103448276, "grad_norm": 9.481078147888184, "learning_rate": 2.5294320533455967e-05, "loss": 0.466, "step": 21500 }, { "epoch": 743.1034482758621, "grad_norm": 6.316746711730957, "learning_rate": 2.523683605426535e-05, "loss": 0.4675, "step": 21550 }, { "epoch": 744.8275862068965, "grad_norm": 12.559989929199219, "learning_rate": 2.5179351575074735e-05, "loss": 0.4613, "step": 21600 }, { "epoch": 746.551724137931, "grad_norm": 10.660043716430664, "learning_rate": 2.512186709588411e-05, "loss": 0.4841, "step": 21650 }, { "epoch": 748.2758620689655, "grad_norm": 14.807117462158203, "learning_rate": 2.5064382616693495e-05, "loss": 0.4608, "step": 21700 }, { "epoch": 750.0, "grad_norm": 18.719385147094727, "learning_rate": 2.5006898137502877e-05, "loss": 0.4913, "step": 21750 }, { "epoch": 751.7241379310345, "grad_norm": 17.217443466186523, "learning_rate": 2.4949413658312256e-05, "loss": 0.4927, "step": 21800 }, { "epoch": 753.448275862069, "grad_norm": 20.091716766357422, "learning_rate": 2.489192917912164e-05, "loss": 0.4494, "step": 21850 }, { "epoch": 755.1724137931035, "grad_norm": 9.750120162963867, "learning_rate": 2.483444469993102e-05, "loss": 0.4733, "step": 21900 }, { "epoch": 756.8965517241379, "grad_norm": 5.633036136627197, "learning_rate": 2.47769602207404e-05, "loss": 0.4595, "step": 21950 }, { "epoch": 758.6206896551724, "grad_norm": 8.773770332336426, "learning_rate": 2.4719475741549784e-05, "loss": 0.4362, "step": 22000 }, { "epoch": 760.3448275862069, "grad_norm": 10.554413795471191, "learning_rate": 2.4661991262359163e-05, "loss": 0.4564, "step": 22050 }, { "epoch": 762.0689655172414, "grad_norm": 9.440986633300781, "learning_rate": 2.4604506783168545e-05, "loss": 0.4354, "step": 22100 }, { "epoch": 763.7931034482758, "grad_norm": 16.867412567138672, "learning_rate": 2.4547022303977927e-05, "loss": 0.4712, "step": 22150 }, { "epoch": 765.5172413793103, "grad_norm": 14.023324966430664, "learning_rate": 2.448953782478731e-05, "loss": 0.4557, "step": 22200 }, { "epoch": 767.2413793103449, "grad_norm": 10.579647064208984, "learning_rate": 2.443205334559669e-05, "loss": 0.46, "step": 22250 }, { "epoch": 768.9655172413793, "grad_norm": 13.71581745147705, "learning_rate": 2.4374568866406073e-05, "loss": 0.4446, "step": 22300 }, { "epoch": 770.6896551724138, "grad_norm": 9.531046867370605, "learning_rate": 2.431708438721545e-05, "loss": 0.4804, "step": 22350 }, { "epoch": 772.4137931034483, "grad_norm": 10.61509895324707, "learning_rate": 2.4259599908024837e-05, "loss": 0.4431, "step": 22400 }, { "epoch": 774.1379310344828, "grad_norm": 7.44507360458374, "learning_rate": 2.4202115428834215e-05, "loss": 0.4403, "step": 22450 }, { "epoch": 775.8620689655172, "grad_norm": 8.763456344604492, "learning_rate": 2.4144630949643597e-05, "loss": 0.4145, "step": 22500 }, { "epoch": 777.5862068965517, "grad_norm": 11.339861869812012, "learning_rate": 2.408714647045298e-05, "loss": 0.4451, "step": 22550 }, { "epoch": 779.3103448275862, "grad_norm": 9.188865661621094, "learning_rate": 2.402966199126236e-05, "loss": 0.4272, "step": 22600 }, { "epoch": 781.0344827586207, "grad_norm": 9.83092212677002, "learning_rate": 2.3972177512071743e-05, "loss": 0.4422, "step": 22650 }, { "epoch": 782.7586206896551, "grad_norm": 9.990921974182129, "learning_rate": 2.3914693032881122e-05, "loss": 0.4324, "step": 22700 }, { "epoch": 784.4827586206897, "grad_norm": 13.08397102355957, "learning_rate": 2.3857208553690504e-05, "loss": 0.488, "step": 22750 }, { "epoch": 786.2068965517242, "grad_norm": 11.852884292602539, "learning_rate": 2.3799724074499886e-05, "loss": 0.4152, "step": 22800 }, { "epoch": 787.9310344827586, "grad_norm": 8.267644882202148, "learning_rate": 2.3742239595309268e-05, "loss": 0.4517, "step": 22850 }, { "epoch": 789.6551724137931, "grad_norm": 10.276979446411133, "learning_rate": 2.3684755116118646e-05, "loss": 0.4637, "step": 22900 }, { "epoch": 791.3793103448276, "grad_norm": 18.616357803344727, "learning_rate": 2.3627270636928032e-05, "loss": 0.4545, "step": 22950 }, { "epoch": 793.1034482758621, "grad_norm": 14.108061790466309, "learning_rate": 2.356978615773741e-05, "loss": 0.4826, "step": 23000 }, { "epoch": 794.8275862068965, "grad_norm": 29.307893753051758, "learning_rate": 2.3512301678546796e-05, "loss": 0.4551, "step": 23050 }, { "epoch": 796.551724137931, "grad_norm": 16.642671585083008, "learning_rate": 2.3454817199356175e-05, "loss": 0.4705, "step": 23100 }, { "epoch": 798.2758620689655, "grad_norm": 7.182180404663086, "learning_rate": 2.3397332720165557e-05, "loss": 0.4261, "step": 23150 }, { "epoch": 800.0, "grad_norm": 13.219466209411621, "learning_rate": 2.333984824097494e-05, "loss": 0.4502, "step": 23200 }, { "epoch": 801.7241379310345, "grad_norm": 9.657610893249512, "learning_rate": 2.328236376178432e-05, "loss": 0.4471, "step": 23250 }, { "epoch": 803.448275862069, "grad_norm": 10.877338409423828, "learning_rate": 2.32248792825937e-05, "loss": 0.4442, "step": 23300 }, { "epoch": 805.1724137931035, "grad_norm": 9.863310813903809, "learning_rate": 2.3167394803403085e-05, "loss": 0.4403, "step": 23350 }, { "epoch": 806.8965517241379, "grad_norm": 12.387887954711914, "learning_rate": 2.3109910324212463e-05, "loss": 0.4701, "step": 23400 }, { "epoch": 808.6206896551724, "grad_norm": 15.360872268676758, "learning_rate": 2.3052425845021845e-05, "loss": 0.4633, "step": 23450 }, { "epoch": 810.3448275862069, "grad_norm": 10.893658638000488, "learning_rate": 2.2994941365831227e-05, "loss": 0.4368, "step": 23500 }, { "epoch": 812.0689655172414, "grad_norm": 10.36801528930664, "learning_rate": 2.2937456886640606e-05, "loss": 0.4329, "step": 23550 }, { "epoch": 813.7931034482758, "grad_norm": 10.813234329223633, "learning_rate": 2.287997240744999e-05, "loss": 0.4562, "step": 23600 }, { "epoch": 815.5172413793103, "grad_norm": 13.257786750793457, "learning_rate": 2.282248792825937e-05, "loss": 0.4632, "step": 23650 }, { "epoch": 817.2413793103449, "grad_norm": 12.036748886108398, "learning_rate": 2.2765003449068752e-05, "loss": 0.3979, "step": 23700 }, { "epoch": 818.9655172413793, "grad_norm": 12.847742080688477, "learning_rate": 2.2707518969878134e-05, "loss": 0.4563, "step": 23750 }, { "epoch": 820.6896551724138, "grad_norm": 6.0036301612854, "learning_rate": 2.2650034490687516e-05, "loss": 0.4298, "step": 23800 }, { "epoch": 822.4137931034483, "grad_norm": 9.15625286102295, "learning_rate": 2.2592550011496898e-05, "loss": 0.4205, "step": 23850 }, { "epoch": 824.1379310344828, "grad_norm": 9.822038650512695, "learning_rate": 2.253506553230628e-05, "loss": 0.4297, "step": 23900 }, { "epoch": 825.8620689655172, "grad_norm": 8.86760139465332, "learning_rate": 2.247758105311566e-05, "loss": 0.406, "step": 23950 }, { "epoch": 827.5862068965517, "grad_norm": 12.85793399810791, "learning_rate": 2.2420096573925044e-05, "loss": 0.4546, "step": 24000 }, { "epoch": 829.3103448275862, "grad_norm": 5.669714450836182, "learning_rate": 2.2362612094734422e-05, "loss": 0.4504, "step": 24050 }, { "epoch": 831.0344827586207, "grad_norm": 16.406763076782227, "learning_rate": 2.2305127615543804e-05, "loss": 0.4463, "step": 24100 }, { "epoch": 832.7586206896551, "grad_norm": 10.400687217712402, "learning_rate": 2.2247643136353186e-05, "loss": 0.4264, "step": 24150 }, { "epoch": 834.4827586206897, "grad_norm": 11.179044723510742, "learning_rate": 2.2190158657162565e-05, "loss": 0.4427, "step": 24200 }, { "epoch": 836.2068965517242, "grad_norm": 8.038444519042969, "learning_rate": 2.2132674177971947e-05, "loss": 0.4226, "step": 24250 }, { "epoch": 837.9310344827586, "grad_norm": 9.589123725891113, "learning_rate": 2.207518969878133e-05, "loss": 0.4283, "step": 24300 }, { "epoch": 839.6551724137931, "grad_norm": 9.089820861816406, "learning_rate": 2.201770521959071e-05, "loss": 0.4145, "step": 24350 }, { "epoch": 841.3793103448276, "grad_norm": 15.860547065734863, "learning_rate": 2.1960220740400093e-05, "loss": 0.4167, "step": 24400 }, { "epoch": 843.1034482758621, "grad_norm": 5.311263561248779, "learning_rate": 2.1902736261209475e-05, "loss": 0.4296, "step": 24450 }, { "epoch": 844.8275862068965, "grad_norm": 14.320064544677734, "learning_rate": 2.1845251782018854e-05, "loss": 0.4174, "step": 24500 }, { "epoch": 846.551724137931, "grad_norm": 5.604402542114258, "learning_rate": 2.178776730282824e-05, "loss": 0.4252, "step": 24550 }, { "epoch": 848.2758620689655, "grad_norm": 13.035962104797363, "learning_rate": 2.1730282823637618e-05, "loss": 0.4217, "step": 24600 }, { "epoch": 850.0, "grad_norm": 8.706709861755371, "learning_rate": 2.1672798344447e-05, "loss": 0.4272, "step": 24650 }, { "epoch": 851.7241379310345, "grad_norm": 12.834080696105957, "learning_rate": 2.1615313865256382e-05, "loss": 0.428, "step": 24700 }, { "epoch": 853.448275862069, "grad_norm": 15.700016021728516, "learning_rate": 2.1557829386065764e-05, "loss": 0.444, "step": 24750 }, { "epoch": 855.1724137931035, "grad_norm": 8.073234558105469, "learning_rate": 2.1500344906875146e-05, "loss": 0.4053, "step": 24800 }, { "epoch": 856.8965517241379, "grad_norm": 8.428800582885742, "learning_rate": 2.1442860427684528e-05, "loss": 0.4073, "step": 24850 }, { "epoch": 858.6206896551724, "grad_norm": 11.688020706176758, "learning_rate": 2.1385375948493906e-05, "loss": 0.4273, "step": 24900 }, { "epoch": 860.3448275862069, "grad_norm": 7.634262561798096, "learning_rate": 2.1327891469303292e-05, "loss": 0.4026, "step": 24950 }, { "epoch": 862.0689655172414, "grad_norm": 8.20329475402832, "learning_rate": 2.127040699011267e-05, "loss": 0.4196, "step": 25000 }, { "epoch": 863.7931034482758, "grad_norm": 9.971654891967773, "learning_rate": 2.1212922510922052e-05, "loss": 0.4128, "step": 25050 }, { "epoch": 865.5172413793103, "grad_norm": 9.349475860595703, "learning_rate": 2.1155438031731434e-05, "loss": 0.4097, "step": 25100 }, { "epoch": 867.2413793103449, "grad_norm": 13.595865249633789, "learning_rate": 2.1097953552540813e-05, "loss": 0.4205, "step": 25150 }, { "epoch": 868.9655172413793, "grad_norm": 9.110448837280273, "learning_rate": 2.10404690733502e-05, "loss": 0.4078, "step": 25200 }, { "epoch": 870.6896551724138, "grad_norm": 12.512377738952637, "learning_rate": 2.0982984594159577e-05, "loss": 0.4159, "step": 25250 }, { "epoch": 872.4137931034483, "grad_norm": 12.327591896057129, "learning_rate": 2.092550011496896e-05, "loss": 0.4347, "step": 25300 }, { "epoch": 874.1379310344828, "grad_norm": 12.339853286743164, "learning_rate": 2.086801563577834e-05, "loss": 0.4085, "step": 25350 }, { "epoch": 875.8620689655172, "grad_norm": 13.274292945861816, "learning_rate": 2.0810531156587723e-05, "loss": 0.4067, "step": 25400 }, { "epoch": 877.5862068965517, "grad_norm": 16.93192481994629, "learning_rate": 2.0753046677397102e-05, "loss": 0.4062, "step": 25450 }, { "epoch": 879.3103448275862, "grad_norm": 14.042583465576172, "learning_rate": 2.0695562198206487e-05, "loss": 0.4282, "step": 25500 }, { "epoch": 881.0344827586207, "grad_norm": 5.028243064880371, "learning_rate": 2.0638077719015866e-05, "loss": 0.416, "step": 25550 }, { "epoch": 882.7586206896551, "grad_norm": 12.229650497436523, "learning_rate": 2.0580593239825248e-05, "loss": 0.4044, "step": 25600 }, { "epoch": 884.4827586206897, "grad_norm": 18.598133087158203, "learning_rate": 2.052310876063463e-05, "loss": 0.4182, "step": 25650 }, { "epoch": 886.2068965517242, "grad_norm": 8.576728820800781, "learning_rate": 2.0465624281444012e-05, "loss": 0.3858, "step": 25700 }, { "epoch": 887.9310344827586, "grad_norm": 8.721904754638672, "learning_rate": 2.0408139802253394e-05, "loss": 0.4086, "step": 25750 }, { "epoch": 889.6551724137931, "grad_norm": 9.718511581420898, "learning_rate": 2.0350655323062772e-05, "loss": 0.4179, "step": 25800 }, { "epoch": 891.3793103448276, "grad_norm": 10.51516056060791, "learning_rate": 2.0293170843872154e-05, "loss": 0.4115, "step": 25850 }, { "epoch": 893.1034482758621, "grad_norm": 10.537981033325195, "learning_rate": 2.0235686364681536e-05, "loss": 0.4257, "step": 25900 }, { "epoch": 894.8275862068965, "grad_norm": 7.222978115081787, "learning_rate": 2.017820188549092e-05, "loss": 0.4136, "step": 25950 }, { "epoch": 896.551724137931, "grad_norm": 9.49518871307373, "learning_rate": 2.01207174063003e-05, "loss": 0.4028, "step": 26000 }, { "epoch": 898.2758620689655, "grad_norm": 8.481707572937012, "learning_rate": 2.0063232927109682e-05, "loss": 0.4008, "step": 26050 }, { "epoch": 900.0, "grad_norm": 8.928730964660645, "learning_rate": 2.000574844791906e-05, "loss": 0.3941, "step": 26100 }, { "epoch": 901.7241379310345, "grad_norm": 8.243491172790527, "learning_rate": 1.9948263968728446e-05, "loss": 0.4022, "step": 26150 }, { "epoch": 903.448275862069, "grad_norm": 10.242218971252441, "learning_rate": 1.9890779489537825e-05, "loss": 0.4251, "step": 26200 }, { "epoch": 905.1724137931035, "grad_norm": 8.466740608215332, "learning_rate": 1.9833295010347207e-05, "loss": 0.3911, "step": 26250 }, { "epoch": 906.8965517241379, "grad_norm": 7.11518669128418, "learning_rate": 1.977581053115659e-05, "loss": 0.4247, "step": 26300 }, { "epoch": 908.6206896551724, "grad_norm": 18.46306610107422, "learning_rate": 1.971832605196597e-05, "loss": 0.3973, "step": 26350 }, { "epoch": 910.3448275862069, "grad_norm": 7.4284162521362305, "learning_rate": 1.9660841572775353e-05, "loss": 0.3953, "step": 26400 }, { "epoch": 912.0689655172414, "grad_norm": 5.791119575500488, "learning_rate": 1.9603357093584735e-05, "loss": 0.4109, "step": 26450 }, { "epoch": 913.7931034482758, "grad_norm": 7.005152225494385, "learning_rate": 1.9545872614394114e-05, "loss": 0.4178, "step": 26500 }, { "epoch": 915.5172413793103, "grad_norm": 7.08847188949585, "learning_rate": 1.9488388135203496e-05, "loss": 0.3992, "step": 26550 }, { "epoch": 917.2413793103449, "grad_norm": 10.282743453979492, "learning_rate": 1.9430903656012878e-05, "loss": 0.4244, "step": 26600 }, { "epoch": 918.9655172413793, "grad_norm": 10.910677909851074, "learning_rate": 1.9373419176822256e-05, "loss": 0.394, "step": 26650 }, { "epoch": 920.6896551724138, "grad_norm": 11.09377670288086, "learning_rate": 1.9315934697631642e-05, "loss": 0.4111, "step": 26700 }, { "epoch": 922.4137931034483, "grad_norm": 14.740580558776855, "learning_rate": 1.925845021844102e-05, "loss": 0.3786, "step": 26750 }, { "epoch": 924.1379310344828, "grad_norm": 9.399269104003906, "learning_rate": 1.9200965739250402e-05, "loss": 0.3997, "step": 26800 }, { "epoch": 925.8620689655172, "grad_norm": 7.7314229011535645, "learning_rate": 1.9143481260059784e-05, "loss": 0.4117, "step": 26850 }, { "epoch": 927.5862068965517, "grad_norm": 10.183043479919434, "learning_rate": 1.9085996780869166e-05, "loss": 0.3976, "step": 26900 }, { "epoch": 929.3103448275862, "grad_norm": 6.429906368255615, "learning_rate": 1.902851230167855e-05, "loss": 0.4072, "step": 26950 }, { "epoch": 931.0344827586207, "grad_norm": 7.826855659484863, "learning_rate": 1.897102782248793e-05, "loss": 0.4068, "step": 27000 }, { "epoch": 932.7586206896551, "grad_norm": 10.032449722290039, "learning_rate": 1.891354334329731e-05, "loss": 0.4159, "step": 27050 }, { "epoch": 934.4827586206897, "grad_norm": 10.631507873535156, "learning_rate": 1.8856058864106694e-05, "loss": 0.402, "step": 27100 }, { "epoch": 936.2068965517242, "grad_norm": 12.723832130432129, "learning_rate": 1.8798574384916073e-05, "loss": 0.4104, "step": 27150 }, { "epoch": 937.9310344827586, "grad_norm": 9.151418685913086, "learning_rate": 1.8741089905725455e-05, "loss": 0.3937, "step": 27200 }, { "epoch": 939.6551724137931, "grad_norm": 10.712944030761719, "learning_rate": 1.8683605426534837e-05, "loss": 0.4014, "step": 27250 }, { "epoch": 941.3793103448276, "grad_norm": 9.183157920837402, "learning_rate": 1.8626120947344216e-05, "loss": 0.4045, "step": 27300 }, { "epoch": 943.1034482758621, "grad_norm": 8.212356567382812, "learning_rate": 1.85686364681536e-05, "loss": 0.3893, "step": 27350 }, { "epoch": 944.8275862068965, "grad_norm": 11.241971969604492, "learning_rate": 1.851115198896298e-05, "loss": 0.4053, "step": 27400 }, { "epoch": 946.551724137931, "grad_norm": 7.8272247314453125, "learning_rate": 1.8453667509772362e-05, "loss": 0.3684, "step": 27450 }, { "epoch": 948.2758620689655, "grad_norm": 6.664997100830078, "learning_rate": 1.8396183030581744e-05, "loss": 0.405, "step": 27500 }, { "epoch": 950.0, "grad_norm": 12.75696849822998, "learning_rate": 1.8338698551391126e-05, "loss": 0.4076, "step": 27550 }, { "epoch": 951.7241379310345, "grad_norm": 5.724425792694092, "learning_rate": 1.8281214072200504e-05, "loss": 0.4041, "step": 27600 }, { "epoch": 953.448275862069, "grad_norm": 11.442533493041992, "learning_rate": 1.822372959300989e-05, "loss": 0.3819, "step": 27650 }, { "epoch": 955.1724137931035, "grad_norm": 10.054542541503906, "learning_rate": 1.816624511381927e-05, "loss": 0.3885, "step": 27700 }, { "epoch": 956.8965517241379, "grad_norm": 7.679254055023193, "learning_rate": 1.810876063462865e-05, "loss": 0.3982, "step": 27750 }, { "epoch": 958.6206896551724, "grad_norm": 4.477881908416748, "learning_rate": 1.8051276155438032e-05, "loss": 0.376, "step": 27800 }, { "epoch": 960.3448275862069, "grad_norm": 19.345735549926758, "learning_rate": 1.7993791676247414e-05, "loss": 0.3967, "step": 27850 }, { "epoch": 962.0689655172414, "grad_norm": 9.548508644104004, "learning_rate": 1.7936307197056796e-05, "loss": 0.4008, "step": 27900 }, { "epoch": 963.7931034482758, "grad_norm": 12.707850456237793, "learning_rate": 1.787882271786618e-05, "loss": 0.3965, "step": 27950 }, { "epoch": 965.5172413793103, "grad_norm": 8.658663749694824, "learning_rate": 1.7821338238675557e-05, "loss": 0.3876, "step": 28000 }, { "epoch": 967.2413793103449, "grad_norm": 10.078787803649902, "learning_rate": 1.776385375948494e-05, "loss": 0.3928, "step": 28050 }, { "epoch": 968.9655172413793, "grad_norm": 9.055538177490234, "learning_rate": 1.770636928029432e-05, "loss": 0.3854, "step": 28100 }, { "epoch": 970.6896551724138, "grad_norm": 8.22917366027832, "learning_rate": 1.7648884801103703e-05, "loss": 0.383, "step": 28150 }, { "epoch": 972.4137931034483, "grad_norm": 10.366366386413574, "learning_rate": 1.7591400321913085e-05, "loss": 0.4011, "step": 28200 }, { "epoch": 974.1379310344828, "grad_norm": 9.05001163482666, "learning_rate": 1.7533915842722464e-05, "loss": 0.3719, "step": 28250 }, { "epoch": 975.8620689655172, "grad_norm": 6.288949966430664, "learning_rate": 1.747643136353185e-05, "loss": 0.3988, "step": 28300 }, { "epoch": 977.5862068965517, "grad_norm": 10.313246726989746, "learning_rate": 1.7418946884341228e-05, "loss": 0.389, "step": 28350 }, { "epoch": 979.3103448275862, "grad_norm": 19.803335189819336, "learning_rate": 1.736146240515061e-05, "loss": 0.37, "step": 28400 }, { "epoch": 981.0344827586207, "grad_norm": 9.94951057434082, "learning_rate": 1.730397792595999e-05, "loss": 0.3861, "step": 28450 }, { "epoch": 982.7586206896551, "grad_norm": 6.249629974365234, "learning_rate": 1.7246493446769374e-05, "loss": 0.3964, "step": 28500 }, { "epoch": 984.4827586206897, "grad_norm": 13.551671028137207, "learning_rate": 1.7189008967578756e-05, "loss": 0.3809, "step": 28550 }, { "epoch": 986.2068965517242, "grad_norm": 9.148994445800781, "learning_rate": 1.7131524488388138e-05, "loss": 0.3856, "step": 28600 }, { "epoch": 987.9310344827586, "grad_norm": 5.7350969314575195, "learning_rate": 1.7074040009197516e-05, "loss": 0.3758, "step": 28650 }, { "epoch": 989.6551724137931, "grad_norm": 7.190326690673828, "learning_rate": 1.7016555530006902e-05, "loss": 0.3854, "step": 28700 }, { "epoch": 991.3793103448276, "grad_norm": 13.431684494018555, "learning_rate": 1.695907105081628e-05, "loss": 0.4025, "step": 28750 }, { "epoch": 993.1034482758621, "grad_norm": 9.488801956176758, "learning_rate": 1.690158657162566e-05, "loss": 0.3822, "step": 28800 }, { "epoch": 994.8275862068965, "grad_norm": 11.048343658447266, "learning_rate": 1.6844102092435044e-05, "loss": 0.3865, "step": 28850 }, { "epoch": 996.551724137931, "grad_norm": 9.490495681762695, "learning_rate": 1.6786617613244423e-05, "loss": 0.3916, "step": 28900 }, { "epoch": 998.2758620689655, "grad_norm": 9.790066719055176, "learning_rate": 1.6729133134053805e-05, "loss": 0.3733, "step": 28950 }, { "epoch": 1000.0, "grad_norm": 11.23779582977295, "learning_rate": 1.6671648654863187e-05, "loss": 0.3999, "step": 29000 }, { "epoch": 1001.7241379310345, "grad_norm": 7.197849273681641, "learning_rate": 1.661416417567257e-05, "loss": 0.4035, "step": 29050 }, { "epoch": 1003.448275862069, "grad_norm": 15.386571884155273, "learning_rate": 1.655667969648195e-05, "loss": 0.3973, "step": 29100 }, { "epoch": 1005.1724137931035, "grad_norm": 4.410562992095947, "learning_rate": 1.6499195217291333e-05, "loss": 0.3643, "step": 29150 }, { "epoch": 1006.8965517241379, "grad_norm": 6.943966388702393, "learning_rate": 1.644171073810071e-05, "loss": 0.3877, "step": 29200 }, { "epoch": 1008.6206896551724, "grad_norm": 10.435264587402344, "learning_rate": 1.6384226258910097e-05, "loss": 0.3788, "step": 29250 }, { "epoch": 1010.3448275862069, "grad_norm": 9.548276901245117, "learning_rate": 1.6326741779719476e-05, "loss": 0.4039, "step": 29300 }, { "epoch": 1012.0689655172414, "grad_norm": 6.759462356567383, "learning_rate": 1.6269257300528858e-05, "loss": 0.3849, "step": 29350 }, { "epoch": 1013.7931034482758, "grad_norm": 6.355317115783691, "learning_rate": 1.621177282133824e-05, "loss": 0.3915, "step": 29400 }, { "epoch": 1015.5172413793103, "grad_norm": 9.240543365478516, "learning_rate": 1.615428834214762e-05, "loss": 0.3918, "step": 29450 }, { "epoch": 1017.2413793103449, "grad_norm": 12.63991641998291, "learning_rate": 1.6096803862957004e-05, "loss": 0.3813, "step": 29500 }, { "epoch": 1018.9655172413793, "grad_norm": 8.079286575317383, "learning_rate": 1.6039319383766386e-05, "loss": 0.3992, "step": 29550 }, { "epoch": 1020.6896551724138, "grad_norm": 8.934220314025879, "learning_rate": 1.5981834904575764e-05, "loss": 0.3985, "step": 29600 }, { "epoch": 1022.4137931034483, "grad_norm": 10.656882286071777, "learning_rate": 1.5924350425385146e-05, "loss": 0.3739, "step": 29650 }, { "epoch": 1024.1379310344828, "grad_norm": 7.842384338378906, "learning_rate": 1.586686594619453e-05, "loss": 0.3786, "step": 29700 }, { "epoch": 1025.8620689655172, "grad_norm": 10.330281257629395, "learning_rate": 1.580938146700391e-05, "loss": 0.392, "step": 29750 }, { "epoch": 1027.5862068965516, "grad_norm": 7.076016902923584, "learning_rate": 1.5751896987813292e-05, "loss": 0.3831, "step": 29800 }, { "epoch": 1029.3103448275863, "grad_norm": 10.679615020751953, "learning_rate": 1.569441250862267e-05, "loss": 0.3816, "step": 29850 }, { "epoch": 1031.0344827586207, "grad_norm": 7.164504051208496, "learning_rate": 1.5636928029432056e-05, "loss": 0.3757, "step": 29900 }, { "epoch": 1032.7586206896551, "grad_norm": 13.215066909790039, "learning_rate": 1.5579443550241435e-05, "loss": 0.3947, "step": 29950 }, { "epoch": 1034.4827586206898, "grad_norm": 9.657276153564453, "learning_rate": 1.5521959071050817e-05, "loss": 0.3929, "step": 30000 }, { "epoch": 1036.2068965517242, "grad_norm": 11.715109825134277, "learning_rate": 1.54644745918602e-05, "loss": 0.3663, "step": 30050 }, { "epoch": 1037.9310344827586, "grad_norm": 8.122743606567383, "learning_rate": 1.540699011266958e-05, "loss": 0.3713, "step": 30100 }, { "epoch": 1039.655172413793, "grad_norm": 9.487385749816895, "learning_rate": 1.534950563347896e-05, "loss": 0.3839, "step": 30150 }, { "epoch": 1041.3793103448277, "grad_norm": 7.689152717590332, "learning_rate": 1.5292021154288345e-05, "loss": 0.3714, "step": 30200 }, { "epoch": 1043.103448275862, "grad_norm": 11.595024108886719, "learning_rate": 1.5234536675097724e-05, "loss": 0.379, "step": 30250 }, { "epoch": 1044.8275862068965, "grad_norm": 12.962248802185059, "learning_rate": 1.5177052195907107e-05, "loss": 0.3461, "step": 30300 }, { "epoch": 1046.551724137931, "grad_norm": 9.700074195861816, "learning_rate": 1.5119567716716488e-05, "loss": 0.3694, "step": 30350 }, { "epoch": 1048.2758620689656, "grad_norm": 7.546276569366455, "learning_rate": 1.5062083237525868e-05, "loss": 0.3725, "step": 30400 }, { "epoch": 1050.0, "grad_norm": 10.260604858398438, "learning_rate": 1.500459875833525e-05, "loss": 0.3385, "step": 30450 }, { "epoch": 1051.7241379310344, "grad_norm": 8.85687255859375, "learning_rate": 1.494711427914463e-05, "loss": 0.379, "step": 30500 }, { "epoch": 1053.448275862069, "grad_norm": 10.262443542480469, "learning_rate": 1.4889629799954014e-05, "loss": 0.3788, "step": 30550 }, { "epoch": 1055.1724137931035, "grad_norm": 10.926116943359375, "learning_rate": 1.4832145320763394e-05, "loss": 0.3633, "step": 30600 }, { "epoch": 1056.896551724138, "grad_norm": 6.42436408996582, "learning_rate": 1.4774660841572776e-05, "loss": 0.3525, "step": 30650 }, { "epoch": 1058.6206896551723, "grad_norm": 19.310440063476562, "learning_rate": 1.4717176362382157e-05, "loss": 0.3684, "step": 30700 }, { "epoch": 1060.344827586207, "grad_norm": 6.746118545532227, "learning_rate": 1.465969188319154e-05, "loss": 0.3619, "step": 30750 }, { "epoch": 1062.0689655172414, "grad_norm": 8.014568328857422, "learning_rate": 1.460220740400092e-05, "loss": 0.3725, "step": 30800 }, { "epoch": 1063.7931034482758, "grad_norm": 6.931039333343506, "learning_rate": 1.4544722924810303e-05, "loss": 0.3884, "step": 30850 }, { "epoch": 1065.5172413793102, "grad_norm": 6.655370712280273, "learning_rate": 1.4487238445619683e-05, "loss": 0.3665, "step": 30900 }, { "epoch": 1067.2413793103449, "grad_norm": 9.340506553649902, "learning_rate": 1.4429753966429067e-05, "loss": 0.3475, "step": 30950 }, { "epoch": 1068.9655172413793, "grad_norm": 9.452394485473633, "learning_rate": 1.4372269487238447e-05, "loss": 0.3654, "step": 31000 }, { "epoch": 1070.6896551724137, "grad_norm": 11.220993041992188, "learning_rate": 1.4314785008047829e-05, "loss": 0.3716, "step": 31050 }, { "epoch": 1072.4137931034484, "grad_norm": 13.291098594665527, "learning_rate": 1.425730052885721e-05, "loss": 0.3787, "step": 31100 }, { "epoch": 1074.1379310344828, "grad_norm": 10.462983131408691, "learning_rate": 1.419981604966659e-05, "loss": 0.3752, "step": 31150 }, { "epoch": 1075.8620689655172, "grad_norm": 12.196723937988281, "learning_rate": 1.4142331570475972e-05, "loss": 0.3913, "step": 31200 }, { "epoch": 1077.5862068965516, "grad_norm": 6.551695346832275, "learning_rate": 1.4084847091285352e-05, "loss": 0.3923, "step": 31250 }, { "epoch": 1079.3103448275863, "grad_norm": 7.714890003204346, "learning_rate": 1.4027362612094736e-05, "loss": 0.3686, "step": 31300 }, { "epoch": 1081.0344827586207, "grad_norm": 11.943695068359375, "learning_rate": 1.3969878132904116e-05, "loss": 0.384, "step": 31350 }, { "epoch": 1082.7586206896551, "grad_norm": 13.117568016052246, "learning_rate": 1.3912393653713498e-05, "loss": 0.3578, "step": 31400 }, { "epoch": 1084.4827586206898, "grad_norm": 5.623871326446533, "learning_rate": 1.3854909174522878e-05, "loss": 0.3885, "step": 31450 }, { "epoch": 1086.2068965517242, "grad_norm": 6.84212064743042, "learning_rate": 1.3797424695332262e-05, "loss": 0.3452, "step": 31500 }, { "epoch": 1087.9310344827586, "grad_norm": 13.793436050415039, "learning_rate": 1.3739940216141642e-05, "loss": 0.3831, "step": 31550 }, { "epoch": 1089.655172413793, "grad_norm": 7.368927478790283, "learning_rate": 1.3682455736951024e-05, "loss": 0.362, "step": 31600 }, { "epoch": 1091.3793103448277, "grad_norm": 9.836878776550293, "learning_rate": 1.3624971257760405e-05, "loss": 0.3825, "step": 31650 }, { "epoch": 1093.103448275862, "grad_norm": 7.3501691818237305, "learning_rate": 1.3567486778569788e-05, "loss": 0.3765, "step": 31700 }, { "epoch": 1094.8275862068965, "grad_norm": 18.3492488861084, "learning_rate": 1.3510002299379169e-05, "loss": 0.361, "step": 31750 }, { "epoch": 1096.551724137931, "grad_norm": 9.178772926330566, "learning_rate": 1.345251782018855e-05, "loss": 0.3721, "step": 31800 }, { "epoch": 1098.2758620689656, "grad_norm": 11.826425552368164, "learning_rate": 1.3395033340997931e-05, "loss": 0.376, "step": 31850 }, { "epoch": 1100.0, "grad_norm": 18.10971450805664, "learning_rate": 1.3337548861807311e-05, "loss": 0.3599, "step": 31900 }, { "epoch": 1101.7241379310344, "grad_norm": 8.266904830932617, "learning_rate": 1.3280064382616695e-05, "loss": 0.3859, "step": 31950 }, { "epoch": 1103.448275862069, "grad_norm": 6.199615478515625, "learning_rate": 1.3222579903426075e-05, "loss": 0.3612, "step": 32000 }, { "epoch": 1105.1724137931035, "grad_norm": 8.0098295211792, "learning_rate": 1.3165095424235457e-05, "loss": 0.3517, "step": 32050 }, { "epoch": 1106.896551724138, "grad_norm": 9.372276306152344, "learning_rate": 1.3107610945044838e-05, "loss": 0.3688, "step": 32100 }, { "epoch": 1108.6206896551723, "grad_norm": 9.435142517089844, "learning_rate": 1.3050126465854221e-05, "loss": 0.3672, "step": 32150 }, { "epoch": 1110.344827586207, "grad_norm": 10.498263359069824, "learning_rate": 1.29926419866636e-05, "loss": 0.3621, "step": 32200 }, { "epoch": 1112.0689655172414, "grad_norm": 9.563613891601562, "learning_rate": 1.2935157507472984e-05, "loss": 0.3781, "step": 32250 }, { "epoch": 1113.7931034482758, "grad_norm": 12.710960388183594, "learning_rate": 1.2877673028282364e-05, "loss": 0.3621, "step": 32300 }, { "epoch": 1115.5172413793102, "grad_norm": 13.179040908813477, "learning_rate": 1.2820188549091746e-05, "loss": 0.3935, "step": 32350 }, { "epoch": 1117.2413793103449, "grad_norm": 7.590801239013672, "learning_rate": 1.2762704069901126e-05, "loss": 0.3655, "step": 32400 }, { "epoch": 1118.9655172413793, "grad_norm": 9.509500503540039, "learning_rate": 1.270521959071051e-05, "loss": 0.376, "step": 32450 }, { "epoch": 1120.6896551724137, "grad_norm": 7.22570276260376, "learning_rate": 1.264773511151989e-05, "loss": 0.3767, "step": 32500 }, { "epoch": 1122.4137931034484, "grad_norm": 5.893636703491211, "learning_rate": 1.2590250632329272e-05, "loss": 0.369, "step": 32550 }, { "epoch": 1124.1379310344828, "grad_norm": 9.135228157043457, "learning_rate": 1.2532766153138653e-05, "loss": 0.3651, "step": 32600 }, { "epoch": 1125.8620689655172, "grad_norm": 6.5866265296936035, "learning_rate": 1.2475281673948035e-05, "loss": 0.3664, "step": 32650 }, { "epoch": 1127.5862068965516, "grad_norm": 4.808094501495361, "learning_rate": 1.2417797194757417e-05, "loss": 0.3697, "step": 32700 }, { "epoch": 1129.3103448275863, "grad_norm": 9.829878807067871, "learning_rate": 1.2360312715566799e-05, "loss": 0.3596, "step": 32750 }, { "epoch": 1131.0344827586207, "grad_norm": 8.643157005310059, "learning_rate": 1.2302828236376179e-05, "loss": 0.3773, "step": 32800 }, { "epoch": 1132.7586206896551, "grad_norm": 7.753471374511719, "learning_rate": 1.2245343757185561e-05, "loss": 0.3791, "step": 32850 }, { "epoch": 1134.4827586206898, "grad_norm": 10.430192947387695, "learning_rate": 1.2187859277994943e-05, "loss": 0.3521, "step": 32900 }, { "epoch": 1136.2068965517242, "grad_norm": 15.017478942871094, "learning_rate": 1.2130374798804323e-05, "loss": 0.3737, "step": 32950 }, { "epoch": 1137.9310344827586, "grad_norm": 7.898679256439209, "learning_rate": 1.2072890319613705e-05, "loss": 0.3593, "step": 33000 }, { "epoch": 1139.655172413793, "grad_norm": 9.728981018066406, "learning_rate": 1.2015405840423086e-05, "loss": 0.3636, "step": 33050 }, { "epoch": 1141.3793103448277, "grad_norm": 10.439945220947266, "learning_rate": 1.1957921361232468e-05, "loss": 0.3459, "step": 33100 }, { "epoch": 1143.103448275862, "grad_norm": 16.732709884643555, "learning_rate": 1.190043688204185e-05, "loss": 0.3659, "step": 33150 }, { "epoch": 1144.8275862068965, "grad_norm": 18.07990074157715, "learning_rate": 1.184295240285123e-05, "loss": 0.3576, "step": 33200 }, { "epoch": 1146.551724137931, "grad_norm": 6.497382164001465, "learning_rate": 1.1785467923660612e-05, "loss": 0.3637, "step": 33250 }, { "epoch": 1148.2758620689656, "grad_norm": 4.634519100189209, "learning_rate": 1.1727983444469994e-05, "loss": 0.3385, "step": 33300 }, { "epoch": 1150.0, "grad_norm": 16.58812713623047, "learning_rate": 1.1670498965279374e-05, "loss": 0.3559, "step": 33350 }, { "epoch": 1151.7241379310344, "grad_norm": 10.34142780303955, "learning_rate": 1.1613014486088756e-05, "loss": 0.356, "step": 33400 }, { "epoch": 1153.448275862069, "grad_norm": 10.07383918762207, "learning_rate": 1.1555530006898138e-05, "loss": 0.3829, "step": 33450 }, { "epoch": 1155.1724137931035, "grad_norm": 13.932613372802734, "learning_rate": 1.149804552770752e-05, "loss": 0.3467, "step": 33500 }, { "epoch": 1156.896551724138, "grad_norm": 8.408425331115723, "learning_rate": 1.14405610485169e-05, "loss": 0.3684, "step": 33550 }, { "epoch": 1158.6206896551723, "grad_norm": 14.593783378601074, "learning_rate": 1.1383076569326283e-05, "loss": 0.3636, "step": 33600 }, { "epoch": 1160.344827586207, "grad_norm": 5.467103958129883, "learning_rate": 1.1325592090135665e-05, "loss": 0.3503, "step": 33650 }, { "epoch": 1162.0689655172414, "grad_norm": 10.684771537780762, "learning_rate": 1.1268107610945047e-05, "loss": 0.3895, "step": 33700 }, { "epoch": 1163.7931034482758, "grad_norm": 15.924055099487305, "learning_rate": 1.1210623131754427e-05, "loss": 0.3612, "step": 33750 }, { "epoch": 1165.5172413793102, "grad_norm": 7.8569841384887695, "learning_rate": 1.1153138652563807e-05, "loss": 0.3606, "step": 33800 }, { "epoch": 1167.2413793103449, "grad_norm": 7.350346088409424, "learning_rate": 1.109565417337319e-05, "loss": 0.3556, "step": 33850 }, { "epoch": 1168.9655172413793, "grad_norm": 10.281527519226074, "learning_rate": 1.1038169694182571e-05, "loss": 0.354, "step": 33900 }, { "epoch": 1170.6896551724137, "grad_norm": 10.269756317138672, "learning_rate": 1.0980685214991951e-05, "loss": 0.3468, "step": 33950 }, { "epoch": 1172.4137931034484, "grad_norm": 11.779791831970215, "learning_rate": 1.0923200735801334e-05, "loss": 0.3467, "step": 34000 }, { "epoch": 1174.1379310344828, "grad_norm": 9.749473571777344, "learning_rate": 1.0865716256610716e-05, "loss": 0.3882, "step": 34050 }, { "epoch": 1175.8620689655172, "grad_norm": 5.0219035148620605, "learning_rate": 1.0808231777420098e-05, "loss": 0.3443, "step": 34100 }, { "epoch": 1177.5862068965516, "grad_norm": 6.911483287811279, "learning_rate": 1.0750747298229478e-05, "loss": 0.3514, "step": 34150 }, { "epoch": 1179.3103448275863, "grad_norm": 5.359627723693848, "learning_rate": 1.069326281903886e-05, "loss": 0.3535, "step": 34200 }, { "epoch": 1181.0344827586207, "grad_norm": 12.645089149475098, "learning_rate": 1.0635778339848242e-05, "loss": 0.3551, "step": 34250 }, { "epoch": 1182.7586206896551, "grad_norm": 8.579277992248535, "learning_rate": 1.0578293860657624e-05, "loss": 0.3594, "step": 34300 }, { "epoch": 1184.4827586206898, "grad_norm": 6.450700759887695, "learning_rate": 1.0520809381467004e-05, "loss": 0.3581, "step": 34350 }, { "epoch": 1186.2068965517242, "grad_norm": 10.449162483215332, "learning_rate": 1.0463324902276386e-05, "loss": 0.338, "step": 34400 }, { "epoch": 1187.9310344827586, "grad_norm": 9.93543529510498, "learning_rate": 1.0405840423085768e-05, "loss": 0.3333, "step": 34450 }, { "epoch": 1189.655172413793, "grad_norm": 7.859614372253418, "learning_rate": 1.034835594389515e-05, "loss": 0.353, "step": 34500 }, { "epoch": 1191.3793103448277, "grad_norm": 8.042450904846191, "learning_rate": 1.029087146470453e-05, "loss": 0.3632, "step": 34550 }, { "epoch": 1193.103448275862, "grad_norm": 5.73293924331665, "learning_rate": 1.023338698551391e-05, "loss": 0.3566, "step": 34600 }, { "epoch": 1194.8275862068965, "grad_norm": 11.474154472351074, "learning_rate": 1.0175902506323293e-05, "loss": 0.3666, "step": 34650 }, { "epoch": 1196.551724137931, "grad_norm": 7.6923723220825195, "learning_rate": 1.0118418027132675e-05, "loss": 0.3334, "step": 34700 }, { "epoch": 1198.2758620689656, "grad_norm": 5.4585676193237305, "learning_rate": 1.0060933547942055e-05, "loss": 0.3802, "step": 34750 }, { "epoch": 1200.0, "grad_norm": 7.469751834869385, "learning_rate": 1.0003449068751437e-05, "loss": 0.3646, "step": 34800 }, { "epoch": 1201.7241379310344, "grad_norm": 11.218746185302734, "learning_rate": 9.94596458956082e-06, "loss": 0.3688, "step": 34850 }, { "epoch": 1203.448275862069, "grad_norm": 7.2207417488098145, "learning_rate": 9.888480110370201e-06, "loss": 0.3469, "step": 34900 }, { "epoch": 1205.1724137931035, "grad_norm": 12.242910385131836, "learning_rate": 9.830995631179581e-06, "loss": 0.3501, "step": 34950 }, { "epoch": 1206.896551724138, "grad_norm": 5.4994659423828125, "learning_rate": 9.773511151988963e-06, "loss": 0.3492, "step": 35000 }, { "epoch": 1208.6206896551723, "grad_norm": 5.841935157775879, "learning_rate": 9.716026672798345e-06, "loss": 0.3553, "step": 35050 }, { "epoch": 1210.344827586207, "grad_norm": 7.812607288360596, "learning_rate": 9.658542193607728e-06, "loss": 0.3559, "step": 35100 }, { "epoch": 1212.0689655172414, "grad_norm": 22.461164474487305, "learning_rate": 9.601057714417108e-06, "loss": 0.3514, "step": 35150 }, { "epoch": 1213.7931034482758, "grad_norm": 9.567444801330566, "learning_rate": 9.54357323522649e-06, "loss": 0.35, "step": 35200 }, { "epoch": 1215.5172413793102, "grad_norm": 13.007235527038574, "learning_rate": 9.486088756035872e-06, "loss": 0.3363, "step": 35250 }, { "epoch": 1217.2413793103449, "grad_norm": 7.958148002624512, "learning_rate": 9.428604276845252e-06, "loss": 0.3666, "step": 35300 }, { "epoch": 1218.9655172413793, "grad_norm": 10.840229034423828, "learning_rate": 9.371119797654632e-06, "loss": 0.3531, "step": 35350 }, { "epoch": 1220.6896551724137, "grad_norm": 6.059818744659424, "learning_rate": 9.313635318464014e-06, "loss": 0.3576, "step": 35400 }, { "epoch": 1222.4137931034484, "grad_norm": 13.16929817199707, "learning_rate": 9.256150839273396e-06, "loss": 0.3691, "step": 35450 }, { "epoch": 1224.1379310344828, "grad_norm": 9.812789916992188, "learning_rate": 9.198666360082778e-06, "loss": 0.355, "step": 35500 }, { "epoch": 1225.8620689655172, "grad_norm": 21.69074058532715, "learning_rate": 9.141181880892159e-06, "loss": 0.3598, "step": 35550 }, { "epoch": 1227.5862068965516, "grad_norm": 10.920539855957031, "learning_rate": 9.08369740170154e-06, "loss": 0.3432, "step": 35600 }, { "epoch": 1229.3103448275863, "grad_norm": 10.280683517456055, "learning_rate": 9.026212922510923e-06, "loss": 0.3403, "step": 35650 }, { "epoch": 1231.0344827586207, "grad_norm": 8.334404945373535, "learning_rate": 8.968728443320303e-06, "loss": 0.3785, "step": 35700 }, { "epoch": 1232.7586206896551, "grad_norm": 11.498673439025879, "learning_rate": 8.911243964129685e-06, "loss": 0.328, "step": 35750 }, { "epoch": 1234.4827586206898, "grad_norm": 14.804670333862305, "learning_rate": 8.853759484939067e-06, "loss": 0.3523, "step": 35800 }, { "epoch": 1236.2068965517242, "grad_norm": 7.5304083824157715, "learning_rate": 8.796275005748449e-06, "loss": 0.3365, "step": 35850 }, { "epoch": 1237.9310344827586, "grad_norm": 12.465886116027832, "learning_rate": 8.73879052655783e-06, "loss": 0.365, "step": 35900 }, { "epoch": 1239.655172413793, "grad_norm": 13.944148063659668, "learning_rate": 8.681306047367211e-06, "loss": 0.3405, "step": 35950 }, { "epoch": 1241.3793103448277, "grad_norm": 4.417449951171875, "learning_rate": 8.623821568176593e-06, "loss": 0.3451, "step": 36000 }, { "epoch": 1243.103448275862, "grad_norm": 4.128957748413086, "learning_rate": 8.566337088985975e-06, "loss": 0.342, "step": 36050 }, { "epoch": 1244.8275862068965, "grad_norm": 7.202677249908447, "learning_rate": 8.508852609795356e-06, "loss": 0.3462, "step": 36100 }, { "epoch": 1246.551724137931, "grad_norm": 9.519671440124512, "learning_rate": 8.451368130604736e-06, "loss": 0.3446, "step": 36150 }, { "epoch": 1248.2758620689656, "grad_norm": 8.864044189453125, "learning_rate": 8.393883651414118e-06, "loss": 0.3471, "step": 36200 }, { "epoch": 1250.0, "grad_norm": 6.762562274932861, "learning_rate": 8.3363991722235e-06, "loss": 0.3197, "step": 36250 }, { "epoch": 1251.7241379310344, "grad_norm": 10.311796188354492, "learning_rate": 8.27891469303288e-06, "loss": 0.3457, "step": 36300 }, { "epoch": 1253.448275862069, "grad_norm": 11.082413673400879, "learning_rate": 8.221430213842262e-06, "loss": 0.3393, "step": 36350 }, { "epoch": 1255.1724137931035, "grad_norm": 9.231948852539062, "learning_rate": 8.163945734651644e-06, "loss": 0.3653, "step": 36400 }, { "epoch": 1256.896551724138, "grad_norm": 8.82437515258789, "learning_rate": 8.106461255461026e-06, "loss": 0.3512, "step": 36450 }, { "epoch": 1258.6206896551723, "grad_norm": 11.596000671386719, "learning_rate": 8.048976776270407e-06, "loss": 0.3509, "step": 36500 }, { "epoch": 1260.344827586207, "grad_norm": 7.113890171051025, "learning_rate": 7.991492297079789e-06, "loss": 0.3829, "step": 36550 }, { "epoch": 1262.0689655172414, "grad_norm": 6.238236427307129, "learning_rate": 7.93400781788917e-06, "loss": 0.3584, "step": 36600 }, { "epoch": 1263.7931034482758, "grad_norm": 10.327884674072266, "learning_rate": 7.876523338698553e-06, "loss": 0.3753, "step": 36650 }, { "epoch": 1265.5172413793102, "grad_norm": 12.697983741760254, "learning_rate": 7.819038859507933e-06, "loss": 0.3651, "step": 36700 }, { "epoch": 1267.2413793103449, "grad_norm": 8.638954162597656, "learning_rate": 7.761554380317315e-06, "loss": 0.3377, "step": 36750 }, { "epoch": 1268.9655172413793, "grad_norm": 7.269346237182617, "learning_rate": 7.704069901126697e-06, "loss": 0.3286, "step": 36800 }, { "epoch": 1270.6896551724137, "grad_norm": 7.418833255767822, "learning_rate": 7.646585421936079e-06, "loss": 0.3508, "step": 36850 }, { "epoch": 1272.4137931034484, "grad_norm": 5.8808417320251465, "learning_rate": 7.589100942745459e-06, "loss": 0.337, "step": 36900 }, { "epoch": 1274.1379310344828, "grad_norm": 5.134968280792236, "learning_rate": 7.53161646355484e-06, "loss": 0.3587, "step": 36950 }, { "epoch": 1275.8620689655172, "grad_norm": 11.895872116088867, "learning_rate": 7.474131984364222e-06, "loss": 0.3375, "step": 37000 }, { "epoch": 1277.5862068965516, "grad_norm": 11.477492332458496, "learning_rate": 7.416647505173603e-06, "loss": 0.3574, "step": 37050 }, { "epoch": 1279.3103448275863, "grad_norm": 7.810723304748535, "learning_rate": 7.359163025982985e-06, "loss": 0.3216, "step": 37100 }, { "epoch": 1281.0344827586207, "grad_norm": 6.583137035369873, "learning_rate": 7.301678546792366e-06, "loss": 0.35, "step": 37150 }, { "epoch": 1282.7586206896551, "grad_norm": 4.778672218322754, "learning_rate": 7.244194067601748e-06, "loss": 0.3298, "step": 37200 }, { "epoch": 1284.4827586206898, "grad_norm": 11.420574188232422, "learning_rate": 7.186709588411129e-06, "loss": 0.3297, "step": 37250 }, { "epoch": 1286.2068965517242, "grad_norm": 6.6733927726745605, "learning_rate": 7.129225109220511e-06, "loss": 0.3369, "step": 37300 }, { "epoch": 1287.9310344827586, "grad_norm": 4.217225551605225, "learning_rate": 7.071740630029892e-06, "loss": 0.3412, "step": 37350 }, { "epoch": 1289.655172413793, "grad_norm": 6.589169502258301, "learning_rate": 7.0142561508392744e-06, "loss": 0.33, "step": 37400 }, { "epoch": 1291.3793103448277, "grad_norm": 5.689001083374023, "learning_rate": 6.956771671648656e-06, "loss": 0.3432, "step": 37450 }, { "epoch": 1293.103448275862, "grad_norm": 14.713215827941895, "learning_rate": 6.899287192458037e-06, "loss": 0.3476, "step": 37500 }, { "epoch": 1294.8275862068965, "grad_norm": 12.100150108337402, "learning_rate": 6.841802713267419e-06, "loss": 0.3345, "step": 37550 }, { "epoch": 1296.551724137931, "grad_norm": 8.93959903717041, "learning_rate": 6.7843182340768e-06, "loss": 0.3336, "step": 37600 }, { "epoch": 1298.2758620689656, "grad_norm": 6.844462871551514, "learning_rate": 6.72683375488618e-06, "loss": 0.3603, "step": 37650 }, { "epoch": 1300.0, "grad_norm": 11.404465675354004, "learning_rate": 6.669349275695562e-06, "loss": 0.3408, "step": 37700 }, { "epoch": 1301.7241379310344, "grad_norm": 9.943863868713379, "learning_rate": 6.611864796504943e-06, "loss": 0.3342, "step": 37750 }, { "epoch": 1303.448275862069, "grad_norm": 10.692397117614746, "learning_rate": 6.554380317314325e-06, "loss": 0.3547, "step": 37800 }, { "epoch": 1305.1724137931035, "grad_norm": 9.271644592285156, "learning_rate": 6.4968958381237066e-06, "loss": 0.3261, "step": 37850 }, { "epoch": 1306.896551724138, "grad_norm": 8.352354049682617, "learning_rate": 6.4394113589330886e-06, "loss": 0.3589, "step": 37900 }, { "epoch": 1308.6206896551723, "grad_norm": 6.677180290222168, "learning_rate": 6.38192687974247e-06, "loss": 0.3557, "step": 37950 }, { "epoch": 1310.344827586207, "grad_norm": 6.720130443572998, "learning_rate": 6.324442400551851e-06, "loss": 0.3486, "step": 38000 }, { "epoch": 1312.0689655172414, "grad_norm": 13.527320861816406, "learning_rate": 6.266957921361233e-06, "loss": 0.3343, "step": 38050 }, { "epoch": 1313.7931034482758, "grad_norm": 14.34444522857666, "learning_rate": 6.209473442170614e-06, "loss": 0.3538, "step": 38100 }, { "epoch": 1315.5172413793102, "grad_norm": 6.647273063659668, "learning_rate": 6.151988962979996e-06, "loss": 0.3525, "step": 38150 }, { "epoch": 1317.2413793103449, "grad_norm": 11.363895416259766, "learning_rate": 6.094504483789377e-06, "loss": 0.3407, "step": 38200 }, { "epoch": 1318.9655172413793, "grad_norm": 7.47884464263916, "learning_rate": 6.037020004598758e-06, "loss": 0.3605, "step": 38250 }, { "epoch": 1320.6896551724137, "grad_norm": 5.9791412353515625, "learning_rate": 5.9795355254081395e-06, "loss": 0.3553, "step": 38300 }, { "epoch": 1322.4137931034484, "grad_norm": 5.88453483581543, "learning_rate": 5.9220510462175216e-06, "loss": 0.3514, "step": 38350 }, { "epoch": 1324.1379310344828, "grad_norm": 6.67228889465332, "learning_rate": 5.864566567026903e-06, "loss": 0.3321, "step": 38400 }, { "epoch": 1325.8620689655172, "grad_norm": 5.450709342956543, "learning_rate": 5.807082087836285e-06, "loss": 0.3441, "step": 38450 }, { "epoch": 1327.5862068965516, "grad_norm": 10.643080711364746, "learning_rate": 5.749597608645666e-06, "loss": 0.3394, "step": 38500 }, { "epoch": 1329.3103448275863, "grad_norm": 11.631002426147461, "learning_rate": 5.692113129455048e-06, "loss": 0.352, "step": 38550 }, { "epoch": 1331.0344827586207, "grad_norm": 7.7536821365356445, "learning_rate": 5.634628650264428e-06, "loss": 0.344, "step": 38600 }, { "epoch": 1332.7586206896551, "grad_norm": 9.447080612182617, "learning_rate": 5.57714417107381e-06, "loss": 0.3273, "step": 38650 }, { "epoch": 1334.4827586206898, "grad_norm": 9.951111793518066, "learning_rate": 5.519659691883191e-06, "loss": 0.3243, "step": 38700 }, { "epoch": 1336.2068965517242, "grad_norm": 8.128294944763184, "learning_rate": 5.462175212692573e-06, "loss": 0.3395, "step": 38750 }, { "epoch": 1337.9310344827586, "grad_norm": 7.917686939239502, "learning_rate": 5.4046907335019545e-06, "loss": 0.3657, "step": 38800 }, { "epoch": 1339.655172413793, "grad_norm": 2.9258852005004883, "learning_rate": 5.3472062543113365e-06, "loss": 0.3431, "step": 38850 }, { "epoch": 1341.3793103448277, "grad_norm": 11.76290225982666, "learning_rate": 5.289721775120718e-06, "loss": 0.3295, "step": 38900 }, { "epoch": 1343.103448275862, "grad_norm": 11.408366203308105, "learning_rate": 5.2322372959301e-06, "loss": 0.3565, "step": 38950 }, { "epoch": 1344.8275862068965, "grad_norm": 6.51284122467041, "learning_rate": 5.17475281673948e-06, "loss": 0.3415, "step": 39000 }, { "epoch": 1346.551724137931, "grad_norm": 13.244549751281738, "learning_rate": 5.117268337548862e-06, "loss": 0.3564, "step": 39050 }, { "epoch": 1348.2758620689656, "grad_norm": 6.419191837310791, "learning_rate": 5.059783858358243e-06, "loss": 0.331, "step": 39100 }, { "epoch": 1350.0, "grad_norm": 6.889120578765869, "learning_rate": 5.002299379167625e-06, "loss": 0.36, "step": 39150 }, { "epoch": 1351.7241379310344, "grad_norm": 10.378256797790527, "learning_rate": 4.944814899977006e-06, "loss": 0.3615, "step": 39200 }, { "epoch": 1353.448275862069, "grad_norm": 8.468602180480957, "learning_rate": 4.887330420786388e-06, "loss": 0.3456, "step": 39250 }, { "epoch": 1355.1724137931035, "grad_norm": 11.215758323669434, "learning_rate": 4.8298459415957695e-06, "loss": 0.3301, "step": 39300 }, { "epoch": 1356.896551724138, "grad_norm": 7.1538848876953125, "learning_rate": 4.7723614624051515e-06, "loss": 0.3254, "step": 39350 }, { "epoch": 1358.6206896551723, "grad_norm": 8.748969078063965, "learning_rate": 4.714876983214532e-06, "loss": 0.3371, "step": 39400 }, { "epoch": 1360.344827586207, "grad_norm": 4.307106018066406, "learning_rate": 4.657392504023914e-06, "loss": 0.3432, "step": 39450 }, { "epoch": 1362.0689655172414, "grad_norm": 12.248066902160645, "learning_rate": 4.599908024833295e-06, "loss": 0.3625, "step": 39500 }, { "epoch": 1363.7931034482758, "grad_norm": 6.587746620178223, "learning_rate": 4.542423545642677e-06, "loss": 0.3455, "step": 39550 }, { "epoch": 1365.5172413793102, "grad_norm": 7.072157859802246, "learning_rate": 4.484939066452058e-06, "loss": 0.3424, "step": 39600 }, { "epoch": 1367.2413793103449, "grad_norm": 7.832579135894775, "learning_rate": 4.42745458726144e-06, "loss": 0.3298, "step": 39650 }, { "epoch": 1368.9655172413793, "grad_norm": 7.665703296661377, "learning_rate": 4.369970108070821e-06, "loss": 0.3177, "step": 39700 }, { "epoch": 1370.6896551724137, "grad_norm": 7.821745872497559, "learning_rate": 4.3124856288802025e-06, "loss": 0.339, "step": 39750 }, { "epoch": 1372.4137931034484, "grad_norm": 7.955054759979248, "learning_rate": 4.255001149689584e-06, "loss": 0.3498, "step": 39800 }, { "epoch": 1374.1379310344828, "grad_norm": 11.540693283081055, "learning_rate": 4.197516670498966e-06, "loss": 0.3526, "step": 39850 }, { "epoch": 1375.8620689655172, "grad_norm": 6.683413505554199, "learning_rate": 4.140032191308347e-06, "loss": 0.3291, "step": 39900 }, { "epoch": 1377.5862068965516, "grad_norm": 9.08400821685791, "learning_rate": 4.082547712117729e-06, "loss": 0.3385, "step": 39950 }, { "epoch": 1379.3103448275863, "grad_norm": 9.322541236877441, "learning_rate": 4.02506323292711e-06, "loss": 0.3454, "step": 40000 }, { "epoch": 1381.0344827586207, "grad_norm": 10.001763343811035, "learning_rate": 3.967578753736491e-06, "loss": 0.3292, "step": 40050 }, { "epoch": 1382.7586206896551, "grad_norm": 8.4324369430542, "learning_rate": 3.910094274545873e-06, "loss": 0.3802, "step": 40100 }, { "epoch": 1384.4827586206898, "grad_norm": 12.70076847076416, "learning_rate": 3.852609795355254e-06, "loss": 0.3551, "step": 40150 }, { "epoch": 1386.2068965517242, "grad_norm": 10.163627624511719, "learning_rate": 3.7951253161646355e-06, "loss": 0.3351, "step": 40200 }, { "epoch": 1387.9310344827586, "grad_norm": 13.175633430480957, "learning_rate": 3.737640836974017e-06, "loss": 0.3519, "step": 40250 }, { "epoch": 1389.655172413793, "grad_norm": 9.967467308044434, "learning_rate": 3.6801563577833987e-06, "loss": 0.3739, "step": 40300 }, { "epoch": 1391.3793103448277, "grad_norm": 10.758793830871582, "learning_rate": 3.6226718785927802e-06, "loss": 0.3434, "step": 40350 }, { "epoch": 1393.103448275862, "grad_norm": 5.817944526672363, "learning_rate": 3.565187399402162e-06, "loss": 0.3123, "step": 40400 }, { "epoch": 1394.8275862068965, "grad_norm": 11.685726165771484, "learning_rate": 3.5077029202115434e-06, "loss": 0.3448, "step": 40450 }, { "epoch": 1396.551724137931, "grad_norm": 6.314751625061035, "learning_rate": 3.450218441020925e-06, "loss": 0.3355, "step": 40500 }, { "epoch": 1398.2758620689656, "grad_norm": 12.054367065429688, "learning_rate": 3.3927339618303057e-06, "loss": 0.3405, "step": 40550 }, { "epoch": 1400.0, "grad_norm": 11.395332336425781, "learning_rate": 3.3352494826396873e-06, "loss": 0.3629, "step": 40600 }, { "epoch": 1401.7241379310344, "grad_norm": 14.286806106567383, "learning_rate": 3.277765003449069e-06, "loss": 0.3524, "step": 40650 }, { "epoch": 1403.448275862069, "grad_norm": 10.114741325378418, "learning_rate": 3.2202805242584505e-06, "loss": 0.3506, "step": 40700 }, { "epoch": 1405.1724137931035, "grad_norm": 7.126160144805908, "learning_rate": 3.162796045067832e-06, "loss": 0.3365, "step": 40750 }, { "epoch": 1406.896551724138, "grad_norm": 11.108461380004883, "learning_rate": 3.1053115658772136e-06, "loss": 0.3299, "step": 40800 }, { "epoch": 1408.6206896551723, "grad_norm": 11.96660327911377, "learning_rate": 3.047827086686595e-06, "loss": 0.3347, "step": 40850 }, { "epoch": 1410.344827586207, "grad_norm": 12.44959831237793, "learning_rate": 2.9903426074959764e-06, "loss": 0.3321, "step": 40900 }, { "epoch": 1412.0689655172414, "grad_norm": 8.108181953430176, "learning_rate": 2.932858128305358e-06, "loss": 0.3454, "step": 40950 }, { "epoch": 1413.7931034482758, "grad_norm": 7.305360317230225, "learning_rate": 2.875373649114739e-06, "loss": 0.3296, "step": 41000 }, { "epoch": 1415.5172413793102, "grad_norm": 10.082894325256348, "learning_rate": 2.8178891699241207e-06, "loss": 0.3318, "step": 41050 }, { "epoch": 1417.2413793103449, "grad_norm": 7.051600933074951, "learning_rate": 2.7604046907335023e-06, "loss": 0.3569, "step": 41100 }, { "epoch": 1418.9655172413793, "grad_norm": 8.559059143066406, "learning_rate": 2.7029202115428835e-06, "loss": 0.3367, "step": 41150 }, { "epoch": 1420.6896551724137, "grad_norm": 6.945859432220459, "learning_rate": 2.645435732352265e-06, "loss": 0.345, "step": 41200 }, { "epoch": 1422.4137931034484, "grad_norm": 9.76579761505127, "learning_rate": 2.5879512531616466e-06, "loss": 0.3347, "step": 41250 }, { "epoch": 1424.1379310344828, "grad_norm": 9.017191886901855, "learning_rate": 2.530466773971028e-06, "loss": 0.3543, "step": 41300 }, { "epoch": 1425.8620689655172, "grad_norm": 6.610126972198486, "learning_rate": 2.4729822947804094e-06, "loss": 0.3433, "step": 41350 }, { "epoch": 1427.5862068965516, "grad_norm": 8.919373512268066, "learning_rate": 2.4154978155897905e-06, "loss": 0.3274, "step": 41400 }, { "epoch": 1429.3103448275863, "grad_norm": 7.73258113861084, "learning_rate": 2.358013336399172e-06, "loss": 0.3537, "step": 41450 }, { "epoch": 1431.0344827586207, "grad_norm": 13.695204734802246, "learning_rate": 2.3005288572085537e-06, "loss": 0.3288, "step": 41500 }, { "epoch": 1432.7586206896551, "grad_norm": 12.077178955078125, "learning_rate": 2.2430443780179353e-06, "loss": 0.3361, "step": 41550 }, { "epoch": 1434.4827586206898, "grad_norm": 9.64843463897705, "learning_rate": 2.1855598988273164e-06, "loss": 0.3374, "step": 41600 }, { "epoch": 1436.2068965517242, "grad_norm": 9.649758338928223, "learning_rate": 2.128075419636698e-06, "loss": 0.3277, "step": 41650 }, { "epoch": 1437.9310344827586, "grad_norm": 17.453420639038086, "learning_rate": 2.0705909404460796e-06, "loss": 0.33, "step": 41700 }, { "epoch": 1439.655172413793, "grad_norm": 8.375717163085938, "learning_rate": 2.013106461255461e-06, "loss": 0.3232, "step": 41750 }, { "epoch": 1441.3793103448277, "grad_norm": 6.42498254776001, "learning_rate": 1.9556219820648424e-06, "loss": 0.3464, "step": 41800 }, { "epoch": 1443.103448275862, "grad_norm": 7.46762228012085, "learning_rate": 1.898137502874224e-06, "loss": 0.3513, "step": 41850 }, { "epoch": 1444.8275862068965, "grad_norm": 4.703877925872803, "learning_rate": 1.8406530236836055e-06, "loss": 0.3313, "step": 41900 }, { "epoch": 1446.551724137931, "grad_norm": 10.599761009216309, "learning_rate": 1.7831685444929871e-06, "loss": 0.3422, "step": 41950 }, { "epoch": 1448.2758620689656, "grad_norm": 15.02623176574707, "learning_rate": 1.7256840653023683e-06, "loss": 0.3375, "step": 42000 }, { "epoch": 1450.0, "grad_norm": 11.589694023132324, "learning_rate": 1.6681995861117499e-06, "loss": 0.3568, "step": 42050 }, { "epoch": 1451.7241379310344, "grad_norm": 8.810125350952148, "learning_rate": 1.6107151069211314e-06, "loss": 0.2974, "step": 42100 }, { "epoch": 1453.448275862069, "grad_norm": 5.8388800621032715, "learning_rate": 1.5532306277305128e-06, "loss": 0.3788, "step": 42150 }, { "epoch": 1455.1724137931035, "grad_norm": 5.160738468170166, "learning_rate": 1.4957461485398942e-06, "loss": 0.3381, "step": 42200 }, { "epoch": 1456.896551724138, "grad_norm": 6.78072452545166, "learning_rate": 1.4382616693492758e-06, "loss": 0.3253, "step": 42250 }, { "epoch": 1458.6206896551723, "grad_norm": 10.720961570739746, "learning_rate": 1.3807771901586571e-06, "loss": 0.327, "step": 42300 }, { "epoch": 1460.344827586207, "grad_norm": 12.243437767028809, "learning_rate": 1.3232927109680387e-06, "loss": 0.3595, "step": 42350 }, { "epoch": 1462.0689655172414, "grad_norm": 7.357919216156006, "learning_rate": 1.26580823177742e-06, "loss": 0.3276, "step": 42400 }, { "epoch": 1463.7931034482758, "grad_norm": 8.001890182495117, "learning_rate": 1.2083237525868017e-06, "loss": 0.3164, "step": 42450 }, { "epoch": 1465.5172413793102, "grad_norm": 5.370369911193848, "learning_rate": 1.150839273396183e-06, "loss": 0.3461, "step": 42500 }, { "epoch": 1467.2413793103449, "grad_norm": 5.190150737762451, "learning_rate": 1.0933547942055646e-06, "loss": 0.3269, "step": 42550 }, { "epoch": 1468.9655172413793, "grad_norm": 9.991484642028809, "learning_rate": 1.035870315014946e-06, "loss": 0.3283, "step": 42600 }, { "epoch": 1470.6896551724137, "grad_norm": 9.458399772644043, "learning_rate": 9.783858358243276e-07, "loss": 0.3389, "step": 42650 }, { "epoch": 1472.4137931034484, "grad_norm": 14.690953254699707, "learning_rate": 9.209013566337089e-07, "loss": 0.3395, "step": 42700 }, { "epoch": 1474.1379310344828, "grad_norm": 5.438377380371094, "learning_rate": 8.634168774430904e-07, "loss": 0.3455, "step": 42750 }, { "epoch": 1475.8620689655172, "grad_norm": 7.423194408416748, "learning_rate": 8.059323982524718e-07, "loss": 0.3361, "step": 42800 }, { "epoch": 1477.5862068965516, "grad_norm": 10.556965827941895, "learning_rate": 7.484479190618533e-07, "loss": 0.3353, "step": 42850 }, { "epoch": 1479.3103448275863, "grad_norm": 14.089933395385742, "learning_rate": 6.909634398712348e-07, "loss": 0.3378, "step": 42900 }, { "epoch": 1481.0344827586207, "grad_norm": 11.984712600708008, "learning_rate": 6.334789606806162e-07, "loss": 0.3556, "step": 42950 }, { "epoch": 1482.7586206896551, "grad_norm": 16.83322525024414, "learning_rate": 5.759944814899977e-07, "loss": 0.3478, "step": 43000 }, { "epoch": 1484.4827586206898, "grad_norm": 10.616393089294434, "learning_rate": 5.185100022993792e-07, "loss": 0.3314, "step": 43050 }, { "epoch": 1486.2068965517242, "grad_norm": 7.828108787536621, "learning_rate": 4.610255231087606e-07, "loss": 0.347, "step": 43100 }, { "epoch": 1487.9310344827586, "grad_norm": 13.197574615478516, "learning_rate": 4.035410439181421e-07, "loss": 0.3521, "step": 43150 }, { "epoch": 1489.655172413793, "grad_norm": 11.973600387573242, "learning_rate": 3.460565647275236e-07, "loss": 0.3334, "step": 43200 }, { "epoch": 1491.3793103448277, "grad_norm": 12.832175254821777, "learning_rate": 2.8857208553690506e-07, "loss": 0.326, "step": 43250 }, { "epoch": 1493.103448275862, "grad_norm": 6.833930492401123, "learning_rate": 2.310876063462865e-07, "loss": 0.3298, "step": 43300 }, { "epoch": 1494.8275862068965, "grad_norm": 6.8522257804870605, "learning_rate": 1.7360312715566799e-07, "loss": 0.3373, "step": 43350 }, { "epoch": 1496.551724137931, "grad_norm": 4.281970977783203, "learning_rate": 1.1611864796504944e-07, "loss": 0.3163, "step": 43400 }, { "epoch": 1498.2758620689656, "grad_norm": 5.030120372772217, "learning_rate": 5.863416877443091e-08, "loss": 0.3344, "step": 43450 }, { "epoch": 1500.0, "grad_norm": 8.938008308410645, "learning_rate": 1.1496895838123708e-09, "loss": 0.333, "step": 43500 } ], "logging_steps": 50, "max_steps": 43500, "num_input_tokens_seen": 0, "num_train_epochs": 1500, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5604482285568000.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }