diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,28050 @@ +{ + "best_global_step": 3000, + "best_metric": 0.4750007688999176, + "best_model_checkpoint": "outputs/training_20260111_053019/checkpoint-3000", + "epoch": 0.7111111111111111, + "eval_steps": 1500, + "global_step": 4000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00017777777777777779, + "grad_norm": 0.5633949637413025, + "learning_rate": 5e-05, + "loss": 0.9039, + "step": 1 + }, + { + "epoch": 0.00035555555555555557, + "grad_norm": 0.5636752247810364, + "learning_rate": 4.9999996100897126e-05, + "loss": 0.7874, + "step": 2 + }, + { + "epoch": 0.0005333333333333334, + "grad_norm": 0.5482001900672913, + "learning_rate": 4.999998440358973e-05, + "loss": 0.8611, + "step": 3 + }, + { + "epoch": 0.0007111111111111111, + "grad_norm": 0.5551458597183228, + "learning_rate": 4.9999964908081455e-05, + "loss": 0.8905, + "step": 4 + }, + { + "epoch": 0.0008888888888888889, + "grad_norm": 0.5364474654197693, + "learning_rate": 4.999993761437838e-05, + "loss": 0.8634, + "step": 5 + }, + { + "epoch": 0.0010666666666666667, + "grad_norm": 0.5034008622169495, + "learning_rate": 4.9999902522489015e-05, + "loss": 0.7463, + "step": 6 + }, + { + "epoch": 0.0012444444444444445, + "grad_norm": 0.5017440319061279, + "learning_rate": 4.999985963242432e-05, + "loss": 0.7615, + "step": 7 + }, + { + "epoch": 0.0014222222222222223, + "grad_norm": 0.4903201460838318, + "learning_rate": 4.9999808944197666e-05, + "loss": 0.842, + "step": 8 + }, + { + "epoch": 0.0016, + "grad_norm": 0.5094455480575562, + "learning_rate": 4.999975045782486e-05, + "loss": 0.8348, + "step": 9 + }, + { + "epoch": 0.0017777777777777779, + "grad_norm": 0.5041722655296326, + "learning_rate": 4.999968417332415e-05, + "loss": 0.9804, + "step": 10 + }, + { + "epoch": 0.0019555555555555554, + "grad_norm": 0.49050918221473694, + "learning_rate": 4.999961009071621e-05, + "loss": 0.8011, + "step": 11 + }, + { + "epoch": 0.0021333333333333334, + "grad_norm": 0.48820722103118896, + "learning_rate": 4.999952821002415e-05, + "loss": 0.8194, + "step": 12 + }, + { + "epoch": 0.002311111111111111, + "grad_norm": 0.4856832027435303, + "learning_rate": 4.999943853127351e-05, + "loss": 0.7692, + "step": 13 + }, + { + "epoch": 0.002488888888888889, + "grad_norm": 0.48455479741096497, + "learning_rate": 4.9999341054492265e-05, + "loss": 0.678, + "step": 14 + }, + { + "epoch": 0.0026666666666666666, + "grad_norm": 0.474031001329422, + "learning_rate": 4.9999235779710826e-05, + "loss": 0.7274, + "step": 15 + }, + { + "epoch": 0.0028444444444444446, + "grad_norm": 0.47146075963974, + "learning_rate": 4.999912270696202e-05, + "loss": 0.793, + "step": 16 + }, + { + "epoch": 0.003022222222222222, + "grad_norm": 0.47266456484794617, + "learning_rate": 4.999900183628112e-05, + "loss": 0.685, + "step": 17 + }, + { + "epoch": 0.0032, + "grad_norm": 0.4639424681663513, + "learning_rate": 4.999887316770584e-05, + "loss": 0.878, + "step": 18 + }, + { + "epoch": 0.0033777777777777777, + "grad_norm": 0.44405439496040344, + "learning_rate": 4.9998736701276295e-05, + "loss": 0.6873, + "step": 19 + }, + { + "epoch": 0.0035555555555555557, + "grad_norm": 0.43510663509368896, + "learning_rate": 4.9998592437035076e-05, + "loss": 0.7481, + "step": 20 + }, + { + "epoch": 0.0037333333333333333, + "grad_norm": 0.45592424273490906, + "learning_rate": 4.9998440375027166e-05, + "loss": 0.7324, + "step": 21 + }, + { + "epoch": 0.003911111111111111, + "grad_norm": 0.44173601269721985, + "learning_rate": 4.99982805153e-05, + "loss": 0.7655, + "step": 22 + }, + { + "epoch": 0.004088888888888889, + "grad_norm": 0.4052738845348358, + "learning_rate": 4.9998112857903454e-05, + "loss": 0.6807, + "step": 23 + }, + { + "epoch": 0.004266666666666667, + "grad_norm": 0.42756152153015137, + "learning_rate": 4.999793740288982e-05, + "loss": 0.6802, + "step": 24 + }, + { + "epoch": 0.0044444444444444444, + "grad_norm": 0.4189569354057312, + "learning_rate": 4.9997754150313815e-05, + "loss": 0.712, + "step": 25 + }, + { + "epoch": 0.004622222222222222, + "grad_norm": 0.41647130250930786, + "learning_rate": 4.999756310023261e-05, + "loss": 0.6725, + "step": 26 + }, + { + "epoch": 0.0048, + "grad_norm": 0.4385494887828827, + "learning_rate": 4.99973642527058e-05, + "loss": 0.8894, + "step": 27 + }, + { + "epoch": 0.004977777777777778, + "grad_norm": 0.4217928946018219, + "learning_rate": 4.999715760779541e-05, + "loss": 0.6861, + "step": 28 + }, + { + "epoch": 0.005155555555555556, + "grad_norm": 0.4131334125995636, + "learning_rate": 4.9996943165565905e-05, + "loss": 0.7903, + "step": 29 + }, + { + "epoch": 0.005333333333333333, + "grad_norm": 0.3944827914237976, + "learning_rate": 4.9996720926084164e-05, + "loss": 0.6555, + "step": 30 + }, + { + "epoch": 0.005511111111111111, + "grad_norm": 0.3939933180809021, + "learning_rate": 4.9996490889419514e-05, + "loss": 0.6998, + "step": 31 + }, + { + "epoch": 0.005688888888888889, + "grad_norm": 0.3831292688846588, + "learning_rate": 4.999625305564371e-05, + "loss": 0.7177, + "step": 32 + }, + { + "epoch": 0.005866666666666667, + "grad_norm": 0.3787647783756256, + "learning_rate": 4.999600742483094e-05, + "loss": 0.6726, + "step": 33 + }, + { + "epoch": 0.006044444444444444, + "grad_norm": 0.3797937333583832, + "learning_rate": 4.999575399705783e-05, + "loss": 0.6746, + "step": 34 + }, + { + "epoch": 0.006222222222222222, + "grad_norm": 0.36211299896240234, + "learning_rate": 4.999549277240342e-05, + "loss": 0.7807, + "step": 35 + }, + { + "epoch": 0.0064, + "grad_norm": 0.35416722297668457, + "learning_rate": 4.999522375094919e-05, + "loss": 0.8017, + "step": 36 + }, + { + "epoch": 0.006577777777777778, + "grad_norm": 0.3679851293563843, + "learning_rate": 4.999494693277907e-05, + "loss": 0.8026, + "step": 37 + }, + { + "epoch": 0.0067555555555555554, + "grad_norm": 0.36721140146255493, + "learning_rate": 4.999466231797941e-05, + "loss": 0.6894, + "step": 38 + }, + { + "epoch": 0.006933333333333333, + "grad_norm": 0.33606964349746704, + "learning_rate": 4.999436990663897e-05, + "loss": 0.6973, + "step": 39 + }, + { + "epoch": 0.0071111111111111115, + "grad_norm": 0.3648448586463928, + "learning_rate": 4.999406969884897e-05, + "loss": 0.557, + "step": 40 + }, + { + "epoch": 0.007288888888888889, + "grad_norm": 0.37145328521728516, + "learning_rate": 4.999376169470306e-05, + "loss": 0.6482, + "step": 41 + }, + { + "epoch": 0.007466666666666667, + "grad_norm": 0.35953575372695923, + "learning_rate": 4.99934458942973e-05, + "loss": 0.6259, + "step": 42 + }, + { + "epoch": 0.007644444444444444, + "grad_norm": 0.3610894978046417, + "learning_rate": 4.999312229773022e-05, + "loss": 0.6131, + "step": 43 + }, + { + "epoch": 0.007822222222222222, + "grad_norm": 0.3665354251861572, + "learning_rate": 4.9992790905102734e-05, + "loss": 0.6551, + "step": 44 + }, + { + "epoch": 0.008, + "grad_norm": 0.3667304515838623, + "learning_rate": 4.999245171651823e-05, + "loss": 0.7602, + "step": 45 + }, + { + "epoch": 0.008177777777777779, + "grad_norm": 0.38898375630378723, + "learning_rate": 4.99921047320825e-05, + "loss": 0.7476, + "step": 46 + }, + { + "epoch": 0.008355555555555555, + "grad_norm": 0.41774147748947144, + "learning_rate": 4.999174995190379e-05, + "loss": 0.6498, + "step": 47 + }, + { + "epoch": 0.008533333333333334, + "grad_norm": 0.41857436299324036, + "learning_rate": 4.999138737609276e-05, + "loss": 0.7246, + "step": 48 + }, + { + "epoch": 0.00871111111111111, + "grad_norm": 0.4185871183872223, + "learning_rate": 4.9991017004762496e-05, + "loss": 0.7194, + "step": 49 + }, + { + "epoch": 0.008888888888888889, + "grad_norm": 0.5065877437591553, + "learning_rate": 4.9990638838028546e-05, + "loss": 0.6876, + "step": 50 + }, + { + "epoch": 0.009066666666666667, + "grad_norm": 0.3123113512992859, + "learning_rate": 4.999025287600886e-05, + "loss": 0.6506, + "step": 51 + }, + { + "epoch": 0.009244444444444444, + "grad_norm": 0.28749120235443115, + "learning_rate": 4.998985911882384e-05, + "loss": 0.6297, + "step": 52 + }, + { + "epoch": 0.009422222222222222, + "grad_norm": 0.2994007170200348, + "learning_rate": 4.99894575665963e-05, + "loss": 0.6616, + "step": 53 + }, + { + "epoch": 0.0096, + "grad_norm": 0.28443682193756104, + "learning_rate": 4.9989048219451495e-05, + "loss": 0.6858, + "step": 54 + }, + { + "epoch": 0.009777777777777778, + "grad_norm": 0.27083295583724976, + "learning_rate": 4.998863107751711e-05, + "loss": 0.6675, + "step": 55 + }, + { + "epoch": 0.009955555555555556, + "grad_norm": 0.2696591913700104, + "learning_rate": 4.998820614092328e-05, + "loss": 0.6847, + "step": 56 + }, + { + "epoch": 0.010133333333333333, + "grad_norm": 0.2783946990966797, + "learning_rate": 4.998777340980254e-05, + "loss": 0.8185, + "step": 57 + }, + { + "epoch": 0.010311111111111111, + "grad_norm": 0.2770375609397888, + "learning_rate": 4.998733288428987e-05, + "loss": 0.6901, + "step": 58 + }, + { + "epoch": 0.01048888888888889, + "grad_norm": 0.25746840238571167, + "learning_rate": 4.9986884564522696e-05, + "loss": 0.6535, + "step": 59 + }, + { + "epoch": 0.010666666666666666, + "grad_norm": 0.2688048183917999, + "learning_rate": 4.998642845064086e-05, + "loss": 0.8365, + "step": 60 + }, + { + "epoch": 0.010844444444444445, + "grad_norm": 0.28876763582229614, + "learning_rate": 4.9985964542786614e-05, + "loss": 0.9726, + "step": 61 + }, + { + "epoch": 0.011022222222222221, + "grad_norm": 0.26135286688804626, + "learning_rate": 4.998549284110468e-05, + "loss": 0.8118, + "step": 62 + }, + { + "epoch": 0.0112, + "grad_norm": 0.2630575895309448, + "learning_rate": 4.99850133457422e-05, + "loss": 0.665, + "step": 63 + }, + { + "epoch": 0.011377777777777778, + "grad_norm": 0.24160782992839813, + "learning_rate": 4.998452605684874e-05, + "loss": 0.7075, + "step": 64 + }, + { + "epoch": 0.011555555555555555, + "grad_norm": 0.2451350837945938, + "learning_rate": 4.9984030974576285e-05, + "loss": 0.6946, + "step": 65 + }, + { + "epoch": 0.011733333333333333, + "grad_norm": 0.2520352005958557, + "learning_rate": 4.998352809907928e-05, + "loss": 0.6951, + "step": 66 + }, + { + "epoch": 0.011911111111111112, + "grad_norm": 0.2402976006269455, + "learning_rate": 4.998301743051459e-05, + "loss": 0.5872, + "step": 67 + }, + { + "epoch": 0.012088888888888889, + "grad_norm": 0.23182231187820435, + "learning_rate": 4.998249896904149e-05, + "loss": 0.6606, + "step": 68 + }, + { + "epoch": 0.012266666666666667, + "grad_norm": 0.2236698567867279, + "learning_rate": 4.998197271482171e-05, + "loss": 0.6554, + "step": 69 + }, + { + "epoch": 0.012444444444444444, + "grad_norm": 0.235708549618721, + "learning_rate": 4.998143866801942e-05, + "loss": 0.5829, + "step": 70 + }, + { + "epoch": 0.012622222222222222, + "grad_norm": 0.22412535548210144, + "learning_rate": 4.998089682880117e-05, + "loss": 0.6161, + "step": 71 + }, + { + "epoch": 0.0128, + "grad_norm": 0.22920480370521545, + "learning_rate": 4.9980347197336005e-05, + "loss": 0.6808, + "step": 72 + }, + { + "epoch": 0.012977777777777777, + "grad_norm": 0.225337415933609, + "learning_rate": 4.997978977379536e-05, + "loss": 0.5974, + "step": 73 + }, + { + "epoch": 0.013155555555555556, + "grad_norm": 0.22699826955795288, + "learning_rate": 4.997922455835311e-05, + "loss": 0.7156, + "step": 74 + }, + { + "epoch": 0.013333333333333334, + "grad_norm": 0.2172619104385376, + "learning_rate": 4.997865155118557e-05, + "loss": 0.5586, + "step": 75 + }, + { + "epoch": 0.013511111111111111, + "grad_norm": 0.23073641955852509, + "learning_rate": 4.997807075247146e-05, + "loss": 0.5214, + "step": 76 + }, + { + "epoch": 0.01368888888888889, + "grad_norm": 0.22633294761180878, + "learning_rate": 4.997748216239196e-05, + "loss": 0.6347, + "step": 77 + }, + { + "epoch": 0.013866666666666666, + "grad_norm": 0.24868322908878326, + "learning_rate": 4.9976885781130665e-05, + "loss": 0.6219, + "step": 78 + }, + { + "epoch": 0.014044444444444444, + "grad_norm": 0.2360876351594925, + "learning_rate": 4.997628160887361e-05, + "loss": 0.6395, + "step": 79 + }, + { + "epoch": 0.014222222222222223, + "grad_norm": 0.23156440258026123, + "learning_rate": 4.9975669645809244e-05, + "loss": 0.5651, + "step": 80 + }, + { + "epoch": 0.0144, + "grad_norm": 0.250428706407547, + "learning_rate": 4.9975049892128455e-05, + "loss": 0.6373, + "step": 81 + }, + { + "epoch": 0.014577777777777778, + "grad_norm": 0.2502261698246002, + "learning_rate": 4.997442234802456e-05, + "loss": 0.5161, + "step": 82 + }, + { + "epoch": 0.014755555555555555, + "grad_norm": 0.24371939897537231, + "learning_rate": 4.997378701369332e-05, + "loss": 0.5576, + "step": 83 + }, + { + "epoch": 0.014933333333333333, + "grad_norm": 0.2324707806110382, + "learning_rate": 4.997314388933291e-05, + "loss": 0.6313, + "step": 84 + }, + { + "epoch": 0.015111111111111112, + "grad_norm": 0.23092150688171387, + "learning_rate": 4.997249297514394e-05, + "loss": 0.6201, + "step": 85 + }, + { + "epoch": 0.015288888888888888, + "grad_norm": 0.2389262467622757, + "learning_rate": 4.997183427132943e-05, + "loss": 0.6716, + "step": 86 + }, + { + "epoch": 0.015466666666666667, + "grad_norm": 0.23141413927078247, + "learning_rate": 4.9971167778094863e-05, + "loss": 0.6087, + "step": 87 + }, + { + "epoch": 0.015644444444444443, + "grad_norm": 0.2378101497888565, + "learning_rate": 4.997049349564814e-05, + "loss": 0.6394, + "step": 88 + }, + { + "epoch": 0.015822222222222224, + "grad_norm": 0.23467670381069183, + "learning_rate": 4.996981142419959e-05, + "loss": 0.7171, + "step": 89 + }, + { + "epoch": 0.016, + "grad_norm": 0.21919319033622742, + "learning_rate": 4.9969121563961956e-05, + "loss": 0.5858, + "step": 90 + }, + { + "epoch": 0.016177777777777777, + "grad_norm": 0.2534019649028778, + "learning_rate": 4.996842391515044e-05, + "loss": 0.5, + "step": 91 + }, + { + "epoch": 0.016355555555555557, + "grad_norm": 0.2621934115886688, + "learning_rate": 4.996771847798265e-05, + "loss": 0.706, + "step": 92 + }, + { + "epoch": 0.016533333333333334, + "grad_norm": 0.2678978443145752, + "learning_rate": 4.9967005252678634e-05, + "loss": 0.7235, + "step": 93 + }, + { + "epoch": 0.01671111111111111, + "grad_norm": 0.2515285313129425, + "learning_rate": 4.996628423946087e-05, + "loss": 0.468, + "step": 94 + }, + { + "epoch": 0.016888888888888887, + "grad_norm": 0.2523013949394226, + "learning_rate": 4.9965555438554254e-05, + "loss": 0.5585, + "step": 95 + }, + { + "epoch": 0.017066666666666667, + "grad_norm": 0.2534025013446808, + "learning_rate": 4.9964818850186135e-05, + "loss": 0.5239, + "step": 96 + }, + { + "epoch": 0.017244444444444444, + "grad_norm": 0.25027477741241455, + "learning_rate": 4.996407447458626e-05, + "loss": 0.5933, + "step": 97 + }, + { + "epoch": 0.01742222222222222, + "grad_norm": 0.2816220223903656, + "learning_rate": 4.996332231198683e-05, + "loss": 0.5134, + "step": 98 + }, + { + "epoch": 0.0176, + "grad_norm": 0.2923649847507477, + "learning_rate": 4.996256236262245e-05, + "loss": 0.7084, + "step": 99 + }, + { + "epoch": 0.017777777777777778, + "grad_norm": 0.29674360156059265, + "learning_rate": 4.99617946267302e-05, + "loss": 0.7524, + "step": 100 + }, + { + "epoch": 0.017955555555555554, + "grad_norm": 0.21921125054359436, + "learning_rate": 4.996101910454953e-05, + "loss": 0.503, + "step": 101 + }, + { + "epoch": 0.018133333333333335, + "grad_norm": 0.19979022443294525, + "learning_rate": 4.996023579632236e-05, + "loss": 0.5286, + "step": 102 + }, + { + "epoch": 0.01831111111111111, + "grad_norm": 0.19289103150367737, + "learning_rate": 4.995944470229302e-05, + "loss": 0.5486, + "step": 103 + }, + { + "epoch": 0.018488888888888888, + "grad_norm": 0.1895735114812851, + "learning_rate": 4.9958645822708285e-05, + "loss": 0.7548, + "step": 104 + }, + { + "epoch": 0.018666666666666668, + "grad_norm": 0.1836000680923462, + "learning_rate": 4.995783915781734e-05, + "loss": 0.5964, + "step": 105 + }, + { + "epoch": 0.018844444444444445, + "grad_norm": 0.18698160350322723, + "learning_rate": 4.9957024707871806e-05, + "loss": 0.7119, + "step": 106 + }, + { + "epoch": 0.01902222222222222, + "grad_norm": 0.18611574172973633, + "learning_rate": 4.9956202473125736e-05, + "loss": 0.7435, + "step": 107 + }, + { + "epoch": 0.0192, + "grad_norm": 0.161662295460701, + "learning_rate": 4.99553724538356e-05, + "loss": 0.5021, + "step": 108 + }, + { + "epoch": 0.01937777777777778, + "grad_norm": 0.16785134375095367, + "learning_rate": 4.995453465026032e-05, + "loss": 0.6388, + "step": 109 + }, + { + "epoch": 0.019555555555555555, + "grad_norm": 0.15723343193531036, + "learning_rate": 4.9953689062661226e-05, + "loss": 0.6732, + "step": 110 + }, + { + "epoch": 0.019733333333333332, + "grad_norm": 0.23393531143665314, + "learning_rate": 4.995283569130207e-05, + "loss": 0.7251, + "step": 111 + }, + { + "epoch": 0.019911111111111112, + "grad_norm": 0.15552565455436707, + "learning_rate": 4.995197453644905e-05, + "loss": 0.7106, + "step": 112 + }, + { + "epoch": 0.02008888888888889, + "grad_norm": 0.1477118283510208, + "learning_rate": 4.995110559837078e-05, + "loss": 0.6299, + "step": 113 + }, + { + "epoch": 0.020266666666666665, + "grad_norm": 0.12583880126476288, + "learning_rate": 4.995022887733832e-05, + "loss": 0.524, + "step": 114 + }, + { + "epoch": 0.020444444444444446, + "grad_norm": 0.13419458270072937, + "learning_rate": 4.994934437362513e-05, + "loss": 0.5786, + "step": 115 + }, + { + "epoch": 0.020622222222222222, + "grad_norm": 0.149000346660614, + "learning_rate": 4.9948452087507116e-05, + "loss": 0.6443, + "step": 116 + }, + { + "epoch": 0.0208, + "grad_norm": 0.17001931369304657, + "learning_rate": 4.9947552019262605e-05, + "loss": 0.6709, + "step": 117 + }, + { + "epoch": 0.02097777777777778, + "grad_norm": 0.1675909012556076, + "learning_rate": 4.9946644169172355e-05, + "loss": 0.5673, + "step": 118 + }, + { + "epoch": 0.021155555555555556, + "grad_norm": 0.13317342102527618, + "learning_rate": 4.9945728537519555e-05, + "loss": 0.6506, + "step": 119 + }, + { + "epoch": 0.021333333333333333, + "grad_norm": 0.12229342758655548, + "learning_rate": 4.994480512458981e-05, + "loss": 0.5013, + "step": 120 + }, + { + "epoch": 0.021511111111111113, + "grad_norm": 0.10997683554887772, + "learning_rate": 4.994387393067117e-05, + "loss": 0.4243, + "step": 121 + }, + { + "epoch": 0.02168888888888889, + "grad_norm": 0.1417355090379715, + "learning_rate": 4.9942934956054076e-05, + "loss": 0.732, + "step": 122 + }, + { + "epoch": 0.021866666666666666, + "grad_norm": 0.1389738768339157, + "learning_rate": 4.994198820103145e-05, + "loss": 0.5911, + "step": 123 + }, + { + "epoch": 0.022044444444444443, + "grad_norm": 0.1208367645740509, + "learning_rate": 4.994103366589859e-05, + "loss": 0.5884, + "step": 124 + }, + { + "epoch": 0.022222222222222223, + "grad_norm": 0.11839725822210312, + "learning_rate": 4.9940071350953255e-05, + "loss": 0.4604, + "step": 125 + }, + { + "epoch": 0.0224, + "grad_norm": 0.1459076851606369, + "learning_rate": 4.993910125649561e-05, + "loss": 0.5763, + "step": 126 + }, + { + "epoch": 0.022577777777777776, + "grad_norm": 0.13375116884708405, + "learning_rate": 4.993812338282826e-05, + "loss": 0.5362, + "step": 127 + }, + { + "epoch": 0.022755555555555557, + "grad_norm": 0.11560770124197006, + "learning_rate": 4.993713773025623e-05, + "loss": 0.5712, + "step": 128 + }, + { + "epoch": 0.022933333333333333, + "grad_norm": 0.14490965008735657, + "learning_rate": 4.993614429908697e-05, + "loss": 0.6323, + "step": 129 + }, + { + "epoch": 0.02311111111111111, + "grad_norm": 0.12258388847112656, + "learning_rate": 4.993514308963036e-05, + "loss": 0.5886, + "step": 130 + }, + { + "epoch": 0.02328888888888889, + "grad_norm": 0.12152822315692902, + "learning_rate": 4.993413410219871e-05, + "loss": 0.599, + "step": 131 + }, + { + "epoch": 0.023466666666666667, + "grad_norm": 0.12627656757831573, + "learning_rate": 4.993311733710676e-05, + "loss": 0.499, + "step": 132 + }, + { + "epoch": 0.023644444444444444, + "grad_norm": 0.11848804354667664, + "learning_rate": 4.993209279467164e-05, + "loss": 0.479, + "step": 133 + }, + { + "epoch": 0.023822222222222224, + "grad_norm": 0.11447634547948837, + "learning_rate": 4.993106047521296e-05, + "loss": 0.6705, + "step": 134 + }, + { + "epoch": 0.024, + "grad_norm": 0.12298741191625595, + "learning_rate": 4.993002037905272e-05, + "loss": 0.6076, + "step": 135 + }, + { + "epoch": 0.024177777777777777, + "grad_norm": 0.13246573507785797, + "learning_rate": 4.992897250651535e-05, + "loss": 0.5358, + "step": 136 + }, + { + "epoch": 0.024355555555555554, + "grad_norm": 0.10114778578281403, + "learning_rate": 4.992791685792772e-05, + "loss": 0.5443, + "step": 137 + }, + { + "epoch": 0.024533333333333334, + "grad_norm": 0.12766729295253754, + "learning_rate": 4.992685343361911e-05, + "loss": 0.5541, + "step": 138 + }, + { + "epoch": 0.02471111111111111, + "grad_norm": 0.133477121591568, + "learning_rate": 4.992578223392124e-05, + "loss": 0.5883, + "step": 139 + }, + { + "epoch": 0.024888888888888887, + "grad_norm": 0.1214534193277359, + "learning_rate": 4.9924703259168244e-05, + "loss": 0.4744, + "step": 140 + }, + { + "epoch": 0.025066666666666668, + "grad_norm": 0.11343647539615631, + "learning_rate": 4.9923616509696683e-05, + "loss": 0.4613, + "step": 141 + }, + { + "epoch": 0.025244444444444444, + "grad_norm": 0.13983483612537384, + "learning_rate": 4.992252198584554e-05, + "loss": 0.5951, + "step": 142 + }, + { + "epoch": 0.02542222222222222, + "grad_norm": 0.1549396812915802, + "learning_rate": 4.992141968795623e-05, + "loss": 0.5171, + "step": 143 + }, + { + "epoch": 0.0256, + "grad_norm": 0.12246689200401306, + "learning_rate": 4.9920309616372596e-05, + "loss": 0.511, + "step": 144 + }, + { + "epoch": 0.025777777777777778, + "grad_norm": 0.23052345216274261, + "learning_rate": 4.9919191771440905e-05, + "loss": 0.527, + "step": 145 + }, + { + "epoch": 0.025955555555555555, + "grad_norm": 0.1364292949438095, + "learning_rate": 4.9918066153509834e-05, + "loss": 0.5334, + "step": 146 + }, + { + "epoch": 0.026133333333333335, + "grad_norm": 0.15990453958511353, + "learning_rate": 4.99169327629305e-05, + "loss": 0.5076, + "step": 147 + }, + { + "epoch": 0.02631111111111111, + "grad_norm": 0.15343108773231506, + "learning_rate": 4.991579160005644e-05, + "loss": 0.5342, + "step": 148 + }, + { + "epoch": 0.026488888888888888, + "grad_norm": 0.12868401408195496, + "learning_rate": 4.99146426652436e-05, + "loss": 0.5119, + "step": 149 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 0.1964123249053955, + "learning_rate": 4.991348595885039e-05, + "loss": 0.7665, + "step": 150 + }, + { + "epoch": 0.026844444444444445, + "grad_norm": 0.26083841919898987, + "learning_rate": 4.991232148123761e-05, + "loss": 0.5343, + "step": 151 + }, + { + "epoch": 0.027022222222222222, + "grad_norm": 0.1395082175731659, + "learning_rate": 4.991114923276849e-05, + "loss": 0.7237, + "step": 152 + }, + { + "epoch": 0.0272, + "grad_norm": 0.12086386233568192, + "learning_rate": 4.9909969213808683e-05, + "loss": 0.6929, + "step": 153 + }, + { + "epoch": 0.02737777777777778, + "grad_norm": 0.13468104600906372, + "learning_rate": 4.990878142472628e-05, + "loss": 0.6146, + "step": 154 + }, + { + "epoch": 0.027555555555555555, + "grad_norm": 0.12088959664106369, + "learning_rate": 4.990758586589178e-05, + "loss": 0.6594, + "step": 155 + }, + { + "epoch": 0.027733333333333332, + "grad_norm": 0.1024961844086647, + "learning_rate": 4.990638253767812e-05, + "loss": 0.5792, + "step": 156 + }, + { + "epoch": 0.027911111111111112, + "grad_norm": 0.11372818052768707, + "learning_rate": 4.990517144046064e-05, + "loss": 0.5113, + "step": 157 + }, + { + "epoch": 0.02808888888888889, + "grad_norm": 0.13793951272964478, + "learning_rate": 4.990395257461712e-05, + "loss": 0.627, + "step": 158 + }, + { + "epoch": 0.028266666666666666, + "grad_norm": 0.12339150905609131, + "learning_rate": 4.990272594052776e-05, + "loss": 0.5823, + "step": 159 + }, + { + "epoch": 0.028444444444444446, + "grad_norm": 0.09628098458051682, + "learning_rate": 4.9901491538575185e-05, + "loss": 0.5364, + "step": 160 + }, + { + "epoch": 0.028622222222222223, + "grad_norm": 0.11706215143203735, + "learning_rate": 4.9900249369144434e-05, + "loss": 0.6852, + "step": 161 + }, + { + "epoch": 0.0288, + "grad_norm": 0.1168065071105957, + "learning_rate": 4.9898999432622974e-05, + "loss": 0.6664, + "step": 162 + }, + { + "epoch": 0.02897777777777778, + "grad_norm": 0.10837826877832413, + "learning_rate": 4.9897741729400705e-05, + "loss": 0.5942, + "step": 163 + }, + { + "epoch": 0.029155555555555556, + "grad_norm": 0.12275711447000504, + "learning_rate": 4.989647625986993e-05, + "loss": 0.5265, + "step": 164 + }, + { + "epoch": 0.029333333333333333, + "grad_norm": 0.09790056943893433, + "learning_rate": 4.9895203024425385e-05, + "loss": 0.5471, + "step": 165 + }, + { + "epoch": 0.02951111111111111, + "grad_norm": 0.0930943414568901, + "learning_rate": 4.9893922023464236e-05, + "loss": 0.6063, + "step": 166 + }, + { + "epoch": 0.02968888888888889, + "grad_norm": 0.11883064359426498, + "learning_rate": 4.989263325738605e-05, + "loss": 0.6441, + "step": 167 + }, + { + "epoch": 0.029866666666666666, + "grad_norm": 0.1502642184495926, + "learning_rate": 4.9891336726592844e-05, + "loss": 0.6521, + "step": 168 + }, + { + "epoch": 0.030044444444444443, + "grad_norm": 0.10600157827138901, + "learning_rate": 4.989003243148904e-05, + "loss": 0.5837, + "step": 169 + }, + { + "epoch": 0.030222222222222223, + "grad_norm": 0.1090245321393013, + "learning_rate": 4.988872037248148e-05, + "loss": 0.5593, + "step": 170 + }, + { + "epoch": 0.0304, + "grad_norm": 0.10448866337537766, + "learning_rate": 4.988740054997943e-05, + "loss": 0.5197, + "step": 171 + }, + { + "epoch": 0.030577777777777777, + "grad_norm": 0.11432240158319473, + "learning_rate": 4.988607296439458e-05, + "loss": 0.4996, + "step": 172 + }, + { + "epoch": 0.030755555555555557, + "grad_norm": 0.11317683011293411, + "learning_rate": 4.988473761614105e-05, + "loss": 0.4481, + "step": 173 + }, + { + "epoch": 0.030933333333333334, + "grad_norm": 0.1050187200307846, + "learning_rate": 4.9883394505635364e-05, + "loss": 0.6967, + "step": 174 + }, + { + "epoch": 0.03111111111111111, + "grad_norm": 0.09667979925870895, + "learning_rate": 4.988204363329648e-05, + "loss": 0.5153, + "step": 175 + }, + { + "epoch": 0.03128888888888889, + "grad_norm": 0.12330680340528488, + "learning_rate": 4.988068499954578e-05, + "loss": 0.5943, + "step": 176 + }, + { + "epoch": 0.031466666666666664, + "grad_norm": 0.10915859788656235, + "learning_rate": 4.987931860480705e-05, + "loss": 0.5495, + "step": 177 + }, + { + "epoch": 0.03164444444444445, + "grad_norm": 0.10273627936840057, + "learning_rate": 4.987794444950651e-05, + "loss": 0.6353, + "step": 178 + }, + { + "epoch": 0.031822222222222224, + "grad_norm": 0.11179158836603165, + "learning_rate": 4.98765625340728e-05, + "loss": 0.5867, + "step": 179 + }, + { + "epoch": 0.032, + "grad_norm": 0.09845339506864548, + "learning_rate": 4.987517285893697e-05, + "loss": 0.6299, + "step": 180 + }, + { + "epoch": 0.03217777777777778, + "grad_norm": 0.0992949977517128, + "learning_rate": 4.987377542453251e-05, + "loss": 0.555, + "step": 181 + }, + { + "epoch": 0.032355555555555554, + "grad_norm": 0.11363174766302109, + "learning_rate": 4.987237023129531e-05, + "loss": 0.5736, + "step": 182 + }, + { + "epoch": 0.03253333333333333, + "grad_norm": 0.12147802114486694, + "learning_rate": 4.98709572796637e-05, + "loss": 0.6545, + "step": 183 + }, + { + "epoch": 0.032711111111111114, + "grad_norm": 0.15247713029384613, + "learning_rate": 4.986953657007841e-05, + "loss": 0.7735, + "step": 184 + }, + { + "epoch": 0.03288888888888889, + "grad_norm": 0.10348669439554214, + "learning_rate": 4.9868108102982604e-05, + "loss": 0.5527, + "step": 185 + }, + { + "epoch": 0.03306666666666667, + "grad_norm": 0.11538142710924149, + "learning_rate": 4.986667187882186e-05, + "loss": 0.6915, + "step": 186 + }, + { + "epoch": 0.033244444444444445, + "grad_norm": 0.10936667025089264, + "learning_rate": 4.986522789804417e-05, + "loss": 0.4876, + "step": 187 + }, + { + "epoch": 0.03342222222222222, + "grad_norm": 0.10721372067928314, + "learning_rate": 4.9863776161099964e-05, + "loss": 0.5513, + "step": 188 + }, + { + "epoch": 0.0336, + "grad_norm": 0.12088362872600555, + "learning_rate": 4.986231666844208e-05, + "loss": 0.6191, + "step": 189 + }, + { + "epoch": 0.033777777777777775, + "grad_norm": 0.1014571562409401, + "learning_rate": 4.9860849420525766e-05, + "loss": 0.4806, + "step": 190 + }, + { + "epoch": 0.03395555555555556, + "grad_norm": 0.09837964922189713, + "learning_rate": 4.98593744178087e-05, + "loss": 0.5334, + "step": 191 + }, + { + "epoch": 0.034133333333333335, + "grad_norm": 0.33532989025115967, + "learning_rate": 4.9857891660750986e-05, + "loss": 0.653, + "step": 192 + }, + { + "epoch": 0.03431111111111111, + "grad_norm": 0.12047170102596283, + "learning_rate": 4.9856401149815126e-05, + "loss": 0.4128, + "step": 193 + }, + { + "epoch": 0.03448888888888889, + "grad_norm": 0.168871209025383, + "learning_rate": 4.985490288546606e-05, + "loss": 0.513, + "step": 194 + }, + { + "epoch": 0.034666666666666665, + "grad_norm": 0.11047204583883286, + "learning_rate": 4.985339686817113e-05, + "loss": 0.4705, + "step": 195 + }, + { + "epoch": 0.03484444444444444, + "grad_norm": 0.1479148268699646, + "learning_rate": 4.985188309840012e-05, + "loss": 0.4471, + "step": 196 + }, + { + "epoch": 0.035022222222222225, + "grad_norm": 0.11363429576158524, + "learning_rate": 4.985036157662521e-05, + "loss": 0.4675, + "step": 197 + }, + { + "epoch": 0.0352, + "grad_norm": 0.17608030140399933, + "learning_rate": 4.984883230332099e-05, + "loss": 0.6166, + "step": 198 + }, + { + "epoch": 0.03537777777777778, + "grad_norm": 0.11926349252462387, + "learning_rate": 4.9847295278964514e-05, + "loss": 0.4445, + "step": 199 + }, + { + "epoch": 0.035555555555555556, + "grad_norm": 0.146172896027565, + "learning_rate": 4.9845750504035195e-05, + "loss": 0.6981, + "step": 200 + }, + { + "epoch": 0.03573333333333333, + "grad_norm": 0.10850780457258224, + "learning_rate": 4.984419797901491e-05, + "loss": 0.4486, + "step": 201 + }, + { + "epoch": 0.03591111111111111, + "grad_norm": 0.09656495600938797, + "learning_rate": 4.984263770438793e-05, + "loss": 0.5468, + "step": 202 + }, + { + "epoch": 0.036088888888888886, + "grad_norm": 0.12192872911691666, + "learning_rate": 4.984106968064095e-05, + "loss": 0.4336, + "step": 203 + }, + { + "epoch": 0.03626666666666667, + "grad_norm": 0.10992716252803802, + "learning_rate": 4.983949390826308e-05, + "loss": 0.6243, + "step": 204 + }, + { + "epoch": 0.036444444444444446, + "grad_norm": 0.1067366972565651, + "learning_rate": 4.9837910387745845e-05, + "loss": 0.6188, + "step": 205 + }, + { + "epoch": 0.03662222222222222, + "grad_norm": 0.1047806516289711, + "learning_rate": 4.983631911958319e-05, + "loss": 0.6255, + "step": 206 + }, + { + "epoch": 0.0368, + "grad_norm": 0.15057428181171417, + "learning_rate": 4.9834720104271484e-05, + "loss": 0.6211, + "step": 207 + }, + { + "epoch": 0.036977777777777776, + "grad_norm": 0.1308983415365219, + "learning_rate": 4.98331133423095e-05, + "loss": 0.5309, + "step": 208 + }, + { + "epoch": 0.03715555555555555, + "grad_norm": 0.14646312594413757, + "learning_rate": 4.983149883419842e-05, + "loss": 0.5669, + "step": 209 + }, + { + "epoch": 0.037333333333333336, + "grad_norm": 0.09577513486146927, + "learning_rate": 4.982987658044188e-05, + "loss": 0.4696, + "step": 210 + }, + { + "epoch": 0.03751111111111111, + "grad_norm": 0.09543467313051224, + "learning_rate": 4.982824658154589e-05, + "loss": 0.5048, + "step": 211 + }, + { + "epoch": 0.03768888888888889, + "grad_norm": 0.09927001595497131, + "learning_rate": 4.982660883801889e-05, + "loss": 0.614, + "step": 212 + }, + { + "epoch": 0.037866666666666667, + "grad_norm": 0.10242555290460587, + "learning_rate": 4.982496335037175e-05, + "loss": 0.6597, + "step": 213 + }, + { + "epoch": 0.03804444444444444, + "grad_norm": 0.10295635461807251, + "learning_rate": 4.982331011911774e-05, + "loss": 0.5872, + "step": 214 + }, + { + "epoch": 0.03822222222222222, + "grad_norm": 0.14086520671844482, + "learning_rate": 4.9821649144772545e-05, + "loss": 0.6865, + "step": 215 + }, + { + "epoch": 0.0384, + "grad_norm": 0.10746883600950241, + "learning_rate": 4.981998042785427e-05, + "loss": 0.6699, + "step": 216 + }, + { + "epoch": 0.03857777777777778, + "grad_norm": 0.08688037097454071, + "learning_rate": 4.981830396888344e-05, + "loss": 0.5398, + "step": 217 + }, + { + "epoch": 0.03875555555555556, + "grad_norm": 0.10395106673240662, + "learning_rate": 4.981661976838299e-05, + "loss": 0.5715, + "step": 218 + }, + { + "epoch": 0.038933333333333334, + "grad_norm": 0.0961478054523468, + "learning_rate": 4.9814927826878256e-05, + "loss": 0.4944, + "step": 219 + }, + { + "epoch": 0.03911111111111111, + "grad_norm": 0.10356276482343674, + "learning_rate": 4.981322814489703e-05, + "loss": 0.6105, + "step": 220 + }, + { + "epoch": 0.03928888888888889, + "grad_norm": 0.10357530415058136, + "learning_rate": 4.9811520722969465e-05, + "loss": 0.7405, + "step": 221 + }, + { + "epoch": 0.039466666666666664, + "grad_norm": 0.11511997133493423, + "learning_rate": 4.980980556162816e-05, + "loss": 0.6908, + "step": 222 + }, + { + "epoch": 0.03964444444444445, + "grad_norm": 0.10484056919813156, + "learning_rate": 4.980808266140813e-05, + "loss": 0.4961, + "step": 223 + }, + { + "epoch": 0.039822222222222224, + "grad_norm": 0.10967821627855301, + "learning_rate": 4.980635202284679e-05, + "loss": 0.661, + "step": 224 + }, + { + "epoch": 0.04, + "grad_norm": 0.10848290473222733, + "learning_rate": 4.980461364648398e-05, + "loss": 0.5437, + "step": 225 + }, + { + "epoch": 0.04017777777777778, + "grad_norm": 0.11733432114124298, + "learning_rate": 4.980286753286195e-05, + "loss": 0.6287, + "step": 226 + }, + { + "epoch": 0.040355555555555554, + "grad_norm": 0.09216199070215225, + "learning_rate": 4.980111368252535e-05, + "loss": 0.4688, + "step": 227 + }, + { + "epoch": 0.04053333333333333, + "grad_norm": 0.09345618635416031, + "learning_rate": 4.9799352096021266e-05, + "loss": 0.5279, + "step": 228 + }, + { + "epoch": 0.040711111111111115, + "grad_norm": 0.21244564652442932, + "learning_rate": 4.979758277389919e-05, + "loss": 0.6368, + "step": 229 + }, + { + "epoch": 0.04088888888888889, + "grad_norm": 0.24040700495243073, + "learning_rate": 4.9795805716711e-05, + "loss": 0.4829, + "step": 230 + }, + { + "epoch": 0.04106666666666667, + "grad_norm": 0.12090496718883514, + "learning_rate": 4.9794020925011044e-05, + "loss": 0.6549, + "step": 231 + }, + { + "epoch": 0.041244444444444445, + "grad_norm": 0.07658255100250244, + "learning_rate": 4.979222839935602e-05, + "loss": 0.4442, + "step": 232 + }, + { + "epoch": 0.04142222222222222, + "grad_norm": 0.1340865194797516, + "learning_rate": 4.979042814030509e-05, + "loss": 0.6938, + "step": 233 + }, + { + "epoch": 0.0416, + "grad_norm": 0.15205548703670502, + "learning_rate": 4.978862014841979e-05, + "loss": 0.4328, + "step": 234 + }, + { + "epoch": 0.041777777777777775, + "grad_norm": 0.10344212502241135, + "learning_rate": 4.9786804424264085e-05, + "loss": 0.4979, + "step": 235 + }, + { + "epoch": 0.04195555555555556, + "grad_norm": 0.10293713957071304, + "learning_rate": 4.978498096840436e-05, + "loss": 0.5674, + "step": 236 + }, + { + "epoch": 0.042133333333333335, + "grad_norm": 0.15762662887573242, + "learning_rate": 4.9783149781409404e-05, + "loss": 0.5476, + "step": 237 + }, + { + "epoch": 0.04231111111111111, + "grad_norm": 0.09386316686868668, + "learning_rate": 4.9781310863850405e-05, + "loss": 0.5492, + "step": 238 + }, + { + "epoch": 0.04248888888888889, + "grad_norm": 0.11153044551610947, + "learning_rate": 4.977946421630098e-05, + "loss": 0.3998, + "step": 239 + }, + { + "epoch": 0.042666666666666665, + "grad_norm": 0.09900151938199997, + "learning_rate": 4.977760983933714e-05, + "loss": 0.4811, + "step": 240 + }, + { + "epoch": 0.04284444444444444, + "grad_norm": 0.10927964746952057, + "learning_rate": 4.977574773353732e-05, + "loss": 0.4628, + "step": 241 + }, + { + "epoch": 0.043022222222222226, + "grad_norm": 0.12449217587709427, + "learning_rate": 4.977387789948238e-05, + "loss": 0.5396, + "step": 242 + }, + { + "epoch": 0.0432, + "grad_norm": 0.10435616970062256, + "learning_rate": 4.977200033775555e-05, + "loss": 0.4586, + "step": 243 + }, + { + "epoch": 0.04337777777777778, + "grad_norm": 0.12148528546094894, + "learning_rate": 4.977011504894252e-05, + "loss": 0.5313, + "step": 244 + }, + { + "epoch": 0.043555555555555556, + "grad_norm": 0.10629341751337051, + "learning_rate": 4.976822203363135e-05, + "loss": 0.4647, + "step": 245 + }, + { + "epoch": 0.04373333333333333, + "grad_norm": 0.12418622523546219, + "learning_rate": 4.976632129241252e-05, + "loss": 0.5073, + "step": 246 + }, + { + "epoch": 0.04391111111111111, + "grad_norm": 0.11972734332084656, + "learning_rate": 4.9764412825878943e-05, + "loss": 0.5648, + "step": 247 + }, + { + "epoch": 0.044088888888888886, + "grad_norm": 0.15979549288749695, + "learning_rate": 4.97624966346259e-05, + "loss": 0.5439, + "step": 248 + }, + { + "epoch": 0.04426666666666667, + "grad_norm": 0.1805201768875122, + "learning_rate": 4.976057271925113e-05, + "loss": 0.6221, + "step": 249 + }, + { + "epoch": 0.044444444444444446, + "grad_norm": 0.1387433558702469, + "learning_rate": 4.975864108035474e-05, + "loss": 0.7004, + "step": 250 + }, + { + "epoch": 0.04462222222222222, + "grad_norm": 0.12188535928726196, + "learning_rate": 4.975670171853926e-05, + "loss": 0.6453, + "step": 251 + }, + { + "epoch": 0.0448, + "grad_norm": 0.1226486787199974, + "learning_rate": 4.975475463440964e-05, + "loss": 0.5931, + "step": 252 + }, + { + "epoch": 0.044977777777777776, + "grad_norm": 0.1255352646112442, + "learning_rate": 4.975279982857324e-05, + "loss": 0.6342, + "step": 253 + }, + { + "epoch": 0.04515555555555555, + "grad_norm": 0.1070035845041275, + "learning_rate": 4.9750837301639796e-05, + "loss": 0.4583, + "step": 254 + }, + { + "epoch": 0.04533333333333334, + "grad_norm": 0.0999893918633461, + "learning_rate": 4.974886705422149e-05, + "loss": 0.6701, + "step": 255 + }, + { + "epoch": 0.04551111111111111, + "grad_norm": 0.09253033995628357, + "learning_rate": 4.9746889086932895e-05, + "loss": 0.4827, + "step": 256 + }, + { + "epoch": 0.04568888888888889, + "grad_norm": 0.09987790137529373, + "learning_rate": 4.9744903400391e-05, + "loss": 0.5833, + "step": 257 + }, + { + "epoch": 0.04586666666666667, + "grad_norm": 0.10074815154075623, + "learning_rate": 4.974290999521519e-05, + "loss": 0.6793, + "step": 258 + }, + { + "epoch": 0.04604444444444444, + "grad_norm": 0.09924104064702988, + "learning_rate": 4.974090887202726e-05, + "loss": 0.5025, + "step": 259 + }, + { + "epoch": 0.04622222222222222, + "grad_norm": 0.1000429168343544, + "learning_rate": 4.973890003145143e-05, + "loss": 0.4784, + "step": 260 + }, + { + "epoch": 0.0464, + "grad_norm": 0.12252362817525864, + "learning_rate": 4.973688347411431e-05, + "loss": 0.537, + "step": 261 + }, + { + "epoch": 0.04657777777777778, + "grad_norm": 0.10239475965499878, + "learning_rate": 4.9734859200644905e-05, + "loss": 0.6376, + "step": 262 + }, + { + "epoch": 0.04675555555555556, + "grad_norm": 0.1513953059911728, + "learning_rate": 4.973282721167467e-05, + "loss": 0.6185, + "step": 263 + }, + { + "epoch": 0.046933333333333334, + "grad_norm": 0.13385048508644104, + "learning_rate": 4.973078750783742e-05, + "loss": 0.6077, + "step": 264 + }, + { + "epoch": 0.04711111111111111, + "grad_norm": 0.10267174243927002, + "learning_rate": 4.97287400897694e-05, + "loss": 0.4917, + "step": 265 + }, + { + "epoch": 0.04728888888888889, + "grad_norm": 0.1471744030714035, + "learning_rate": 4.9726684958109266e-05, + "loss": 0.6737, + "step": 266 + }, + { + "epoch": 0.047466666666666664, + "grad_norm": 0.10583169758319855, + "learning_rate": 4.972462211349806e-05, + "loss": 0.5044, + "step": 267 + }, + { + "epoch": 0.04764444444444445, + "grad_norm": 0.09750795364379883, + "learning_rate": 4.972255155657925e-05, + "loss": 0.5493, + "step": 268 + }, + { + "epoch": 0.047822222222222224, + "grad_norm": 0.1117471233010292, + "learning_rate": 4.9720473287998695e-05, + "loss": 0.5088, + "step": 269 + }, + { + "epoch": 0.048, + "grad_norm": 0.11861962080001831, + "learning_rate": 4.9718387308404675e-05, + "loss": 0.5972, + "step": 270 + }, + { + "epoch": 0.04817777777777778, + "grad_norm": 0.11881297081708908, + "learning_rate": 4.971629361844785e-05, + "loss": 0.4927, + "step": 271 + }, + { + "epoch": 0.048355555555555554, + "grad_norm": 0.11163926869630814, + "learning_rate": 4.9714192218781316e-05, + "loss": 0.5077, + "step": 272 + }, + { + "epoch": 0.04853333333333333, + "grad_norm": 0.10126686096191406, + "learning_rate": 4.9712083110060556e-05, + "loss": 0.641, + "step": 273 + }, + { + "epoch": 0.04871111111111111, + "grad_norm": 0.12614768743515015, + "learning_rate": 4.9709966292943455e-05, + "loss": 0.7042, + "step": 274 + }, + { + "epoch": 0.04888888888888889, + "grad_norm": 0.15151461958885193, + "learning_rate": 4.9707841768090314e-05, + "loss": 0.5686, + "step": 275 + }, + { + "epoch": 0.04906666666666667, + "grad_norm": 0.09307102113962173, + "learning_rate": 4.9705709536163824e-05, + "loss": 0.5139, + "step": 276 + }, + { + "epoch": 0.049244444444444445, + "grad_norm": 0.11126819252967834, + "learning_rate": 4.970356959782909e-05, + "loss": 0.5995, + "step": 277 + }, + { + "epoch": 0.04942222222222222, + "grad_norm": 0.13643859326839447, + "learning_rate": 4.970142195375363e-05, + "loss": 0.6118, + "step": 278 + }, + { + "epoch": 0.0496, + "grad_norm": 0.15757425129413605, + "learning_rate": 4.9699266604607355e-05, + "loss": 0.4944, + "step": 279 + }, + { + "epoch": 0.049777777777777775, + "grad_norm": 0.14190055429935455, + "learning_rate": 4.9697103551062556e-05, + "loss": 0.5835, + "step": 280 + }, + { + "epoch": 0.04995555555555556, + "grad_norm": 0.09707939624786377, + "learning_rate": 4.969493279379398e-05, + "loss": 0.5284, + "step": 281 + }, + { + "epoch": 0.050133333333333335, + "grad_norm": 0.103645920753479, + "learning_rate": 4.969275433347872e-05, + "loss": 0.6776, + "step": 282 + }, + { + "epoch": 0.05031111111111111, + "grad_norm": 0.13587555289268494, + "learning_rate": 4.969056817079633e-05, + "loss": 0.5261, + "step": 283 + }, + { + "epoch": 0.05048888888888889, + "grad_norm": 0.11946401000022888, + "learning_rate": 4.9688374306428696e-05, + "loss": 0.4922, + "step": 284 + }, + { + "epoch": 0.050666666666666665, + "grad_norm": 0.10667511820793152, + "learning_rate": 4.968617274106019e-05, + "loss": 0.6504, + "step": 285 + }, + { + "epoch": 0.05084444444444444, + "grad_norm": 0.1257142573595047, + "learning_rate": 4.968396347537751e-05, + "loss": 0.5528, + "step": 286 + }, + { + "epoch": 0.05102222222222222, + "grad_norm": 0.09634239226579666, + "learning_rate": 4.9681746510069805e-05, + "loss": 0.5362, + "step": 287 + }, + { + "epoch": 0.0512, + "grad_norm": 0.09978805482387543, + "learning_rate": 4.9679521845828604e-05, + "loss": 0.4487, + "step": 288 + }, + { + "epoch": 0.05137777777777778, + "grad_norm": 0.10307997465133667, + "learning_rate": 4.967728948334784e-05, + "loss": 0.6163, + "step": 289 + }, + { + "epoch": 0.051555555555555556, + "grad_norm": 0.10950716584920883, + "learning_rate": 4.967504942332385e-05, + "loss": 0.569, + "step": 290 + }, + { + "epoch": 0.05173333333333333, + "grad_norm": 0.11659679561853409, + "learning_rate": 4.967280166645538e-05, + "loss": 0.5702, + "step": 291 + }, + { + "epoch": 0.05191111111111111, + "grad_norm": 0.08770512789487839, + "learning_rate": 4.967054621344356e-05, + "loss": 0.4102, + "step": 292 + }, + { + "epoch": 0.052088888888888886, + "grad_norm": 0.09629162400960922, + "learning_rate": 4.966828306499193e-05, + "loss": 0.4858, + "step": 293 + }, + { + "epoch": 0.05226666666666667, + "grad_norm": 0.14017847180366516, + "learning_rate": 4.9666012221806434e-05, + "loss": 0.4999, + "step": 294 + }, + { + "epoch": 0.052444444444444446, + "grad_norm": 0.13698740303516388, + "learning_rate": 4.966373368459541e-05, + "loss": 0.5104, + "step": 295 + }, + { + "epoch": 0.05262222222222222, + "grad_norm": 0.15850208699703217, + "learning_rate": 4.966144745406961e-05, + "loss": 0.577, + "step": 296 + }, + { + "epoch": 0.0528, + "grad_norm": 0.13207827508449554, + "learning_rate": 4.965915353094215e-05, + "loss": 0.5899, + "step": 297 + }, + { + "epoch": 0.052977777777777776, + "grad_norm": 0.10734611749649048, + "learning_rate": 4.965685191592859e-05, + "loss": 0.4573, + "step": 298 + }, + { + "epoch": 0.05315555555555555, + "grad_norm": 0.15234452486038208, + "learning_rate": 4.965454260974685e-05, + "loss": 0.5435, + "step": 299 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 0.13856150209903717, + "learning_rate": 4.9652225613117284e-05, + "loss": 0.5718, + "step": 300 + }, + { + "epoch": 0.05351111111111111, + "grad_norm": 0.09088527411222458, + "learning_rate": 4.964990092676263e-05, + "loss": 0.4754, + "step": 301 + }, + { + "epoch": 0.05368888888888889, + "grad_norm": 0.11890742927789688, + "learning_rate": 4.964756855140801e-05, + "loss": 0.6654, + "step": 302 + }, + { + "epoch": 0.05386666666666667, + "grad_norm": 0.10593919456005096, + "learning_rate": 4.964522848778096e-05, + "loss": 0.5128, + "step": 303 + }, + { + "epoch": 0.054044444444444444, + "grad_norm": 0.11388156563043594, + "learning_rate": 4.964288073661142e-05, + "loss": 0.5277, + "step": 304 + }, + { + "epoch": 0.05422222222222222, + "grad_norm": 0.10140350461006165, + "learning_rate": 4.964052529863171e-05, + "loss": 0.6087, + "step": 305 + }, + { + "epoch": 0.0544, + "grad_norm": 0.1306110918521881, + "learning_rate": 4.963816217457657e-05, + "loss": 0.514, + "step": 306 + }, + { + "epoch": 0.05457777777777778, + "grad_norm": 0.0984165146946907, + "learning_rate": 4.963579136518312e-05, + "loss": 0.6198, + "step": 307 + }, + { + "epoch": 0.05475555555555556, + "grad_norm": 0.1110251396894455, + "learning_rate": 4.9633412871190873e-05, + "loss": 0.505, + "step": 308 + }, + { + "epoch": 0.054933333333333334, + "grad_norm": 0.0856340229511261, + "learning_rate": 4.9631026693341764e-05, + "loss": 0.5793, + "step": 309 + }, + { + "epoch": 0.05511111111111111, + "grad_norm": 0.09711772203445435, + "learning_rate": 4.96286328323801e-05, + "loss": 0.5052, + "step": 310 + }, + { + "epoch": 0.05528888888888889, + "grad_norm": 0.14198853075504303, + "learning_rate": 4.9626231289052596e-05, + "loss": 0.5531, + "step": 311 + }, + { + "epoch": 0.055466666666666664, + "grad_norm": 0.12941457331180573, + "learning_rate": 4.9623822064108364e-05, + "loss": 0.6125, + "step": 312 + }, + { + "epoch": 0.05564444444444445, + "grad_norm": 0.09148528426885605, + "learning_rate": 4.96214051582989e-05, + "loss": 0.6196, + "step": 313 + }, + { + "epoch": 0.055822222222222224, + "grad_norm": 0.10153131932020187, + "learning_rate": 4.96189805723781e-05, + "loss": 0.4241, + "step": 314 + }, + { + "epoch": 0.056, + "grad_norm": 0.13130423426628113, + "learning_rate": 4.961654830710229e-05, + "loss": 0.612, + "step": 315 + }, + { + "epoch": 0.05617777777777778, + "grad_norm": 0.12414809316396713, + "learning_rate": 4.9614108363230135e-05, + "loss": 0.5504, + "step": 316 + }, + { + "epoch": 0.056355555555555555, + "grad_norm": 0.1427779495716095, + "learning_rate": 4.961166074152274e-05, + "loss": 0.5597, + "step": 317 + }, + { + "epoch": 0.05653333333333333, + "grad_norm": 0.1236666664481163, + "learning_rate": 4.9609205442743566e-05, + "loss": 0.5376, + "step": 318 + }, + { + "epoch": 0.05671111111111111, + "grad_norm": 0.0970771312713623, + "learning_rate": 4.960674246765851e-05, + "loss": 0.6075, + "step": 319 + }, + { + "epoch": 0.05688888888888889, + "grad_norm": 0.12108790129423141, + "learning_rate": 4.9604271817035834e-05, + "loss": 0.4894, + "step": 320 + }, + { + "epoch": 0.05706666666666667, + "grad_norm": 0.12736080586910248, + "learning_rate": 4.960179349164621e-05, + "loss": 0.6127, + "step": 321 + }, + { + "epoch": 0.057244444444444445, + "grad_norm": 0.1136578842997551, + "learning_rate": 4.959930749226269e-05, + "loss": 0.4815, + "step": 322 + }, + { + "epoch": 0.05742222222222222, + "grad_norm": 0.10089968144893646, + "learning_rate": 4.959681381966073e-05, + "loss": 0.6272, + "step": 323 + }, + { + "epoch": 0.0576, + "grad_norm": 0.130580335855484, + "learning_rate": 4.9594312474618175e-05, + "loss": 0.683, + "step": 324 + }, + { + "epoch": 0.057777777777777775, + "grad_norm": 0.13355344533920288, + "learning_rate": 4.959180345791528e-05, + "loss": 0.7352, + "step": 325 + }, + { + "epoch": 0.05795555555555556, + "grad_norm": 0.09264720231294632, + "learning_rate": 4.9589286770334654e-05, + "loss": 0.5056, + "step": 326 + }, + { + "epoch": 0.058133333333333335, + "grad_norm": 0.15634864568710327, + "learning_rate": 4.9586762412661333e-05, + "loss": 0.5626, + "step": 327 + }, + { + "epoch": 0.05831111111111111, + "grad_norm": 0.115487240254879, + "learning_rate": 4.958423038568274e-05, + "loss": 0.419, + "step": 328 + }, + { + "epoch": 0.05848888888888889, + "grad_norm": 0.11722230911254883, + "learning_rate": 4.958169069018869e-05, + "loss": 0.5911, + "step": 329 + }, + { + "epoch": 0.058666666666666666, + "grad_norm": 0.13027794659137726, + "learning_rate": 4.957914332697137e-05, + "loss": 0.5362, + "step": 330 + }, + { + "epoch": 0.05884444444444444, + "grad_norm": 0.13117583096027374, + "learning_rate": 4.9576588296825386e-05, + "loss": 0.6894, + "step": 331 + }, + { + "epoch": 0.05902222222222222, + "grad_norm": 0.17989352345466614, + "learning_rate": 4.957402560054773e-05, + "loss": 0.4547, + "step": 332 + }, + { + "epoch": 0.0592, + "grad_norm": 0.12834592163562775, + "learning_rate": 4.957145523893776e-05, + "loss": 0.5725, + "step": 333 + }, + { + "epoch": 0.05937777777777778, + "grad_norm": 0.10056453943252563, + "learning_rate": 4.956887721279726e-05, + "loss": 0.6068, + "step": 334 + }, + { + "epoch": 0.059555555555555556, + "grad_norm": 0.0998259112238884, + "learning_rate": 4.9566291522930375e-05, + "loss": 0.4956, + "step": 335 + }, + { + "epoch": 0.05973333333333333, + "grad_norm": 0.12029340118169785, + "learning_rate": 4.9563698170143666e-05, + "loss": 0.5718, + "step": 336 + }, + { + "epoch": 0.05991111111111111, + "grad_norm": 0.11883249878883362, + "learning_rate": 4.956109715524608e-05, + "loss": 0.6578, + "step": 337 + }, + { + "epoch": 0.060088888888888886, + "grad_norm": 0.13039566576480865, + "learning_rate": 4.955848847904894e-05, + "loss": 0.619, + "step": 338 + }, + { + "epoch": 0.06026666666666667, + "grad_norm": 0.1289486289024353, + "learning_rate": 4.9555872142365945e-05, + "loss": 0.6338, + "step": 339 + }, + { + "epoch": 0.060444444444444446, + "grad_norm": 0.24932079017162323, + "learning_rate": 4.955324814601324e-05, + "loss": 0.5698, + "step": 340 + }, + { + "epoch": 0.06062222222222222, + "grad_norm": 0.09469828009605408, + "learning_rate": 4.95506164908093e-05, + "loss": 0.4024, + "step": 341 + }, + { + "epoch": 0.0608, + "grad_norm": 0.11783221364021301, + "learning_rate": 4.9547977177575014e-05, + "loss": 0.4178, + "step": 342 + }, + { + "epoch": 0.06097777777777778, + "grad_norm": 0.11389844864606857, + "learning_rate": 4.9545330207133664e-05, + "loss": 0.399, + "step": 343 + }, + { + "epoch": 0.06115555555555555, + "grad_norm": 0.09958557784557343, + "learning_rate": 4.954267558031092e-05, + "loss": 0.4744, + "step": 344 + }, + { + "epoch": 0.06133333333333333, + "grad_norm": 0.17481932044029236, + "learning_rate": 4.9540013297934826e-05, + "loss": 0.4412, + "step": 345 + }, + { + "epoch": 0.061511111111111114, + "grad_norm": 0.10469872504472733, + "learning_rate": 4.953734336083583e-05, + "loss": 0.469, + "step": 346 + }, + { + "epoch": 0.06168888888888889, + "grad_norm": 0.1322900950908661, + "learning_rate": 4.953466576984675e-05, + "loss": 0.539, + "step": 347 + }, + { + "epoch": 0.06186666666666667, + "grad_norm": 0.15849369764328003, + "learning_rate": 4.953198052580281e-05, + "loss": 0.5385, + "step": 348 + }, + { + "epoch": 0.062044444444444444, + "grad_norm": 0.41547104716300964, + "learning_rate": 4.952928762954161e-05, + "loss": 0.5733, + "step": 349 + }, + { + "epoch": 0.06222222222222222, + "grad_norm": 0.16643372178077698, + "learning_rate": 4.9526587081903145e-05, + "loss": 0.5699, + "step": 350 + }, + { + "epoch": 0.0624, + "grad_norm": 0.10074812918901443, + "learning_rate": 4.952387888372979e-05, + "loss": 0.5621, + "step": 351 + }, + { + "epoch": 0.06257777777777777, + "grad_norm": 0.10379762202501297, + "learning_rate": 4.952116303586631e-05, + "loss": 0.4467, + "step": 352 + }, + { + "epoch": 0.06275555555555555, + "grad_norm": 0.15022054314613342, + "learning_rate": 4.951843953915985e-05, + "loss": 0.6035, + "step": 353 + }, + { + "epoch": 0.06293333333333333, + "grad_norm": 0.2365552932024002, + "learning_rate": 4.951570839445995e-05, + "loss": 0.5661, + "step": 354 + }, + { + "epoch": 0.06311111111111112, + "grad_norm": 0.10666210204362869, + "learning_rate": 4.951296960261853e-05, + "loss": 0.5279, + "step": 355 + }, + { + "epoch": 0.0632888888888889, + "grad_norm": 0.1475289762020111, + "learning_rate": 4.95102231644899e-05, + "loss": 0.6749, + "step": 356 + }, + { + "epoch": 0.06346666666666667, + "grad_norm": 0.11344022303819656, + "learning_rate": 4.9507469080930734e-05, + "loss": 0.569, + "step": 357 + }, + { + "epoch": 0.06364444444444445, + "grad_norm": 0.09832093119621277, + "learning_rate": 4.9504707352800125e-05, + "loss": 0.588, + "step": 358 + }, + { + "epoch": 0.06382222222222222, + "grad_norm": 0.16316619515419006, + "learning_rate": 4.9501937980959545e-05, + "loss": 0.4731, + "step": 359 + }, + { + "epoch": 0.064, + "grad_norm": 0.1701132208108902, + "learning_rate": 4.949916096627282e-05, + "loss": 0.6044, + "step": 360 + }, + { + "epoch": 0.06417777777777778, + "grad_norm": 0.1116047129034996, + "learning_rate": 4.949637630960617e-05, + "loss": 0.5256, + "step": 361 + }, + { + "epoch": 0.06435555555555555, + "grad_norm": 0.12019706517457962, + "learning_rate": 4.949358401182824e-05, + "loss": 0.5548, + "step": 362 + }, + { + "epoch": 0.06453333333333333, + "grad_norm": 0.11305873841047287, + "learning_rate": 4.949078407381e-05, + "loss": 0.5527, + "step": 363 + }, + { + "epoch": 0.06471111111111111, + "grad_norm": 0.1324770152568817, + "learning_rate": 4.948797649642484e-05, + "loss": 0.5513, + "step": 364 + }, + { + "epoch": 0.06488888888888888, + "grad_norm": 0.10249876976013184, + "learning_rate": 4.948516128054852e-05, + "loss": 0.5749, + "step": 365 + }, + { + "epoch": 0.06506666666666666, + "grad_norm": 0.13852979242801666, + "learning_rate": 4.948233842705919e-05, + "loss": 0.7387, + "step": 366 + }, + { + "epoch": 0.06524444444444444, + "grad_norm": 0.13100025057792664, + "learning_rate": 4.9479507936837364e-05, + "loss": 0.5049, + "step": 367 + }, + { + "epoch": 0.06542222222222223, + "grad_norm": 0.11465228348970413, + "learning_rate": 4.947666981076597e-05, + "loss": 0.5006, + "step": 368 + }, + { + "epoch": 0.0656, + "grad_norm": 0.13173198699951172, + "learning_rate": 4.94738240497303e-05, + "loss": 0.6291, + "step": 369 + }, + { + "epoch": 0.06577777777777778, + "grad_norm": 0.11195150017738342, + "learning_rate": 4.947097065461801e-05, + "loss": 0.5483, + "step": 370 + }, + { + "epoch": 0.06595555555555556, + "grad_norm": 0.13388414680957794, + "learning_rate": 4.946810962631916e-05, + "loss": 0.754, + "step": 371 + }, + { + "epoch": 0.06613333333333334, + "grad_norm": 0.13278795778751373, + "learning_rate": 4.9465240965726195e-05, + "loss": 0.4575, + "step": 372 + }, + { + "epoch": 0.06631111111111111, + "grad_norm": 0.12588383257389069, + "learning_rate": 4.946236467373392e-05, + "loss": 0.5325, + "step": 373 + }, + { + "epoch": 0.06648888888888889, + "grad_norm": 0.21415670216083527, + "learning_rate": 4.945948075123954e-05, + "loss": 0.7034, + "step": 374 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.11683046817779541, + "learning_rate": 4.9456589199142637e-05, + "loss": 0.4491, + "step": 375 + }, + { + "epoch": 0.06684444444444444, + "grad_norm": 0.12641367316246033, + "learning_rate": 4.9453690018345144e-05, + "loss": 0.4136, + "step": 376 + }, + { + "epoch": 0.06702222222222222, + "grad_norm": 0.17564763128757477, + "learning_rate": 4.945078320975142e-05, + "loss": 0.5542, + "step": 377 + }, + { + "epoch": 0.0672, + "grad_norm": 0.22514808177947998, + "learning_rate": 4.9447868774268166e-05, + "loss": 0.5082, + "step": 378 + }, + { + "epoch": 0.06737777777777777, + "grad_norm": 0.09878932684659958, + "learning_rate": 4.9444946712804494e-05, + "loss": 0.5395, + "step": 379 + }, + { + "epoch": 0.06755555555555555, + "grad_norm": 0.12520992755889893, + "learning_rate": 4.9442017026271864e-05, + "loss": 0.6807, + "step": 380 + }, + { + "epoch": 0.06773333333333334, + "grad_norm": 0.09364703297615051, + "learning_rate": 4.9439079715584135e-05, + "loss": 0.4706, + "step": 381 + }, + { + "epoch": 0.06791111111111112, + "grad_norm": 0.09961648285388947, + "learning_rate": 4.943613478165753e-05, + "loss": 0.6094, + "step": 382 + }, + { + "epoch": 0.0680888888888889, + "grad_norm": 0.1309124231338501, + "learning_rate": 4.943318222541066e-05, + "loss": 0.4911, + "step": 383 + }, + { + "epoch": 0.06826666666666667, + "grad_norm": 0.09704731404781342, + "learning_rate": 4.9430222047764506e-05, + "loss": 0.5329, + "step": 384 + }, + { + "epoch": 0.06844444444444445, + "grad_norm": 0.11558416485786438, + "learning_rate": 4.9427254249642444e-05, + "loss": 0.4877, + "step": 385 + }, + { + "epoch": 0.06862222222222222, + "grad_norm": 0.10762333869934082, + "learning_rate": 4.942427883197021e-05, + "loss": 0.5774, + "step": 386 + }, + { + "epoch": 0.0688, + "grad_norm": 0.13059288263320923, + "learning_rate": 4.94212957956759e-05, + "loss": 0.6695, + "step": 387 + }, + { + "epoch": 0.06897777777777778, + "grad_norm": 0.1858450025320053, + "learning_rate": 4.941830514169004e-05, + "loss": 0.5673, + "step": 388 + }, + { + "epoch": 0.06915555555555555, + "grad_norm": 0.12893255054950714, + "learning_rate": 4.941530687094548e-05, + "loss": 0.5959, + "step": 389 + }, + { + "epoch": 0.06933333333333333, + "grad_norm": 0.24427193403244019, + "learning_rate": 4.941230098437747e-05, + "loss": 0.736, + "step": 390 + }, + { + "epoch": 0.0695111111111111, + "grad_norm": 0.12266620993614197, + "learning_rate": 4.940928748292363e-05, + "loss": 0.5298, + "step": 391 + }, + { + "epoch": 0.06968888888888888, + "grad_norm": 0.17361949384212494, + "learning_rate": 4.9406266367523945e-05, + "loss": 0.5028, + "step": 392 + }, + { + "epoch": 0.06986666666666666, + "grad_norm": 0.1182074025273323, + "learning_rate": 4.9403237639120805e-05, + "loss": 0.5703, + "step": 393 + }, + { + "epoch": 0.07004444444444445, + "grad_norm": 0.1409914195537567, + "learning_rate": 4.940020129865895e-05, + "loss": 0.5364, + "step": 394 + }, + { + "epoch": 0.07022222222222223, + "grad_norm": 0.14208325743675232, + "learning_rate": 4.93971573470855e-05, + "loss": 0.4332, + "step": 395 + }, + { + "epoch": 0.0704, + "grad_norm": 0.1182163804769516, + "learning_rate": 4.9394105785349944e-05, + "loss": 0.4911, + "step": 396 + }, + { + "epoch": 0.07057777777777778, + "grad_norm": 0.12269298732280731, + "learning_rate": 4.939104661440415e-05, + "loss": 0.4789, + "step": 397 + }, + { + "epoch": 0.07075555555555556, + "grad_norm": 0.23277179896831512, + "learning_rate": 4.938797983520237e-05, + "loss": 0.4616, + "step": 398 + }, + { + "epoch": 0.07093333333333333, + "grad_norm": 0.12538619339466095, + "learning_rate": 4.938490544870121e-05, + "loss": 0.4223, + "step": 399 + }, + { + "epoch": 0.07111111111111111, + "grad_norm": 0.2072671800851822, + "learning_rate": 4.938182345585966e-05, + "loss": 0.5571, + "step": 400 + }, + { + "epoch": 0.07128888888888889, + "grad_norm": 0.12578058242797852, + "learning_rate": 4.937873385763908e-05, + "loss": 0.6039, + "step": 401 + }, + { + "epoch": 0.07146666666666666, + "grad_norm": 0.13072609901428223, + "learning_rate": 4.937563665500321e-05, + "loss": 0.5999, + "step": 402 + }, + { + "epoch": 0.07164444444444444, + "grad_norm": 0.09660910815000534, + "learning_rate": 4.9372531848918145e-05, + "loss": 0.4193, + "step": 403 + }, + { + "epoch": 0.07182222222222222, + "grad_norm": 0.17925910651683807, + "learning_rate": 4.936941944035237e-05, + "loss": 0.4779, + "step": 404 + }, + { + "epoch": 0.072, + "grad_norm": 0.12675291299819946, + "learning_rate": 4.936629943027672e-05, + "loss": 0.5663, + "step": 405 + }, + { + "epoch": 0.07217777777777777, + "grad_norm": 0.1440359354019165, + "learning_rate": 4.9363171819664434e-05, + "loss": 0.4829, + "step": 406 + }, + { + "epoch": 0.07235555555555556, + "grad_norm": 0.14052598178386688, + "learning_rate": 4.936003660949108e-05, + "loss": 0.7449, + "step": 407 + }, + { + "epoch": 0.07253333333333334, + "grad_norm": 0.11429505795240402, + "learning_rate": 4.935689380073464e-05, + "loss": 0.47, + "step": 408 + }, + { + "epoch": 0.07271111111111112, + "grad_norm": 0.12665224075317383, + "learning_rate": 4.935374339437543e-05, + "loss": 0.578, + "step": 409 + }, + { + "epoch": 0.07288888888888889, + "grad_norm": 0.11421271413564682, + "learning_rate": 4.935058539139615e-05, + "loss": 0.5641, + "step": 410 + }, + { + "epoch": 0.07306666666666667, + "grad_norm": 0.13265226781368256, + "learning_rate": 4.9347419792781876e-05, + "loss": 0.6225, + "step": 411 + }, + { + "epoch": 0.07324444444444445, + "grad_norm": 0.12128201872110367, + "learning_rate": 4.934424659952006e-05, + "loss": 0.6562, + "step": 412 + }, + { + "epoch": 0.07342222222222222, + "grad_norm": 0.1084672138094902, + "learning_rate": 4.934106581260049e-05, + "loss": 0.5167, + "step": 413 + }, + { + "epoch": 0.0736, + "grad_norm": 0.09844696521759033, + "learning_rate": 4.933787743301534e-05, + "loss": 0.5038, + "step": 414 + }, + { + "epoch": 0.07377777777777778, + "grad_norm": 0.12272031605243683, + "learning_rate": 4.933468146175918e-05, + "loss": 0.5574, + "step": 415 + }, + { + "epoch": 0.07395555555555555, + "grad_norm": 0.12302003800868988, + "learning_rate": 4.93314778998289e-05, + "loss": 0.6401, + "step": 416 + }, + { + "epoch": 0.07413333333333333, + "grad_norm": 0.10350817441940308, + "learning_rate": 4.93282667482238e-05, + "loss": 0.5237, + "step": 417 + }, + { + "epoch": 0.0743111111111111, + "grad_norm": 0.11492586135864258, + "learning_rate": 4.9325048007945526e-05, + "loss": 0.5586, + "step": 418 + }, + { + "epoch": 0.07448888888888888, + "grad_norm": 0.1294155865907669, + "learning_rate": 4.9321821679998074e-05, + "loss": 0.5182, + "step": 419 + }, + { + "epoch": 0.07466666666666667, + "grad_norm": 0.10784469544887543, + "learning_rate": 4.9318587765387845e-05, + "loss": 0.4844, + "step": 420 + }, + { + "epoch": 0.07484444444444445, + "grad_norm": 0.13449308276176453, + "learning_rate": 4.9315346265123594e-05, + "loss": 0.6438, + "step": 421 + }, + { + "epoch": 0.07502222222222223, + "grad_norm": 0.13114199042320251, + "learning_rate": 4.9312097180216414e-05, + "loss": 0.5335, + "step": 422 + }, + { + "epoch": 0.0752, + "grad_norm": 0.1415078490972519, + "learning_rate": 4.9308840511679804e-05, + "loss": 0.586, + "step": 423 + }, + { + "epoch": 0.07537777777777778, + "grad_norm": 0.11740420758724213, + "learning_rate": 4.9305576260529607e-05, + "loss": 0.5374, + "step": 424 + }, + { + "epoch": 0.07555555555555556, + "grad_norm": 0.12439213693141937, + "learning_rate": 4.930230442778403e-05, + "loss": 0.4819, + "step": 425 + }, + { + "epoch": 0.07573333333333333, + "grad_norm": 0.10145685076713562, + "learning_rate": 4.929902501446366e-05, + "loss": 0.4455, + "step": 426 + }, + { + "epoch": 0.07591111111111111, + "grad_norm": 0.1378268003463745, + "learning_rate": 4.929573802159143e-05, + "loss": 0.5227, + "step": 427 + }, + { + "epoch": 0.07608888888888889, + "grad_norm": 0.11586637794971466, + "learning_rate": 4.9292443450192645e-05, + "loss": 0.4823, + "step": 428 + }, + { + "epoch": 0.07626666666666666, + "grad_norm": 0.14607420563697815, + "learning_rate": 4.928914130129498e-05, + "loss": 0.6554, + "step": 429 + }, + { + "epoch": 0.07644444444444444, + "grad_norm": 0.1378929764032364, + "learning_rate": 4.9285831575928465e-05, + "loss": 0.5136, + "step": 430 + }, + { + "epoch": 0.07662222222222222, + "grad_norm": 0.11642621457576752, + "learning_rate": 4.92825142751255e-05, + "loss": 0.5608, + "step": 431 + }, + { + "epoch": 0.0768, + "grad_norm": 0.12321925163269043, + "learning_rate": 4.9279189399920844e-05, + "loss": 0.5031, + "step": 432 + }, + { + "epoch": 0.07697777777777778, + "grad_norm": 0.12677502632141113, + "learning_rate": 4.927585695135162e-05, + "loss": 0.6046, + "step": 433 + }, + { + "epoch": 0.07715555555555556, + "grad_norm": 0.17256797850131989, + "learning_rate": 4.9272516930457314e-05, + "loss": 0.5595, + "step": 434 + }, + { + "epoch": 0.07733333333333334, + "grad_norm": 0.1433863341808319, + "learning_rate": 4.9269169338279766e-05, + "loss": 0.6104, + "step": 435 + }, + { + "epoch": 0.07751111111111111, + "grad_norm": 0.1455060839653015, + "learning_rate": 4.9265814175863186e-05, + "loss": 0.6235, + "step": 436 + }, + { + "epoch": 0.07768888888888889, + "grad_norm": 0.14658929407596588, + "learning_rate": 4.926245144425415e-05, + "loss": 0.5466, + "step": 437 + }, + { + "epoch": 0.07786666666666667, + "grad_norm": 0.15051274001598358, + "learning_rate": 4.925908114450158e-05, + "loss": 0.5063, + "step": 438 + }, + { + "epoch": 0.07804444444444444, + "grad_norm": 0.13032183051109314, + "learning_rate": 4.925570327765678e-05, + "loss": 0.4495, + "step": 439 + }, + { + "epoch": 0.07822222222222222, + "grad_norm": 0.1568521410226822, + "learning_rate": 4.925231784477339e-05, + "loss": 0.6008, + "step": 440 + }, + { + "epoch": 0.0784, + "grad_norm": 0.12232274562120438, + "learning_rate": 4.924892484690743e-05, + "loss": 0.4374, + "step": 441 + }, + { + "epoch": 0.07857777777777777, + "grad_norm": 0.14305874705314636, + "learning_rate": 4.9245524285117274e-05, + "loss": 0.5823, + "step": 442 + }, + { + "epoch": 0.07875555555555555, + "grad_norm": 0.1605319380760193, + "learning_rate": 4.924211616046365e-05, + "loss": 0.4347, + "step": 443 + }, + { + "epoch": 0.07893333333333333, + "grad_norm": 0.10928831249475479, + "learning_rate": 4.923870047400964e-05, + "loss": 0.4523, + "step": 444 + }, + { + "epoch": 0.0791111111111111, + "grad_norm": 0.19794242084026337, + "learning_rate": 4.9235277226820695e-05, + "loss": 0.4556, + "step": 445 + }, + { + "epoch": 0.0792888888888889, + "grad_norm": 0.16253721714019775, + "learning_rate": 4.923184641996463e-05, + "loss": 0.5373, + "step": 446 + }, + { + "epoch": 0.07946666666666667, + "grad_norm": 0.2454134076833725, + "learning_rate": 4.922840805451161e-05, + "loss": 0.5283, + "step": 447 + }, + { + "epoch": 0.07964444444444445, + "grad_norm": 0.18191643059253693, + "learning_rate": 4.922496213153416e-05, + "loss": 0.4677, + "step": 448 + }, + { + "epoch": 0.07982222222222222, + "grad_norm": 0.2187846601009369, + "learning_rate": 4.922150865210715e-05, + "loss": 0.4869, + "step": 449 + }, + { + "epoch": 0.08, + "grad_norm": 0.32716700434684753, + "learning_rate": 4.9218047617307824e-05, + "loss": 0.6624, + "step": 450 + }, + { + "epoch": 0.08017777777777778, + "grad_norm": 0.14984872937202454, + "learning_rate": 4.9214579028215776e-05, + "loss": 0.4709, + "step": 451 + }, + { + "epoch": 0.08035555555555556, + "grad_norm": 0.1540294885635376, + "learning_rate": 4.9211102885912965e-05, + "loss": 0.6462, + "step": 452 + }, + { + "epoch": 0.08053333333333333, + "grad_norm": 0.13533832132816315, + "learning_rate": 4.920761919148369e-05, + "loss": 0.5695, + "step": 453 + }, + { + "epoch": 0.08071111111111111, + "grad_norm": 0.19239214062690735, + "learning_rate": 4.920412794601461e-05, + "loss": 0.5041, + "step": 454 + }, + { + "epoch": 0.08088888888888889, + "grad_norm": 0.15827728807926178, + "learning_rate": 4.9200629150594744e-05, + "loss": 0.5251, + "step": 455 + }, + { + "epoch": 0.08106666666666666, + "grad_norm": 0.16110765933990479, + "learning_rate": 4.919712280631547e-05, + "loss": 0.551, + "step": 456 + }, + { + "epoch": 0.08124444444444444, + "grad_norm": 0.14169973134994507, + "learning_rate": 4.9193608914270515e-05, + "loss": 0.6055, + "step": 457 + }, + { + "epoch": 0.08142222222222223, + "grad_norm": 0.18263943493366241, + "learning_rate": 4.9190087475555955e-05, + "loss": 0.6913, + "step": 458 + }, + { + "epoch": 0.0816, + "grad_norm": 0.1228347197175026, + "learning_rate": 4.918655849127024e-05, + "loss": 0.4552, + "step": 459 + }, + { + "epoch": 0.08177777777777778, + "grad_norm": 0.10659822821617126, + "learning_rate": 4.918302196251415e-05, + "loss": 0.5003, + "step": 460 + }, + { + "epoch": 0.08195555555555556, + "grad_norm": 0.12345919758081436, + "learning_rate": 4.9179477890390825e-05, + "loss": 0.429, + "step": 461 + }, + { + "epoch": 0.08213333333333334, + "grad_norm": 0.11815184354782104, + "learning_rate": 4.917592627600577e-05, + "loss": 0.502, + "step": 462 + }, + { + "epoch": 0.08231111111111111, + "grad_norm": 0.13535043597221375, + "learning_rate": 4.917236712046682e-05, + "loss": 0.6751, + "step": 463 + }, + { + "epoch": 0.08248888888888889, + "grad_norm": 0.13930095732212067, + "learning_rate": 4.916880042488419e-05, + "loss": 0.5427, + "step": 464 + }, + { + "epoch": 0.08266666666666667, + "grad_norm": 0.11492421478033066, + "learning_rate": 4.916522619037043e-05, + "loss": 0.4535, + "step": 465 + }, + { + "epoch": 0.08284444444444444, + "grad_norm": 0.13391397893428802, + "learning_rate": 4.916164441804044e-05, + "loss": 0.5106, + "step": 466 + }, + { + "epoch": 0.08302222222222222, + "grad_norm": 0.15971896052360535, + "learning_rate": 4.915805510901148e-05, + "loss": 0.8395, + "step": 467 + }, + { + "epoch": 0.0832, + "grad_norm": 0.14514043927192688, + "learning_rate": 4.915445826440316e-05, + "loss": 0.5025, + "step": 468 + }, + { + "epoch": 0.08337777777777777, + "grad_norm": 0.13565373420715332, + "learning_rate": 4.9150853885337426e-05, + "loss": 0.5774, + "step": 469 + }, + { + "epoch": 0.08355555555555555, + "grad_norm": 0.11118032783269882, + "learning_rate": 4.9147241972938596e-05, + "loss": 0.4923, + "step": 470 + }, + { + "epoch": 0.08373333333333334, + "grad_norm": 0.14587406814098358, + "learning_rate": 4.914362252833332e-05, + "loss": 0.5503, + "step": 471 + }, + { + "epoch": 0.08391111111111112, + "grad_norm": 0.11147578805685043, + "learning_rate": 4.913999555265062e-05, + "loss": 0.4687, + "step": 472 + }, + { + "epoch": 0.0840888888888889, + "grad_norm": 0.14339478313922882, + "learning_rate": 4.913636104702183e-05, + "loss": 0.559, + "step": 473 + }, + { + "epoch": 0.08426666666666667, + "grad_norm": 0.12677563726902008, + "learning_rate": 4.913271901258067e-05, + "loss": 0.5893, + "step": 474 + }, + { + "epoch": 0.08444444444444445, + "grad_norm": 0.1598431020975113, + "learning_rate": 4.9129069450463186e-05, + "loss": 0.458, + "step": 475 + }, + { + "epoch": 0.08462222222222222, + "grad_norm": 0.16157884895801544, + "learning_rate": 4.912541236180779e-05, + "loss": 0.5704, + "step": 476 + }, + { + "epoch": 0.0848, + "grad_norm": 0.14672227203845978, + "learning_rate": 4.912174774775522e-05, + "loss": 0.6351, + "step": 477 + }, + { + "epoch": 0.08497777777777778, + "grad_norm": 0.12318343669176102, + "learning_rate": 4.911807560944858e-05, + "loss": 0.4788, + "step": 478 + }, + { + "epoch": 0.08515555555555555, + "grad_norm": 0.09274142235517502, + "learning_rate": 4.9114395948033296e-05, + "loss": 0.3969, + "step": 479 + }, + { + "epoch": 0.08533333333333333, + "grad_norm": 0.12495303899049759, + "learning_rate": 4.911070876465719e-05, + "loss": 0.6137, + "step": 480 + }, + { + "epoch": 0.08551111111111111, + "grad_norm": 0.16101619601249695, + "learning_rate": 4.910701406047037e-05, + "loss": 0.6358, + "step": 481 + }, + { + "epoch": 0.08568888888888888, + "grad_norm": 0.09776817262172699, + "learning_rate": 4.910331183662533e-05, + "loss": 0.4668, + "step": 482 + }, + { + "epoch": 0.08586666666666666, + "grad_norm": 0.17198891937732697, + "learning_rate": 4.90996020942769e-05, + "loss": 0.6583, + "step": 483 + }, + { + "epoch": 0.08604444444444445, + "grad_norm": 0.11508947610855103, + "learning_rate": 4.909588483458225e-05, + "loss": 0.6487, + "step": 484 + }, + { + "epoch": 0.08622222222222223, + "grad_norm": 0.10812798142433167, + "learning_rate": 4.90921600587009e-05, + "loss": 0.4142, + "step": 485 + }, + { + "epoch": 0.0864, + "grad_norm": 0.11280444264411926, + "learning_rate": 4.908842776779472e-05, + "loss": 0.4806, + "step": 486 + }, + { + "epoch": 0.08657777777777778, + "grad_norm": 0.1564003825187683, + "learning_rate": 4.9084687963027894e-05, + "loss": 0.5632, + "step": 487 + }, + { + "epoch": 0.08675555555555556, + "grad_norm": 0.14630691707134247, + "learning_rate": 4.9080940645567e-05, + "loss": 0.5461, + "step": 488 + }, + { + "epoch": 0.08693333333333333, + "grad_norm": 0.1468648612499237, + "learning_rate": 4.907718581658091e-05, + "loss": 0.509, + "step": 489 + }, + { + "epoch": 0.08711111111111111, + "grad_norm": 0.16092698276042938, + "learning_rate": 4.907342347724087e-05, + "loss": 0.5398, + "step": 490 + }, + { + "epoch": 0.08728888888888889, + "grad_norm": 0.11216852813959122, + "learning_rate": 4.906965362872047e-05, + "loss": 0.5187, + "step": 491 + }, + { + "epoch": 0.08746666666666666, + "grad_norm": 0.15423683822155, + "learning_rate": 4.906587627219562e-05, + "loss": 0.4705, + "step": 492 + }, + { + "epoch": 0.08764444444444444, + "grad_norm": 0.13622348010540009, + "learning_rate": 4.906209140884459e-05, + "loss": 0.524, + "step": 493 + }, + { + "epoch": 0.08782222222222222, + "grad_norm": 0.1636885553598404, + "learning_rate": 4.9058299039847975e-05, + "loss": 0.5623, + "step": 494 + }, + { + "epoch": 0.088, + "grad_norm": 0.1458250731229782, + "learning_rate": 4.905449916638873e-05, + "loss": 0.4771, + "step": 495 + }, + { + "epoch": 0.08817777777777777, + "grad_norm": 0.1354760080575943, + "learning_rate": 4.905069178965215e-05, + "loss": 0.4188, + "step": 496 + }, + { + "epoch": 0.08835555555555556, + "grad_norm": 0.20629598200321198, + "learning_rate": 4.904687691082585e-05, + "loss": 0.4221, + "step": 497 + }, + { + "epoch": 0.08853333333333334, + "grad_norm": 0.1728140413761139, + "learning_rate": 4.904305453109981e-05, + "loss": 0.4182, + "step": 498 + }, + { + "epoch": 0.08871111111111112, + "grad_norm": 0.284369558095932, + "learning_rate": 4.9039224651666325e-05, + "loss": 0.4135, + "step": 499 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 0.29567790031433105, + "learning_rate": 4.903538727372005e-05, + "loss": 0.6789, + "step": 500 + }, + { + "epoch": 0.08906666666666667, + "grad_norm": 0.17631009221076965, + "learning_rate": 4.9031542398457974e-05, + "loss": 0.7005, + "step": 501 + }, + { + "epoch": 0.08924444444444445, + "grad_norm": 0.14960840344429016, + "learning_rate": 4.902769002707942e-05, + "loss": 0.545, + "step": 502 + }, + { + "epoch": 0.08942222222222222, + "grad_norm": 0.19478420913219452, + "learning_rate": 4.902383016078605e-05, + "loss": 0.4462, + "step": 503 + }, + { + "epoch": 0.0896, + "grad_norm": 0.1346614956855774, + "learning_rate": 4.901996280078186e-05, + "loss": 0.5446, + "step": 504 + }, + { + "epoch": 0.08977777777777778, + "grad_norm": 0.1598706990480423, + "learning_rate": 4.90160879482732e-05, + "loss": 0.6985, + "step": 505 + }, + { + "epoch": 0.08995555555555555, + "grad_norm": 0.20883695781230927, + "learning_rate": 4.9012205604468744e-05, + "loss": 0.5531, + "step": 506 + }, + { + "epoch": 0.09013333333333333, + "grad_norm": 0.15307044982910156, + "learning_rate": 4.90083157705795e-05, + "loss": 0.7189, + "step": 507 + }, + { + "epoch": 0.0903111111111111, + "grad_norm": 0.123873770236969, + "learning_rate": 4.9004418447818815e-05, + "loss": 0.6484, + "step": 508 + }, + { + "epoch": 0.09048888888888888, + "grad_norm": 0.16285759210586548, + "learning_rate": 4.900051363740238e-05, + "loss": 0.4526, + "step": 509 + }, + { + "epoch": 0.09066666666666667, + "grad_norm": 0.16067764163017273, + "learning_rate": 4.8996601340548215e-05, + "loss": 0.6434, + "step": 510 + }, + { + "epoch": 0.09084444444444445, + "grad_norm": 0.12879173457622528, + "learning_rate": 4.899268155847667e-05, + "loss": 0.5127, + "step": 511 + }, + { + "epoch": 0.09102222222222223, + "grad_norm": 0.16171373426914215, + "learning_rate": 4.898875429241044e-05, + "loss": 0.4832, + "step": 512 + }, + { + "epoch": 0.0912, + "grad_norm": 0.15955506265163422, + "learning_rate": 4.898481954357455e-05, + "loss": 0.4298, + "step": 513 + }, + { + "epoch": 0.09137777777777778, + "grad_norm": 0.1416526585817337, + "learning_rate": 4.898087731319636e-05, + "loss": 0.5886, + "step": 514 + }, + { + "epoch": 0.09155555555555556, + "grad_norm": 0.14958050847053528, + "learning_rate": 4.897692760250556e-05, + "loss": 0.5979, + "step": 515 + }, + { + "epoch": 0.09173333333333333, + "grad_norm": 0.12716057896614075, + "learning_rate": 4.8972970412734176e-05, + "loss": 0.5218, + "step": 516 + }, + { + "epoch": 0.09191111111111111, + "grad_norm": 0.13036109507083893, + "learning_rate": 4.896900574511657e-05, + "loss": 0.538, + "step": 517 + }, + { + "epoch": 0.09208888888888889, + "grad_norm": 0.14761294424533844, + "learning_rate": 4.8965033600889435e-05, + "loss": 0.4935, + "step": 518 + }, + { + "epoch": 0.09226666666666666, + "grad_norm": 0.14525489509105682, + "learning_rate": 4.8961053981291795e-05, + "loss": 0.5502, + "step": 519 + }, + { + "epoch": 0.09244444444444444, + "grad_norm": 0.1187833622097969, + "learning_rate": 4.8957066887565e-05, + "loss": 0.4978, + "step": 520 + }, + { + "epoch": 0.09262222222222222, + "grad_norm": 0.13545098900794983, + "learning_rate": 4.8953072320952745e-05, + "loss": 0.4861, + "step": 521 + }, + { + "epoch": 0.0928, + "grad_norm": 0.16626237332820892, + "learning_rate": 4.8949070282701034e-05, + "loss": 0.5501, + "step": 522 + }, + { + "epoch": 0.09297777777777778, + "grad_norm": 0.1331178843975067, + "learning_rate": 4.894506077405824e-05, + "loss": 0.4918, + "step": 523 + }, + { + "epoch": 0.09315555555555556, + "grad_norm": 0.16947755217552185, + "learning_rate": 4.8941043796275015e-05, + "loss": 0.5617, + "step": 524 + }, + { + "epoch": 0.09333333333333334, + "grad_norm": 0.16088862717151642, + "learning_rate": 4.893701935060439e-05, + "loss": 0.5284, + "step": 525 + }, + { + "epoch": 0.09351111111111111, + "grad_norm": 0.10503464937210083, + "learning_rate": 4.893298743830168e-05, + "loss": 0.5306, + "step": 526 + }, + { + "epoch": 0.09368888888888889, + "grad_norm": 0.19542542099952698, + "learning_rate": 4.892894806062458e-05, + "loss": 0.4742, + "step": 527 + }, + { + "epoch": 0.09386666666666667, + "grad_norm": 0.14651548862457275, + "learning_rate": 4.892490121883306e-05, + "loss": 0.5249, + "step": 528 + }, + { + "epoch": 0.09404444444444444, + "grad_norm": 0.16770899295806885, + "learning_rate": 4.892084691418947e-05, + "loss": 0.5566, + "step": 529 + }, + { + "epoch": 0.09422222222222222, + "grad_norm": 0.12149757891893387, + "learning_rate": 4.891678514795843e-05, + "loss": 0.3934, + "step": 530 + }, + { + "epoch": 0.0944, + "grad_norm": 0.21761754155158997, + "learning_rate": 4.891271592140695e-05, + "loss": 0.5554, + "step": 531 + }, + { + "epoch": 0.09457777777777777, + "grad_norm": 0.19074362516403198, + "learning_rate": 4.8908639235804324e-05, + "loss": 0.5569, + "step": 532 + }, + { + "epoch": 0.09475555555555555, + "grad_norm": 0.17187733948230743, + "learning_rate": 4.890455509242218e-05, + "loss": 0.5309, + "step": 533 + }, + { + "epoch": 0.09493333333333333, + "grad_norm": 0.12675423920154572, + "learning_rate": 4.890046349253448e-05, + "loss": 0.5178, + "step": 534 + }, + { + "epoch": 0.0951111111111111, + "grad_norm": 0.18373918533325195, + "learning_rate": 4.889636443741752e-05, + "loss": 0.6037, + "step": 535 + }, + { + "epoch": 0.0952888888888889, + "grad_norm": 0.10553757846355438, + "learning_rate": 4.889225792834991e-05, + "loss": 0.4518, + "step": 536 + }, + { + "epoch": 0.09546666666666667, + "grad_norm": 0.18762044608592987, + "learning_rate": 4.888814396661256e-05, + "loss": 0.4829, + "step": 537 + }, + { + "epoch": 0.09564444444444445, + "grad_norm": 0.17660444974899292, + "learning_rate": 4.888402255348876e-05, + "loss": 0.483, + "step": 538 + }, + { + "epoch": 0.09582222222222223, + "grad_norm": 0.1741284430027008, + "learning_rate": 4.887989369026409e-05, + "loss": 0.4932, + "step": 539 + }, + { + "epoch": 0.096, + "grad_norm": 0.15438582003116608, + "learning_rate": 4.887575737822645e-05, + "loss": 0.5317, + "step": 540 + }, + { + "epoch": 0.09617777777777778, + "grad_norm": 0.13077807426452637, + "learning_rate": 4.887161361866608e-05, + "loss": 0.5137, + "step": 541 + }, + { + "epoch": 0.09635555555555556, + "grad_norm": 0.2088961899280548, + "learning_rate": 4.8867462412875526e-05, + "loss": 0.3897, + "step": 542 + }, + { + "epoch": 0.09653333333333333, + "grad_norm": 0.16604351997375488, + "learning_rate": 4.886330376214968e-05, + "loss": 0.4283, + "step": 543 + }, + { + "epoch": 0.09671111111111111, + "grad_norm": 0.16843605041503906, + "learning_rate": 4.8859137667785735e-05, + "loss": 0.4215, + "step": 544 + }, + { + "epoch": 0.09688888888888889, + "grad_norm": 0.20798593759536743, + "learning_rate": 4.88549641310832e-05, + "loss": 0.4405, + "step": 545 + }, + { + "epoch": 0.09706666666666666, + "grad_norm": 0.14760468900203705, + "learning_rate": 4.885078315334395e-05, + "loss": 0.4278, + "step": 546 + }, + { + "epoch": 0.09724444444444444, + "grad_norm": 0.1894788295030594, + "learning_rate": 4.884659473587213e-05, + "loss": 0.4811, + "step": 547 + }, + { + "epoch": 0.09742222222222222, + "grad_norm": 0.1976652294397354, + "learning_rate": 4.884239887997423e-05, + "loss": 0.4609, + "step": 548 + }, + { + "epoch": 0.0976, + "grad_norm": 0.2146688550710678, + "learning_rate": 4.8838195586959046e-05, + "loss": 0.4672, + "step": 549 + }, + { + "epoch": 0.09777777777777778, + "grad_norm": 0.23618823289871216, + "learning_rate": 4.8833984858137715e-05, + "loss": 0.5308, + "step": 550 + }, + { + "epoch": 0.09795555555555556, + "grad_norm": 0.1519390195608139, + "learning_rate": 4.882976669482367e-05, + "loss": 0.6066, + "step": 551 + }, + { + "epoch": 0.09813333333333334, + "grad_norm": 0.13874588906764984, + "learning_rate": 4.8825541098332706e-05, + "loss": 0.5008, + "step": 552 + }, + { + "epoch": 0.09831111111111111, + "grad_norm": 0.14271466434001923, + "learning_rate": 4.8821308069982867e-05, + "loss": 0.5179, + "step": 553 + }, + { + "epoch": 0.09848888888888889, + "grad_norm": 0.13332295417785645, + "learning_rate": 4.881706761109458e-05, + "loss": 0.488, + "step": 554 + }, + { + "epoch": 0.09866666666666667, + "grad_norm": 0.12876787781715393, + "learning_rate": 4.881281972299055e-05, + "loss": 0.5535, + "step": 555 + }, + { + "epoch": 0.09884444444444444, + "grad_norm": 0.13639667630195618, + "learning_rate": 4.880856440699582e-05, + "loss": 0.4188, + "step": 556 + }, + { + "epoch": 0.09902222222222222, + "grad_norm": 0.1605464518070221, + "learning_rate": 4.880430166443775e-05, + "loss": 0.4853, + "step": 557 + }, + { + "epoch": 0.0992, + "grad_norm": 0.1567903608083725, + "learning_rate": 4.880003149664599e-05, + "loss": 0.5563, + "step": 558 + }, + { + "epoch": 0.09937777777777777, + "grad_norm": 0.1114354282617569, + "learning_rate": 4.8795753904952534e-05, + "loss": 0.4686, + "step": 559 + }, + { + "epoch": 0.09955555555555555, + "grad_norm": 0.163170725107193, + "learning_rate": 4.8791468890691696e-05, + "loss": 0.4125, + "step": 560 + }, + { + "epoch": 0.09973333333333333, + "grad_norm": 0.1444622427225113, + "learning_rate": 4.878717645520008e-05, + "loss": 0.4841, + "step": 561 + }, + { + "epoch": 0.09991111111111112, + "grad_norm": 0.212680384516716, + "learning_rate": 4.878287659981662e-05, + "loss": 0.5713, + "step": 562 + }, + { + "epoch": 0.1000888888888889, + "grad_norm": 0.21137243509292603, + "learning_rate": 4.877856932588257e-05, + "loss": 0.4885, + "step": 563 + }, + { + "epoch": 0.10026666666666667, + "grad_norm": 0.13607624173164368, + "learning_rate": 4.877425463474148e-05, + "loss": 0.5789, + "step": 564 + }, + { + "epoch": 0.10044444444444445, + "grad_norm": 0.17614617943763733, + "learning_rate": 4.8769932527739225e-05, + "loss": 0.6047, + "step": 565 + }, + { + "epoch": 0.10062222222222222, + "grad_norm": 0.16323573887348175, + "learning_rate": 4.8765603006224006e-05, + "loss": 0.5245, + "step": 566 + }, + { + "epoch": 0.1008, + "grad_norm": 0.15266373753547668, + "learning_rate": 4.87612660715463e-05, + "loss": 0.5597, + "step": 567 + }, + { + "epoch": 0.10097777777777778, + "grad_norm": 0.12621857225894928, + "learning_rate": 4.8756921725058934e-05, + "loss": 0.554, + "step": 568 + }, + { + "epoch": 0.10115555555555555, + "grad_norm": 0.12518687546253204, + "learning_rate": 4.875256996811703e-05, + "loss": 0.5873, + "step": 569 + }, + { + "epoch": 0.10133333333333333, + "grad_norm": 0.19686056673526764, + "learning_rate": 4.874821080207803e-05, + "loss": 0.5533, + "step": 570 + }, + { + "epoch": 0.10151111111111111, + "grad_norm": 0.14372362196445465, + "learning_rate": 4.874384422830167e-05, + "loss": 0.4307, + "step": 571 + }, + { + "epoch": 0.10168888888888888, + "grad_norm": 0.15460693836212158, + "learning_rate": 4.873947024815002e-05, + "loss": 0.5374, + "step": 572 + }, + { + "epoch": 0.10186666666666666, + "grad_norm": 0.2047254592180252, + "learning_rate": 4.873508886298743e-05, + "loss": 0.5059, + "step": 573 + }, + { + "epoch": 0.10204444444444444, + "grad_norm": 0.14418460428714752, + "learning_rate": 4.873070007418059e-05, + "loss": 0.4804, + "step": 574 + }, + { + "epoch": 0.10222222222222223, + "grad_norm": 0.15861301124095917, + "learning_rate": 4.872630388309849e-05, + "loss": 0.5543, + "step": 575 + }, + { + "epoch": 0.1024, + "grad_norm": 0.13709314167499542, + "learning_rate": 4.8721900291112415e-05, + "loss": 0.4175, + "step": 576 + }, + { + "epoch": 0.10257777777777778, + "grad_norm": 0.1624605506658554, + "learning_rate": 4.871748929959598e-05, + "loss": 0.6772, + "step": 577 + }, + { + "epoch": 0.10275555555555556, + "grad_norm": 0.1580829620361328, + "learning_rate": 4.8713070909925094e-05, + "loss": 0.457, + "step": 578 + }, + { + "epoch": 0.10293333333333334, + "grad_norm": 0.19348135590553284, + "learning_rate": 4.870864512347797e-05, + "loss": 0.5821, + "step": 579 + }, + { + "epoch": 0.10311111111111111, + "grad_norm": 0.12710440158843994, + "learning_rate": 4.870421194163515e-05, + "loss": 0.4329, + "step": 580 + }, + { + "epoch": 0.10328888888888889, + "grad_norm": 0.22347180545330048, + "learning_rate": 4.8699771365779453e-05, + "loss": 0.4592, + "step": 581 + }, + { + "epoch": 0.10346666666666667, + "grad_norm": 0.17189402878284454, + "learning_rate": 4.8695323397296044e-05, + "loss": 0.5264, + "step": 582 + }, + { + "epoch": 0.10364444444444444, + "grad_norm": 0.17262843251228333, + "learning_rate": 4.8690868037572346e-05, + "loss": 0.5395, + "step": 583 + }, + { + "epoch": 0.10382222222222222, + "grad_norm": 0.2574656009674072, + "learning_rate": 4.8686405287998116e-05, + "loss": 0.683, + "step": 584 + }, + { + "epoch": 0.104, + "grad_norm": 0.12457854300737381, + "learning_rate": 4.8681935149965416e-05, + "loss": 0.4287, + "step": 585 + }, + { + "epoch": 0.10417777777777777, + "grad_norm": 0.18310818076133728, + "learning_rate": 4.867745762486861e-05, + "loss": 0.5687, + "step": 586 + }, + { + "epoch": 0.10435555555555555, + "grad_norm": 0.162963405251503, + "learning_rate": 4.8672972714104357e-05, + "loss": 0.5696, + "step": 587 + }, + { + "epoch": 0.10453333333333334, + "grad_norm": 0.13576892018318176, + "learning_rate": 4.866848041907164e-05, + "loss": 0.4863, + "step": 588 + }, + { + "epoch": 0.10471111111111112, + "grad_norm": 0.1375289112329483, + "learning_rate": 4.8663980741171724e-05, + "loss": 0.4946, + "step": 589 + }, + { + "epoch": 0.10488888888888889, + "grad_norm": 0.13796629011631012, + "learning_rate": 4.865947368180818e-05, + "loss": 0.6431, + "step": 590 + }, + { + "epoch": 0.10506666666666667, + "grad_norm": 0.23682910203933716, + "learning_rate": 4.8654959242386896e-05, + "loss": 0.4593, + "step": 591 + }, + { + "epoch": 0.10524444444444445, + "grad_norm": 0.17123302817344666, + "learning_rate": 4.865043742431605e-05, + "loss": 0.3951, + "step": 592 + }, + { + "epoch": 0.10542222222222222, + "grad_norm": 0.18275408446788788, + "learning_rate": 4.8645908229006135e-05, + "loss": 0.4035, + "step": 593 + }, + { + "epoch": 0.1056, + "grad_norm": 0.13864780962467194, + "learning_rate": 4.8641371657869916e-05, + "loss": 0.4468, + "step": 594 + }, + { + "epoch": 0.10577777777777778, + "grad_norm": 0.18762609362602234, + "learning_rate": 4.863682771232248e-05, + "loss": 0.4982, + "step": 595 + }, + { + "epoch": 0.10595555555555555, + "grad_norm": 0.17086105048656464, + "learning_rate": 4.863227639378124e-05, + "loss": 0.4342, + "step": 596 + }, + { + "epoch": 0.10613333333333333, + "grad_norm": 0.1798182874917984, + "learning_rate": 4.862771770366584e-05, + "loss": 0.3989, + "step": 597 + }, + { + "epoch": 0.1063111111111111, + "grad_norm": 0.3316052556037903, + "learning_rate": 4.862315164339829e-05, + "loss": 0.4452, + "step": 598 + }, + { + "epoch": 0.10648888888888888, + "grad_norm": 0.2244439423084259, + "learning_rate": 4.861857821440287e-05, + "loss": 0.4652, + "step": 599 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 0.263261616230011, + "learning_rate": 4.861399741810615e-05, + "loss": 0.5543, + "step": 600 + }, + { + "epoch": 0.10684444444444445, + "grad_norm": 0.16309194266796112, + "learning_rate": 4.860940925593703e-05, + "loss": 0.4489, + "step": 601 + }, + { + "epoch": 0.10702222222222223, + "grad_norm": 0.18105711042881012, + "learning_rate": 4.860481372932667e-05, + "loss": 0.5226, + "step": 602 + }, + { + "epoch": 0.1072, + "grad_norm": 0.1627589911222458, + "learning_rate": 4.860021083970855e-05, + "loss": 0.4933, + "step": 603 + }, + { + "epoch": 0.10737777777777778, + "grad_norm": 0.16295213997364044, + "learning_rate": 4.859560058851844e-05, + "loss": 0.5953, + "step": 604 + }, + { + "epoch": 0.10755555555555556, + "grad_norm": 0.2031651735305786, + "learning_rate": 4.85909829771944e-05, + "loss": 0.5351, + "step": 605 + }, + { + "epoch": 0.10773333333333333, + "grad_norm": 0.1696680337190628, + "learning_rate": 4.858635800717681e-05, + "loss": 0.4635, + "step": 606 + }, + { + "epoch": 0.10791111111111111, + "grad_norm": 0.19358326494693756, + "learning_rate": 4.8581725679908317e-05, + "loss": 0.4878, + "step": 607 + }, + { + "epoch": 0.10808888888888889, + "grad_norm": 0.16717968881130219, + "learning_rate": 4.857708599683389e-05, + "loss": 0.5309, + "step": 608 + }, + { + "epoch": 0.10826666666666666, + "grad_norm": 0.16091205179691315, + "learning_rate": 4.857243895940076e-05, + "loss": 0.4178, + "step": 609 + }, + { + "epoch": 0.10844444444444444, + "grad_norm": 0.1746775209903717, + "learning_rate": 4.856778456905846e-05, + "loss": 0.6035, + "step": 610 + }, + { + "epoch": 0.10862222222222222, + "grad_norm": 0.15101461112499237, + "learning_rate": 4.856312282725886e-05, + "loss": 0.4307, + "step": 611 + }, + { + "epoch": 0.1088, + "grad_norm": 0.16890616714954376, + "learning_rate": 4.855845373545605e-05, + "loss": 0.5392, + "step": 612 + }, + { + "epoch": 0.10897777777777778, + "grad_norm": 0.14475519955158234, + "learning_rate": 4.855377729510648e-05, + "loss": 0.5137, + "step": 613 + }, + { + "epoch": 0.10915555555555556, + "grad_norm": 0.184649258852005, + "learning_rate": 4.8549093507668865e-05, + "loss": 0.5753, + "step": 614 + }, + { + "epoch": 0.10933333333333334, + "grad_norm": 0.1629726141691208, + "learning_rate": 4.854440237460418e-05, + "loss": 0.4653, + "step": 615 + }, + { + "epoch": 0.10951111111111111, + "grad_norm": 0.1585172712802887, + "learning_rate": 4.8539703897375755e-05, + "loss": 0.5474, + "step": 616 + }, + { + "epoch": 0.10968888888888889, + "grad_norm": 0.158916637301445, + "learning_rate": 4.853499807744916e-05, + "loss": 0.514, + "step": 617 + }, + { + "epoch": 0.10986666666666667, + "grad_norm": 0.19505652785301208, + "learning_rate": 4.853028491629228e-05, + "loss": 0.4439, + "step": 618 + }, + { + "epoch": 0.11004444444444444, + "grad_norm": 0.1849583089351654, + "learning_rate": 4.852556441537528e-05, + "loss": 0.5399, + "step": 619 + }, + { + "epoch": 0.11022222222222222, + "grad_norm": 0.23115134239196777, + "learning_rate": 4.852083657617061e-05, + "loss": 0.5264, + "step": 620 + }, + { + "epoch": 0.1104, + "grad_norm": 0.24591057002544403, + "learning_rate": 4.851610140015304e-05, + "loss": 0.5815, + "step": 621 + }, + { + "epoch": 0.11057777777777777, + "grad_norm": 0.22109457850456238, + "learning_rate": 4.851135888879958e-05, + "loss": 0.5344, + "step": 622 + }, + { + "epoch": 0.11075555555555555, + "grad_norm": 0.14644119143486023, + "learning_rate": 4.850660904358956e-05, + "loss": 0.574, + "step": 623 + }, + { + "epoch": 0.11093333333333333, + "grad_norm": 0.18631508946418762, + "learning_rate": 4.85018518660046e-05, + "loss": 0.5127, + "step": 624 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.23002789914608002, + "learning_rate": 4.849708735752859e-05, + "loss": 0.4177, + "step": 625 + }, + { + "epoch": 0.1112888888888889, + "grad_norm": 0.17093795537948608, + "learning_rate": 4.849231551964771e-05, + "loss": 0.6261, + "step": 626 + }, + { + "epoch": 0.11146666666666667, + "grad_norm": 0.1518908441066742, + "learning_rate": 4.8487536353850444e-05, + "loss": 0.5502, + "step": 627 + }, + { + "epoch": 0.11164444444444445, + "grad_norm": 0.15272735059261322, + "learning_rate": 4.848274986162754e-05, + "loss": 0.6634, + "step": 628 + }, + { + "epoch": 0.11182222222222223, + "grad_norm": 0.19047828018665314, + "learning_rate": 4.847795604447204e-05, + "loss": 0.6019, + "step": 629 + }, + { + "epoch": 0.112, + "grad_norm": 0.1536102592945099, + "learning_rate": 4.8473154903879276e-05, + "loss": 0.5099, + "step": 630 + }, + { + "epoch": 0.11217777777777778, + "grad_norm": 0.11658968776464462, + "learning_rate": 4.846834644134686e-05, + "loss": 0.3666, + "step": 631 + }, + { + "epoch": 0.11235555555555556, + "grad_norm": 0.17104125022888184, + "learning_rate": 4.846353065837467e-05, + "loss": 0.5112, + "step": 632 + }, + { + "epoch": 0.11253333333333333, + "grad_norm": 0.1651592254638672, + "learning_rate": 4.845870755646491e-05, + "loss": 0.6138, + "step": 633 + }, + { + "epoch": 0.11271111111111111, + "grad_norm": 0.18889138102531433, + "learning_rate": 4.845387713712203e-05, + "loss": 0.4396, + "step": 634 + }, + { + "epoch": 0.11288888888888889, + "grad_norm": 0.16750873625278473, + "learning_rate": 4.844903940185276e-05, + "loss": 0.5737, + "step": 635 + }, + { + "epoch": 0.11306666666666666, + "grad_norm": 0.14189808070659637, + "learning_rate": 4.844419435216615e-05, + "loss": 0.4309, + "step": 636 + }, + { + "epoch": 0.11324444444444444, + "grad_norm": 0.19938720762729645, + "learning_rate": 4.84393419895735e-05, + "loss": 0.5062, + "step": 637 + }, + { + "epoch": 0.11342222222222222, + "grad_norm": 0.17119143903255463, + "learning_rate": 4.843448231558839e-05, + "loss": 0.4705, + "step": 638 + }, + { + "epoch": 0.1136, + "grad_norm": 0.18427115678787231, + "learning_rate": 4.84296153317267e-05, + "loss": 0.4771, + "step": 639 + }, + { + "epoch": 0.11377777777777778, + "grad_norm": 0.19630096852779388, + "learning_rate": 4.8424741039506575e-05, + "loss": 0.5178, + "step": 640 + }, + { + "epoch": 0.11395555555555556, + "grad_norm": 0.22775466740131378, + "learning_rate": 4.841985944044845e-05, + "loss": 0.6406, + "step": 641 + }, + { + "epoch": 0.11413333333333334, + "grad_norm": 0.2094782590866089, + "learning_rate": 4.8414970536075024e-05, + "loss": 0.45, + "step": 642 + }, + { + "epoch": 0.11431111111111111, + "grad_norm": 0.19464978575706482, + "learning_rate": 4.841007432791129e-05, + "loss": 0.4867, + "step": 643 + }, + { + "epoch": 0.11448888888888889, + "grad_norm": 0.19385504722595215, + "learning_rate": 4.8405170817484515e-05, + "loss": 0.5388, + "step": 644 + }, + { + "epoch": 0.11466666666666667, + "grad_norm": 0.19417083263397217, + "learning_rate": 4.8400260006324235e-05, + "loss": 0.5221, + "step": 645 + }, + { + "epoch": 0.11484444444444444, + "grad_norm": 0.20474906265735626, + "learning_rate": 4.839534189596228e-05, + "loss": 0.5255, + "step": 646 + }, + { + "epoch": 0.11502222222222222, + "grad_norm": 0.24578408896923065, + "learning_rate": 4.8390416487932733e-05, + "loss": 0.3919, + "step": 647 + }, + { + "epoch": 0.1152, + "grad_norm": 0.23829345405101776, + "learning_rate": 4.8385483783771986e-05, + "loss": 0.4833, + "step": 648 + }, + { + "epoch": 0.11537777777777777, + "grad_norm": 0.2910478711128235, + "learning_rate": 4.8380543785018677e-05, + "loss": 0.4992, + "step": 649 + }, + { + "epoch": 0.11555555555555555, + "grad_norm": 0.2461572289466858, + "learning_rate": 4.837559649321374e-05, + "loss": 0.5242, + "step": 650 + }, + { + "epoch": 0.11573333333333333, + "grad_norm": 0.19238248467445374, + "learning_rate": 4.837064190990036e-05, + "loss": 0.4988, + "step": 651 + }, + { + "epoch": 0.11591111111111112, + "grad_norm": 0.2024281769990921, + "learning_rate": 4.8365680036624026e-05, + "loss": 0.5919, + "step": 652 + }, + { + "epoch": 0.1160888888888889, + "grad_norm": 0.23859934508800507, + "learning_rate": 4.8360710874932485e-05, + "loss": 0.6583, + "step": 653 + }, + { + "epoch": 0.11626666666666667, + "grad_norm": 0.1401166021823883, + "learning_rate": 4.8355734426375753e-05, + "loss": 0.4593, + "step": 654 + }, + { + "epoch": 0.11644444444444445, + "grad_norm": 0.12656547129154205, + "learning_rate": 4.835075069250613e-05, + "loss": 0.4792, + "step": 655 + }, + { + "epoch": 0.11662222222222222, + "grad_norm": 0.21867027878761292, + "learning_rate": 4.834575967487817e-05, + "loss": 0.5763, + "step": 656 + }, + { + "epoch": 0.1168, + "grad_norm": 0.19035212695598602, + "learning_rate": 4.834076137504873e-05, + "loss": 0.8646, + "step": 657 + }, + { + "epoch": 0.11697777777777778, + "grad_norm": 0.19331279397010803, + "learning_rate": 4.833575579457691e-05, + "loss": 0.754, + "step": 658 + }, + { + "epoch": 0.11715555555555555, + "grad_norm": 0.19049163162708282, + "learning_rate": 4.83307429350241e-05, + "loss": 0.4256, + "step": 659 + }, + { + "epoch": 0.11733333333333333, + "grad_norm": 0.19232383370399475, + "learning_rate": 4.8325722797953945e-05, + "loss": 0.5399, + "step": 660 + }, + { + "epoch": 0.11751111111111111, + "grad_norm": 0.20689839124679565, + "learning_rate": 4.832069538493237e-05, + "loss": 0.4269, + "step": 661 + }, + { + "epoch": 0.11768888888888888, + "grad_norm": 0.16264976561069489, + "learning_rate": 4.8315660697527566e-05, + "loss": 0.5932, + "step": 662 + }, + { + "epoch": 0.11786666666666666, + "grad_norm": 0.26071879267692566, + "learning_rate": 4.831061873730999e-05, + "loss": 0.8341, + "step": 663 + }, + { + "epoch": 0.11804444444444444, + "grad_norm": 0.15443435311317444, + "learning_rate": 4.830556950585238e-05, + "loss": 0.5677, + "step": 664 + }, + { + "epoch": 0.11822222222222223, + "grad_norm": 0.16641636192798615, + "learning_rate": 4.8300513004729735e-05, + "loss": 0.5402, + "step": 665 + }, + { + "epoch": 0.1184, + "grad_norm": 0.2508205771446228, + "learning_rate": 4.829544923551931e-05, + "loss": 0.5715, + "step": 666 + }, + { + "epoch": 0.11857777777777778, + "grad_norm": 0.19586646556854248, + "learning_rate": 4.829037819980065e-05, + "loss": 0.5254, + "step": 667 + }, + { + "epoch": 0.11875555555555556, + "grad_norm": 0.17180398106575012, + "learning_rate": 4.828529989915555e-05, + "loss": 0.6406, + "step": 668 + }, + { + "epoch": 0.11893333333333334, + "grad_norm": 0.1918603479862213, + "learning_rate": 4.828021433516806e-05, + "loss": 0.4858, + "step": 669 + }, + { + "epoch": 0.11911111111111111, + "grad_norm": 0.19110412895679474, + "learning_rate": 4.827512150942454e-05, + "loss": 0.4816, + "step": 670 + }, + { + "epoch": 0.11928888888888889, + "grad_norm": 0.15839064121246338, + "learning_rate": 4.8270021423513554e-05, + "loss": 0.6086, + "step": 671 + }, + { + "epoch": 0.11946666666666667, + "grad_norm": 0.16673055291175842, + "learning_rate": 4.826491407902599e-05, + "loss": 0.6152, + "step": 672 + }, + { + "epoch": 0.11964444444444444, + "grad_norm": 0.18019962310791016, + "learning_rate": 4.8259799477554965e-05, + "loss": 0.4505, + "step": 673 + }, + { + "epoch": 0.11982222222222222, + "grad_norm": 0.15572546422481537, + "learning_rate": 4.825467762069585e-05, + "loss": 0.5159, + "step": 674 + }, + { + "epoch": 0.12, + "grad_norm": 0.2496463656425476, + "learning_rate": 4.824954851004633e-05, + "loss": 0.6371, + "step": 675 + }, + { + "epoch": 0.12017777777777777, + "grad_norm": 0.15238633751869202, + "learning_rate": 4.8244412147206284e-05, + "loss": 0.4281, + "step": 676 + }, + { + "epoch": 0.12035555555555555, + "grad_norm": 0.16668856143951416, + "learning_rate": 4.823926853377791e-05, + "loss": 0.6006, + "step": 677 + }, + { + "epoch": 0.12053333333333334, + "grad_norm": 0.16303813457489014, + "learning_rate": 4.823411767136565e-05, + "loss": 0.5668, + "step": 678 + }, + { + "epoch": 0.12071111111111112, + "grad_norm": 0.28897982835769653, + "learning_rate": 4.822895956157619e-05, + "loss": 0.6136, + "step": 679 + }, + { + "epoch": 0.12088888888888889, + "grad_norm": 0.1798553168773651, + "learning_rate": 4.822379420601849e-05, + "loss": 0.5308, + "step": 680 + }, + { + "epoch": 0.12106666666666667, + "grad_norm": 0.1641470193862915, + "learning_rate": 4.821862160630378e-05, + "loss": 0.5033, + "step": 681 + }, + { + "epoch": 0.12124444444444445, + "grad_norm": 0.286424845457077, + "learning_rate": 4.821344176404554e-05, + "loss": 0.6677, + "step": 682 + }, + { + "epoch": 0.12142222222222222, + "grad_norm": 0.1499778926372528, + "learning_rate": 4.8208254680859494e-05, + "loss": 0.431, + "step": 683 + }, + { + "epoch": 0.1216, + "grad_norm": 0.18727213144302368, + "learning_rate": 4.820306035836365e-05, + "loss": 0.5122, + "step": 684 + }, + { + "epoch": 0.12177777777777778, + "grad_norm": 0.1995077282190323, + "learning_rate": 4.819785879817827e-05, + "loss": 0.5164, + "step": 685 + }, + { + "epoch": 0.12195555555555555, + "grad_norm": 0.2810794711112976, + "learning_rate": 4.8192650001925855e-05, + "loss": 0.6068, + "step": 686 + }, + { + "epoch": 0.12213333333333333, + "grad_norm": 0.1473729908466339, + "learning_rate": 4.818743397123119e-05, + "loss": 0.5572, + "step": 687 + }, + { + "epoch": 0.1223111111111111, + "grad_norm": 0.15528550744056702, + "learning_rate": 4.8182210707721284e-05, + "loss": 0.5105, + "step": 688 + }, + { + "epoch": 0.12248888888888888, + "grad_norm": 0.2309819757938385, + "learning_rate": 4.8176980213025434e-05, + "loss": 0.5573, + "step": 689 + }, + { + "epoch": 0.12266666666666666, + "grad_norm": 0.18664969503879547, + "learning_rate": 4.817174248877518e-05, + "loss": 0.6213, + "step": 690 + }, + { + "epoch": 0.12284444444444445, + "grad_norm": 0.18997201323509216, + "learning_rate": 4.81664975366043e-05, + "loss": 0.6344, + "step": 691 + }, + { + "epoch": 0.12302222222222223, + "grad_norm": 0.27575939893722534, + "learning_rate": 4.8161245358148866e-05, + "loss": 0.4748, + "step": 692 + }, + { + "epoch": 0.1232, + "grad_norm": 0.32139909267425537, + "learning_rate": 4.815598595504717e-05, + "loss": 0.5016, + "step": 693 + }, + { + "epoch": 0.12337777777777778, + "grad_norm": 0.20199266076087952, + "learning_rate": 4.8150719328939755e-05, + "loss": 0.5423, + "step": 694 + }, + { + "epoch": 0.12355555555555556, + "grad_norm": 0.24906978011131287, + "learning_rate": 4.814544548146945e-05, + "loss": 0.5321, + "step": 695 + }, + { + "epoch": 0.12373333333333333, + "grad_norm": 0.2221626490354538, + "learning_rate": 4.8140164414281306e-05, + "loss": 0.519, + "step": 696 + }, + { + "epoch": 0.12391111111111111, + "grad_norm": 0.21375438570976257, + "learning_rate": 4.813487612902264e-05, + "loss": 0.4611, + "step": 697 + }, + { + "epoch": 0.12408888888888889, + "grad_norm": 0.23013022541999817, + "learning_rate": 4.812958062734302e-05, + "loss": 0.4576, + "step": 698 + }, + { + "epoch": 0.12426666666666666, + "grad_norm": 0.3189735412597656, + "learning_rate": 4.812427791089426e-05, + "loss": 0.5959, + "step": 699 + }, + { + "epoch": 0.12444444444444444, + "grad_norm": 0.30822017788887024, + "learning_rate": 4.811896798133042e-05, + "loss": 0.5546, + "step": 700 + }, + { + "epoch": 0.12462222222222222, + "grad_norm": 0.1774422973394394, + "learning_rate": 4.8113650840307834e-05, + "loss": 0.4935, + "step": 701 + }, + { + "epoch": 0.1248, + "grad_norm": 0.14315634965896606, + "learning_rate": 4.810832648948505e-05, + "loss": 0.5485, + "step": 702 + }, + { + "epoch": 0.12497777777777777, + "grad_norm": 0.17411945760250092, + "learning_rate": 4.810299493052289e-05, + "loss": 0.3879, + "step": 703 + }, + { + "epoch": 0.12515555555555555, + "grad_norm": 0.14914071559906006, + "learning_rate": 4.809765616508443e-05, + "loss": 0.5129, + "step": 704 + }, + { + "epoch": 0.12533333333333332, + "grad_norm": 0.19055378437042236, + "learning_rate": 4.809231019483497e-05, + "loss": 0.7886, + "step": 705 + }, + { + "epoch": 0.1255111111111111, + "grad_norm": 0.17609871923923492, + "learning_rate": 4.808695702144206e-05, + "loss": 0.5084, + "step": 706 + }, + { + "epoch": 0.12568888888888888, + "grad_norm": 0.18136291205883026, + "learning_rate": 4.808159664657552e-05, + "loss": 0.4947, + "step": 707 + }, + { + "epoch": 0.12586666666666665, + "grad_norm": 0.1343490481376648, + "learning_rate": 4.8076229071907397e-05, + "loss": 0.394, + "step": 708 + }, + { + "epoch": 0.12604444444444443, + "grad_norm": 0.1484137624502182, + "learning_rate": 4.8070854299111994e-05, + "loss": 0.3972, + "step": 709 + }, + { + "epoch": 0.12622222222222224, + "grad_norm": 0.16214905679225922, + "learning_rate": 4.8065472329865854e-05, + "loss": 0.4853, + "step": 710 + }, + { + "epoch": 0.1264, + "grad_norm": 0.19232375919818878, + "learning_rate": 4.8060083165847754e-05, + "loss": 0.523, + "step": 711 + }, + { + "epoch": 0.1265777777777778, + "grad_norm": 0.20119550824165344, + "learning_rate": 4.805468680873874e-05, + "loss": 0.5455, + "step": 712 + }, + { + "epoch": 0.12675555555555557, + "grad_norm": 0.2584286630153656, + "learning_rate": 4.8049283260222075e-05, + "loss": 0.7892, + "step": 713 + }, + { + "epoch": 0.12693333333333334, + "grad_norm": 0.18945609033107758, + "learning_rate": 4.8043872521983294e-05, + "loss": 0.4641, + "step": 714 + }, + { + "epoch": 0.12711111111111112, + "grad_norm": 0.15987750887870789, + "learning_rate": 4.803845459571014e-05, + "loss": 0.5027, + "step": 715 + }, + { + "epoch": 0.1272888888888889, + "grad_norm": 0.22411666810512543, + "learning_rate": 4.803302948309264e-05, + "loss": 0.649, + "step": 716 + }, + { + "epoch": 0.12746666666666667, + "grad_norm": 0.24168173968791962, + "learning_rate": 4.8027597185823016e-05, + "loss": 0.4984, + "step": 717 + }, + { + "epoch": 0.12764444444444445, + "grad_norm": 0.18792656064033508, + "learning_rate": 4.802215770559577e-05, + "loss": 0.4996, + "step": 718 + }, + { + "epoch": 0.12782222222222223, + "grad_norm": 0.40115678310394287, + "learning_rate": 4.801671104410763e-05, + "loss": 0.4385, + "step": 719 + }, + { + "epoch": 0.128, + "grad_norm": 0.18150955438613892, + "learning_rate": 4.8011257203057556e-05, + "loss": 0.4678, + "step": 720 + }, + { + "epoch": 0.12817777777777778, + "grad_norm": 0.15427082777023315, + "learning_rate": 4.800579618414676e-05, + "loss": 0.5439, + "step": 721 + }, + { + "epoch": 0.12835555555555556, + "grad_norm": 0.18335971236228943, + "learning_rate": 4.800032798907869e-05, + "loss": 0.5424, + "step": 722 + }, + { + "epoch": 0.12853333333333333, + "grad_norm": 0.21156658232212067, + "learning_rate": 4.7994852619559016e-05, + "loss": 0.4518, + "step": 723 + }, + { + "epoch": 0.1287111111111111, + "grad_norm": 0.21200919151306152, + "learning_rate": 4.798937007729568e-05, + "loss": 0.5315, + "step": 724 + }, + { + "epoch": 0.1288888888888889, + "grad_norm": 0.1788853108882904, + "learning_rate": 4.798388036399883e-05, + "loss": 0.7258, + "step": 725 + }, + { + "epoch": 0.12906666666666666, + "grad_norm": 0.1894267201423645, + "learning_rate": 4.797838348138086e-05, + "loss": 0.5601, + "step": 726 + }, + { + "epoch": 0.12924444444444444, + "grad_norm": 0.16751262545585632, + "learning_rate": 4.797287943115641e-05, + "loss": 0.5639, + "step": 727 + }, + { + "epoch": 0.12942222222222222, + "grad_norm": 0.17367936670780182, + "learning_rate": 4.796736821504235e-05, + "loss": 0.5569, + "step": 728 + }, + { + "epoch": 0.1296, + "grad_norm": 0.1829548478126526, + "learning_rate": 4.7961849834757786e-05, + "loss": 0.5129, + "step": 729 + }, + { + "epoch": 0.12977777777777777, + "grad_norm": 0.16978542506694794, + "learning_rate": 4.795632429202405e-05, + "loss": 0.338, + "step": 730 + }, + { + "epoch": 0.12995555555555555, + "grad_norm": 0.171937957406044, + "learning_rate": 4.79507915885647e-05, + "loss": 0.4558, + "step": 731 + }, + { + "epoch": 0.13013333333333332, + "grad_norm": 0.19471272826194763, + "learning_rate": 4.794525172610558e-05, + "loss": 0.4176, + "step": 732 + }, + { + "epoch": 0.1303111111111111, + "grad_norm": 0.16205425560474396, + "learning_rate": 4.793970470637469e-05, + "loss": 0.4779, + "step": 733 + }, + { + "epoch": 0.13048888888888888, + "grad_norm": 0.15942677855491638, + "learning_rate": 4.793415053110233e-05, + "loss": 0.5812, + "step": 734 + }, + { + "epoch": 0.13066666666666665, + "grad_norm": 0.17581968009471893, + "learning_rate": 4.792858920202099e-05, + "loss": 0.5352, + "step": 735 + }, + { + "epoch": 0.13084444444444446, + "grad_norm": 0.26987457275390625, + "learning_rate": 4.7923020720865414e-05, + "loss": 0.6674, + "step": 736 + }, + { + "epoch": 0.13102222222222223, + "grad_norm": 0.22612085938453674, + "learning_rate": 4.791744508937256e-05, + "loss": 0.6504, + "step": 737 + }, + { + "epoch": 0.1312, + "grad_norm": 0.16121941804885864, + "learning_rate": 4.791186230928163e-05, + "loss": 0.6792, + "step": 738 + }, + { + "epoch": 0.1313777777777778, + "grad_norm": 0.2309853881597519, + "learning_rate": 4.790627238233405e-05, + "loss": 0.5664, + "step": 739 + }, + { + "epoch": 0.13155555555555556, + "grad_norm": 0.171939417719841, + "learning_rate": 4.7900675310273466e-05, + "loss": 0.5592, + "step": 740 + }, + { + "epoch": 0.13173333333333334, + "grad_norm": 0.17857739329338074, + "learning_rate": 4.789507109484579e-05, + "loss": 0.5819, + "step": 741 + }, + { + "epoch": 0.13191111111111112, + "grad_norm": 0.1734759658575058, + "learning_rate": 4.78894597377991e-05, + "loss": 0.4961, + "step": 742 + }, + { + "epoch": 0.1320888888888889, + "grad_norm": 0.18777959048748016, + "learning_rate": 4.7883841240883766e-05, + "loss": 0.4661, + "step": 743 + }, + { + "epoch": 0.13226666666666667, + "grad_norm": 0.26379868388175964, + "learning_rate": 4.7878215605852336e-05, + "loss": 0.4983, + "step": 744 + }, + { + "epoch": 0.13244444444444445, + "grad_norm": 0.221966952085495, + "learning_rate": 4.787258283445962e-05, + "loss": 0.4538, + "step": 745 + }, + { + "epoch": 0.13262222222222222, + "grad_norm": 0.17293398082256317, + "learning_rate": 4.7866942928462625e-05, + "loss": 0.4486, + "step": 746 + }, + { + "epoch": 0.1328, + "grad_norm": 0.2450498789548874, + "learning_rate": 4.786129588962061e-05, + "loss": 0.4364, + "step": 747 + }, + { + "epoch": 0.13297777777777778, + "grad_norm": 0.20403257012367249, + "learning_rate": 4.7855641719695023e-05, + "loss": 0.4191, + "step": 748 + }, + { + "epoch": 0.13315555555555555, + "grad_norm": 0.242579385638237, + "learning_rate": 4.7849980420449594e-05, + "loss": 0.414, + "step": 749 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.26167938113212585, + "learning_rate": 4.7844311993650205e-05, + "loss": 0.5284, + "step": 750 + }, + { + "epoch": 0.1335111111111111, + "grad_norm": 0.21603204309940338, + "learning_rate": 4.783863644106502e-05, + "loss": 0.5375, + "step": 751 + }, + { + "epoch": 0.13368888888888888, + "grad_norm": 0.19549617171287537, + "learning_rate": 4.7832953764464405e-05, + "loss": 0.4878, + "step": 752 + }, + { + "epoch": 0.13386666666666666, + "grad_norm": 0.17020666599273682, + "learning_rate": 4.782726396562094e-05, + "loss": 0.5682, + "step": 753 + }, + { + "epoch": 0.13404444444444444, + "grad_norm": 0.1952783316373825, + "learning_rate": 4.782156704630944e-05, + "loss": 0.6119, + "step": 754 + }, + { + "epoch": 0.13422222222222221, + "grad_norm": 0.17863894999027252, + "learning_rate": 4.781586300830693e-05, + "loss": 0.5569, + "step": 755 + }, + { + "epoch": 0.1344, + "grad_norm": 0.20778587460517883, + "learning_rate": 4.781015185339266e-05, + "loss": 0.6058, + "step": 756 + }, + { + "epoch": 0.13457777777777777, + "grad_norm": 0.236437126994133, + "learning_rate": 4.78044335833481e-05, + "loss": 0.5017, + "step": 757 + }, + { + "epoch": 0.13475555555555555, + "grad_norm": 0.189187154173851, + "learning_rate": 4.779870819995694e-05, + "loss": 0.6328, + "step": 758 + }, + { + "epoch": 0.13493333333333332, + "grad_norm": 0.19671674072742462, + "learning_rate": 4.779297570500509e-05, + "loss": 0.4939, + "step": 759 + }, + { + "epoch": 0.1351111111111111, + "grad_norm": 0.22109843790531158, + "learning_rate": 4.7787236100280685e-05, + "loss": 0.5503, + "step": 760 + }, + { + "epoch": 0.13528888888888888, + "grad_norm": 0.16525454819202423, + "learning_rate": 4.778148938757406e-05, + "loss": 0.473, + "step": 761 + }, + { + "epoch": 0.13546666666666668, + "grad_norm": 0.17730148136615753, + "learning_rate": 4.7775735568677775e-05, + "loss": 0.4093, + "step": 762 + }, + { + "epoch": 0.13564444444444446, + "grad_norm": 0.17130325734615326, + "learning_rate": 4.776997464538662e-05, + "loss": 0.5597, + "step": 763 + }, + { + "epoch": 0.13582222222222223, + "grad_norm": 0.18379709124565125, + "learning_rate": 4.776420661949758e-05, + "loss": 0.4618, + "step": 764 + }, + { + "epoch": 0.136, + "grad_norm": 0.18312430381774902, + "learning_rate": 4.775843149280986e-05, + "loss": 0.7169, + "step": 765 + }, + { + "epoch": 0.1361777777777778, + "grad_norm": 0.15854863822460175, + "learning_rate": 4.775264926712489e-05, + "loss": 0.4381, + "step": 766 + }, + { + "epoch": 0.13635555555555556, + "grad_norm": 0.16607770323753357, + "learning_rate": 4.7746859944246325e-05, + "loss": 0.5603, + "step": 767 + }, + { + "epoch": 0.13653333333333334, + "grad_norm": 0.16546474397182465, + "learning_rate": 4.7741063525980004e-05, + "loss": 0.5965, + "step": 768 + }, + { + "epoch": 0.13671111111111112, + "grad_norm": 0.21205104887485504, + "learning_rate": 4.7735260014133986e-05, + "loss": 0.5572, + "step": 769 + }, + { + "epoch": 0.1368888888888889, + "grad_norm": 0.2534192204475403, + "learning_rate": 4.772944941051856e-05, + "loss": 0.4907, + "step": 770 + }, + { + "epoch": 0.13706666666666667, + "grad_norm": 0.19808736443519592, + "learning_rate": 4.772363171694622e-05, + "loss": 0.5408, + "step": 771 + }, + { + "epoch": 0.13724444444444445, + "grad_norm": 0.19530202448368073, + "learning_rate": 4.7717806935231665e-05, + "loss": 0.5342, + "step": 772 + }, + { + "epoch": 0.13742222222222222, + "grad_norm": 0.13586518168449402, + "learning_rate": 4.771197506719181e-05, + "loss": 0.3526, + "step": 773 + }, + { + "epoch": 0.1376, + "grad_norm": 0.16730354726314545, + "learning_rate": 4.770613611464577e-05, + "loss": 0.5968, + "step": 774 + }, + { + "epoch": 0.13777777777777778, + "grad_norm": 0.17443497478961945, + "learning_rate": 4.7700290079414896e-05, + "loss": 0.5546, + "step": 775 + }, + { + "epoch": 0.13795555555555555, + "grad_norm": 0.18248924612998962, + "learning_rate": 4.769443696332272e-05, + "loss": 0.6833, + "step": 776 + }, + { + "epoch": 0.13813333333333333, + "grad_norm": 0.16325588524341583, + "learning_rate": 4.7688576768194994e-05, + "loss": 0.5036, + "step": 777 + }, + { + "epoch": 0.1383111111111111, + "grad_norm": 0.166627436876297, + "learning_rate": 4.768270949585968e-05, + "loss": 0.5887, + "step": 778 + }, + { + "epoch": 0.13848888888888888, + "grad_norm": 0.19861891865730286, + "learning_rate": 4.767683514814696e-05, + "loss": 0.6712, + "step": 779 + }, + { + "epoch": 0.13866666666666666, + "grad_norm": 0.2539740800857544, + "learning_rate": 4.767095372688918e-05, + "loss": 0.6298, + "step": 780 + }, + { + "epoch": 0.13884444444444444, + "grad_norm": 0.17549359798431396, + "learning_rate": 4.7665065233920945e-05, + "loss": 0.5241, + "step": 781 + }, + { + "epoch": 0.1390222222222222, + "grad_norm": 0.17305056750774384, + "learning_rate": 4.765916967107903e-05, + "loss": 0.4617, + "step": 782 + }, + { + "epoch": 0.1392, + "grad_norm": 0.21730421483516693, + "learning_rate": 4.7653267040202436e-05, + "loss": 0.5906, + "step": 783 + }, + { + "epoch": 0.13937777777777777, + "grad_norm": 0.22284889221191406, + "learning_rate": 4.764735734313236e-05, + "loss": 0.7111, + "step": 784 + }, + { + "epoch": 0.13955555555555554, + "grad_norm": 0.18242111802101135, + "learning_rate": 4.764144058171219e-05, + "loss": 0.414, + "step": 785 + }, + { + "epoch": 0.13973333333333332, + "grad_norm": 0.205621600151062, + "learning_rate": 4.763551675778755e-05, + "loss": 0.4781, + "step": 786 + }, + { + "epoch": 0.13991111111111112, + "grad_norm": 0.1954527199268341, + "learning_rate": 4.7629585873206226e-05, + "loss": 0.5383, + "step": 787 + }, + { + "epoch": 0.1400888888888889, + "grad_norm": 0.17568983137607574, + "learning_rate": 4.762364792981825e-05, + "loss": 0.4001, + "step": 788 + }, + { + "epoch": 0.14026666666666668, + "grad_norm": 0.15788014233112335, + "learning_rate": 4.761770292947582e-05, + "loss": 0.5163, + "step": 789 + }, + { + "epoch": 0.14044444444444446, + "grad_norm": 0.2080955058336258, + "learning_rate": 4.7611750874033356e-05, + "loss": 0.6496, + "step": 790 + }, + { + "epoch": 0.14062222222222223, + "grad_norm": 0.2008746862411499, + "learning_rate": 4.760579176534747e-05, + "loss": 0.4522, + "step": 791 + }, + { + "epoch": 0.1408, + "grad_norm": 0.2045242339372635, + "learning_rate": 4.759982560527698e-05, + "loss": 0.5689, + "step": 792 + }, + { + "epoch": 0.14097777777777779, + "grad_norm": 0.1799342781305313, + "learning_rate": 4.759385239568289e-05, + "loss": 0.4113, + "step": 793 + }, + { + "epoch": 0.14115555555555556, + "grad_norm": 0.2251848429441452, + "learning_rate": 4.758787213842842e-05, + "loss": 0.3805, + "step": 794 + }, + { + "epoch": 0.14133333333333334, + "grad_norm": 0.2128487229347229, + "learning_rate": 4.758188483537898e-05, + "loss": 0.4834, + "step": 795 + }, + { + "epoch": 0.14151111111111112, + "grad_norm": 0.24930857121944427, + "learning_rate": 4.7575890488402185e-05, + "loss": 0.4463, + "step": 796 + }, + { + "epoch": 0.1416888888888889, + "grad_norm": 0.25401321053504944, + "learning_rate": 4.7569889099367824e-05, + "loss": 0.4292, + "step": 797 + }, + { + "epoch": 0.14186666666666667, + "grad_norm": 0.25575584173202515, + "learning_rate": 4.756388067014792e-05, + "loss": 0.4536, + "step": 798 + }, + { + "epoch": 0.14204444444444445, + "grad_norm": 0.27672070264816284, + "learning_rate": 4.7557865202616656e-05, + "loss": 0.4438, + "step": 799 + }, + { + "epoch": 0.14222222222222222, + "grad_norm": 0.2639235556125641, + "learning_rate": 4.7551842698650436e-05, + "loss": 0.6028, + "step": 800 + }, + { + "epoch": 0.1424, + "grad_norm": 0.20468570291996002, + "learning_rate": 4.754581316012785e-05, + "loss": 0.733, + "step": 801 + }, + { + "epoch": 0.14257777777777778, + "grad_norm": 0.19943034648895264, + "learning_rate": 4.753977658892967e-05, + "loss": 0.4545, + "step": 802 + }, + { + "epoch": 0.14275555555555555, + "grad_norm": 0.21533729135990143, + "learning_rate": 4.753373298693888e-05, + "loss": 0.5872, + "step": 803 + }, + { + "epoch": 0.14293333333333333, + "grad_norm": 0.20392805337905884, + "learning_rate": 4.752768235604065e-05, + "loss": 0.6231, + "step": 804 + }, + { + "epoch": 0.1431111111111111, + "grad_norm": 0.20478495955467224, + "learning_rate": 4.752162469812234e-05, + "loss": 0.6074, + "step": 805 + }, + { + "epoch": 0.14328888888888888, + "grad_norm": 0.20691512525081635, + "learning_rate": 4.7515560015073514e-05, + "loss": 0.4769, + "step": 806 + }, + { + "epoch": 0.14346666666666666, + "grad_norm": 0.1968759149312973, + "learning_rate": 4.7509488308785905e-05, + "loss": 0.5512, + "step": 807 + }, + { + "epoch": 0.14364444444444444, + "grad_norm": 0.20234976708889008, + "learning_rate": 4.750340958115346e-05, + "loss": 0.5479, + "step": 808 + }, + { + "epoch": 0.1438222222222222, + "grad_norm": 0.25453826785087585, + "learning_rate": 4.749732383407229e-05, + "loss": 0.4733, + "step": 809 + }, + { + "epoch": 0.144, + "grad_norm": 0.2266426533460617, + "learning_rate": 4.749123106944073e-05, + "loss": 0.4805, + "step": 810 + }, + { + "epoch": 0.14417777777777777, + "grad_norm": 0.2288711816072464, + "learning_rate": 4.7485131289159276e-05, + "loss": 0.5747, + "step": 811 + }, + { + "epoch": 0.14435555555555554, + "grad_norm": 0.19953805208206177, + "learning_rate": 4.747902449513063e-05, + "loss": 0.4776, + "step": 812 + }, + { + "epoch": 0.14453333333333335, + "grad_norm": 0.18901818990707397, + "learning_rate": 4.7472910689259655e-05, + "loss": 0.4556, + "step": 813 + }, + { + "epoch": 0.14471111111111112, + "grad_norm": 0.20502203702926636, + "learning_rate": 4.7466789873453444e-05, + "loss": 0.5853, + "step": 814 + }, + { + "epoch": 0.1448888888888889, + "grad_norm": 0.26504093408584595, + "learning_rate": 4.746066204962123e-05, + "loss": 0.5361, + "step": 815 + }, + { + "epoch": 0.14506666666666668, + "grad_norm": 0.20630107820034027, + "learning_rate": 4.745452721967446e-05, + "loss": 0.5575, + "step": 816 + }, + { + "epoch": 0.14524444444444445, + "grad_norm": 0.20522207021713257, + "learning_rate": 4.744838538552677e-05, + "loss": 0.611, + "step": 817 + }, + { + "epoch": 0.14542222222222223, + "grad_norm": 0.20067137479782104, + "learning_rate": 4.744223654909397e-05, + "loss": 0.5078, + "step": 818 + }, + { + "epoch": 0.1456, + "grad_norm": 0.20978687703609467, + "learning_rate": 4.743608071229405e-05, + "loss": 0.4745, + "step": 819 + }, + { + "epoch": 0.14577777777777778, + "grad_norm": 0.19159674644470215, + "learning_rate": 4.742991787704719e-05, + "loss": 0.5607, + "step": 820 + }, + { + "epoch": 0.14595555555555556, + "grad_norm": 0.21390162408351898, + "learning_rate": 4.742374804527575e-05, + "loss": 0.5927, + "step": 821 + }, + { + "epoch": 0.14613333333333334, + "grad_norm": 0.15510669350624084, + "learning_rate": 4.741757121890428e-05, + "loss": 0.4122, + "step": 822 + }, + { + "epoch": 0.14631111111111111, + "grad_norm": 0.21298177540302277, + "learning_rate": 4.741138739985951e-05, + "loss": 0.4671, + "step": 823 + }, + { + "epoch": 0.1464888888888889, + "grad_norm": 0.21576207876205444, + "learning_rate": 4.740519659007033e-05, + "loss": 0.5159, + "step": 824 + }, + { + "epoch": 0.14666666666666667, + "grad_norm": 0.23075196146965027, + "learning_rate": 4.739899879146785e-05, + "loss": 0.5242, + "step": 825 + }, + { + "epoch": 0.14684444444444444, + "grad_norm": 0.15927590429782867, + "learning_rate": 4.7392794005985326e-05, + "loss": 0.3803, + "step": 826 + }, + { + "epoch": 0.14702222222222222, + "grad_norm": 0.19443438947200775, + "learning_rate": 4.7386582235558205e-05, + "loss": 0.5189, + "step": 827 + }, + { + "epoch": 0.1472, + "grad_norm": 0.18872953951358795, + "learning_rate": 4.738036348212412e-05, + "loss": 0.4865, + "step": 828 + }, + { + "epoch": 0.14737777777777777, + "grad_norm": 0.2277461141347885, + "learning_rate": 4.737413774762287e-05, + "loss": 0.5619, + "step": 829 + }, + { + "epoch": 0.14755555555555555, + "grad_norm": 0.17676177620887756, + "learning_rate": 4.7367905033996445e-05, + "loss": 0.4449, + "step": 830 + }, + { + "epoch": 0.14773333333333333, + "grad_norm": 0.2726444900035858, + "learning_rate": 4.7361665343189e-05, + "loss": 0.4874, + "step": 831 + }, + { + "epoch": 0.1479111111111111, + "grad_norm": 0.210150808095932, + "learning_rate": 4.735541867714687e-05, + "loss": 0.5238, + "step": 832 + }, + { + "epoch": 0.14808888888888888, + "grad_norm": 0.17125515639781952, + "learning_rate": 4.734916503781856e-05, + "loss": 0.4957, + "step": 833 + }, + { + "epoch": 0.14826666666666666, + "grad_norm": 0.23999734222888947, + "learning_rate": 4.7342904427154766e-05, + "loss": 0.4937, + "step": 834 + }, + { + "epoch": 0.14844444444444443, + "grad_norm": 0.23215296864509583, + "learning_rate": 4.733663684710835e-05, + "loss": 0.5629, + "step": 835 + }, + { + "epoch": 0.1486222222222222, + "grad_norm": 0.19657696783542633, + "learning_rate": 4.733036229963435e-05, + "loss": 0.4349, + "step": 836 + }, + { + "epoch": 0.1488, + "grad_norm": 0.2089412361383438, + "learning_rate": 4.732408078668995e-05, + "loss": 0.6476, + "step": 837 + }, + { + "epoch": 0.14897777777777776, + "grad_norm": 0.3011063039302826, + "learning_rate": 4.731779231023456e-05, + "loss": 0.5351, + "step": 838 + }, + { + "epoch": 0.14915555555555557, + "grad_norm": 0.16934815049171448, + "learning_rate": 4.731149687222972e-05, + "loss": 0.5273, + "step": 839 + }, + { + "epoch": 0.14933333333333335, + "grad_norm": 0.1976025104522705, + "learning_rate": 4.730519447463916e-05, + "loss": 0.4658, + "step": 840 + }, + { + "epoch": 0.14951111111111112, + "grad_norm": 0.20601053535938263, + "learning_rate": 4.7298885119428773e-05, + "loss": 0.3892, + "step": 841 + }, + { + "epoch": 0.1496888888888889, + "grad_norm": 0.23810157179832458, + "learning_rate": 4.729256880856662e-05, + "loss": 0.467, + "step": 842 + }, + { + "epoch": 0.14986666666666668, + "grad_norm": 0.1992977410554886, + "learning_rate": 4.728624554402295e-05, + "loss": 0.4014, + "step": 843 + }, + { + "epoch": 0.15004444444444445, + "grad_norm": 0.2090865820646286, + "learning_rate": 4.7279915327770155e-05, + "loss": 0.4065, + "step": 844 + }, + { + "epoch": 0.15022222222222223, + "grad_norm": 0.2280810922384262, + "learning_rate": 4.727357816178282e-05, + "loss": 0.4053, + "step": 845 + }, + { + "epoch": 0.1504, + "grad_norm": 0.2561088800430298, + "learning_rate": 4.7267234048037664e-05, + "loss": 0.4134, + "step": 846 + }, + { + "epoch": 0.15057777777777778, + "grad_norm": 0.40030455589294434, + "learning_rate": 4.7260882988513624e-05, + "loss": 0.4952, + "step": 847 + }, + { + "epoch": 0.15075555555555556, + "grad_norm": 0.27520647644996643, + "learning_rate": 4.725452498519175e-05, + "loss": 0.5782, + "step": 848 + }, + { + "epoch": 0.15093333333333334, + "grad_norm": 0.3196900486946106, + "learning_rate": 4.7248160040055304e-05, + "loss": 0.4036, + "step": 849 + }, + { + "epoch": 0.1511111111111111, + "grad_norm": 0.2642897963523865, + "learning_rate": 4.724178815508967e-05, + "loss": 0.5129, + "step": 850 + }, + { + "epoch": 0.1512888888888889, + "grad_norm": 0.18399062752723694, + "learning_rate": 4.723540933228244e-05, + "loss": 0.5883, + "step": 851 + }, + { + "epoch": 0.15146666666666667, + "grad_norm": 0.15808753669261932, + "learning_rate": 4.722902357362333e-05, + "loss": 0.4776, + "step": 852 + }, + { + "epoch": 0.15164444444444444, + "grad_norm": 0.20257540047168732, + "learning_rate": 4.722263088110426e-05, + "loss": 0.4453, + "step": 853 + }, + { + "epoch": 0.15182222222222222, + "grad_norm": 0.21902291476726532, + "learning_rate": 4.721623125671927e-05, + "loss": 0.592, + "step": 854 + }, + { + "epoch": 0.152, + "grad_norm": 0.252695232629776, + "learning_rate": 4.720982470246459e-05, + "loss": 0.4007, + "step": 855 + }, + { + "epoch": 0.15217777777777777, + "grad_norm": 0.17141903936862946, + "learning_rate": 4.720341122033862e-05, + "loss": 0.3564, + "step": 856 + }, + { + "epoch": 0.15235555555555555, + "grad_norm": 0.22648568451404572, + "learning_rate": 4.719699081234188e-05, + "loss": 0.6112, + "step": 857 + }, + { + "epoch": 0.15253333333333333, + "grad_norm": 0.1699402630329132, + "learning_rate": 4.7190563480477095e-05, + "loss": 0.4707, + "step": 858 + }, + { + "epoch": 0.1527111111111111, + "grad_norm": 0.2075439989566803, + "learning_rate": 4.718412922674913e-05, + "loss": 0.4669, + "step": 859 + }, + { + "epoch": 0.15288888888888888, + "grad_norm": 0.20650020241737366, + "learning_rate": 4.717768805316501e-05, + "loss": 0.526, + "step": 860 + }, + { + "epoch": 0.15306666666666666, + "grad_norm": 0.22338451445102692, + "learning_rate": 4.71712399617339e-05, + "loss": 0.5538, + "step": 861 + }, + { + "epoch": 0.15324444444444443, + "grad_norm": 0.24497784674167633, + "learning_rate": 4.7164784954467166e-05, + "loss": 0.544, + "step": 862 + }, + { + "epoch": 0.1534222222222222, + "grad_norm": 0.22911253571510315, + "learning_rate": 4.715832303337829e-05, + "loss": 0.5144, + "step": 863 + }, + { + "epoch": 0.1536, + "grad_norm": 0.23725630342960358, + "learning_rate": 4.715185420048295e-05, + "loss": 0.7483, + "step": 864 + }, + { + "epoch": 0.1537777777777778, + "grad_norm": 0.1991734355688095, + "learning_rate": 4.714537845779894e-05, + "loss": 0.5067, + "step": 865 + }, + { + "epoch": 0.15395555555555557, + "grad_norm": 0.1909249722957611, + "learning_rate": 4.713889580734623e-05, + "loss": 0.4118, + "step": 866 + }, + { + "epoch": 0.15413333333333334, + "grad_norm": 0.1759929060935974, + "learning_rate": 4.7132406251146935e-05, + "loss": 0.438, + "step": 867 + }, + { + "epoch": 0.15431111111111112, + "grad_norm": 0.1974143087863922, + "learning_rate": 4.712590979122534e-05, + "loss": 0.5525, + "step": 868 + }, + { + "epoch": 0.1544888888888889, + "grad_norm": 0.20354808866977692, + "learning_rate": 4.7119406429607885e-05, + "loss": 0.6397, + "step": 869 + }, + { + "epoch": 0.15466666666666667, + "grad_norm": 0.2186330258846283, + "learning_rate": 4.711289616832312e-05, + "loss": 0.5384, + "step": 870 + }, + { + "epoch": 0.15484444444444445, + "grad_norm": 0.21013225615024567, + "learning_rate": 4.710637900940181e-05, + "loss": 0.5498, + "step": 871 + }, + { + "epoch": 0.15502222222222223, + "grad_norm": 0.22667552530765533, + "learning_rate": 4.709985495487682e-05, + "loss": 0.5223, + "step": 872 + }, + { + "epoch": 0.1552, + "grad_norm": 0.2572024464607239, + "learning_rate": 4.7093324006783214e-05, + "loss": 0.5054, + "step": 873 + }, + { + "epoch": 0.15537777777777778, + "grad_norm": 0.16276004910469055, + "learning_rate": 4.708678616715815e-05, + "loss": 0.5291, + "step": 874 + }, + { + "epoch": 0.15555555555555556, + "grad_norm": 0.20620758831501007, + "learning_rate": 4.708024143804097e-05, + "loss": 0.5619, + "step": 875 + }, + { + "epoch": 0.15573333333333333, + "grad_norm": 0.16112466156482697, + "learning_rate": 4.707368982147318e-05, + "loss": 0.5114, + "step": 876 + }, + { + "epoch": 0.1559111111111111, + "grad_norm": 0.20106056332588196, + "learning_rate": 4.706713131949839e-05, + "loss": 0.4416, + "step": 877 + }, + { + "epoch": 0.1560888888888889, + "grad_norm": 0.1672735959291458, + "learning_rate": 4.7060565934162394e-05, + "loss": 0.4864, + "step": 878 + }, + { + "epoch": 0.15626666666666666, + "grad_norm": 0.20644119381904602, + "learning_rate": 4.705399366751312e-05, + "loss": 0.5254, + "step": 879 + }, + { + "epoch": 0.15644444444444444, + "grad_norm": 0.21950022876262665, + "learning_rate": 4.7047414521600644e-05, + "loss": 0.4849, + "step": 880 + }, + { + "epoch": 0.15662222222222222, + "grad_norm": 0.21106302738189697, + "learning_rate": 4.704082849847718e-05, + "loss": 0.4513, + "step": 881 + }, + { + "epoch": 0.1568, + "grad_norm": 0.22631031274795532, + "learning_rate": 4.70342356001971e-05, + "loss": 0.5948, + "step": 882 + }, + { + "epoch": 0.15697777777777777, + "grad_norm": 0.20993725955486298, + "learning_rate": 4.702763582881692e-05, + "loss": 0.476, + "step": 883 + }, + { + "epoch": 0.15715555555555555, + "grad_norm": 0.16196566820144653, + "learning_rate": 4.702102918639528e-05, + "loss": 0.5067, + "step": 884 + }, + { + "epoch": 0.15733333333333333, + "grad_norm": 0.1755910962820053, + "learning_rate": 4.7014415674993e-05, + "loss": 0.4236, + "step": 885 + }, + { + "epoch": 0.1575111111111111, + "grad_norm": 0.17265745997428894, + "learning_rate": 4.7007795296673006e-05, + "loss": 0.3939, + "step": 886 + }, + { + "epoch": 0.15768888888888888, + "grad_norm": 0.2645987570285797, + "learning_rate": 4.700116805350039e-05, + "loss": 0.6418, + "step": 887 + }, + { + "epoch": 0.15786666666666666, + "grad_norm": 0.2062368243932724, + "learning_rate": 4.699453394754236e-05, + "loss": 0.6236, + "step": 888 + }, + { + "epoch": 0.15804444444444443, + "grad_norm": 0.2235916405916214, + "learning_rate": 4.6987892980868296e-05, + "loss": 0.5995, + "step": 889 + }, + { + "epoch": 0.1582222222222222, + "grad_norm": 0.24996426701545715, + "learning_rate": 4.69812451555497e-05, + "loss": 0.4127, + "step": 890 + }, + { + "epoch": 0.1584, + "grad_norm": 0.3111324608325958, + "learning_rate": 4.6974590473660216e-05, + "loss": 0.5198, + "step": 891 + }, + { + "epoch": 0.1585777777777778, + "grad_norm": 0.2417266070842743, + "learning_rate": 4.696792893727562e-05, + "loss": 0.4558, + "step": 892 + }, + { + "epoch": 0.15875555555555557, + "grad_norm": 0.20156078040599823, + "learning_rate": 4.696126054847385e-05, + "loss": 0.4461, + "step": 893 + }, + { + "epoch": 0.15893333333333334, + "grad_norm": 0.19840207695960999, + "learning_rate": 4.695458530933494e-05, + "loss": 0.533, + "step": 894 + }, + { + "epoch": 0.15911111111111112, + "grad_norm": 0.2322389930486679, + "learning_rate": 4.694790322194111e-05, + "loss": 0.4585, + "step": 895 + }, + { + "epoch": 0.1592888888888889, + "grad_norm": 0.23116163909435272, + "learning_rate": 4.694121428837668e-05, + "loss": 0.3636, + "step": 896 + }, + { + "epoch": 0.15946666666666667, + "grad_norm": 0.2592049539089203, + "learning_rate": 4.693451851072811e-05, + "loss": 0.4505, + "step": 897 + }, + { + "epoch": 0.15964444444444445, + "grad_norm": 0.28173017501831055, + "learning_rate": 4.692781589108402e-05, + "loss": 0.4401, + "step": 898 + }, + { + "epoch": 0.15982222222222223, + "grad_norm": 0.22880995273590088, + "learning_rate": 4.6921106431535135e-05, + "loss": 0.452, + "step": 899 + }, + { + "epoch": 0.16, + "grad_norm": 0.33095496892929077, + "learning_rate": 4.691439013417433e-05, + "loss": 0.4577, + "step": 900 + }, + { + "epoch": 0.16017777777777778, + "grad_norm": 0.3279748857021332, + "learning_rate": 4.690766700109659e-05, + "loss": 0.8017, + "step": 901 + }, + { + "epoch": 0.16035555555555556, + "grad_norm": 0.23918560147285461, + "learning_rate": 4.690093703439907e-05, + "loss": 0.3828, + "step": 902 + }, + { + "epoch": 0.16053333333333333, + "grad_norm": 0.19343721866607666, + "learning_rate": 4.689420023618104e-05, + "loss": 0.5045, + "step": 903 + }, + { + "epoch": 0.1607111111111111, + "grad_norm": 0.22671912610530853, + "learning_rate": 4.688745660854388e-05, + "loss": 0.5412, + "step": 904 + }, + { + "epoch": 0.1608888888888889, + "grad_norm": 0.24987496435642242, + "learning_rate": 4.688070615359114e-05, + "loss": 0.4365, + "step": 905 + }, + { + "epoch": 0.16106666666666666, + "grad_norm": 0.23491297662258148, + "learning_rate": 4.687394887342845e-05, + "loss": 0.4586, + "step": 906 + }, + { + "epoch": 0.16124444444444444, + "grad_norm": 0.25794878602027893, + "learning_rate": 4.686718477016361e-05, + "loss": 0.5283, + "step": 907 + }, + { + "epoch": 0.16142222222222222, + "grad_norm": 0.21168023347854614, + "learning_rate": 4.6860413845906534e-05, + "loss": 0.4153, + "step": 908 + }, + { + "epoch": 0.1616, + "grad_norm": 0.18543760478496552, + "learning_rate": 4.6853636102769274e-05, + "loss": 0.4231, + "step": 909 + }, + { + "epoch": 0.16177777777777777, + "grad_norm": 0.20537665486335754, + "learning_rate": 4.684685154286599e-05, + "loss": 0.5388, + "step": 910 + }, + { + "epoch": 0.16195555555555555, + "grad_norm": 0.1905399113893509, + "learning_rate": 4.684006016831297e-05, + "loss": 0.5745, + "step": 911 + }, + { + "epoch": 0.16213333333333332, + "grad_norm": 0.2412562370300293, + "learning_rate": 4.6833261981228646e-05, + "loss": 0.5613, + "step": 912 + }, + { + "epoch": 0.1623111111111111, + "grad_norm": 0.23087134957313538, + "learning_rate": 4.682645698373357e-05, + "loss": 0.4804, + "step": 913 + }, + { + "epoch": 0.16248888888888888, + "grad_norm": 0.2422640025615692, + "learning_rate": 4.68196451779504e-05, + "loss": 0.6175, + "step": 914 + }, + { + "epoch": 0.16266666666666665, + "grad_norm": 0.24661637842655182, + "learning_rate": 4.6812826566003934e-05, + "loss": 0.5568, + "step": 915 + }, + { + "epoch": 0.16284444444444446, + "grad_norm": 0.19879814982414246, + "learning_rate": 4.68060011500211e-05, + "loss": 0.4898, + "step": 916 + }, + { + "epoch": 0.16302222222222224, + "grad_norm": 0.18111403286457062, + "learning_rate": 4.6799168932130915e-05, + "loss": 0.4513, + "step": 917 + }, + { + "epoch": 0.1632, + "grad_norm": 0.2221381664276123, + "learning_rate": 4.679232991446456e-05, + "loss": 0.5464, + "step": 918 + }, + { + "epoch": 0.1633777777777778, + "grad_norm": 0.23238599300384521, + "learning_rate": 4.678548409915532e-05, + "loss": 0.5113, + "step": 919 + }, + { + "epoch": 0.16355555555555557, + "grad_norm": 0.1607072651386261, + "learning_rate": 4.677863148833859e-05, + "loss": 0.5671, + "step": 920 + }, + { + "epoch": 0.16373333333333334, + "grad_norm": 0.23482924699783325, + "learning_rate": 4.6771772084151885e-05, + "loss": 0.5032, + "step": 921 + }, + { + "epoch": 0.16391111111111112, + "grad_norm": 0.20868410170078278, + "learning_rate": 4.676490588873486e-05, + "loss": 0.5348, + "step": 922 + }, + { + "epoch": 0.1640888888888889, + "grad_norm": 0.19650250673294067, + "learning_rate": 4.675803290422927e-05, + "loss": 0.4544, + "step": 923 + }, + { + "epoch": 0.16426666666666667, + "grad_norm": 0.22298850119113922, + "learning_rate": 4.6751153132779e-05, + "loss": 0.6112, + "step": 924 + }, + { + "epoch": 0.16444444444444445, + "grad_norm": 0.20983600616455078, + "learning_rate": 4.674426657653003e-05, + "loss": 0.5648, + "step": 925 + }, + { + "epoch": 0.16462222222222223, + "grad_norm": 0.19523482024669647, + "learning_rate": 4.6737373237630476e-05, + "loss": 0.4889, + "step": 926 + }, + { + "epoch": 0.1648, + "grad_norm": 0.2303411066532135, + "learning_rate": 4.6730473118230575e-05, + "loss": 0.7892, + "step": 927 + }, + { + "epoch": 0.16497777777777778, + "grad_norm": 0.1627681404352188, + "learning_rate": 4.6723566220482664e-05, + "loss": 0.3899, + "step": 928 + }, + { + "epoch": 0.16515555555555556, + "grad_norm": 0.1805935949087143, + "learning_rate": 4.6716652546541194e-05, + "loss": 0.4942, + "step": 929 + }, + { + "epoch": 0.16533333333333333, + "grad_norm": 0.18388409912586212, + "learning_rate": 4.6709732098562745e-05, + "loss": 0.5232, + "step": 930 + }, + { + "epoch": 0.1655111111111111, + "grad_norm": 0.23972368240356445, + "learning_rate": 4.670280487870598e-05, + "loss": 0.4591, + "step": 931 + }, + { + "epoch": 0.16568888888888889, + "grad_norm": 0.21529941260814667, + "learning_rate": 4.6695870889131724e-05, + "loss": 0.5808, + "step": 932 + }, + { + "epoch": 0.16586666666666666, + "grad_norm": 0.23533114790916443, + "learning_rate": 4.668893013200286e-05, + "loss": 0.6702, + "step": 933 + }, + { + "epoch": 0.16604444444444444, + "grad_norm": 0.20815765857696533, + "learning_rate": 4.6681982609484416e-05, + "loss": 0.4385, + "step": 934 + }, + { + "epoch": 0.16622222222222222, + "grad_norm": 0.23493315279483795, + "learning_rate": 4.667502832374352e-05, + "loss": 0.682, + "step": 935 + }, + { + "epoch": 0.1664, + "grad_norm": 0.19730225205421448, + "learning_rate": 4.6668067276949414e-05, + "loss": 0.4217, + "step": 936 + }, + { + "epoch": 0.16657777777777777, + "grad_norm": 0.19495216012001038, + "learning_rate": 4.666109947127343e-05, + "loss": 0.4488, + "step": 937 + }, + { + "epoch": 0.16675555555555555, + "grad_norm": 0.2136872261762619, + "learning_rate": 4.665412490888904e-05, + "loss": 0.6401, + "step": 938 + }, + { + "epoch": 0.16693333333333332, + "grad_norm": 0.20429658889770508, + "learning_rate": 4.66471435919718e-05, + "loss": 0.4743, + "step": 939 + }, + { + "epoch": 0.1671111111111111, + "grad_norm": 0.2563759684562683, + "learning_rate": 4.6640155522699374e-05, + "loss": 0.4652, + "step": 940 + }, + { + "epoch": 0.16728888888888888, + "grad_norm": 0.1696614772081375, + "learning_rate": 4.6633160703251554e-05, + "loss": 0.4073, + "step": 941 + }, + { + "epoch": 0.16746666666666668, + "grad_norm": 0.24591179192066193, + "learning_rate": 4.6626159135810205e-05, + "loss": 0.3833, + "step": 942 + }, + { + "epoch": 0.16764444444444446, + "grad_norm": 0.233788400888443, + "learning_rate": 4.661915082255932e-05, + "loss": 0.3966, + "step": 943 + }, + { + "epoch": 0.16782222222222223, + "grad_norm": 0.2804924249649048, + "learning_rate": 4.6612135765685e-05, + "loss": 0.5185, + "step": 944 + }, + { + "epoch": 0.168, + "grad_norm": 0.2768259644508362, + "learning_rate": 4.660511396737541e-05, + "loss": 0.4721, + "step": 945 + }, + { + "epoch": 0.1681777777777778, + "grad_norm": 0.19400373101234436, + "learning_rate": 4.659808542982088e-05, + "loss": 0.4502, + "step": 946 + }, + { + "epoch": 0.16835555555555556, + "grad_norm": 0.32416754961013794, + "learning_rate": 4.65910501552138e-05, + "loss": 0.506, + "step": 947 + }, + { + "epoch": 0.16853333333333334, + "grad_norm": 0.2552897334098816, + "learning_rate": 4.6584008145748656e-05, + "loss": 0.4193, + "step": 948 + }, + { + "epoch": 0.16871111111111112, + "grad_norm": 0.30237919092178345, + "learning_rate": 4.657695940362207e-05, + "loss": 0.5407, + "step": 949 + }, + { + "epoch": 0.1688888888888889, + "grad_norm": 0.3677632212638855, + "learning_rate": 4.6569903931032735e-05, + "loss": 0.4347, + "step": 950 + }, + { + "epoch": 0.16906666666666667, + "grad_norm": 0.18824970722198486, + "learning_rate": 4.656284173018144e-05, + "loss": 0.5013, + "step": 951 + }, + { + "epoch": 0.16924444444444445, + "grad_norm": 0.2421761155128479, + "learning_rate": 4.65557728032711e-05, + "loss": 0.5574, + "step": 952 + }, + { + "epoch": 0.16942222222222222, + "grad_norm": 0.2794753611087799, + "learning_rate": 4.6548697152506705e-05, + "loss": 0.6012, + "step": 953 + }, + { + "epoch": 0.1696, + "grad_norm": 0.31365451216697693, + "learning_rate": 4.654161478009536e-05, + "loss": 0.6598, + "step": 954 + }, + { + "epoch": 0.16977777777777778, + "grad_norm": 0.2384924292564392, + "learning_rate": 4.653452568824625e-05, + "loss": 0.6682, + "step": 955 + }, + { + "epoch": 0.16995555555555555, + "grad_norm": 0.2735510468482971, + "learning_rate": 4.652742987917066e-05, + "loss": 0.5755, + "step": 956 + }, + { + "epoch": 0.17013333333333333, + "grad_norm": 0.16620174050331116, + "learning_rate": 4.652032735508198e-05, + "loss": 0.3222, + "step": 957 + }, + { + "epoch": 0.1703111111111111, + "grad_norm": 0.2189287394285202, + "learning_rate": 4.651321811819568e-05, + "loss": 0.5817, + "step": 958 + }, + { + "epoch": 0.17048888888888888, + "grad_norm": 0.2255752831697464, + "learning_rate": 4.650610217072934e-05, + "loss": 0.6141, + "step": 959 + }, + { + "epoch": 0.17066666666666666, + "grad_norm": 0.23336412012577057, + "learning_rate": 4.649897951490262e-05, + "loss": 0.545, + "step": 960 + }, + { + "epoch": 0.17084444444444444, + "grad_norm": 0.20710806548595428, + "learning_rate": 4.649185015293728e-05, + "loss": 0.5042, + "step": 961 + }, + { + "epoch": 0.17102222222222221, + "grad_norm": 0.217239111661911, + "learning_rate": 4.648471408705717e-05, + "loss": 0.4779, + "step": 962 + }, + { + "epoch": 0.1712, + "grad_norm": 0.19258487224578857, + "learning_rate": 4.647757131948822e-05, + "loss": 0.5445, + "step": 963 + }, + { + "epoch": 0.17137777777777777, + "grad_norm": 0.16922153532505035, + "learning_rate": 4.647042185245847e-05, + "loss": 0.375, + "step": 964 + }, + { + "epoch": 0.17155555555555554, + "grad_norm": 0.22192326188087463, + "learning_rate": 4.6463265688198044e-05, + "loss": 0.5454, + "step": 965 + }, + { + "epoch": 0.17173333333333332, + "grad_norm": 0.24540521204471588, + "learning_rate": 4.645610282893915e-05, + "loss": 0.5599, + "step": 966 + }, + { + "epoch": 0.1719111111111111, + "grad_norm": 0.19649697840213776, + "learning_rate": 4.6448933276916076e-05, + "loss": 0.4368, + "step": 967 + }, + { + "epoch": 0.1720888888888889, + "grad_norm": 0.2000036984682083, + "learning_rate": 4.644175703436522e-05, + "loss": 0.525, + "step": 968 + }, + { + "epoch": 0.17226666666666668, + "grad_norm": 0.20743907988071442, + "learning_rate": 4.6434574103525044e-05, + "loss": 0.4669, + "step": 969 + }, + { + "epoch": 0.17244444444444446, + "grad_norm": 0.178581103682518, + "learning_rate": 4.6427384486636113e-05, + "loss": 0.3846, + "step": 970 + }, + { + "epoch": 0.17262222222222223, + "grad_norm": 0.21291114389896393, + "learning_rate": 4.642018818594107e-05, + "loss": 0.5845, + "step": 971 + }, + { + "epoch": 0.1728, + "grad_norm": 0.22005410492420197, + "learning_rate": 4.6412985203684654e-05, + "loss": 0.4798, + "step": 972 + }, + { + "epoch": 0.17297777777777779, + "grad_norm": 0.242549866437912, + "learning_rate": 4.640577554211366e-05, + "loss": 0.5982, + "step": 973 + }, + { + "epoch": 0.17315555555555556, + "grad_norm": 0.22279231250286102, + "learning_rate": 4.639855920347701e-05, + "loss": 0.5054, + "step": 974 + }, + { + "epoch": 0.17333333333333334, + "grad_norm": 0.2300112545490265, + "learning_rate": 4.6391336190025644e-05, + "loss": 0.4876, + "step": 975 + }, + { + "epoch": 0.17351111111111112, + "grad_norm": 0.19521552324295044, + "learning_rate": 4.638410650401267e-05, + "loss": 0.3871, + "step": 976 + }, + { + "epoch": 0.1736888888888889, + "grad_norm": 0.19378717243671417, + "learning_rate": 4.6376870147693196e-05, + "loss": 0.5497, + "step": 977 + }, + { + "epoch": 0.17386666666666667, + "grad_norm": 0.23334316909313202, + "learning_rate": 4.6369627123324465e-05, + "loss": 0.4768, + "step": 978 + }, + { + "epoch": 0.17404444444444445, + "grad_norm": 0.2977173924446106, + "learning_rate": 4.636237743316578e-05, + "loss": 0.567, + "step": 979 + }, + { + "epoch": 0.17422222222222222, + "grad_norm": 0.20177583396434784, + "learning_rate": 4.635512107947851e-05, + "loss": 0.4708, + "step": 980 + }, + { + "epoch": 0.1744, + "grad_norm": 0.2482595294713974, + "learning_rate": 4.6347858064526125e-05, + "loss": 0.5163, + "step": 981 + }, + { + "epoch": 0.17457777777777778, + "grad_norm": 0.23634855449199677, + "learning_rate": 4.634058839057417e-05, + "loss": 0.4976, + "step": 982 + }, + { + "epoch": 0.17475555555555555, + "grad_norm": 0.26508915424346924, + "learning_rate": 4.6333312059890256e-05, + "loss": 0.4423, + "step": 983 + }, + { + "epoch": 0.17493333333333333, + "grad_norm": 0.1751834601163864, + "learning_rate": 4.6326029074744074e-05, + "loss": 0.461, + "step": 984 + }, + { + "epoch": 0.1751111111111111, + "grad_norm": 0.2352793961763382, + "learning_rate": 4.63187394374074e-05, + "loss": 0.5458, + "step": 985 + }, + { + "epoch": 0.17528888888888888, + "grad_norm": 0.189413383603096, + "learning_rate": 4.631144315015407e-05, + "loss": 0.4946, + "step": 986 + }, + { + "epoch": 0.17546666666666666, + "grad_norm": 0.22011128067970276, + "learning_rate": 4.630414021525999e-05, + "loss": 0.5682, + "step": 987 + }, + { + "epoch": 0.17564444444444444, + "grad_norm": 0.2838226556777954, + "learning_rate": 4.629683063500319e-05, + "loss": 0.6517, + "step": 988 + }, + { + "epoch": 0.1758222222222222, + "grad_norm": 0.18364161252975464, + "learning_rate": 4.62895144116637e-05, + "loss": 0.4133, + "step": 989 + }, + { + "epoch": 0.176, + "grad_norm": 0.18876802921295166, + "learning_rate": 4.628219154752367e-05, + "loss": 0.5709, + "step": 990 + }, + { + "epoch": 0.17617777777777777, + "grad_norm": 0.23001141846179962, + "learning_rate": 4.6274862044867304e-05, + "loss": 0.517, + "step": 991 + }, + { + "epoch": 0.17635555555555554, + "grad_norm": 0.24718807637691498, + "learning_rate": 4.626752590598088e-05, + "loss": 0.3803, + "step": 992 + }, + { + "epoch": 0.17653333333333332, + "grad_norm": 0.26053664088249207, + "learning_rate": 4.626018313315275e-05, + "loss": 0.3827, + "step": 993 + }, + { + "epoch": 0.17671111111111112, + "grad_norm": 0.25779855251312256, + "learning_rate": 4.625283372867333e-05, + "loss": 0.3799, + "step": 994 + }, + { + "epoch": 0.1768888888888889, + "grad_norm": 0.22719506919384003, + "learning_rate": 4.6245477694835106e-05, + "loss": 0.3566, + "step": 995 + }, + { + "epoch": 0.17706666666666668, + "grad_norm": 0.3598608076572418, + "learning_rate": 4.6238115033932636e-05, + "loss": 0.4218, + "step": 996 + }, + { + "epoch": 0.17724444444444445, + "grad_norm": 0.3139735758304596, + "learning_rate": 4.623074574826254e-05, + "loss": 0.4723, + "step": 997 + }, + { + "epoch": 0.17742222222222223, + "grad_norm": 0.32300683856010437, + "learning_rate": 4.622336984012351e-05, + "loss": 0.4039, + "step": 998 + }, + { + "epoch": 0.1776, + "grad_norm": 0.35856425762176514, + "learning_rate": 4.621598731181629e-05, + "loss": 0.4217, + "step": 999 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 0.45077309012413025, + "learning_rate": 4.6208598165643715e-05, + "loss": 0.4163, + "step": 1000 + }, + { + "epoch": 0.17795555555555556, + "grad_norm": 0.22296997904777527, + "learning_rate": 4.620120240391065e-05, + "loss": 0.5907, + "step": 1001 + }, + { + "epoch": 0.17813333333333334, + "grad_norm": 0.2576141655445099, + "learning_rate": 4.619380002892406e-05, + "loss": 0.4594, + "step": 1002 + }, + { + "epoch": 0.17831111111111111, + "grad_norm": 0.26877814531326294, + "learning_rate": 4.618639104299294e-05, + "loss": 0.637, + "step": 1003 + }, + { + "epoch": 0.1784888888888889, + "grad_norm": 0.18491525948047638, + "learning_rate": 4.617897544842836e-05, + "loss": 0.4661, + "step": 1004 + }, + { + "epoch": 0.17866666666666667, + "grad_norm": 0.2318442314863205, + "learning_rate": 4.617155324754346e-05, + "loss": 0.5804, + "step": 1005 + }, + { + "epoch": 0.17884444444444444, + "grad_norm": 0.2059689462184906, + "learning_rate": 4.616412444265345e-05, + "loss": 0.5935, + "step": 1006 + }, + { + "epoch": 0.17902222222222222, + "grad_norm": 0.1740427315235138, + "learning_rate": 4.6156689036075555e-05, + "loss": 0.4648, + "step": 1007 + }, + { + "epoch": 0.1792, + "grad_norm": 0.20965313911437988, + "learning_rate": 4.614924703012911e-05, + "loss": 0.48, + "step": 1008 + }, + { + "epoch": 0.17937777777777777, + "grad_norm": 0.22500717639923096, + "learning_rate": 4.614179842713547e-05, + "loss": 0.5164, + "step": 1009 + }, + { + "epoch": 0.17955555555555555, + "grad_norm": 0.18078558146953583, + "learning_rate": 4.6134343229418075e-05, + "loss": 0.5545, + "step": 1010 + }, + { + "epoch": 0.17973333333333333, + "grad_norm": 0.19148242473602295, + "learning_rate": 4.612688143930242e-05, + "loss": 0.3739, + "step": 1011 + }, + { + "epoch": 0.1799111111111111, + "grad_norm": 0.25137242674827576, + "learning_rate": 4.611941305911602e-05, + "loss": 0.506, + "step": 1012 + }, + { + "epoch": 0.18008888888888888, + "grad_norm": 0.18720732629299164, + "learning_rate": 4.61119380911885e-05, + "loss": 0.392, + "step": 1013 + }, + { + "epoch": 0.18026666666666666, + "grad_norm": 0.21631205081939697, + "learning_rate": 4.610445653785151e-05, + "loss": 0.5299, + "step": 1014 + }, + { + "epoch": 0.18044444444444444, + "grad_norm": 0.17647314071655273, + "learning_rate": 4.6096968401438745e-05, + "loss": 0.4512, + "step": 1015 + }, + { + "epoch": 0.1806222222222222, + "grad_norm": 0.17952392995357513, + "learning_rate": 4.6089473684285974e-05, + "loss": 0.4693, + "step": 1016 + }, + { + "epoch": 0.1808, + "grad_norm": 0.1992589831352234, + "learning_rate": 4.608197238873101e-05, + "loss": 0.5205, + "step": 1017 + }, + { + "epoch": 0.18097777777777777, + "grad_norm": 0.17946861684322357, + "learning_rate": 4.607446451711372e-05, + "loss": 0.4142, + "step": 1018 + }, + { + "epoch": 0.18115555555555554, + "grad_norm": 0.22301191091537476, + "learning_rate": 4.6066950071776015e-05, + "loss": 0.5484, + "step": 1019 + }, + { + "epoch": 0.18133333333333335, + "grad_norm": 0.18079876899719238, + "learning_rate": 4.605942905506188e-05, + "loss": 0.4731, + "step": 1020 + }, + { + "epoch": 0.18151111111111112, + "grad_norm": 0.24740757048130035, + "learning_rate": 4.605190146931731e-05, + "loss": 0.4866, + "step": 1021 + }, + { + "epoch": 0.1816888888888889, + "grad_norm": 0.20163549482822418, + "learning_rate": 4.6044367316890386e-05, + "loss": 0.536, + "step": 1022 + }, + { + "epoch": 0.18186666666666668, + "grad_norm": 0.2963416278362274, + "learning_rate": 4.6036826600131216e-05, + "loss": 0.4296, + "step": 1023 + }, + { + "epoch": 0.18204444444444445, + "grad_norm": 0.20758002996444702, + "learning_rate": 4.602927932139197e-05, + "loss": 0.5467, + "step": 1024 + }, + { + "epoch": 0.18222222222222223, + "grad_norm": 0.2044724076986313, + "learning_rate": 4.602172548302684e-05, + "loss": 0.548, + "step": 1025 + }, + { + "epoch": 0.1824, + "grad_norm": 0.2220332771539688, + "learning_rate": 4.601416508739211e-05, + "loss": 0.5708, + "step": 1026 + }, + { + "epoch": 0.18257777777777778, + "grad_norm": 0.20552785694599152, + "learning_rate": 4.6006598136846056e-05, + "loss": 0.6461, + "step": 1027 + }, + { + "epoch": 0.18275555555555556, + "grad_norm": 0.24580639600753784, + "learning_rate": 4.599902463374903e-05, + "loss": 0.5615, + "step": 1028 + }, + { + "epoch": 0.18293333333333334, + "grad_norm": 0.19479405879974365, + "learning_rate": 4.599144458046343e-05, + "loss": 0.4593, + "step": 1029 + }, + { + "epoch": 0.1831111111111111, + "grad_norm": 0.21928654611110687, + "learning_rate": 4.598385797935368e-05, + "loss": 0.5547, + "step": 1030 + }, + { + "epoch": 0.1832888888888889, + "grad_norm": 0.2328445017337799, + "learning_rate": 4.597626483278625e-05, + "loss": 0.6427, + "step": 1031 + }, + { + "epoch": 0.18346666666666667, + "grad_norm": 0.2163815051317215, + "learning_rate": 4.596866514312967e-05, + "loss": 0.5794, + "step": 1032 + }, + { + "epoch": 0.18364444444444444, + "grad_norm": 0.18270502984523773, + "learning_rate": 4.596105891275449e-05, + "loss": 0.4292, + "step": 1033 + }, + { + "epoch": 0.18382222222222222, + "grad_norm": 0.205346941947937, + "learning_rate": 4.5953446144033316e-05, + "loss": 0.4258, + "step": 1034 + }, + { + "epoch": 0.184, + "grad_norm": 0.1878262460231781, + "learning_rate": 4.594582683934078e-05, + "loss": 0.3921, + "step": 1035 + }, + { + "epoch": 0.18417777777777777, + "grad_norm": 0.2462712675333023, + "learning_rate": 4.593820100105355e-05, + "loss": 0.6257, + "step": 1036 + }, + { + "epoch": 0.18435555555555555, + "grad_norm": 0.21715806424617767, + "learning_rate": 4.593056863155034e-05, + "loss": 0.489, + "step": 1037 + }, + { + "epoch": 0.18453333333333333, + "grad_norm": 0.1828193962574005, + "learning_rate": 4.5922929733211926e-05, + "loss": 0.3685, + "step": 1038 + }, + { + "epoch": 0.1847111111111111, + "grad_norm": 0.21273817121982574, + "learning_rate": 4.591528430842107e-05, + "loss": 0.4879, + "step": 1039 + }, + { + "epoch": 0.18488888888888888, + "grad_norm": 0.24157774448394775, + "learning_rate": 4.59076323595626e-05, + "loss": 0.4499, + "step": 1040 + }, + { + "epoch": 0.18506666666666666, + "grad_norm": 0.31074634194374084, + "learning_rate": 4.589997388902338e-05, + "loss": 0.4216, + "step": 1041 + }, + { + "epoch": 0.18524444444444443, + "grad_norm": 0.3191879093647003, + "learning_rate": 4.589230889919232e-05, + "loss": 0.4199, + "step": 1042 + }, + { + "epoch": 0.1854222222222222, + "grad_norm": 0.26181063055992126, + "learning_rate": 4.5884637392460314e-05, + "loss": 0.4213, + "step": 1043 + }, + { + "epoch": 0.1856, + "grad_norm": 0.294725239276886, + "learning_rate": 4.5876959371220344e-05, + "loss": 0.4161, + "step": 1044 + }, + { + "epoch": 0.18577777777777776, + "grad_norm": 0.2438194751739502, + "learning_rate": 4.5869274837867394e-05, + "loss": 0.441, + "step": 1045 + }, + { + "epoch": 0.18595555555555557, + "grad_norm": 0.24105407297611237, + "learning_rate": 4.586158379479848e-05, + "loss": 0.4317, + "step": 1046 + }, + { + "epoch": 0.18613333333333335, + "grad_norm": 0.3104403614997864, + "learning_rate": 4.585388624441267e-05, + "loss": 0.4428, + "step": 1047 + }, + { + "epoch": 0.18631111111111112, + "grad_norm": 0.26926448941230774, + "learning_rate": 4.5846182189111035e-05, + "loss": 0.5026, + "step": 1048 + }, + { + "epoch": 0.1864888888888889, + "grad_norm": 0.360380619764328, + "learning_rate": 4.58384716312967e-05, + "loss": 0.5785, + "step": 1049 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 0.4018998444080353, + "learning_rate": 4.583075457337479e-05, + "loss": 0.4732, + "step": 1050 + }, + { + "epoch": 0.18684444444444445, + "grad_norm": 0.22973701357841492, + "learning_rate": 4.5823031017752485e-05, + "loss": 0.6335, + "step": 1051 + }, + { + "epoch": 0.18702222222222223, + "grad_norm": 0.1961563527584076, + "learning_rate": 4.581530096683898e-05, + "loss": 0.4715, + "step": 1052 + }, + { + "epoch": 0.1872, + "grad_norm": 0.24128082394599915, + "learning_rate": 4.580756442304549e-05, + "loss": 0.4982, + "step": 1053 + }, + { + "epoch": 0.18737777777777778, + "grad_norm": 0.2485327571630478, + "learning_rate": 4.579982138878527e-05, + "loss": 0.6328, + "step": 1054 + }, + { + "epoch": 0.18755555555555556, + "grad_norm": 0.230249285697937, + "learning_rate": 4.579207186647357e-05, + "loss": 0.5677, + "step": 1055 + }, + { + "epoch": 0.18773333333333334, + "grad_norm": 0.20927925407886505, + "learning_rate": 4.5784315858527715e-05, + "loss": 0.4342, + "step": 1056 + }, + { + "epoch": 0.1879111111111111, + "grad_norm": 0.24494759738445282, + "learning_rate": 4.5776553367367e-05, + "loss": 0.4641, + "step": 1057 + }, + { + "epoch": 0.1880888888888889, + "grad_norm": 0.21655695140361786, + "learning_rate": 4.576878439541278e-05, + "loss": 0.4867, + "step": 1058 + }, + { + "epoch": 0.18826666666666667, + "grad_norm": 0.2592152953147888, + "learning_rate": 4.57610089450884e-05, + "loss": 0.6213, + "step": 1059 + }, + { + "epoch": 0.18844444444444444, + "grad_norm": 0.18461069464683533, + "learning_rate": 4.575322701881926e-05, + "loss": 0.509, + "step": 1060 + }, + { + "epoch": 0.18862222222222222, + "grad_norm": 0.2437116503715515, + "learning_rate": 4.574543861903274e-05, + "loss": 0.6255, + "step": 1061 + }, + { + "epoch": 0.1888, + "grad_norm": 0.2580753564834595, + "learning_rate": 4.5737643748158295e-05, + "loss": 0.529, + "step": 1062 + }, + { + "epoch": 0.18897777777777777, + "grad_norm": 0.22123725712299347, + "learning_rate": 4.5729842408627334e-05, + "loss": 0.6493, + "step": 1063 + }, + { + "epoch": 0.18915555555555555, + "grad_norm": 0.23026639223098755, + "learning_rate": 4.572203460287333e-05, + "loss": 0.456, + "step": 1064 + }, + { + "epoch": 0.18933333333333333, + "grad_norm": 0.24028520286083221, + "learning_rate": 4.5714220333331756e-05, + "loss": 0.4786, + "step": 1065 + }, + { + "epoch": 0.1895111111111111, + "grad_norm": 0.19466330111026764, + "learning_rate": 4.5706399602440106e-05, + "loss": 0.4933, + "step": 1066 + }, + { + "epoch": 0.18968888888888888, + "grad_norm": 0.244710773229599, + "learning_rate": 4.569857241263788e-05, + "loss": 0.6948, + "step": 1067 + }, + { + "epoch": 0.18986666666666666, + "grad_norm": 0.2331545650959015, + "learning_rate": 4.56907387663666e-05, + "loss": 0.5679, + "step": 1068 + }, + { + "epoch": 0.19004444444444443, + "grad_norm": 0.21071632206439972, + "learning_rate": 4.568289866606981e-05, + "loss": 0.5083, + "step": 1069 + }, + { + "epoch": 0.1902222222222222, + "grad_norm": 0.2760453522205353, + "learning_rate": 4.567505211419305e-05, + "loss": 0.5247, + "step": 1070 + }, + { + "epoch": 0.1904, + "grad_norm": 0.21211619675159454, + "learning_rate": 4.566719911318389e-05, + "loss": 0.4151, + "step": 1071 + }, + { + "epoch": 0.1905777777777778, + "grad_norm": 0.19759276509284973, + "learning_rate": 4.565933966549189e-05, + "loss": 0.4837, + "step": 1072 + }, + { + "epoch": 0.19075555555555557, + "grad_norm": 0.2720250189304352, + "learning_rate": 4.565147377356864e-05, + "loss": 0.5794, + "step": 1073 + }, + { + "epoch": 0.19093333333333334, + "grad_norm": 0.21481817960739136, + "learning_rate": 4.5643601439867734e-05, + "loss": 0.4398, + "step": 1074 + }, + { + "epoch": 0.19111111111111112, + "grad_norm": 0.25854310393333435, + "learning_rate": 4.5635722666844775e-05, + "loss": 0.5797, + "step": 1075 + }, + { + "epoch": 0.1912888888888889, + "grad_norm": 0.27354690432548523, + "learning_rate": 4.562783745695738e-05, + "loss": 0.5776, + "step": 1076 + }, + { + "epoch": 0.19146666666666667, + "grad_norm": 0.1841791272163391, + "learning_rate": 4.561994581266516e-05, + "loss": 0.4142, + "step": 1077 + }, + { + "epoch": 0.19164444444444445, + "grad_norm": 0.231328085064888, + "learning_rate": 4.561204773642974e-05, + "loss": 0.3986, + "step": 1078 + }, + { + "epoch": 0.19182222222222223, + "grad_norm": 0.23936523497104645, + "learning_rate": 4.560414323071477e-05, + "loss": 0.473, + "step": 1079 + }, + { + "epoch": 0.192, + "grad_norm": 0.3074245750904083, + "learning_rate": 4.559623229798587e-05, + "loss": 0.6815, + "step": 1080 + }, + { + "epoch": 0.19217777777777778, + "grad_norm": 0.23218508064746857, + "learning_rate": 4.558831494071069e-05, + "loss": 0.4819, + "step": 1081 + }, + { + "epoch": 0.19235555555555556, + "grad_norm": 0.21664172410964966, + "learning_rate": 4.558039116135887e-05, + "loss": 0.5056, + "step": 1082 + }, + { + "epoch": 0.19253333333333333, + "grad_norm": 0.21905098855495453, + "learning_rate": 4.5572460962402075e-05, + "loss": 0.5181, + "step": 1083 + }, + { + "epoch": 0.1927111111111111, + "grad_norm": 0.21040025353431702, + "learning_rate": 4.556452434631395e-05, + "loss": 0.4904, + "step": 1084 + }, + { + "epoch": 0.1928888888888889, + "grad_norm": 0.2571915090084076, + "learning_rate": 4.555658131557015e-05, + "loss": 0.53, + "step": 1085 + }, + { + "epoch": 0.19306666666666666, + "grad_norm": 0.25061699748039246, + "learning_rate": 4.5548631872648326e-05, + "loss": 0.4612, + "step": 1086 + }, + { + "epoch": 0.19324444444444444, + "grad_norm": 0.21951662003993988, + "learning_rate": 4.5540676020028145e-05, + "loss": 0.4758, + "step": 1087 + }, + { + "epoch": 0.19342222222222222, + "grad_norm": 0.23268786072731018, + "learning_rate": 4.553271376019125e-05, + "loss": 0.4479, + "step": 1088 + }, + { + "epoch": 0.1936, + "grad_norm": 0.21990342438220978, + "learning_rate": 4.55247450956213e-05, + "loss": 0.4055, + "step": 1089 + }, + { + "epoch": 0.19377777777777777, + "grad_norm": 0.29903778433799744, + "learning_rate": 4.5516770028803954e-05, + "loss": 0.4866, + "step": 1090 + }, + { + "epoch": 0.19395555555555555, + "grad_norm": 0.27366170287132263, + "learning_rate": 4.550878856222685e-05, + "loss": 0.3603, + "step": 1091 + }, + { + "epoch": 0.19413333333333332, + "grad_norm": 0.26119598746299744, + "learning_rate": 4.5500800698379624e-05, + "loss": 0.5072, + "step": 1092 + }, + { + "epoch": 0.1943111111111111, + "grad_norm": 0.2572188675403595, + "learning_rate": 4.5492806439753935e-05, + "loss": 0.5147, + "step": 1093 + }, + { + "epoch": 0.19448888888888888, + "grad_norm": 0.23310203850269318, + "learning_rate": 4.548480578884341e-05, + "loss": 0.4296, + "step": 1094 + }, + { + "epoch": 0.19466666666666665, + "grad_norm": 0.3724033236503601, + "learning_rate": 4.547679874814368e-05, + "loss": 0.4525, + "step": 1095 + }, + { + "epoch": 0.19484444444444443, + "grad_norm": 0.3192828595638275, + "learning_rate": 4.5468785320152365e-05, + "loss": 0.413, + "step": 1096 + }, + { + "epoch": 0.19502222222222224, + "grad_norm": 0.2911534011363983, + "learning_rate": 4.5460765507369084e-05, + "loss": 0.4974, + "step": 1097 + }, + { + "epoch": 0.1952, + "grad_norm": 0.3280751407146454, + "learning_rate": 4.5452739312295436e-05, + "loss": 0.4672, + "step": 1098 + }, + { + "epoch": 0.1953777777777778, + "grad_norm": 0.2735004425048828, + "learning_rate": 4.5444706737435014e-05, + "loss": 0.4395, + "step": 1099 + }, + { + "epoch": 0.19555555555555557, + "grad_norm": 0.35013899207115173, + "learning_rate": 4.543666778529342e-05, + "loss": 0.4506, + "step": 1100 + }, + { + "epoch": 0.19573333333333334, + "grad_norm": 0.23073118925094604, + "learning_rate": 4.542862245837821e-05, + "loss": 0.5279, + "step": 1101 + }, + { + "epoch": 0.19591111111111112, + "grad_norm": 0.26068708300590515, + "learning_rate": 4.542057075919897e-05, + "loss": 0.5842, + "step": 1102 + }, + { + "epoch": 0.1960888888888889, + "grad_norm": 0.30220794677734375, + "learning_rate": 4.5412512690267246e-05, + "loss": 0.4728, + "step": 1103 + }, + { + "epoch": 0.19626666666666667, + "grad_norm": 0.19572116434574127, + "learning_rate": 4.540444825409657e-05, + "loss": 0.5194, + "step": 1104 + }, + { + "epoch": 0.19644444444444445, + "grad_norm": 0.2343786358833313, + "learning_rate": 4.5396377453202466e-05, + "loss": 0.4486, + "step": 1105 + }, + { + "epoch": 0.19662222222222223, + "grad_norm": 0.24793241918087006, + "learning_rate": 4.5388300290102456e-05, + "loss": 0.4397, + "step": 1106 + }, + { + "epoch": 0.1968, + "grad_norm": 0.2507890462875366, + "learning_rate": 4.538021676731603e-05, + "loss": 0.3959, + "step": 1107 + }, + { + "epoch": 0.19697777777777778, + "grad_norm": 0.23967313766479492, + "learning_rate": 4.5372126887364655e-05, + "loss": 0.5535, + "step": 1108 + }, + { + "epoch": 0.19715555555555556, + "grad_norm": 0.22210414707660675, + "learning_rate": 4.536403065277182e-05, + "loss": 0.5231, + "step": 1109 + }, + { + "epoch": 0.19733333333333333, + "grad_norm": 0.2142551988363266, + "learning_rate": 4.535592806606294e-05, + "loss": 0.5618, + "step": 1110 + }, + { + "epoch": 0.1975111111111111, + "grad_norm": 0.19544768333435059, + "learning_rate": 4.534781912976546e-05, + "loss": 0.5069, + "step": 1111 + }, + { + "epoch": 0.1976888888888889, + "grad_norm": 0.2284489870071411, + "learning_rate": 4.533970384640877e-05, + "loss": 0.4705, + "step": 1112 + }, + { + "epoch": 0.19786666666666666, + "grad_norm": 0.17595472931861877, + "learning_rate": 4.533158221852427e-05, + "loss": 0.4573, + "step": 1113 + }, + { + "epoch": 0.19804444444444444, + "grad_norm": 0.25575509667396545, + "learning_rate": 4.5323454248645324e-05, + "loss": 0.4847, + "step": 1114 + }, + { + "epoch": 0.19822222222222222, + "grad_norm": 0.20740462839603424, + "learning_rate": 4.531531993930727e-05, + "loss": 0.473, + "step": 1115 + }, + { + "epoch": 0.1984, + "grad_norm": 0.1732252985239029, + "learning_rate": 4.530717929304743e-05, + "loss": 0.3946, + "step": 1116 + }, + { + "epoch": 0.19857777777777777, + "grad_norm": 0.17060326039791107, + "learning_rate": 4.529903231240511e-05, + "loss": 0.4146, + "step": 1117 + }, + { + "epoch": 0.19875555555555555, + "grad_norm": 0.18030692636966705, + "learning_rate": 4.529087899992156e-05, + "loss": 0.4589, + "step": 1118 + }, + { + "epoch": 0.19893333333333332, + "grad_norm": 0.1954290270805359, + "learning_rate": 4.5282719358140056e-05, + "loss": 0.5066, + "step": 1119 + }, + { + "epoch": 0.1991111111111111, + "grad_norm": 0.1500605344772339, + "learning_rate": 4.52745533896058e-05, + "loss": 0.4056, + "step": 1120 + }, + { + "epoch": 0.19928888888888888, + "grad_norm": 0.218876451253891, + "learning_rate": 4.5266381096866e-05, + "loss": 0.485, + "step": 1121 + }, + { + "epoch": 0.19946666666666665, + "grad_norm": 0.22948206961154938, + "learning_rate": 4.525820248246982e-05, + "loss": 0.6035, + "step": 1122 + }, + { + "epoch": 0.19964444444444446, + "grad_norm": 0.2785491943359375, + "learning_rate": 4.5250017548968404e-05, + "loss": 0.5002, + "step": 1123 + }, + { + "epoch": 0.19982222222222223, + "grad_norm": 0.18432079255580902, + "learning_rate": 4.524182629891486e-05, + "loss": 0.4337, + "step": 1124 + }, + { + "epoch": 0.2, + "grad_norm": 0.22611063718795776, + "learning_rate": 4.523362873486427e-05, + "loss": 0.4745, + "step": 1125 + }, + { + "epoch": 0.2001777777777778, + "grad_norm": 0.239525705575943, + "learning_rate": 4.522542485937369e-05, + "loss": 0.4696, + "step": 1126 + }, + { + "epoch": 0.20035555555555556, + "grad_norm": 0.18857158720493317, + "learning_rate": 4.521721467500213e-05, + "loss": 0.4492, + "step": 1127 + }, + { + "epoch": 0.20053333333333334, + "grad_norm": 0.17476879060268402, + "learning_rate": 4.5208998184310596e-05, + "loss": 0.3725, + "step": 1128 + }, + { + "epoch": 0.20071111111111112, + "grad_norm": 0.22804075479507446, + "learning_rate": 4.5200775389862026e-05, + "loss": 0.4564, + "step": 1129 + }, + { + "epoch": 0.2008888888888889, + "grad_norm": 0.1928485482931137, + "learning_rate": 4.519254629422136e-05, + "loss": 0.5353, + "step": 1130 + }, + { + "epoch": 0.20106666666666667, + "grad_norm": 0.21351028978824615, + "learning_rate": 4.5184310899955465e-05, + "loss": 0.5254, + "step": 1131 + }, + { + "epoch": 0.20124444444444445, + "grad_norm": 0.22923269867897034, + "learning_rate": 4.51760692096332e-05, + "loss": 0.5075, + "step": 1132 + }, + { + "epoch": 0.20142222222222222, + "grad_norm": 0.24213163554668427, + "learning_rate": 4.516782122582538e-05, + "loss": 0.3929, + "step": 1133 + }, + { + "epoch": 0.2016, + "grad_norm": 0.2357289046049118, + "learning_rate": 4.5159566951104796e-05, + "loss": 0.5673, + "step": 1134 + }, + { + "epoch": 0.20177777777777778, + "grad_norm": 0.28709009289741516, + "learning_rate": 4.5151306388046175e-05, + "loss": 0.5258, + "step": 1135 + }, + { + "epoch": 0.20195555555555555, + "grad_norm": 0.22160127758979797, + "learning_rate": 4.5143039539226234e-05, + "loss": 0.4142, + "step": 1136 + }, + { + "epoch": 0.20213333333333333, + "grad_norm": 0.24209551513195038, + "learning_rate": 4.513476640722362e-05, + "loss": 0.5669, + "step": 1137 + }, + { + "epoch": 0.2023111111111111, + "grad_norm": 0.26472365856170654, + "learning_rate": 4.512648699461897e-05, + "loss": 0.5035, + "step": 1138 + }, + { + "epoch": 0.20248888888888888, + "grad_norm": 0.2960183918476105, + "learning_rate": 4.511820130399485e-05, + "loss": 0.5571, + "step": 1139 + }, + { + "epoch": 0.20266666666666666, + "grad_norm": 0.2725565433502197, + "learning_rate": 4.510990933793583e-05, + "loss": 0.3111, + "step": 1140 + }, + { + "epoch": 0.20284444444444444, + "grad_norm": 0.31567874550819397, + "learning_rate": 4.510161109902837e-05, + "loss": 0.3578, + "step": 1141 + }, + { + "epoch": 0.20302222222222222, + "grad_norm": 0.28235065937042236, + "learning_rate": 4.509330658986095e-05, + "loss": 0.3743, + "step": 1142 + }, + { + "epoch": 0.2032, + "grad_norm": 0.30130165815353394, + "learning_rate": 4.508499581302398e-05, + "loss": 0.3632, + "step": 1143 + }, + { + "epoch": 0.20337777777777777, + "grad_norm": 0.28148213028907776, + "learning_rate": 4.507667877110982e-05, + "loss": 0.4697, + "step": 1144 + }, + { + "epoch": 0.20355555555555555, + "grad_norm": 0.31920433044433594, + "learning_rate": 4.506835546671278e-05, + "loss": 0.4269, + "step": 1145 + }, + { + "epoch": 0.20373333333333332, + "grad_norm": 0.27543723583221436, + "learning_rate": 4.5060025902429174e-05, + "loss": 0.4451, + "step": 1146 + }, + { + "epoch": 0.2039111111111111, + "grad_norm": 0.3476957678794861, + "learning_rate": 4.5051690080857176e-05, + "loss": 0.3867, + "step": 1147 + }, + { + "epoch": 0.20408888888888888, + "grad_norm": 0.3947436511516571, + "learning_rate": 4.504334800459699e-05, + "loss": 0.4406, + "step": 1148 + }, + { + "epoch": 0.20426666666666668, + "grad_norm": 0.3452424705028534, + "learning_rate": 4.5034999676250745e-05, + "loss": 0.4443, + "step": 1149 + }, + { + "epoch": 0.20444444444444446, + "grad_norm": 0.3163152039051056, + "learning_rate": 4.5026645098422515e-05, + "loss": 0.5542, + "step": 1150 + }, + { + "epoch": 0.20462222222222223, + "grad_norm": 0.22046495974063873, + "learning_rate": 4.5018284273718336e-05, + "loss": 0.5451, + "step": 1151 + }, + { + "epoch": 0.2048, + "grad_norm": 0.21697033941745758, + "learning_rate": 4.5009917204746184e-05, + "loss": 0.4613, + "step": 1152 + }, + { + "epoch": 0.2049777777777778, + "grad_norm": 0.3063431680202484, + "learning_rate": 4.5001543894115975e-05, + "loss": 0.6208, + "step": 1153 + }, + { + "epoch": 0.20515555555555556, + "grad_norm": 0.2530516982078552, + "learning_rate": 4.499316434443959e-05, + "loss": 0.5381, + "step": 1154 + }, + { + "epoch": 0.20533333333333334, + "grad_norm": 0.24551373720169067, + "learning_rate": 4.4984778558330844e-05, + "loss": 0.5628, + "step": 1155 + }, + { + "epoch": 0.20551111111111112, + "grad_norm": 0.262751966714859, + "learning_rate": 4.4976386538405495e-05, + "loss": 0.4742, + "step": 1156 + }, + { + "epoch": 0.2056888888888889, + "grad_norm": 0.2420046478509903, + "learning_rate": 4.496798828728126e-05, + "loss": 0.4927, + "step": 1157 + }, + { + "epoch": 0.20586666666666667, + "grad_norm": 0.2048220932483673, + "learning_rate": 4.495958380757779e-05, + "loss": 0.5151, + "step": 1158 + }, + { + "epoch": 0.20604444444444445, + "grad_norm": 0.1994069367647171, + "learning_rate": 4.4951173101916675e-05, + "loss": 0.4978, + "step": 1159 + }, + { + "epoch": 0.20622222222222222, + "grad_norm": 0.27311640977859497, + "learning_rate": 4.494275617292144e-05, + "loss": 0.5309, + "step": 1160 + }, + { + "epoch": 0.2064, + "grad_norm": 0.17105115950107574, + "learning_rate": 4.493433302321759e-05, + "loss": 0.467, + "step": 1161 + }, + { + "epoch": 0.20657777777777778, + "grad_norm": 0.22653710842132568, + "learning_rate": 4.492590365543253e-05, + "loss": 0.4371, + "step": 1162 + }, + { + "epoch": 0.20675555555555555, + "grad_norm": 0.19835852086544037, + "learning_rate": 4.491746807219561e-05, + "loss": 0.5782, + "step": 1163 + }, + { + "epoch": 0.20693333333333333, + "grad_norm": 0.17252543568611145, + "learning_rate": 4.490902627613813e-05, + "loss": 0.3605, + "step": 1164 + }, + { + "epoch": 0.2071111111111111, + "grad_norm": 0.2291734516620636, + "learning_rate": 4.4900578269893335e-05, + "loss": 0.4802, + "step": 1165 + }, + { + "epoch": 0.20728888888888888, + "grad_norm": 0.20126929879188538, + "learning_rate": 4.4892124056096386e-05, + "loss": 0.7609, + "step": 1166 + }, + { + "epoch": 0.20746666666666666, + "grad_norm": 0.18789182603359222, + "learning_rate": 4.4883663637384396e-05, + "loss": 0.4193, + "step": 1167 + }, + { + "epoch": 0.20764444444444444, + "grad_norm": 0.21864496171474457, + "learning_rate": 4.487519701639641e-05, + "loss": 0.3695, + "step": 1168 + }, + { + "epoch": 0.2078222222222222, + "grad_norm": 0.20566138625144958, + "learning_rate": 4.486672419577339e-05, + "loss": 0.5819, + "step": 1169 + }, + { + "epoch": 0.208, + "grad_norm": 0.24795235693454742, + "learning_rate": 4.4858245178158276e-05, + "loss": 0.5585, + "step": 1170 + }, + { + "epoch": 0.20817777777777777, + "grad_norm": 0.22701838612556458, + "learning_rate": 4.484975996619589e-05, + "loss": 0.5721, + "step": 1171 + }, + { + "epoch": 0.20835555555555554, + "grad_norm": 0.2662225663661957, + "learning_rate": 4.484126856253301e-05, + "loss": 0.6068, + "step": 1172 + }, + { + "epoch": 0.20853333333333332, + "grad_norm": 0.19402030110359192, + "learning_rate": 4.483277096981836e-05, + "loss": 0.4954, + "step": 1173 + }, + { + "epoch": 0.2087111111111111, + "grad_norm": 0.19918403029441833, + "learning_rate": 4.482426719070258e-05, + "loss": 0.6145, + "step": 1174 + }, + { + "epoch": 0.2088888888888889, + "grad_norm": 0.18216092884540558, + "learning_rate": 4.481575722783821e-05, + "loss": 0.4927, + "step": 1175 + }, + { + "epoch": 0.20906666666666668, + "grad_norm": 0.24153631925582886, + "learning_rate": 4.480724108387977e-05, + "loss": 0.4145, + "step": 1176 + }, + { + "epoch": 0.20924444444444446, + "grad_norm": 0.22951214015483856, + "learning_rate": 4.479871876148368e-05, + "loss": 0.5217, + "step": 1177 + }, + { + "epoch": 0.20942222222222223, + "grad_norm": 0.240169957280159, + "learning_rate": 4.4790190263308306e-05, + "loss": 0.4639, + "step": 1178 + }, + { + "epoch": 0.2096, + "grad_norm": 0.2075841873884201, + "learning_rate": 4.4781655592013914e-05, + "loss": 0.4262, + "step": 1179 + }, + { + "epoch": 0.20977777777777779, + "grad_norm": 0.19956330955028534, + "learning_rate": 4.477311475026271e-05, + "loss": 0.5, + "step": 1180 + }, + { + "epoch": 0.20995555555555556, + "grad_norm": 0.21053773164749146, + "learning_rate": 4.4764567740718825e-05, + "loss": 0.4606, + "step": 1181 + }, + { + "epoch": 0.21013333333333334, + "grad_norm": 0.19418244063854218, + "learning_rate": 4.475601456604831e-05, + "loss": 0.4769, + "step": 1182 + }, + { + "epoch": 0.21031111111111112, + "grad_norm": 0.22966864705085754, + "learning_rate": 4.4747455228919146e-05, + "loss": 0.5019, + "step": 1183 + }, + { + "epoch": 0.2104888888888889, + "grad_norm": 0.2202337086200714, + "learning_rate": 4.4738889732001234e-05, + "loss": 0.473, + "step": 1184 + }, + { + "epoch": 0.21066666666666667, + "grad_norm": 0.20665954053401947, + "learning_rate": 4.473031807796639e-05, + "loss": 0.4338, + "step": 1185 + }, + { + "epoch": 0.21084444444444445, + "grad_norm": 0.2171582579612732, + "learning_rate": 4.4721740269488355e-05, + "loss": 0.5085, + "step": 1186 + }, + { + "epoch": 0.21102222222222222, + "grad_norm": 0.3173885643482208, + "learning_rate": 4.471315630924279e-05, + "loss": 0.5511, + "step": 1187 + }, + { + "epoch": 0.2112, + "grad_norm": 0.2598091959953308, + "learning_rate": 4.470456619990727e-05, + "loss": 0.5356, + "step": 1188 + }, + { + "epoch": 0.21137777777777778, + "grad_norm": 0.21810810267925262, + "learning_rate": 4.46959699441613e-05, + "loss": 0.4542, + "step": 1189 + }, + { + "epoch": 0.21155555555555555, + "grad_norm": 0.270347535610199, + "learning_rate": 4.46873675446863e-05, + "loss": 0.4604, + "step": 1190 + }, + { + "epoch": 0.21173333333333333, + "grad_norm": 0.22283422946929932, + "learning_rate": 4.4678759004165584e-05, + "loss": 0.4804, + "step": 1191 + }, + { + "epoch": 0.2119111111111111, + "grad_norm": 0.2997529208660126, + "learning_rate": 4.4670144325284414e-05, + "loss": 0.3335, + "step": 1192 + }, + { + "epoch": 0.21208888888888888, + "grad_norm": 0.23307053744792938, + "learning_rate": 4.466152351072994e-05, + "loss": 0.3917, + "step": 1193 + }, + { + "epoch": 0.21226666666666666, + "grad_norm": 0.4034876525402069, + "learning_rate": 4.465289656319124e-05, + "loss": 0.3383, + "step": 1194 + }, + { + "epoch": 0.21244444444444444, + "grad_norm": 0.2727571427822113, + "learning_rate": 4.464426348535931e-05, + "loss": 0.4211, + "step": 1195 + }, + { + "epoch": 0.2126222222222222, + "grad_norm": 0.3137003183364868, + "learning_rate": 4.4635624279927044e-05, + "loss": 0.5643, + "step": 1196 + }, + { + "epoch": 0.2128, + "grad_norm": 0.30616435408592224, + "learning_rate": 4.462697894958926e-05, + "loss": 0.5288, + "step": 1197 + }, + { + "epoch": 0.21297777777777777, + "grad_norm": 0.35483983159065247, + "learning_rate": 4.461832749704268e-05, + "loss": 0.3453, + "step": 1198 + }, + { + "epoch": 0.21315555555555554, + "grad_norm": 0.37526512145996094, + "learning_rate": 4.460966992498593e-05, + "loss": 0.4246, + "step": 1199 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 0.28306496143341064, + "learning_rate": 4.460100623611955e-05, + "loss": 0.3952, + "step": 1200 + }, + { + "epoch": 0.21351111111111112, + "grad_norm": 0.2579217255115509, + "learning_rate": 4.4592336433146e-05, + "loss": 0.4593, + "step": 1201 + }, + { + "epoch": 0.2136888888888889, + "grad_norm": 0.25233665108680725, + "learning_rate": 4.458366051876962e-05, + "loss": 0.5193, + "step": 1202 + }, + { + "epoch": 0.21386666666666668, + "grad_norm": 0.33399251103401184, + "learning_rate": 4.45749784956967e-05, + "loss": 0.5962, + "step": 1203 + }, + { + "epoch": 0.21404444444444445, + "grad_norm": 0.23728017508983612, + "learning_rate": 4.456629036663537e-05, + "loss": 0.7229, + "step": 1204 + }, + { + "epoch": 0.21422222222222223, + "grad_norm": 0.271767258644104, + "learning_rate": 4.455759613429573e-05, + "loss": 0.4733, + "step": 1205 + }, + { + "epoch": 0.2144, + "grad_norm": 0.23073779046535492, + "learning_rate": 4.454889580138975e-05, + "loss": 0.4321, + "step": 1206 + }, + { + "epoch": 0.21457777777777778, + "grad_norm": 0.22024044394493103, + "learning_rate": 4.4540189370631315e-05, + "loss": 0.4672, + "step": 1207 + }, + { + "epoch": 0.21475555555555556, + "grad_norm": 0.1814970225095749, + "learning_rate": 4.45314768447362e-05, + "loss": 0.2871, + "step": 1208 + }, + { + "epoch": 0.21493333333333334, + "grad_norm": 0.18271920084953308, + "learning_rate": 4.4522758226422076e-05, + "loss": 0.4254, + "step": 1209 + }, + { + "epoch": 0.21511111111111111, + "grad_norm": 0.2293633073568344, + "learning_rate": 4.451403351840855e-05, + "loss": 0.4709, + "step": 1210 + }, + { + "epoch": 0.2152888888888889, + "grad_norm": 0.24442018568515778, + "learning_rate": 4.450530272341709e-05, + "loss": 0.5123, + "step": 1211 + }, + { + "epoch": 0.21546666666666667, + "grad_norm": 0.2315955013036728, + "learning_rate": 4.449656584417108e-05, + "loss": 0.4793, + "step": 1212 + }, + { + "epoch": 0.21564444444444444, + "grad_norm": 0.23595914244651794, + "learning_rate": 4.4487822883395805e-05, + "loss": 0.4148, + "step": 1213 + }, + { + "epoch": 0.21582222222222222, + "grad_norm": 0.21754497289657593, + "learning_rate": 4.447907384381843e-05, + "loss": 0.5832, + "step": 1214 + }, + { + "epoch": 0.216, + "grad_norm": 0.24892906844615936, + "learning_rate": 4.447031872816804e-05, + "loss": 0.4762, + "step": 1215 + }, + { + "epoch": 0.21617777777777777, + "grad_norm": 0.22116093337535858, + "learning_rate": 4.4461557539175594e-05, + "loss": 0.3892, + "step": 1216 + }, + { + "epoch": 0.21635555555555555, + "grad_norm": 0.21376650035381317, + "learning_rate": 4.445279027957395e-05, + "loss": 0.4064, + "step": 1217 + }, + { + "epoch": 0.21653333333333333, + "grad_norm": 0.24173544347286224, + "learning_rate": 4.444401695209788e-05, + "loss": 0.5405, + "step": 1218 + }, + { + "epoch": 0.2167111111111111, + "grad_norm": 0.20949050784111023, + "learning_rate": 4.443523755948401e-05, + "loss": 0.6225, + "step": 1219 + }, + { + "epoch": 0.21688888888888888, + "grad_norm": 0.23511171340942383, + "learning_rate": 4.4426452104470903e-05, + "loss": 0.3899, + "step": 1220 + }, + { + "epoch": 0.21706666666666666, + "grad_norm": 0.2516617774963379, + "learning_rate": 4.441766058979898e-05, + "loss": 0.5543, + "step": 1221 + }, + { + "epoch": 0.21724444444444443, + "grad_norm": 0.2169966995716095, + "learning_rate": 4.4408863018210564e-05, + "loss": 0.5775, + "step": 1222 + }, + { + "epoch": 0.2174222222222222, + "grad_norm": 0.1982172727584839, + "learning_rate": 4.440005939244986e-05, + "loss": 0.4319, + "step": 1223 + }, + { + "epoch": 0.2176, + "grad_norm": 0.24691443145275116, + "learning_rate": 4.439124971526297e-05, + "loss": 0.6291, + "step": 1224 + }, + { + "epoch": 0.21777777777777776, + "grad_norm": 0.22283481061458588, + "learning_rate": 4.4382433989397895e-05, + "loss": 0.5197, + "step": 1225 + }, + { + "epoch": 0.21795555555555557, + "grad_norm": 0.2501087188720703, + "learning_rate": 4.4373612217604496e-05, + "loss": 0.4807, + "step": 1226 + }, + { + "epoch": 0.21813333333333335, + "grad_norm": 0.23015612363815308, + "learning_rate": 4.436478440263453e-05, + "loss": 0.4218, + "step": 1227 + }, + { + "epoch": 0.21831111111111112, + "grad_norm": 0.19644664227962494, + "learning_rate": 4.4355950547241645e-05, + "loss": 0.6045, + "step": 1228 + }, + { + "epoch": 0.2184888888888889, + "grad_norm": 0.17348532378673553, + "learning_rate": 4.434711065418137e-05, + "loss": 0.397, + "step": 1229 + }, + { + "epoch": 0.21866666666666668, + "grad_norm": 0.29025378823280334, + "learning_rate": 4.433826472621112e-05, + "loss": 0.6841, + "step": 1230 + }, + { + "epoch": 0.21884444444444445, + "grad_norm": 0.26663684844970703, + "learning_rate": 4.432941276609018e-05, + "loss": 0.5249, + "step": 1231 + }, + { + "epoch": 0.21902222222222223, + "grad_norm": 0.23280292749404907, + "learning_rate": 4.4320554776579747e-05, + "loss": 0.4476, + "step": 1232 + }, + { + "epoch": 0.2192, + "grad_norm": 0.24723279476165771, + "learning_rate": 4.431169076044286e-05, + "loss": 0.6883, + "step": 1233 + }, + { + "epoch": 0.21937777777777778, + "grad_norm": 0.2232581526041031, + "learning_rate": 4.4302820720444456e-05, + "loss": 0.4752, + "step": 1234 + }, + { + "epoch": 0.21955555555555556, + "grad_norm": 0.2624834179878235, + "learning_rate": 4.429394465935136e-05, + "loss": 0.5272, + "step": 1235 + }, + { + "epoch": 0.21973333333333334, + "grad_norm": 0.24239814281463623, + "learning_rate": 4.428506257993226e-05, + "loss": 0.5152, + "step": 1236 + }, + { + "epoch": 0.2199111111111111, + "grad_norm": 0.19653569161891937, + "learning_rate": 4.427617448495772e-05, + "loss": 0.4192, + "step": 1237 + }, + { + "epoch": 0.2200888888888889, + "grad_norm": 0.21659450232982635, + "learning_rate": 4.4267280377200205e-05, + "loss": 0.4367, + "step": 1238 + }, + { + "epoch": 0.22026666666666667, + "grad_norm": 0.25144386291503906, + "learning_rate": 4.425838025943403e-05, + "loss": 0.5186, + "step": 1239 + }, + { + "epoch": 0.22044444444444444, + "grad_norm": 0.29326704144477844, + "learning_rate": 4.424947413443539e-05, + "loss": 0.4742, + "step": 1240 + }, + { + "epoch": 0.22062222222222222, + "grad_norm": 0.2666285037994385, + "learning_rate": 4.4240562004982364e-05, + "loss": 0.4814, + "step": 1241 + }, + { + "epoch": 0.2208, + "grad_norm": 0.26863181591033936, + "learning_rate": 4.423164387385489e-05, + "loss": 0.3805, + "step": 1242 + }, + { + "epoch": 0.22097777777777777, + "grad_norm": 0.2502087950706482, + "learning_rate": 4.422271974383479e-05, + "loss": 0.3988, + "step": 1243 + }, + { + "epoch": 0.22115555555555555, + "grad_norm": 0.24802133440971375, + "learning_rate": 4.4213789617705746e-05, + "loss": 0.507, + "step": 1244 + }, + { + "epoch": 0.22133333333333333, + "grad_norm": 0.2693012058734894, + "learning_rate": 4.420485349825332e-05, + "loss": 0.4696, + "step": 1245 + }, + { + "epoch": 0.2215111111111111, + "grad_norm": 0.3461697995662689, + "learning_rate": 4.4195911388264946e-05, + "loss": 0.4831, + "step": 1246 + }, + { + "epoch": 0.22168888888888888, + "grad_norm": 0.35696372389793396, + "learning_rate": 4.41869632905299e-05, + "loss": 0.3648, + "step": 1247 + }, + { + "epoch": 0.22186666666666666, + "grad_norm": 0.3228091299533844, + "learning_rate": 4.417800920783937e-05, + "loss": 0.372, + "step": 1248 + }, + { + "epoch": 0.22204444444444443, + "grad_norm": 0.3126201033592224, + "learning_rate": 4.4169049142986376e-05, + "loss": 0.4403, + "step": 1249 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.39341673254966736, + "learning_rate": 4.4160083098765815e-05, + "loss": 0.5878, + "step": 1250 + }, + { + "epoch": 0.2224, + "grad_norm": 0.22823891043663025, + "learning_rate": 4.415111107797445e-05, + "loss": 0.3206, + "step": 1251 + }, + { + "epoch": 0.2225777777777778, + "grad_norm": 0.22837820649147034, + "learning_rate": 4.414213308341092e-05, + "loss": 0.4378, + "step": 1252 + }, + { + "epoch": 0.22275555555555557, + "grad_norm": 0.2534780204296112, + "learning_rate": 4.413314911787569e-05, + "loss": 0.5127, + "step": 1253 + }, + { + "epoch": 0.22293333333333334, + "grad_norm": 0.23945263028144836, + "learning_rate": 4.4124159184171134e-05, + "loss": 0.6296, + "step": 1254 + }, + { + "epoch": 0.22311111111111112, + "grad_norm": 0.22508473694324493, + "learning_rate": 4.411516328510145e-05, + "loss": 0.532, + "step": 1255 + }, + { + "epoch": 0.2232888888888889, + "grad_norm": 0.23963187634944916, + "learning_rate": 4.410616142347273e-05, + "loss": 0.5243, + "step": 1256 + }, + { + "epoch": 0.22346666666666667, + "grad_norm": 0.26530513167381287, + "learning_rate": 4.409715360209289e-05, + "loss": 0.5661, + "step": 1257 + }, + { + "epoch": 0.22364444444444445, + "grad_norm": 0.22146882116794586, + "learning_rate": 4.4088139823771744e-05, + "loss": 0.4502, + "step": 1258 + }, + { + "epoch": 0.22382222222222223, + "grad_norm": 0.2245611846446991, + "learning_rate": 4.407912009132093e-05, + "loss": 0.5192, + "step": 1259 + }, + { + "epoch": 0.224, + "grad_norm": 0.20654036104679108, + "learning_rate": 4.407009440755396e-05, + "loss": 0.4832, + "step": 1260 + }, + { + "epoch": 0.22417777777777778, + "grad_norm": 0.17662639915943146, + "learning_rate": 4.40610627752862e-05, + "loss": 0.3809, + "step": 1261 + }, + { + "epoch": 0.22435555555555556, + "grad_norm": 0.23477929830551147, + "learning_rate": 4.4052025197334864e-05, + "loss": 0.4382, + "step": 1262 + }, + { + "epoch": 0.22453333333333333, + "grad_norm": 0.221193328499794, + "learning_rate": 4.404298167651905e-05, + "loss": 0.3814, + "step": 1263 + }, + { + "epoch": 0.2247111111111111, + "grad_norm": 0.2709603011608124, + "learning_rate": 4.403393221565966e-05, + "loss": 0.5322, + "step": 1264 + }, + { + "epoch": 0.2248888888888889, + "grad_norm": 0.2439824640750885, + "learning_rate": 4.40248768175795e-05, + "loss": 0.4397, + "step": 1265 + }, + { + "epoch": 0.22506666666666666, + "grad_norm": 0.2303110659122467, + "learning_rate": 4.401581548510318e-05, + "loss": 0.5983, + "step": 1266 + }, + { + "epoch": 0.22524444444444444, + "grad_norm": 0.33068788051605225, + "learning_rate": 4.4006748221057206e-05, + "loss": 0.6224, + "step": 1267 + }, + { + "epoch": 0.22542222222222222, + "grad_norm": 0.2883262634277344, + "learning_rate": 4.3997675028269906e-05, + "loss": 0.5691, + "step": 1268 + }, + { + "epoch": 0.2256, + "grad_norm": 0.20572085678577423, + "learning_rate": 4.3988595909571464e-05, + "loss": 0.3703, + "step": 1269 + }, + { + "epoch": 0.22577777777777777, + "grad_norm": 0.18315738439559937, + "learning_rate": 4.3979510867793917e-05, + "loss": 0.4485, + "step": 1270 + }, + { + "epoch": 0.22595555555555555, + "grad_norm": 0.30687716603279114, + "learning_rate": 4.3970419905771145e-05, + "loss": 0.7186, + "step": 1271 + }, + { + "epoch": 0.22613333333333333, + "grad_norm": 0.21920879185199738, + "learning_rate": 4.396132302633886e-05, + "loss": 0.4395, + "step": 1272 + }, + { + "epoch": 0.2263111111111111, + "grad_norm": 0.24590124189853668, + "learning_rate": 4.395222023233466e-05, + "loss": 0.5013, + "step": 1273 + }, + { + "epoch": 0.22648888888888888, + "grad_norm": 0.20799681544303894, + "learning_rate": 4.394311152659796e-05, + "loss": 0.5773, + "step": 1274 + }, + { + "epoch": 0.22666666666666666, + "grad_norm": 0.20395217835903168, + "learning_rate": 4.393399691197e-05, + "loss": 0.3824, + "step": 1275 + }, + { + "epoch": 0.22684444444444443, + "grad_norm": 0.24426783621311188, + "learning_rate": 4.3924876391293915e-05, + "loss": 0.4177, + "step": 1276 + }, + { + "epoch": 0.2270222222222222, + "grad_norm": 0.2574116289615631, + "learning_rate": 4.391574996741463e-05, + "loss": 0.4799, + "step": 1277 + }, + { + "epoch": 0.2272, + "grad_norm": 0.2484491765499115, + "learning_rate": 4.390661764317895e-05, + "loss": 0.5511, + "step": 1278 + }, + { + "epoch": 0.2273777777777778, + "grad_norm": 0.2698419690132141, + "learning_rate": 4.38974794214355e-05, + "loss": 0.4791, + "step": 1279 + }, + { + "epoch": 0.22755555555555557, + "grad_norm": 0.23384766280651093, + "learning_rate": 4.388833530503473e-05, + "loss": 0.4635, + "step": 1280 + }, + { + "epoch": 0.22773333333333334, + "grad_norm": 0.2407292127609253, + "learning_rate": 4.387918529682898e-05, + "loss": 0.46, + "step": 1281 + }, + { + "epoch": 0.22791111111111112, + "grad_norm": 0.27329519391059875, + "learning_rate": 4.387002939967237e-05, + "loss": 0.539, + "step": 1282 + }, + { + "epoch": 0.2280888888888889, + "grad_norm": 0.26004716753959656, + "learning_rate": 4.386086761642091e-05, + "loss": 0.574, + "step": 1283 + }, + { + "epoch": 0.22826666666666667, + "grad_norm": 0.19215545058250427, + "learning_rate": 4.3851699949932396e-05, + "loss": 0.3969, + "step": 1284 + }, + { + "epoch": 0.22844444444444445, + "grad_norm": 0.25178056955337524, + "learning_rate": 4.3842526403066486e-05, + "loss": 0.5273, + "step": 1285 + }, + { + "epoch": 0.22862222222222223, + "grad_norm": 0.2471136748790741, + "learning_rate": 4.3833346978684675e-05, + "loss": 0.5462, + "step": 1286 + }, + { + "epoch": 0.2288, + "grad_norm": 0.18890753388404846, + "learning_rate": 4.382416167965028e-05, + "loss": 0.4993, + "step": 1287 + }, + { + "epoch": 0.22897777777777778, + "grad_norm": 0.24137598276138306, + "learning_rate": 4.381497050882845e-05, + "loss": 0.3832, + "step": 1288 + }, + { + "epoch": 0.22915555555555556, + "grad_norm": 0.25049135088920593, + "learning_rate": 4.380577346908618e-05, + "loss": 0.5908, + "step": 1289 + }, + { + "epoch": 0.22933333333333333, + "grad_norm": 0.25297388434410095, + "learning_rate": 4.379657056329228e-05, + "loss": 0.5927, + "step": 1290 + }, + { + "epoch": 0.2295111111111111, + "grad_norm": 0.2016248106956482, + "learning_rate": 4.3787361794317405e-05, + "loss": 0.4903, + "step": 1291 + }, + { + "epoch": 0.2296888888888889, + "grad_norm": 0.20851749181747437, + "learning_rate": 4.3778147165034025e-05, + "loss": 0.3953, + "step": 1292 + }, + { + "epoch": 0.22986666666666666, + "grad_norm": 0.22823545336723328, + "learning_rate": 4.376892667831644e-05, + "loss": 0.3, + "step": 1293 + }, + { + "epoch": 0.23004444444444444, + "grad_norm": 0.22524063289165497, + "learning_rate": 4.375970033704077e-05, + "loss": 0.3724, + "step": 1294 + }, + { + "epoch": 0.23022222222222222, + "grad_norm": 0.343679815530777, + "learning_rate": 4.375046814408499e-05, + "loss": 0.4167, + "step": 1295 + }, + { + "epoch": 0.2304, + "grad_norm": 0.28717124462127686, + "learning_rate": 4.374123010232888e-05, + "loss": 0.4525, + "step": 1296 + }, + { + "epoch": 0.23057777777777777, + "grad_norm": 0.41317862272262573, + "learning_rate": 4.373198621465404e-05, + "loss": 0.4082, + "step": 1297 + }, + { + "epoch": 0.23075555555555555, + "grad_norm": 0.3291909694671631, + "learning_rate": 4.372273648394389e-05, + "loss": 0.436, + "step": 1298 + }, + { + "epoch": 0.23093333333333332, + "grad_norm": 0.3544304072856903, + "learning_rate": 4.37134809130837e-05, + "loss": 0.4362, + "step": 1299 + }, + { + "epoch": 0.2311111111111111, + "grad_norm": 0.3479328453540802, + "learning_rate": 4.370421950496054e-05, + "loss": 0.4789, + "step": 1300 + }, + { + "epoch": 0.23128888888888888, + "grad_norm": 0.2357267588376999, + "learning_rate": 4.36949522624633e-05, + "loss": 0.4947, + "step": 1301 + }, + { + "epoch": 0.23146666666666665, + "grad_norm": 0.27003100514411926, + "learning_rate": 4.368567918848269e-05, + "loss": 0.5034, + "step": 1302 + }, + { + "epoch": 0.23164444444444443, + "grad_norm": 0.26223084330558777, + "learning_rate": 4.3676400285911256e-05, + "loss": 0.538, + "step": 1303 + }, + { + "epoch": 0.23182222222222224, + "grad_norm": 0.23477938771247864, + "learning_rate": 4.3667115557643336e-05, + "loss": 0.5616, + "step": 1304 + }, + { + "epoch": 0.232, + "grad_norm": 0.21605823934078217, + "learning_rate": 4.3657825006575106e-05, + "loss": 0.4894, + "step": 1305 + }, + { + "epoch": 0.2321777777777778, + "grad_norm": 0.25563398003578186, + "learning_rate": 4.3648528635604556e-05, + "loss": 0.4244, + "step": 1306 + }, + { + "epoch": 0.23235555555555557, + "grad_norm": 0.23986752331256866, + "learning_rate": 4.363922644763147e-05, + "loss": 0.5559, + "step": 1307 + }, + { + "epoch": 0.23253333333333334, + "grad_norm": 0.2909373939037323, + "learning_rate": 4.362991844555749e-05, + "loss": 0.5399, + "step": 1308 + }, + { + "epoch": 0.23271111111111112, + "grad_norm": 0.2384665608406067, + "learning_rate": 4.3620604632286024e-05, + "loss": 0.4536, + "step": 1309 + }, + { + "epoch": 0.2328888888888889, + "grad_norm": 0.25972333550453186, + "learning_rate": 4.361128501072231e-05, + "loss": 0.5464, + "step": 1310 + }, + { + "epoch": 0.23306666666666667, + "grad_norm": 0.23711919784545898, + "learning_rate": 4.3601959583773415e-05, + "loss": 0.5016, + "step": 1311 + }, + { + "epoch": 0.23324444444444445, + "grad_norm": 0.18047326803207397, + "learning_rate": 4.35926283543482e-05, + "loss": 0.4517, + "step": 1312 + }, + { + "epoch": 0.23342222222222223, + "grad_norm": 0.2347806692123413, + "learning_rate": 4.358329132535733e-05, + "loss": 0.4599, + "step": 1313 + }, + { + "epoch": 0.2336, + "grad_norm": 0.2155926376581192, + "learning_rate": 4.35739484997133e-05, + "loss": 0.4324, + "step": 1314 + }, + { + "epoch": 0.23377777777777778, + "grad_norm": 0.2632337808609009, + "learning_rate": 4.356459988033039e-05, + "loss": 0.532, + "step": 1315 + }, + { + "epoch": 0.23395555555555556, + "grad_norm": 0.23097728192806244, + "learning_rate": 4.355524547012471e-05, + "loss": 0.4209, + "step": 1316 + }, + { + "epoch": 0.23413333333333333, + "grad_norm": 0.24183626472949982, + "learning_rate": 4.354588527201414e-05, + "loss": 0.5361, + "step": 1317 + }, + { + "epoch": 0.2343111111111111, + "grad_norm": 0.2498297095298767, + "learning_rate": 4.353651928891842e-05, + "loss": 0.4496, + "step": 1318 + }, + { + "epoch": 0.23448888888888889, + "grad_norm": 0.1860927790403366, + "learning_rate": 4.352714752375906e-05, + "loss": 0.3423, + "step": 1319 + }, + { + "epoch": 0.23466666666666666, + "grad_norm": 0.24412637948989868, + "learning_rate": 4.351776997945936e-05, + "loss": 0.5076, + "step": 1320 + }, + { + "epoch": 0.23484444444444444, + "grad_norm": 0.3298015892505646, + "learning_rate": 4.350838665894446e-05, + "loss": 0.563, + "step": 1321 + }, + { + "epoch": 0.23502222222222222, + "grad_norm": 0.2763049900531769, + "learning_rate": 4.3498997565141267e-05, + "loss": 0.4932, + "step": 1322 + }, + { + "epoch": 0.2352, + "grad_norm": 0.2645281255245209, + "learning_rate": 4.348960270097851e-05, + "loss": 0.4046, + "step": 1323 + }, + { + "epoch": 0.23537777777777777, + "grad_norm": 0.3076537549495697, + "learning_rate": 4.348020206938672e-05, + "loss": 0.5533, + "step": 1324 + }, + { + "epoch": 0.23555555555555555, + "grad_norm": 0.21966569125652313, + "learning_rate": 4.3470795673298206e-05, + "loss": 0.4752, + "step": 1325 + }, + { + "epoch": 0.23573333333333332, + "grad_norm": 0.2737371027469635, + "learning_rate": 4.3461383515647106e-05, + "loss": 0.4063, + "step": 1326 + }, + { + "epoch": 0.2359111111111111, + "grad_norm": 0.2169768065214157, + "learning_rate": 4.345196559936932e-05, + "loss": 0.5452, + "step": 1327 + }, + { + "epoch": 0.23608888888888888, + "grad_norm": 0.2770763635635376, + "learning_rate": 4.3442541927402566e-05, + "loss": 0.4594, + "step": 1328 + }, + { + "epoch": 0.23626666666666668, + "grad_norm": 0.22036604583263397, + "learning_rate": 4.3433112502686355e-05, + "loss": 0.4301, + "step": 1329 + }, + { + "epoch": 0.23644444444444446, + "grad_norm": 0.2196999043226242, + "learning_rate": 4.3423677328161996e-05, + "loss": 0.4142, + "step": 1330 + }, + { + "epoch": 0.23662222222222223, + "grad_norm": 0.26208990812301636, + "learning_rate": 4.3414236406772584e-05, + "loss": 0.4838, + "step": 1331 + }, + { + "epoch": 0.2368, + "grad_norm": 0.18558618426322937, + "learning_rate": 4.3404789741463e-05, + "loss": 0.4307, + "step": 1332 + }, + { + "epoch": 0.2369777777777778, + "grad_norm": 0.2166328877210617, + "learning_rate": 4.3395337335179945e-05, + "loss": 0.5326, + "step": 1333 + }, + { + "epoch": 0.23715555555555556, + "grad_norm": 0.24006320536136627, + "learning_rate": 4.338587919087187e-05, + "loss": 0.5539, + "step": 1334 + }, + { + "epoch": 0.23733333333333334, + "grad_norm": 0.1928250789642334, + "learning_rate": 4.3376415311489056e-05, + "loss": 0.4231, + "step": 1335 + }, + { + "epoch": 0.23751111111111112, + "grad_norm": 0.2704288363456726, + "learning_rate": 4.336694569998354e-05, + "loss": 0.4847, + "step": 1336 + }, + { + "epoch": 0.2376888888888889, + "grad_norm": 0.24226625263690948, + "learning_rate": 4.335747035930916e-05, + "loss": 0.5627, + "step": 1337 + }, + { + "epoch": 0.23786666666666667, + "grad_norm": 0.25263598561286926, + "learning_rate": 4.334798929242155e-05, + "loss": 0.5061, + "step": 1338 + }, + { + "epoch": 0.23804444444444445, + "grad_norm": 0.22211408615112305, + "learning_rate": 4.3338502502278134e-05, + "loss": 0.5774, + "step": 1339 + }, + { + "epoch": 0.23822222222222222, + "grad_norm": 0.24443838000297546, + "learning_rate": 4.3329009991838084e-05, + "loss": 0.4604, + "step": 1340 + }, + { + "epoch": 0.2384, + "grad_norm": 0.25341013073921204, + "learning_rate": 4.331951176406239e-05, + "loss": 0.5761, + "step": 1341 + }, + { + "epoch": 0.23857777777777778, + "grad_norm": 0.2710097134113312, + "learning_rate": 4.3310007821913836e-05, + "loss": 0.3894, + "step": 1342 + }, + { + "epoch": 0.23875555555555555, + "grad_norm": 0.2957439422607422, + "learning_rate": 4.330049816835694e-05, + "loss": 0.3945, + "step": 1343 + }, + { + "epoch": 0.23893333333333333, + "grad_norm": 0.26889345049858093, + "learning_rate": 4.3290982806358046e-05, + "loss": 0.3815, + "step": 1344 + }, + { + "epoch": 0.2391111111111111, + "grad_norm": 0.2689574062824249, + "learning_rate": 4.3281461738885274e-05, + "loss": 0.4134, + "step": 1345 + }, + { + "epoch": 0.23928888888888888, + "grad_norm": 0.24850115180015564, + "learning_rate": 4.3271934968908514e-05, + "loss": 0.4146, + "step": 1346 + }, + { + "epoch": 0.23946666666666666, + "grad_norm": 0.41217899322509766, + "learning_rate": 4.3262402499399404e-05, + "loss": 0.5276, + "step": 1347 + }, + { + "epoch": 0.23964444444444444, + "grad_norm": 0.3405587673187256, + "learning_rate": 4.325286433333142e-05, + "loss": 0.4423, + "step": 1348 + }, + { + "epoch": 0.23982222222222221, + "grad_norm": 0.32275477051734924, + "learning_rate": 4.3243320473679785e-05, + "loss": 0.3751, + "step": 1349 + }, + { + "epoch": 0.24, + "grad_norm": 0.3968574106693268, + "learning_rate": 4.323377092342148e-05, + "loss": 0.3679, + "step": 1350 + }, + { + "epoch": 0.24017777777777777, + "grad_norm": 0.2869572937488556, + "learning_rate": 4.3224215685535294e-05, + "loss": 0.4279, + "step": 1351 + }, + { + "epoch": 0.24035555555555554, + "grad_norm": 0.25271180272102356, + "learning_rate": 4.321465476300177e-05, + "loss": 0.5202, + "step": 1352 + }, + { + "epoch": 0.24053333333333332, + "grad_norm": 0.2373661994934082, + "learning_rate": 4.3205088158803226e-05, + "loss": 0.545, + "step": 1353 + }, + { + "epoch": 0.2407111111111111, + "grad_norm": 0.22581227123737335, + "learning_rate": 4.319551587592376e-05, + "loss": 0.5069, + "step": 1354 + }, + { + "epoch": 0.2408888888888889, + "grad_norm": 0.21236656606197357, + "learning_rate": 4.318593791734924e-05, + "loss": 0.4859, + "step": 1355 + }, + { + "epoch": 0.24106666666666668, + "grad_norm": 0.20294010639190674, + "learning_rate": 4.31763542860673e-05, + "loss": 0.4251, + "step": 1356 + }, + { + "epoch": 0.24124444444444446, + "grad_norm": 0.24016942083835602, + "learning_rate": 4.3166764985067343e-05, + "loss": 0.4283, + "step": 1357 + }, + { + "epoch": 0.24142222222222223, + "grad_norm": 0.2627848982810974, + "learning_rate": 4.3157170017340545e-05, + "loss": 0.521, + "step": 1358 + }, + { + "epoch": 0.2416, + "grad_norm": 0.23952418565750122, + "learning_rate": 4.314756938587984e-05, + "loss": 0.5327, + "step": 1359 + }, + { + "epoch": 0.24177777777777779, + "grad_norm": 0.3013181686401367, + "learning_rate": 4.3137963093679945e-05, + "loss": 0.5394, + "step": 1360 + }, + { + "epoch": 0.24195555555555556, + "grad_norm": 0.24400077760219574, + "learning_rate": 4.3128351143737335e-05, + "loss": 0.5534, + "step": 1361 + }, + { + "epoch": 0.24213333333333334, + "grad_norm": 0.2012927532196045, + "learning_rate": 4.3118733539050244e-05, + "loss": 0.4136, + "step": 1362 + }, + { + "epoch": 0.24231111111111112, + "grad_norm": 0.2734348475933075, + "learning_rate": 4.310911028261867e-05, + "loss": 0.5149, + "step": 1363 + }, + { + "epoch": 0.2424888888888889, + "grad_norm": 0.23700101673603058, + "learning_rate": 4.3099481377444384e-05, + "loss": 0.4956, + "step": 1364 + }, + { + "epoch": 0.24266666666666667, + "grad_norm": 0.2542860507965088, + "learning_rate": 4.308984682653092e-05, + "loss": 0.6563, + "step": 1365 + }, + { + "epoch": 0.24284444444444445, + "grad_norm": 0.21225221455097198, + "learning_rate": 4.3080206632883554e-05, + "loss": 0.3805, + "step": 1366 + }, + { + "epoch": 0.24302222222222222, + "grad_norm": 0.21046310663223267, + "learning_rate": 4.307056079950934e-05, + "loss": 0.483, + "step": 1367 + }, + { + "epoch": 0.2432, + "grad_norm": 0.20668034255504608, + "learning_rate": 4.306090932941708e-05, + "loss": 0.4549, + "step": 1368 + }, + { + "epoch": 0.24337777777777778, + "grad_norm": 0.24548202753067017, + "learning_rate": 4.305125222561736e-05, + "loss": 0.6477, + "step": 1369 + }, + { + "epoch": 0.24355555555555555, + "grad_norm": 0.21200339496135712, + "learning_rate": 4.304158949112247e-05, + "loss": 0.4711, + "step": 1370 + }, + { + "epoch": 0.24373333333333333, + "grad_norm": 0.2290968894958496, + "learning_rate": 4.303192112894652e-05, + "loss": 0.4094, + "step": 1371 + }, + { + "epoch": 0.2439111111111111, + "grad_norm": 0.3123732805252075, + "learning_rate": 4.302224714210532e-05, + "loss": 0.5358, + "step": 1372 + }, + { + "epoch": 0.24408888888888888, + "grad_norm": 0.22897206246852875, + "learning_rate": 4.301256753361649e-05, + "loss": 0.5369, + "step": 1373 + }, + { + "epoch": 0.24426666666666666, + "grad_norm": 0.23908253014087677, + "learning_rate": 4.3002882306499345e-05, + "loss": 0.3527, + "step": 1374 + }, + { + "epoch": 0.24444444444444444, + "grad_norm": 0.17792123556137085, + "learning_rate": 4.2993191463774997e-05, + "loss": 0.4548, + "step": 1375 + }, + { + "epoch": 0.2446222222222222, + "grad_norm": 0.2578827738761902, + "learning_rate": 4.2983495008466276e-05, + "loss": 0.6232, + "step": 1376 + }, + { + "epoch": 0.2448, + "grad_norm": 0.28514882922172546, + "learning_rate": 4.297379294359781e-05, + "loss": 0.5848, + "step": 1377 + }, + { + "epoch": 0.24497777777777777, + "grad_norm": 0.29542988538742065, + "learning_rate": 4.296408527219592e-05, + "loss": 0.5274, + "step": 1378 + }, + { + "epoch": 0.24515555555555554, + "grad_norm": 0.23926961421966553, + "learning_rate": 4.295437199728871e-05, + "loss": 0.5209, + "step": 1379 + }, + { + "epoch": 0.24533333333333332, + "grad_norm": 0.22336643934249878, + "learning_rate": 4.294465312190603e-05, + "loss": 0.3908, + "step": 1380 + }, + { + "epoch": 0.24551111111111112, + "grad_norm": 0.22745724022388458, + "learning_rate": 4.293492864907947e-05, + "loss": 0.6449, + "step": 1381 + }, + { + "epoch": 0.2456888888888889, + "grad_norm": 0.29059791564941406, + "learning_rate": 4.292519858184236e-05, + "loss": 0.535, + "step": 1382 + }, + { + "epoch": 0.24586666666666668, + "grad_norm": 0.21000346541404724, + "learning_rate": 4.291546292322979e-05, + "loss": 0.4864, + "step": 1383 + }, + { + "epoch": 0.24604444444444445, + "grad_norm": 0.2529181241989136, + "learning_rate": 4.290572167627859e-05, + "loss": 0.4834, + "step": 1384 + }, + { + "epoch": 0.24622222222222223, + "grad_norm": 0.2576427161693573, + "learning_rate": 4.289597484402732e-05, + "loss": 0.7625, + "step": 1385 + }, + { + "epoch": 0.2464, + "grad_norm": 0.27456507086753845, + "learning_rate": 4.2886222429516296e-05, + "loss": 0.5583, + "step": 1386 + }, + { + "epoch": 0.24657777777777778, + "grad_norm": 0.30464449524879456, + "learning_rate": 4.287646443578758e-05, + "loss": 0.5193, + "step": 1387 + }, + { + "epoch": 0.24675555555555556, + "grad_norm": 0.2272568792104721, + "learning_rate": 4.2866700865884954e-05, + "loss": 0.3593, + "step": 1388 + }, + { + "epoch": 0.24693333333333334, + "grad_norm": 0.3084667921066284, + "learning_rate": 4.285693172285396e-05, + "loss": 0.5499, + "step": 1389 + }, + { + "epoch": 0.24711111111111111, + "grad_norm": 0.2830738425254822, + "learning_rate": 4.2847157009741856e-05, + "loss": 0.5987, + "step": 1390 + }, + { + "epoch": 0.2472888888888889, + "grad_norm": 0.2028263807296753, + "learning_rate": 4.283737672959766e-05, + "loss": 0.414, + "step": 1391 + }, + { + "epoch": 0.24746666666666667, + "grad_norm": 0.2376394122838974, + "learning_rate": 4.2827590885472125e-05, + "loss": 0.4055, + "step": 1392 + }, + { + "epoch": 0.24764444444444444, + "grad_norm": 0.26731806993484497, + "learning_rate": 4.281779948041772e-05, + "loss": 0.403, + "step": 1393 + }, + { + "epoch": 0.24782222222222222, + "grad_norm": 0.34051623940467834, + "learning_rate": 4.2808002517488667e-05, + "loss": 0.442, + "step": 1394 + }, + { + "epoch": 0.248, + "grad_norm": 0.30593159794807434, + "learning_rate": 4.279819999974091e-05, + "loss": 0.4027, + "step": 1395 + }, + { + "epoch": 0.24817777777777777, + "grad_norm": 0.3657020330429077, + "learning_rate": 4.278839193023214e-05, + "loss": 0.4575, + "step": 1396 + }, + { + "epoch": 0.24835555555555555, + "grad_norm": 0.34744617342948914, + "learning_rate": 4.2778578312021754e-05, + "loss": 0.4608, + "step": 1397 + }, + { + "epoch": 0.24853333333333333, + "grad_norm": 0.27552008628845215, + "learning_rate": 4.2768759148170915e-05, + "loss": 0.3578, + "step": 1398 + }, + { + "epoch": 0.2487111111111111, + "grad_norm": 0.3841858506202698, + "learning_rate": 4.2758934441742496e-05, + "loss": 0.4694, + "step": 1399 + }, + { + "epoch": 0.24888888888888888, + "grad_norm": 0.5377020835876465, + "learning_rate": 4.274910419580108e-05, + "loss": 0.4992, + "step": 1400 + }, + { + "epoch": 0.24906666666666666, + "grad_norm": 0.30639055371284485, + "learning_rate": 4.273926841341302e-05, + "loss": 0.5411, + "step": 1401 + }, + { + "epoch": 0.24924444444444444, + "grad_norm": 0.23703326284885406, + "learning_rate": 4.272942709764638e-05, + "loss": 0.4505, + "step": 1402 + }, + { + "epoch": 0.2494222222222222, + "grad_norm": 0.2635270059108734, + "learning_rate": 4.2719580251570915e-05, + "loss": 0.4303, + "step": 1403 + }, + { + "epoch": 0.2496, + "grad_norm": 0.34187769889831543, + "learning_rate": 4.270972787825815e-05, + "loss": 0.5522, + "step": 1404 + }, + { + "epoch": 0.24977777777777777, + "grad_norm": 0.21534954011440277, + "learning_rate": 4.269986998078132e-05, + "loss": 0.468, + "step": 1405 + }, + { + "epoch": 0.24995555555555554, + "grad_norm": 0.2620059549808502, + "learning_rate": 4.2690006562215384e-05, + "loss": 0.6601, + "step": 1406 + }, + { + "epoch": 0.2501333333333333, + "grad_norm": 0.23461578786373138, + "learning_rate": 4.268013762563702e-05, + "loss": 0.452, + "step": 1407 + }, + { + "epoch": 0.2503111111111111, + "grad_norm": 0.210880309343338, + "learning_rate": 4.267026317412461e-05, + "loss": 0.4634, + "step": 1408 + }, + { + "epoch": 0.25048888888888887, + "grad_norm": 0.36030298471450806, + "learning_rate": 4.266038321075831e-05, + "loss": 0.527, + "step": 1409 + }, + { + "epoch": 0.25066666666666665, + "grad_norm": 0.27505359053611755, + "learning_rate": 4.265049773861991e-05, + "loss": 0.6108, + "step": 1410 + }, + { + "epoch": 0.2508444444444444, + "grad_norm": 0.19059057533740997, + "learning_rate": 4.264060676079302e-05, + "loss": 0.344, + "step": 1411 + }, + { + "epoch": 0.2510222222222222, + "grad_norm": 0.17850477993488312, + "learning_rate": 4.263071028036288e-05, + "loss": 0.4582, + "step": 1412 + }, + { + "epoch": 0.2512, + "grad_norm": 0.23579291999340057, + "learning_rate": 4.26208083004165e-05, + "loss": 0.5083, + "step": 1413 + }, + { + "epoch": 0.25137777777777776, + "grad_norm": 0.21456308662891388, + "learning_rate": 4.261090082404258e-05, + "loss": 0.4719, + "step": 1414 + }, + { + "epoch": 0.25155555555555553, + "grad_norm": 0.22844012081623077, + "learning_rate": 4.260098785433154e-05, + "loss": 0.5167, + "step": 1415 + }, + { + "epoch": 0.2517333333333333, + "grad_norm": 0.2279992699623108, + "learning_rate": 4.259106939437551e-05, + "loss": 0.4802, + "step": 1416 + }, + { + "epoch": 0.2519111111111111, + "grad_norm": 0.21874922513961792, + "learning_rate": 4.258114544726835e-05, + "loss": 0.4884, + "step": 1417 + }, + { + "epoch": 0.25208888888888886, + "grad_norm": 0.20955197513103485, + "learning_rate": 4.2571216016105614e-05, + "loss": 0.496, + "step": 1418 + }, + { + "epoch": 0.25226666666666664, + "grad_norm": 0.22666418552398682, + "learning_rate": 4.256128110398457e-05, + "loss": 0.5054, + "step": 1419 + }, + { + "epoch": 0.25244444444444447, + "grad_norm": 0.20175351202487946, + "learning_rate": 4.2551340714004203e-05, + "loss": 0.4, + "step": 1420 + }, + { + "epoch": 0.25262222222222225, + "grad_norm": 0.2647659182548523, + "learning_rate": 4.254139484926519e-05, + "loss": 0.4324, + "step": 1421 + }, + { + "epoch": 0.2528, + "grad_norm": 0.2721613049507141, + "learning_rate": 4.253144351286994e-05, + "loss": 0.4467, + "step": 1422 + }, + { + "epoch": 0.2529777777777778, + "grad_norm": 0.2117205709218979, + "learning_rate": 4.252148670792254e-05, + "loss": 0.435, + "step": 1423 + }, + { + "epoch": 0.2531555555555556, + "grad_norm": 0.23721779882907867, + "learning_rate": 4.2511524437528825e-05, + "loss": 0.4304, + "step": 1424 + }, + { + "epoch": 0.25333333333333335, + "grad_norm": 0.21435852348804474, + "learning_rate": 4.250155670479628e-05, + "loss": 0.4478, + "step": 1425 + }, + { + "epoch": 0.25351111111111113, + "grad_norm": 0.22277596592903137, + "learning_rate": 4.249158351283414e-05, + "loss": 0.5614, + "step": 1426 + }, + { + "epoch": 0.2536888888888889, + "grad_norm": 0.28016769886016846, + "learning_rate": 4.248160486475331e-05, + "loss": 0.5587, + "step": 1427 + }, + { + "epoch": 0.2538666666666667, + "grad_norm": 0.24636779725551605, + "learning_rate": 4.247162076366643e-05, + "loss": 0.4944, + "step": 1428 + }, + { + "epoch": 0.25404444444444446, + "grad_norm": 0.22983276844024658, + "learning_rate": 4.2461631212687816e-05, + "loss": 0.5169, + "step": 1429 + }, + { + "epoch": 0.25422222222222224, + "grad_norm": 0.22834815084934235, + "learning_rate": 4.245163621493349e-05, + "loss": 0.4633, + "step": 1430 + }, + { + "epoch": 0.2544, + "grad_norm": 0.1989385187625885, + "learning_rate": 4.244163577352116e-05, + "loss": 0.4785, + "step": 1431 + }, + { + "epoch": 0.2545777777777778, + "grad_norm": 0.32826298475265503, + "learning_rate": 4.2431629891570266e-05, + "loss": 0.6937, + "step": 1432 + }, + { + "epoch": 0.25475555555555557, + "grad_norm": 0.24005098640918732, + "learning_rate": 4.242161857220193e-05, + "loss": 0.6591, + "step": 1433 + }, + { + "epoch": 0.25493333333333335, + "grad_norm": 0.20080901682376862, + "learning_rate": 4.241160181853894e-05, + "loss": 0.4658, + "step": 1434 + }, + { + "epoch": 0.2551111111111111, + "grad_norm": 0.2073754072189331, + "learning_rate": 4.240157963370582e-05, + "loss": 0.5901, + "step": 1435 + }, + { + "epoch": 0.2552888888888889, + "grad_norm": 0.2263334095478058, + "learning_rate": 4.2391552020828775e-05, + "loss": 0.3843, + "step": 1436 + }, + { + "epoch": 0.2554666666666667, + "grad_norm": 0.2785729169845581, + "learning_rate": 4.238151898303569e-05, + "loss": 0.4735, + "step": 1437 + }, + { + "epoch": 0.25564444444444445, + "grad_norm": 0.2540399432182312, + "learning_rate": 4.237148052345616e-05, + "loss": 0.5184, + "step": 1438 + }, + { + "epoch": 0.25582222222222223, + "grad_norm": 0.24783490598201752, + "learning_rate": 4.236143664522146e-05, + "loss": 0.4601, + "step": 1439 + }, + { + "epoch": 0.256, + "grad_norm": 0.2024785727262497, + "learning_rate": 4.2351387351464565e-05, + "loss": 0.4025, + "step": 1440 + }, + { + "epoch": 0.2561777777777778, + "grad_norm": 0.24628347158432007, + "learning_rate": 4.234133264532012e-05, + "loss": 0.3148, + "step": 1441 + }, + { + "epoch": 0.25635555555555556, + "grad_norm": 0.33022361993789673, + "learning_rate": 4.2331272529924495e-05, + "loss": 0.4063, + "step": 1442 + }, + { + "epoch": 0.25653333333333334, + "grad_norm": 0.3090986907482147, + "learning_rate": 4.232120700841571e-05, + "loss": 0.3836, + "step": 1443 + }, + { + "epoch": 0.2567111111111111, + "grad_norm": 0.2894114553928375, + "learning_rate": 4.231113608393348e-05, + "loss": 0.4113, + "step": 1444 + }, + { + "epoch": 0.2568888888888889, + "grad_norm": 0.298056423664093, + "learning_rate": 4.230105975961921e-05, + "loss": 0.3889, + "step": 1445 + }, + { + "epoch": 0.25706666666666667, + "grad_norm": 0.29035723209381104, + "learning_rate": 4.2290978038616e-05, + "loss": 0.4038, + "step": 1446 + }, + { + "epoch": 0.25724444444444444, + "grad_norm": 0.3252440392971039, + "learning_rate": 4.2280890924068625e-05, + "loss": 0.4141, + "step": 1447 + }, + { + "epoch": 0.2574222222222222, + "grad_norm": 0.36125317215919495, + "learning_rate": 4.2270798419123534e-05, + "loss": 0.41, + "step": 1448 + }, + { + "epoch": 0.2576, + "grad_norm": 0.4312919080257416, + "learning_rate": 4.226070052692886e-05, + "loss": 0.5784, + "step": 1449 + }, + { + "epoch": 0.2577777777777778, + "grad_norm": 0.3851270079612732, + "learning_rate": 4.225059725063444e-05, + "loss": 0.4461, + "step": 1450 + }, + { + "epoch": 0.25795555555555555, + "grad_norm": 0.22475066781044006, + "learning_rate": 4.224048859339175e-05, + "loss": 0.4587, + "step": 1451 + }, + { + "epoch": 0.2581333333333333, + "grad_norm": 0.24634967744350433, + "learning_rate": 4.223037455835397e-05, + "loss": 0.4977, + "step": 1452 + }, + { + "epoch": 0.2583111111111111, + "grad_norm": 0.26186516880989075, + "learning_rate": 4.2220255148675956e-05, + "loss": 0.478, + "step": 1453 + }, + { + "epoch": 0.2584888888888889, + "grad_norm": 0.19384765625, + "learning_rate": 4.221013036751424e-05, + "loss": 0.4362, + "step": 1454 + }, + { + "epoch": 0.25866666666666666, + "grad_norm": 0.2394952028989792, + "learning_rate": 4.220000021802702e-05, + "loss": 0.4828, + "step": 1455 + }, + { + "epoch": 0.25884444444444443, + "grad_norm": 0.23462162911891937, + "learning_rate": 4.218986470337419e-05, + "loss": 0.4785, + "step": 1456 + }, + { + "epoch": 0.2590222222222222, + "grad_norm": 0.26691412925720215, + "learning_rate": 4.217972382671729e-05, + "loss": 0.6649, + "step": 1457 + }, + { + "epoch": 0.2592, + "grad_norm": 0.2758161425590515, + "learning_rate": 4.2169577591219545e-05, + "loss": 0.5351, + "step": 1458 + }, + { + "epoch": 0.25937777777777776, + "grad_norm": 0.2521554231643677, + "learning_rate": 4.2159426000045854e-05, + "loss": 0.5177, + "step": 1459 + }, + { + "epoch": 0.25955555555555554, + "grad_norm": 0.365726113319397, + "learning_rate": 4.2149269056362794e-05, + "loss": 0.5673, + "step": 1460 + }, + { + "epoch": 0.2597333333333333, + "grad_norm": 0.24749819934368134, + "learning_rate": 4.213910676333859e-05, + "loss": 0.5196, + "step": 1461 + }, + { + "epoch": 0.2599111111111111, + "grad_norm": 0.2708158791065216, + "learning_rate": 4.212893912414316e-05, + "loss": 0.5719, + "step": 1462 + }, + { + "epoch": 0.26008888888888887, + "grad_norm": 0.2683485746383667, + "learning_rate": 4.2118766141948066e-05, + "loss": 0.3993, + "step": 1463 + }, + { + "epoch": 0.26026666666666665, + "grad_norm": 0.22644978761672974, + "learning_rate": 4.2108587819926554e-05, + "loss": 0.5259, + "step": 1464 + }, + { + "epoch": 0.2604444444444444, + "grad_norm": 0.2376643419265747, + "learning_rate": 4.209840416125353e-05, + "loss": 0.5097, + "step": 1465 + }, + { + "epoch": 0.2606222222222222, + "grad_norm": 0.23214809596538544, + "learning_rate": 4.208821516910557e-05, + "loss": 0.5128, + "step": 1466 + }, + { + "epoch": 0.2608, + "grad_norm": 0.2017192542552948, + "learning_rate": 4.20780208466609e-05, + "loss": 0.3825, + "step": 1467 + }, + { + "epoch": 0.26097777777777775, + "grad_norm": 0.21186144649982452, + "learning_rate": 4.206782119709942e-05, + "loss": 0.4807, + "step": 1468 + }, + { + "epoch": 0.26115555555555553, + "grad_norm": 0.21056553721427917, + "learning_rate": 4.2057616223602684e-05, + "loss": 0.557, + "step": 1469 + }, + { + "epoch": 0.2613333333333333, + "grad_norm": 0.21942351758480072, + "learning_rate": 4.204740592935392e-05, + "loss": 0.3926, + "step": 1470 + }, + { + "epoch": 0.26151111111111114, + "grad_norm": 0.21538221836090088, + "learning_rate": 4.2037190317538e-05, + "loss": 0.4913, + "step": 1471 + }, + { + "epoch": 0.2616888888888889, + "grad_norm": 0.3418629765510559, + "learning_rate": 4.202696939134146e-05, + "loss": 0.6669, + "step": 1472 + }, + { + "epoch": 0.2618666666666667, + "grad_norm": 0.26230230927467346, + "learning_rate": 4.2016743153952505e-05, + "loss": 0.5268, + "step": 1473 + }, + { + "epoch": 0.26204444444444447, + "grad_norm": 0.21536362171173096, + "learning_rate": 4.200651160856098e-05, + "loss": 0.4836, + "step": 1474 + }, + { + "epoch": 0.26222222222222225, + "grad_norm": 0.23902572691440582, + "learning_rate": 4.19962747583584e-05, + "loss": 0.6719, + "step": 1475 + }, + { + "epoch": 0.2624, + "grad_norm": 0.23020555078983307, + "learning_rate": 4.198603260653792e-05, + "loss": 0.5276, + "step": 1476 + }, + { + "epoch": 0.2625777777777778, + "grad_norm": 0.2669456899166107, + "learning_rate": 4.197578515629435e-05, + "loss": 0.4361, + "step": 1477 + }, + { + "epoch": 0.2627555555555556, + "grad_norm": 0.24579648673534393, + "learning_rate": 4.196553241082418e-05, + "loss": 0.568, + "step": 1478 + }, + { + "epoch": 0.26293333333333335, + "grad_norm": 0.23900336027145386, + "learning_rate": 4.1955274373325506e-05, + "loss": 0.5411, + "step": 1479 + }, + { + "epoch": 0.26311111111111113, + "grad_norm": 0.2443108707666397, + "learning_rate": 4.194501104699812e-05, + "loss": 0.4607, + "step": 1480 + }, + { + "epoch": 0.2632888888888889, + "grad_norm": 0.27022042870521545, + "learning_rate": 4.193474243504343e-05, + "loss": 0.5241, + "step": 1481 + }, + { + "epoch": 0.2634666666666667, + "grad_norm": 0.2121763527393341, + "learning_rate": 4.192446854066452e-05, + "loss": 0.4122, + "step": 1482 + }, + { + "epoch": 0.26364444444444446, + "grad_norm": 0.256328821182251, + "learning_rate": 4.1914189367066094e-05, + "loss": 0.5656, + "step": 1483 + }, + { + "epoch": 0.26382222222222224, + "grad_norm": 0.22856400907039642, + "learning_rate": 4.1903904917454516e-05, + "loss": 0.5028, + "step": 1484 + }, + { + "epoch": 0.264, + "grad_norm": 0.24136513471603394, + "learning_rate": 4.18936151950378e-05, + "loss": 0.3829, + "step": 1485 + }, + { + "epoch": 0.2641777777777778, + "grad_norm": 0.26403841376304626, + "learning_rate": 4.188332020302561e-05, + "loss": 0.4771, + "step": 1486 + }, + { + "epoch": 0.26435555555555557, + "grad_norm": 0.2241847813129425, + "learning_rate": 4.187301994462924e-05, + "loss": 0.4929, + "step": 1487 + }, + { + "epoch": 0.26453333333333334, + "grad_norm": 0.21930277347564697, + "learning_rate": 4.1862714423061624e-05, + "loss": 0.4396, + "step": 1488 + }, + { + "epoch": 0.2647111111111111, + "grad_norm": 0.23635181784629822, + "learning_rate": 4.185240364153734e-05, + "loss": 0.5611, + "step": 1489 + }, + { + "epoch": 0.2648888888888889, + "grad_norm": 0.2261408120393753, + "learning_rate": 4.184208760327263e-05, + "loss": 0.439, + "step": 1490 + }, + { + "epoch": 0.2650666666666667, + "grad_norm": 0.2616260349750519, + "learning_rate": 4.183176631148534e-05, + "loss": 0.3843, + "step": 1491 + }, + { + "epoch": 0.26524444444444445, + "grad_norm": 0.2506815493106842, + "learning_rate": 4.1821439769395e-05, + "loss": 0.3813, + "step": 1492 + }, + { + "epoch": 0.2654222222222222, + "grad_norm": 0.2863088846206665, + "learning_rate": 4.181110798022271e-05, + "loss": 0.5414, + "step": 1493 + }, + { + "epoch": 0.2656, + "grad_norm": 0.2668296992778778, + "learning_rate": 4.180077094719128e-05, + "loss": 0.3518, + "step": 1494 + }, + { + "epoch": 0.2657777777777778, + "grad_norm": 0.43989288806915283, + "learning_rate": 4.179042867352511e-05, + "loss": 0.4817, + "step": 1495 + }, + { + "epoch": 0.26595555555555556, + "grad_norm": 0.2694304585456848, + "learning_rate": 4.178008116245024e-05, + "loss": 0.4193, + "step": 1496 + }, + { + "epoch": 0.26613333333333333, + "grad_norm": 0.3411727547645569, + "learning_rate": 4.176972841719435e-05, + "loss": 0.4088, + "step": 1497 + }, + { + "epoch": 0.2663111111111111, + "grad_norm": 0.32642364501953125, + "learning_rate": 4.1759370440986775e-05, + "loss": 0.5118, + "step": 1498 + }, + { + "epoch": 0.2664888888888889, + "grad_norm": 0.4388028681278229, + "learning_rate": 4.174900723705845e-05, + "loss": 0.5624, + "step": 1499 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.35872790217399597, + "learning_rate": 4.1738638808641936e-05, + "loss": 0.4269, + "step": 1500 + }, + { + "epoch": 0.26666666666666666, + "eval_loss": 0.48485440015792847, + "eval_runtime": 1815.9503, + "eval_samples_per_second": 2.753, + "eval_steps_per_second": 0.344, + "step": 1500 + }, + { + "epoch": 0.26684444444444444, + "grad_norm": 0.24652020633220673, + "learning_rate": 4.172826515897146e-05, + "loss": 0.4726, + "step": 1501 + }, + { + "epoch": 0.2670222222222222, + "grad_norm": 0.28594204783439636, + "learning_rate": 4.171788629128284e-05, + "loss": 0.5689, + "step": 1502 + }, + { + "epoch": 0.2672, + "grad_norm": 0.2464514672756195, + "learning_rate": 4.170750220881354e-05, + "loss": 0.3653, + "step": 1503 + }, + { + "epoch": 0.26737777777777777, + "grad_norm": 0.22022032737731934, + "learning_rate": 4.169711291480266e-05, + "loss": 0.4444, + "step": 1504 + }, + { + "epoch": 0.26755555555555555, + "grad_norm": 0.2939121723175049, + "learning_rate": 4.168671841249091e-05, + "loss": 0.4875, + "step": 1505 + }, + { + "epoch": 0.2677333333333333, + "grad_norm": 0.2217637598514557, + "learning_rate": 4.1676318705120616e-05, + "loss": 0.5097, + "step": 1506 + }, + { + "epoch": 0.2679111111111111, + "grad_norm": 0.16596266627311707, + "learning_rate": 4.166591379593575e-05, + "loss": 0.3922, + "step": 1507 + }, + { + "epoch": 0.2680888888888889, + "grad_norm": 0.2935040593147278, + "learning_rate": 4.16555036881819e-05, + "loss": 0.4468, + "step": 1508 + }, + { + "epoch": 0.26826666666666665, + "grad_norm": 0.2813069224357605, + "learning_rate": 4.1645088385106266e-05, + "loss": 0.5526, + "step": 1509 + }, + { + "epoch": 0.26844444444444443, + "grad_norm": 0.2901483476161957, + "learning_rate": 4.1634667889957676e-05, + "loss": 0.501, + "step": 1510 + }, + { + "epoch": 0.2686222222222222, + "grad_norm": 0.20306707918643951, + "learning_rate": 4.162424220598658e-05, + "loss": 0.4906, + "step": 1511 + }, + { + "epoch": 0.2688, + "grad_norm": 0.18119587004184723, + "learning_rate": 4.161381133644505e-05, + "loss": 0.4975, + "step": 1512 + }, + { + "epoch": 0.26897777777777776, + "grad_norm": 0.20812520384788513, + "learning_rate": 4.160337528458676e-05, + "loss": 0.427, + "step": 1513 + }, + { + "epoch": 0.26915555555555554, + "grad_norm": 0.21775951981544495, + "learning_rate": 4.1592934053667004e-05, + "loss": 0.5109, + "step": 1514 + }, + { + "epoch": 0.2693333333333333, + "grad_norm": 0.22875505685806274, + "learning_rate": 4.1582487646942706e-05, + "loss": 0.3462, + "step": 1515 + }, + { + "epoch": 0.2695111111111111, + "grad_norm": 0.21732567250728607, + "learning_rate": 4.157203606767238e-05, + "loss": 0.3937, + "step": 1516 + }, + { + "epoch": 0.26968888888888887, + "grad_norm": 0.26495614647865295, + "learning_rate": 4.156157931911619e-05, + "loss": 0.485, + "step": 1517 + }, + { + "epoch": 0.26986666666666664, + "grad_norm": 0.24223580956459045, + "learning_rate": 4.155111740453588e-05, + "loss": 0.4345, + "step": 1518 + }, + { + "epoch": 0.2700444444444444, + "grad_norm": 0.23529917001724243, + "learning_rate": 4.154065032719481e-05, + "loss": 0.3413, + "step": 1519 + }, + { + "epoch": 0.2702222222222222, + "grad_norm": 0.23065678775310516, + "learning_rate": 4.1530178090357976e-05, + "loss": 0.5157, + "step": 1520 + }, + { + "epoch": 0.2704, + "grad_norm": 0.2641994059085846, + "learning_rate": 4.1519700697291944e-05, + "loss": 0.5559, + "step": 1521 + }, + { + "epoch": 0.27057777777777775, + "grad_norm": 0.23958146572113037, + "learning_rate": 4.150921815126493e-05, + "loss": 0.5747, + "step": 1522 + }, + { + "epoch": 0.2707555555555556, + "grad_norm": 0.2547266483306885, + "learning_rate": 4.149873045554671e-05, + "loss": 0.5712, + "step": 1523 + }, + { + "epoch": 0.27093333333333336, + "grad_norm": 0.19969938695430756, + "learning_rate": 4.148823761340871e-05, + "loss": 0.4068, + "step": 1524 + }, + { + "epoch": 0.27111111111111114, + "grad_norm": 0.2441670000553131, + "learning_rate": 4.1477739628123934e-05, + "loss": 0.3867, + "step": 1525 + }, + { + "epoch": 0.2712888888888889, + "grad_norm": 0.20733439922332764, + "learning_rate": 4.146723650296701e-05, + "loss": 0.5424, + "step": 1526 + }, + { + "epoch": 0.2714666666666667, + "grad_norm": 0.250712126493454, + "learning_rate": 4.145672824121416e-05, + "loss": 0.5429, + "step": 1527 + }, + { + "epoch": 0.27164444444444447, + "grad_norm": 0.18215471506118774, + "learning_rate": 4.144621484614319e-05, + "loss": 0.4416, + "step": 1528 + }, + { + "epoch": 0.27182222222222224, + "grad_norm": 0.23842157423496246, + "learning_rate": 4.1435696321033554e-05, + "loss": 0.4608, + "step": 1529 + }, + { + "epoch": 0.272, + "grad_norm": 0.2514033317565918, + "learning_rate": 4.142517266916625e-05, + "loss": 0.5043, + "step": 1530 + }, + { + "epoch": 0.2721777777777778, + "grad_norm": 0.284056693315506, + "learning_rate": 4.1414643893823914e-05, + "loss": 0.6174, + "step": 1531 + }, + { + "epoch": 0.2723555555555556, + "grad_norm": 0.25382670760154724, + "learning_rate": 4.140410999829076e-05, + "loss": 0.4341, + "step": 1532 + }, + { + "epoch": 0.27253333333333335, + "grad_norm": 0.2005155086517334, + "learning_rate": 4.139357098585262e-05, + "loss": 0.4774, + "step": 1533 + }, + { + "epoch": 0.2727111111111111, + "grad_norm": 0.2968514859676361, + "learning_rate": 4.1383026859796905e-05, + "loss": 0.4934, + "step": 1534 + }, + { + "epoch": 0.2728888888888889, + "grad_norm": 0.28110137581825256, + "learning_rate": 4.137247762341262e-05, + "loss": 0.4875, + "step": 1535 + }, + { + "epoch": 0.2730666666666667, + "grad_norm": 0.28423967957496643, + "learning_rate": 4.136192327999037e-05, + "loss": 0.4619, + "step": 1536 + }, + { + "epoch": 0.27324444444444446, + "grad_norm": 0.2212304174900055, + "learning_rate": 4.135136383282237e-05, + "loss": 0.4596, + "step": 1537 + }, + { + "epoch": 0.27342222222222223, + "grad_norm": 0.22889535129070282, + "learning_rate": 4.1340799285202376e-05, + "loss": 0.4865, + "step": 1538 + }, + { + "epoch": 0.2736, + "grad_norm": 0.22949595749378204, + "learning_rate": 4.13302296404258e-05, + "loss": 0.4172, + "step": 1539 + }, + { + "epoch": 0.2737777777777778, + "grad_norm": 0.2354002594947815, + "learning_rate": 4.131965490178959e-05, + "loss": 0.5039, + "step": 1540 + }, + { + "epoch": 0.27395555555555556, + "grad_norm": 0.20702648162841797, + "learning_rate": 4.130907507259233e-05, + "loss": 0.3463, + "step": 1541 + }, + { + "epoch": 0.27413333333333334, + "grad_norm": 0.2758485674858093, + "learning_rate": 4.129849015613415e-05, + "loss": 0.4931, + "step": 1542 + }, + { + "epoch": 0.2743111111111111, + "grad_norm": 0.27681341767311096, + "learning_rate": 4.1287900155716784e-05, + "loss": 0.3863, + "step": 1543 + }, + { + "epoch": 0.2744888888888889, + "grad_norm": 0.2481858879327774, + "learning_rate": 4.127730507464356e-05, + "loss": 0.3915, + "step": 1544 + }, + { + "epoch": 0.27466666666666667, + "grad_norm": 0.254969984292984, + "learning_rate": 4.126670491621938e-05, + "loss": 0.3195, + "step": 1545 + }, + { + "epoch": 0.27484444444444445, + "grad_norm": 0.3109833896160126, + "learning_rate": 4.125609968375072e-05, + "loss": 0.3843, + "step": 1546 + }, + { + "epoch": 0.2750222222222222, + "grad_norm": 0.2881673276424408, + "learning_rate": 4.124548938054568e-05, + "loss": 0.5282, + "step": 1547 + }, + { + "epoch": 0.2752, + "grad_norm": 0.3744480609893799, + "learning_rate": 4.123487400991388e-05, + "loss": 0.4048, + "step": 1548 + }, + { + "epoch": 0.2753777777777778, + "grad_norm": 0.30865803360939026, + "learning_rate": 4.122425357516658e-05, + "loss": 0.405, + "step": 1549 + }, + { + "epoch": 0.27555555555555555, + "grad_norm": 0.33565211296081543, + "learning_rate": 4.121362807961658e-05, + "loss": 0.3849, + "step": 1550 + }, + { + "epoch": 0.27573333333333333, + "grad_norm": 0.25992846488952637, + "learning_rate": 4.1202997526578276e-05, + "loss": 0.4958, + "step": 1551 + }, + { + "epoch": 0.2759111111111111, + "grad_norm": 0.2575809359550476, + "learning_rate": 4.119236191936764e-05, + "loss": 0.5551, + "step": 1552 + }, + { + "epoch": 0.2760888888888889, + "grad_norm": 0.24611805379390717, + "learning_rate": 4.118172126130221e-05, + "loss": 0.3948, + "step": 1553 + }, + { + "epoch": 0.27626666666666666, + "grad_norm": 0.2759351432323456, + "learning_rate": 4.117107555570111e-05, + "loss": 0.3766, + "step": 1554 + }, + { + "epoch": 0.27644444444444444, + "grad_norm": 0.2714369297027588, + "learning_rate": 4.116042480588505e-05, + "loss": 0.6492, + "step": 1555 + }, + { + "epoch": 0.2766222222222222, + "grad_norm": 0.3663914203643799, + "learning_rate": 4.1149769015176275e-05, + "loss": 0.5837, + "step": 1556 + }, + { + "epoch": 0.2768, + "grad_norm": 0.2502076029777527, + "learning_rate": 4.113910818689864e-05, + "loss": 0.4025, + "step": 1557 + }, + { + "epoch": 0.27697777777777777, + "grad_norm": 0.1820707619190216, + "learning_rate": 4.112844232437757e-05, + "loss": 0.3326, + "step": 1558 + }, + { + "epoch": 0.27715555555555554, + "grad_norm": 0.21882522106170654, + "learning_rate": 4.1117771430940035e-05, + "loss": 0.5356, + "step": 1559 + }, + { + "epoch": 0.2773333333333333, + "grad_norm": 0.18644623458385468, + "learning_rate": 4.1107095509914584e-05, + "loss": 0.3793, + "step": 1560 + }, + { + "epoch": 0.2775111111111111, + "grad_norm": 0.23072949051856995, + "learning_rate": 4.109641456463135e-05, + "loss": 0.423, + "step": 1561 + }, + { + "epoch": 0.2776888888888889, + "grad_norm": 0.25749099254608154, + "learning_rate": 4.108572859842201e-05, + "loss": 0.4936, + "step": 1562 + }, + { + "epoch": 0.27786666666666665, + "grad_norm": 0.195820152759552, + "learning_rate": 4.107503761461983e-05, + "loss": 0.4212, + "step": 1563 + }, + { + "epoch": 0.2780444444444444, + "grad_norm": 0.22111307084560394, + "learning_rate": 4.106434161655962e-05, + "loss": 0.3951, + "step": 1564 + }, + { + "epoch": 0.2782222222222222, + "grad_norm": 0.21331843733787537, + "learning_rate": 4.105364060757776e-05, + "loss": 0.5137, + "step": 1565 + }, + { + "epoch": 0.2784, + "grad_norm": 0.21190695464611053, + "learning_rate": 4.104293459101222e-05, + "loss": 0.605, + "step": 1566 + }, + { + "epoch": 0.27857777777777776, + "grad_norm": 0.2378402054309845, + "learning_rate": 4.1032223570202474e-05, + "loss": 0.6654, + "step": 1567 + }, + { + "epoch": 0.27875555555555553, + "grad_norm": 0.25793296098709106, + "learning_rate": 4.1021507548489625e-05, + "loss": 0.6217, + "step": 1568 + }, + { + "epoch": 0.2789333333333333, + "grad_norm": 0.2183181494474411, + "learning_rate": 4.1010786529216284e-05, + "loss": 0.4419, + "step": 1569 + }, + { + "epoch": 0.2791111111111111, + "grad_norm": 0.2204470932483673, + "learning_rate": 4.1000060515726647e-05, + "loss": 0.41, + "step": 1570 + }, + { + "epoch": 0.27928888888888886, + "grad_norm": 0.20785318315029144, + "learning_rate": 4.098932951136645e-05, + "loss": 0.4394, + "step": 1571 + }, + { + "epoch": 0.27946666666666664, + "grad_norm": 0.26133033633232117, + "learning_rate": 4.097859351948301e-05, + "loss": 0.5198, + "step": 1572 + }, + { + "epoch": 0.2796444444444444, + "grad_norm": 0.19836439192295074, + "learning_rate": 4.0967852543425175e-05, + "loss": 0.5097, + "step": 1573 + }, + { + "epoch": 0.27982222222222225, + "grad_norm": 0.2608725428581238, + "learning_rate": 4.095710658654337e-05, + "loss": 0.3913, + "step": 1574 + }, + { + "epoch": 0.28, + "grad_norm": 0.24927951395511627, + "learning_rate": 4.094635565218955e-05, + "loss": 0.516, + "step": 1575 + }, + { + "epoch": 0.2801777777777778, + "grad_norm": 0.29135334491729736, + "learning_rate": 4.093559974371725e-05, + "loss": 0.419, + "step": 1576 + }, + { + "epoch": 0.2803555555555556, + "grad_norm": 0.23141182959079742, + "learning_rate": 4.0924838864481516e-05, + "loss": 0.5006, + "step": 1577 + }, + { + "epoch": 0.28053333333333336, + "grad_norm": 0.212726891040802, + "learning_rate": 4.0914073017838996e-05, + "loss": 0.5809, + "step": 1578 + }, + { + "epoch": 0.28071111111111113, + "grad_norm": 0.24448910355567932, + "learning_rate": 4.090330220714785e-05, + "loss": 0.4599, + "step": 1579 + }, + { + "epoch": 0.2808888888888889, + "grad_norm": 0.26687201857566833, + "learning_rate": 4.0892526435767795e-05, + "loss": 0.6305, + "step": 1580 + }, + { + "epoch": 0.2810666666666667, + "grad_norm": 0.20560559630393982, + "learning_rate": 4.088174570706011e-05, + "loss": 0.4353, + "step": 1581 + }, + { + "epoch": 0.28124444444444446, + "grad_norm": 0.2660726010799408, + "learning_rate": 4.0870960024387596e-05, + "loss": 0.4926, + "step": 1582 + }, + { + "epoch": 0.28142222222222224, + "grad_norm": 0.32632845640182495, + "learning_rate": 4.0860169391114625e-05, + "loss": 0.7468, + "step": 1583 + }, + { + "epoch": 0.2816, + "grad_norm": 0.24986085295677185, + "learning_rate": 4.084937381060708e-05, + "loss": 0.4779, + "step": 1584 + }, + { + "epoch": 0.2817777777777778, + "grad_norm": 0.2901780903339386, + "learning_rate": 4.083857328623243e-05, + "loss": 0.5599, + "step": 1585 + }, + { + "epoch": 0.28195555555555557, + "grad_norm": 0.2497858852148056, + "learning_rate": 4.082776782135964e-05, + "loss": 0.4285, + "step": 1586 + }, + { + "epoch": 0.28213333333333335, + "grad_norm": 0.30241838097572327, + "learning_rate": 4.0816957419359264e-05, + "loss": 0.551, + "step": 1587 + }, + { + "epoch": 0.2823111111111111, + "grad_norm": 0.240733802318573, + "learning_rate": 4.080614208360336e-05, + "loss": 0.6085, + "step": 1588 + }, + { + "epoch": 0.2824888888888889, + "grad_norm": 0.23214389383792877, + "learning_rate": 4.079532181746553e-05, + "loss": 0.4408, + "step": 1589 + }, + { + "epoch": 0.2826666666666667, + "grad_norm": 0.3076317310333252, + "learning_rate": 4.078449662432093e-05, + "loss": 0.4799, + "step": 1590 + }, + { + "epoch": 0.28284444444444445, + "grad_norm": 0.3100294768810272, + "learning_rate": 4.077366650754624e-05, + "loss": 0.4326, + "step": 1591 + }, + { + "epoch": 0.28302222222222223, + "grad_norm": 0.3078424632549286, + "learning_rate": 4.076283147051968e-05, + "loss": 0.4135, + "step": 1592 + }, + { + "epoch": 0.2832, + "grad_norm": 0.3391684889793396, + "learning_rate": 4.075199151662101e-05, + "loss": 0.5718, + "step": 1593 + }, + { + "epoch": 0.2833777777777778, + "grad_norm": 0.2584700882434845, + "learning_rate": 4.0741146649231504e-05, + "loss": 0.406, + "step": 1594 + }, + { + "epoch": 0.28355555555555556, + "grad_norm": 0.3204618990421295, + "learning_rate": 4.073029687173399e-05, + "loss": 0.3745, + "step": 1595 + }, + { + "epoch": 0.28373333333333334, + "grad_norm": 0.2907264828681946, + "learning_rate": 4.071944218751282e-05, + "loss": 0.542, + "step": 1596 + }, + { + "epoch": 0.2839111111111111, + "grad_norm": 0.27430570125579834, + "learning_rate": 4.070858259995387e-05, + "loss": 0.4433, + "step": 1597 + }, + { + "epoch": 0.2840888888888889, + "grad_norm": 0.28936102986335754, + "learning_rate": 4.069771811244457e-05, + "loss": 0.4237, + "step": 1598 + }, + { + "epoch": 0.28426666666666667, + "grad_norm": 0.36423459649086, + "learning_rate": 4.068684872837384e-05, + "loss": 0.4171, + "step": 1599 + }, + { + "epoch": 0.28444444444444444, + "grad_norm": 0.3362770080566406, + "learning_rate": 4.067597445113216e-05, + "loss": 0.4717, + "step": 1600 + }, + { + "epoch": 0.2846222222222222, + "grad_norm": 0.2398492991924286, + "learning_rate": 4.066509528411152e-05, + "loss": 0.4841, + "step": 1601 + }, + { + "epoch": 0.2848, + "grad_norm": 0.27051061391830444, + "learning_rate": 4.065421123070543e-05, + "loss": 0.4635, + "step": 1602 + }, + { + "epoch": 0.2849777777777778, + "grad_norm": 0.26619118452072144, + "learning_rate": 4.064332229430895e-05, + "loss": 0.4471, + "step": 1603 + }, + { + "epoch": 0.28515555555555555, + "grad_norm": 0.29838183522224426, + "learning_rate": 4.063242847831864e-05, + "loss": 0.6265, + "step": 1604 + }, + { + "epoch": 0.2853333333333333, + "grad_norm": 0.30956029891967773, + "learning_rate": 4.062152978613258e-05, + "loss": 0.5344, + "step": 1605 + }, + { + "epoch": 0.2855111111111111, + "grad_norm": 0.21937286853790283, + "learning_rate": 4.0610626221150394e-05, + "loss": 0.4774, + "step": 1606 + }, + { + "epoch": 0.2856888888888889, + "grad_norm": 0.2073107808828354, + "learning_rate": 4.0599717786773204e-05, + "loss": 0.3971, + "step": 1607 + }, + { + "epoch": 0.28586666666666666, + "grad_norm": 0.22711306810379028, + "learning_rate": 4.058880448640367e-05, + "loss": 0.5173, + "step": 1608 + }, + { + "epoch": 0.28604444444444443, + "grad_norm": 0.25710800290107727, + "learning_rate": 4.057788632344593e-05, + "loss": 0.4084, + "step": 1609 + }, + { + "epoch": 0.2862222222222222, + "grad_norm": 0.25240039825439453, + "learning_rate": 4.0566963301305705e-05, + "loss": 0.448, + "step": 1610 + }, + { + "epoch": 0.2864, + "grad_norm": 0.29141196608543396, + "learning_rate": 4.055603542339016e-05, + "loss": 0.4506, + "step": 1611 + }, + { + "epoch": 0.28657777777777776, + "grad_norm": 0.19746699929237366, + "learning_rate": 4.054510269310803e-05, + "loss": 0.4524, + "step": 1612 + }, + { + "epoch": 0.28675555555555554, + "grad_norm": 0.23472252488136292, + "learning_rate": 4.053416511386954e-05, + "loss": 0.4693, + "step": 1613 + }, + { + "epoch": 0.2869333333333333, + "grad_norm": 0.2002190500497818, + "learning_rate": 4.0523222689086414e-05, + "loss": 0.4578, + "step": 1614 + }, + { + "epoch": 0.2871111111111111, + "grad_norm": 0.21806994080543518, + "learning_rate": 4.051227542217192e-05, + "loss": 0.6064, + "step": 1615 + }, + { + "epoch": 0.28728888888888887, + "grad_norm": 0.24389760196208954, + "learning_rate": 4.050132331654082e-05, + "loss": 0.4888, + "step": 1616 + }, + { + "epoch": 0.28746666666666665, + "grad_norm": 0.22597664594650269, + "learning_rate": 4.0490366375609376e-05, + "loss": 0.6411, + "step": 1617 + }, + { + "epoch": 0.2876444444444444, + "grad_norm": 0.251056045293808, + "learning_rate": 4.047940460279537e-05, + "loss": 0.4501, + "step": 1618 + }, + { + "epoch": 0.2878222222222222, + "grad_norm": 0.20650461316108704, + "learning_rate": 4.0468438001518084e-05, + "loss": 0.4326, + "step": 1619 + }, + { + "epoch": 0.288, + "grad_norm": 0.20276285707950592, + "learning_rate": 4.045746657519831e-05, + "loss": 0.5027, + "step": 1620 + }, + { + "epoch": 0.28817777777777775, + "grad_norm": 0.21045973896980286, + "learning_rate": 4.044649032725836e-05, + "loss": 0.4085, + "step": 1621 + }, + { + "epoch": 0.28835555555555553, + "grad_norm": 0.2793443500995636, + "learning_rate": 4.043550926112203e-05, + "loss": 0.5925, + "step": 1622 + }, + { + "epoch": 0.2885333333333333, + "grad_norm": 0.2458329051733017, + "learning_rate": 4.042452338021461e-05, + "loss": 0.5399, + "step": 1623 + }, + { + "epoch": 0.2887111111111111, + "grad_norm": 0.2701219618320465, + "learning_rate": 4.041353268796293e-05, + "loss": 0.5973, + "step": 1624 + }, + { + "epoch": 0.28888888888888886, + "grad_norm": 0.21649327874183655, + "learning_rate": 4.0402537187795274e-05, + "loss": 0.4322, + "step": 1625 + }, + { + "epoch": 0.2890666666666667, + "grad_norm": 0.28817254304885864, + "learning_rate": 4.039153688314145e-05, + "loss": 0.5904, + "step": 1626 + }, + { + "epoch": 0.28924444444444447, + "grad_norm": 0.23212578892707825, + "learning_rate": 4.0380531777432794e-05, + "loss": 0.4125, + "step": 1627 + }, + { + "epoch": 0.28942222222222225, + "grad_norm": 0.200678750872612, + "learning_rate": 4.036952187410208e-05, + "loss": 0.3569, + "step": 1628 + }, + { + "epoch": 0.2896, + "grad_norm": 0.20307832956314087, + "learning_rate": 4.035850717658362e-05, + "loss": 0.5187, + "step": 1629 + }, + { + "epoch": 0.2897777777777778, + "grad_norm": 0.24210457503795624, + "learning_rate": 4.0347487688313194e-05, + "loss": 0.4213, + "step": 1630 + }, + { + "epoch": 0.2899555555555556, + "grad_norm": 0.20839503407478333, + "learning_rate": 4.033646341272811e-05, + "loss": 0.5448, + "step": 1631 + }, + { + "epoch": 0.29013333333333335, + "grad_norm": 0.24183082580566406, + "learning_rate": 4.032543435326714e-05, + "loss": 0.5771, + "step": 1632 + }, + { + "epoch": 0.29031111111111113, + "grad_norm": 0.23193883895874023, + "learning_rate": 4.031440051337056e-05, + "loss": 0.5367, + "step": 1633 + }, + { + "epoch": 0.2904888888888889, + "grad_norm": 0.31010133028030396, + "learning_rate": 4.030336189648014e-05, + "loss": 0.4983, + "step": 1634 + }, + { + "epoch": 0.2906666666666667, + "grad_norm": 0.21981319785118103, + "learning_rate": 4.029231850603914e-05, + "loss": 0.5145, + "step": 1635 + }, + { + "epoch": 0.29084444444444446, + "grad_norm": 0.221828892827034, + "learning_rate": 4.028127034549229e-05, + "loss": 0.5115, + "step": 1636 + }, + { + "epoch": 0.29102222222222224, + "grad_norm": 0.20732784271240234, + "learning_rate": 4.027021741828584e-05, + "loss": 0.5271, + "step": 1637 + }, + { + "epoch": 0.2912, + "grad_norm": 0.21683849394321442, + "learning_rate": 4.0259159727867504e-05, + "loss": 0.4002, + "step": 1638 + }, + { + "epoch": 0.2913777777777778, + "grad_norm": 0.2405775785446167, + "learning_rate": 4.024809727768648e-05, + "loss": 0.5526, + "step": 1639 + }, + { + "epoch": 0.29155555555555557, + "grad_norm": 0.2961327135562897, + "learning_rate": 4.023703007119347e-05, + "loss": 0.5706, + "step": 1640 + }, + { + "epoch": 0.29173333333333334, + "grad_norm": 0.24386747181415558, + "learning_rate": 4.022595811184064e-05, + "loss": 0.4983, + "step": 1641 + }, + { + "epoch": 0.2919111111111111, + "grad_norm": 0.22576314210891724, + "learning_rate": 4.021488140308165e-05, + "loss": 0.3676, + "step": 1642 + }, + { + "epoch": 0.2920888888888889, + "grad_norm": 0.2773377597332001, + "learning_rate": 4.020379994837164e-05, + "loss": 0.3953, + "step": 1643 + }, + { + "epoch": 0.2922666666666667, + "grad_norm": 0.26731356978416443, + "learning_rate": 4.019271375116722e-05, + "loss": 0.5052, + "step": 1644 + }, + { + "epoch": 0.29244444444444445, + "grad_norm": 0.30060598254203796, + "learning_rate": 4.0181622814926504e-05, + "loss": 0.5325, + "step": 1645 + }, + { + "epoch": 0.29262222222222223, + "grad_norm": 0.27068033814430237, + "learning_rate": 4.017052714310906e-05, + "loss": 0.3108, + "step": 1646 + }, + { + "epoch": 0.2928, + "grad_norm": 0.30356886982917786, + "learning_rate": 4.015942673917593e-05, + "loss": 0.4212, + "step": 1647 + }, + { + "epoch": 0.2929777777777778, + "grad_norm": 0.36942142248153687, + "learning_rate": 4.0148321606589656e-05, + "loss": 0.4332, + "step": 1648 + }, + { + "epoch": 0.29315555555555556, + "grad_norm": 0.3672679364681244, + "learning_rate": 4.013721174881425e-05, + "loss": 0.445, + "step": 1649 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 0.3945762515068054, + "learning_rate": 4.012609716931517e-05, + "loss": 0.4994, + "step": 1650 + }, + { + "epoch": 0.2935111111111111, + "grad_norm": 0.2316383421421051, + "learning_rate": 4.011497787155938e-05, + "loss": 0.4043, + "step": 1651 + }, + { + "epoch": 0.2936888888888889, + "grad_norm": 0.3398202657699585, + "learning_rate": 4.01038538590153e-05, + "loss": 0.607, + "step": 1652 + }, + { + "epoch": 0.29386666666666666, + "grad_norm": 0.28314563632011414, + "learning_rate": 4.009272513515281e-05, + "loss": 0.4341, + "step": 1653 + }, + { + "epoch": 0.29404444444444444, + "grad_norm": 0.24392464756965637, + "learning_rate": 4.00815917034433e-05, + "loss": 0.3375, + "step": 1654 + }, + { + "epoch": 0.2942222222222222, + "grad_norm": 0.25865399837493896, + "learning_rate": 4.007045356735959e-05, + "loss": 0.4945, + "step": 1655 + }, + { + "epoch": 0.2944, + "grad_norm": 0.3179703950881958, + "learning_rate": 4.005931073037596e-05, + "loss": 0.5979, + "step": 1656 + }, + { + "epoch": 0.29457777777777777, + "grad_norm": 0.20923791825771332, + "learning_rate": 4.0048163195968214e-05, + "loss": 0.4971, + "step": 1657 + }, + { + "epoch": 0.29475555555555555, + "grad_norm": 0.24600355327129364, + "learning_rate": 4.003701096761355e-05, + "loss": 0.5726, + "step": 1658 + }, + { + "epoch": 0.2949333333333333, + "grad_norm": 0.2677474915981293, + "learning_rate": 4.0025854048790677e-05, + "loss": 0.5259, + "step": 1659 + }, + { + "epoch": 0.2951111111111111, + "grad_norm": 0.24998877942562103, + "learning_rate": 4.001469244297975e-05, + "loss": 0.5909, + "step": 1660 + }, + { + "epoch": 0.2952888888888889, + "grad_norm": 0.2508189082145691, + "learning_rate": 4.000352615366239e-05, + "loss": 0.4954, + "step": 1661 + }, + { + "epoch": 0.29546666666666666, + "grad_norm": 0.3233291804790497, + "learning_rate": 3.999235518432168e-05, + "loss": 0.4778, + "step": 1662 + }, + { + "epoch": 0.29564444444444443, + "grad_norm": 0.22414641082286835, + "learning_rate": 3.9981179538442146e-05, + "loss": 0.5158, + "step": 1663 + }, + { + "epoch": 0.2958222222222222, + "grad_norm": 0.23457972705364227, + "learning_rate": 3.996999921950981e-05, + "loss": 0.3846, + "step": 1664 + }, + { + "epoch": 0.296, + "grad_norm": 0.21062356233596802, + "learning_rate": 3.9958814231012115e-05, + "loss": 0.4392, + "step": 1665 + }, + { + "epoch": 0.29617777777777776, + "grad_norm": 0.20708103477954865, + "learning_rate": 3.9947624576437975e-05, + "loss": 0.475, + "step": 1666 + }, + { + "epoch": 0.29635555555555554, + "grad_norm": 0.22435812652111053, + "learning_rate": 3.993643025927776e-05, + "loss": 0.4604, + "step": 1667 + }, + { + "epoch": 0.2965333333333333, + "grad_norm": 0.25603124499320984, + "learning_rate": 3.99252312830233e-05, + "loss": 0.5115, + "step": 1668 + }, + { + "epoch": 0.2967111111111111, + "grad_norm": 0.22671236097812653, + "learning_rate": 3.9914027651167866e-05, + "loss": 0.4006, + "step": 1669 + }, + { + "epoch": 0.29688888888888887, + "grad_norm": 0.2064259648323059, + "learning_rate": 3.990281936720619e-05, + "loss": 0.4479, + "step": 1670 + }, + { + "epoch": 0.29706666666666665, + "grad_norm": 0.28547775745391846, + "learning_rate": 3.989160643463445e-05, + "loss": 0.6084, + "step": 1671 + }, + { + "epoch": 0.2972444444444444, + "grad_norm": 0.24645598232746124, + "learning_rate": 3.988038885695028e-05, + "loss": 0.3842, + "step": 1672 + }, + { + "epoch": 0.2974222222222222, + "grad_norm": 0.275958389043808, + "learning_rate": 3.986916663765275e-05, + "loss": 0.5749, + "step": 1673 + }, + { + "epoch": 0.2976, + "grad_norm": 0.2431192398071289, + "learning_rate": 3.985793978024239e-05, + "loss": 0.6435, + "step": 1674 + }, + { + "epoch": 0.29777777777777775, + "grad_norm": 0.3277748227119446, + "learning_rate": 3.984670828822118e-05, + "loss": 0.6528, + "step": 1675 + }, + { + "epoch": 0.29795555555555553, + "grad_norm": 0.2489626556634903, + "learning_rate": 3.983547216509254e-05, + "loss": 0.4292, + "step": 1676 + }, + { + "epoch": 0.2981333333333333, + "grad_norm": 0.22843550145626068, + "learning_rate": 3.9824231414361324e-05, + "loss": 0.3957, + "step": 1677 + }, + { + "epoch": 0.29831111111111114, + "grad_norm": 0.24791912734508514, + "learning_rate": 3.981298603953385e-05, + "loss": 0.5764, + "step": 1678 + }, + { + "epoch": 0.2984888888888889, + "grad_norm": 0.24238701164722443, + "learning_rate": 3.980173604411786e-05, + "loss": 0.595, + "step": 1679 + }, + { + "epoch": 0.2986666666666667, + "grad_norm": 0.2152300924062729, + "learning_rate": 3.979048143162255e-05, + "loss": 0.3898, + "step": 1680 + }, + { + "epoch": 0.29884444444444447, + "grad_norm": 0.25221845507621765, + "learning_rate": 3.977922220555855e-05, + "loss": 0.5145, + "step": 1681 + }, + { + "epoch": 0.29902222222222224, + "grad_norm": 0.23014791309833527, + "learning_rate": 3.976795836943793e-05, + "loss": 0.5697, + "step": 1682 + }, + { + "epoch": 0.2992, + "grad_norm": 0.2948393225669861, + "learning_rate": 3.9756689926774196e-05, + "loss": 0.4177, + "step": 1683 + }, + { + "epoch": 0.2993777777777778, + "grad_norm": 0.24183227121829987, + "learning_rate": 3.97454168810823e-05, + "loss": 0.5183, + "step": 1684 + }, + { + "epoch": 0.2995555555555556, + "grad_norm": 0.25105908513069153, + "learning_rate": 3.973413923587862e-05, + "loss": 0.5281, + "step": 1685 + }, + { + "epoch": 0.29973333333333335, + "grad_norm": 0.22666655480861664, + "learning_rate": 3.9722856994680966e-05, + "loss": 0.4322, + "step": 1686 + }, + { + "epoch": 0.29991111111111113, + "grad_norm": 0.2383197396993637, + "learning_rate": 3.9711570161008596e-05, + "loss": 0.4463, + "step": 1687 + }, + { + "epoch": 0.3000888888888889, + "grad_norm": 0.23485571146011353, + "learning_rate": 3.970027873838219e-05, + "loss": 0.6199, + "step": 1688 + }, + { + "epoch": 0.3002666666666667, + "grad_norm": 0.32722291350364685, + "learning_rate": 3.9688982730323865e-05, + "loss": 0.5191, + "step": 1689 + }, + { + "epoch": 0.30044444444444446, + "grad_norm": 0.29917100071907043, + "learning_rate": 3.967768214035715e-05, + "loss": 0.4298, + "step": 1690 + }, + { + "epoch": 0.30062222222222224, + "grad_norm": 0.3028694689273834, + "learning_rate": 3.966637697200703e-05, + "loss": 0.5412, + "step": 1691 + }, + { + "epoch": 0.3008, + "grad_norm": 0.2963564991950989, + "learning_rate": 3.965506722879991e-05, + "loss": 0.4126, + "step": 1692 + }, + { + "epoch": 0.3009777777777778, + "grad_norm": 0.3049181401729584, + "learning_rate": 3.964375291426361e-05, + "loss": 0.4623, + "step": 1693 + }, + { + "epoch": 0.30115555555555557, + "grad_norm": 0.2815674841403961, + "learning_rate": 3.963243403192739e-05, + "loss": 0.4347, + "step": 1694 + }, + { + "epoch": 0.30133333333333334, + "grad_norm": 0.31967678666114807, + "learning_rate": 3.962111058532192e-05, + "loss": 0.4895, + "step": 1695 + }, + { + "epoch": 0.3015111111111111, + "grad_norm": 0.37411928176879883, + "learning_rate": 3.960978257797931e-05, + "loss": 0.3972, + "step": 1696 + }, + { + "epoch": 0.3016888888888889, + "grad_norm": 0.33468058705329895, + "learning_rate": 3.9598450013433075e-05, + "loss": 0.3578, + "step": 1697 + }, + { + "epoch": 0.30186666666666667, + "grad_norm": 0.40379950404167175, + "learning_rate": 3.9587112895218184e-05, + "loss": 0.4178, + "step": 1698 + }, + { + "epoch": 0.30204444444444445, + "grad_norm": 0.4760851562023163, + "learning_rate": 3.957577122687098e-05, + "loss": 0.4437, + "step": 1699 + }, + { + "epoch": 0.3022222222222222, + "grad_norm": 0.48131507635116577, + "learning_rate": 3.9564425011929265e-05, + "loss": 0.4827, + "step": 1700 + }, + { + "epoch": 0.3024, + "grad_norm": 0.2509986162185669, + "learning_rate": 3.955307425393224e-05, + "loss": 0.5208, + "step": 1701 + }, + { + "epoch": 0.3025777777777778, + "grad_norm": 0.21239590644836426, + "learning_rate": 3.954171895642052e-05, + "loss": 0.5983, + "step": 1702 + }, + { + "epoch": 0.30275555555555556, + "grad_norm": 0.3689562678337097, + "learning_rate": 3.953035912293616e-05, + "loss": 0.5489, + "step": 1703 + }, + { + "epoch": 0.30293333333333333, + "grad_norm": 0.27050459384918213, + "learning_rate": 3.951899475702259e-05, + "loss": 0.6034, + "step": 1704 + }, + { + "epoch": 0.3031111111111111, + "grad_norm": 0.3071124255657196, + "learning_rate": 3.950762586222468e-05, + "loss": 0.5167, + "step": 1705 + }, + { + "epoch": 0.3032888888888889, + "grad_norm": 0.25853657722473145, + "learning_rate": 3.9496252442088733e-05, + "loss": 0.4756, + "step": 1706 + }, + { + "epoch": 0.30346666666666666, + "grad_norm": 0.2545461654663086, + "learning_rate": 3.948487450016242e-05, + "loss": 0.5748, + "step": 1707 + }, + { + "epoch": 0.30364444444444444, + "grad_norm": 0.2136763632297516, + "learning_rate": 3.947349203999484e-05, + "loss": 0.4262, + "step": 1708 + }, + { + "epoch": 0.3038222222222222, + "grad_norm": 0.22998157143592834, + "learning_rate": 3.946210506513651e-05, + "loss": 0.5375, + "step": 1709 + }, + { + "epoch": 0.304, + "grad_norm": 0.2568053901195526, + "learning_rate": 3.945071357913935e-05, + "loss": 0.6967, + "step": 1710 + }, + { + "epoch": 0.30417777777777777, + "grad_norm": 0.26576822996139526, + "learning_rate": 3.943931758555669e-05, + "loss": 0.6003, + "step": 1711 + }, + { + "epoch": 0.30435555555555555, + "grad_norm": 0.3024243414402008, + "learning_rate": 3.942791708794326e-05, + "loss": 0.5089, + "step": 1712 + }, + { + "epoch": 0.3045333333333333, + "grad_norm": 0.23126915097236633, + "learning_rate": 3.9416512089855184e-05, + "loss": 0.3945, + "step": 1713 + }, + { + "epoch": 0.3047111111111111, + "grad_norm": 0.24421076476573944, + "learning_rate": 3.940510259485002e-05, + "loss": 0.5101, + "step": 1714 + }, + { + "epoch": 0.3048888888888889, + "grad_norm": 0.2684940993785858, + "learning_rate": 3.939368860648669e-05, + "loss": 0.5511, + "step": 1715 + }, + { + "epoch": 0.30506666666666665, + "grad_norm": 0.26677650213241577, + "learning_rate": 3.938227012832557e-05, + "loss": 0.5932, + "step": 1716 + }, + { + "epoch": 0.30524444444444443, + "grad_norm": 0.28246426582336426, + "learning_rate": 3.937084716392838e-05, + "loss": 0.4421, + "step": 1717 + }, + { + "epoch": 0.3054222222222222, + "grad_norm": 0.18641994893550873, + "learning_rate": 3.9359419716858274e-05, + "loss": 0.3947, + "step": 1718 + }, + { + "epoch": 0.3056, + "grad_norm": 0.3639377951622009, + "learning_rate": 3.93479877906798e-05, + "loss": 0.4888, + "step": 1719 + }, + { + "epoch": 0.30577777777777776, + "grad_norm": 0.3343105614185333, + "learning_rate": 3.933655138895889e-05, + "loss": 0.8343, + "step": 1720 + }, + { + "epoch": 0.30595555555555554, + "grad_norm": 0.22481101751327515, + "learning_rate": 3.932511051526289e-05, + "loss": 0.4321, + "step": 1721 + }, + { + "epoch": 0.3061333333333333, + "grad_norm": 0.2662048041820526, + "learning_rate": 3.931366517316052e-05, + "loss": 0.5768, + "step": 1722 + }, + { + "epoch": 0.3063111111111111, + "grad_norm": 0.24762697517871857, + "learning_rate": 3.930221536622191e-05, + "loss": 0.5002, + "step": 1723 + }, + { + "epoch": 0.30648888888888887, + "grad_norm": 0.2484867423772812, + "learning_rate": 3.9290761098018585e-05, + "loss": 0.51, + "step": 1724 + }, + { + "epoch": 0.30666666666666664, + "grad_norm": 0.23064836859703064, + "learning_rate": 3.927930237212345e-05, + "loss": 0.4966, + "step": 1725 + }, + { + "epoch": 0.3068444444444444, + "grad_norm": 0.2692429721355438, + "learning_rate": 3.92678391921108e-05, + "loss": 0.559, + "step": 1726 + }, + { + "epoch": 0.3070222222222222, + "grad_norm": 0.26322412490844727, + "learning_rate": 3.925637156155633e-05, + "loss": 0.4627, + "step": 1727 + }, + { + "epoch": 0.3072, + "grad_norm": 0.276094526052475, + "learning_rate": 3.924489948403711e-05, + "loss": 0.4309, + "step": 1728 + }, + { + "epoch": 0.3073777777777778, + "grad_norm": 0.21968096494674683, + "learning_rate": 3.9233422963131616e-05, + "loss": 0.4189, + "step": 1729 + }, + { + "epoch": 0.3075555555555556, + "grad_norm": 0.23094774782657623, + "learning_rate": 3.922194200241969e-05, + "loss": 0.3949, + "step": 1730 + }, + { + "epoch": 0.30773333333333336, + "grad_norm": 0.2851698100566864, + "learning_rate": 3.9210456605482576e-05, + "loss": 0.6882, + "step": 1731 + }, + { + "epoch": 0.30791111111111114, + "grad_norm": 0.22909457981586456, + "learning_rate": 3.919896677590289e-05, + "loss": 0.6026, + "step": 1732 + }, + { + "epoch": 0.3080888888888889, + "grad_norm": 0.23537103831768036, + "learning_rate": 3.918747251726463e-05, + "loss": 0.5448, + "step": 1733 + }, + { + "epoch": 0.3082666666666667, + "grad_norm": 0.27476415038108826, + "learning_rate": 3.9175973833153186e-05, + "loss": 0.4069, + "step": 1734 + }, + { + "epoch": 0.30844444444444447, + "grad_norm": 0.2562309205532074, + "learning_rate": 3.9164470727155314e-05, + "loss": 0.4997, + "step": 1735 + }, + { + "epoch": 0.30862222222222224, + "grad_norm": 0.2508479058742523, + "learning_rate": 3.915296320285917e-05, + "loss": 0.4769, + "step": 1736 + }, + { + "epoch": 0.3088, + "grad_norm": 0.2374836951494217, + "learning_rate": 3.914145126385426e-05, + "loss": 0.5252, + "step": 1737 + }, + { + "epoch": 0.3089777777777778, + "grad_norm": 0.22735223174095154, + "learning_rate": 3.91299349137315e-05, + "loss": 0.519, + "step": 1738 + }, + { + "epoch": 0.3091555555555556, + "grad_norm": 0.3538740575313568, + "learning_rate": 3.911841415608315e-05, + "loss": 0.5471, + "step": 1739 + }, + { + "epoch": 0.30933333333333335, + "grad_norm": 0.2828303277492523, + "learning_rate": 3.9106888994502864e-05, + "loss": 0.5914, + "step": 1740 + }, + { + "epoch": 0.3095111111111111, + "grad_norm": 0.2548404037952423, + "learning_rate": 3.909535943258567e-05, + "loss": 0.4444, + "step": 1741 + }, + { + "epoch": 0.3096888888888889, + "grad_norm": 0.22350788116455078, + "learning_rate": 3.908382547392796e-05, + "loss": 0.3887, + "step": 1742 + }, + { + "epoch": 0.3098666666666667, + "grad_norm": 0.2685641348361969, + "learning_rate": 3.907228712212751e-05, + "loss": 0.462, + "step": 1743 + }, + { + "epoch": 0.31004444444444446, + "grad_norm": 0.26695823669433594, + "learning_rate": 3.9060744380783435e-05, + "loss": 0.4314, + "step": 1744 + }, + { + "epoch": 0.31022222222222223, + "grad_norm": 0.29361921548843384, + "learning_rate": 3.9049197253496264e-05, + "loss": 0.3711, + "step": 1745 + }, + { + "epoch": 0.3104, + "grad_norm": 0.33422568440437317, + "learning_rate": 3.903764574386786e-05, + "loss": 0.4161, + "step": 1746 + }, + { + "epoch": 0.3105777777777778, + "grad_norm": 0.3214603066444397, + "learning_rate": 3.902608985550147e-05, + "loss": 0.5017, + "step": 1747 + }, + { + "epoch": 0.31075555555555556, + "grad_norm": 0.2901337444782257, + "learning_rate": 3.9014529592001705e-05, + "loss": 0.3689, + "step": 1748 + }, + { + "epoch": 0.31093333333333334, + "grad_norm": 0.43314751982688904, + "learning_rate": 3.900296495697453e-05, + "loss": 0.5553, + "step": 1749 + }, + { + "epoch": 0.3111111111111111, + "grad_norm": 0.4667474031448364, + "learning_rate": 3.899139595402729e-05, + "loss": 0.436, + "step": 1750 + }, + { + "epoch": 0.3112888888888889, + "grad_norm": 0.22450193762779236, + "learning_rate": 3.897982258676867e-05, + "loss": 0.4632, + "step": 1751 + }, + { + "epoch": 0.31146666666666667, + "grad_norm": 0.21711860597133636, + "learning_rate": 3.896824485880874e-05, + "loss": 0.4069, + "step": 1752 + }, + { + "epoch": 0.31164444444444445, + "grad_norm": 0.2716481685638428, + "learning_rate": 3.895666277375892e-05, + "loss": 0.4671, + "step": 1753 + }, + { + "epoch": 0.3118222222222222, + "grad_norm": 0.2682226896286011, + "learning_rate": 3.894507633523199e-05, + "loss": 0.4991, + "step": 1754 + }, + { + "epoch": 0.312, + "grad_norm": 0.25763416290283203, + "learning_rate": 3.8933485546842094e-05, + "loss": 0.5941, + "step": 1755 + }, + { + "epoch": 0.3121777777777778, + "grad_norm": 0.24517154693603516, + "learning_rate": 3.8921890412204705e-05, + "loss": 0.452, + "step": 1756 + }, + { + "epoch": 0.31235555555555555, + "grad_norm": 0.2549922466278076, + "learning_rate": 3.891029093493669e-05, + "loss": 0.5495, + "step": 1757 + }, + { + "epoch": 0.31253333333333333, + "grad_norm": 0.23597003519535065, + "learning_rate": 3.889868711865624e-05, + "loss": 0.4653, + "step": 1758 + }, + { + "epoch": 0.3127111111111111, + "grad_norm": 0.27230918407440186, + "learning_rate": 3.8887078966982925e-05, + "loss": 0.5556, + "step": 1759 + }, + { + "epoch": 0.3128888888888889, + "grad_norm": 0.2747502624988556, + "learning_rate": 3.887546648353765e-05, + "loss": 0.4753, + "step": 1760 + }, + { + "epoch": 0.31306666666666666, + "grad_norm": 0.26325714588165283, + "learning_rate": 3.8863849671942685e-05, + "loss": 0.5776, + "step": 1761 + }, + { + "epoch": 0.31324444444444444, + "grad_norm": 0.2610800266265869, + "learning_rate": 3.885222853582163e-05, + "loss": 0.4123, + "step": 1762 + }, + { + "epoch": 0.3134222222222222, + "grad_norm": 0.21665984392166138, + "learning_rate": 3.8840603078799445e-05, + "loss": 0.3762, + "step": 1763 + }, + { + "epoch": 0.3136, + "grad_norm": 0.24290180206298828, + "learning_rate": 3.8828973304502446e-05, + "loss": 0.4575, + "step": 1764 + }, + { + "epoch": 0.31377777777777777, + "grad_norm": 0.24685519933700562, + "learning_rate": 3.881733921655829e-05, + "loss": 0.571, + "step": 1765 + }, + { + "epoch": 0.31395555555555554, + "grad_norm": 0.2319568246603012, + "learning_rate": 3.880570081859597e-05, + "loss": 0.414, + "step": 1766 + }, + { + "epoch": 0.3141333333333333, + "grad_norm": 0.30388307571411133, + "learning_rate": 3.879405811424583e-05, + "loss": 0.7389, + "step": 1767 + }, + { + "epoch": 0.3143111111111111, + "grad_norm": 0.2181704044342041, + "learning_rate": 3.8782411107139564e-05, + "loss": 0.4981, + "step": 1768 + }, + { + "epoch": 0.3144888888888889, + "grad_norm": 0.21128003299236298, + "learning_rate": 3.87707598009102e-05, + "loss": 0.4114, + "step": 1769 + }, + { + "epoch": 0.31466666666666665, + "grad_norm": 0.2682114243507385, + "learning_rate": 3.875910419919211e-05, + "loss": 0.5918, + "step": 1770 + }, + { + "epoch": 0.3148444444444444, + "grad_norm": 0.2793351113796234, + "learning_rate": 3.8747444305621e-05, + "loss": 0.5388, + "step": 1771 + }, + { + "epoch": 0.3150222222222222, + "grad_norm": 0.20167353749275208, + "learning_rate": 3.873578012383393e-05, + "loss": 0.4654, + "step": 1772 + }, + { + "epoch": 0.3152, + "grad_norm": 0.2164766937494278, + "learning_rate": 3.872411165746927e-05, + "loss": 0.5353, + "step": 1773 + }, + { + "epoch": 0.31537777777777776, + "grad_norm": 0.19288529455661774, + "learning_rate": 3.871243891016676e-05, + "loss": 0.4798, + "step": 1774 + }, + { + "epoch": 0.31555555555555553, + "grad_norm": 0.2466810792684555, + "learning_rate": 3.870076188556746e-05, + "loss": 0.4882, + "step": 1775 + }, + { + "epoch": 0.3157333333333333, + "grad_norm": 0.2654877007007599, + "learning_rate": 3.868908058731376e-05, + "loss": 0.5043, + "step": 1776 + }, + { + "epoch": 0.3159111111111111, + "grad_norm": 0.23393423855304718, + "learning_rate": 3.867739501904938e-05, + "loss": 0.4672, + "step": 1777 + }, + { + "epoch": 0.31608888888888886, + "grad_norm": 0.24775564670562744, + "learning_rate": 3.8665705184419386e-05, + "loss": 0.6202, + "step": 1778 + }, + { + "epoch": 0.31626666666666664, + "grad_norm": 0.2079147845506668, + "learning_rate": 3.865401108707017e-05, + "loss": 0.4263, + "step": 1779 + }, + { + "epoch": 0.3164444444444444, + "grad_norm": 0.2793065011501312, + "learning_rate": 3.864231273064944e-05, + "loss": 0.4726, + "step": 1780 + }, + { + "epoch": 0.31662222222222225, + "grad_norm": 0.23752595484256744, + "learning_rate": 3.8630610118806254e-05, + "loss": 0.5576, + "step": 1781 + }, + { + "epoch": 0.3168, + "grad_norm": 0.22386451065540314, + "learning_rate": 3.861890325519098e-05, + "loss": 0.5332, + "step": 1782 + }, + { + "epoch": 0.3169777777777778, + "grad_norm": 0.2831936180591583, + "learning_rate": 3.8607192143455326e-05, + "loss": 0.7237, + "step": 1783 + }, + { + "epoch": 0.3171555555555556, + "grad_norm": 0.24557265639305115, + "learning_rate": 3.859547678725231e-05, + "loss": 0.5024, + "step": 1784 + }, + { + "epoch": 0.31733333333333336, + "grad_norm": 0.30713656544685364, + "learning_rate": 3.858375719023629e-05, + "loss": 0.5171, + "step": 1785 + }, + { + "epoch": 0.31751111111111113, + "grad_norm": 0.2589064836502075, + "learning_rate": 3.8572033356062943e-05, + "loss": 0.5712, + "step": 1786 + }, + { + "epoch": 0.3176888888888889, + "grad_norm": 0.2590896785259247, + "learning_rate": 3.856030528838925e-05, + "loss": 0.5295, + "step": 1787 + }, + { + "epoch": 0.3178666666666667, + "grad_norm": 0.2632320523262024, + "learning_rate": 3.854857299087353e-05, + "loss": 0.3963, + "step": 1788 + }, + { + "epoch": 0.31804444444444446, + "grad_norm": 0.26224273443222046, + "learning_rate": 3.853683646717543e-05, + "loss": 0.4764, + "step": 1789 + }, + { + "epoch": 0.31822222222222224, + "grad_norm": 0.26053351163864136, + "learning_rate": 3.852509572095588e-05, + "loss": 0.463, + "step": 1790 + }, + { + "epoch": 0.3184, + "grad_norm": 0.22330141067504883, + "learning_rate": 3.851335075587718e-05, + "loss": 0.3735, + "step": 1791 + }, + { + "epoch": 0.3185777777777778, + "grad_norm": 0.27002251148223877, + "learning_rate": 3.85016015756029e-05, + "loss": 0.3804, + "step": 1792 + }, + { + "epoch": 0.31875555555555557, + "grad_norm": 0.2598581314086914, + "learning_rate": 3.848984818379793e-05, + "loss": 0.4206, + "step": 1793 + }, + { + "epoch": 0.31893333333333335, + "grad_norm": 0.37808501720428467, + "learning_rate": 3.84780905841285e-05, + "loss": 0.466, + "step": 1794 + }, + { + "epoch": 0.3191111111111111, + "grad_norm": 0.36060091853141785, + "learning_rate": 3.846632878026214e-05, + "loss": 0.4189, + "step": 1795 + }, + { + "epoch": 0.3192888888888889, + "grad_norm": 0.32629507780075073, + "learning_rate": 3.8454562775867684e-05, + "loss": 0.3959, + "step": 1796 + }, + { + "epoch": 0.3194666666666667, + "grad_norm": 0.36942774057388306, + "learning_rate": 3.8442792574615275e-05, + "loss": 0.4132, + "step": 1797 + }, + { + "epoch": 0.31964444444444445, + "grad_norm": 0.38746127486228943, + "learning_rate": 3.843101818017637e-05, + "loss": 0.3563, + "step": 1798 + }, + { + "epoch": 0.31982222222222223, + "grad_norm": 0.34891799092292786, + "learning_rate": 3.841923959622375e-05, + "loss": 0.5363, + "step": 1799 + }, + { + "epoch": 0.32, + "grad_norm": 0.44006434082984924, + "learning_rate": 3.840745682643147e-05, + "loss": 0.4849, + "step": 1800 + }, + { + "epoch": 0.3201777777777778, + "grad_norm": 0.2698839008808136, + "learning_rate": 3.8395669874474915e-05, + "loss": 0.5459, + "step": 1801 + }, + { + "epoch": 0.32035555555555556, + "grad_norm": 0.30511969327926636, + "learning_rate": 3.8383878744030776e-05, + "loss": 0.5023, + "step": 1802 + }, + { + "epoch": 0.32053333333333334, + "grad_norm": 0.3474431335926056, + "learning_rate": 3.837208343877703e-05, + "loss": 0.5084, + "step": 1803 + }, + { + "epoch": 0.3207111111111111, + "grad_norm": 0.3954322040081024, + "learning_rate": 3.836028396239297e-05, + "loss": 0.8113, + "step": 1804 + }, + { + "epoch": 0.3208888888888889, + "grad_norm": 0.28120672702789307, + "learning_rate": 3.834848031855919e-05, + "loss": 0.5473, + "step": 1805 + }, + { + "epoch": 0.32106666666666667, + "grad_norm": 0.37416958808898926, + "learning_rate": 3.8336672510957574e-05, + "loss": 0.4418, + "step": 1806 + }, + { + "epoch": 0.32124444444444444, + "grad_norm": 0.27026987075805664, + "learning_rate": 3.83248605432713e-05, + "loss": 0.5303, + "step": 1807 + }, + { + "epoch": 0.3214222222222222, + "grad_norm": 0.26826706528663635, + "learning_rate": 3.8313044419184873e-05, + "loss": 0.6264, + "step": 1808 + }, + { + "epoch": 0.3216, + "grad_norm": 0.3161207139492035, + "learning_rate": 3.830122414238406e-05, + "loss": 0.4557, + "step": 1809 + }, + { + "epoch": 0.3217777777777778, + "grad_norm": 0.2921457886695862, + "learning_rate": 3.828939971655595e-05, + "loss": 0.3211, + "step": 1810 + }, + { + "epoch": 0.32195555555555555, + "grad_norm": 0.2892923653125763, + "learning_rate": 3.827757114538892e-05, + "loss": 0.3958, + "step": 1811 + }, + { + "epoch": 0.3221333333333333, + "grad_norm": 0.33940258622169495, + "learning_rate": 3.826573843257262e-05, + "loss": 0.5927, + "step": 1812 + }, + { + "epoch": 0.3223111111111111, + "grad_norm": 0.2379864603281021, + "learning_rate": 3.8253901581798016e-05, + "loss": 0.3848, + "step": 1813 + }, + { + "epoch": 0.3224888888888889, + "grad_norm": 0.2523452043533325, + "learning_rate": 3.824206059675736e-05, + "loss": 0.5374, + "step": 1814 + }, + { + "epoch": 0.32266666666666666, + "grad_norm": 0.22038324177265167, + "learning_rate": 3.823021548114417e-05, + "loss": 0.4294, + "step": 1815 + }, + { + "epoch": 0.32284444444444443, + "grad_norm": 0.29588496685028076, + "learning_rate": 3.821836623865329e-05, + "loss": 0.6109, + "step": 1816 + }, + { + "epoch": 0.3230222222222222, + "grad_norm": 0.21434426307678223, + "learning_rate": 3.820651287298084e-05, + "loss": 0.401, + "step": 1817 + }, + { + "epoch": 0.3232, + "grad_norm": 0.2023949921131134, + "learning_rate": 3.81946553878242e-05, + "loss": 0.3811, + "step": 1818 + }, + { + "epoch": 0.32337777777777776, + "grad_norm": 0.21868082880973816, + "learning_rate": 3.8182793786882065e-05, + "loss": 0.3081, + "step": 1819 + }, + { + "epoch": 0.32355555555555554, + "grad_norm": 0.24043965339660645, + "learning_rate": 3.8170928073854396e-05, + "loss": 0.5559, + "step": 1820 + }, + { + "epoch": 0.3237333333333333, + "grad_norm": 0.25507602095603943, + "learning_rate": 3.8159058252442446e-05, + "loss": 0.3773, + "step": 1821 + }, + { + "epoch": 0.3239111111111111, + "grad_norm": 0.23844337463378906, + "learning_rate": 3.814718432634876e-05, + "loss": 0.5183, + "step": 1822 + }, + { + "epoch": 0.32408888888888887, + "grad_norm": 0.25890296697616577, + "learning_rate": 3.813530629927714e-05, + "loss": 0.6071, + "step": 1823 + }, + { + "epoch": 0.32426666666666665, + "grad_norm": 0.23044650256633759, + "learning_rate": 3.8123424174932674e-05, + "loss": 0.4995, + "step": 1824 + }, + { + "epoch": 0.3244444444444444, + "grad_norm": 0.29565560817718506, + "learning_rate": 3.811153795702174e-05, + "loss": 0.4878, + "step": 1825 + }, + { + "epoch": 0.3246222222222222, + "grad_norm": 0.2706906497478485, + "learning_rate": 3.8099647649251986e-05, + "loss": 0.4275, + "step": 1826 + }, + { + "epoch": 0.3248, + "grad_norm": 0.24134431779384613, + "learning_rate": 3.808775325533232e-05, + "loss": 0.4621, + "step": 1827 + }, + { + "epoch": 0.32497777777777775, + "grad_norm": 0.28100982308387756, + "learning_rate": 3.8075854778972955e-05, + "loss": 0.4145, + "step": 1828 + }, + { + "epoch": 0.32515555555555553, + "grad_norm": 0.23400017619132996, + "learning_rate": 3.806395222388536e-05, + "loss": 0.5443, + "step": 1829 + }, + { + "epoch": 0.3253333333333333, + "grad_norm": 0.25233718752861023, + "learning_rate": 3.805204559378227e-05, + "loss": 0.359, + "step": 1830 + }, + { + "epoch": 0.3255111111111111, + "grad_norm": 0.2422688603401184, + "learning_rate": 3.80401348923777e-05, + "loss": 0.4645, + "step": 1831 + }, + { + "epoch": 0.3256888888888889, + "grad_norm": 0.20790573954582214, + "learning_rate": 3.802822012338694e-05, + "loss": 0.433, + "step": 1832 + }, + { + "epoch": 0.3258666666666667, + "grad_norm": 0.32703182101249695, + "learning_rate": 3.8016301290526534e-05, + "loss": 0.6004, + "step": 1833 + }, + { + "epoch": 0.32604444444444447, + "grad_norm": 0.27112576365470886, + "learning_rate": 3.8004378397514315e-05, + "loss": 0.4776, + "step": 1834 + }, + { + "epoch": 0.32622222222222225, + "grad_norm": 0.27799126505851746, + "learning_rate": 3.799245144806937e-05, + "loss": 0.3902, + "step": 1835 + }, + { + "epoch": 0.3264, + "grad_norm": 0.21686284244060516, + "learning_rate": 3.798052044591204e-05, + "loss": 0.3964, + "step": 1836 + }, + { + "epoch": 0.3265777777777778, + "grad_norm": 0.23352764546871185, + "learning_rate": 3.796858539476394e-05, + "loss": 0.4619, + "step": 1837 + }, + { + "epoch": 0.3267555555555556, + "grad_norm": 0.22777606546878815, + "learning_rate": 3.7956646298347956e-05, + "loss": 0.4216, + "step": 1838 + }, + { + "epoch": 0.32693333333333335, + "grad_norm": 0.250166654586792, + "learning_rate": 3.7944703160388234e-05, + "loss": 0.4123, + "step": 1839 + }, + { + "epoch": 0.32711111111111113, + "grad_norm": 0.2995658814907074, + "learning_rate": 3.793275598461017e-05, + "loss": 0.3242, + "step": 1840 + }, + { + "epoch": 0.3272888888888889, + "grad_norm": 0.2961699664592743, + "learning_rate": 3.792080477474043e-05, + "loss": 0.4628, + "step": 1841 + }, + { + "epoch": 0.3274666666666667, + "grad_norm": 0.25068238377571106, + "learning_rate": 3.790884953450692e-05, + "loss": 0.3664, + "step": 1842 + }, + { + "epoch": 0.32764444444444446, + "grad_norm": 0.2917799651622772, + "learning_rate": 3.789689026763883e-05, + "loss": 0.3605, + "step": 1843 + }, + { + "epoch": 0.32782222222222224, + "grad_norm": 0.2906927168369293, + "learning_rate": 3.788492697786658e-05, + "loss": 0.4569, + "step": 1844 + }, + { + "epoch": 0.328, + "grad_norm": 0.2979743182659149, + "learning_rate": 3.7872959668921884e-05, + "loss": 0.4687, + "step": 1845 + }, + { + "epoch": 0.3281777777777778, + "grad_norm": 0.3796265721321106, + "learning_rate": 3.786098834453766e-05, + "loss": 0.4653, + "step": 1846 + }, + { + "epoch": 0.32835555555555557, + "grad_norm": 0.38148266077041626, + "learning_rate": 3.7849013008448115e-05, + "loss": 0.4448, + "step": 1847 + }, + { + "epoch": 0.32853333333333334, + "grad_norm": 0.3751465678215027, + "learning_rate": 3.783703366438868e-05, + "loss": 0.4196, + "step": 1848 + }, + { + "epoch": 0.3287111111111111, + "grad_norm": 0.5386833548545837, + "learning_rate": 3.782505031609607e-05, + "loss": 0.4503, + "step": 1849 + }, + { + "epoch": 0.3288888888888889, + "grad_norm": 0.4277711808681488, + "learning_rate": 3.78130629673082e-05, + "loss": 0.4275, + "step": 1850 + }, + { + "epoch": 0.3290666666666667, + "grad_norm": 0.21239469945430756, + "learning_rate": 3.780107162176429e-05, + "loss": 0.3547, + "step": 1851 + }, + { + "epoch": 0.32924444444444445, + "grad_norm": 0.2635221779346466, + "learning_rate": 3.778907628320477e-05, + "loss": 0.5979, + "step": 1852 + }, + { + "epoch": 0.3294222222222222, + "grad_norm": 0.292133629322052, + "learning_rate": 3.777707695537133e-05, + "loss": 0.6329, + "step": 1853 + }, + { + "epoch": 0.3296, + "grad_norm": 0.28475481271743774, + "learning_rate": 3.776507364200689e-05, + "loss": 0.4102, + "step": 1854 + }, + { + "epoch": 0.3297777777777778, + "grad_norm": 0.2551453709602356, + "learning_rate": 3.775306634685562e-05, + "loss": 0.6214, + "step": 1855 + }, + { + "epoch": 0.32995555555555556, + "grad_norm": 0.26289665699005127, + "learning_rate": 3.7741055073662946e-05, + "loss": 0.4735, + "step": 1856 + }, + { + "epoch": 0.33013333333333333, + "grad_norm": 0.3306237757205963, + "learning_rate": 3.772903982617552e-05, + "loss": 0.517, + "step": 1857 + }, + { + "epoch": 0.3303111111111111, + "grad_norm": 0.2637612223625183, + "learning_rate": 3.771702060814123e-05, + "loss": 0.4701, + "step": 1858 + }, + { + "epoch": 0.3304888888888889, + "grad_norm": 0.22696034610271454, + "learning_rate": 3.770499742330922e-05, + "loss": 0.4851, + "step": 1859 + }, + { + "epoch": 0.33066666666666666, + "grad_norm": 0.282044917345047, + "learning_rate": 3.769297027542985e-05, + "loss": 0.4875, + "step": 1860 + }, + { + "epoch": 0.33084444444444444, + "grad_norm": 0.274105966091156, + "learning_rate": 3.7680939168254733e-05, + "loss": 0.4224, + "step": 1861 + }, + { + "epoch": 0.3310222222222222, + "grad_norm": 0.2340739667415619, + "learning_rate": 3.7668904105536706e-05, + "loss": 0.4379, + "step": 1862 + }, + { + "epoch": 0.3312, + "grad_norm": 0.23340032994747162, + "learning_rate": 3.765686509102985e-05, + "loss": 0.4753, + "step": 1863 + }, + { + "epoch": 0.33137777777777777, + "grad_norm": 0.2583799660205841, + "learning_rate": 3.764482212848948e-05, + "loss": 0.4218, + "step": 1864 + }, + { + "epoch": 0.33155555555555555, + "grad_norm": 0.20819202065467834, + "learning_rate": 3.7632775221672115e-05, + "loss": 0.4144, + "step": 1865 + }, + { + "epoch": 0.3317333333333333, + "grad_norm": 0.22011113166809082, + "learning_rate": 3.762072437433555e-05, + "loss": 0.377, + "step": 1866 + }, + { + "epoch": 0.3319111111111111, + "grad_norm": 0.25663116574287415, + "learning_rate": 3.760866959023877e-05, + "loss": 0.5028, + "step": 1867 + }, + { + "epoch": 0.3320888888888889, + "grad_norm": 0.24224042892456055, + "learning_rate": 3.759661087314199e-05, + "loss": 0.5362, + "step": 1868 + }, + { + "epoch": 0.33226666666666665, + "grad_norm": 0.26839980483055115, + "learning_rate": 3.7584548226806696e-05, + "loss": 0.4201, + "step": 1869 + }, + { + "epoch": 0.33244444444444443, + "grad_norm": 0.2340385615825653, + "learning_rate": 3.757248165499555e-05, + "loss": 0.5805, + "step": 1870 + }, + { + "epoch": 0.3326222222222222, + "grad_norm": 0.2702961564064026, + "learning_rate": 3.7560411161472456e-05, + "loss": 0.5838, + "step": 1871 + }, + { + "epoch": 0.3328, + "grad_norm": 0.28083211183547974, + "learning_rate": 3.7548336750002544e-05, + "loss": 0.3691, + "step": 1872 + }, + { + "epoch": 0.33297777777777776, + "grad_norm": 0.3152126967906952, + "learning_rate": 3.753625842435216e-05, + "loss": 0.682, + "step": 1873 + }, + { + "epoch": 0.33315555555555554, + "grad_norm": 0.26895955204963684, + "learning_rate": 3.752417618828888e-05, + "loss": 0.5986, + "step": 1874 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.20872604846954346, + "learning_rate": 3.751209004558149e-05, + "loss": 0.4727, + "step": 1875 + }, + { + "epoch": 0.3335111111111111, + "grad_norm": 0.2611149251461029, + "learning_rate": 3.7500000000000003e-05, + "loss": 0.6079, + "step": 1876 + }, + { + "epoch": 0.33368888888888887, + "grad_norm": 0.23145298659801483, + "learning_rate": 3.748790605531565e-05, + "loss": 0.4827, + "step": 1877 + }, + { + "epoch": 0.33386666666666664, + "grad_norm": 0.25269511342048645, + "learning_rate": 3.7475808215300854e-05, + "loss": 0.4719, + "step": 1878 + }, + { + "epoch": 0.3340444444444444, + "grad_norm": 0.19355008006095886, + "learning_rate": 3.7463706483729296e-05, + "loss": 0.4846, + "step": 1879 + }, + { + "epoch": 0.3342222222222222, + "grad_norm": 0.3000943660736084, + "learning_rate": 3.7451600864375844e-05, + "loss": 0.4581, + "step": 1880 + }, + { + "epoch": 0.3344, + "grad_norm": 0.23026035726070404, + "learning_rate": 3.7439491361016564e-05, + "loss": 0.4171, + "step": 1881 + }, + { + "epoch": 0.33457777777777775, + "grad_norm": 0.27065929770469666, + "learning_rate": 3.742737797742878e-05, + "loss": 0.3958, + "step": 1882 + }, + { + "epoch": 0.33475555555555553, + "grad_norm": 0.3308534026145935, + "learning_rate": 3.741526071739097e-05, + "loss": 0.5279, + "step": 1883 + }, + { + "epoch": 0.33493333333333336, + "grad_norm": 0.2427447885274887, + "learning_rate": 3.740313958468287e-05, + "loss": 0.4405, + "step": 1884 + }, + { + "epoch": 0.33511111111111114, + "grad_norm": 0.2443220615386963, + "learning_rate": 3.7391014583085385e-05, + "loss": 0.4932, + "step": 1885 + }, + { + "epoch": 0.3352888888888889, + "grad_norm": 0.2655545175075531, + "learning_rate": 3.7378885716380664e-05, + "loss": 0.5911, + "step": 1886 + }, + { + "epoch": 0.3354666666666667, + "grad_norm": 0.2868138551712036, + "learning_rate": 3.736675298835203e-05, + "loss": 0.5663, + "step": 1887 + }, + { + "epoch": 0.33564444444444447, + "grad_norm": 0.26341116428375244, + "learning_rate": 3.7354616402784035e-05, + "loss": 0.4665, + "step": 1888 + }, + { + "epoch": 0.33582222222222224, + "grad_norm": 0.26280859112739563, + "learning_rate": 3.734247596346242e-05, + "loss": 0.5051, + "step": 1889 + }, + { + "epoch": 0.336, + "grad_norm": 0.2871548533439636, + "learning_rate": 3.7330331674174125e-05, + "loss": 0.3198, + "step": 1890 + }, + { + "epoch": 0.3361777777777778, + "grad_norm": 0.26460519433021545, + "learning_rate": 3.731818353870729e-05, + "loss": 0.4056, + "step": 1891 + }, + { + "epoch": 0.3363555555555556, + "grad_norm": 0.281025767326355, + "learning_rate": 3.7306031560851275e-05, + "loss": 0.3658, + "step": 1892 + }, + { + "epoch": 0.33653333333333335, + "grad_norm": 0.2817701995372772, + "learning_rate": 3.729387574439662e-05, + "loss": 0.448, + "step": 1893 + }, + { + "epoch": 0.3367111111111111, + "grad_norm": 0.3609563708305359, + "learning_rate": 3.7281716093135063e-05, + "loss": 0.4345, + "step": 1894 + }, + { + "epoch": 0.3368888888888889, + "grad_norm": 0.3648139536380768, + "learning_rate": 3.726955261085956e-05, + "loss": 0.4839, + "step": 1895 + }, + { + "epoch": 0.3370666666666667, + "grad_norm": 0.33719033002853394, + "learning_rate": 3.725738530136422e-05, + "loss": 0.4134, + "step": 1896 + }, + { + "epoch": 0.33724444444444446, + "grad_norm": 0.4368197023868561, + "learning_rate": 3.7245214168444386e-05, + "loss": 0.5851, + "step": 1897 + }, + { + "epoch": 0.33742222222222223, + "grad_norm": 0.3872874677181244, + "learning_rate": 3.723303921589657e-05, + "loss": 0.3736, + "step": 1898 + }, + { + "epoch": 0.3376, + "grad_norm": 0.37986788153648376, + "learning_rate": 3.722086044751849e-05, + "loss": 0.4076, + "step": 1899 + }, + { + "epoch": 0.3377777777777778, + "grad_norm": 0.437290757894516, + "learning_rate": 3.720867786710904e-05, + "loss": 0.5952, + "step": 1900 + }, + { + "epoch": 0.33795555555555556, + "grad_norm": 0.3028965890407562, + "learning_rate": 3.719649147846832e-05, + "loss": 0.5245, + "step": 1901 + }, + { + "epoch": 0.33813333333333334, + "grad_norm": 0.30047425627708435, + "learning_rate": 3.71843012853976e-05, + "loss": 0.4661, + "step": 1902 + }, + { + "epoch": 0.3383111111111111, + "grad_norm": 0.25994980335235596, + "learning_rate": 3.717210729169935e-05, + "loss": 0.5909, + "step": 1903 + }, + { + "epoch": 0.3384888888888889, + "grad_norm": 0.23889589309692383, + "learning_rate": 3.7159909501177226e-05, + "loss": 0.5156, + "step": 1904 + }, + { + "epoch": 0.33866666666666667, + "grad_norm": 0.4134761393070221, + "learning_rate": 3.7147707917636046e-05, + "loss": 0.5898, + "step": 1905 + }, + { + "epoch": 0.33884444444444445, + "grad_norm": 0.253187358379364, + "learning_rate": 3.713550254488185e-05, + "loss": 0.497, + "step": 1906 + }, + { + "epoch": 0.3390222222222222, + "grad_norm": 0.24367600679397583, + "learning_rate": 3.712329338672182e-05, + "loss": 0.5153, + "step": 1907 + }, + { + "epoch": 0.3392, + "grad_norm": 0.27530232071876526, + "learning_rate": 3.711108044696436e-05, + "loss": 0.4683, + "step": 1908 + }, + { + "epoch": 0.3393777777777778, + "grad_norm": 0.3312942087650299, + "learning_rate": 3.7098863729419e-05, + "loss": 0.5108, + "step": 1909 + }, + { + "epoch": 0.33955555555555555, + "grad_norm": 0.27320244908332825, + "learning_rate": 3.7086643237896504e-05, + "loss": 0.5039, + "step": 1910 + }, + { + "epoch": 0.33973333333333333, + "grad_norm": 0.237113818526268, + "learning_rate": 3.7074418976208766e-05, + "loss": 0.379, + "step": 1911 + }, + { + "epoch": 0.3399111111111111, + "grad_norm": 0.21813713014125824, + "learning_rate": 3.706219094816891e-05, + "loss": 0.5419, + "step": 1912 + }, + { + "epoch": 0.3400888888888889, + "grad_norm": 0.314559668302536, + "learning_rate": 3.704995915759117e-05, + "loss": 0.3644, + "step": 1913 + }, + { + "epoch": 0.34026666666666666, + "grad_norm": 0.2437450885772705, + "learning_rate": 3.7037723608291015e-05, + "loss": 0.6255, + "step": 1914 + }, + { + "epoch": 0.34044444444444444, + "grad_norm": 0.24562908709049225, + "learning_rate": 3.7025484304085034e-05, + "loss": 0.3466, + "step": 1915 + }, + { + "epoch": 0.3406222222222222, + "grad_norm": 0.2487138956785202, + "learning_rate": 3.701324124879102e-05, + "loss": 0.441, + "step": 1916 + }, + { + "epoch": 0.3408, + "grad_norm": 0.23318137228488922, + "learning_rate": 3.700099444622794e-05, + "loss": 0.4717, + "step": 1917 + }, + { + "epoch": 0.34097777777777777, + "grad_norm": 0.23189052939414978, + "learning_rate": 3.6988743900215894e-05, + "loss": 0.4128, + "step": 1918 + }, + { + "epoch": 0.34115555555555555, + "grad_norm": 0.2886887788772583, + "learning_rate": 3.69764896145762e-05, + "loss": 0.6373, + "step": 1919 + }, + { + "epoch": 0.3413333333333333, + "grad_norm": 0.23716111481189728, + "learning_rate": 3.696423159313129e-05, + "loss": 0.5031, + "step": 1920 + }, + { + "epoch": 0.3415111111111111, + "grad_norm": 0.26277533173561096, + "learning_rate": 3.695196983970481e-05, + "loss": 0.4631, + "step": 1921 + }, + { + "epoch": 0.3416888888888889, + "grad_norm": 0.281986266374588, + "learning_rate": 3.693970435812153e-05, + "loss": 0.5281, + "step": 1922 + }, + { + "epoch": 0.34186666666666665, + "grad_norm": 0.20242241024971008, + "learning_rate": 3.6927435152207406e-05, + "loss": 0.4111, + "step": 1923 + }, + { + "epoch": 0.34204444444444443, + "grad_norm": 0.2568512558937073, + "learning_rate": 3.6915162225789546e-05, + "loss": 0.4841, + "step": 1924 + }, + { + "epoch": 0.3422222222222222, + "grad_norm": 0.37452176213264465, + "learning_rate": 3.690288558269623e-05, + "loss": 0.5462, + "step": 1925 + }, + { + "epoch": 0.3424, + "grad_norm": 0.29145917296409607, + "learning_rate": 3.689060522675689e-05, + "loss": 0.5189, + "step": 1926 + }, + { + "epoch": 0.34257777777777776, + "grad_norm": 0.3490188419818878, + "learning_rate": 3.6878321161802104e-05, + "loss": 0.6426, + "step": 1927 + }, + { + "epoch": 0.34275555555555554, + "grad_norm": 0.2431727647781372, + "learning_rate": 3.686603339166362e-05, + "loss": 0.5114, + "step": 1928 + }, + { + "epoch": 0.3429333333333333, + "grad_norm": 0.23919442296028137, + "learning_rate": 3.685374192017436e-05, + "loss": 0.4502, + "step": 1929 + }, + { + "epoch": 0.3431111111111111, + "grad_norm": 0.26660627126693726, + "learning_rate": 3.6841446751168355e-05, + "loss": 0.58, + "step": 1930 + }, + { + "epoch": 0.34328888888888887, + "grad_norm": 0.29465609788894653, + "learning_rate": 3.682914788848083e-05, + "loss": 0.501, + "step": 1931 + }, + { + "epoch": 0.34346666666666664, + "grad_norm": 0.2881031930446625, + "learning_rate": 3.681684533594815e-05, + "loss": 0.4499, + "step": 1932 + }, + { + "epoch": 0.3436444444444444, + "grad_norm": 0.2623785734176636, + "learning_rate": 3.680453909740782e-05, + "loss": 0.4368, + "step": 1933 + }, + { + "epoch": 0.3438222222222222, + "grad_norm": 0.22271504998207092, + "learning_rate": 3.679222917669851e-05, + "loss": 0.5817, + "step": 1934 + }, + { + "epoch": 0.344, + "grad_norm": 0.2615748345851898, + "learning_rate": 3.6779915577660015e-05, + "loss": 0.4364, + "step": 1935 + }, + { + "epoch": 0.3441777777777778, + "grad_norm": 0.19897601008415222, + "learning_rate": 3.6767598304133324e-05, + "loss": 0.3466, + "step": 1936 + }, + { + "epoch": 0.3443555555555556, + "grad_norm": 0.30668073892593384, + "learning_rate": 3.67552773599605e-05, + "loss": 0.4921, + "step": 1937 + }, + { + "epoch": 0.34453333333333336, + "grad_norm": 0.28807684779167175, + "learning_rate": 3.674295274898485e-05, + "loss": 0.6456, + "step": 1938 + }, + { + "epoch": 0.34471111111111113, + "grad_norm": 0.24431774020195007, + "learning_rate": 3.673062447505072e-05, + "loss": 0.5434, + "step": 1939 + }, + { + "epoch": 0.3448888888888889, + "grad_norm": 0.27331554889678955, + "learning_rate": 3.6718292542003666e-05, + "loss": 0.3745, + "step": 1940 + }, + { + "epoch": 0.3450666666666667, + "grad_norm": 0.24217864871025085, + "learning_rate": 3.6705956953690364e-05, + "loss": 0.4175, + "step": 1941 + }, + { + "epoch": 0.34524444444444446, + "grad_norm": 0.3288584053516388, + "learning_rate": 3.6693617713958634e-05, + "loss": 0.4358, + "step": 1942 + }, + { + "epoch": 0.34542222222222224, + "grad_norm": 0.22553203999996185, + "learning_rate": 3.668127482665743e-05, + "loss": 0.3799, + "step": 1943 + }, + { + "epoch": 0.3456, + "grad_norm": 0.2872907221317291, + "learning_rate": 3.6668928295636854e-05, + "loss": 0.4357, + "step": 1944 + }, + { + "epoch": 0.3457777777777778, + "grad_norm": 0.2929996848106384, + "learning_rate": 3.665657812474812e-05, + "loss": 0.3633, + "step": 1945 + }, + { + "epoch": 0.34595555555555557, + "grad_norm": 0.2582526206970215, + "learning_rate": 3.664422431784361e-05, + "loss": 0.3998, + "step": 1946 + }, + { + "epoch": 0.34613333333333335, + "grad_norm": 0.3431326448917389, + "learning_rate": 3.663186687877682e-05, + "loss": 0.4287, + "step": 1947 + }, + { + "epoch": 0.3463111111111111, + "grad_norm": 0.3807786703109741, + "learning_rate": 3.661950581140239e-05, + "loss": 0.5018, + "step": 1948 + }, + { + "epoch": 0.3464888888888889, + "grad_norm": 0.4096670150756836, + "learning_rate": 3.6607141119576084e-05, + "loss": 0.4299, + "step": 1949 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 0.4422893226146698, + "learning_rate": 3.659477280715479e-05, + "loss": 0.6081, + "step": 1950 + }, + { + "epoch": 0.34684444444444446, + "grad_norm": 0.24923039972782135, + "learning_rate": 3.6582400877996546e-05, + "loss": 0.5227, + "step": 1951 + }, + { + "epoch": 0.34702222222222223, + "grad_norm": 0.26584309339523315, + "learning_rate": 3.657002533596049e-05, + "loss": 0.4327, + "step": 1952 + }, + { + "epoch": 0.3472, + "grad_norm": 0.31643179059028625, + "learning_rate": 3.655764618490692e-05, + "loss": 0.5658, + "step": 1953 + }, + { + "epoch": 0.3473777777777778, + "grad_norm": 0.2787647545337677, + "learning_rate": 3.654526342869724e-05, + "loss": 0.5336, + "step": 1954 + }, + { + "epoch": 0.34755555555555556, + "grad_norm": 0.21620555222034454, + "learning_rate": 3.6532877071193974e-05, + "loss": 0.4816, + "step": 1955 + }, + { + "epoch": 0.34773333333333334, + "grad_norm": 0.23605084419250488, + "learning_rate": 3.6520487116260776e-05, + "loss": 0.5574, + "step": 1956 + }, + { + "epoch": 0.3479111111111111, + "grad_norm": 0.22515463829040527, + "learning_rate": 3.650809356776242e-05, + "loss": 0.4672, + "step": 1957 + }, + { + "epoch": 0.3480888888888889, + "grad_norm": 0.31022170186042786, + "learning_rate": 3.6495696429564823e-05, + "loss": 0.4596, + "step": 1958 + }, + { + "epoch": 0.34826666666666667, + "grad_norm": 0.25437894463539124, + "learning_rate": 3.648329570553498e-05, + "loss": 0.4393, + "step": 1959 + }, + { + "epoch": 0.34844444444444445, + "grad_norm": 0.25013643503189087, + "learning_rate": 3.647089139954104e-05, + "loss": 0.5936, + "step": 1960 + }, + { + "epoch": 0.3486222222222222, + "grad_norm": 0.21043139696121216, + "learning_rate": 3.645848351545225e-05, + "loss": 0.4029, + "step": 1961 + }, + { + "epoch": 0.3488, + "grad_norm": 0.26225078105926514, + "learning_rate": 3.644607205713898e-05, + "loss": 0.4061, + "step": 1962 + }, + { + "epoch": 0.3489777777777778, + "grad_norm": 0.25207337737083435, + "learning_rate": 3.643365702847272e-05, + "loss": 0.4265, + "step": 1963 + }, + { + "epoch": 0.34915555555555555, + "grad_norm": 0.19729246199131012, + "learning_rate": 3.642123843332606e-05, + "loss": 0.4232, + "step": 1964 + }, + { + "epoch": 0.34933333333333333, + "grad_norm": 0.24098612368106842, + "learning_rate": 3.640881627557271e-05, + "loss": 0.5515, + "step": 1965 + }, + { + "epoch": 0.3495111111111111, + "grad_norm": 0.25318804383277893, + "learning_rate": 3.639639055908751e-05, + "loss": 0.6905, + "step": 1966 + }, + { + "epoch": 0.3496888888888889, + "grad_norm": 0.24444417655467987, + "learning_rate": 3.638396128774636e-05, + "loss": 0.5113, + "step": 1967 + }, + { + "epoch": 0.34986666666666666, + "grad_norm": 0.24173714220523834, + "learning_rate": 3.637152846542633e-05, + "loss": 0.5194, + "step": 1968 + }, + { + "epoch": 0.35004444444444444, + "grad_norm": 0.23139755427837372, + "learning_rate": 3.635909209600555e-05, + "loss": 0.4269, + "step": 1969 + }, + { + "epoch": 0.3502222222222222, + "grad_norm": 0.22398440539836884, + "learning_rate": 3.634665218336328e-05, + "loss": 0.4342, + "step": 1970 + }, + { + "epoch": 0.3504, + "grad_norm": 0.2495083510875702, + "learning_rate": 3.633420873137988e-05, + "loss": 0.4025, + "step": 1971 + }, + { + "epoch": 0.35057777777777777, + "grad_norm": 0.29613104462623596, + "learning_rate": 3.632176174393682e-05, + "loss": 0.5453, + "step": 1972 + }, + { + "epoch": 0.35075555555555554, + "grad_norm": 0.24509337544441223, + "learning_rate": 3.630931122491666e-05, + "loss": 0.502, + "step": 1973 + }, + { + "epoch": 0.3509333333333333, + "grad_norm": 0.21222198009490967, + "learning_rate": 3.629685717820307e-05, + "loss": 0.4085, + "step": 1974 + }, + { + "epoch": 0.3511111111111111, + "grad_norm": 0.2646518647670746, + "learning_rate": 3.628439960768082e-05, + "loss": 0.6516, + "step": 1975 + }, + { + "epoch": 0.3512888888888889, + "grad_norm": 0.3048623204231262, + "learning_rate": 3.627193851723577e-05, + "loss": 0.5144, + "step": 1976 + }, + { + "epoch": 0.35146666666666665, + "grad_norm": 0.3339979648590088, + "learning_rate": 3.6259473910754904e-05, + "loss": 0.5887, + "step": 1977 + }, + { + "epoch": 0.3516444444444444, + "grad_norm": 0.25724416971206665, + "learning_rate": 3.624700579212626e-05, + "loss": 0.4511, + "step": 1978 + }, + { + "epoch": 0.3518222222222222, + "grad_norm": 0.26549121737480164, + "learning_rate": 3.623453416523902e-05, + "loss": 0.5093, + "step": 1979 + }, + { + "epoch": 0.352, + "grad_norm": 0.25060346722602844, + "learning_rate": 3.622205903398342e-05, + "loss": 0.4345, + "step": 1980 + }, + { + "epoch": 0.35217777777777776, + "grad_norm": 0.22731250524520874, + "learning_rate": 3.6209580402250815e-05, + "loss": 0.5389, + "step": 1981 + }, + { + "epoch": 0.35235555555555553, + "grad_norm": 0.25976020097732544, + "learning_rate": 3.6197098273933634e-05, + "loss": 0.6446, + "step": 1982 + }, + { + "epoch": 0.3525333333333333, + "grad_norm": 0.2761855721473694, + "learning_rate": 3.618461265292541e-05, + "loss": 0.7111, + "step": 1983 + }, + { + "epoch": 0.3527111111111111, + "grad_norm": 0.2257377952337265, + "learning_rate": 3.617212354312076e-05, + "loss": 0.579, + "step": 1984 + }, + { + "epoch": 0.35288888888888886, + "grad_norm": 0.2642453610897064, + "learning_rate": 3.61596309484154e-05, + "loss": 0.4293, + "step": 1985 + }, + { + "epoch": 0.35306666666666664, + "grad_norm": 0.24468721449375153, + "learning_rate": 3.614713487270611e-05, + "loss": 0.4739, + "step": 1986 + }, + { + "epoch": 0.35324444444444447, + "grad_norm": 0.25956660509109497, + "learning_rate": 3.613463531989076e-05, + "loss": 0.5368, + "step": 1987 + }, + { + "epoch": 0.35342222222222225, + "grad_norm": 0.271192729473114, + "learning_rate": 3.6122132293868335e-05, + "loss": 0.5234, + "step": 1988 + }, + { + "epoch": 0.3536, + "grad_norm": 0.23808296024799347, + "learning_rate": 3.6109625798538873e-05, + "loss": 0.4797, + "step": 1989 + }, + { + "epoch": 0.3537777777777778, + "grad_norm": 0.24553291499614716, + "learning_rate": 3.6097115837803505e-05, + "loss": 0.3586, + "step": 1990 + }, + { + "epoch": 0.3539555555555556, + "grad_norm": 0.3320119082927704, + "learning_rate": 3.608460241556443e-05, + "loss": 0.3824, + "step": 1991 + }, + { + "epoch": 0.35413333333333336, + "grad_norm": 0.2908457815647125, + "learning_rate": 3.6072085535724956e-05, + "loss": 0.3948, + "step": 1992 + }, + { + "epoch": 0.35431111111111113, + "grad_norm": 0.4101564884185791, + "learning_rate": 3.6059565202189435e-05, + "loss": 0.429, + "step": 1993 + }, + { + "epoch": 0.3544888888888889, + "grad_norm": 0.33346498012542725, + "learning_rate": 3.604704141886332e-05, + "loss": 0.4119, + "step": 1994 + }, + { + "epoch": 0.3546666666666667, + "grad_norm": 0.3327579200267792, + "learning_rate": 3.603451418965313e-05, + "loss": 0.3571, + "step": 1995 + }, + { + "epoch": 0.35484444444444446, + "grad_norm": 0.29637157917022705, + "learning_rate": 3.602198351846647e-05, + "loss": 0.4134, + "step": 1996 + }, + { + "epoch": 0.35502222222222224, + "grad_norm": 0.48964494466781616, + "learning_rate": 3.600944940921199e-05, + "loss": 0.4576, + "step": 1997 + }, + { + "epoch": 0.3552, + "grad_norm": 0.3680611550807953, + "learning_rate": 3.5996911865799454e-05, + "loss": 0.3944, + "step": 1998 + }, + { + "epoch": 0.3553777777777778, + "grad_norm": 0.39194124937057495, + "learning_rate": 3.5984370892139666e-05, + "loss": 0.453, + "step": 1999 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 0.6034827828407288, + "learning_rate": 3.5971826492144504e-05, + "loss": 0.5526, + "step": 2000 + }, + { + "epoch": 0.35573333333333335, + "grad_norm": 0.2394157499074936, + "learning_rate": 3.5959278669726935e-05, + "loss": 0.5426, + "step": 2001 + }, + { + "epoch": 0.3559111111111111, + "grad_norm": 0.2761767506599426, + "learning_rate": 3.594672742880097e-05, + "loss": 0.4418, + "step": 2002 + }, + { + "epoch": 0.3560888888888889, + "grad_norm": 0.30572789907455444, + "learning_rate": 3.5934172773281696e-05, + "loss": 0.5944, + "step": 2003 + }, + { + "epoch": 0.3562666666666667, + "grad_norm": 0.3124197721481323, + "learning_rate": 3.592161470708526e-05, + "loss": 0.568, + "step": 2004 + }, + { + "epoch": 0.35644444444444445, + "grad_norm": 0.27353614568710327, + "learning_rate": 3.5909053234128895e-05, + "loss": 0.4233, + "step": 2005 + }, + { + "epoch": 0.35662222222222223, + "grad_norm": 0.3082450330257416, + "learning_rate": 3.5896488358330856e-05, + "loss": 0.4583, + "step": 2006 + }, + { + "epoch": 0.3568, + "grad_norm": 0.27109378576278687, + "learning_rate": 3.588392008361049e-05, + "loss": 0.4538, + "step": 2007 + }, + { + "epoch": 0.3569777777777778, + "grad_norm": 0.2335667908191681, + "learning_rate": 3.5871348413888204e-05, + "loss": 0.4587, + "step": 2008 + }, + { + "epoch": 0.35715555555555556, + "grad_norm": 0.3224318325519562, + "learning_rate": 3.585877335308546e-05, + "loss": 0.4176, + "step": 2009 + }, + { + "epoch": 0.35733333333333334, + "grad_norm": 0.210116907954216, + "learning_rate": 3.5846194905124757e-05, + "loss": 0.4273, + "step": 2010 + }, + { + "epoch": 0.3575111111111111, + "grad_norm": 0.32360702753067017, + "learning_rate": 3.5833613073929684e-05, + "loss": 0.6708, + "step": 2011 + }, + { + "epoch": 0.3576888888888889, + "grad_norm": 0.21429632604122162, + "learning_rate": 3.582102786342485e-05, + "loss": 0.4625, + "step": 2012 + }, + { + "epoch": 0.35786666666666667, + "grad_norm": 0.2860332727432251, + "learning_rate": 3.5808439277535964e-05, + "loss": 0.657, + "step": 2013 + }, + { + "epoch": 0.35804444444444444, + "grad_norm": 0.26128655672073364, + "learning_rate": 3.5795847320189746e-05, + "loss": 0.5382, + "step": 2014 + }, + { + "epoch": 0.3582222222222222, + "grad_norm": 0.2888024151325226, + "learning_rate": 3.5783251995313985e-05, + "loss": 0.5704, + "step": 2015 + }, + { + "epoch": 0.3584, + "grad_norm": 0.22416767477989197, + "learning_rate": 3.577065330683751e-05, + "loss": 0.4163, + "step": 2016 + }, + { + "epoch": 0.3585777777777778, + "grad_norm": 0.25067469477653503, + "learning_rate": 3.575805125869022e-05, + "loss": 0.4468, + "step": 2017 + }, + { + "epoch": 0.35875555555555555, + "grad_norm": 0.25245460867881775, + "learning_rate": 3.574544585480305e-05, + "loss": 0.3565, + "step": 2018 + }, + { + "epoch": 0.3589333333333333, + "grad_norm": 0.23376943171024323, + "learning_rate": 3.573283709910798e-05, + "loss": 0.5703, + "step": 2019 + }, + { + "epoch": 0.3591111111111111, + "grad_norm": 0.3533632755279541, + "learning_rate": 3.572022499553802e-05, + "loss": 0.6158, + "step": 2020 + }, + { + "epoch": 0.3592888888888889, + "grad_norm": 0.2572995126247406, + "learning_rate": 3.570760954802726e-05, + "loss": 0.6479, + "step": 2021 + }, + { + "epoch": 0.35946666666666666, + "grad_norm": 0.23424765467643738, + "learning_rate": 3.569499076051081e-05, + "loss": 0.5923, + "step": 2022 + }, + { + "epoch": 0.35964444444444443, + "grad_norm": 0.22361761331558228, + "learning_rate": 3.568236863692482e-05, + "loss": 0.4538, + "step": 2023 + }, + { + "epoch": 0.3598222222222222, + "grad_norm": 0.2418365776538849, + "learning_rate": 3.56697431812065e-05, + "loss": 0.6069, + "step": 2024 + }, + { + "epoch": 0.36, + "grad_norm": 0.2124192863702774, + "learning_rate": 3.565711439729408e-05, + "loss": 0.5013, + "step": 2025 + }, + { + "epoch": 0.36017777777777776, + "grad_norm": 0.23211051523685455, + "learning_rate": 3.564448228912682e-05, + "loss": 0.5583, + "step": 2026 + }, + { + "epoch": 0.36035555555555554, + "grad_norm": 0.5080576539039612, + "learning_rate": 3.5631846860645044e-05, + "loss": 0.5044, + "step": 2027 + }, + { + "epoch": 0.3605333333333333, + "grad_norm": 0.23350077867507935, + "learning_rate": 3.56192081157901e-05, + "loss": 0.462, + "step": 2028 + }, + { + "epoch": 0.3607111111111111, + "grad_norm": 0.20580492913722992, + "learning_rate": 3.5606566058504375e-05, + "loss": 0.4819, + "step": 2029 + }, + { + "epoch": 0.36088888888888887, + "grad_norm": 0.26576268672943115, + "learning_rate": 3.559392069273127e-05, + "loss": 0.4301, + "step": 2030 + }, + { + "epoch": 0.36106666666666665, + "grad_norm": 0.2505791485309601, + "learning_rate": 3.5581272022415244e-05, + "loss": 0.5558, + "step": 2031 + }, + { + "epoch": 0.3612444444444444, + "grad_norm": 0.20034657418727875, + "learning_rate": 3.5568620051501756e-05, + "loss": 0.4245, + "step": 2032 + }, + { + "epoch": 0.3614222222222222, + "grad_norm": 0.3134501576423645, + "learning_rate": 3.555596478393733e-05, + "loss": 0.675, + "step": 2033 + }, + { + "epoch": 0.3616, + "grad_norm": 0.23866966366767883, + "learning_rate": 3.554330622366949e-05, + "loss": 0.4368, + "step": 2034 + }, + { + "epoch": 0.36177777777777775, + "grad_norm": 0.1871061623096466, + "learning_rate": 3.5530644374646815e-05, + "loss": 0.3713, + "step": 2035 + }, + { + "epoch": 0.36195555555555553, + "grad_norm": 0.5392162799835205, + "learning_rate": 3.551797924081887e-05, + "loss": 0.5163, + "step": 2036 + }, + { + "epoch": 0.3621333333333333, + "grad_norm": 0.2455499768257141, + "learning_rate": 3.5505310826136286e-05, + "loss": 0.4753, + "step": 2037 + }, + { + "epoch": 0.3623111111111111, + "grad_norm": 0.2988753020763397, + "learning_rate": 3.5492639134550695e-05, + "loss": 0.6686, + "step": 2038 + }, + { + "epoch": 0.3624888888888889, + "grad_norm": 0.279950350522995, + "learning_rate": 3.5479964170014746e-05, + "loss": 0.4652, + "step": 2039 + }, + { + "epoch": 0.3626666666666667, + "grad_norm": 0.2598835527896881, + "learning_rate": 3.546728593648213e-05, + "loss": 0.3991, + "step": 2040 + }, + { + "epoch": 0.36284444444444447, + "grad_norm": 0.22966304421424866, + "learning_rate": 3.545460443790753e-05, + "loss": 0.4513, + "step": 2041 + }, + { + "epoch": 0.36302222222222225, + "grad_norm": 0.33538466691970825, + "learning_rate": 3.544191967824669e-05, + "loss": 0.3092, + "step": 2042 + }, + { + "epoch": 0.3632, + "grad_norm": 0.2845430374145508, + "learning_rate": 3.542923166145633e-05, + "loss": 0.3916, + "step": 2043 + }, + { + "epoch": 0.3633777777777778, + "grad_norm": 0.3298134505748749, + "learning_rate": 3.54165403914942e-05, + "loss": 0.3322, + "step": 2044 + }, + { + "epoch": 0.3635555555555556, + "grad_norm": 0.32051870226860046, + "learning_rate": 3.540384587231906e-05, + "loss": 0.4408, + "step": 2045 + }, + { + "epoch": 0.36373333333333335, + "grad_norm": 0.2550102472305298, + "learning_rate": 3.53911481078907e-05, + "loss": 0.398, + "step": 2046 + }, + { + "epoch": 0.36391111111111113, + "grad_norm": 0.2990383803844452, + "learning_rate": 3.5378447102169895e-05, + "loss": 0.3493, + "step": 2047 + }, + { + "epoch": 0.3640888888888889, + "grad_norm": 0.35762161016464233, + "learning_rate": 3.536574285911847e-05, + "loss": 0.4633, + "step": 2048 + }, + { + "epoch": 0.3642666666666667, + "grad_norm": 0.36321207880973816, + "learning_rate": 3.535303538269922e-05, + "loss": 0.4407, + "step": 2049 + }, + { + "epoch": 0.36444444444444446, + "grad_norm": 0.3657382130622864, + "learning_rate": 3.534032467687597e-05, + "loss": 0.469, + "step": 2050 + }, + { + "epoch": 0.36462222222222224, + "grad_norm": 0.22765199840068817, + "learning_rate": 3.532761074561355e-05, + "loss": 0.4781, + "step": 2051 + }, + { + "epoch": 0.3648, + "grad_norm": 0.21776601672172546, + "learning_rate": 3.531489359287779e-05, + "loss": 0.5775, + "step": 2052 + }, + { + "epoch": 0.3649777777777778, + "grad_norm": 0.2641981244087219, + "learning_rate": 3.5302173222635524e-05, + "loss": 0.5239, + "step": 2053 + }, + { + "epoch": 0.36515555555555557, + "grad_norm": 0.2800407111644745, + "learning_rate": 3.528944963885461e-05, + "loss": 0.459, + "step": 2054 + }, + { + "epoch": 0.36533333333333334, + "grad_norm": 0.27440667152404785, + "learning_rate": 3.527672284550389e-05, + "loss": 0.6238, + "step": 2055 + }, + { + "epoch": 0.3655111111111111, + "grad_norm": 0.33131271600723267, + "learning_rate": 3.52639928465532e-05, + "loss": 0.7595, + "step": 2056 + }, + { + "epoch": 0.3656888888888889, + "grad_norm": 0.23424576222896576, + "learning_rate": 3.5251259645973394e-05, + "loss": 0.3892, + "step": 2057 + }, + { + "epoch": 0.3658666666666667, + "grad_norm": 0.23169712722301483, + "learning_rate": 3.523852324773631e-05, + "loss": 0.4299, + "step": 2058 + }, + { + "epoch": 0.36604444444444445, + "grad_norm": 0.24841339886188507, + "learning_rate": 3.5225783655814796e-05, + "loss": 0.4542, + "step": 2059 + }, + { + "epoch": 0.3662222222222222, + "grad_norm": 0.22915951907634735, + "learning_rate": 3.521304087418269e-05, + "loss": 0.3651, + "step": 2060 + }, + { + "epoch": 0.3664, + "grad_norm": 0.24642620980739594, + "learning_rate": 3.5200294906814824e-05, + "loss": 0.4292, + "step": 2061 + }, + { + "epoch": 0.3665777777777778, + "grad_norm": 0.25656482577323914, + "learning_rate": 3.5187545757687015e-05, + "loss": 0.622, + "step": 2062 + }, + { + "epoch": 0.36675555555555556, + "grad_norm": 0.2141350507736206, + "learning_rate": 3.517479343077611e-05, + "loss": 0.4321, + "step": 2063 + }, + { + "epoch": 0.36693333333333333, + "grad_norm": 0.2655150592327118, + "learning_rate": 3.516203793005989e-05, + "loss": 0.4254, + "step": 2064 + }, + { + "epoch": 0.3671111111111111, + "grad_norm": 0.21726089715957642, + "learning_rate": 3.514927925951717e-05, + "loss": 0.4293, + "step": 2065 + }, + { + "epoch": 0.3672888888888889, + "grad_norm": 0.27894213795661926, + "learning_rate": 3.513651742312774e-05, + "loss": 0.4821, + "step": 2066 + }, + { + "epoch": 0.36746666666666666, + "grad_norm": 0.2362583875656128, + "learning_rate": 3.512375242487236e-05, + "loss": 0.4318, + "step": 2067 + }, + { + "epoch": 0.36764444444444444, + "grad_norm": 0.26625198125839233, + "learning_rate": 3.511098426873283e-05, + "loss": 0.5165, + "step": 2068 + }, + { + "epoch": 0.3678222222222222, + "grad_norm": 0.2515326142311096, + "learning_rate": 3.5098212958691854e-05, + "loss": 0.4271, + "step": 2069 + }, + { + "epoch": 0.368, + "grad_norm": 0.22928926348686218, + "learning_rate": 3.50854384987332e-05, + "loss": 0.4526, + "step": 2070 + }, + { + "epoch": 0.36817777777777777, + "grad_norm": 0.27953198552131653, + "learning_rate": 3.507266089284157e-05, + "loss": 0.4965, + "step": 2071 + }, + { + "epoch": 0.36835555555555555, + "grad_norm": 0.3242968022823334, + "learning_rate": 3.5059880145002654e-05, + "loss": 0.5741, + "step": 2072 + }, + { + "epoch": 0.3685333333333333, + "grad_norm": 0.24874995648860931, + "learning_rate": 3.5047096259203135e-05, + "loss": 0.5676, + "step": 2073 + }, + { + "epoch": 0.3687111111111111, + "grad_norm": 0.28286781907081604, + "learning_rate": 3.503430923943066e-05, + "loss": 0.441, + "step": 2074 + }, + { + "epoch": 0.3688888888888889, + "grad_norm": 0.2183365523815155, + "learning_rate": 3.5021519089673876e-05, + "loss": 0.524, + "step": 2075 + }, + { + "epoch": 0.36906666666666665, + "grad_norm": 0.24324169754981995, + "learning_rate": 3.5008725813922386e-05, + "loss": 0.3905, + "step": 2076 + }, + { + "epoch": 0.36924444444444443, + "grad_norm": 0.2610672414302826, + "learning_rate": 3.4995929416166756e-05, + "loss": 0.5719, + "step": 2077 + }, + { + "epoch": 0.3694222222222222, + "grad_norm": 0.23189662396907806, + "learning_rate": 3.498312990039856e-05, + "loss": 0.3422, + "step": 2078 + }, + { + "epoch": 0.3696, + "grad_norm": 0.27522850036621094, + "learning_rate": 3.497032727061034e-05, + "loss": 0.3579, + "step": 2079 + }, + { + "epoch": 0.36977777777777776, + "grad_norm": 0.24101312458515167, + "learning_rate": 3.495752153079557e-05, + "loss": 0.6423, + "step": 2080 + }, + { + "epoch": 0.36995555555555554, + "grad_norm": 0.27489200234413147, + "learning_rate": 3.494471268494875e-05, + "loss": 0.5208, + "step": 2081 + }, + { + "epoch": 0.3701333333333333, + "grad_norm": 0.2049010992050171, + "learning_rate": 3.493190073706529e-05, + "loss": 0.4961, + "step": 2082 + }, + { + "epoch": 0.3703111111111111, + "grad_norm": 0.28459906578063965, + "learning_rate": 3.491908569114164e-05, + "loss": 0.6202, + "step": 2083 + }, + { + "epoch": 0.37048888888888887, + "grad_norm": 0.2584361135959625, + "learning_rate": 3.4906267551175124e-05, + "loss": 0.4961, + "step": 2084 + }, + { + "epoch": 0.37066666666666664, + "grad_norm": 0.33265259861946106, + "learning_rate": 3.489344632116412e-05, + "loss": 0.6258, + "step": 2085 + }, + { + "epoch": 0.3708444444444444, + "grad_norm": 0.2544153034687042, + "learning_rate": 3.488062200510791e-05, + "loss": 0.5375, + "step": 2086 + }, + { + "epoch": 0.3710222222222222, + "grad_norm": 0.22062474489212036, + "learning_rate": 3.4867794607006784e-05, + "loss": 0.4642, + "step": 2087 + }, + { + "epoch": 0.3712, + "grad_norm": 0.26904961466789246, + "learning_rate": 3.485496413086195e-05, + "loss": 0.538, + "step": 2088 + }, + { + "epoch": 0.37137777777777775, + "grad_norm": 0.2870281934738159, + "learning_rate": 3.484213058067559e-05, + "loss": 0.3874, + "step": 2089 + }, + { + "epoch": 0.37155555555555553, + "grad_norm": 0.2527245879173279, + "learning_rate": 3.482929396045087e-05, + "loss": 0.4071, + "step": 2090 + }, + { + "epoch": 0.37173333333333336, + "grad_norm": 0.3237399160861969, + "learning_rate": 3.481645427419188e-05, + "loss": 0.4589, + "step": 2091 + }, + { + "epoch": 0.37191111111111114, + "grad_norm": 0.35495588183403015, + "learning_rate": 3.4803611525903685e-05, + "loss": 0.3487, + "step": 2092 + }, + { + "epoch": 0.3720888888888889, + "grad_norm": 0.38268110156059265, + "learning_rate": 3.479076571959231e-05, + "loss": 0.4733, + "step": 2093 + }, + { + "epoch": 0.3722666666666667, + "grad_norm": 0.3425139784812927, + "learning_rate": 3.477791685926471e-05, + "loss": 0.4299, + "step": 2094 + }, + { + "epoch": 0.37244444444444447, + "grad_norm": 0.3098171353340149, + "learning_rate": 3.4765064948928814e-05, + "loss": 0.4816, + "step": 2095 + }, + { + "epoch": 0.37262222222222224, + "grad_norm": 0.31044498085975647, + "learning_rate": 3.475220999259349e-05, + "loss": 0.4012, + "step": 2096 + }, + { + "epoch": 0.3728, + "grad_norm": 0.37865689396858215, + "learning_rate": 3.473935199426858e-05, + "loss": 0.5689, + "step": 2097 + }, + { + "epoch": 0.3729777777777778, + "grad_norm": 0.37300118803977966, + "learning_rate": 3.4726490957964834e-05, + "loss": 0.4925, + "step": 2098 + }, + { + "epoch": 0.3731555555555556, + "grad_norm": 0.42155811190605164, + "learning_rate": 3.471362688769398e-05, + "loss": 0.4334, + "step": 2099 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 0.41801220178604126, + "learning_rate": 3.4700759787468695e-05, + "loss": 0.4922, + "step": 2100 + }, + { + "epoch": 0.3735111111111111, + "grad_norm": 0.2610384523868561, + "learning_rate": 3.4687889661302576e-05, + "loss": 0.5597, + "step": 2101 + }, + { + "epoch": 0.3736888888888889, + "grad_norm": 0.24586988985538483, + "learning_rate": 3.467501651321019e-05, + "loss": 0.4295, + "step": 2102 + }, + { + "epoch": 0.3738666666666667, + "grad_norm": 0.25822538137435913, + "learning_rate": 3.466214034720702e-05, + "loss": 0.4671, + "step": 2103 + }, + { + "epoch": 0.37404444444444446, + "grad_norm": 0.3112949728965759, + "learning_rate": 3.4649261167309526e-05, + "loss": 0.3786, + "step": 2104 + }, + { + "epoch": 0.37422222222222223, + "grad_norm": 0.24209995567798615, + "learning_rate": 3.4636378977535075e-05, + "loss": 0.4576, + "step": 2105 + }, + { + "epoch": 0.3744, + "grad_norm": 0.2665556073188782, + "learning_rate": 3.462349378190199e-05, + "loss": 0.4626, + "step": 2106 + }, + { + "epoch": 0.3745777777777778, + "grad_norm": 0.2391367107629776, + "learning_rate": 3.461060558442952e-05, + "loss": 0.5722, + "step": 2107 + }, + { + "epoch": 0.37475555555555556, + "grad_norm": 0.26002785563468933, + "learning_rate": 3.459771438913787e-05, + "loss": 0.5739, + "step": 2108 + }, + { + "epoch": 0.37493333333333334, + "grad_norm": 0.2670307755470276, + "learning_rate": 3.458482020004815e-05, + "loss": 0.4995, + "step": 2109 + }, + { + "epoch": 0.3751111111111111, + "grad_norm": 0.31694403290748596, + "learning_rate": 3.457192302118244e-05, + "loss": 0.4849, + "step": 2110 + }, + { + "epoch": 0.3752888888888889, + "grad_norm": 0.23306290805339813, + "learning_rate": 3.455902285656373e-05, + "loss": 0.5401, + "step": 2111 + }, + { + "epoch": 0.37546666666666667, + "grad_norm": 0.37330561876296997, + "learning_rate": 3.454611971021593e-05, + "loss": 0.664, + "step": 2112 + }, + { + "epoch": 0.37564444444444445, + "grad_norm": 0.31714606285095215, + "learning_rate": 3.453321358616393e-05, + "loss": 0.6728, + "step": 2113 + }, + { + "epoch": 0.3758222222222222, + "grad_norm": 0.23792865872383118, + "learning_rate": 3.452030448843347e-05, + "loss": 0.4271, + "step": 2114 + }, + { + "epoch": 0.376, + "grad_norm": 0.3249034881591797, + "learning_rate": 3.45073924210513e-05, + "loss": 0.5126, + "step": 2115 + }, + { + "epoch": 0.3761777777777778, + "grad_norm": 0.2810419201850891, + "learning_rate": 3.4494477388045035e-05, + "loss": 0.354, + "step": 2116 + }, + { + "epoch": 0.37635555555555555, + "grad_norm": 0.24656501412391663, + "learning_rate": 3.448155939344324e-05, + "loss": 0.338, + "step": 2117 + }, + { + "epoch": 0.37653333333333333, + "grad_norm": 0.2581455707550049, + "learning_rate": 3.4468638441275415e-05, + "loss": 0.5291, + "step": 2118 + }, + { + "epoch": 0.3767111111111111, + "grad_norm": 0.2130831778049469, + "learning_rate": 3.445571453557196e-05, + "loss": 0.4804, + "step": 2119 + }, + { + "epoch": 0.3768888888888889, + "grad_norm": 0.27429383993148804, + "learning_rate": 3.444278768036421e-05, + "loss": 0.4173, + "step": 2120 + }, + { + "epoch": 0.37706666666666666, + "grad_norm": 0.25032860040664673, + "learning_rate": 3.442985787968442e-05, + "loss": 0.5487, + "step": 2121 + }, + { + "epoch": 0.37724444444444444, + "grad_norm": 0.24062617123126984, + "learning_rate": 3.4416925137565754e-05, + "loss": 0.4233, + "step": 2122 + }, + { + "epoch": 0.3774222222222222, + "grad_norm": 0.3051285147666931, + "learning_rate": 3.440398945804229e-05, + "loss": 0.405, + "step": 2123 + }, + { + "epoch": 0.3776, + "grad_norm": 0.24486301839351654, + "learning_rate": 3.439105084514905e-05, + "loss": 0.4729, + "step": 2124 + }, + { + "epoch": 0.37777777777777777, + "grad_norm": 0.24730072915554047, + "learning_rate": 3.437810930292195e-05, + "loss": 0.4028, + "step": 2125 + }, + { + "epoch": 0.37795555555555554, + "grad_norm": 0.18803048133850098, + "learning_rate": 3.436516483539781e-05, + "loss": 0.4, + "step": 2126 + }, + { + "epoch": 0.3781333333333333, + "grad_norm": 0.27054160833358765, + "learning_rate": 3.435221744661438e-05, + "loss": 0.5029, + "step": 2127 + }, + { + "epoch": 0.3783111111111111, + "grad_norm": 0.43489333987236023, + "learning_rate": 3.433926714061032e-05, + "loss": 0.3755, + "step": 2128 + }, + { + "epoch": 0.3784888888888889, + "grad_norm": 0.22567783296108246, + "learning_rate": 3.432631392142519e-05, + "loss": 0.4352, + "step": 2129 + }, + { + "epoch": 0.37866666666666665, + "grad_norm": 0.21575966477394104, + "learning_rate": 3.431335779309947e-05, + "loss": 0.5492, + "step": 2130 + }, + { + "epoch": 0.37884444444444443, + "grad_norm": 0.29870107769966125, + "learning_rate": 3.430039875967454e-05, + "loss": 0.8241, + "step": 2131 + }, + { + "epoch": 0.3790222222222222, + "grad_norm": 0.2585407495498657, + "learning_rate": 3.428743682519269e-05, + "loss": 0.5199, + "step": 2132 + }, + { + "epoch": 0.3792, + "grad_norm": 0.25725045800209045, + "learning_rate": 3.427447199369711e-05, + "loss": 0.4396, + "step": 2133 + }, + { + "epoch": 0.37937777777777776, + "grad_norm": 0.24106164276599884, + "learning_rate": 3.4261504269231904e-05, + "loss": 0.5917, + "step": 2134 + }, + { + "epoch": 0.37955555555555553, + "grad_norm": 0.30276933312416077, + "learning_rate": 3.4248533655842066e-05, + "loss": 0.5929, + "step": 2135 + }, + { + "epoch": 0.3797333333333333, + "grad_norm": 0.24279651045799255, + "learning_rate": 3.423556015757349e-05, + "loss": 0.5158, + "step": 2136 + }, + { + "epoch": 0.3799111111111111, + "grad_norm": 0.24842707812786102, + "learning_rate": 3.4222583778472996e-05, + "loss": 0.5214, + "step": 2137 + }, + { + "epoch": 0.38008888888888887, + "grad_norm": 0.25559934973716736, + "learning_rate": 3.4209604522588255e-05, + "loss": 0.3991, + "step": 2138 + }, + { + "epoch": 0.38026666666666664, + "grad_norm": 0.2626963257789612, + "learning_rate": 3.419662239396789e-05, + "loss": 0.5419, + "step": 2139 + }, + { + "epoch": 0.3804444444444444, + "grad_norm": 0.23748789727687836, + "learning_rate": 3.418363739666137e-05, + "loss": 0.4338, + "step": 2140 + }, + { + "epoch": 0.3806222222222222, + "grad_norm": 0.3695782721042633, + "learning_rate": 3.417064953471911e-05, + "loss": 0.3996, + "step": 2141 + }, + { + "epoch": 0.3808, + "grad_norm": 0.33631959557533264, + "learning_rate": 3.415765881219236e-05, + "loss": 0.3854, + "step": 2142 + }, + { + "epoch": 0.3809777777777778, + "grad_norm": 0.3013719916343689, + "learning_rate": 3.414466523313332e-05, + "loss": 0.444, + "step": 2143 + }, + { + "epoch": 0.3811555555555556, + "grad_norm": 0.3267067074775696, + "learning_rate": 3.4131668801595027e-05, + "loss": 0.4596, + "step": 2144 + }, + { + "epoch": 0.38133333333333336, + "grad_norm": 0.34401312470436096, + "learning_rate": 3.411866952163146e-05, + "loss": 0.5537, + "step": 2145 + }, + { + "epoch": 0.38151111111111113, + "grad_norm": 0.3007937967777252, + "learning_rate": 3.410566739729746e-05, + "loss": 0.3972, + "step": 2146 + }, + { + "epoch": 0.3816888888888889, + "grad_norm": 0.4229884445667267, + "learning_rate": 3.409266243264874e-05, + "loss": 0.3944, + "step": 2147 + }, + { + "epoch": 0.3818666666666667, + "grad_norm": 0.4144022762775421, + "learning_rate": 3.407965463174192e-05, + "loss": 0.3212, + "step": 2148 + }, + { + "epoch": 0.38204444444444446, + "grad_norm": 0.4471008777618408, + "learning_rate": 3.4066643998634505e-05, + "loss": 0.4593, + "step": 2149 + }, + { + "epoch": 0.38222222222222224, + "grad_norm": 0.42643436789512634, + "learning_rate": 3.4053630537384885e-05, + "loss": 0.5059, + "step": 2150 + }, + { + "epoch": 0.3824, + "grad_norm": 0.2592998147010803, + "learning_rate": 3.4040614252052305e-05, + "loss": 0.5609, + "step": 2151 + }, + { + "epoch": 0.3825777777777778, + "grad_norm": 0.24748949706554413, + "learning_rate": 3.402759514669694e-05, + "loss": 0.4246, + "step": 2152 + }, + { + "epoch": 0.38275555555555557, + "grad_norm": 0.46788910031318665, + "learning_rate": 3.401457322537979e-05, + "loss": 0.6454, + "step": 2153 + }, + { + "epoch": 0.38293333333333335, + "grad_norm": 0.2923056483268738, + "learning_rate": 3.400154849216278e-05, + "loss": 0.4639, + "step": 2154 + }, + { + "epoch": 0.3831111111111111, + "grad_norm": 0.2576306164264679, + "learning_rate": 3.398852095110868e-05, + "loss": 0.4698, + "step": 2155 + }, + { + "epoch": 0.3832888888888889, + "grad_norm": 0.2585231065750122, + "learning_rate": 3.397549060628116e-05, + "loss": 0.5491, + "step": 2156 + }, + { + "epoch": 0.3834666666666667, + "grad_norm": 0.2383284717798233, + "learning_rate": 3.396245746174473e-05, + "loss": 0.4456, + "step": 2157 + }, + { + "epoch": 0.38364444444444445, + "grad_norm": 0.24488377571105957, + "learning_rate": 3.394942152156482e-05, + "loss": 0.4351, + "step": 2158 + }, + { + "epoch": 0.38382222222222223, + "grad_norm": 0.23700352013111115, + "learning_rate": 3.39363827898077e-05, + "loss": 0.4883, + "step": 2159 + }, + { + "epoch": 0.384, + "grad_norm": 0.24665071070194244, + "learning_rate": 3.392334127054051e-05, + "loss": 0.5546, + "step": 2160 + }, + { + "epoch": 0.3841777777777778, + "grad_norm": 0.25925207138061523, + "learning_rate": 3.3910296967831266e-05, + "loss": 0.4273, + "step": 2161 + }, + { + "epoch": 0.38435555555555556, + "grad_norm": 0.2363111823797226, + "learning_rate": 3.389724988574887e-05, + "loss": 0.4456, + "step": 2162 + }, + { + "epoch": 0.38453333333333334, + "grad_norm": 0.24633704125881195, + "learning_rate": 3.388420002836307e-05, + "loss": 0.4286, + "step": 2163 + }, + { + "epoch": 0.3847111111111111, + "grad_norm": 0.2751052677631378, + "learning_rate": 3.387114739974448e-05, + "loss": 0.5885, + "step": 2164 + }, + { + "epoch": 0.3848888888888889, + "grad_norm": 0.23692676424980164, + "learning_rate": 3.3858092003964594e-05, + "loss": 0.4241, + "step": 2165 + }, + { + "epoch": 0.38506666666666667, + "grad_norm": 0.3027472198009491, + "learning_rate": 3.384503384509574e-05, + "loss": 0.5624, + "step": 2166 + }, + { + "epoch": 0.38524444444444444, + "grad_norm": 0.20521026849746704, + "learning_rate": 3.3831972927211135e-05, + "loss": 0.3362, + "step": 2167 + }, + { + "epoch": 0.3854222222222222, + "grad_norm": 0.3886958956718445, + "learning_rate": 3.381890925438486e-05, + "loss": 0.728, + "step": 2168 + }, + { + "epoch": 0.3856, + "grad_norm": 0.27369847893714905, + "learning_rate": 3.380584283069183e-05, + "loss": 0.4814, + "step": 2169 + }, + { + "epoch": 0.3857777777777778, + "grad_norm": 0.2596404552459717, + "learning_rate": 3.379277366020782e-05, + "loss": 0.518, + "step": 2170 + }, + { + "epoch": 0.38595555555555555, + "grad_norm": 0.23133036494255066, + "learning_rate": 3.3779701747009504e-05, + "loss": 0.4338, + "step": 2171 + }, + { + "epoch": 0.38613333333333333, + "grad_norm": 0.28718218207359314, + "learning_rate": 3.376662709517435e-05, + "loss": 0.5018, + "step": 2172 + }, + { + "epoch": 0.3863111111111111, + "grad_norm": 0.26473402976989746, + "learning_rate": 3.375354970878073e-05, + "loss": 0.6312, + "step": 2173 + }, + { + "epoch": 0.3864888888888889, + "grad_norm": 0.24622993171215057, + "learning_rate": 3.374046959190786e-05, + "loss": 0.4628, + "step": 2174 + }, + { + "epoch": 0.38666666666666666, + "grad_norm": 0.30092787742614746, + "learning_rate": 3.372738674863577e-05, + "loss": 0.5431, + "step": 2175 + }, + { + "epoch": 0.38684444444444444, + "grad_norm": 0.26763394474983215, + "learning_rate": 3.3714301183045385e-05, + "loss": 0.5245, + "step": 2176 + }, + { + "epoch": 0.3870222222222222, + "grad_norm": 0.24639491736888885, + "learning_rate": 3.370121289921845e-05, + "loss": 0.4503, + "step": 2177 + }, + { + "epoch": 0.3872, + "grad_norm": 0.2714393138885498, + "learning_rate": 3.368812190123759e-05, + "loss": 0.4557, + "step": 2178 + }, + { + "epoch": 0.38737777777777777, + "grad_norm": 0.2553410232067108, + "learning_rate": 3.367502819318624e-05, + "loss": 0.4406, + "step": 2179 + }, + { + "epoch": 0.38755555555555554, + "grad_norm": 0.30576181411743164, + "learning_rate": 3.3661931779148707e-05, + "loss": 0.4333, + "step": 2180 + }, + { + "epoch": 0.3877333333333333, + "grad_norm": 0.3816477358341217, + "learning_rate": 3.3648832663210124e-05, + "loss": 0.6322, + "step": 2181 + }, + { + "epoch": 0.3879111111111111, + "grad_norm": 0.2461373656988144, + "learning_rate": 3.363573084945648e-05, + "loss": 0.5, + "step": 2182 + }, + { + "epoch": 0.38808888888888887, + "grad_norm": 0.2696300148963928, + "learning_rate": 3.3622626341974594e-05, + "loss": 0.47, + "step": 2183 + }, + { + "epoch": 0.38826666666666665, + "grad_norm": 0.2867066562175751, + "learning_rate": 3.360951914485215e-05, + "loss": 0.7599, + "step": 2184 + }, + { + "epoch": 0.3884444444444444, + "grad_norm": 0.25492119789123535, + "learning_rate": 3.359640926217763e-05, + "loss": 0.4192, + "step": 2185 + }, + { + "epoch": 0.3886222222222222, + "grad_norm": 0.21582196652889252, + "learning_rate": 3.3583296698040384e-05, + "loss": 0.4483, + "step": 2186 + }, + { + "epoch": 0.3888, + "grad_norm": 0.22196675837039948, + "learning_rate": 3.35701814565306e-05, + "loss": 0.4185, + "step": 2187 + }, + { + "epoch": 0.38897777777777776, + "grad_norm": 0.2523905336856842, + "learning_rate": 3.355706354173928e-05, + "loss": 0.4424, + "step": 2188 + }, + { + "epoch": 0.38915555555555553, + "grad_norm": 0.24271929264068604, + "learning_rate": 3.354394295775829e-05, + "loss": 0.4333, + "step": 2189 + }, + { + "epoch": 0.3893333333333333, + "grad_norm": 0.21415016055107117, + "learning_rate": 3.3530819708680286e-05, + "loss": 0.4178, + "step": 2190 + }, + { + "epoch": 0.3895111111111111, + "grad_norm": 0.2533232271671295, + "learning_rate": 3.35176937985988e-05, + "loss": 0.4, + "step": 2191 + }, + { + "epoch": 0.38968888888888886, + "grad_norm": 0.3138527572154999, + "learning_rate": 3.350456523160815e-05, + "loss": 0.3212, + "step": 2192 + }, + { + "epoch": 0.38986666666666664, + "grad_norm": 0.2770172953605652, + "learning_rate": 3.349143401180354e-05, + "loss": 0.3167, + "step": 2193 + }, + { + "epoch": 0.39004444444444447, + "grad_norm": 0.2787688076496124, + "learning_rate": 3.347830014328094e-05, + "loss": 0.3645, + "step": 2194 + }, + { + "epoch": 0.39022222222222225, + "grad_norm": 0.2864219844341278, + "learning_rate": 3.346516363013719e-05, + "loss": 0.3508, + "step": 2195 + }, + { + "epoch": 0.3904, + "grad_norm": 0.30944886803627014, + "learning_rate": 3.3452024476469934e-05, + "loss": 0.4598, + "step": 2196 + }, + { + "epoch": 0.3905777777777778, + "grad_norm": 0.33153730630874634, + "learning_rate": 3.343888268637765e-05, + "loss": 0.401, + "step": 2197 + }, + { + "epoch": 0.3907555555555556, + "grad_norm": 0.38578036427497864, + "learning_rate": 3.3425738263959615e-05, + "loss": 0.3632, + "step": 2198 + }, + { + "epoch": 0.39093333333333335, + "grad_norm": 0.39611396193504333, + "learning_rate": 3.341259121331597e-05, + "loss": 0.4101, + "step": 2199 + }, + { + "epoch": 0.39111111111111113, + "grad_norm": 0.3686160743236542, + "learning_rate": 3.339944153854764e-05, + "loss": 0.4413, + "step": 2200 + }, + { + "epoch": 0.3912888888888889, + "grad_norm": 0.26790690422058105, + "learning_rate": 3.338628924375638e-05, + "loss": 0.5225, + "step": 2201 + }, + { + "epoch": 0.3914666666666667, + "grad_norm": 0.2447367012500763, + "learning_rate": 3.3373134333044756e-05, + "loss": 0.3565, + "step": 2202 + }, + { + "epoch": 0.39164444444444446, + "grad_norm": 0.2650652229785919, + "learning_rate": 3.3359976810516164e-05, + "loss": 0.4911, + "step": 2203 + }, + { + "epoch": 0.39182222222222224, + "grad_norm": 0.25458577275276184, + "learning_rate": 3.334681668027481e-05, + "loss": 0.5962, + "step": 2204 + }, + { + "epoch": 0.392, + "grad_norm": 0.2600625157356262, + "learning_rate": 3.33336539464257e-05, + "loss": 0.4809, + "step": 2205 + }, + { + "epoch": 0.3921777777777778, + "grad_norm": 0.38257142901420593, + "learning_rate": 3.332048861307467e-05, + "loss": 0.6504, + "step": 2206 + }, + { + "epoch": 0.39235555555555557, + "grad_norm": 0.2285948544740677, + "learning_rate": 3.3307320684328354e-05, + "loss": 0.4407, + "step": 2207 + }, + { + "epoch": 0.39253333333333335, + "grad_norm": 0.211137592792511, + "learning_rate": 3.3294150164294204e-05, + "loss": 0.4702, + "step": 2208 + }, + { + "epoch": 0.3927111111111111, + "grad_norm": 0.28607502579689026, + "learning_rate": 3.328097705708047e-05, + "loss": 0.4726, + "step": 2209 + }, + { + "epoch": 0.3928888888888889, + "grad_norm": 0.22693341970443726, + "learning_rate": 3.326780136679623e-05, + "loss": 0.3793, + "step": 2210 + }, + { + "epoch": 0.3930666666666667, + "grad_norm": 0.2918817400932312, + "learning_rate": 3.325462309755134e-05, + "loss": 0.5321, + "step": 2211 + }, + { + "epoch": 0.39324444444444445, + "grad_norm": 0.3294132947921753, + "learning_rate": 3.324144225345649e-05, + "loss": 0.5803, + "step": 2212 + }, + { + "epoch": 0.39342222222222223, + "grad_norm": 0.25932836532592773, + "learning_rate": 3.322825883862314e-05, + "loss": 0.6165, + "step": 2213 + }, + { + "epoch": 0.3936, + "grad_norm": 0.23814480006694794, + "learning_rate": 3.321507285716357e-05, + "loss": 0.42, + "step": 2214 + }, + { + "epoch": 0.3937777777777778, + "grad_norm": 0.27778419852256775, + "learning_rate": 3.320188431319088e-05, + "loss": 0.4835, + "step": 2215 + }, + { + "epoch": 0.39395555555555556, + "grad_norm": 0.295652836561203, + "learning_rate": 3.318869321081892e-05, + "loss": 0.5736, + "step": 2216 + }, + { + "epoch": 0.39413333333333334, + "grad_norm": 0.305899053812027, + "learning_rate": 3.31754995541624e-05, + "loss": 0.5611, + "step": 2217 + }, + { + "epoch": 0.3943111111111111, + "grad_norm": 0.2739260196685791, + "learning_rate": 3.3162303347336764e-05, + "loss": 0.5765, + "step": 2218 + }, + { + "epoch": 0.3944888888888889, + "grad_norm": 0.2561546266078949, + "learning_rate": 3.31491045944583e-05, + "loss": 0.5928, + "step": 2219 + }, + { + "epoch": 0.39466666666666667, + "grad_norm": 0.2588704824447632, + "learning_rate": 3.313590329964406e-05, + "loss": 0.5561, + "step": 2220 + }, + { + "epoch": 0.39484444444444444, + "grad_norm": 0.2036692202091217, + "learning_rate": 3.312269946701191e-05, + "loss": 0.4049, + "step": 2221 + }, + { + "epoch": 0.3950222222222222, + "grad_norm": 0.259208619594574, + "learning_rate": 3.31094931006805e-05, + "loss": 0.5813, + "step": 2222 + }, + { + "epoch": 0.3952, + "grad_norm": 0.25525709986686707, + "learning_rate": 3.309628420476926e-05, + "loss": 0.4203, + "step": 2223 + }, + { + "epoch": 0.3953777777777778, + "grad_norm": 0.258137583732605, + "learning_rate": 3.3083072783398416e-05, + "loss": 0.4186, + "step": 2224 + }, + { + "epoch": 0.39555555555555555, + "grad_norm": 0.30049389600753784, + "learning_rate": 3.3069858840688994e-05, + "loss": 0.502, + "step": 2225 + }, + { + "epoch": 0.3957333333333333, + "grad_norm": 0.24046258628368378, + "learning_rate": 3.305664238076278e-05, + "loss": 0.441, + "step": 2226 + }, + { + "epoch": 0.3959111111111111, + "grad_norm": 0.2497313916683197, + "learning_rate": 3.3043423407742375e-05, + "loss": 0.4136, + "step": 2227 + }, + { + "epoch": 0.3960888888888889, + "grad_norm": 0.2546222507953644, + "learning_rate": 3.3030201925751145e-05, + "loss": 0.5368, + "step": 2228 + }, + { + "epoch": 0.39626666666666666, + "grad_norm": 0.2721613645553589, + "learning_rate": 3.301697793891324e-05, + "loss": 0.4433, + "step": 2229 + }, + { + "epoch": 0.39644444444444443, + "grad_norm": 0.2567611038684845, + "learning_rate": 3.300375145135361e-05, + "loss": 0.6141, + "step": 2230 + }, + { + "epoch": 0.3966222222222222, + "grad_norm": 0.2444530874490738, + "learning_rate": 3.299052246719795e-05, + "loss": 0.5093, + "step": 2231 + }, + { + "epoch": 0.3968, + "grad_norm": 0.25389376282691956, + "learning_rate": 3.297729099057277e-05, + "loss": 0.3706, + "step": 2232 + }, + { + "epoch": 0.39697777777777776, + "grad_norm": 0.27604737877845764, + "learning_rate": 3.296405702560532e-05, + "loss": 0.5249, + "step": 2233 + }, + { + "epoch": 0.39715555555555554, + "grad_norm": 0.27496466040611267, + "learning_rate": 3.295082057642367e-05, + "loss": 0.5712, + "step": 2234 + }, + { + "epoch": 0.3973333333333333, + "grad_norm": 0.2664611339569092, + "learning_rate": 3.293758164715663e-05, + "loss": 0.543, + "step": 2235 + }, + { + "epoch": 0.3975111111111111, + "grad_norm": 0.2882423996925354, + "learning_rate": 3.29243402419338e-05, + "loss": 0.4336, + "step": 2236 + }, + { + "epoch": 0.39768888888888887, + "grad_norm": 0.2532421052455902, + "learning_rate": 3.2911096364885544e-05, + "loss": 0.4643, + "step": 2237 + }, + { + "epoch": 0.39786666666666665, + "grad_norm": 0.27396079897880554, + "learning_rate": 3.2897850020143005e-05, + "loss": 0.3918, + "step": 2238 + }, + { + "epoch": 0.3980444444444444, + "grad_norm": 0.27605608105659485, + "learning_rate": 3.2884601211838085e-05, + "loss": 0.469, + "step": 2239 + }, + { + "epoch": 0.3982222222222222, + "grad_norm": 0.34194549918174744, + "learning_rate": 3.287134994410347e-05, + "loss": 0.4957, + "step": 2240 + }, + { + "epoch": 0.3984, + "grad_norm": 0.2727879583835602, + "learning_rate": 3.28580962210726e-05, + "loss": 0.3395, + "step": 2241 + }, + { + "epoch": 0.39857777777777775, + "grad_norm": 0.31715822219848633, + "learning_rate": 3.2844840046879686e-05, + "loss": 0.4056, + "step": 2242 + }, + { + "epoch": 0.39875555555555553, + "grad_norm": 0.3175671100616455, + "learning_rate": 3.283158142565971e-05, + "loss": 0.4273, + "step": 2243 + }, + { + "epoch": 0.3989333333333333, + "grad_norm": 0.3305791914463043, + "learning_rate": 3.28183203615484e-05, + "loss": 0.3836, + "step": 2244 + }, + { + "epoch": 0.39911111111111114, + "grad_norm": 0.3780287206172943, + "learning_rate": 3.280505685868226e-05, + "loss": 0.4786, + "step": 2245 + }, + { + "epoch": 0.3992888888888889, + "grad_norm": 0.3880319595336914, + "learning_rate": 3.279179092119855e-05, + "loss": 0.417, + "step": 2246 + }, + { + "epoch": 0.3994666666666667, + "grad_norm": 0.47869691252708435, + "learning_rate": 3.277852255323529e-05, + "loss": 0.4158, + "step": 2247 + }, + { + "epoch": 0.39964444444444447, + "grad_norm": 0.3569205105304718, + "learning_rate": 3.276525175893126e-05, + "loss": 0.4886, + "step": 2248 + }, + { + "epoch": 0.39982222222222225, + "grad_norm": 0.5690917372703552, + "learning_rate": 3.2751978542425995e-05, + "loss": 0.4938, + "step": 2249 + }, + { + "epoch": 0.4, + "grad_norm": 0.5662466287612915, + "learning_rate": 3.273870290785979e-05, + "loss": 0.452, + "step": 2250 + }, + { + "epoch": 0.4001777777777778, + "grad_norm": 0.3003396987915039, + "learning_rate": 3.272542485937369e-05, + "loss": 0.5371, + "step": 2251 + }, + { + "epoch": 0.4003555555555556, + "grad_norm": 0.25178733468055725, + "learning_rate": 3.271214440110948e-05, + "loss": 0.5218, + "step": 2252 + }, + { + "epoch": 0.40053333333333335, + "grad_norm": 0.27537745237350464, + "learning_rate": 3.269886153720972e-05, + "loss": 0.4772, + "step": 2253 + }, + { + "epoch": 0.40071111111111113, + "grad_norm": 0.24302232265472412, + "learning_rate": 3.2685576271817716e-05, + "loss": 0.4593, + "step": 2254 + }, + { + "epoch": 0.4008888888888889, + "grad_norm": 0.24583515524864197, + "learning_rate": 3.267228860907751e-05, + "loss": 0.5077, + "step": 2255 + }, + { + "epoch": 0.4010666666666667, + "grad_norm": 0.2971247434616089, + "learning_rate": 3.2658998553133895e-05, + "loss": 0.3756, + "step": 2256 + }, + { + "epoch": 0.40124444444444446, + "grad_norm": 0.26474133133888245, + "learning_rate": 3.2645706108132424e-05, + "loss": 0.4709, + "step": 2257 + }, + { + "epoch": 0.40142222222222224, + "grad_norm": 0.27239760756492615, + "learning_rate": 3.263241127821938e-05, + "loss": 0.5125, + "step": 2258 + }, + { + "epoch": 0.4016, + "grad_norm": 0.2814341187477112, + "learning_rate": 3.2619114067541796e-05, + "loss": 0.5078, + "step": 2259 + }, + { + "epoch": 0.4017777777777778, + "grad_norm": 0.26514071226119995, + "learning_rate": 3.260581448024745e-05, + "loss": 0.373, + "step": 2260 + }, + { + "epoch": 0.40195555555555557, + "grad_norm": 0.268004834651947, + "learning_rate": 3.2592512520484856e-05, + "loss": 0.4904, + "step": 2261 + }, + { + "epoch": 0.40213333333333334, + "grad_norm": 0.22918185591697693, + "learning_rate": 3.257920819240328e-05, + "loss": 0.4323, + "step": 2262 + }, + { + "epoch": 0.4023111111111111, + "grad_norm": 0.32379430532455444, + "learning_rate": 3.25659015001527e-05, + "loss": 0.424, + "step": 2263 + }, + { + "epoch": 0.4024888888888889, + "grad_norm": 0.2927483022212982, + "learning_rate": 3.2552592447883865e-05, + "loss": 0.5168, + "step": 2264 + }, + { + "epoch": 0.4026666666666667, + "grad_norm": 0.23326265811920166, + "learning_rate": 3.253928103974823e-05, + "loss": 0.523, + "step": 2265 + }, + { + "epoch": 0.40284444444444445, + "grad_norm": 0.2547294795513153, + "learning_rate": 3.2525967279898015e-05, + "loss": 0.4662, + "step": 2266 + }, + { + "epoch": 0.4030222222222222, + "grad_norm": 0.25667548179626465, + "learning_rate": 3.251265117248614e-05, + "loss": 0.5386, + "step": 2267 + }, + { + "epoch": 0.4032, + "grad_norm": 0.22199925780296326, + "learning_rate": 3.249933272166629e-05, + "loss": 0.4251, + "step": 2268 + }, + { + "epoch": 0.4033777777777778, + "grad_norm": 0.28474536538124084, + "learning_rate": 3.248601193159287e-05, + "loss": 0.5572, + "step": 2269 + }, + { + "epoch": 0.40355555555555556, + "grad_norm": 0.274249792098999, + "learning_rate": 3.247268880642098e-05, + "loss": 0.4969, + "step": 2270 + }, + { + "epoch": 0.40373333333333333, + "grad_norm": 0.22203154861927032, + "learning_rate": 3.245936335030651e-05, + "loss": 0.4231, + "step": 2271 + }, + { + "epoch": 0.4039111111111111, + "grad_norm": 0.3119150996208191, + "learning_rate": 3.244603556740603e-05, + "loss": 0.4317, + "step": 2272 + }, + { + "epoch": 0.4040888888888889, + "grad_norm": 0.25260046124458313, + "learning_rate": 3.243270546187687e-05, + "loss": 0.5056, + "step": 2273 + }, + { + "epoch": 0.40426666666666666, + "grad_norm": 0.3757512867450714, + "learning_rate": 3.241937303787703e-05, + "loss": 0.4631, + "step": 2274 + }, + { + "epoch": 0.40444444444444444, + "grad_norm": 0.3302309215068817, + "learning_rate": 3.240603829956531e-05, + "loss": 0.5482, + "step": 2275 + }, + { + "epoch": 0.4046222222222222, + "grad_norm": 0.2648998200893402, + "learning_rate": 3.239270125110117e-05, + "loss": 0.5557, + "step": 2276 + }, + { + "epoch": 0.4048, + "grad_norm": 0.27261245250701904, + "learning_rate": 3.2379361896644816e-05, + "loss": 0.3654, + "step": 2277 + }, + { + "epoch": 0.40497777777777777, + "grad_norm": 0.25771352648735046, + "learning_rate": 3.236602024035716e-05, + "loss": 0.6762, + "step": 2278 + }, + { + "epoch": 0.40515555555555555, + "grad_norm": 0.2597479522228241, + "learning_rate": 3.235267628639987e-05, + "loss": 0.5005, + "step": 2279 + }, + { + "epoch": 0.4053333333333333, + "grad_norm": 0.2761436104774475, + "learning_rate": 3.2339330038935265e-05, + "loss": 0.4543, + "step": 2280 + }, + { + "epoch": 0.4055111111111111, + "grad_norm": 0.2764914631843567, + "learning_rate": 3.2325981502126433e-05, + "loss": 0.5194, + "step": 2281 + }, + { + "epoch": 0.4056888888888889, + "grad_norm": 0.2043561339378357, + "learning_rate": 3.2312630680137175e-05, + "loss": 0.4091, + "step": 2282 + }, + { + "epoch": 0.40586666666666665, + "grad_norm": 0.2624204754829407, + "learning_rate": 3.229927757713196e-05, + "loss": 0.4914, + "step": 2283 + }, + { + "epoch": 0.40604444444444443, + "grad_norm": 0.22036872804164886, + "learning_rate": 3.228592219727602e-05, + "loss": 0.4003, + "step": 2284 + }, + { + "epoch": 0.4062222222222222, + "grad_norm": 0.25706037878990173, + "learning_rate": 3.227256454473526e-05, + "loss": 0.4383, + "step": 2285 + }, + { + "epoch": 0.4064, + "grad_norm": 0.27608251571655273, + "learning_rate": 3.225920462367632e-05, + "loss": 0.4679, + "step": 2286 + }, + { + "epoch": 0.40657777777777776, + "grad_norm": 0.23131489753723145, + "learning_rate": 3.2245842438266526e-05, + "loss": 0.538, + "step": 2287 + }, + { + "epoch": 0.40675555555555554, + "grad_norm": 0.30274698138237, + "learning_rate": 3.223247799267394e-05, + "loss": 0.5218, + "step": 2288 + }, + { + "epoch": 0.4069333333333333, + "grad_norm": 0.3098407983779907, + "learning_rate": 3.221911129106728e-05, + "loss": 0.6766, + "step": 2289 + }, + { + "epoch": 0.4071111111111111, + "grad_norm": 0.26679739356040955, + "learning_rate": 3.220574233761603e-05, + "loss": 0.6673, + "step": 2290 + }, + { + "epoch": 0.40728888888888887, + "grad_norm": 0.26043859124183655, + "learning_rate": 3.219237113649032e-05, + "loss": 0.481, + "step": 2291 + }, + { + "epoch": 0.40746666666666664, + "grad_norm": 0.24777059257030487, + "learning_rate": 3.2178997691861014e-05, + "loss": 0.3568, + "step": 2292 + }, + { + "epoch": 0.4076444444444444, + "grad_norm": 0.30639949440956116, + "learning_rate": 3.2165622007899676e-05, + "loss": 0.4174, + "step": 2293 + }, + { + "epoch": 0.4078222222222222, + "grad_norm": 0.25943437218666077, + "learning_rate": 3.215224408877854e-05, + "loss": 0.3707, + "step": 2294 + }, + { + "epoch": 0.408, + "grad_norm": 0.29938480257987976, + "learning_rate": 3.213886393867057e-05, + "loss": 0.3327, + "step": 2295 + }, + { + "epoch": 0.40817777777777775, + "grad_norm": 0.26499468088150024, + "learning_rate": 3.21254815617494e-05, + "loss": 0.3807, + "step": 2296 + }, + { + "epoch": 0.4083555555555556, + "grad_norm": 0.32577332854270935, + "learning_rate": 3.21120969621894e-05, + "loss": 0.4246, + "step": 2297 + }, + { + "epoch": 0.40853333333333336, + "grad_norm": 0.36527153849601746, + "learning_rate": 3.209871014416557e-05, + "loss": 0.4065, + "step": 2298 + }, + { + "epoch": 0.40871111111111114, + "grad_norm": 0.38569098711013794, + "learning_rate": 3.208532111185365e-05, + "loss": 0.4408, + "step": 2299 + }, + { + "epoch": 0.4088888888888889, + "grad_norm": 0.43131449818611145, + "learning_rate": 3.207192986943006e-05, + "loss": 0.4662, + "step": 2300 + }, + { + "epoch": 0.4090666666666667, + "grad_norm": 0.31158092617988586, + "learning_rate": 3.205853642107192e-05, + "loss": 0.68, + "step": 2301 + }, + { + "epoch": 0.40924444444444447, + "grad_norm": 0.2718139588832855, + "learning_rate": 3.204514077095699e-05, + "loss": 0.3825, + "step": 2302 + }, + { + "epoch": 0.40942222222222224, + "grad_norm": 0.22665324807167053, + "learning_rate": 3.203174292326378e-05, + "loss": 0.4934, + "step": 2303 + }, + { + "epoch": 0.4096, + "grad_norm": 0.2552378177642822, + "learning_rate": 3.2018342882171445e-05, + "loss": 0.3832, + "step": 2304 + }, + { + "epoch": 0.4097777777777778, + "grad_norm": 0.3228744566440582, + "learning_rate": 3.2004940651859844e-05, + "loss": 0.4984, + "step": 2305 + }, + { + "epoch": 0.4099555555555556, + "grad_norm": 0.24413657188415527, + "learning_rate": 3.19915362365095e-05, + "loss": 0.5115, + "step": 2306 + }, + { + "epoch": 0.41013333333333335, + "grad_norm": 0.315473347902298, + "learning_rate": 3.197812964030164e-05, + "loss": 0.4778, + "step": 2307 + }, + { + "epoch": 0.4103111111111111, + "grad_norm": 0.26569363474845886, + "learning_rate": 3.196472086741815e-05, + "loss": 0.4047, + "step": 2308 + }, + { + "epoch": 0.4104888888888889, + "grad_norm": 0.20616759359836578, + "learning_rate": 3.195130992204161e-05, + "loss": 0.3913, + "step": 2309 + }, + { + "epoch": 0.4106666666666667, + "grad_norm": 0.2646740972995758, + "learning_rate": 3.193789680835527e-05, + "loss": 0.5614, + "step": 2310 + }, + { + "epoch": 0.41084444444444446, + "grad_norm": 0.3248431980609894, + "learning_rate": 3.192448153054306e-05, + "loss": 0.6064, + "step": 2311 + }, + { + "epoch": 0.41102222222222223, + "grad_norm": 0.28016141057014465, + "learning_rate": 3.191106409278959e-05, + "loss": 0.4897, + "step": 2312 + }, + { + "epoch": 0.4112, + "grad_norm": 0.24610549211502075, + "learning_rate": 3.189764449928012e-05, + "loss": 0.463, + "step": 2313 + }, + { + "epoch": 0.4113777777777778, + "grad_norm": 0.2482464462518692, + "learning_rate": 3.1884222754200625e-05, + "loss": 0.3951, + "step": 2314 + }, + { + "epoch": 0.41155555555555556, + "grad_norm": 0.31816866993904114, + "learning_rate": 3.1870798861737705e-05, + "loss": 0.5978, + "step": 2315 + }, + { + "epoch": 0.41173333333333334, + "grad_norm": 0.2570197284221649, + "learning_rate": 3.185737282607867e-05, + "loss": 0.5063, + "step": 2316 + }, + { + "epoch": 0.4119111111111111, + "grad_norm": 0.23591694235801697, + "learning_rate": 3.1843944651411456e-05, + "loss": 0.5131, + "step": 2317 + }, + { + "epoch": 0.4120888888888889, + "grad_norm": 0.19064725935459137, + "learning_rate": 3.183051434192471e-05, + "loss": 0.4117, + "step": 2318 + }, + { + "epoch": 0.41226666666666667, + "grad_norm": 0.24479931592941284, + "learning_rate": 3.181708190180771e-05, + "loss": 0.3903, + "step": 2319 + }, + { + "epoch": 0.41244444444444445, + "grad_norm": 0.21253429353237152, + "learning_rate": 3.180364733525043e-05, + "loss": 0.4956, + "step": 2320 + }, + { + "epoch": 0.4126222222222222, + "grad_norm": 0.2751752734184265, + "learning_rate": 3.179021064644347e-05, + "loss": 0.4588, + "step": 2321 + }, + { + "epoch": 0.4128, + "grad_norm": 0.27173420786857605, + "learning_rate": 3.177677183957813e-05, + "loss": 0.6934, + "step": 2322 + }, + { + "epoch": 0.4129777777777778, + "grad_norm": 0.2785038650035858, + "learning_rate": 3.176333091884635e-05, + "loss": 0.4604, + "step": 2323 + }, + { + "epoch": 0.41315555555555555, + "grad_norm": 0.25333958864212036, + "learning_rate": 3.174988788844072e-05, + "loss": 0.3794, + "step": 2324 + }, + { + "epoch": 0.41333333333333333, + "grad_norm": 0.2173972725868225, + "learning_rate": 3.173644275255451e-05, + "loss": 0.5246, + "step": 2325 + }, + { + "epoch": 0.4135111111111111, + "grad_norm": 0.28589218854904175, + "learning_rate": 3.172299551538164e-05, + "loss": 0.5827, + "step": 2326 + }, + { + "epoch": 0.4136888888888889, + "grad_norm": 0.2525719702243805, + "learning_rate": 3.170954618111669e-05, + "loss": 0.4617, + "step": 2327 + }, + { + "epoch": 0.41386666666666666, + "grad_norm": 0.2642752230167389, + "learning_rate": 3.169609475395486e-05, + "loss": 0.5869, + "step": 2328 + }, + { + "epoch": 0.41404444444444444, + "grad_norm": 0.2701886296272278, + "learning_rate": 3.1682641238092064e-05, + "loss": 0.5071, + "step": 2329 + }, + { + "epoch": 0.4142222222222222, + "grad_norm": 0.27753856778144836, + "learning_rate": 3.166918563772481e-05, + "loss": 0.4908, + "step": 2330 + }, + { + "epoch": 0.4144, + "grad_norm": 0.2360255867242813, + "learning_rate": 3.1655727957050285e-05, + "loss": 0.3829, + "step": 2331 + }, + { + "epoch": 0.41457777777777777, + "grad_norm": 0.25760018825531006, + "learning_rate": 3.1642268200266317e-05, + "loss": 0.52, + "step": 2332 + }, + { + "epoch": 0.41475555555555554, + "grad_norm": 0.265447735786438, + "learning_rate": 3.162880637157139e-05, + "loss": 0.5096, + "step": 2333 + }, + { + "epoch": 0.4149333333333333, + "grad_norm": 0.31820639967918396, + "learning_rate": 3.1615342475164636e-05, + "loss": 0.5408, + "step": 2334 + }, + { + "epoch": 0.4151111111111111, + "grad_norm": 0.2752600908279419, + "learning_rate": 3.16018765152458e-05, + "loss": 0.4864, + "step": 2335 + }, + { + "epoch": 0.4152888888888889, + "grad_norm": 0.308669775724411, + "learning_rate": 3.158840849601532e-05, + "loss": 0.5432, + "step": 2336 + }, + { + "epoch": 0.41546666666666665, + "grad_norm": 0.233273446559906, + "learning_rate": 3.157493842167423e-05, + "loss": 0.405, + "step": 2337 + }, + { + "epoch": 0.4156444444444444, + "grad_norm": 0.27532798051834106, + "learning_rate": 3.156146629642425e-05, + "loss": 0.3612, + "step": 2338 + }, + { + "epoch": 0.4158222222222222, + "grad_norm": 0.3004027009010315, + "learning_rate": 3.15479921244677e-05, + "loss": 0.5954, + "step": 2339 + }, + { + "epoch": 0.416, + "grad_norm": 0.25567081570625305, + "learning_rate": 3.153451591000756e-05, + "loss": 0.3939, + "step": 2340 + }, + { + "epoch": 0.41617777777777776, + "grad_norm": 0.2970482409000397, + "learning_rate": 3.152103765724743e-05, + "loss": 0.4206, + "step": 2341 + }, + { + "epoch": 0.41635555555555553, + "grad_norm": 0.3003411591053009, + "learning_rate": 3.150755737039157e-05, + "loss": 0.2856, + "step": 2342 + }, + { + "epoch": 0.4165333333333333, + "grad_norm": 0.32657188177108765, + "learning_rate": 3.149407505364486e-05, + "loss": 0.3656, + "step": 2343 + }, + { + "epoch": 0.4167111111111111, + "grad_norm": 0.29524633288383484, + "learning_rate": 3.148059071121282e-05, + "loss": 0.4721, + "step": 2344 + }, + { + "epoch": 0.41688888888888886, + "grad_norm": 0.2855384051799774, + "learning_rate": 3.146710434730159e-05, + "loss": 0.3702, + "step": 2345 + }, + { + "epoch": 0.41706666666666664, + "grad_norm": 0.3212869167327881, + "learning_rate": 3.145361596611795e-05, + "loss": 0.4172, + "step": 2346 + }, + { + "epoch": 0.4172444444444444, + "grad_norm": 0.44451799988746643, + "learning_rate": 3.1440125571869306e-05, + "loss": 0.4715, + "step": 2347 + }, + { + "epoch": 0.4174222222222222, + "grad_norm": 0.48374512791633606, + "learning_rate": 3.142663316876368e-05, + "loss": 0.5269, + "step": 2348 + }, + { + "epoch": 0.4176, + "grad_norm": 0.30065396428108215, + "learning_rate": 3.141313876100976e-05, + "loss": 0.3593, + "step": 2349 + }, + { + "epoch": 0.4177777777777778, + "grad_norm": 0.4186636805534363, + "learning_rate": 3.139964235281682e-05, + "loss": 0.4197, + "step": 2350 + }, + { + "epoch": 0.4179555555555556, + "grad_norm": 0.2300005406141281, + "learning_rate": 3.138614394839476e-05, + "loss": 0.5526, + "step": 2351 + }, + { + "epoch": 0.41813333333333336, + "grad_norm": 0.23057812452316284, + "learning_rate": 3.137264355195413e-05, + "loss": 0.4315, + "step": 2352 + }, + { + "epoch": 0.41831111111111113, + "grad_norm": 0.29620182514190674, + "learning_rate": 3.135914116770609e-05, + "loss": 0.5519, + "step": 2353 + }, + { + "epoch": 0.4184888888888889, + "grad_norm": 0.2461867779493332, + "learning_rate": 3.134563679986238e-05, + "loss": 0.4729, + "step": 2354 + }, + { + "epoch": 0.4186666666666667, + "grad_norm": 0.22086381912231445, + "learning_rate": 3.133213045263543e-05, + "loss": 0.3936, + "step": 2355 + }, + { + "epoch": 0.41884444444444446, + "grad_norm": 0.28992173075675964, + "learning_rate": 3.1318622130238236e-05, + "loss": 0.4702, + "step": 2356 + }, + { + "epoch": 0.41902222222222224, + "grad_norm": 0.26983410120010376, + "learning_rate": 3.1305111836884425e-05, + "loss": 0.5343, + "step": 2357 + }, + { + "epoch": 0.4192, + "grad_norm": 0.3131723701953888, + "learning_rate": 3.129159957678824e-05, + "loss": 0.4721, + "step": 2358 + }, + { + "epoch": 0.4193777777777778, + "grad_norm": 0.2247696965932846, + "learning_rate": 3.127808535416454e-05, + "loss": 0.3829, + "step": 2359 + }, + { + "epoch": 0.41955555555555557, + "grad_norm": 0.27495676279067993, + "learning_rate": 3.126456917322878e-05, + "loss": 0.4712, + "step": 2360 + }, + { + "epoch": 0.41973333333333335, + "grad_norm": 0.25901392102241516, + "learning_rate": 3.1251051038197055e-05, + "loss": 0.4267, + "step": 2361 + }, + { + "epoch": 0.4199111111111111, + "grad_norm": 0.3064854145050049, + "learning_rate": 3.123753095328604e-05, + "loss": 0.4999, + "step": 2362 + }, + { + "epoch": 0.4200888888888889, + "grad_norm": 0.22995403409004211, + "learning_rate": 3.1224008922713044e-05, + "loss": 0.4914, + "step": 2363 + }, + { + "epoch": 0.4202666666666667, + "grad_norm": 0.26995977759361267, + "learning_rate": 3.121048495069596e-05, + "loss": 0.4898, + "step": 2364 + }, + { + "epoch": 0.42044444444444445, + "grad_norm": 0.21759191155433655, + "learning_rate": 3.11969590414533e-05, + "loss": 0.3516, + "step": 2365 + }, + { + "epoch": 0.42062222222222223, + "grad_norm": 0.21706731617450714, + "learning_rate": 3.118343119920418e-05, + "loss": 0.4765, + "step": 2366 + }, + { + "epoch": 0.4208, + "grad_norm": 0.22309869527816772, + "learning_rate": 3.11699014281683e-05, + "loss": 0.4854, + "step": 2367 + }, + { + "epoch": 0.4209777777777778, + "grad_norm": 0.21380077302455902, + "learning_rate": 3.1156369732566006e-05, + "loss": 0.4255, + "step": 2368 + }, + { + "epoch": 0.42115555555555556, + "grad_norm": 0.3022780418395996, + "learning_rate": 3.114283611661818e-05, + "loss": 0.6779, + "step": 2369 + }, + { + "epoch": 0.42133333333333334, + "grad_norm": 0.25827300548553467, + "learning_rate": 3.1129300584546375e-05, + "loss": 0.5316, + "step": 2370 + }, + { + "epoch": 0.4215111111111111, + "grad_norm": 0.2360515594482422, + "learning_rate": 3.111576314057268e-05, + "loss": 0.4979, + "step": 2371 + }, + { + "epoch": 0.4216888888888889, + "grad_norm": 0.23165373504161835, + "learning_rate": 3.1102223788919824e-05, + "loss": 0.4831, + "step": 2372 + }, + { + "epoch": 0.42186666666666667, + "grad_norm": 0.29023346304893494, + "learning_rate": 3.10886825338111e-05, + "loss": 0.4932, + "step": 2373 + }, + { + "epoch": 0.42204444444444444, + "grad_norm": 0.24942994117736816, + "learning_rate": 3.107513937947041e-05, + "loss": 0.3652, + "step": 2374 + }, + { + "epoch": 0.4222222222222222, + "grad_norm": 0.276439368724823, + "learning_rate": 3.1061594330122246e-05, + "loss": 0.5024, + "step": 2375 + }, + { + "epoch": 0.4224, + "grad_norm": 0.2673211693763733, + "learning_rate": 3.104804738999169e-05, + "loss": 0.4082, + "step": 2376 + }, + { + "epoch": 0.4225777777777778, + "grad_norm": 0.2506446838378906, + "learning_rate": 3.103449856330443e-05, + "loss": 0.4285, + "step": 2377 + }, + { + "epoch": 0.42275555555555555, + "grad_norm": 0.2678147852420807, + "learning_rate": 3.102094785428671e-05, + "loss": 0.5667, + "step": 2378 + }, + { + "epoch": 0.42293333333333333, + "grad_norm": 0.2852556109428406, + "learning_rate": 3.100739526716538e-05, + "loss": 0.3461, + "step": 2379 + }, + { + "epoch": 0.4231111111111111, + "grad_norm": 0.28759077191352844, + "learning_rate": 3.099384080616789e-05, + "loss": 0.5604, + "step": 2380 + }, + { + "epoch": 0.4232888888888889, + "grad_norm": 0.27783697843551636, + "learning_rate": 3.098028447552224e-05, + "loss": 0.4894, + "step": 2381 + }, + { + "epoch": 0.42346666666666666, + "grad_norm": 0.2880311906337738, + "learning_rate": 3.0966726279457034e-05, + "loss": 0.4868, + "step": 2382 + }, + { + "epoch": 0.42364444444444443, + "grad_norm": 0.30736860632896423, + "learning_rate": 3.0953166222201476e-05, + "loss": 0.572, + "step": 2383 + }, + { + "epoch": 0.4238222222222222, + "grad_norm": 0.320116251707077, + "learning_rate": 3.09396043079853e-05, + "loss": 0.4734, + "step": 2384 + }, + { + "epoch": 0.424, + "grad_norm": 0.24316641688346863, + "learning_rate": 3.092604054103888e-05, + "loss": 0.4084, + "step": 2385 + }, + { + "epoch": 0.42417777777777776, + "grad_norm": 0.2622350752353668, + "learning_rate": 3.091247492559312e-05, + "loss": 0.5172, + "step": 2386 + }, + { + "epoch": 0.42435555555555554, + "grad_norm": 0.32751017808914185, + "learning_rate": 3.089890746587953e-05, + "loss": 0.4862, + "step": 2387 + }, + { + "epoch": 0.4245333333333333, + "grad_norm": 0.26200732588768005, + "learning_rate": 3.088533816613017e-05, + "loss": 0.5026, + "step": 2388 + }, + { + "epoch": 0.4247111111111111, + "grad_norm": 0.21019378304481506, + "learning_rate": 3.087176703057769e-05, + "loss": 0.3166, + "step": 2389 + }, + { + "epoch": 0.42488888888888887, + "grad_norm": 0.21405984461307526, + "learning_rate": 3.085819406345532e-05, + "loss": 0.3473, + "step": 2390 + }, + { + "epoch": 0.42506666666666665, + "grad_norm": 0.30213499069213867, + "learning_rate": 3.0844619268996845e-05, + "loss": 0.4899, + "step": 2391 + }, + { + "epoch": 0.4252444444444444, + "grad_norm": 0.3040148913860321, + "learning_rate": 3.083104265143663e-05, + "loss": 0.4131, + "step": 2392 + }, + { + "epoch": 0.4254222222222222, + "grad_norm": 0.31165146827697754, + "learning_rate": 3.08174642150096e-05, + "loss": 0.4622, + "step": 2393 + }, + { + "epoch": 0.4256, + "grad_norm": 0.6099965572357178, + "learning_rate": 3.0803883963951255e-05, + "loss": 0.474, + "step": 2394 + }, + { + "epoch": 0.42577777777777776, + "grad_norm": 0.26763227581977844, + "learning_rate": 3.0790301902497666e-05, + "loss": 0.354, + "step": 2395 + }, + { + "epoch": 0.42595555555555553, + "grad_norm": 0.2983456254005432, + "learning_rate": 3.0776718034885454e-05, + "loss": 0.3372, + "step": 2396 + }, + { + "epoch": 0.4261333333333333, + "grad_norm": 0.3489031195640564, + "learning_rate": 3.07631323653518e-05, + "loss": 0.3672, + "step": 2397 + }, + { + "epoch": 0.4263111111111111, + "grad_norm": 0.34307318925857544, + "learning_rate": 3.074954489813449e-05, + "loss": 0.359, + "step": 2398 + }, + { + "epoch": 0.42648888888888886, + "grad_norm": 0.38835078477859497, + "learning_rate": 3.0735955637471794e-05, + "loss": 0.4397, + "step": 2399 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 0.44580212235450745, + "learning_rate": 3.072236458760262e-05, + "loss": 0.4026, + "step": 2400 + }, + { + "epoch": 0.42684444444444447, + "grad_norm": 0.27241140604019165, + "learning_rate": 3.0708771752766394e-05, + "loss": 0.4897, + "step": 2401 + }, + { + "epoch": 0.42702222222222225, + "grad_norm": 0.2954046428203583, + "learning_rate": 3.06951771372031e-05, + "loss": 0.449, + "step": 2402 + }, + { + "epoch": 0.4272, + "grad_norm": 0.3490567207336426, + "learning_rate": 3.068158074515328e-05, + "loss": 0.6214, + "step": 2403 + }, + { + "epoch": 0.4273777777777778, + "grad_norm": 0.2830658555030823, + "learning_rate": 3.0667982580858044e-05, + "loss": 0.5199, + "step": 2404 + }, + { + "epoch": 0.4275555555555556, + "grad_norm": 0.2707262337207794, + "learning_rate": 3.0654382648559026e-05, + "loss": 0.468, + "step": 2405 + }, + { + "epoch": 0.42773333333333335, + "grad_norm": 0.26871082186698914, + "learning_rate": 3.064078095249844e-05, + "loss": 0.4958, + "step": 2406 + }, + { + "epoch": 0.42791111111111113, + "grad_norm": 0.2855778932571411, + "learning_rate": 3.062717749691904e-05, + "loss": 0.483, + "step": 2407 + }, + { + "epoch": 0.4280888888888889, + "grad_norm": 0.3035014867782593, + "learning_rate": 3.0613572286064125e-05, + "loss": 0.5826, + "step": 2408 + }, + { + "epoch": 0.4282666666666667, + "grad_norm": 0.28705570101737976, + "learning_rate": 3.059996532417754e-05, + "loss": 0.5547, + "step": 2409 + }, + { + "epoch": 0.42844444444444446, + "grad_norm": 0.31234225630760193, + "learning_rate": 3.058635661550369e-05, + "loss": 0.5811, + "step": 2410 + }, + { + "epoch": 0.42862222222222224, + "grad_norm": 0.3076266348361969, + "learning_rate": 3.0572746164287514e-05, + "loss": 0.5508, + "step": 2411 + }, + { + "epoch": 0.4288, + "grad_norm": 0.28488075733184814, + "learning_rate": 3.055913397477448e-05, + "loss": 0.4335, + "step": 2412 + }, + { + "epoch": 0.4289777777777778, + "grad_norm": 0.22385305166244507, + "learning_rate": 3.054552005121064e-05, + "loss": 0.402, + "step": 2413 + }, + { + "epoch": 0.42915555555555557, + "grad_norm": 0.22280387580394745, + "learning_rate": 3.053190439784253e-05, + "loss": 0.493, + "step": 2414 + }, + { + "epoch": 0.42933333333333334, + "grad_norm": 0.2640891671180725, + "learning_rate": 3.051828701891729e-05, + "loss": 0.5512, + "step": 2415 + }, + { + "epoch": 0.4295111111111111, + "grad_norm": 0.26429641246795654, + "learning_rate": 3.050466791868254e-05, + "loss": 0.4309, + "step": 2416 + }, + { + "epoch": 0.4296888888888889, + "grad_norm": 0.20089729130268097, + "learning_rate": 3.049104710138647e-05, + "loss": 0.3616, + "step": 2417 + }, + { + "epoch": 0.4298666666666667, + "grad_norm": 0.24620403349399567, + "learning_rate": 3.0477424571277807e-05, + "loss": 0.468, + "step": 2418 + }, + { + "epoch": 0.43004444444444445, + "grad_norm": 0.2684881091117859, + "learning_rate": 3.0463800332605784e-05, + "loss": 0.4656, + "step": 2419 + }, + { + "epoch": 0.43022222222222223, + "grad_norm": 0.2627245783805847, + "learning_rate": 3.0450174389620205e-05, + "loss": 0.4709, + "step": 2420 + }, + { + "epoch": 0.4304, + "grad_norm": 0.26581257581710815, + "learning_rate": 3.0436546746571372e-05, + "loss": 0.5262, + "step": 2421 + }, + { + "epoch": 0.4305777777777778, + "grad_norm": 0.3223540186882019, + "learning_rate": 3.0422917407710137e-05, + "loss": 0.3956, + "step": 2422 + }, + { + "epoch": 0.43075555555555556, + "grad_norm": 0.2730993926525116, + "learning_rate": 3.040928637728787e-05, + "loss": 0.5748, + "step": 2423 + }, + { + "epoch": 0.43093333333333333, + "grad_norm": 0.26168909668922424, + "learning_rate": 3.0395653659556488e-05, + "loss": 0.4517, + "step": 2424 + }, + { + "epoch": 0.4311111111111111, + "grad_norm": 0.21035990118980408, + "learning_rate": 3.0382019258768403e-05, + "loss": 0.4038, + "step": 2425 + }, + { + "epoch": 0.4312888888888889, + "grad_norm": 0.2359234243631363, + "learning_rate": 3.0368383179176585e-05, + "loss": 0.5574, + "step": 2426 + }, + { + "epoch": 0.43146666666666667, + "grad_norm": 0.2905442416667938, + "learning_rate": 3.0354745425034498e-05, + "loss": 0.504, + "step": 2427 + }, + { + "epoch": 0.43164444444444444, + "grad_norm": 0.26843884587287903, + "learning_rate": 3.034110600059616e-05, + "loss": 0.4856, + "step": 2428 + }, + { + "epoch": 0.4318222222222222, + "grad_norm": 0.26153868436813354, + "learning_rate": 3.032746491011607e-05, + "loss": 0.3982, + "step": 2429 + }, + { + "epoch": 0.432, + "grad_norm": 0.25243306159973145, + "learning_rate": 3.0313822157849287e-05, + "loss": 0.4232, + "step": 2430 + }, + { + "epoch": 0.43217777777777777, + "grad_norm": 0.2908584177494049, + "learning_rate": 3.0300177748051373e-05, + "loss": 0.612, + "step": 2431 + }, + { + "epoch": 0.43235555555555555, + "grad_norm": 0.23297370970249176, + "learning_rate": 3.028653168497838e-05, + "loss": 0.3475, + "step": 2432 + }, + { + "epoch": 0.4325333333333333, + "grad_norm": 0.27595868706703186, + "learning_rate": 3.0272883972886935e-05, + "loss": 0.5513, + "step": 2433 + }, + { + "epoch": 0.4327111111111111, + "grad_norm": 0.24053683876991272, + "learning_rate": 3.0259234616034116e-05, + "loss": 0.4449, + "step": 2434 + }, + { + "epoch": 0.4328888888888889, + "grad_norm": 0.2623700201511383, + "learning_rate": 3.0245583618677558e-05, + "loss": 0.535, + "step": 2435 + }, + { + "epoch": 0.43306666666666666, + "grad_norm": 0.3136540651321411, + "learning_rate": 3.023193098507538e-05, + "loss": 0.7098, + "step": 2436 + }, + { + "epoch": 0.43324444444444443, + "grad_norm": 0.2571598291397095, + "learning_rate": 3.0218276719486244e-05, + "loss": 0.481, + "step": 2437 + }, + { + "epoch": 0.4334222222222222, + "grad_norm": 0.28701069951057434, + "learning_rate": 3.020462082616928e-05, + "loss": 0.5038, + "step": 2438 + }, + { + "epoch": 0.4336, + "grad_norm": 0.2790224552154541, + "learning_rate": 3.0190963309384156e-05, + "loss": 0.4856, + "step": 2439 + }, + { + "epoch": 0.43377777777777776, + "grad_norm": 0.258585125207901, + "learning_rate": 3.0177304173391037e-05, + "loss": 0.4195, + "step": 2440 + }, + { + "epoch": 0.43395555555555554, + "grad_norm": 0.24113842844963074, + "learning_rate": 3.016364342245059e-05, + "loss": 0.4607, + "step": 2441 + }, + { + "epoch": 0.4341333333333333, + "grad_norm": 0.3220059275627136, + "learning_rate": 3.0149981060823995e-05, + "loss": 0.3382, + "step": 2442 + }, + { + "epoch": 0.4343111111111111, + "grad_norm": 0.2911723852157593, + "learning_rate": 3.013631709277292e-05, + "loss": 0.4579, + "step": 2443 + }, + { + "epoch": 0.43448888888888887, + "grad_norm": 0.344404011964798, + "learning_rate": 3.0122651522559553e-05, + "loss": 0.395, + "step": 2444 + }, + { + "epoch": 0.43466666666666665, + "grad_norm": 0.3078421950340271, + "learning_rate": 3.0108984354446556e-05, + "loss": 0.3424, + "step": 2445 + }, + { + "epoch": 0.4348444444444444, + "grad_norm": 0.4224089980125427, + "learning_rate": 3.0095315592697126e-05, + "loss": 0.438, + "step": 2446 + }, + { + "epoch": 0.4350222222222222, + "grad_norm": 0.30957403779029846, + "learning_rate": 3.008164524157491e-05, + "loss": 0.3914, + "step": 2447 + }, + { + "epoch": 0.4352, + "grad_norm": 0.3997316062450409, + "learning_rate": 3.00679733053441e-05, + "loss": 0.3539, + "step": 2448 + }, + { + "epoch": 0.43537777777777775, + "grad_norm": 0.3600807189941406, + "learning_rate": 3.005429978826934e-05, + "loss": 0.4227, + "step": 2449 + }, + { + "epoch": 0.43555555555555553, + "grad_norm": 0.44279560446739197, + "learning_rate": 3.0040624694615803e-05, + "loss": 0.5196, + "step": 2450 + }, + { + "epoch": 0.4357333333333333, + "grad_norm": 0.26898548007011414, + "learning_rate": 3.002694802864912e-05, + "loss": 0.5322, + "step": 2451 + }, + { + "epoch": 0.43591111111111114, + "grad_norm": 0.23317627608776093, + "learning_rate": 3.0013269794635446e-05, + "loss": 0.4661, + "step": 2452 + }, + { + "epoch": 0.4360888888888889, + "grad_norm": 0.22133682668209076, + "learning_rate": 2.9999589996841386e-05, + "loss": 0.3919, + "step": 2453 + }, + { + "epoch": 0.4362666666666667, + "grad_norm": 0.2459014505147934, + "learning_rate": 2.9985908639534075e-05, + "loss": 0.3396, + "step": 2454 + }, + { + "epoch": 0.43644444444444447, + "grad_norm": 0.23966382443904877, + "learning_rate": 2.9972225726981113e-05, + "loss": 0.4671, + "step": 2455 + }, + { + "epoch": 0.43662222222222224, + "grad_norm": 0.2418704479932785, + "learning_rate": 2.9958541263450584e-05, + "loss": 0.4959, + "step": 2456 + }, + { + "epoch": 0.4368, + "grad_norm": 0.2714509665966034, + "learning_rate": 2.9944855253211052e-05, + "loss": 0.4956, + "step": 2457 + }, + { + "epoch": 0.4369777777777778, + "grad_norm": 0.23329026997089386, + "learning_rate": 2.9931167700531578e-05, + "loss": 0.4278, + "step": 2458 + }, + { + "epoch": 0.4371555555555556, + "grad_norm": 0.2838408350944519, + "learning_rate": 2.991747860968168e-05, + "loss": 0.5193, + "step": 2459 + }, + { + "epoch": 0.43733333333333335, + "grad_norm": 0.25937822461128235, + "learning_rate": 2.9903787984931396e-05, + "loss": 0.4943, + "step": 2460 + }, + { + "epoch": 0.43751111111111113, + "grad_norm": 0.2656700909137726, + "learning_rate": 2.9890095830551207e-05, + "loss": 0.4261, + "step": 2461 + }, + { + "epoch": 0.4376888888888889, + "grad_norm": 0.24404363334178925, + "learning_rate": 2.9876402150812078e-05, + "loss": 0.4127, + "step": 2462 + }, + { + "epoch": 0.4378666666666667, + "grad_norm": 0.29921919107437134, + "learning_rate": 2.9862706949985463e-05, + "loss": 0.4962, + "step": 2463 + }, + { + "epoch": 0.43804444444444446, + "grad_norm": 0.2477061003446579, + "learning_rate": 2.984901023234327e-05, + "loss": 0.3987, + "step": 2464 + }, + { + "epoch": 0.43822222222222224, + "grad_norm": 0.2864641547203064, + "learning_rate": 2.9835312002157913e-05, + "loss": 0.4344, + "step": 2465 + }, + { + "epoch": 0.4384, + "grad_norm": 0.316698282957077, + "learning_rate": 2.9821612263702226e-05, + "loss": 0.3895, + "step": 2466 + }, + { + "epoch": 0.4385777777777778, + "grad_norm": 0.36035674810409546, + "learning_rate": 2.9807911021249573e-05, + "loss": 0.3847, + "step": 2467 + }, + { + "epoch": 0.43875555555555557, + "grad_norm": 0.22015735507011414, + "learning_rate": 2.9794208279073743e-05, + "loss": 0.377, + "step": 2468 + }, + { + "epoch": 0.43893333333333334, + "grad_norm": 0.30792272090911865, + "learning_rate": 2.978050404144901e-05, + "loss": 0.5587, + "step": 2469 + }, + { + "epoch": 0.4391111111111111, + "grad_norm": 0.2976752519607544, + "learning_rate": 2.9766798312650112e-05, + "loss": 0.4712, + "step": 2470 + }, + { + "epoch": 0.4392888888888889, + "grad_norm": 0.2652478814125061, + "learning_rate": 2.9753091096952255e-05, + "loss": 0.4242, + "step": 2471 + }, + { + "epoch": 0.43946666666666667, + "grad_norm": 0.3099959194660187, + "learning_rate": 2.973938239863111e-05, + "loss": 0.5247, + "step": 2472 + }, + { + "epoch": 0.43964444444444445, + "grad_norm": 0.3074376881122589, + "learning_rate": 2.97256722219628e-05, + "loss": 0.6887, + "step": 2473 + }, + { + "epoch": 0.4398222222222222, + "grad_norm": 0.2646455764770508, + "learning_rate": 2.971196057122393e-05, + "loss": 0.526, + "step": 2474 + }, + { + "epoch": 0.44, + "grad_norm": 0.31363919377326965, + "learning_rate": 2.9698247450691525e-05, + "loss": 0.4933, + "step": 2475 + }, + { + "epoch": 0.4401777777777778, + "grad_norm": 0.3091849684715271, + "learning_rate": 2.9684532864643122e-05, + "loss": 0.4849, + "step": 2476 + }, + { + "epoch": 0.44035555555555556, + "grad_norm": 0.2621522545814514, + "learning_rate": 2.9670816817356668e-05, + "loss": 0.4254, + "step": 2477 + }, + { + "epoch": 0.44053333333333333, + "grad_norm": 0.23403044044971466, + "learning_rate": 2.9657099313110593e-05, + "loss": 0.4494, + "step": 2478 + }, + { + "epoch": 0.4407111111111111, + "grad_norm": 0.24549809098243713, + "learning_rate": 2.9643380356183775e-05, + "loss": 0.5049, + "step": 2479 + }, + { + "epoch": 0.4408888888888889, + "grad_norm": 0.24203680455684662, + "learning_rate": 2.9629659950855544e-05, + "loss": 0.5751, + "step": 2480 + }, + { + "epoch": 0.44106666666666666, + "grad_norm": 0.24386686086654663, + "learning_rate": 2.9615938101405676e-05, + "loss": 0.4483, + "step": 2481 + }, + { + "epoch": 0.44124444444444444, + "grad_norm": 0.27281898260116577, + "learning_rate": 2.9602214812114415e-05, + "loss": 0.4725, + "step": 2482 + }, + { + "epoch": 0.4414222222222222, + "grad_norm": 0.2585911452770233, + "learning_rate": 2.958849008726242e-05, + "loss": 0.347, + "step": 2483 + }, + { + "epoch": 0.4416, + "grad_norm": 0.35200974345207214, + "learning_rate": 2.9574763931130843e-05, + "loss": 0.4906, + "step": 2484 + }, + { + "epoch": 0.44177777777777777, + "grad_norm": 0.2283005714416504, + "learning_rate": 2.956103634800126e-05, + "loss": 0.3794, + "step": 2485 + }, + { + "epoch": 0.44195555555555555, + "grad_norm": 0.21556305885314941, + "learning_rate": 2.9547307342155673e-05, + "loss": 0.3942, + "step": 2486 + }, + { + "epoch": 0.4421333333333333, + "grad_norm": 0.26582440733909607, + "learning_rate": 2.953357691787656e-05, + "loss": 0.5453, + "step": 2487 + }, + { + "epoch": 0.4423111111111111, + "grad_norm": 0.3069571256637573, + "learning_rate": 2.9519845079446823e-05, + "loss": 0.6818, + "step": 2488 + }, + { + "epoch": 0.4424888888888889, + "grad_norm": 0.28764209151268005, + "learning_rate": 2.9506111831149818e-05, + "loss": 0.4955, + "step": 2489 + }, + { + "epoch": 0.44266666666666665, + "grad_norm": 0.23868568241596222, + "learning_rate": 2.9492377177269315e-05, + "loss": 0.4434, + "step": 2490 + }, + { + "epoch": 0.44284444444444443, + "grad_norm": 0.253560334444046, + "learning_rate": 2.9478641122089562e-05, + "loss": 0.3618, + "step": 2491 + }, + { + "epoch": 0.4430222222222222, + "grad_norm": 0.3298662602901459, + "learning_rate": 2.9464903669895205e-05, + "loss": 0.4069, + "step": 2492 + }, + { + "epoch": 0.4432, + "grad_norm": 0.30510053038597107, + "learning_rate": 2.9451164824971356e-05, + "loss": 0.4507, + "step": 2493 + }, + { + "epoch": 0.44337777777777776, + "grad_norm": 0.27811530232429504, + "learning_rate": 2.943742459160354e-05, + "loss": 0.4197, + "step": 2494 + }, + { + "epoch": 0.44355555555555554, + "grad_norm": 0.32515591382980347, + "learning_rate": 2.942368297407772e-05, + "loss": 0.4057, + "step": 2495 + }, + { + "epoch": 0.4437333333333333, + "grad_norm": 0.3068099021911621, + "learning_rate": 2.9409939976680313e-05, + "loss": 0.485, + "step": 2496 + }, + { + "epoch": 0.4439111111111111, + "grad_norm": 0.29018837213516235, + "learning_rate": 2.939619560369813e-05, + "loss": 0.3526, + "step": 2497 + }, + { + "epoch": 0.44408888888888887, + "grad_norm": 0.39035913348197937, + "learning_rate": 2.938244985941844e-05, + "loss": 0.3793, + "step": 2498 + }, + { + "epoch": 0.44426666666666664, + "grad_norm": 0.4008481502532959, + "learning_rate": 2.9368702748128912e-05, + "loss": 0.4409, + "step": 2499 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.5846676230430603, + "learning_rate": 2.935495427411768e-05, + "loss": 0.457, + "step": 2500 + }, + { + "epoch": 0.4446222222222222, + "grad_norm": 0.2248874008655548, + "learning_rate": 2.9341204441673266e-05, + "loss": 0.4672, + "step": 2501 + }, + { + "epoch": 0.4448, + "grad_norm": 0.25796782970428467, + "learning_rate": 2.9327453255084638e-05, + "loss": 0.4897, + "step": 2502 + }, + { + "epoch": 0.4449777777777778, + "grad_norm": 0.27782344818115234, + "learning_rate": 2.9313700718641167e-05, + "loss": 0.4877, + "step": 2503 + }, + { + "epoch": 0.4451555555555556, + "grad_norm": 0.25185859203338623, + "learning_rate": 2.9299946836632673e-05, + "loss": 0.4794, + "step": 2504 + }, + { + "epoch": 0.44533333333333336, + "grad_norm": 0.2849651575088501, + "learning_rate": 2.9286191613349374e-05, + "loss": 0.5172, + "step": 2505 + }, + { + "epoch": 0.44551111111111114, + "grad_norm": 0.3525363802909851, + "learning_rate": 2.9272435053081922e-05, + "loss": 0.3671, + "step": 2506 + }, + { + "epoch": 0.4456888888888889, + "grad_norm": 0.221153125166893, + "learning_rate": 2.9258677160121352e-05, + "loss": 0.4143, + "step": 2507 + }, + { + "epoch": 0.4458666666666667, + "grad_norm": 0.286569207906723, + "learning_rate": 2.9244917938759163e-05, + "loss": 0.5407, + "step": 2508 + }, + { + "epoch": 0.44604444444444447, + "grad_norm": 0.3258972764015198, + "learning_rate": 2.9231157393287234e-05, + "loss": 0.5719, + "step": 2509 + }, + { + "epoch": 0.44622222222222224, + "grad_norm": 0.2869493067264557, + "learning_rate": 2.9217395527997875e-05, + "loss": 0.4338, + "step": 2510 + }, + { + "epoch": 0.4464, + "grad_norm": 0.26944276690483093, + "learning_rate": 2.920363234718379e-05, + "loss": 0.4747, + "step": 2511 + }, + { + "epoch": 0.4465777777777778, + "grad_norm": 0.22366926074028015, + "learning_rate": 2.9189867855138103e-05, + "loss": 0.4696, + "step": 2512 + }, + { + "epoch": 0.4467555555555556, + "grad_norm": 0.2834321856498718, + "learning_rate": 2.9176102056154363e-05, + "loss": 0.5587, + "step": 2513 + }, + { + "epoch": 0.44693333333333335, + "grad_norm": 0.2289460152387619, + "learning_rate": 2.9162334954526493e-05, + "loss": 0.4326, + "step": 2514 + }, + { + "epoch": 0.4471111111111111, + "grad_norm": 0.2968672513961792, + "learning_rate": 2.9148566554548857e-05, + "loss": 0.4374, + "step": 2515 + }, + { + "epoch": 0.4472888888888889, + "grad_norm": 0.3145713210105896, + "learning_rate": 2.9134796860516194e-05, + "loss": 0.5072, + "step": 2516 + }, + { + "epoch": 0.4474666666666667, + "grad_norm": 0.23477409780025482, + "learning_rate": 2.9121025876723674e-05, + "loss": 0.3865, + "step": 2517 + }, + { + "epoch": 0.44764444444444446, + "grad_norm": 0.21038885414600372, + "learning_rate": 2.9107253607466832e-05, + "loss": 0.3829, + "step": 2518 + }, + { + "epoch": 0.44782222222222223, + "grad_norm": 0.2696356773376465, + "learning_rate": 2.9093480057041662e-05, + "loss": 0.4655, + "step": 2519 + }, + { + "epoch": 0.448, + "grad_norm": 0.23881399631500244, + "learning_rate": 2.9079705229744493e-05, + "loss": 0.44, + "step": 2520 + }, + { + "epoch": 0.4481777777777778, + "grad_norm": 0.2687569856643677, + "learning_rate": 2.9065929129872094e-05, + "loss": 0.4759, + "step": 2521 + }, + { + "epoch": 0.44835555555555556, + "grad_norm": 0.23170039057731628, + "learning_rate": 2.9052151761721617e-05, + "loss": 0.4789, + "step": 2522 + }, + { + "epoch": 0.44853333333333334, + "grad_norm": 0.23779144883155823, + "learning_rate": 2.9038373129590622e-05, + "loss": 0.4647, + "step": 2523 + }, + { + "epoch": 0.4487111111111111, + "grad_norm": 0.32157498598098755, + "learning_rate": 2.9024593237777037e-05, + "loss": 0.4731, + "step": 2524 + }, + { + "epoch": 0.4488888888888889, + "grad_norm": 0.241258442401886, + "learning_rate": 2.901081209057921e-05, + "loss": 0.3517, + "step": 2525 + }, + { + "epoch": 0.44906666666666667, + "grad_norm": 0.28465917706489563, + "learning_rate": 2.8997029692295874e-05, + "loss": 0.4609, + "step": 2526 + }, + { + "epoch": 0.44924444444444445, + "grad_norm": 0.32059818506240845, + "learning_rate": 2.8983246047226135e-05, + "loss": 0.4727, + "step": 2527 + }, + { + "epoch": 0.4494222222222222, + "grad_norm": 0.2626987397670746, + "learning_rate": 2.8969461159669513e-05, + "loss": 0.534, + "step": 2528 + }, + { + "epoch": 0.4496, + "grad_norm": 0.30564212799072266, + "learning_rate": 2.8955675033925895e-05, + "loss": 0.6557, + "step": 2529 + }, + { + "epoch": 0.4497777777777778, + "grad_norm": 0.25887104868888855, + "learning_rate": 2.894188767429557e-05, + "loss": 0.4757, + "step": 2530 + }, + { + "epoch": 0.44995555555555555, + "grad_norm": 0.24244801700115204, + "learning_rate": 2.8928099085079197e-05, + "loss": 0.3608, + "step": 2531 + }, + { + "epoch": 0.45013333333333333, + "grad_norm": 0.2565864324569702, + "learning_rate": 2.8914309270577834e-05, + "loss": 0.3836, + "step": 2532 + }, + { + "epoch": 0.4503111111111111, + "grad_norm": 0.27997133135795593, + "learning_rate": 2.8900518235092905e-05, + "loss": 0.4361, + "step": 2533 + }, + { + "epoch": 0.4504888888888889, + "grad_norm": 0.23155897855758667, + "learning_rate": 2.8886725982926232e-05, + "loss": 0.4614, + "step": 2534 + }, + { + "epoch": 0.45066666666666666, + "grad_norm": 0.2875325083732605, + "learning_rate": 2.8872932518379997e-05, + "loss": 0.3786, + "step": 2535 + }, + { + "epoch": 0.45084444444444444, + "grad_norm": 0.2684052586555481, + "learning_rate": 2.8859137845756784e-05, + "loss": 0.5278, + "step": 2536 + }, + { + "epoch": 0.4510222222222222, + "grad_norm": 0.32185545563697815, + "learning_rate": 2.884534196935953e-05, + "loss": 0.6911, + "step": 2537 + }, + { + "epoch": 0.4512, + "grad_norm": 0.22377189993858337, + "learning_rate": 2.8831544893491563e-05, + "loss": 0.4316, + "step": 2538 + }, + { + "epoch": 0.45137777777777777, + "grad_norm": 0.352762907743454, + "learning_rate": 2.881774662245658e-05, + "loss": 0.3826, + "step": 2539 + }, + { + "epoch": 0.45155555555555554, + "grad_norm": 0.24529008567333221, + "learning_rate": 2.8803947160558652e-05, + "loss": 0.4148, + "step": 2540 + }, + { + "epoch": 0.4517333333333333, + "grad_norm": 0.2901937663555145, + "learning_rate": 2.879014651210223e-05, + "loss": 0.3936, + "step": 2541 + }, + { + "epoch": 0.4519111111111111, + "grad_norm": 0.39304324984550476, + "learning_rate": 2.8776344681392105e-05, + "loss": 0.3976, + "step": 2542 + }, + { + "epoch": 0.4520888888888889, + "grad_norm": 0.27212122082710266, + "learning_rate": 2.8762541672733472e-05, + "loss": 0.2798, + "step": 2543 + }, + { + "epoch": 0.45226666666666665, + "grad_norm": 0.2753283977508545, + "learning_rate": 2.874873749043187e-05, + "loss": 0.4375, + "step": 2544 + }, + { + "epoch": 0.4524444444444444, + "grad_norm": 0.33852502703666687, + "learning_rate": 2.8734932138793225e-05, + "loss": 0.4343, + "step": 2545 + }, + { + "epoch": 0.4526222222222222, + "grad_norm": 0.38610076904296875, + "learning_rate": 2.8721125622123806e-05, + "loss": 0.5252, + "step": 2546 + }, + { + "epoch": 0.4528, + "grad_norm": 0.33478352427482605, + "learning_rate": 2.8707317944730268e-05, + "loss": 0.3944, + "step": 2547 + }, + { + "epoch": 0.45297777777777776, + "grad_norm": 0.5933079719543457, + "learning_rate": 2.8693509110919598e-05, + "loss": 0.5407, + "step": 2548 + }, + { + "epoch": 0.45315555555555553, + "grad_norm": 0.44740936160087585, + "learning_rate": 2.8679699124999166e-05, + "loss": 0.5082, + "step": 2549 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 0.5158184170722961, + "learning_rate": 2.866588799127671e-05, + "loss": 0.4892, + "step": 2550 + }, + { + "epoch": 0.4535111111111111, + "grad_norm": 0.2637031078338623, + "learning_rate": 2.8652075714060295e-05, + "loss": 0.5185, + "step": 2551 + }, + { + "epoch": 0.45368888888888886, + "grad_norm": 0.2709507942199707, + "learning_rate": 2.8638262297658368e-05, + "loss": 0.6141, + "step": 2552 + }, + { + "epoch": 0.45386666666666664, + "grad_norm": 0.22555111348628998, + "learning_rate": 2.8624447746379722e-05, + "loss": 0.4069, + "step": 2553 + }, + { + "epoch": 0.4540444444444444, + "grad_norm": 0.2944674491882324, + "learning_rate": 2.8610632064533517e-05, + "loss": 0.4707, + "step": 2554 + }, + { + "epoch": 0.45422222222222225, + "grad_norm": 0.2797921895980835, + "learning_rate": 2.859681525642923e-05, + "loss": 0.6232, + "step": 2555 + }, + { + "epoch": 0.4544, + "grad_norm": 0.2537840008735657, + "learning_rate": 2.858299732637674e-05, + "loss": 0.384, + "step": 2556 + }, + { + "epoch": 0.4545777777777778, + "grad_norm": 0.26099979877471924, + "learning_rate": 2.856917827868622e-05, + "loss": 0.4493, + "step": 2557 + }, + { + "epoch": 0.4547555555555556, + "grad_norm": 0.23144148290157318, + "learning_rate": 2.855535811766825e-05, + "loss": 0.4361, + "step": 2558 + }, + { + "epoch": 0.45493333333333336, + "grad_norm": 0.2853239178657532, + "learning_rate": 2.8541536847633717e-05, + "loss": 0.5272, + "step": 2559 + }, + { + "epoch": 0.45511111111111113, + "grad_norm": 0.24015262722969055, + "learning_rate": 2.8527714472893862e-05, + "loss": 0.4416, + "step": 2560 + }, + { + "epoch": 0.4552888888888889, + "grad_norm": 0.2875637114048004, + "learning_rate": 2.8513890997760272e-05, + "loss": 0.5063, + "step": 2561 + }, + { + "epoch": 0.4554666666666667, + "grad_norm": 0.20942963659763336, + "learning_rate": 2.8500066426544896e-05, + "loss": 0.3619, + "step": 2562 + }, + { + "epoch": 0.45564444444444446, + "grad_norm": 0.24299213290214539, + "learning_rate": 2.8486240763559986e-05, + "loss": 0.5294, + "step": 2563 + }, + { + "epoch": 0.45582222222222224, + "grad_norm": 0.22225119173526764, + "learning_rate": 2.847241401311817e-05, + "loss": 0.3673, + "step": 2564 + }, + { + "epoch": 0.456, + "grad_norm": 0.30358871817588806, + "learning_rate": 2.845858617953239e-05, + "loss": 0.6203, + "step": 2565 + }, + { + "epoch": 0.4561777777777778, + "grad_norm": 0.27116143703460693, + "learning_rate": 2.844475726711595e-05, + "loss": 0.5473, + "step": 2566 + }, + { + "epoch": 0.45635555555555557, + "grad_norm": 0.2246532291173935, + "learning_rate": 2.843092728018248e-05, + "loss": 0.4543, + "step": 2567 + }, + { + "epoch": 0.45653333333333335, + "grad_norm": 0.23672115802764893, + "learning_rate": 2.8417096223045925e-05, + "loss": 0.3729, + "step": 2568 + }, + { + "epoch": 0.4567111111111111, + "grad_norm": 0.3492291569709778, + "learning_rate": 2.840326410002061e-05, + "loss": 0.5773, + "step": 2569 + }, + { + "epoch": 0.4568888888888889, + "grad_norm": 0.2673388123512268, + "learning_rate": 2.8389430915421132e-05, + "loss": 0.6236, + "step": 2570 + }, + { + "epoch": 0.4570666666666667, + "grad_norm": 0.2656526565551758, + "learning_rate": 2.8375596673562482e-05, + "loss": 0.465, + "step": 2571 + }, + { + "epoch": 0.45724444444444445, + "grad_norm": 0.33117249608039856, + "learning_rate": 2.8361761378759934e-05, + "loss": 0.776, + "step": 2572 + }, + { + "epoch": 0.45742222222222223, + "grad_norm": 0.272762656211853, + "learning_rate": 2.834792503532911e-05, + "loss": 0.5591, + "step": 2573 + }, + { + "epoch": 0.4576, + "grad_norm": 0.2594073414802551, + "learning_rate": 2.833408764758595e-05, + "loss": 0.4744, + "step": 2574 + }, + { + "epoch": 0.4577777777777778, + "grad_norm": 0.29209282994270325, + "learning_rate": 2.832024921984674e-05, + "loss": 0.4565, + "step": 2575 + }, + { + "epoch": 0.45795555555555556, + "grad_norm": 0.26159170269966125, + "learning_rate": 2.8306409756428064e-05, + "loss": 0.488, + "step": 2576 + }, + { + "epoch": 0.45813333333333334, + "grad_norm": 0.25381600856781006, + "learning_rate": 2.829256926164685e-05, + "loss": 0.4966, + "step": 2577 + }, + { + "epoch": 0.4583111111111111, + "grad_norm": 0.24267758429050446, + "learning_rate": 2.8278727739820333e-05, + "loss": 0.5224, + "step": 2578 + }, + { + "epoch": 0.4584888888888889, + "grad_norm": 0.32321590185165405, + "learning_rate": 2.8264885195266065e-05, + "loss": 0.6228, + "step": 2579 + }, + { + "epoch": 0.45866666666666667, + "grad_norm": 0.24928446114063263, + "learning_rate": 2.8251041632301957e-05, + "loss": 0.5105, + "step": 2580 + }, + { + "epoch": 0.45884444444444444, + "grad_norm": 0.31957152485847473, + "learning_rate": 2.8237197055246172e-05, + "loss": 0.4422, + "step": 2581 + }, + { + "epoch": 0.4590222222222222, + "grad_norm": 0.2511320114135742, + "learning_rate": 2.8223351468417254e-05, + "loss": 0.3378, + "step": 2582 + }, + { + "epoch": 0.4592, + "grad_norm": 0.2694665789604187, + "learning_rate": 2.8209504876134007e-05, + "loss": 0.4609, + "step": 2583 + }, + { + "epoch": 0.4593777777777778, + "grad_norm": 0.2905427813529968, + "learning_rate": 2.8195657282715594e-05, + "loss": 0.566, + "step": 2584 + }, + { + "epoch": 0.45955555555555555, + "grad_norm": 0.23790432512760162, + "learning_rate": 2.8181808692481453e-05, + "loss": 0.4223, + "step": 2585 + }, + { + "epoch": 0.4597333333333333, + "grad_norm": 0.25193968415260315, + "learning_rate": 2.816795910975137e-05, + "loss": 0.5893, + "step": 2586 + }, + { + "epoch": 0.4599111111111111, + "grad_norm": 0.2869940400123596, + "learning_rate": 2.8154108538845404e-05, + "loss": 0.5539, + "step": 2587 + }, + { + "epoch": 0.4600888888888889, + "grad_norm": 0.4210547208786011, + "learning_rate": 2.8140256984083947e-05, + "loss": 0.4714, + "step": 2588 + }, + { + "epoch": 0.46026666666666666, + "grad_norm": 0.24389785528182983, + "learning_rate": 2.8126404449787685e-05, + "loss": 0.3989, + "step": 2589 + }, + { + "epoch": 0.46044444444444443, + "grad_norm": 0.31674498319625854, + "learning_rate": 2.8112550940277616e-05, + "loss": 0.4046, + "step": 2590 + }, + { + "epoch": 0.4606222222222222, + "grad_norm": 0.2901545464992523, + "learning_rate": 2.8098696459875046e-05, + "loss": 0.4267, + "step": 2591 + }, + { + "epoch": 0.4608, + "grad_norm": 0.3398211896419525, + "learning_rate": 2.8084841012901574e-05, + "loss": 0.442, + "step": 2592 + }, + { + "epoch": 0.46097777777777776, + "grad_norm": 0.2972176969051361, + "learning_rate": 2.8070984603679107e-05, + "loss": 0.379, + "step": 2593 + }, + { + "epoch": 0.46115555555555554, + "grad_norm": 0.3861132562160492, + "learning_rate": 2.8057127236529844e-05, + "loss": 0.4017, + "step": 2594 + }, + { + "epoch": 0.4613333333333333, + "grad_norm": 0.3197527825832367, + "learning_rate": 2.80432689157763e-05, + "loss": 0.3625, + "step": 2595 + }, + { + "epoch": 0.4615111111111111, + "grad_norm": 0.296059787273407, + "learning_rate": 2.8029409645741267e-05, + "loss": 0.4688, + "step": 2596 + }, + { + "epoch": 0.46168888888888887, + "grad_norm": 0.47241833806037903, + "learning_rate": 2.8015549430747852e-05, + "loss": 0.405, + "step": 2597 + }, + { + "epoch": 0.46186666666666665, + "grad_norm": 0.34685230255126953, + "learning_rate": 2.8001688275119432e-05, + "loss": 0.4424, + "step": 2598 + }, + { + "epoch": 0.4620444444444444, + "grad_norm": 0.41835668683052063, + "learning_rate": 2.7987826183179712e-05, + "loss": 0.467, + "step": 2599 + }, + { + "epoch": 0.4622222222222222, + "grad_norm": 0.3845471441745758, + "learning_rate": 2.797396315925265e-05, + "loss": 0.4805, + "step": 2600 + }, + { + "epoch": 0.4624, + "grad_norm": 0.2852180600166321, + "learning_rate": 2.7960099207662532e-05, + "loss": 0.3883, + "step": 2601 + }, + { + "epoch": 0.46257777777777775, + "grad_norm": 0.2547115087509155, + "learning_rate": 2.79462343327339e-05, + "loss": 0.3908, + "step": 2602 + }, + { + "epoch": 0.46275555555555553, + "grad_norm": 0.29451602697372437, + "learning_rate": 2.793236853879161e-05, + "loss": 0.5675, + "step": 2603 + }, + { + "epoch": 0.4629333333333333, + "grad_norm": 0.19915078580379486, + "learning_rate": 2.79185018301608e-05, + "loss": 0.4174, + "step": 2604 + }, + { + "epoch": 0.4631111111111111, + "grad_norm": 0.21351076662540436, + "learning_rate": 2.7904634211166876e-05, + "loss": 0.4366, + "step": 2605 + }, + { + "epoch": 0.46328888888888886, + "grad_norm": 0.2622143626213074, + "learning_rate": 2.7890765686135544e-05, + "loss": 0.4875, + "step": 2606 + }, + { + "epoch": 0.4634666666666667, + "grad_norm": 0.20998716354370117, + "learning_rate": 2.7876896259392788e-05, + "loss": 0.3867, + "step": 2607 + }, + { + "epoch": 0.46364444444444447, + "grad_norm": 0.2636096477508545, + "learning_rate": 2.7863025935264875e-05, + "loss": 0.5069, + "step": 2608 + }, + { + "epoch": 0.46382222222222225, + "grad_norm": 0.2735377848148346, + "learning_rate": 2.7849154718078346e-05, + "loss": 0.4107, + "step": 2609 + }, + { + "epoch": 0.464, + "grad_norm": 0.2855624854564667, + "learning_rate": 2.783528261216004e-05, + "loss": 0.5757, + "step": 2610 + }, + { + "epoch": 0.4641777777777778, + "grad_norm": 0.28109976649284363, + "learning_rate": 2.782140962183704e-05, + "loss": 0.6672, + "step": 2611 + }, + { + "epoch": 0.4643555555555556, + "grad_norm": 0.2662220895290375, + "learning_rate": 2.7807535751436738e-05, + "loss": 0.5209, + "step": 2612 + }, + { + "epoch": 0.46453333333333335, + "grad_norm": 0.3121795952320099, + "learning_rate": 2.7793661005286774e-05, + "loss": 0.5222, + "step": 2613 + }, + { + "epoch": 0.46471111111111113, + "grad_norm": 0.2676851451396942, + "learning_rate": 2.7779785387715078e-05, + "loss": 0.5182, + "step": 2614 + }, + { + "epoch": 0.4648888888888889, + "grad_norm": 0.28443947434425354, + "learning_rate": 2.7765908903049848e-05, + "loss": 0.6308, + "step": 2615 + }, + { + "epoch": 0.4650666666666667, + "grad_norm": 0.2885751724243164, + "learning_rate": 2.7752031555619555e-05, + "loss": 0.5988, + "step": 2616 + }, + { + "epoch": 0.46524444444444446, + "grad_norm": 0.2557695209980011, + "learning_rate": 2.773815334975292e-05, + "loss": 0.5732, + "step": 2617 + }, + { + "epoch": 0.46542222222222224, + "grad_norm": 0.2911243438720703, + "learning_rate": 2.7724274289778974e-05, + "loss": 0.4265, + "step": 2618 + }, + { + "epoch": 0.4656, + "grad_norm": 0.2733997106552124, + "learning_rate": 2.7710394380026954e-05, + "loss": 0.5486, + "step": 2619 + }, + { + "epoch": 0.4657777777777778, + "grad_norm": 0.28769850730895996, + "learning_rate": 2.769651362482642e-05, + "loss": 0.5711, + "step": 2620 + }, + { + "epoch": 0.46595555555555557, + "grad_norm": 0.24765627086162567, + "learning_rate": 2.7682632028507167e-05, + "loss": 0.4066, + "step": 2621 + }, + { + "epoch": 0.46613333333333334, + "grad_norm": 0.2901877462863922, + "learning_rate": 2.766874959539925e-05, + "loss": 0.5594, + "step": 2622 + }, + { + "epoch": 0.4663111111111111, + "grad_norm": 0.23390290141105652, + "learning_rate": 2.7654866329833002e-05, + "loss": 0.4828, + "step": 2623 + }, + { + "epoch": 0.4664888888888889, + "grad_norm": 0.23176878690719604, + "learning_rate": 2.7640982236138992e-05, + "loss": 0.359, + "step": 2624 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.27021723985671997, + "learning_rate": 2.7627097318648076e-05, + "loss": 0.3924, + "step": 2625 + }, + { + "epoch": 0.46684444444444445, + "grad_norm": 0.27413272857666016, + "learning_rate": 2.761321158169134e-05, + "loss": 0.4043, + "step": 2626 + }, + { + "epoch": 0.4670222222222222, + "grad_norm": 0.27315467596054077, + "learning_rate": 2.7599325029600143e-05, + "loss": 0.5203, + "step": 2627 + }, + { + "epoch": 0.4672, + "grad_norm": 0.22873376309871674, + "learning_rate": 2.7585437666706087e-05, + "loss": 0.3736, + "step": 2628 + }, + { + "epoch": 0.4673777777777778, + "grad_norm": 0.23601143062114716, + "learning_rate": 2.7571549497341042e-05, + "loss": 0.3877, + "step": 2629 + }, + { + "epoch": 0.46755555555555556, + "grad_norm": 0.2553922235965729, + "learning_rate": 2.7557660525837108e-05, + "loss": 0.4852, + "step": 2630 + }, + { + "epoch": 0.46773333333333333, + "grad_norm": 0.3287767469882965, + "learning_rate": 2.754377075652666e-05, + "loss": 0.4627, + "step": 2631 + }, + { + "epoch": 0.4679111111111111, + "grad_norm": 0.267870157957077, + "learning_rate": 2.7529880193742297e-05, + "loss": 0.6019, + "step": 2632 + }, + { + "epoch": 0.4680888888888889, + "grad_norm": 0.24530629813671112, + "learning_rate": 2.7515988841816887e-05, + "loss": 0.4602, + "step": 2633 + }, + { + "epoch": 0.46826666666666666, + "grad_norm": 0.26207485795021057, + "learning_rate": 2.7502096705083535e-05, + "loss": 0.4363, + "step": 2634 + }, + { + "epoch": 0.46844444444444444, + "grad_norm": 0.25233450531959534, + "learning_rate": 2.7488203787875577e-05, + "loss": 0.3769, + "step": 2635 + }, + { + "epoch": 0.4686222222222222, + "grad_norm": 0.27759021520614624, + "learning_rate": 2.747431009452663e-05, + "loss": 0.4682, + "step": 2636 + }, + { + "epoch": 0.4688, + "grad_norm": 0.23523619771003723, + "learning_rate": 2.7460415629370508e-05, + "loss": 0.4463, + "step": 2637 + }, + { + "epoch": 0.46897777777777777, + "grad_norm": 0.27616363763809204, + "learning_rate": 2.744652039674129e-05, + "loss": 0.412, + "step": 2638 + }, + { + "epoch": 0.46915555555555555, + "grad_norm": 0.28102391958236694, + "learning_rate": 2.74326244009733e-05, + "loss": 0.4421, + "step": 2639 + }, + { + "epoch": 0.4693333333333333, + "grad_norm": 0.252031534910202, + "learning_rate": 2.7418727646401094e-05, + "loss": 0.3936, + "step": 2640 + }, + { + "epoch": 0.4695111111111111, + "grad_norm": 0.28618207573890686, + "learning_rate": 2.7404830137359444e-05, + "loss": 0.3481, + "step": 2641 + }, + { + "epoch": 0.4696888888888889, + "grad_norm": 0.3900219798088074, + "learning_rate": 2.739093187818339e-05, + "loss": 0.4406, + "step": 2642 + }, + { + "epoch": 0.46986666666666665, + "grad_norm": 0.37281185388565063, + "learning_rate": 2.7377032873208186e-05, + "loss": 0.4285, + "step": 2643 + }, + { + "epoch": 0.47004444444444443, + "grad_norm": 0.33701759576797485, + "learning_rate": 2.7363133126769325e-05, + "loss": 0.5967, + "step": 2644 + }, + { + "epoch": 0.4702222222222222, + "grad_norm": 0.3020801842212677, + "learning_rate": 2.734923264320254e-05, + "loss": 0.4057, + "step": 2645 + }, + { + "epoch": 0.4704, + "grad_norm": 0.40793532133102417, + "learning_rate": 2.733533142684377e-05, + "loss": 0.4302, + "step": 2646 + }, + { + "epoch": 0.47057777777777776, + "grad_norm": 0.3178774118423462, + "learning_rate": 2.73214294820292e-05, + "loss": 0.3733, + "step": 2647 + }, + { + "epoch": 0.47075555555555554, + "grad_norm": 0.2962934672832489, + "learning_rate": 2.730752681309524e-05, + "loss": 0.4286, + "step": 2648 + }, + { + "epoch": 0.4709333333333333, + "grad_norm": 0.3513740599155426, + "learning_rate": 2.7293623424378535e-05, + "loss": 0.3722, + "step": 2649 + }, + { + "epoch": 0.4711111111111111, + "grad_norm": 0.42167922854423523, + "learning_rate": 2.7279719320215924e-05, + "loss": 0.4975, + "step": 2650 + }, + { + "epoch": 0.47128888888888887, + "grad_norm": 0.2567519247531891, + "learning_rate": 2.726581450494451e-05, + "loss": 0.5167, + "step": 2651 + }, + { + "epoch": 0.47146666666666665, + "grad_norm": 0.23638205230236053, + "learning_rate": 2.725190898290158e-05, + "loss": 0.4623, + "step": 2652 + }, + { + "epoch": 0.4716444444444444, + "grad_norm": 0.3296489119529724, + "learning_rate": 2.723800275842468e-05, + "loss": 0.4686, + "step": 2653 + }, + { + "epoch": 0.4718222222222222, + "grad_norm": 0.273482084274292, + "learning_rate": 2.7224095835851525e-05, + "loss": 0.5614, + "step": 2654 + }, + { + "epoch": 0.472, + "grad_norm": 0.27394604682922363, + "learning_rate": 2.721018821952011e-05, + "loss": 0.4797, + "step": 2655 + }, + { + "epoch": 0.47217777777777775, + "grad_norm": 0.2177024483680725, + "learning_rate": 2.7196279913768584e-05, + "loss": 0.4062, + "step": 2656 + }, + { + "epoch": 0.47235555555555553, + "grad_norm": 0.2680479884147644, + "learning_rate": 2.7182370922935353e-05, + "loss": 0.4423, + "step": 2657 + }, + { + "epoch": 0.47253333333333336, + "grad_norm": 0.3028639853000641, + "learning_rate": 2.716846125135903e-05, + "loss": 0.5805, + "step": 2658 + }, + { + "epoch": 0.47271111111111114, + "grad_norm": 0.3344045877456665, + "learning_rate": 2.715455090337842e-05, + "loss": 0.5299, + "step": 2659 + }, + { + "epoch": 0.4728888888888889, + "grad_norm": 0.2679993212223053, + "learning_rate": 2.7140639883332564e-05, + "loss": 0.5427, + "step": 2660 + }, + { + "epoch": 0.4730666666666667, + "grad_norm": 0.2035873383283615, + "learning_rate": 2.7126728195560702e-05, + "loss": 0.3621, + "step": 2661 + }, + { + "epoch": 0.47324444444444447, + "grad_norm": 0.32415854930877686, + "learning_rate": 2.711281584440228e-05, + "loss": 0.3939, + "step": 2662 + }, + { + "epoch": 0.47342222222222224, + "grad_norm": 0.26181235909461975, + "learning_rate": 2.7098902834196943e-05, + "loss": 0.4508, + "step": 2663 + }, + { + "epoch": 0.4736, + "grad_norm": 0.2614816725254059, + "learning_rate": 2.7084989169284568e-05, + "loss": 0.6049, + "step": 2664 + }, + { + "epoch": 0.4737777777777778, + "grad_norm": 0.2784140408039093, + "learning_rate": 2.707107485400521e-05, + "loss": 0.4225, + "step": 2665 + }, + { + "epoch": 0.4739555555555556, + "grad_norm": 0.2713667154312134, + "learning_rate": 2.705715989269914e-05, + "loss": 0.3935, + "step": 2666 + }, + { + "epoch": 0.47413333333333335, + "grad_norm": 0.3248816132545471, + "learning_rate": 2.7043244289706826e-05, + "loss": 0.7012, + "step": 2667 + }, + { + "epoch": 0.47431111111111113, + "grad_norm": 0.2341870367527008, + "learning_rate": 2.702932804936894e-05, + "loss": 0.5207, + "step": 2668 + }, + { + "epoch": 0.4744888888888889, + "grad_norm": 0.28810498118400574, + "learning_rate": 2.7015411176026344e-05, + "loss": 0.4385, + "step": 2669 + }, + { + "epoch": 0.4746666666666667, + "grad_norm": 0.3099035322666168, + "learning_rate": 2.700149367402011e-05, + "loss": 0.4678, + "step": 2670 + }, + { + "epoch": 0.47484444444444446, + "grad_norm": 0.23505716025829315, + "learning_rate": 2.6987575547691497e-05, + "loss": 0.4486, + "step": 2671 + }, + { + "epoch": 0.47502222222222223, + "grad_norm": 0.28962358832359314, + "learning_rate": 2.6973656801381963e-05, + "loss": 0.5416, + "step": 2672 + }, + { + "epoch": 0.4752, + "grad_norm": 0.23237808048725128, + "learning_rate": 2.695973743943315e-05, + "loss": 0.3358, + "step": 2673 + }, + { + "epoch": 0.4753777777777778, + "grad_norm": 0.227394700050354, + "learning_rate": 2.6945817466186912e-05, + "loss": 0.4838, + "step": 2674 + }, + { + "epoch": 0.47555555555555556, + "grad_norm": 0.2561541497707367, + "learning_rate": 2.693189688598528e-05, + "loss": 0.3834, + "step": 2675 + }, + { + "epoch": 0.47573333333333334, + "grad_norm": 0.25477033853530884, + "learning_rate": 2.6917975703170466e-05, + "loss": 0.4684, + "step": 2676 + }, + { + "epoch": 0.4759111111111111, + "grad_norm": 0.277456134557724, + "learning_rate": 2.6904053922084895e-05, + "loss": 0.4787, + "step": 2677 + }, + { + "epoch": 0.4760888888888889, + "grad_norm": 0.26161813735961914, + "learning_rate": 2.6890131547071147e-05, + "loss": 0.4976, + "step": 2678 + }, + { + "epoch": 0.47626666666666667, + "grad_norm": 0.2510134279727936, + "learning_rate": 2.6876208582472012e-05, + "loss": 0.4741, + "step": 2679 + }, + { + "epoch": 0.47644444444444445, + "grad_norm": 0.3506874740123749, + "learning_rate": 2.686228503263045e-05, + "loss": 0.4803, + "step": 2680 + }, + { + "epoch": 0.4766222222222222, + "grad_norm": 0.2110377699136734, + "learning_rate": 2.684836090188963e-05, + "loss": 0.448, + "step": 2681 + }, + { + "epoch": 0.4768, + "grad_norm": 0.22271575033664703, + "learning_rate": 2.6834436194592853e-05, + "loss": 0.4548, + "step": 2682 + }, + { + "epoch": 0.4769777777777778, + "grad_norm": 0.2670367658138275, + "learning_rate": 2.6820510915083648e-05, + "loss": 0.4639, + "step": 2683 + }, + { + "epoch": 0.47715555555555556, + "grad_norm": 0.3348279297351837, + "learning_rate": 2.6806585067705692e-05, + "loss": 0.5265, + "step": 2684 + }, + { + "epoch": 0.47733333333333333, + "grad_norm": 0.24391323328018188, + "learning_rate": 2.6792658656802856e-05, + "loss": 0.4273, + "step": 2685 + }, + { + "epoch": 0.4775111111111111, + "grad_norm": 0.2954247295856476, + "learning_rate": 2.6778731686719178e-05, + "loss": 0.5341, + "step": 2686 + }, + { + "epoch": 0.4776888888888889, + "grad_norm": 0.271161288022995, + "learning_rate": 2.6764804161798867e-05, + "loss": 0.5802, + "step": 2687 + }, + { + "epoch": 0.47786666666666666, + "grad_norm": 0.2661648094654083, + "learning_rate": 2.6750876086386328e-05, + "loss": 0.4937, + "step": 2688 + }, + { + "epoch": 0.47804444444444444, + "grad_norm": 0.29163336753845215, + "learning_rate": 2.6736947464826108e-05, + "loss": 0.4053, + "step": 2689 + }, + { + "epoch": 0.4782222222222222, + "grad_norm": 0.32902923226356506, + "learning_rate": 2.6723018301462937e-05, + "loss": 0.3683, + "step": 2690 + }, + { + "epoch": 0.4784, + "grad_norm": 0.39490464329719543, + "learning_rate": 2.6709088600641717e-05, + "loss": 0.3821, + "step": 2691 + }, + { + "epoch": 0.47857777777777777, + "grad_norm": 0.41516822576522827, + "learning_rate": 2.6695158366707522e-05, + "loss": 0.4142, + "step": 2692 + }, + { + "epoch": 0.47875555555555555, + "grad_norm": 0.45474985241889954, + "learning_rate": 2.6681227604005576e-05, + "loss": 0.4589, + "step": 2693 + }, + { + "epoch": 0.4789333333333333, + "grad_norm": 0.3482872545719147, + "learning_rate": 2.666729631688128e-05, + "loss": 0.4654, + "step": 2694 + }, + { + "epoch": 0.4791111111111111, + "grad_norm": 0.28876376152038574, + "learning_rate": 2.6653364509680188e-05, + "loss": 0.335, + "step": 2695 + }, + { + "epoch": 0.4792888888888889, + "grad_norm": 0.39880484342575073, + "learning_rate": 2.6639432186748043e-05, + "loss": 0.5145, + "step": 2696 + }, + { + "epoch": 0.47946666666666665, + "grad_norm": 0.3217504620552063, + "learning_rate": 2.662549935243071e-05, + "loss": 0.3586, + "step": 2697 + }, + { + "epoch": 0.47964444444444443, + "grad_norm": 0.40799853205680847, + "learning_rate": 2.661156601107424e-05, + "loss": 0.5525, + "step": 2698 + }, + { + "epoch": 0.4798222222222222, + "grad_norm": 0.4684382677078247, + "learning_rate": 2.6597632167024843e-05, + "loss": 0.4075, + "step": 2699 + }, + { + "epoch": 0.48, + "grad_norm": 0.4690878093242645, + "learning_rate": 2.6583697824628868e-05, + "loss": 0.4868, + "step": 2700 + }, + { + "epoch": 0.48017777777777776, + "grad_norm": 0.25890591740608215, + "learning_rate": 2.656976298823284e-05, + "loss": 0.4663, + "step": 2701 + }, + { + "epoch": 0.48035555555555554, + "grad_norm": 0.2905474901199341, + "learning_rate": 2.6555827662183414e-05, + "loss": 0.5808, + "step": 2702 + }, + { + "epoch": 0.4805333333333333, + "grad_norm": 0.2552701532840729, + "learning_rate": 2.6541891850827427e-05, + "loss": 0.461, + "step": 2703 + }, + { + "epoch": 0.4807111111111111, + "grad_norm": 0.2832605540752411, + "learning_rate": 2.6527955558511842e-05, + "loss": 0.4267, + "step": 2704 + }, + { + "epoch": 0.48088888888888887, + "grad_norm": 0.2686953544616699, + "learning_rate": 2.6514018789583784e-05, + "loss": 0.5908, + "step": 2705 + }, + { + "epoch": 0.48106666666666664, + "grad_norm": 0.31386080384254456, + "learning_rate": 2.650008154839052e-05, + "loss": 0.6185, + "step": 2706 + }, + { + "epoch": 0.4812444444444444, + "grad_norm": 0.2457066774368286, + "learning_rate": 2.6486143839279487e-05, + "loss": 0.5126, + "step": 2707 + }, + { + "epoch": 0.4814222222222222, + "grad_norm": 0.29232901334762573, + "learning_rate": 2.647220566659822e-05, + "loss": 0.5445, + "step": 2708 + }, + { + "epoch": 0.4816, + "grad_norm": 0.23501421511173248, + "learning_rate": 2.6458267034694463e-05, + "loss": 0.4427, + "step": 2709 + }, + { + "epoch": 0.4817777777777778, + "grad_norm": 0.25790467858314514, + "learning_rate": 2.6444327947916036e-05, + "loss": 0.5115, + "step": 2710 + }, + { + "epoch": 0.4819555555555556, + "grad_norm": 0.25371503829956055, + "learning_rate": 2.6430388410610955e-05, + "loss": 0.4417, + "step": 2711 + }, + { + "epoch": 0.48213333333333336, + "grad_norm": 0.19024552404880524, + "learning_rate": 2.641644842712735e-05, + "loss": 0.3899, + "step": 2712 + }, + { + "epoch": 0.48231111111111113, + "grad_norm": 0.2697563171386719, + "learning_rate": 2.6402508001813496e-05, + "loss": 0.5841, + "step": 2713 + }, + { + "epoch": 0.4824888888888889, + "grad_norm": 0.2678986191749573, + "learning_rate": 2.63885671390178e-05, + "loss": 0.6338, + "step": 2714 + }, + { + "epoch": 0.4826666666666667, + "grad_norm": 0.2988133728504181, + "learning_rate": 2.637462584308881e-05, + "loss": 0.7539, + "step": 2715 + }, + { + "epoch": 0.48284444444444446, + "grad_norm": 0.28690090775489807, + "learning_rate": 2.636068411837523e-05, + "loss": 0.4311, + "step": 2716 + }, + { + "epoch": 0.48302222222222224, + "grad_norm": 0.25773856043815613, + "learning_rate": 2.634674196922585e-05, + "loss": 0.4205, + "step": 2717 + }, + { + "epoch": 0.4832, + "grad_norm": 0.2889302968978882, + "learning_rate": 2.633279939998964e-05, + "loss": 0.5915, + "step": 2718 + }, + { + "epoch": 0.4833777777777778, + "grad_norm": 0.21328595280647278, + "learning_rate": 2.6318856415015664e-05, + "loss": 0.4696, + "step": 2719 + }, + { + "epoch": 0.48355555555555557, + "grad_norm": 0.25663915276527405, + "learning_rate": 2.6304913018653144e-05, + "loss": 0.4324, + "step": 2720 + }, + { + "epoch": 0.48373333333333335, + "grad_norm": 0.25406956672668457, + "learning_rate": 2.6290969215251416e-05, + "loss": 0.4725, + "step": 2721 + }, + { + "epoch": 0.4839111111111111, + "grad_norm": 0.24036285281181335, + "learning_rate": 2.627702500915995e-05, + "loss": 0.5367, + "step": 2722 + }, + { + "epoch": 0.4840888888888889, + "grad_norm": 0.2485695332288742, + "learning_rate": 2.6263080404728325e-05, + "loss": 0.4526, + "step": 2723 + }, + { + "epoch": 0.4842666666666667, + "grad_norm": 0.2908492386341095, + "learning_rate": 2.6249135406306273e-05, + "loss": 0.5476, + "step": 2724 + }, + { + "epoch": 0.48444444444444446, + "grad_norm": 0.23845696449279785, + "learning_rate": 2.623519001824362e-05, + "loss": 0.5002, + "step": 2725 + }, + { + "epoch": 0.48462222222222223, + "grad_norm": 0.24599142372608185, + "learning_rate": 2.6221244244890336e-05, + "loss": 0.6047, + "step": 2726 + }, + { + "epoch": 0.4848, + "grad_norm": 0.2732583284378052, + "learning_rate": 2.6207298090596493e-05, + "loss": 0.5502, + "step": 2727 + }, + { + "epoch": 0.4849777777777778, + "grad_norm": 0.27662861347198486, + "learning_rate": 2.6193351559712292e-05, + "loss": 0.5494, + "step": 2728 + }, + { + "epoch": 0.48515555555555556, + "grad_norm": 0.2510906159877777, + "learning_rate": 2.6179404656588058e-05, + "loss": 0.4593, + "step": 2729 + }, + { + "epoch": 0.48533333333333334, + "grad_norm": 0.20205947756767273, + "learning_rate": 2.616545738557421e-05, + "loss": 0.4826, + "step": 2730 + }, + { + "epoch": 0.4855111111111111, + "grad_norm": 0.2853657603263855, + "learning_rate": 2.615150975102131e-05, + "loss": 0.4059, + "step": 2731 + }, + { + "epoch": 0.4856888888888889, + "grad_norm": 0.32150328159332275, + "learning_rate": 2.6137561757280003e-05, + "loss": 0.3672, + "step": 2732 + }, + { + "epoch": 0.48586666666666667, + "grad_norm": 0.29451146721839905, + "learning_rate": 2.6123613408701082e-05, + "loss": 0.5277, + "step": 2733 + }, + { + "epoch": 0.48604444444444445, + "grad_norm": 0.24611295759677887, + "learning_rate": 2.610966470963541e-05, + "loss": 0.3928, + "step": 2734 + }, + { + "epoch": 0.4862222222222222, + "grad_norm": 0.2848418354988098, + "learning_rate": 2.6095715664433995e-05, + "loss": 0.3376, + "step": 2735 + }, + { + "epoch": 0.4864, + "grad_norm": 0.2356472611427307, + "learning_rate": 2.6081766277447927e-05, + "loss": 0.3758, + "step": 2736 + }, + { + "epoch": 0.4865777777777778, + "grad_norm": 0.21882623434066772, + "learning_rate": 2.606781655302843e-05, + "loss": 0.4407, + "step": 2737 + }, + { + "epoch": 0.48675555555555555, + "grad_norm": 0.26945775747299194, + "learning_rate": 2.605386649552679e-05, + "loss": 0.3131, + "step": 2738 + }, + { + "epoch": 0.48693333333333333, + "grad_norm": 0.28269338607788086, + "learning_rate": 2.603991610929445e-05, + "loss": 0.5191, + "step": 2739 + }, + { + "epoch": 0.4871111111111111, + "grad_norm": 0.3174385130405426, + "learning_rate": 2.6025965398682916e-05, + "loss": 0.368, + "step": 2740 + }, + { + "epoch": 0.4872888888888889, + "grad_norm": 0.28310227394104004, + "learning_rate": 2.6012014368043814e-05, + "loss": 0.3847, + "step": 2741 + }, + { + "epoch": 0.48746666666666666, + "grad_norm": 0.27895382046699524, + "learning_rate": 2.5998063021728865e-05, + "loss": 0.3615, + "step": 2742 + }, + { + "epoch": 0.48764444444444444, + "grad_norm": 0.34286290407180786, + "learning_rate": 2.5984111364089876e-05, + "loss": 0.3879, + "step": 2743 + }, + { + "epoch": 0.4878222222222222, + "grad_norm": 0.3528904616832733, + "learning_rate": 2.597015939947878e-05, + "loss": 0.389, + "step": 2744 + }, + { + "epoch": 0.488, + "grad_norm": 0.308472216129303, + "learning_rate": 2.595620713224757e-05, + "loss": 0.3318, + "step": 2745 + }, + { + "epoch": 0.48817777777777777, + "grad_norm": 0.2985777258872986, + "learning_rate": 2.594225456674837e-05, + "loss": 0.4537, + "step": 2746 + }, + { + "epoch": 0.48835555555555554, + "grad_norm": 0.38600680232048035, + "learning_rate": 2.5928301707333365e-05, + "loss": 0.4033, + "step": 2747 + }, + { + "epoch": 0.4885333333333333, + "grad_norm": 0.39502426981925964, + "learning_rate": 2.5914348558354857e-05, + "loss": 0.4138, + "step": 2748 + }, + { + "epoch": 0.4887111111111111, + "grad_norm": 0.3881101608276367, + "learning_rate": 2.5900395124165218e-05, + "loss": 0.4174, + "step": 2749 + }, + { + "epoch": 0.4888888888888889, + "grad_norm": 0.5198167562484741, + "learning_rate": 2.5886441409116923e-05, + "loss": 0.4723, + "step": 2750 + }, + { + "epoch": 0.48906666666666665, + "grad_norm": 0.3125801980495453, + "learning_rate": 2.587248741756253e-05, + "loss": 0.4911, + "step": 2751 + }, + { + "epoch": 0.4892444444444444, + "grad_norm": 0.31034207344055176, + "learning_rate": 2.5858533153854675e-05, + "loss": 0.4005, + "step": 2752 + }, + { + "epoch": 0.4894222222222222, + "grad_norm": 0.2857619524002075, + "learning_rate": 2.58445786223461e-05, + "loss": 0.4552, + "step": 2753 + }, + { + "epoch": 0.4896, + "grad_norm": 0.2831451892852783, + "learning_rate": 2.5830623827389612e-05, + "loss": 0.7091, + "step": 2754 + }, + { + "epoch": 0.48977777777777776, + "grad_norm": 0.335252970457077, + "learning_rate": 2.5816668773338098e-05, + "loss": 0.6693, + "step": 2755 + }, + { + "epoch": 0.48995555555555553, + "grad_norm": 0.2518710792064667, + "learning_rate": 2.5802713464544542e-05, + "loss": 0.3941, + "step": 2756 + }, + { + "epoch": 0.4901333333333333, + "grad_norm": 0.29140859842300415, + "learning_rate": 2.5788757905362e-05, + "loss": 0.5582, + "step": 2757 + }, + { + "epoch": 0.4903111111111111, + "grad_norm": 0.2581357955932617, + "learning_rate": 2.5774802100143592e-05, + "loss": 0.4602, + "step": 2758 + }, + { + "epoch": 0.49048888888888886, + "grad_norm": 0.28110405802726746, + "learning_rate": 2.5760846053242544e-05, + "loss": 0.4486, + "step": 2759 + }, + { + "epoch": 0.49066666666666664, + "grad_norm": 0.27292370796203613, + "learning_rate": 2.5746889769012128e-05, + "loss": 0.4101, + "step": 2760 + }, + { + "epoch": 0.49084444444444447, + "grad_norm": 0.28042080998420715, + "learning_rate": 2.5732933251805713e-05, + "loss": 0.4549, + "step": 2761 + }, + { + "epoch": 0.49102222222222225, + "grad_norm": 0.21104884147644043, + "learning_rate": 2.5718976505976717e-05, + "loss": 0.4094, + "step": 2762 + }, + { + "epoch": 0.4912, + "grad_norm": 0.22419004142284393, + "learning_rate": 2.5705019535878668e-05, + "loss": 0.4267, + "step": 2763 + }, + { + "epoch": 0.4913777777777778, + "grad_norm": 0.26020732522010803, + "learning_rate": 2.569106234586511e-05, + "loss": 0.5738, + "step": 2764 + }, + { + "epoch": 0.4915555555555556, + "grad_norm": 0.231083944439888, + "learning_rate": 2.5677104940289702e-05, + "loss": 0.4983, + "step": 2765 + }, + { + "epoch": 0.49173333333333336, + "grad_norm": 0.25302308797836304, + "learning_rate": 2.566314732350615e-05, + "loss": 0.5177, + "step": 2766 + }, + { + "epoch": 0.49191111111111113, + "grad_norm": 0.3101799190044403, + "learning_rate": 2.5649189499868232e-05, + "loss": 0.454, + "step": 2767 + }, + { + "epoch": 0.4920888888888889, + "grad_norm": 0.24899360537528992, + "learning_rate": 2.5635231473729772e-05, + "loss": 0.353, + "step": 2768 + }, + { + "epoch": 0.4922666666666667, + "grad_norm": 0.2688126564025879, + "learning_rate": 2.562127324944469e-05, + "loss": 0.5033, + "step": 2769 + }, + { + "epoch": 0.49244444444444446, + "grad_norm": 0.2824364900588989, + "learning_rate": 2.560731483136694e-05, + "loss": 0.4348, + "step": 2770 + }, + { + "epoch": 0.49262222222222224, + "grad_norm": 0.2828972339630127, + "learning_rate": 2.559335622385055e-05, + "loss": 0.4945, + "step": 2771 + }, + { + "epoch": 0.4928, + "grad_norm": 0.2719619870185852, + "learning_rate": 2.5579397431249606e-05, + "loss": 0.6965, + "step": 2772 + }, + { + "epoch": 0.4929777777777778, + "grad_norm": 0.2755928635597229, + "learning_rate": 2.5565438457918244e-05, + "loss": 0.3725, + "step": 2773 + }, + { + "epoch": 0.49315555555555557, + "grad_norm": 0.3056289851665497, + "learning_rate": 2.5551479308210668e-05, + "loss": 0.4994, + "step": 2774 + }, + { + "epoch": 0.49333333333333335, + "grad_norm": 0.2517152726650238, + "learning_rate": 2.5537519986481122e-05, + "loss": 0.4054, + "step": 2775 + }, + { + "epoch": 0.4935111111111111, + "grad_norm": 0.22125428915023804, + "learning_rate": 2.5523560497083926e-05, + "loss": 0.3778, + "step": 2776 + }, + { + "epoch": 0.4936888888888889, + "grad_norm": 0.2657763659954071, + "learning_rate": 2.5509600844373427e-05, + "loss": 0.5209, + "step": 2777 + }, + { + "epoch": 0.4938666666666667, + "grad_norm": 0.2610790431499481, + "learning_rate": 2.5495641032704043e-05, + "loss": 0.5083, + "step": 2778 + }, + { + "epoch": 0.49404444444444445, + "grad_norm": 0.28351613879203796, + "learning_rate": 2.5481681066430217e-05, + "loss": 0.5771, + "step": 2779 + }, + { + "epoch": 0.49422222222222223, + "grad_norm": 0.2747285068035126, + "learning_rate": 2.5467720949906483e-05, + "loss": 0.4371, + "step": 2780 + }, + { + "epoch": 0.4944, + "grad_norm": 0.31932732462882996, + "learning_rate": 2.545376068748737e-05, + "loss": 0.5459, + "step": 2781 + }, + { + "epoch": 0.4945777777777778, + "grad_norm": 0.27323007583618164, + "learning_rate": 2.5439800283527494e-05, + "loss": 0.5315, + "step": 2782 + }, + { + "epoch": 0.49475555555555556, + "grad_norm": 0.26356151700019836, + "learning_rate": 2.5425839742381498e-05, + "loss": 0.5614, + "step": 2783 + }, + { + "epoch": 0.49493333333333334, + "grad_norm": 0.23906755447387695, + "learning_rate": 2.5411879068404056e-05, + "loss": 0.5843, + "step": 2784 + }, + { + "epoch": 0.4951111111111111, + "grad_norm": 0.2917463481426239, + "learning_rate": 2.539791826594991e-05, + "loss": 0.4812, + "step": 2785 + }, + { + "epoch": 0.4952888888888889, + "grad_norm": 0.2779139578342438, + "learning_rate": 2.5383957339373825e-05, + "loss": 0.5144, + "step": 2786 + }, + { + "epoch": 0.49546666666666667, + "grad_norm": 0.29621532559394836, + "learning_rate": 2.5369996293030606e-05, + "loss": 0.5312, + "step": 2787 + }, + { + "epoch": 0.49564444444444444, + "grad_norm": 0.2670246958732605, + "learning_rate": 2.5356035131275096e-05, + "loss": 0.5133, + "step": 2788 + }, + { + "epoch": 0.4958222222222222, + "grad_norm": 0.2757388949394226, + "learning_rate": 2.5342073858462185e-05, + "loss": 0.386, + "step": 2789 + }, + { + "epoch": 0.496, + "grad_norm": 0.43750113248825073, + "learning_rate": 2.532811247894677e-05, + "loss": 0.3328, + "step": 2790 + }, + { + "epoch": 0.4961777777777778, + "grad_norm": 0.32524773478507996, + "learning_rate": 2.531415099708382e-05, + "loss": 0.3551, + "step": 2791 + }, + { + "epoch": 0.49635555555555555, + "grad_norm": 0.3105371296405792, + "learning_rate": 2.53001894172283e-05, + "loss": 0.3194, + "step": 2792 + }, + { + "epoch": 0.4965333333333333, + "grad_norm": 0.2663346827030182, + "learning_rate": 2.5286227743735225e-05, + "loss": 0.4075, + "step": 2793 + }, + { + "epoch": 0.4967111111111111, + "grad_norm": 0.29553449153900146, + "learning_rate": 2.527226598095964e-05, + "loss": 0.2968, + "step": 2794 + }, + { + "epoch": 0.4968888888888889, + "grad_norm": 0.3999388515949249, + "learning_rate": 2.5258304133256612e-05, + "loss": 0.4113, + "step": 2795 + }, + { + "epoch": 0.49706666666666666, + "grad_norm": 0.36481136083602905, + "learning_rate": 2.524434220498123e-05, + "loss": 0.3698, + "step": 2796 + }, + { + "epoch": 0.49724444444444443, + "grad_norm": 0.3680489957332611, + "learning_rate": 2.5230380200488613e-05, + "loss": 0.3522, + "step": 2797 + }, + { + "epoch": 0.4974222222222222, + "grad_norm": 0.36200034618377686, + "learning_rate": 2.5216418124133916e-05, + "loss": 0.4509, + "step": 2798 + }, + { + "epoch": 0.4976, + "grad_norm": 0.47373074293136597, + "learning_rate": 2.520245598027229e-05, + "loss": 0.4987, + "step": 2799 + }, + { + "epoch": 0.49777777777777776, + "grad_norm": 0.4866207540035248, + "learning_rate": 2.518849377325893e-05, + "loss": 0.4574, + "step": 2800 + }, + { + "epoch": 0.49795555555555554, + "grad_norm": 0.21719440817832947, + "learning_rate": 2.517453150744904e-05, + "loss": 0.3704, + "step": 2801 + }, + { + "epoch": 0.4981333333333333, + "grad_norm": 0.2632356882095337, + "learning_rate": 2.5160569187197852e-05, + "loss": 0.4963, + "step": 2802 + }, + { + "epoch": 0.4983111111111111, + "grad_norm": 0.2822842299938202, + "learning_rate": 2.5146606816860597e-05, + "loss": 0.462, + "step": 2803 + }, + { + "epoch": 0.49848888888888887, + "grad_norm": 0.35738906264305115, + "learning_rate": 2.513264440079254e-05, + "loss": 0.5469, + "step": 2804 + }, + { + "epoch": 0.49866666666666665, + "grad_norm": 0.27505576610565186, + "learning_rate": 2.5118681943348944e-05, + "loss": 0.4739, + "step": 2805 + }, + { + "epoch": 0.4988444444444444, + "grad_norm": 0.24038927257061005, + "learning_rate": 2.51047194488851e-05, + "loss": 0.4243, + "step": 2806 + }, + { + "epoch": 0.4990222222222222, + "grad_norm": 0.334676057100296, + "learning_rate": 2.509075692175631e-05, + "loss": 0.6306, + "step": 2807 + }, + { + "epoch": 0.4992, + "grad_norm": 0.25052592158317566, + "learning_rate": 2.5076794366317867e-05, + "loss": 0.5211, + "step": 2808 + }, + { + "epoch": 0.49937777777777775, + "grad_norm": 0.27809587121009827, + "learning_rate": 2.5062831786925102e-05, + "loss": 0.464, + "step": 2809 + }, + { + "epoch": 0.49955555555555553, + "grad_norm": 0.3118326961994171, + "learning_rate": 2.5048869187933316e-05, + "loss": 0.3898, + "step": 2810 + }, + { + "epoch": 0.4997333333333333, + "grad_norm": 0.22030174732208252, + "learning_rate": 2.5034906573697864e-05, + "loss": 0.4278, + "step": 2811 + }, + { + "epoch": 0.4999111111111111, + "grad_norm": 0.2705147862434387, + "learning_rate": 2.5020943948574055e-05, + "loss": 0.4229, + "step": 2812 + }, + { + "epoch": 0.5000888888888889, + "grad_norm": 0.22648459672927856, + "learning_rate": 2.500698131691725e-05, + "loss": 0.5006, + "step": 2813 + }, + { + "epoch": 0.5002666666666666, + "grad_norm": 0.30771857500076294, + "learning_rate": 2.499301868308276e-05, + "loss": 0.4431, + "step": 2814 + }, + { + "epoch": 0.5004444444444445, + "grad_norm": 0.2470857799053192, + "learning_rate": 2.4979056051425954e-05, + "loss": 0.5619, + "step": 2815 + }, + { + "epoch": 0.5006222222222222, + "grad_norm": 0.24304385483264923, + "learning_rate": 2.496509342630214e-05, + "loss": 0.3823, + "step": 2816 + }, + { + "epoch": 0.5008, + "grad_norm": 0.2256855070590973, + "learning_rate": 2.4951130812066686e-05, + "loss": 0.451, + "step": 2817 + }, + { + "epoch": 0.5009777777777777, + "grad_norm": 0.34568914771080017, + "learning_rate": 2.4937168213074907e-05, + "loss": 0.5131, + "step": 2818 + }, + { + "epoch": 0.5011555555555556, + "grad_norm": 0.2696516513824463, + "learning_rate": 2.492320563368214e-05, + "loss": 0.5358, + "step": 2819 + }, + { + "epoch": 0.5013333333333333, + "grad_norm": 0.28063344955444336, + "learning_rate": 2.4909243078243696e-05, + "loss": 0.5135, + "step": 2820 + }, + { + "epoch": 0.5015111111111111, + "grad_norm": 0.2337941825389862, + "learning_rate": 2.4895280551114907e-05, + "loss": 0.4608, + "step": 2821 + }, + { + "epoch": 0.5016888888888889, + "grad_norm": 0.245266854763031, + "learning_rate": 2.4881318056651062e-05, + "loss": 0.4405, + "step": 2822 + }, + { + "epoch": 0.5018666666666667, + "grad_norm": 0.29259192943573, + "learning_rate": 2.4867355599207474e-05, + "loss": 0.5014, + "step": 2823 + }, + { + "epoch": 0.5020444444444444, + "grad_norm": 0.2646804749965668, + "learning_rate": 2.4853393183139412e-05, + "loss": 0.6951, + "step": 2824 + }, + { + "epoch": 0.5022222222222222, + "grad_norm": 0.2847024202346802, + "learning_rate": 2.4839430812802157e-05, + "loss": 0.4809, + "step": 2825 + }, + { + "epoch": 0.5024, + "grad_norm": 0.28216397762298584, + "learning_rate": 2.4825468492550964e-05, + "loss": 0.5356, + "step": 2826 + }, + { + "epoch": 0.5025777777777778, + "grad_norm": 0.2545182406902313, + "learning_rate": 2.481150622674108e-05, + "loss": 0.5083, + "step": 2827 + }, + { + "epoch": 0.5027555555555555, + "grad_norm": 0.2076844722032547, + "learning_rate": 2.4797544019727717e-05, + "loss": 0.4163, + "step": 2828 + }, + { + "epoch": 0.5029333333333333, + "grad_norm": 0.2975939214229584, + "learning_rate": 2.4783581875866097e-05, + "loss": 0.4371, + "step": 2829 + }, + { + "epoch": 0.5031111111111111, + "grad_norm": 0.2668185234069824, + "learning_rate": 2.4769619799511393e-05, + "loss": 0.6738, + "step": 2830 + }, + { + "epoch": 0.5032888888888889, + "grad_norm": 0.2934461832046509, + "learning_rate": 2.475565779501878e-05, + "loss": 0.4421, + "step": 2831 + }, + { + "epoch": 0.5034666666666666, + "grad_norm": 0.2635568380355835, + "learning_rate": 2.4741695866743397e-05, + "loss": 0.4407, + "step": 2832 + }, + { + "epoch": 0.5036444444444445, + "grad_norm": 0.21844281256198883, + "learning_rate": 2.472773401904037e-05, + "loss": 0.4569, + "step": 2833 + }, + { + "epoch": 0.5038222222222222, + "grad_norm": 0.2578512132167816, + "learning_rate": 2.471377225626478e-05, + "loss": 0.4234, + "step": 2834 + }, + { + "epoch": 0.504, + "grad_norm": 0.2681830823421478, + "learning_rate": 2.4699810582771713e-05, + "loss": 0.4739, + "step": 2835 + }, + { + "epoch": 0.5041777777777777, + "grad_norm": 0.24891342222690582, + "learning_rate": 2.4685849002916183e-05, + "loss": 0.5571, + "step": 2836 + }, + { + "epoch": 0.5043555555555556, + "grad_norm": 0.29252007603645325, + "learning_rate": 2.4671887521053237e-05, + "loss": 0.4885, + "step": 2837 + }, + { + "epoch": 0.5045333333333333, + "grad_norm": 0.2589651644229889, + "learning_rate": 2.465792614153782e-05, + "loss": 0.3258, + "step": 2838 + }, + { + "epoch": 0.5047111111111111, + "grad_norm": 0.2787851393222809, + "learning_rate": 2.4643964868724914e-05, + "loss": 0.5634, + "step": 2839 + }, + { + "epoch": 0.5048888888888889, + "grad_norm": 0.272087424993515, + "learning_rate": 2.46300037069694e-05, + "loss": 0.4861, + "step": 2840 + }, + { + "epoch": 0.5050666666666667, + "grad_norm": 0.28746315836906433, + "learning_rate": 2.4616042660626177e-05, + "loss": 0.3998, + "step": 2841 + }, + { + "epoch": 0.5052444444444445, + "grad_norm": 0.3004007041454315, + "learning_rate": 2.4602081734050093e-05, + "loss": 0.4267, + "step": 2842 + }, + { + "epoch": 0.5054222222222222, + "grad_norm": 0.26894333958625793, + "learning_rate": 2.4588120931595947e-05, + "loss": 0.33, + "step": 2843 + }, + { + "epoch": 0.5056, + "grad_norm": 0.29740604758262634, + "learning_rate": 2.4574160257618508e-05, + "loss": 0.3997, + "step": 2844 + }, + { + "epoch": 0.5057777777777778, + "grad_norm": 0.2644660174846649, + "learning_rate": 2.4560199716472508e-05, + "loss": 0.4065, + "step": 2845 + }, + { + "epoch": 0.5059555555555556, + "grad_norm": 0.365890234708786, + "learning_rate": 2.4546239312512635e-05, + "loss": 0.3752, + "step": 2846 + }, + { + "epoch": 0.5061333333333333, + "grad_norm": 0.3552113175392151, + "learning_rate": 2.4532279050093523e-05, + "loss": 0.3412, + "step": 2847 + }, + { + "epoch": 0.5063111111111112, + "grad_norm": 0.5882339477539062, + "learning_rate": 2.4518318933569785e-05, + "loss": 0.5261, + "step": 2848 + }, + { + "epoch": 0.5064888888888889, + "grad_norm": 0.3949835002422333, + "learning_rate": 2.4504358967295966e-05, + "loss": 0.3529, + "step": 2849 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 0.486517071723938, + "learning_rate": 2.449039915562658e-05, + "loss": 0.5692, + "step": 2850 + }, + { + "epoch": 0.5068444444444444, + "grad_norm": 0.23317596316337585, + "learning_rate": 2.447643950291608e-05, + "loss": 0.3571, + "step": 2851 + }, + { + "epoch": 0.5070222222222223, + "grad_norm": 0.2751808166503906, + "learning_rate": 2.4462480013518883e-05, + "loss": 0.4211, + "step": 2852 + }, + { + "epoch": 0.5072, + "grad_norm": 0.23703603446483612, + "learning_rate": 2.444852069178933e-05, + "loss": 0.4463, + "step": 2853 + }, + { + "epoch": 0.5073777777777778, + "grad_norm": 0.2453392595052719, + "learning_rate": 2.4434561542081762e-05, + "loss": 0.4234, + "step": 2854 + }, + { + "epoch": 0.5075555555555555, + "grad_norm": 0.2990357279777527, + "learning_rate": 2.4420602568750393e-05, + "loss": 0.6145, + "step": 2855 + }, + { + "epoch": 0.5077333333333334, + "grad_norm": 0.29537495970726013, + "learning_rate": 2.4406643776149458e-05, + "loss": 0.3986, + "step": 2856 + }, + { + "epoch": 0.5079111111111111, + "grad_norm": 0.3087059557437897, + "learning_rate": 2.439268516863306e-05, + "loss": 0.6037, + "step": 2857 + }, + { + "epoch": 0.5080888888888889, + "grad_norm": 0.2548162341117859, + "learning_rate": 2.437872675055532e-05, + "loss": 0.4388, + "step": 2858 + }, + { + "epoch": 0.5082666666666666, + "grad_norm": 0.24017971754074097, + "learning_rate": 2.4364768526270227e-05, + "loss": 0.4394, + "step": 2859 + }, + { + "epoch": 0.5084444444444445, + "grad_norm": 0.31177040934562683, + "learning_rate": 2.4350810500131777e-05, + "loss": 0.4829, + "step": 2860 + }, + { + "epoch": 0.5086222222222222, + "grad_norm": 0.34393543004989624, + "learning_rate": 2.4336852676493847e-05, + "loss": 0.5459, + "step": 2861 + }, + { + "epoch": 0.5088, + "grad_norm": 0.21737191081047058, + "learning_rate": 2.43228950597103e-05, + "loss": 0.3568, + "step": 2862 + }, + { + "epoch": 0.5089777777777778, + "grad_norm": 0.2872813642024994, + "learning_rate": 2.4308937654134893e-05, + "loss": 0.4619, + "step": 2863 + }, + { + "epoch": 0.5091555555555556, + "grad_norm": 0.23554383218288422, + "learning_rate": 2.429498046412134e-05, + "loss": 0.4451, + "step": 2864 + }, + { + "epoch": 0.5093333333333333, + "grad_norm": 0.29822999238967896, + "learning_rate": 2.428102349402328e-05, + "loss": 0.4887, + "step": 2865 + }, + { + "epoch": 0.5095111111111111, + "grad_norm": 0.2923024892807007, + "learning_rate": 2.4267066748194296e-05, + "loss": 0.3649, + "step": 2866 + }, + { + "epoch": 0.5096888888888889, + "grad_norm": 0.2945121228694916, + "learning_rate": 2.4253110230987878e-05, + "loss": 0.5475, + "step": 2867 + }, + { + "epoch": 0.5098666666666667, + "grad_norm": 0.28472045063972473, + "learning_rate": 2.4239153946757468e-05, + "loss": 0.5352, + "step": 2868 + }, + { + "epoch": 0.5100444444444444, + "grad_norm": 0.2496708333492279, + "learning_rate": 2.4225197899856414e-05, + "loss": 0.4444, + "step": 2869 + }, + { + "epoch": 0.5102222222222222, + "grad_norm": 0.2784474194049835, + "learning_rate": 2.4211242094638013e-05, + "loss": 0.5288, + "step": 2870 + }, + { + "epoch": 0.5104, + "grad_norm": 0.20867948234081268, + "learning_rate": 2.4197286535455464e-05, + "loss": 0.4681, + "step": 2871 + }, + { + "epoch": 0.5105777777777778, + "grad_norm": 0.24279505014419556, + "learning_rate": 2.418333122666191e-05, + "loss": 0.4976, + "step": 2872 + }, + { + "epoch": 0.5107555555555555, + "grad_norm": 0.2929462790489197, + "learning_rate": 2.4169376172610397e-05, + "loss": 0.6057, + "step": 2873 + }, + { + "epoch": 0.5109333333333334, + "grad_norm": 0.20310814678668976, + "learning_rate": 2.415542137765391e-05, + "loss": 0.3812, + "step": 2874 + }, + { + "epoch": 0.5111111111111111, + "grad_norm": 0.2890523374080658, + "learning_rate": 2.414146684614533e-05, + "loss": 0.463, + "step": 2875 + }, + { + "epoch": 0.5112888888888889, + "grad_norm": 0.25623899698257446, + "learning_rate": 2.4127512582437485e-05, + "loss": 0.413, + "step": 2876 + }, + { + "epoch": 0.5114666666666666, + "grad_norm": 0.22927865386009216, + "learning_rate": 2.411355859088308e-05, + "loss": 0.4362, + "step": 2877 + }, + { + "epoch": 0.5116444444444445, + "grad_norm": 0.2734526991844177, + "learning_rate": 2.4099604875834795e-05, + "loss": 0.4413, + "step": 2878 + }, + { + "epoch": 0.5118222222222222, + "grad_norm": 0.2789153754711151, + "learning_rate": 2.408565144164515e-05, + "loss": 0.482, + "step": 2879 + }, + { + "epoch": 0.512, + "grad_norm": 0.24057766795158386, + "learning_rate": 2.4071698292666648e-05, + "loss": 0.4339, + "step": 2880 + }, + { + "epoch": 0.5121777777777777, + "grad_norm": 0.23841005563735962, + "learning_rate": 2.4057745433251635e-05, + "loss": 0.5186, + "step": 2881 + }, + { + "epoch": 0.5123555555555556, + "grad_norm": 0.1933268904685974, + "learning_rate": 2.4043792867752444e-05, + "loss": 0.378, + "step": 2882 + }, + { + "epoch": 0.5125333333333333, + "grad_norm": 0.22927036881446838, + "learning_rate": 2.4029840600521227e-05, + "loss": 0.5108, + "step": 2883 + }, + { + "epoch": 0.5127111111111111, + "grad_norm": 0.2248595952987671, + "learning_rate": 2.401588863591013e-05, + "loss": 0.4027, + "step": 2884 + }, + { + "epoch": 0.5128888888888888, + "grad_norm": 0.2231544703245163, + "learning_rate": 2.4001936978271144e-05, + "loss": 0.5681, + "step": 2885 + }, + { + "epoch": 0.5130666666666667, + "grad_norm": 0.25113558769226074, + "learning_rate": 2.398798563195619e-05, + "loss": 0.4216, + "step": 2886 + }, + { + "epoch": 0.5132444444444444, + "grad_norm": 0.2890892028808594, + "learning_rate": 2.3974034601317086e-05, + "loss": 0.5618, + "step": 2887 + }, + { + "epoch": 0.5134222222222222, + "grad_norm": 0.19842980802059174, + "learning_rate": 2.3960083890705557e-05, + "loss": 0.3652, + "step": 2888 + }, + { + "epoch": 0.5136, + "grad_norm": 0.25195077061653137, + "learning_rate": 2.394613350447321e-05, + "loss": 0.483, + "step": 2889 + }, + { + "epoch": 0.5137777777777778, + "grad_norm": 0.2541206479072571, + "learning_rate": 2.3932183446971583e-05, + "loss": 0.5762, + "step": 2890 + }, + { + "epoch": 0.5139555555555556, + "grad_norm": 0.2898159921169281, + "learning_rate": 2.391823372255208e-05, + "loss": 0.4449, + "step": 2891 + }, + { + "epoch": 0.5141333333333333, + "grad_norm": 0.2802295982837677, + "learning_rate": 2.390428433556601e-05, + "loss": 0.3855, + "step": 2892 + }, + { + "epoch": 0.5143111111111112, + "grad_norm": 0.28692886233329773, + "learning_rate": 2.3890335290364595e-05, + "loss": 0.4102, + "step": 2893 + }, + { + "epoch": 0.5144888888888889, + "grad_norm": 0.350414514541626, + "learning_rate": 2.387638659129892e-05, + "loss": 0.4696, + "step": 2894 + }, + { + "epoch": 0.5146666666666667, + "grad_norm": 0.3439677059650421, + "learning_rate": 2.386243824272e-05, + "loss": 0.4213, + "step": 2895 + }, + { + "epoch": 0.5148444444444444, + "grad_norm": 0.3176393508911133, + "learning_rate": 2.384849024897869e-05, + "loss": 0.3544, + "step": 2896 + }, + { + "epoch": 0.5150222222222223, + "grad_norm": 0.34434840083122253, + "learning_rate": 2.38345426144258e-05, + "loss": 0.4268, + "step": 2897 + }, + { + "epoch": 0.5152, + "grad_norm": 0.39401403069496155, + "learning_rate": 2.3820595343411944e-05, + "loss": 0.3157, + "step": 2898 + }, + { + "epoch": 0.5153777777777778, + "grad_norm": 0.3257497251033783, + "learning_rate": 2.3806648440287714e-05, + "loss": 0.3711, + "step": 2899 + }, + { + "epoch": 0.5155555555555555, + "grad_norm": 0.4697202444076538, + "learning_rate": 2.379270190940351e-05, + "loss": 0.4909, + "step": 2900 + }, + { + "epoch": 0.5157333333333334, + "grad_norm": 0.2765554189682007, + "learning_rate": 2.377875575510967e-05, + "loss": 0.4369, + "step": 2901 + }, + { + "epoch": 0.5159111111111111, + "grad_norm": 0.25433337688446045, + "learning_rate": 2.376480998175638e-05, + "loss": 0.487, + "step": 2902 + }, + { + "epoch": 0.5160888888888889, + "grad_norm": 0.20568214356899261, + "learning_rate": 2.3750864593693732e-05, + "loss": 0.3431, + "step": 2903 + }, + { + "epoch": 0.5162666666666667, + "grad_norm": 0.3234412968158722, + "learning_rate": 2.3736919595271677e-05, + "loss": 0.5696, + "step": 2904 + }, + { + "epoch": 0.5164444444444445, + "grad_norm": 0.2738851308822632, + "learning_rate": 2.3722974990840058e-05, + "loss": 0.4949, + "step": 2905 + }, + { + "epoch": 0.5166222222222222, + "grad_norm": 0.2564401924610138, + "learning_rate": 2.3709030784748587e-05, + "loss": 0.5401, + "step": 2906 + }, + { + "epoch": 0.5168, + "grad_norm": 0.33738309144973755, + "learning_rate": 2.369508698134686e-05, + "loss": 0.4529, + "step": 2907 + }, + { + "epoch": 0.5169777777777778, + "grad_norm": 0.35009872913360596, + "learning_rate": 2.368114358498434e-05, + "loss": 0.4515, + "step": 2908 + }, + { + "epoch": 0.5171555555555556, + "grad_norm": 0.27846693992614746, + "learning_rate": 2.366720060001037e-05, + "loss": 0.4402, + "step": 2909 + }, + { + "epoch": 0.5173333333333333, + "grad_norm": 0.2543535530567169, + "learning_rate": 2.365325803077415e-05, + "loss": 0.5379, + "step": 2910 + }, + { + "epoch": 0.5175111111111111, + "grad_norm": 0.2479899376630783, + "learning_rate": 2.3639315881624777e-05, + "loss": 0.5189, + "step": 2911 + }, + { + "epoch": 0.5176888888888889, + "grad_norm": 0.24539698660373688, + "learning_rate": 2.3625374156911185e-05, + "loss": 0.4369, + "step": 2912 + }, + { + "epoch": 0.5178666666666667, + "grad_norm": 0.30989357829093933, + "learning_rate": 2.3611432860982204e-05, + "loss": 0.5439, + "step": 2913 + }, + { + "epoch": 0.5180444444444444, + "grad_norm": 0.2808515727519989, + "learning_rate": 2.3597491998186506e-05, + "loss": 0.4418, + "step": 2914 + }, + { + "epoch": 0.5182222222222223, + "grad_norm": 0.28210654854774475, + "learning_rate": 2.3583551572872656e-05, + "loss": 0.6309, + "step": 2915 + }, + { + "epoch": 0.5184, + "grad_norm": 0.3038218319416046, + "learning_rate": 2.3569611589389047e-05, + "loss": 0.5972, + "step": 2916 + }, + { + "epoch": 0.5185777777777778, + "grad_norm": 0.2985530197620392, + "learning_rate": 2.355567205208397e-05, + "loss": 0.6573, + "step": 2917 + }, + { + "epoch": 0.5187555555555555, + "grad_norm": 0.25254157185554504, + "learning_rate": 2.3541732965305543e-05, + "loss": 0.6679, + "step": 2918 + }, + { + "epoch": 0.5189333333333334, + "grad_norm": 0.2539697289466858, + "learning_rate": 2.3527794333401786e-05, + "loss": 0.4808, + "step": 2919 + }, + { + "epoch": 0.5191111111111111, + "grad_norm": 0.2901414632797241, + "learning_rate": 2.3513856160720522e-05, + "loss": 0.6226, + "step": 2920 + }, + { + "epoch": 0.5192888888888889, + "grad_norm": 0.2817017436027527, + "learning_rate": 2.349991845160949e-05, + "loss": 0.453, + "step": 2921 + }, + { + "epoch": 0.5194666666666666, + "grad_norm": 0.2587340474128723, + "learning_rate": 2.348598121041622e-05, + "loss": 0.5841, + "step": 2922 + }, + { + "epoch": 0.5196444444444445, + "grad_norm": 0.27609550952911377, + "learning_rate": 2.3472044441488174e-05, + "loss": 0.4046, + "step": 2923 + }, + { + "epoch": 0.5198222222222222, + "grad_norm": 0.25620874762535095, + "learning_rate": 2.345810814917258e-05, + "loss": 0.444, + "step": 2924 + }, + { + "epoch": 0.52, + "grad_norm": 0.29214295744895935, + "learning_rate": 2.3444172337816592e-05, + "loss": 0.5548, + "step": 2925 + }, + { + "epoch": 0.5201777777777777, + "grad_norm": 0.2659465968608856, + "learning_rate": 2.3430237011767167e-05, + "loss": 0.4064, + "step": 2926 + }, + { + "epoch": 0.5203555555555556, + "grad_norm": 0.2599498927593231, + "learning_rate": 2.3416302175371138e-05, + "loss": 0.3326, + "step": 2927 + }, + { + "epoch": 0.5205333333333333, + "grad_norm": 0.2800828814506531, + "learning_rate": 2.3402367832975163e-05, + "loss": 0.5793, + "step": 2928 + }, + { + "epoch": 0.5207111111111111, + "grad_norm": 0.2523271441459656, + "learning_rate": 2.3388433988925763e-05, + "loss": 0.4752, + "step": 2929 + }, + { + "epoch": 0.5208888888888888, + "grad_norm": 0.27516043186187744, + "learning_rate": 2.3374500647569297e-05, + "loss": 0.4336, + "step": 2930 + }, + { + "epoch": 0.5210666666666667, + "grad_norm": 0.26827526092529297, + "learning_rate": 2.336056781325197e-05, + "loss": 0.6334, + "step": 2931 + }, + { + "epoch": 0.5212444444444444, + "grad_norm": 0.25338873267173767, + "learning_rate": 2.3346635490319814e-05, + "loss": 0.4454, + "step": 2932 + }, + { + "epoch": 0.5214222222222222, + "grad_norm": 0.2623731195926666, + "learning_rate": 2.3332703683118732e-05, + "loss": 0.3235, + "step": 2933 + }, + { + "epoch": 0.5216, + "grad_norm": 0.2607010006904602, + "learning_rate": 2.3318772395994433e-05, + "loss": 0.5419, + "step": 2934 + }, + { + "epoch": 0.5217777777777778, + "grad_norm": 0.26967692375183105, + "learning_rate": 2.3304841633292487e-05, + "loss": 0.349, + "step": 2935 + }, + { + "epoch": 0.5219555555555555, + "grad_norm": 0.3082602918148041, + "learning_rate": 2.3290911399358285e-05, + "loss": 0.6043, + "step": 2936 + }, + { + "epoch": 0.5221333333333333, + "grad_norm": 0.25890079140663147, + "learning_rate": 2.327698169853707e-05, + "loss": 0.4205, + "step": 2937 + }, + { + "epoch": 0.5223111111111111, + "grad_norm": 0.2111167162656784, + "learning_rate": 2.32630525351739e-05, + "loss": 0.4278, + "step": 2938 + }, + { + "epoch": 0.5224888888888889, + "grad_norm": 0.3305027186870575, + "learning_rate": 2.324912391361368e-05, + "loss": 0.4214, + "step": 2939 + }, + { + "epoch": 0.5226666666666666, + "grad_norm": 0.3526584506034851, + "learning_rate": 2.323519583820114e-05, + "loss": 0.4434, + "step": 2940 + }, + { + "epoch": 0.5228444444444444, + "grad_norm": 0.3649956285953522, + "learning_rate": 2.3221268313280838e-05, + "loss": 0.4595, + "step": 2941 + }, + { + "epoch": 0.5230222222222223, + "grad_norm": 0.3027312159538269, + "learning_rate": 2.320734134319715e-05, + "loss": 0.3837, + "step": 2942 + }, + { + "epoch": 0.5232, + "grad_norm": 0.29625198245048523, + "learning_rate": 2.319341493229431e-05, + "loss": 0.3884, + "step": 2943 + }, + { + "epoch": 0.5233777777777778, + "grad_norm": 0.2824147343635559, + "learning_rate": 2.3179489084916358e-05, + "loss": 0.334, + "step": 2944 + }, + { + "epoch": 0.5235555555555556, + "grad_norm": 0.29194971919059753, + "learning_rate": 2.316556380540715e-05, + "loss": 0.3228, + "step": 2945 + }, + { + "epoch": 0.5237333333333334, + "grad_norm": 0.36555200815200806, + "learning_rate": 2.3151639098110377e-05, + "loss": 0.4909, + "step": 2946 + }, + { + "epoch": 0.5239111111111111, + "grad_norm": 0.36511796712875366, + "learning_rate": 2.3137714967369545e-05, + "loss": 0.44, + "step": 2947 + }, + { + "epoch": 0.5240888888888889, + "grad_norm": 0.40185901522636414, + "learning_rate": 2.3123791417527994e-05, + "loss": 0.4035, + "step": 2948 + }, + { + "epoch": 0.5242666666666667, + "grad_norm": 0.39672690629959106, + "learning_rate": 2.3109868452928855e-05, + "loss": 0.4083, + "step": 2949 + }, + { + "epoch": 0.5244444444444445, + "grad_norm": 0.5538506507873535, + "learning_rate": 2.3095946077915114e-05, + "loss": 0.4706, + "step": 2950 + }, + { + "epoch": 0.5246222222222222, + "grad_norm": 0.2571939527988434, + "learning_rate": 2.3082024296829536e-05, + "loss": 0.5387, + "step": 2951 + }, + { + "epoch": 0.5248, + "grad_norm": 0.258331835269928, + "learning_rate": 2.3068103114014726e-05, + "loss": 0.3754, + "step": 2952 + }, + { + "epoch": 0.5249777777777778, + "grad_norm": 0.2491993010044098, + "learning_rate": 2.3054182533813087e-05, + "loss": 0.4382, + "step": 2953 + }, + { + "epoch": 0.5251555555555556, + "grad_norm": 0.2990800738334656, + "learning_rate": 2.304026256056685e-05, + "loss": 0.5502, + "step": 2954 + }, + { + "epoch": 0.5253333333333333, + "grad_norm": 0.3093014061450958, + "learning_rate": 2.3026343198618043e-05, + "loss": 0.4829, + "step": 2955 + }, + { + "epoch": 0.5255111111111112, + "grad_norm": 0.30510789155960083, + "learning_rate": 2.301242445230851e-05, + "loss": 0.5304, + "step": 2956 + }, + { + "epoch": 0.5256888888888889, + "grad_norm": 0.2941192388534546, + "learning_rate": 2.2998506325979894e-05, + "loss": 0.5289, + "step": 2957 + }, + { + "epoch": 0.5258666666666667, + "grad_norm": 0.2575453817844391, + "learning_rate": 2.2984588823973662e-05, + "loss": 0.566, + "step": 2958 + }, + { + "epoch": 0.5260444444444444, + "grad_norm": 0.24552775919437408, + "learning_rate": 2.2970671950631064e-05, + "loss": 0.4933, + "step": 2959 + }, + { + "epoch": 0.5262222222222223, + "grad_norm": 0.2217177152633667, + "learning_rate": 2.2956755710293183e-05, + "loss": 0.4186, + "step": 2960 + }, + { + "epoch": 0.5264, + "grad_norm": 0.2924637198448181, + "learning_rate": 2.294284010730086e-05, + "loss": 0.4515, + "step": 2961 + }, + { + "epoch": 0.5265777777777778, + "grad_norm": 0.3973850607872009, + "learning_rate": 2.2928925145994794e-05, + "loss": 0.5661, + "step": 2962 + }, + { + "epoch": 0.5267555555555555, + "grad_norm": 0.2648034989833832, + "learning_rate": 2.291501083071543e-05, + "loss": 0.4757, + "step": 2963 + }, + { + "epoch": 0.5269333333333334, + "grad_norm": 0.26346439123153687, + "learning_rate": 2.2901097165803062e-05, + "loss": 0.4716, + "step": 2964 + }, + { + "epoch": 0.5271111111111111, + "grad_norm": 0.29133710265159607, + "learning_rate": 2.2887184155597723e-05, + "loss": 0.4467, + "step": 2965 + }, + { + "epoch": 0.5272888888888889, + "grad_norm": 0.29561737179756165, + "learning_rate": 2.28732718044393e-05, + "loss": 0.4538, + "step": 2966 + }, + { + "epoch": 0.5274666666666666, + "grad_norm": 0.2607540786266327, + "learning_rate": 2.2859360116667432e-05, + "loss": 0.4279, + "step": 2967 + }, + { + "epoch": 0.5276444444444445, + "grad_norm": 0.23969291150569916, + "learning_rate": 2.284544909662158e-05, + "loss": 0.4149, + "step": 2968 + }, + { + "epoch": 0.5278222222222222, + "grad_norm": 0.2727516293525696, + "learning_rate": 2.2831538748640974e-05, + "loss": 0.5137, + "step": 2969 + }, + { + "epoch": 0.528, + "grad_norm": 0.2958790957927704, + "learning_rate": 2.281762907706465e-05, + "loss": 0.5748, + "step": 2970 + }, + { + "epoch": 0.5281777777777777, + "grad_norm": 0.2629375755786896, + "learning_rate": 2.280372008623142e-05, + "loss": 0.5168, + "step": 2971 + }, + { + "epoch": 0.5283555555555556, + "grad_norm": 0.3317106068134308, + "learning_rate": 2.27898117804799e-05, + "loss": 0.4831, + "step": 2972 + }, + { + "epoch": 0.5285333333333333, + "grad_norm": 0.2788366973400116, + "learning_rate": 2.2775904164148477e-05, + "loss": 0.6125, + "step": 2973 + }, + { + "epoch": 0.5287111111111111, + "grad_norm": 0.2513751685619354, + "learning_rate": 2.2761997241575333e-05, + "loss": 0.3655, + "step": 2974 + }, + { + "epoch": 0.5288888888888889, + "grad_norm": 0.28906145691871643, + "learning_rate": 2.2748091017098423e-05, + "loss": 0.3725, + "step": 2975 + }, + { + "epoch": 0.5290666666666667, + "grad_norm": 0.23361513018608093, + "learning_rate": 2.2734185495055503e-05, + "loss": 0.5327, + "step": 2976 + }, + { + "epoch": 0.5292444444444444, + "grad_norm": 0.23080763220787048, + "learning_rate": 2.272028067978408e-05, + "loss": 0.4451, + "step": 2977 + }, + { + "epoch": 0.5294222222222222, + "grad_norm": 0.24192284047603607, + "learning_rate": 2.270637657562148e-05, + "loss": 0.4331, + "step": 2978 + }, + { + "epoch": 0.5296, + "grad_norm": 0.2804851830005646, + "learning_rate": 2.2692473186904765e-05, + "loss": 0.5155, + "step": 2979 + }, + { + "epoch": 0.5297777777777778, + "grad_norm": 0.3095892071723938, + "learning_rate": 2.267857051797081e-05, + "loss": 0.4082, + "step": 2980 + }, + { + "epoch": 0.5299555555555555, + "grad_norm": 0.3020741641521454, + "learning_rate": 2.266466857315624e-05, + "loss": 0.5722, + "step": 2981 + }, + { + "epoch": 0.5301333333333333, + "grad_norm": 0.2788066565990448, + "learning_rate": 2.2650767356797474e-05, + "loss": 0.5409, + "step": 2982 + }, + { + "epoch": 0.5303111111111111, + "grad_norm": 0.23060497641563416, + "learning_rate": 2.2636866873230677e-05, + "loss": 0.4333, + "step": 2983 + }, + { + "epoch": 0.5304888888888889, + "grad_norm": 0.24649953842163086, + "learning_rate": 2.2622967126791823e-05, + "loss": 0.4292, + "step": 2984 + }, + { + "epoch": 0.5306666666666666, + "grad_norm": 0.2515879273414612, + "learning_rate": 2.2609068121816612e-05, + "loss": 0.432, + "step": 2985 + }, + { + "epoch": 0.5308444444444445, + "grad_norm": 0.3675811290740967, + "learning_rate": 2.2595169862640568e-05, + "loss": 0.5552, + "step": 2986 + }, + { + "epoch": 0.5310222222222222, + "grad_norm": 0.33375123143196106, + "learning_rate": 2.2581272353598915e-05, + "loss": 0.5122, + "step": 2987 + }, + { + "epoch": 0.5312, + "grad_norm": 0.2922704517841339, + "learning_rate": 2.256737559902671e-05, + "loss": 0.5036, + "step": 2988 + }, + { + "epoch": 0.5313777777777777, + "grad_norm": 0.28789961338043213, + "learning_rate": 2.255347960325871e-05, + "loss": 0.5394, + "step": 2989 + }, + { + "epoch": 0.5315555555555556, + "grad_norm": 0.21229441463947296, + "learning_rate": 2.2539584370629508e-05, + "loss": 0.4303, + "step": 2990 + }, + { + "epoch": 0.5317333333333333, + "grad_norm": 0.2564389407634735, + "learning_rate": 2.2525689905473376e-05, + "loss": 0.429, + "step": 2991 + }, + { + "epoch": 0.5319111111111111, + "grad_norm": 0.29564332962036133, + "learning_rate": 2.2511796212124425e-05, + "loss": 0.5475, + "step": 2992 + }, + { + "epoch": 0.5320888888888888, + "grad_norm": 0.4213256537914276, + "learning_rate": 2.2497903294916474e-05, + "loss": 0.3665, + "step": 2993 + }, + { + "epoch": 0.5322666666666667, + "grad_norm": 0.349552184343338, + "learning_rate": 2.248401115818312e-05, + "loss": 0.4839, + "step": 2994 + }, + { + "epoch": 0.5324444444444445, + "grad_norm": 0.3147105276584625, + "learning_rate": 2.247011980625771e-05, + "loss": 0.4091, + "step": 2995 + }, + { + "epoch": 0.5326222222222222, + "grad_norm": 0.41335031390190125, + "learning_rate": 2.2456229243473345e-05, + "loss": 0.4017, + "step": 2996 + }, + { + "epoch": 0.5328, + "grad_norm": 0.47623759508132935, + "learning_rate": 2.2442339474162898e-05, + "loss": 0.537, + "step": 2997 + }, + { + "epoch": 0.5329777777777778, + "grad_norm": 0.364778071641922, + "learning_rate": 2.2428450502658967e-05, + "loss": 0.4765, + "step": 2998 + }, + { + "epoch": 0.5331555555555556, + "grad_norm": 0.36246734857559204, + "learning_rate": 2.241456233329392e-05, + "loss": 0.4383, + "step": 2999 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.46416351199150085, + "learning_rate": 2.2400674970399863e-05, + "loss": 0.4666, + "step": 3000 + }, + { + "epoch": 0.5333333333333333, + "eval_loss": 0.4750007688999176, + "eval_runtime": 1817.4276, + "eval_samples_per_second": 2.751, + "eval_steps_per_second": 0.344, + "step": 3000 + }, + { + "epoch": 0.5335111111111112, + "grad_norm": 0.29531964659690857, + "learning_rate": 2.238678841830867e-05, + "loss": 0.4692, + "step": 3001 + }, + { + "epoch": 0.5336888888888889, + "grad_norm": 0.28551557660102844, + "learning_rate": 2.2372902681351923e-05, + "loss": 0.3918, + "step": 3002 + }, + { + "epoch": 0.5338666666666667, + "grad_norm": 0.23644867539405823, + "learning_rate": 2.235901776386101e-05, + "loss": 0.4583, + "step": 3003 + }, + { + "epoch": 0.5340444444444444, + "grad_norm": 0.30046346783638, + "learning_rate": 2.2345133670166997e-05, + "loss": 0.4637, + "step": 3004 + }, + { + "epoch": 0.5342222222222223, + "grad_norm": 0.2879243493080139, + "learning_rate": 2.2331250404600755e-05, + "loss": 0.4915, + "step": 3005 + }, + { + "epoch": 0.5344, + "grad_norm": 0.22171127796173096, + "learning_rate": 2.2317367971492835e-05, + "loss": 0.3516, + "step": 3006 + }, + { + "epoch": 0.5345777777777778, + "grad_norm": 0.22647008299827576, + "learning_rate": 2.2303486375173585e-05, + "loss": 0.4151, + "step": 3007 + }, + { + "epoch": 0.5347555555555555, + "grad_norm": 0.26434826850891113, + "learning_rate": 2.2289605619973045e-05, + "loss": 0.553, + "step": 3008 + }, + { + "epoch": 0.5349333333333334, + "grad_norm": 0.2929951250553131, + "learning_rate": 2.2275725710221035e-05, + "loss": 0.4706, + "step": 3009 + }, + { + "epoch": 0.5351111111111111, + "grad_norm": 0.23885107040405273, + "learning_rate": 2.2261846650247075e-05, + "loss": 0.4362, + "step": 3010 + }, + { + "epoch": 0.5352888888888889, + "grad_norm": 0.28213077783584595, + "learning_rate": 2.224796844438045e-05, + "loss": 0.5036, + "step": 3011 + }, + { + "epoch": 0.5354666666666666, + "grad_norm": 0.2620048522949219, + "learning_rate": 2.223409109695015e-05, + "loss": 0.4175, + "step": 3012 + }, + { + "epoch": 0.5356444444444445, + "grad_norm": 0.25692641735076904, + "learning_rate": 2.2220214612284924e-05, + "loss": 0.4043, + "step": 3013 + }, + { + "epoch": 0.5358222222222222, + "grad_norm": 0.22911515831947327, + "learning_rate": 2.2206338994713228e-05, + "loss": 0.3807, + "step": 3014 + }, + { + "epoch": 0.536, + "grad_norm": 0.3070697784423828, + "learning_rate": 2.2192464248563265e-05, + "loss": 0.4715, + "step": 3015 + }, + { + "epoch": 0.5361777777777778, + "grad_norm": 0.2712344825267792, + "learning_rate": 2.217859037816296e-05, + "loss": 0.467, + "step": 3016 + }, + { + "epoch": 0.5363555555555556, + "grad_norm": 0.2436918020248413, + "learning_rate": 2.2164717387839966e-05, + "loss": 0.4681, + "step": 3017 + }, + { + "epoch": 0.5365333333333333, + "grad_norm": 0.30209654569625854, + "learning_rate": 2.215084528192165e-05, + "loss": 0.4805, + "step": 3018 + }, + { + "epoch": 0.5367111111111111, + "grad_norm": 0.20991253852844238, + "learning_rate": 2.213697406473513e-05, + "loss": 0.3232, + "step": 3019 + }, + { + "epoch": 0.5368888888888889, + "grad_norm": 0.2576856017112732, + "learning_rate": 2.2123103740607215e-05, + "loss": 0.4405, + "step": 3020 + }, + { + "epoch": 0.5370666666666667, + "grad_norm": 0.2661442458629608, + "learning_rate": 2.2109234313864465e-05, + "loss": 0.4939, + "step": 3021 + }, + { + "epoch": 0.5372444444444444, + "grad_norm": 0.2675219476222992, + "learning_rate": 2.209536578883313e-05, + "loss": 0.4972, + "step": 3022 + }, + { + "epoch": 0.5374222222222222, + "grad_norm": 0.2708624303340912, + "learning_rate": 2.208149816983921e-05, + "loss": 0.5284, + "step": 3023 + }, + { + "epoch": 0.5376, + "grad_norm": 0.24882188439369202, + "learning_rate": 2.2067631461208393e-05, + "loss": 0.3975, + "step": 3024 + }, + { + "epoch": 0.5377777777777778, + "grad_norm": 0.25015929341316223, + "learning_rate": 2.205376566726611e-05, + "loss": 0.4428, + "step": 3025 + }, + { + "epoch": 0.5379555555555555, + "grad_norm": 0.2505841851234436, + "learning_rate": 2.2039900792337474e-05, + "loss": 0.5788, + "step": 3026 + }, + { + "epoch": 0.5381333333333334, + "grad_norm": 0.2929605543613434, + "learning_rate": 2.202603684074736e-05, + "loss": 0.5626, + "step": 3027 + }, + { + "epoch": 0.5383111111111111, + "grad_norm": 0.30496540665626526, + "learning_rate": 2.2012173816820297e-05, + "loss": 0.5182, + "step": 3028 + }, + { + "epoch": 0.5384888888888889, + "grad_norm": 0.28312090039253235, + "learning_rate": 2.199831172488058e-05, + "loss": 0.6722, + "step": 3029 + }, + { + "epoch": 0.5386666666666666, + "grad_norm": 0.22242353856563568, + "learning_rate": 2.1984450569252154e-05, + "loss": 0.4375, + "step": 3030 + }, + { + "epoch": 0.5388444444444445, + "grad_norm": 0.24788910150527954, + "learning_rate": 2.1970590354258745e-05, + "loss": 0.529, + "step": 3031 + }, + { + "epoch": 0.5390222222222222, + "grad_norm": 0.24507980048656464, + "learning_rate": 2.1956731084223702e-05, + "loss": 0.3992, + "step": 3032 + }, + { + "epoch": 0.5392, + "grad_norm": 0.3020592927932739, + "learning_rate": 2.194287276347016e-05, + "loss": 0.4148, + "step": 3033 + }, + { + "epoch": 0.5393777777777777, + "grad_norm": 0.2942517101764679, + "learning_rate": 2.19290153963209e-05, + "loss": 0.4847, + "step": 3034 + }, + { + "epoch": 0.5395555555555556, + "grad_norm": 0.2896391451358795, + "learning_rate": 2.1915158987098432e-05, + "loss": 0.4205, + "step": 3035 + }, + { + "epoch": 0.5397333333333333, + "grad_norm": 0.3078225553035736, + "learning_rate": 2.1901303540124956e-05, + "loss": 0.5328, + "step": 3036 + }, + { + "epoch": 0.5399111111111111, + "grad_norm": 0.29907989501953125, + "learning_rate": 2.188744905972239e-05, + "loss": 0.4695, + "step": 3037 + }, + { + "epoch": 0.5400888888888888, + "grad_norm": 0.3034135103225708, + "learning_rate": 2.187359555021232e-05, + "loss": 0.4692, + "step": 3038 + }, + { + "epoch": 0.5402666666666667, + "grad_norm": 0.28414812684059143, + "learning_rate": 2.1859743015916065e-05, + "loss": 0.5705, + "step": 3039 + }, + { + "epoch": 0.5404444444444444, + "grad_norm": 0.271185964345932, + "learning_rate": 2.1845891461154602e-05, + "loss": 0.5259, + "step": 3040 + }, + { + "epoch": 0.5406222222222222, + "grad_norm": 0.27508774399757385, + "learning_rate": 2.183204089024864e-05, + "loss": 0.3037, + "step": 3041 + }, + { + "epoch": 0.5408, + "grad_norm": 0.31245699524879456, + "learning_rate": 2.181819130751855e-05, + "loss": 0.3887, + "step": 3042 + }, + { + "epoch": 0.5409777777777778, + "grad_norm": 0.31404441595077515, + "learning_rate": 2.1804342717284415e-05, + "loss": 0.4337, + "step": 3043 + }, + { + "epoch": 0.5411555555555555, + "grad_norm": 0.322235107421875, + "learning_rate": 2.1790495123866e-05, + "loss": 0.4413, + "step": 3044 + }, + { + "epoch": 0.5413333333333333, + "grad_norm": 0.3345963656902313, + "learning_rate": 2.177664853158276e-05, + "loss": 0.448, + "step": 3045 + }, + { + "epoch": 0.5415111111111112, + "grad_norm": 0.4021861255168915, + "learning_rate": 2.176280294475383e-05, + "loss": 0.4805, + "step": 3046 + }, + { + "epoch": 0.5416888888888889, + "grad_norm": 0.36822208762168884, + "learning_rate": 2.1748958367698046e-05, + "loss": 0.4612, + "step": 3047 + }, + { + "epoch": 0.5418666666666667, + "grad_norm": 0.48115992546081543, + "learning_rate": 2.1735114804733938e-05, + "loss": 0.4292, + "step": 3048 + }, + { + "epoch": 0.5420444444444444, + "grad_norm": 0.38257506489753723, + "learning_rate": 2.172127226017967e-05, + "loss": 0.3556, + "step": 3049 + }, + { + "epoch": 0.5422222222222223, + "grad_norm": 0.5177589654922485, + "learning_rate": 2.170743073835316e-05, + "loss": 0.5349, + "step": 3050 + }, + { + "epoch": 0.5424, + "grad_norm": 0.3127012252807617, + "learning_rate": 2.1693590243571938e-05, + "loss": 0.4273, + "step": 3051 + }, + { + "epoch": 0.5425777777777778, + "grad_norm": 0.2610132694244385, + "learning_rate": 2.1679750780153267e-05, + "loss": 0.3007, + "step": 3052 + }, + { + "epoch": 0.5427555555555555, + "grad_norm": 0.20275862514972687, + "learning_rate": 2.166591235241405e-05, + "loss": 0.3236, + "step": 3053 + }, + { + "epoch": 0.5429333333333334, + "grad_norm": 0.22637619078159332, + "learning_rate": 2.16520749646709e-05, + "loss": 0.4046, + "step": 3054 + }, + { + "epoch": 0.5431111111111111, + "grad_norm": 0.2169070988893509, + "learning_rate": 2.163823862124007e-05, + "loss": 0.4394, + "step": 3055 + }, + { + "epoch": 0.5432888888888889, + "grad_norm": 0.25098901987075806, + "learning_rate": 2.1624403326437523e-05, + "loss": 0.3685, + "step": 3056 + }, + { + "epoch": 0.5434666666666667, + "grad_norm": 0.2361176460981369, + "learning_rate": 2.1610569084578867e-05, + "loss": 0.3222, + "step": 3057 + }, + { + "epoch": 0.5436444444444445, + "grad_norm": 0.3332407474517822, + "learning_rate": 2.1596735899979396e-05, + "loss": 0.5406, + "step": 3058 + }, + { + "epoch": 0.5438222222222222, + "grad_norm": 0.338297039270401, + "learning_rate": 2.158290377695407e-05, + "loss": 0.7095, + "step": 3059 + }, + { + "epoch": 0.544, + "grad_norm": 0.27734121680259705, + "learning_rate": 2.1569072719817526e-05, + "loss": 0.4317, + "step": 3060 + }, + { + "epoch": 0.5441777777777778, + "grad_norm": 0.26475220918655396, + "learning_rate": 2.155524273288405e-05, + "loss": 0.5098, + "step": 3061 + }, + { + "epoch": 0.5443555555555556, + "grad_norm": 0.258499413728714, + "learning_rate": 2.1541413820467615e-05, + "loss": 0.4341, + "step": 3062 + }, + { + "epoch": 0.5445333333333333, + "grad_norm": 0.29776763916015625, + "learning_rate": 2.1527585986881837e-05, + "loss": 0.4436, + "step": 3063 + }, + { + "epoch": 0.5447111111111111, + "grad_norm": 0.2530074417591095, + "learning_rate": 2.1513759236440023e-05, + "loss": 0.4647, + "step": 3064 + }, + { + "epoch": 0.5448888888888889, + "grad_norm": 0.2732505798339844, + "learning_rate": 2.149993357345511e-05, + "loss": 0.5972, + "step": 3065 + }, + { + "epoch": 0.5450666666666667, + "grad_norm": 0.2999916970729828, + "learning_rate": 2.148610900223973e-05, + "loss": 0.4883, + "step": 3066 + }, + { + "epoch": 0.5452444444444444, + "grad_norm": 0.26998549699783325, + "learning_rate": 2.1472285527106137e-05, + "loss": 0.5718, + "step": 3067 + }, + { + "epoch": 0.5454222222222223, + "grad_norm": 0.2478506714105606, + "learning_rate": 2.145846315236629e-05, + "loss": 0.5141, + "step": 3068 + }, + { + "epoch": 0.5456, + "grad_norm": 0.2767607867717743, + "learning_rate": 2.1444641882331744e-05, + "loss": 0.5581, + "step": 3069 + }, + { + "epoch": 0.5457777777777778, + "grad_norm": 0.3269823491573334, + "learning_rate": 2.1430821721313782e-05, + "loss": 0.5526, + "step": 3070 + }, + { + "epoch": 0.5459555555555555, + "grad_norm": 0.2836489975452423, + "learning_rate": 2.1417002673623264e-05, + "loss": 0.464, + "step": 3071 + }, + { + "epoch": 0.5461333333333334, + "grad_norm": 0.228063702583313, + "learning_rate": 2.1403184743570778e-05, + "loss": 0.61, + "step": 3072 + }, + { + "epoch": 0.5463111111111111, + "grad_norm": 0.2642533779144287, + "learning_rate": 2.138936793546649e-05, + "loss": 0.398, + "step": 3073 + }, + { + "epoch": 0.5464888888888889, + "grad_norm": 0.23672597110271454, + "learning_rate": 2.137555225362028e-05, + "loss": 0.4233, + "step": 3074 + }, + { + "epoch": 0.5466666666666666, + "grad_norm": 0.246201291680336, + "learning_rate": 2.1361737702341634e-05, + "loss": 0.4884, + "step": 3075 + }, + { + "epoch": 0.5468444444444445, + "grad_norm": 0.27198857069015503, + "learning_rate": 2.1347924285939714e-05, + "loss": 0.6578, + "step": 3076 + }, + { + "epoch": 0.5470222222222222, + "grad_norm": 0.26538413763046265, + "learning_rate": 2.1334112008723297e-05, + "loss": 0.5842, + "step": 3077 + }, + { + "epoch": 0.5472, + "grad_norm": 0.29970574378967285, + "learning_rate": 2.132030087500084e-05, + "loss": 0.6014, + "step": 3078 + }, + { + "epoch": 0.5473777777777777, + "grad_norm": 0.23340369760990143, + "learning_rate": 2.130649088908041e-05, + "loss": 0.3963, + "step": 3079 + }, + { + "epoch": 0.5475555555555556, + "grad_norm": 0.2756349742412567, + "learning_rate": 2.1292682055269745e-05, + "loss": 0.4629, + "step": 3080 + }, + { + "epoch": 0.5477333333333333, + "grad_norm": 0.25403615832328796, + "learning_rate": 2.1278874377876197e-05, + "loss": 0.4711, + "step": 3081 + }, + { + "epoch": 0.5479111111111111, + "grad_norm": 0.2636300325393677, + "learning_rate": 2.1265067861206784e-05, + "loss": 0.5987, + "step": 3082 + }, + { + "epoch": 0.5480888888888888, + "grad_norm": 0.2964397072792053, + "learning_rate": 2.1251262509568133e-05, + "loss": 0.4918, + "step": 3083 + }, + { + "epoch": 0.5482666666666667, + "grad_norm": 0.25804513692855835, + "learning_rate": 2.123745832726654e-05, + "loss": 0.4802, + "step": 3084 + }, + { + "epoch": 0.5484444444444444, + "grad_norm": 0.3168201148509979, + "learning_rate": 2.1223655318607904e-05, + "loss": 0.6161, + "step": 3085 + }, + { + "epoch": 0.5486222222222222, + "grad_norm": 0.2684416174888611, + "learning_rate": 2.1209853487897784e-05, + "loss": 0.5222, + "step": 3086 + }, + { + "epoch": 0.5488, + "grad_norm": 0.2620787024497986, + "learning_rate": 2.119605283944135e-05, + "loss": 0.5084, + "step": 3087 + }, + { + "epoch": 0.5489777777777778, + "grad_norm": 0.2609635591506958, + "learning_rate": 2.1182253377543425e-05, + "loss": 0.4202, + "step": 3088 + }, + { + "epoch": 0.5491555555555555, + "grad_norm": 0.334938108921051, + "learning_rate": 2.1168455106508446e-05, + "loss": 0.3444, + "step": 3089 + }, + { + "epoch": 0.5493333333333333, + "grad_norm": 0.3327725827693939, + "learning_rate": 2.1154658030640483e-05, + "loss": 0.553, + "step": 3090 + }, + { + "epoch": 0.5495111111111111, + "grad_norm": 0.32416072487831116, + "learning_rate": 2.114086215424322e-05, + "loss": 0.3738, + "step": 3091 + }, + { + "epoch": 0.5496888888888889, + "grad_norm": 0.30679434537887573, + "learning_rate": 2.1127067481620013e-05, + "loss": 0.3967, + "step": 3092 + }, + { + "epoch": 0.5498666666666666, + "grad_norm": 0.3186824321746826, + "learning_rate": 2.1113274017073774e-05, + "loss": 0.4242, + "step": 3093 + }, + { + "epoch": 0.5500444444444444, + "grad_norm": 0.2917342185974121, + "learning_rate": 2.1099481764907108e-05, + "loss": 0.3514, + "step": 3094 + }, + { + "epoch": 0.5502222222222222, + "grad_norm": 0.3786706030368805, + "learning_rate": 2.108569072942217e-05, + "loss": 0.5337, + "step": 3095 + }, + { + "epoch": 0.5504, + "grad_norm": 0.3980700969696045, + "learning_rate": 2.1071900914920816e-05, + "loss": 0.4806, + "step": 3096 + }, + { + "epoch": 0.5505777777777778, + "grad_norm": 0.6132436394691467, + "learning_rate": 2.1058112325704436e-05, + "loss": 0.4667, + "step": 3097 + }, + { + "epoch": 0.5507555555555556, + "grad_norm": 0.4503932595252991, + "learning_rate": 2.1044324966074104e-05, + "loss": 0.3364, + "step": 3098 + }, + { + "epoch": 0.5509333333333334, + "grad_norm": 0.4796298146247864, + "learning_rate": 2.103053884033049e-05, + "loss": 0.4539, + "step": 3099 + }, + { + "epoch": 0.5511111111111111, + "grad_norm": 0.502371072769165, + "learning_rate": 2.1016753952773867e-05, + "loss": 0.4744, + "step": 3100 + }, + { + "epoch": 0.5512888888888889, + "grad_norm": 0.32859691977500916, + "learning_rate": 2.1002970307704132e-05, + "loss": 0.5014, + "step": 3101 + }, + { + "epoch": 0.5514666666666667, + "grad_norm": 0.25064122676849365, + "learning_rate": 2.0989187909420786e-05, + "loss": 0.4872, + "step": 3102 + }, + { + "epoch": 0.5516444444444445, + "grad_norm": 0.349989116191864, + "learning_rate": 2.0975406762222966e-05, + "loss": 0.5538, + "step": 3103 + }, + { + "epoch": 0.5518222222222222, + "grad_norm": 0.28533419966697693, + "learning_rate": 2.0961626870409383e-05, + "loss": 0.5271, + "step": 3104 + }, + { + "epoch": 0.552, + "grad_norm": 0.29115214943885803, + "learning_rate": 2.0947848238278385e-05, + "loss": 0.4614, + "step": 3105 + }, + { + "epoch": 0.5521777777777778, + "grad_norm": 0.29967767000198364, + "learning_rate": 2.0934070870127912e-05, + "loss": 0.4662, + "step": 3106 + }, + { + "epoch": 0.5523555555555556, + "grad_norm": 0.2915601134300232, + "learning_rate": 2.0920294770255517e-05, + "loss": 0.4886, + "step": 3107 + }, + { + "epoch": 0.5525333333333333, + "grad_norm": 0.24265961349010468, + "learning_rate": 2.0906519942958347e-05, + "loss": 0.3342, + "step": 3108 + }, + { + "epoch": 0.5527111111111112, + "grad_norm": 0.28501981496810913, + "learning_rate": 2.089274639253317e-05, + "loss": 0.4474, + "step": 3109 + }, + { + "epoch": 0.5528888888888889, + "grad_norm": 0.2504535913467407, + "learning_rate": 2.0878974123276328e-05, + "loss": 0.4079, + "step": 3110 + }, + { + "epoch": 0.5530666666666667, + "grad_norm": 0.3231838643550873, + "learning_rate": 2.0865203139483812e-05, + "loss": 0.4608, + "step": 3111 + }, + { + "epoch": 0.5532444444444444, + "grad_norm": 0.3012704849243164, + "learning_rate": 2.085143344545114e-05, + "loss": 0.6843, + "step": 3112 + }, + { + "epoch": 0.5534222222222223, + "grad_norm": 0.25548985600471497, + "learning_rate": 2.083766504547351e-05, + "loss": 0.4361, + "step": 3113 + }, + { + "epoch": 0.5536, + "grad_norm": 0.3070829212665558, + "learning_rate": 2.082389794384564e-05, + "loss": 0.5337, + "step": 3114 + }, + { + "epoch": 0.5537777777777778, + "grad_norm": 0.2541947066783905, + "learning_rate": 2.08101321448619e-05, + "loss": 0.4287, + "step": 3115 + }, + { + "epoch": 0.5539555555555555, + "grad_norm": 0.2300274819135666, + "learning_rate": 2.0796367652816213e-05, + "loss": 0.5245, + "step": 3116 + }, + { + "epoch": 0.5541333333333334, + "grad_norm": 0.28947713971138, + "learning_rate": 2.0782604472002128e-05, + "loss": 0.4189, + "step": 3117 + }, + { + "epoch": 0.5543111111111111, + "grad_norm": 0.3014647364616394, + "learning_rate": 2.076884260671276e-05, + "loss": 0.5879, + "step": 3118 + }, + { + "epoch": 0.5544888888888889, + "grad_norm": 0.24373458325862885, + "learning_rate": 2.075508206124084e-05, + "loss": 0.4104, + "step": 3119 + }, + { + "epoch": 0.5546666666666666, + "grad_norm": 0.29070523381233215, + "learning_rate": 2.0741322839878647e-05, + "loss": 0.5328, + "step": 3120 + }, + { + "epoch": 0.5548444444444445, + "grad_norm": 0.27347061038017273, + "learning_rate": 2.0727564946918087e-05, + "loss": 0.4892, + "step": 3121 + }, + { + "epoch": 0.5550222222222222, + "grad_norm": 0.2780001163482666, + "learning_rate": 2.0713808386650625e-05, + "loss": 0.5335, + "step": 3122 + }, + { + "epoch": 0.5552, + "grad_norm": 0.28905633091926575, + "learning_rate": 2.070005316336733e-05, + "loss": 0.5908, + "step": 3123 + }, + { + "epoch": 0.5553777777777777, + "grad_norm": 0.2759278416633606, + "learning_rate": 2.0686299281358835e-05, + "loss": 0.5362, + "step": 3124 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.26638245582580566, + "learning_rate": 2.067254674491538e-05, + "loss": 0.5735, + "step": 3125 + }, + { + "epoch": 0.5557333333333333, + "grad_norm": 0.24599094688892365, + "learning_rate": 2.0658795558326743e-05, + "loss": 0.4904, + "step": 3126 + }, + { + "epoch": 0.5559111111111111, + "grad_norm": 0.3339538872241974, + "learning_rate": 2.0645045725882332e-05, + "loss": 0.4904, + "step": 3127 + }, + { + "epoch": 0.5560888888888889, + "grad_norm": 0.27423030138015747, + "learning_rate": 2.0631297251871093e-05, + "loss": 0.4467, + "step": 3128 + }, + { + "epoch": 0.5562666666666667, + "grad_norm": 0.2931954860687256, + "learning_rate": 2.0617550140581578e-05, + "loss": 0.4755, + "step": 3129 + }, + { + "epoch": 0.5564444444444444, + "grad_norm": 0.32772743701934814, + "learning_rate": 2.0603804396301876e-05, + "loss": 0.6609, + "step": 3130 + }, + { + "epoch": 0.5566222222222222, + "grad_norm": 0.21841637790203094, + "learning_rate": 2.0590060023319696e-05, + "loss": 0.4569, + "step": 3131 + }, + { + "epoch": 0.5568, + "grad_norm": 0.2912640869617462, + "learning_rate": 2.0576317025922283e-05, + "loss": 0.4714, + "step": 3132 + }, + { + "epoch": 0.5569777777777778, + "grad_norm": 0.23478421568870544, + "learning_rate": 2.056257540839647e-05, + "loss": 0.5665, + "step": 3133 + }, + { + "epoch": 0.5571555555555555, + "grad_norm": 0.2980351150035858, + "learning_rate": 2.0548835175028647e-05, + "loss": 0.5857, + "step": 3134 + }, + { + "epoch": 0.5573333333333333, + "grad_norm": 0.2784363031387329, + "learning_rate": 2.0535096330104804e-05, + "loss": 0.5658, + "step": 3135 + }, + { + "epoch": 0.5575111111111111, + "grad_norm": 0.25529050827026367, + "learning_rate": 2.0521358877910444e-05, + "loss": 0.3909, + "step": 3136 + }, + { + "epoch": 0.5576888888888889, + "grad_norm": 0.2960292398929596, + "learning_rate": 2.0507622822730695e-05, + "loss": 0.508, + "step": 3137 + }, + { + "epoch": 0.5578666666666666, + "grad_norm": 0.264558881521225, + "learning_rate": 2.0493888168850188e-05, + "loss": 0.4703, + "step": 3138 + }, + { + "epoch": 0.5580444444444445, + "grad_norm": 0.307281494140625, + "learning_rate": 2.0480154920553186e-05, + "loss": 0.5105, + "step": 3139 + }, + { + "epoch": 0.5582222222222222, + "grad_norm": 0.2524307668209076, + "learning_rate": 2.0466423082123443e-05, + "loss": 0.5532, + "step": 3140 + }, + { + "epoch": 0.5584, + "grad_norm": 0.2923561930656433, + "learning_rate": 2.0452692657844333e-05, + "loss": 0.5471, + "step": 3141 + }, + { + "epoch": 0.5585777777777777, + "grad_norm": 0.25408312678337097, + "learning_rate": 2.0438963651998747e-05, + "loss": 0.3081, + "step": 3142 + }, + { + "epoch": 0.5587555555555556, + "grad_norm": 0.26359203457832336, + "learning_rate": 2.042523606886916e-05, + "loss": 0.3909, + "step": 3143 + }, + { + "epoch": 0.5589333333333333, + "grad_norm": 0.26516661047935486, + "learning_rate": 2.041150991273758e-05, + "loss": 0.3804, + "step": 3144 + }, + { + "epoch": 0.5591111111111111, + "grad_norm": 0.3086276948451996, + "learning_rate": 2.0397785187885598e-05, + "loss": 0.4069, + "step": 3145 + }, + { + "epoch": 0.5592888888888888, + "grad_norm": 0.422495037317276, + "learning_rate": 2.038406189859433e-05, + "loss": 0.3952, + "step": 3146 + }, + { + "epoch": 0.5594666666666667, + "grad_norm": 0.4331241548061371, + "learning_rate": 2.037034004914447e-05, + "loss": 0.4695, + "step": 3147 + }, + { + "epoch": 0.5596444444444445, + "grad_norm": 0.4339928925037384, + "learning_rate": 2.0356619643816234e-05, + "loss": 0.4834, + "step": 3148 + }, + { + "epoch": 0.5598222222222222, + "grad_norm": 0.5655353665351868, + "learning_rate": 2.034290068688941e-05, + "loss": 0.3832, + "step": 3149 + }, + { + "epoch": 0.56, + "grad_norm": 0.5897719264030457, + "learning_rate": 2.032918318264334e-05, + "loss": 0.3625, + "step": 3150 + }, + { + "epoch": 0.5601777777777778, + "grad_norm": 0.2616625130176544, + "learning_rate": 2.031546713535688e-05, + "loss": 0.4186, + "step": 3151 + }, + { + "epoch": 0.5603555555555556, + "grad_norm": 0.2676461637020111, + "learning_rate": 2.030175254930848e-05, + "loss": 0.5266, + "step": 3152 + }, + { + "epoch": 0.5605333333333333, + "grad_norm": 0.2639397382736206, + "learning_rate": 2.0288039428776073e-05, + "loss": 0.3292, + "step": 3153 + }, + { + "epoch": 0.5607111111111112, + "grad_norm": 0.3112473487854004, + "learning_rate": 2.02743277780372e-05, + "loss": 0.5792, + "step": 3154 + }, + { + "epoch": 0.5608888888888889, + "grad_norm": 0.28444188833236694, + "learning_rate": 2.0260617601368886e-05, + "loss": 0.5071, + "step": 3155 + }, + { + "epoch": 0.5610666666666667, + "grad_norm": 0.23163720965385437, + "learning_rate": 2.024690890304775e-05, + "loss": 0.298, + "step": 3156 + }, + { + "epoch": 0.5612444444444444, + "grad_norm": 0.2724601626396179, + "learning_rate": 2.0233201687349887e-05, + "loss": 0.5744, + "step": 3157 + }, + { + "epoch": 0.5614222222222223, + "grad_norm": 0.29206791520118713, + "learning_rate": 2.0219495958550992e-05, + "loss": 0.5043, + "step": 3158 + }, + { + "epoch": 0.5616, + "grad_norm": 0.28387364745140076, + "learning_rate": 2.020579172092626e-05, + "loss": 0.5397, + "step": 3159 + }, + { + "epoch": 0.5617777777777778, + "grad_norm": 0.30038413405418396, + "learning_rate": 2.0192088978750433e-05, + "loss": 0.5945, + "step": 3160 + }, + { + "epoch": 0.5619555555555555, + "grad_norm": 0.2841411530971527, + "learning_rate": 2.0178387736297773e-05, + "loss": 0.4562, + "step": 3161 + }, + { + "epoch": 0.5621333333333334, + "grad_norm": 0.29041579365730286, + "learning_rate": 2.0164687997842096e-05, + "loss": 0.6363, + "step": 3162 + }, + { + "epoch": 0.5623111111111111, + "grad_norm": 0.21311260759830475, + "learning_rate": 2.0150989767656728e-05, + "loss": 0.4831, + "step": 3163 + }, + { + "epoch": 0.5624888888888889, + "grad_norm": 0.2630387246608734, + "learning_rate": 2.013729305001454e-05, + "loss": 0.548, + "step": 3164 + }, + { + "epoch": 0.5626666666666666, + "grad_norm": 0.28595903515815735, + "learning_rate": 2.012359784918792e-05, + "loss": 0.4274, + "step": 3165 + }, + { + "epoch": 0.5628444444444445, + "grad_norm": 0.2719959020614624, + "learning_rate": 2.01099041694488e-05, + "loss": 0.5296, + "step": 3166 + }, + { + "epoch": 0.5630222222222222, + "grad_norm": 0.23411796987056732, + "learning_rate": 2.0096212015068606e-05, + "loss": 0.4328, + "step": 3167 + }, + { + "epoch": 0.5632, + "grad_norm": 0.29815274477005005, + "learning_rate": 2.0082521390318322e-05, + "loss": 0.5192, + "step": 3168 + }, + { + "epoch": 0.5633777777777778, + "grad_norm": 0.2904717028141022, + "learning_rate": 2.0068832299468428e-05, + "loss": 0.5623, + "step": 3169 + }, + { + "epoch": 0.5635555555555556, + "grad_norm": 0.38077226281166077, + "learning_rate": 2.0055144746788957e-05, + "loss": 0.551, + "step": 3170 + }, + { + "epoch": 0.5637333333333333, + "grad_norm": 0.25648513436317444, + "learning_rate": 2.004145873654942e-05, + "loss": 0.5084, + "step": 3171 + }, + { + "epoch": 0.5639111111111111, + "grad_norm": 0.28845271468162537, + "learning_rate": 2.0027774273018892e-05, + "loss": 0.5028, + "step": 3172 + }, + { + "epoch": 0.5640888888888889, + "grad_norm": 0.2336028665304184, + "learning_rate": 2.0014091360465927e-05, + "loss": 0.449, + "step": 3173 + }, + { + "epoch": 0.5642666666666667, + "grad_norm": 0.26510196924209595, + "learning_rate": 2.000041000315862e-05, + "loss": 0.4149, + "step": 3174 + }, + { + "epoch": 0.5644444444444444, + "grad_norm": 0.2599692642688751, + "learning_rate": 1.998673020536456e-05, + "loss": 0.4764, + "step": 3175 + }, + { + "epoch": 0.5646222222222222, + "grad_norm": 0.22014567255973816, + "learning_rate": 1.9973051971350888e-05, + "loss": 0.4654, + "step": 3176 + }, + { + "epoch": 0.5648, + "grad_norm": 0.27445879578590393, + "learning_rate": 1.9959375305384203e-05, + "loss": 0.55, + "step": 3177 + }, + { + "epoch": 0.5649777777777778, + "grad_norm": 0.26876360177993774, + "learning_rate": 1.994570021173067e-05, + "loss": 0.5305, + "step": 3178 + }, + { + "epoch": 0.5651555555555555, + "grad_norm": 0.2770456373691559, + "learning_rate": 1.9932026694655907e-05, + "loss": 0.4367, + "step": 3179 + }, + { + "epoch": 0.5653333333333334, + "grad_norm": 0.28398647904396057, + "learning_rate": 1.99183547584251e-05, + "loss": 0.4745, + "step": 3180 + }, + { + "epoch": 0.5655111111111111, + "grad_norm": 0.29460814595222473, + "learning_rate": 1.9904684407302883e-05, + "loss": 0.4221, + "step": 3181 + }, + { + "epoch": 0.5656888888888889, + "grad_norm": 0.27162420749664307, + "learning_rate": 1.989101564555345e-05, + "loss": 0.5035, + "step": 3182 + }, + { + "epoch": 0.5658666666666666, + "grad_norm": 0.29304832220077515, + "learning_rate": 1.9877348477440456e-05, + "loss": 0.3807, + "step": 3183 + }, + { + "epoch": 0.5660444444444445, + "grad_norm": 0.254295289516449, + "learning_rate": 1.9863682907227088e-05, + "loss": 0.3795, + "step": 3184 + }, + { + "epoch": 0.5662222222222222, + "grad_norm": 0.287888765335083, + "learning_rate": 1.9850018939176014e-05, + "loss": 0.4591, + "step": 3185 + }, + { + "epoch": 0.5664, + "grad_norm": 0.2829858362674713, + "learning_rate": 1.983635657754942e-05, + "loss": 0.406, + "step": 3186 + }, + { + "epoch": 0.5665777777777777, + "grad_norm": 0.310127317905426, + "learning_rate": 1.9822695826608972e-05, + "loss": 0.5422, + "step": 3187 + }, + { + "epoch": 0.5667555555555556, + "grad_norm": 0.27587437629699707, + "learning_rate": 1.9809036690615853e-05, + "loss": 0.363, + "step": 3188 + }, + { + "epoch": 0.5669333333333333, + "grad_norm": 0.2756005525588989, + "learning_rate": 1.979537917383073e-05, + "loss": 0.3461, + "step": 3189 + }, + { + "epoch": 0.5671111111111111, + "grad_norm": 0.2738426923751831, + "learning_rate": 1.9781723280513768e-05, + "loss": 0.375, + "step": 3190 + }, + { + "epoch": 0.5672888888888888, + "grad_norm": 0.30873578786849976, + "learning_rate": 1.9768069014924622e-05, + "loss": 0.4331, + "step": 3191 + }, + { + "epoch": 0.5674666666666667, + "grad_norm": 0.33168864250183105, + "learning_rate": 1.9754416381322455e-05, + "loss": 0.3794, + "step": 3192 + }, + { + "epoch": 0.5676444444444444, + "grad_norm": 0.3501400351524353, + "learning_rate": 1.9740765383965893e-05, + "loss": 0.3936, + "step": 3193 + }, + { + "epoch": 0.5678222222222222, + "grad_norm": 0.3080213963985443, + "learning_rate": 1.9727116027113077e-05, + "loss": 0.3822, + "step": 3194 + }, + { + "epoch": 0.568, + "grad_norm": 0.392995685338974, + "learning_rate": 1.9713468315021622e-05, + "loss": 0.3613, + "step": 3195 + }, + { + "epoch": 0.5681777777777778, + "grad_norm": 0.4086252748966217, + "learning_rate": 1.969982225194864e-05, + "loss": 0.3874, + "step": 3196 + }, + { + "epoch": 0.5683555555555555, + "grad_norm": 0.5959545373916626, + "learning_rate": 1.9686177842150715e-05, + "loss": 0.4446, + "step": 3197 + }, + { + "epoch": 0.5685333333333333, + "grad_norm": 0.36379534006118774, + "learning_rate": 1.967253508988394e-05, + "loss": 0.4038, + "step": 3198 + }, + { + "epoch": 0.5687111111111111, + "grad_norm": 0.42305928468704224, + "learning_rate": 1.9658893999403847e-05, + "loss": 0.4209, + "step": 3199 + }, + { + "epoch": 0.5688888888888889, + "grad_norm": 0.48341116309165955, + "learning_rate": 1.964525457496551e-05, + "loss": 0.4932, + "step": 3200 + }, + { + "epoch": 0.5690666666666667, + "grad_norm": 0.313327431678772, + "learning_rate": 1.963161682082342e-05, + "loss": 0.5217, + "step": 3201 + }, + { + "epoch": 0.5692444444444444, + "grad_norm": 0.2682130038738251, + "learning_rate": 1.96179807412316e-05, + "loss": 0.416, + "step": 3202 + }, + { + "epoch": 0.5694222222222223, + "grad_norm": 0.23071040213108063, + "learning_rate": 1.9604346340443518e-05, + "loss": 0.399, + "step": 3203 + }, + { + "epoch": 0.5696, + "grad_norm": 0.24082015454769135, + "learning_rate": 1.9590713622712132e-05, + "loss": 0.3499, + "step": 3204 + }, + { + "epoch": 0.5697777777777778, + "grad_norm": 0.28907546401023865, + "learning_rate": 1.957708259228987e-05, + "loss": 0.4411, + "step": 3205 + }, + { + "epoch": 0.5699555555555555, + "grad_norm": 0.26845699548721313, + "learning_rate": 1.956345325342863e-05, + "loss": 0.4431, + "step": 3206 + }, + { + "epoch": 0.5701333333333334, + "grad_norm": 0.2563682794570923, + "learning_rate": 1.95498256103798e-05, + "loss": 0.4196, + "step": 3207 + }, + { + "epoch": 0.5703111111111111, + "grad_norm": 0.23393134772777557, + "learning_rate": 1.9536199667394215e-05, + "loss": 0.4343, + "step": 3208 + }, + { + "epoch": 0.5704888888888889, + "grad_norm": 0.33118677139282227, + "learning_rate": 1.95225754287222e-05, + "loss": 0.4387, + "step": 3209 + }, + { + "epoch": 0.5706666666666667, + "grad_norm": 0.2317488044500351, + "learning_rate": 1.9508952898613528e-05, + "loss": 0.3241, + "step": 3210 + }, + { + "epoch": 0.5708444444444445, + "grad_norm": 0.2599552273750305, + "learning_rate": 1.9495332081317464e-05, + "loss": 0.4308, + "step": 3211 + }, + { + "epoch": 0.5710222222222222, + "grad_norm": 0.27021849155426025, + "learning_rate": 1.9481712981082714e-05, + "loss": 0.384, + "step": 3212 + }, + { + "epoch": 0.5712, + "grad_norm": 0.24949754774570465, + "learning_rate": 1.946809560215747e-05, + "loss": 0.4218, + "step": 3213 + }, + { + "epoch": 0.5713777777777778, + "grad_norm": 0.2137870341539383, + "learning_rate": 1.945447994878937e-05, + "loss": 0.4495, + "step": 3214 + }, + { + "epoch": 0.5715555555555556, + "grad_norm": 0.2736102044582367, + "learning_rate": 1.9440866025225525e-05, + "loss": 0.541, + "step": 3215 + }, + { + "epoch": 0.5717333333333333, + "grad_norm": 0.3255680799484253, + "learning_rate": 1.942725383571249e-05, + "loss": 0.4685, + "step": 3216 + }, + { + "epoch": 0.5719111111111111, + "grad_norm": 0.24515753984451294, + "learning_rate": 1.9413643384496316e-05, + "loss": 0.5382, + "step": 3217 + }, + { + "epoch": 0.5720888888888889, + "grad_norm": 0.2996431291103363, + "learning_rate": 1.9400034675822452e-05, + "loss": 0.4599, + "step": 3218 + }, + { + "epoch": 0.5722666666666667, + "grad_norm": 0.2762985825538635, + "learning_rate": 1.938642771393588e-05, + "loss": 0.5659, + "step": 3219 + }, + { + "epoch": 0.5724444444444444, + "grad_norm": 0.2766968011856079, + "learning_rate": 1.9372822503080957e-05, + "loss": 0.4789, + "step": 3220 + }, + { + "epoch": 0.5726222222222223, + "grad_norm": 0.2951643764972687, + "learning_rate": 1.9359219047501565e-05, + "loss": 0.5353, + "step": 3221 + }, + { + "epoch": 0.5728, + "grad_norm": 0.3069530427455902, + "learning_rate": 1.9345617351440973e-05, + "loss": 0.4943, + "step": 3222 + }, + { + "epoch": 0.5729777777777778, + "grad_norm": 0.2672772705554962, + "learning_rate": 1.9332017419141962e-05, + "loss": 0.4687, + "step": 3223 + }, + { + "epoch": 0.5731555555555555, + "grad_norm": 0.2736593782901764, + "learning_rate": 1.9318419254846718e-05, + "loss": 0.4357, + "step": 3224 + }, + { + "epoch": 0.5733333333333334, + "grad_norm": 0.29173022508621216, + "learning_rate": 1.9304822862796903e-05, + "loss": 0.5716, + "step": 3225 + }, + { + "epoch": 0.5735111111111111, + "grad_norm": 0.310024231672287, + "learning_rate": 1.9291228247233605e-05, + "loss": 0.6019, + "step": 3226 + }, + { + "epoch": 0.5736888888888889, + "grad_norm": 0.23740927875041962, + "learning_rate": 1.9277635412397383e-05, + "loss": 0.4527, + "step": 3227 + }, + { + "epoch": 0.5738666666666666, + "grad_norm": 0.25164806842803955, + "learning_rate": 1.926404436252821e-05, + "loss": 0.4136, + "step": 3228 + }, + { + "epoch": 0.5740444444444445, + "grad_norm": 0.2700943052768707, + "learning_rate": 1.9250455101865526e-05, + "loss": 0.4104, + "step": 3229 + }, + { + "epoch": 0.5742222222222222, + "grad_norm": 0.27772271633148193, + "learning_rate": 1.92368676346482e-05, + "loss": 0.4461, + "step": 3230 + }, + { + "epoch": 0.5744, + "grad_norm": 0.2517658770084381, + "learning_rate": 1.922328196511456e-05, + "loss": 0.4594, + "step": 3231 + }, + { + "epoch": 0.5745777777777777, + "grad_norm": 0.34047532081604004, + "learning_rate": 1.920969809750234e-05, + "loss": 0.4789, + "step": 3232 + }, + { + "epoch": 0.5747555555555556, + "grad_norm": 0.27112624049186707, + "learning_rate": 1.919611603604875e-05, + "loss": 0.268, + "step": 3233 + }, + { + "epoch": 0.5749333333333333, + "grad_norm": 0.26572734117507935, + "learning_rate": 1.9182535784990403e-05, + "loss": 0.4496, + "step": 3234 + }, + { + "epoch": 0.5751111111111111, + "grad_norm": 0.2651844918727875, + "learning_rate": 1.916895734856338e-05, + "loss": 0.5723, + "step": 3235 + }, + { + "epoch": 0.5752888888888888, + "grad_norm": 0.2680882215499878, + "learning_rate": 1.915538073100316e-05, + "loss": 0.5624, + "step": 3236 + }, + { + "epoch": 0.5754666666666667, + "grad_norm": 0.271443635225296, + "learning_rate": 1.914180593654469e-05, + "loss": 0.6168, + "step": 3237 + }, + { + "epoch": 0.5756444444444444, + "grad_norm": 0.31552666425704956, + "learning_rate": 1.9128232969422315e-05, + "loss": 0.4564, + "step": 3238 + }, + { + "epoch": 0.5758222222222222, + "grad_norm": 0.28059840202331543, + "learning_rate": 1.9114661833869847e-05, + "loss": 0.4679, + "step": 3239 + }, + { + "epoch": 0.576, + "grad_norm": 0.26936236023902893, + "learning_rate": 1.9101092534120478e-05, + "loss": 0.481, + "step": 3240 + }, + { + "epoch": 0.5761777777777778, + "grad_norm": 0.3736543357372284, + "learning_rate": 1.908752507440689e-05, + "loss": 0.5709, + "step": 3241 + }, + { + "epoch": 0.5763555555555555, + "grad_norm": 0.3158590495586395, + "learning_rate": 1.9073959458961125e-05, + "loss": 0.4265, + "step": 3242 + }, + { + "epoch": 0.5765333333333333, + "grad_norm": 0.2607210576534271, + "learning_rate": 1.9060395692014708e-05, + "loss": 0.3774, + "step": 3243 + }, + { + "epoch": 0.5767111111111111, + "grad_norm": 0.40169498324394226, + "learning_rate": 1.9046833777798533e-05, + "loss": 0.4664, + "step": 3244 + }, + { + "epoch": 0.5768888888888889, + "grad_norm": 0.3821481466293335, + "learning_rate": 1.9033273720542975e-05, + "loss": 0.4236, + "step": 3245 + }, + { + "epoch": 0.5770666666666666, + "grad_norm": 0.46172308921813965, + "learning_rate": 1.9019715524477767e-05, + "loss": 0.4291, + "step": 3246 + }, + { + "epoch": 0.5772444444444444, + "grad_norm": 0.4354674816131592, + "learning_rate": 1.9006159193832125e-05, + "loss": 0.4032, + "step": 3247 + }, + { + "epoch": 0.5774222222222222, + "grad_norm": 0.4149959683418274, + "learning_rate": 1.8992604732834623e-05, + "loss": 0.3827, + "step": 3248 + }, + { + "epoch": 0.5776, + "grad_norm": 0.3861846923828125, + "learning_rate": 1.89790521457133e-05, + "loss": 0.363, + "step": 3249 + }, + { + "epoch": 0.5777777777777777, + "grad_norm": 0.4809871315956116, + "learning_rate": 1.8965501436695577e-05, + "loss": 0.4338, + "step": 3250 + }, + { + "epoch": 0.5779555555555556, + "grad_norm": 0.2961549460887909, + "learning_rate": 1.895195261000831e-05, + "loss": 0.529, + "step": 3251 + }, + { + "epoch": 0.5781333333333334, + "grad_norm": 0.34819185733795166, + "learning_rate": 1.893840566987776e-05, + "loss": 0.6855, + "step": 3252 + }, + { + "epoch": 0.5783111111111111, + "grad_norm": 0.22617462277412415, + "learning_rate": 1.8924860620529594e-05, + "loss": 0.3904, + "step": 3253 + }, + { + "epoch": 0.5784888888888889, + "grad_norm": 0.3230564594268799, + "learning_rate": 1.891131746618891e-05, + "loss": 0.4631, + "step": 3254 + }, + { + "epoch": 0.5786666666666667, + "grad_norm": 0.31536340713500977, + "learning_rate": 1.8897776211080182e-05, + "loss": 0.4743, + "step": 3255 + }, + { + "epoch": 0.5788444444444445, + "grad_norm": 0.26243850588798523, + "learning_rate": 1.888423685942732e-05, + "loss": 0.5369, + "step": 3256 + }, + { + "epoch": 0.5790222222222222, + "grad_norm": 0.2524162828922272, + "learning_rate": 1.8870699415453627e-05, + "loss": 0.4042, + "step": 3257 + }, + { + "epoch": 0.5792, + "grad_norm": 0.26767271757125854, + "learning_rate": 1.885716388338182e-05, + "loss": 0.3712, + "step": 3258 + }, + { + "epoch": 0.5793777777777778, + "grad_norm": 0.22887256741523743, + "learning_rate": 1.8843630267434e-05, + "loss": 0.4499, + "step": 3259 + }, + { + "epoch": 0.5795555555555556, + "grad_norm": 0.2639103829860687, + "learning_rate": 1.8830098571831705e-05, + "loss": 0.5461, + "step": 3260 + }, + { + "epoch": 0.5797333333333333, + "grad_norm": 0.36999112367630005, + "learning_rate": 1.8816568800795822e-05, + "loss": 0.6832, + "step": 3261 + }, + { + "epoch": 0.5799111111111112, + "grad_norm": 0.33179110288619995, + "learning_rate": 1.8803040958546707e-05, + "loss": 0.5373, + "step": 3262 + }, + { + "epoch": 0.5800888888888889, + "grad_norm": 0.2644979953765869, + "learning_rate": 1.8789515049304038e-05, + "loss": 0.4129, + "step": 3263 + }, + { + "epoch": 0.5802666666666667, + "grad_norm": 0.3102962076663971, + "learning_rate": 1.8775991077286965e-05, + "loss": 0.4563, + "step": 3264 + }, + { + "epoch": 0.5804444444444444, + "grad_norm": 0.3044317364692688, + "learning_rate": 1.8762469046713956e-05, + "loss": 0.5271, + "step": 3265 + }, + { + "epoch": 0.5806222222222223, + "grad_norm": 0.3022008538246155, + "learning_rate": 1.8748948961802948e-05, + "loss": 0.4192, + "step": 3266 + }, + { + "epoch": 0.5808, + "grad_norm": 0.24398674070835114, + "learning_rate": 1.873543082677122e-05, + "loss": 0.5766, + "step": 3267 + }, + { + "epoch": 0.5809777777777778, + "grad_norm": 0.30086076259613037, + "learning_rate": 1.872191464583547e-05, + "loss": 0.5652, + "step": 3268 + }, + { + "epoch": 0.5811555555555555, + "grad_norm": 0.23053579032421112, + "learning_rate": 1.8708400423211764e-05, + "loss": 0.3151, + "step": 3269 + }, + { + "epoch": 0.5813333333333334, + "grad_norm": 0.26834923028945923, + "learning_rate": 1.869488816311558e-05, + "loss": 0.5256, + "step": 3270 + }, + { + "epoch": 0.5815111111111111, + "grad_norm": 0.32117733359336853, + "learning_rate": 1.868137786976177e-05, + "loss": 0.6316, + "step": 3271 + }, + { + "epoch": 0.5816888888888889, + "grad_norm": 0.2941349744796753, + "learning_rate": 1.8667869547364576e-05, + "loss": 0.488, + "step": 3272 + }, + { + "epoch": 0.5818666666666666, + "grad_norm": 0.280171275138855, + "learning_rate": 1.865436320013762e-05, + "loss": 0.5877, + "step": 3273 + }, + { + "epoch": 0.5820444444444445, + "grad_norm": 0.309059202671051, + "learning_rate": 1.864085883229392e-05, + "loss": 0.494, + "step": 3274 + }, + { + "epoch": 0.5822222222222222, + "grad_norm": 0.33706822991371155, + "learning_rate": 1.8627356448045867e-05, + "loss": 0.7136, + "step": 3275 + }, + { + "epoch": 0.5824, + "grad_norm": 0.2701260447502136, + "learning_rate": 1.8613856051605243e-05, + "loss": 0.4157, + "step": 3276 + }, + { + "epoch": 0.5825777777777777, + "grad_norm": 0.27974867820739746, + "learning_rate": 1.8600357647183185e-05, + "loss": 0.4594, + "step": 3277 + }, + { + "epoch": 0.5827555555555556, + "grad_norm": 0.26949864625930786, + "learning_rate": 1.8586861238990244e-05, + "loss": 0.4197, + "step": 3278 + }, + { + "epoch": 0.5829333333333333, + "grad_norm": 0.25465089082717896, + "learning_rate": 1.8573366831236323e-05, + "loss": 0.4227, + "step": 3279 + }, + { + "epoch": 0.5831111111111111, + "grad_norm": 0.2902585566043854, + "learning_rate": 1.8559874428130706e-05, + "loss": 0.5035, + "step": 3280 + }, + { + "epoch": 0.5832888888888889, + "grad_norm": 0.32112398743629456, + "learning_rate": 1.8546384033882062e-05, + "loss": 0.4503, + "step": 3281 + }, + { + "epoch": 0.5834666666666667, + "grad_norm": 0.2516997754573822, + "learning_rate": 1.8532895652698422e-05, + "loss": 0.4879, + "step": 3282 + }, + { + "epoch": 0.5836444444444444, + "grad_norm": 0.254833459854126, + "learning_rate": 1.851940928878718e-05, + "loss": 0.4178, + "step": 3283 + }, + { + "epoch": 0.5838222222222222, + "grad_norm": 0.27882134914398193, + "learning_rate": 1.8505924946355147e-05, + "loss": 0.4794, + "step": 3284 + }, + { + "epoch": 0.584, + "grad_norm": 0.23359514772891998, + "learning_rate": 1.8492442629608434e-05, + "loss": 0.3881, + "step": 3285 + }, + { + "epoch": 0.5841777777777778, + "grad_norm": 0.3188570737838745, + "learning_rate": 1.8478962342752583e-05, + "loss": 0.4885, + "step": 3286 + }, + { + "epoch": 0.5843555555555555, + "grad_norm": 0.33785542845726013, + "learning_rate": 1.846548408999245e-05, + "loss": 0.4524, + "step": 3287 + }, + { + "epoch": 0.5845333333333333, + "grad_norm": 0.26515066623687744, + "learning_rate": 1.8452007875532317e-05, + "loss": 0.4779, + "step": 3288 + }, + { + "epoch": 0.5847111111111111, + "grad_norm": 0.24841231107711792, + "learning_rate": 1.8438533703575754e-05, + "loss": 0.5127, + "step": 3289 + }, + { + "epoch": 0.5848888888888889, + "grad_norm": 0.2695436477661133, + "learning_rate": 1.8425061578325772e-05, + "loss": 0.62, + "step": 3290 + }, + { + "epoch": 0.5850666666666666, + "grad_norm": 0.26782360672950745, + "learning_rate": 1.841159150398469e-05, + "loss": 0.4024, + "step": 3291 + }, + { + "epoch": 0.5852444444444445, + "grad_norm": 0.2684522271156311, + "learning_rate": 1.8398123484754203e-05, + "loss": 0.3829, + "step": 3292 + }, + { + "epoch": 0.5854222222222222, + "grad_norm": 0.3152904510498047, + "learning_rate": 1.8384657524835376e-05, + "loss": 0.4779, + "step": 3293 + }, + { + "epoch": 0.5856, + "grad_norm": 0.3470858037471771, + "learning_rate": 1.8371193628428613e-05, + "loss": 0.326, + "step": 3294 + }, + { + "epoch": 0.5857777777777777, + "grad_norm": 0.3335106372833252, + "learning_rate": 1.8357731799733686e-05, + "loss": 0.3908, + "step": 3295 + }, + { + "epoch": 0.5859555555555556, + "grad_norm": 0.3976891040802002, + "learning_rate": 1.8344272042949724e-05, + "loss": 0.3756, + "step": 3296 + }, + { + "epoch": 0.5861333333333333, + "grad_norm": 0.39689165353775024, + "learning_rate": 1.8330814362275198e-05, + "loss": 0.4526, + "step": 3297 + }, + { + "epoch": 0.5863111111111111, + "grad_norm": 0.5928948521614075, + "learning_rate": 1.8317358761907942e-05, + "loss": 0.3892, + "step": 3298 + }, + { + "epoch": 0.5864888888888888, + "grad_norm": 0.4676694869995117, + "learning_rate": 1.8303905246045138e-05, + "loss": 0.3854, + "step": 3299 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 0.46652740240097046, + "learning_rate": 1.829045381888332e-05, + "loss": 0.4078, + "step": 3300 + }, + { + "epoch": 0.5868444444444444, + "grad_norm": 0.2485920488834381, + "learning_rate": 1.827700448461836e-05, + "loss": 0.6167, + "step": 3301 + }, + { + "epoch": 0.5870222222222222, + "grad_norm": 0.27780160307884216, + "learning_rate": 1.8263557247445495e-05, + "loss": 0.61, + "step": 3302 + }, + { + "epoch": 0.5872, + "grad_norm": 0.24181243777275085, + "learning_rate": 1.8250112111559287e-05, + "loss": 0.4182, + "step": 3303 + }, + { + "epoch": 0.5873777777777778, + "grad_norm": 0.3064706027507782, + "learning_rate": 1.8236669081153657e-05, + "loss": 0.446, + "step": 3304 + }, + { + "epoch": 0.5875555555555556, + "grad_norm": 0.29117628931999207, + "learning_rate": 1.822322816042188e-05, + "loss": 0.4217, + "step": 3305 + }, + { + "epoch": 0.5877333333333333, + "grad_norm": 0.2641940116882324, + "learning_rate": 1.820978935355653e-05, + "loss": 0.6919, + "step": 3306 + }, + { + "epoch": 0.5879111111111112, + "grad_norm": 0.2919057309627533, + "learning_rate": 1.8196352664749576e-05, + "loss": 0.4032, + "step": 3307 + }, + { + "epoch": 0.5880888888888889, + "grad_norm": 0.3009297251701355, + "learning_rate": 1.818291809819229e-05, + "loss": 0.5362, + "step": 3308 + }, + { + "epoch": 0.5882666666666667, + "grad_norm": 0.2529834508895874, + "learning_rate": 1.8169485658075298e-05, + "loss": 0.4297, + "step": 3309 + }, + { + "epoch": 0.5884444444444444, + "grad_norm": 0.5418556928634644, + "learning_rate": 1.8156055348588546e-05, + "loss": 0.2894, + "step": 3310 + }, + { + "epoch": 0.5886222222222223, + "grad_norm": 0.2599745988845825, + "learning_rate": 1.8142627173921338e-05, + "loss": 0.587, + "step": 3311 + }, + { + "epoch": 0.5888, + "grad_norm": 0.22423335909843445, + "learning_rate": 1.812920113826229e-05, + "loss": 0.3499, + "step": 3312 + }, + { + "epoch": 0.5889777777777778, + "grad_norm": 0.22998203337192535, + "learning_rate": 1.811577724579938e-05, + "loss": 0.4789, + "step": 3313 + }, + { + "epoch": 0.5891555555555555, + "grad_norm": 0.2615731358528137, + "learning_rate": 1.8102355500719876e-05, + "loss": 0.3135, + "step": 3314 + }, + { + "epoch": 0.5893333333333334, + "grad_norm": 0.32444679737091064, + "learning_rate": 1.8088935907210418e-05, + "loss": 0.6166, + "step": 3315 + }, + { + "epoch": 0.5895111111111111, + "grad_norm": 0.2940405309200287, + "learning_rate": 1.807551846945694e-05, + "loss": 0.391, + "step": 3316 + }, + { + "epoch": 0.5896888888888889, + "grad_norm": 0.3233529329299927, + "learning_rate": 1.8062103191644737e-05, + "loss": 0.6337, + "step": 3317 + }, + { + "epoch": 0.5898666666666667, + "grad_norm": 0.24845018982887268, + "learning_rate": 1.8048690077958397e-05, + "loss": 0.4523, + "step": 3318 + }, + { + "epoch": 0.5900444444444445, + "grad_norm": 0.293120801448822, + "learning_rate": 1.803527913258186e-05, + "loss": 0.5287, + "step": 3319 + }, + { + "epoch": 0.5902222222222222, + "grad_norm": 0.2731170654296875, + "learning_rate": 1.8021870359698368e-05, + "loss": 0.4846, + "step": 3320 + }, + { + "epoch": 0.5904, + "grad_norm": 0.2904965579509735, + "learning_rate": 1.800846376349051e-05, + "loss": 0.3864, + "step": 3321 + }, + { + "epoch": 0.5905777777777778, + "grad_norm": 0.26140472292900085, + "learning_rate": 1.7995059348140165e-05, + "loss": 0.4368, + "step": 3322 + }, + { + "epoch": 0.5907555555555556, + "grad_norm": 0.26421239972114563, + "learning_rate": 1.798165711782856e-05, + "loss": 0.5153, + "step": 3323 + }, + { + "epoch": 0.5909333333333333, + "grad_norm": 0.3236648142337799, + "learning_rate": 1.796825707673622e-05, + "loss": 0.5938, + "step": 3324 + }, + { + "epoch": 0.5911111111111111, + "grad_norm": 0.28799864649772644, + "learning_rate": 1.7954859229043016e-05, + "loss": 0.5055, + "step": 3325 + }, + { + "epoch": 0.5912888888888889, + "grad_norm": 0.3020201325416565, + "learning_rate": 1.7941463578928086e-05, + "loss": 0.4362, + "step": 3326 + }, + { + "epoch": 0.5914666666666667, + "grad_norm": 0.31696251034736633, + "learning_rate": 1.7928070130569942e-05, + "loss": 0.4696, + "step": 3327 + }, + { + "epoch": 0.5916444444444444, + "grad_norm": 0.29181909561157227, + "learning_rate": 1.7914678888146347e-05, + "loss": 0.5546, + "step": 3328 + }, + { + "epoch": 0.5918222222222222, + "grad_norm": 0.2454567849636078, + "learning_rate": 1.790128985583444e-05, + "loss": 0.3779, + "step": 3329 + }, + { + "epoch": 0.592, + "grad_norm": 0.2743252217769623, + "learning_rate": 1.788790303781061e-05, + "loss": 0.4384, + "step": 3330 + }, + { + "epoch": 0.5921777777777778, + "grad_norm": 0.2398066222667694, + "learning_rate": 1.7874518438250597e-05, + "loss": 0.3823, + "step": 3331 + }, + { + "epoch": 0.5923555555555555, + "grad_norm": 0.3603478968143463, + "learning_rate": 1.7861136061329437e-05, + "loss": 0.4138, + "step": 3332 + }, + { + "epoch": 0.5925333333333334, + "grad_norm": 0.266900897026062, + "learning_rate": 1.7847755911221466e-05, + "loss": 0.4228, + "step": 3333 + }, + { + "epoch": 0.5927111111111111, + "grad_norm": 0.23177871108055115, + "learning_rate": 1.7834377992100333e-05, + "loss": 0.3968, + "step": 3334 + }, + { + "epoch": 0.5928888888888889, + "grad_norm": 0.3170031011104584, + "learning_rate": 1.782100230813899e-05, + "loss": 0.5235, + "step": 3335 + }, + { + "epoch": 0.5930666666666666, + "grad_norm": 0.35680127143859863, + "learning_rate": 1.7807628863509685e-05, + "loss": 0.4267, + "step": 3336 + }, + { + "epoch": 0.5932444444444445, + "grad_norm": 0.24877925217151642, + "learning_rate": 1.779425766238398e-05, + "loss": 0.3773, + "step": 3337 + }, + { + "epoch": 0.5934222222222222, + "grad_norm": 0.2826848328113556, + "learning_rate": 1.7780888708932718e-05, + "loss": 0.4659, + "step": 3338 + }, + { + "epoch": 0.5936, + "grad_norm": 0.2551001310348511, + "learning_rate": 1.7767522007326072e-05, + "loss": 0.3371, + "step": 3339 + }, + { + "epoch": 0.5937777777777777, + "grad_norm": 0.27018412947654724, + "learning_rate": 1.7754157561733476e-05, + "loss": 0.4905, + "step": 3340 + }, + { + "epoch": 0.5939555555555556, + "grad_norm": 0.3840058445930481, + "learning_rate": 1.7740795376323692e-05, + "loss": 0.5398, + "step": 3341 + }, + { + "epoch": 0.5941333333333333, + "grad_norm": 0.32242876291275024, + "learning_rate": 1.7727435455264747e-05, + "loss": 0.4249, + "step": 3342 + }, + { + "epoch": 0.5943111111111111, + "grad_norm": 0.3178948760032654, + "learning_rate": 1.7714077802723994e-05, + "loss": 0.4335, + "step": 3343 + }, + { + "epoch": 0.5944888888888888, + "grad_norm": 0.35138529539108276, + "learning_rate": 1.7700722422868048e-05, + "loss": 0.406, + "step": 3344 + }, + { + "epoch": 0.5946666666666667, + "grad_norm": 0.32035595178604126, + "learning_rate": 1.768736931986284e-05, + "loss": 0.3984, + "step": 3345 + }, + { + "epoch": 0.5948444444444444, + "grad_norm": 0.38027626276016235, + "learning_rate": 1.767401849787357e-05, + "loss": 0.4398, + "step": 3346 + }, + { + "epoch": 0.5950222222222222, + "grad_norm": 0.40239015221595764, + "learning_rate": 1.7660669961064748e-05, + "loss": 0.3566, + "step": 3347 + }, + { + "epoch": 0.5952, + "grad_norm": 0.34841740131378174, + "learning_rate": 1.7647323713600138e-05, + "loss": 0.4433, + "step": 3348 + }, + { + "epoch": 0.5953777777777778, + "grad_norm": 0.43865370750427246, + "learning_rate": 1.7633979759642844e-05, + "loss": 0.4777, + "step": 3349 + }, + { + "epoch": 0.5955555555555555, + "grad_norm": 0.3721213638782501, + "learning_rate": 1.7620638103355186e-05, + "loss": 0.3404, + "step": 3350 + }, + { + "epoch": 0.5957333333333333, + "grad_norm": 0.22971749305725098, + "learning_rate": 1.7607298748898842e-05, + "loss": 0.4293, + "step": 3351 + }, + { + "epoch": 0.5959111111111111, + "grad_norm": 0.23924027383327484, + "learning_rate": 1.759396170043469e-05, + "loss": 0.3137, + "step": 3352 + }, + { + "epoch": 0.5960888888888889, + "grad_norm": 0.26949816942214966, + "learning_rate": 1.7580626962122977e-05, + "loss": 0.4874, + "step": 3353 + }, + { + "epoch": 0.5962666666666666, + "grad_norm": 0.26858681440353394, + "learning_rate": 1.7567294538123142e-05, + "loss": 0.4967, + "step": 3354 + }, + { + "epoch": 0.5964444444444444, + "grad_norm": 0.28054308891296387, + "learning_rate": 1.7553964432593976e-05, + "loss": 0.4975, + "step": 3355 + }, + { + "epoch": 0.5966222222222223, + "grad_norm": 0.2798413634300232, + "learning_rate": 1.7540636649693496e-05, + "loss": 0.5328, + "step": 3356 + }, + { + "epoch": 0.5968, + "grad_norm": 0.29521167278289795, + "learning_rate": 1.752731119357902e-05, + "loss": 0.4569, + "step": 3357 + }, + { + "epoch": 0.5969777777777778, + "grad_norm": 0.2734304368495941, + "learning_rate": 1.7513988068407146e-05, + "loss": 0.4849, + "step": 3358 + }, + { + "epoch": 0.5971555555555556, + "grad_norm": 0.2744982838630676, + "learning_rate": 1.750066727833371e-05, + "loss": 0.5084, + "step": 3359 + }, + { + "epoch": 0.5973333333333334, + "grad_norm": 0.27769947052001953, + "learning_rate": 1.748734882751386e-05, + "loss": 0.5392, + "step": 3360 + }, + { + "epoch": 0.5975111111111111, + "grad_norm": 0.27792733907699585, + "learning_rate": 1.747403272010199e-05, + "loss": 0.4623, + "step": 3361 + }, + { + "epoch": 0.5976888888888889, + "grad_norm": 0.2711454927921295, + "learning_rate": 1.7460718960251772e-05, + "loss": 0.537, + "step": 3362 + }, + { + "epoch": 0.5978666666666667, + "grad_norm": 0.2758307158946991, + "learning_rate": 1.744740755211614e-05, + "loss": 0.4949, + "step": 3363 + }, + { + "epoch": 0.5980444444444445, + "grad_norm": 0.22645916044712067, + "learning_rate": 1.7434098499847306e-05, + "loss": 0.4202, + "step": 3364 + }, + { + "epoch": 0.5982222222222222, + "grad_norm": 0.28519243001937866, + "learning_rate": 1.742079180759672e-05, + "loss": 0.5132, + "step": 3365 + }, + { + "epoch": 0.5984, + "grad_norm": 0.26003497838974, + "learning_rate": 1.7407487479515147e-05, + "loss": 0.4111, + "step": 3366 + }, + { + "epoch": 0.5985777777777778, + "grad_norm": 0.22969241440296173, + "learning_rate": 1.7394185519752545e-05, + "loss": 0.3695, + "step": 3367 + }, + { + "epoch": 0.5987555555555556, + "grad_norm": 0.30624037981033325, + "learning_rate": 1.7380885932458206e-05, + "loss": 0.5245, + "step": 3368 + }, + { + "epoch": 0.5989333333333333, + "grad_norm": 0.2629794478416443, + "learning_rate": 1.736758872178062e-05, + "loss": 0.4999, + "step": 3369 + }, + { + "epoch": 0.5991111111111111, + "grad_norm": 0.2558906078338623, + "learning_rate": 1.7354293891867582e-05, + "loss": 0.3923, + "step": 3370 + }, + { + "epoch": 0.5992888888888889, + "grad_norm": 0.23043213784694672, + "learning_rate": 1.73410014468661e-05, + "loss": 0.4277, + "step": 3371 + }, + { + "epoch": 0.5994666666666667, + "grad_norm": 0.25345873832702637, + "learning_rate": 1.7327711390922494e-05, + "loss": 0.51, + "step": 3372 + }, + { + "epoch": 0.5996444444444444, + "grad_norm": 0.25423872470855713, + "learning_rate": 1.7314423728182283e-05, + "loss": 0.3543, + "step": 3373 + }, + { + "epoch": 0.5998222222222223, + "grad_norm": 0.2843184471130371, + "learning_rate": 1.7301138462790278e-05, + "loss": 0.4308, + "step": 3374 + }, + { + "epoch": 0.6, + "grad_norm": 0.2699650228023529, + "learning_rate": 1.728785559889052e-05, + "loss": 0.3902, + "step": 3375 + }, + { + "epoch": 0.6001777777777778, + "grad_norm": 0.25496819615364075, + "learning_rate": 1.7274575140626318e-05, + "loss": 0.4081, + "step": 3376 + }, + { + "epoch": 0.6003555555555555, + "grad_norm": 0.24399985373020172, + "learning_rate": 1.7261297092140212e-05, + "loss": 0.4358, + "step": 3377 + }, + { + "epoch": 0.6005333333333334, + "grad_norm": 0.29444223642349243, + "learning_rate": 1.7248021457574004e-05, + "loss": 0.4722, + "step": 3378 + }, + { + "epoch": 0.6007111111111111, + "grad_norm": 0.2717171311378479, + "learning_rate": 1.7234748241068742e-05, + "loss": 0.5185, + "step": 3379 + }, + { + "epoch": 0.6008888888888889, + "grad_norm": 0.3238567113876343, + "learning_rate": 1.7221477446764717e-05, + "loss": 0.5799, + "step": 3380 + }, + { + "epoch": 0.6010666666666666, + "grad_norm": 0.27983537316322327, + "learning_rate": 1.7208209078801454e-05, + "loss": 0.5964, + "step": 3381 + }, + { + "epoch": 0.6012444444444445, + "grad_norm": 0.2766222059726715, + "learning_rate": 1.719494314131775e-05, + "loss": 0.4796, + "step": 3382 + }, + { + "epoch": 0.6014222222222222, + "grad_norm": 0.2575446665287018, + "learning_rate": 1.7181679638451603e-05, + "loss": 0.3926, + "step": 3383 + }, + { + "epoch": 0.6016, + "grad_norm": 0.22468270361423492, + "learning_rate": 1.7168418574340298e-05, + "loss": 0.3561, + "step": 3384 + }, + { + "epoch": 0.6017777777777777, + "grad_norm": 0.3665255308151245, + "learning_rate": 1.7155159953120313e-05, + "loss": 0.5803, + "step": 3385 + }, + { + "epoch": 0.6019555555555556, + "grad_norm": 0.2082727700471878, + "learning_rate": 1.7141903778927406e-05, + "loss": 0.3625, + "step": 3386 + }, + { + "epoch": 0.6021333333333333, + "grad_norm": 0.3333989679813385, + "learning_rate": 1.7128650055896535e-05, + "loss": 0.511, + "step": 3387 + }, + { + "epoch": 0.6023111111111111, + "grad_norm": 0.28337129950523376, + "learning_rate": 1.7115398788161925e-05, + "loss": 0.4778, + "step": 3388 + }, + { + "epoch": 0.6024888888888889, + "grad_norm": 0.24719880521297455, + "learning_rate": 1.7102149979857e-05, + "loss": 0.4349, + "step": 3389 + }, + { + "epoch": 0.6026666666666667, + "grad_norm": 0.33533400297164917, + "learning_rate": 1.708890363511447e-05, + "loss": 0.6158, + "step": 3390 + }, + { + "epoch": 0.6028444444444444, + "grad_norm": 0.24376456439495087, + "learning_rate": 1.7075659758066208e-05, + "loss": 0.3959, + "step": 3391 + }, + { + "epoch": 0.6030222222222222, + "grad_norm": 0.27658236026763916, + "learning_rate": 1.7062418352843382e-05, + "loss": 0.3872, + "step": 3392 + }, + { + "epoch": 0.6032, + "grad_norm": 0.3464023470878601, + "learning_rate": 1.7049179423576334e-05, + "loss": 0.4553, + "step": 3393 + }, + { + "epoch": 0.6033777777777778, + "grad_norm": 0.3411446511745453, + "learning_rate": 1.703594297439469e-05, + "loss": 0.3544, + "step": 3394 + }, + { + "epoch": 0.6035555555555555, + "grad_norm": 0.3437288701534271, + "learning_rate": 1.702270900942724e-05, + "loss": 0.4531, + "step": 3395 + }, + { + "epoch": 0.6037333333333333, + "grad_norm": 0.3372286260128021, + "learning_rate": 1.7009477532802054e-05, + "loss": 0.4785, + "step": 3396 + }, + { + "epoch": 0.6039111111111111, + "grad_norm": 0.3632700443267822, + "learning_rate": 1.6996248548646394e-05, + "loss": 0.3574, + "step": 3397 + }, + { + "epoch": 0.6040888888888889, + "grad_norm": 0.3662477731704712, + "learning_rate": 1.6983022061086763e-05, + "loss": 0.5135, + "step": 3398 + }, + { + "epoch": 0.6042666666666666, + "grad_norm": 0.3889213800430298, + "learning_rate": 1.6969798074248858e-05, + "loss": 0.4645, + "step": 3399 + }, + { + "epoch": 0.6044444444444445, + "grad_norm": 0.53945392370224, + "learning_rate": 1.6956576592257635e-05, + "loss": 0.4899, + "step": 3400 + }, + { + "epoch": 0.6046222222222222, + "grad_norm": 0.2613181471824646, + "learning_rate": 1.6943357619237226e-05, + "loss": 0.497, + "step": 3401 + }, + { + "epoch": 0.6048, + "grad_norm": 0.28174224495887756, + "learning_rate": 1.693014115931102e-05, + "loss": 0.5803, + "step": 3402 + }, + { + "epoch": 0.6049777777777777, + "grad_norm": 0.23731449246406555, + "learning_rate": 1.6916927216601593e-05, + "loss": 0.4588, + "step": 3403 + }, + { + "epoch": 0.6051555555555556, + "grad_norm": 0.2838279604911804, + "learning_rate": 1.690371579523075e-05, + "loss": 0.4984, + "step": 3404 + }, + { + "epoch": 0.6053333333333333, + "grad_norm": 0.3193090260028839, + "learning_rate": 1.689050689931951e-05, + "loss": 0.4364, + "step": 3405 + }, + { + "epoch": 0.6055111111111111, + "grad_norm": 0.3101356327533722, + "learning_rate": 1.6877300532988094e-05, + "loss": 0.5995, + "step": 3406 + }, + { + "epoch": 0.6056888888888889, + "grad_norm": 0.31551453471183777, + "learning_rate": 1.686409670035594e-05, + "loss": 0.5807, + "step": 3407 + }, + { + "epoch": 0.6058666666666667, + "grad_norm": 0.2801552414894104, + "learning_rate": 1.68508954055417e-05, + "loss": 0.4358, + "step": 3408 + }, + { + "epoch": 0.6060444444444445, + "grad_norm": 0.2849624752998352, + "learning_rate": 1.6837696652663242e-05, + "loss": 0.4398, + "step": 3409 + }, + { + "epoch": 0.6062222222222222, + "grad_norm": 0.30434650182724, + "learning_rate": 1.6824500445837606e-05, + "loss": 0.5634, + "step": 3410 + }, + { + "epoch": 0.6064, + "grad_norm": 0.21425320208072662, + "learning_rate": 1.681130678918108e-05, + "loss": 0.3609, + "step": 3411 + }, + { + "epoch": 0.6065777777777778, + "grad_norm": 0.2506297826766968, + "learning_rate": 1.6798115686809125e-05, + "loss": 0.4475, + "step": 3412 + }, + { + "epoch": 0.6067555555555556, + "grad_norm": 0.30223703384399414, + "learning_rate": 1.6784927142836436e-05, + "loss": 0.536, + "step": 3413 + }, + { + "epoch": 0.6069333333333333, + "grad_norm": 0.3431306481361389, + "learning_rate": 1.6771741161376862e-05, + "loss": 0.5672, + "step": 3414 + }, + { + "epoch": 0.6071111111111112, + "grad_norm": 0.28368932008743286, + "learning_rate": 1.6758557746543518e-05, + "loss": 0.4362, + "step": 3415 + }, + { + "epoch": 0.6072888888888889, + "grad_norm": 0.2901994585990906, + "learning_rate": 1.6745376902448656e-05, + "loss": 0.6525, + "step": 3416 + }, + { + "epoch": 0.6074666666666667, + "grad_norm": 0.3143194019794464, + "learning_rate": 1.6732198633203773e-05, + "loss": 0.574, + "step": 3417 + }, + { + "epoch": 0.6076444444444444, + "grad_norm": 0.2506396472454071, + "learning_rate": 1.6719022942919527e-05, + "loss": 0.4848, + "step": 3418 + }, + { + "epoch": 0.6078222222222223, + "grad_norm": 0.3265102803707123, + "learning_rate": 1.6705849835705802e-05, + "loss": 0.5064, + "step": 3419 + }, + { + "epoch": 0.608, + "grad_norm": 0.2830265164375305, + "learning_rate": 1.669267931567165e-05, + "loss": 0.4989, + "step": 3420 + }, + { + "epoch": 0.6081777777777778, + "grad_norm": 0.3493908941745758, + "learning_rate": 1.6679511386925337e-05, + "loss": 0.5749, + "step": 3421 + }, + { + "epoch": 0.6083555555555555, + "grad_norm": 0.26769155263900757, + "learning_rate": 1.66663460535743e-05, + "loss": 0.5115, + "step": 3422 + }, + { + "epoch": 0.6085333333333334, + "grad_norm": 0.3172113299369812, + "learning_rate": 1.6653183319725195e-05, + "loss": 0.5441, + "step": 3423 + }, + { + "epoch": 0.6087111111111111, + "grad_norm": 0.2942335307598114, + "learning_rate": 1.6640023189483835e-05, + "loss": 0.5083, + "step": 3424 + }, + { + "epoch": 0.6088888888888889, + "grad_norm": 0.3153296709060669, + "learning_rate": 1.662686566695525e-05, + "loss": 0.6587, + "step": 3425 + }, + { + "epoch": 0.6090666666666666, + "grad_norm": 0.24099671840667725, + "learning_rate": 1.6613710756243626e-05, + "loss": 0.5585, + "step": 3426 + }, + { + "epoch": 0.6092444444444445, + "grad_norm": 0.25383883714675903, + "learning_rate": 1.660055846145237e-05, + "loss": 0.5014, + "step": 3427 + }, + { + "epoch": 0.6094222222222222, + "grad_norm": 0.30049562454223633, + "learning_rate": 1.6587408786684033e-05, + "loss": 0.4325, + "step": 3428 + }, + { + "epoch": 0.6096, + "grad_norm": 0.32024946808815, + "learning_rate": 1.6574261736040387e-05, + "loss": 0.6308, + "step": 3429 + }, + { + "epoch": 0.6097777777777778, + "grad_norm": 0.22235243022441864, + "learning_rate": 1.656111731362236e-05, + "loss": 0.4144, + "step": 3430 + }, + { + "epoch": 0.6099555555555556, + "grad_norm": 0.38528186082839966, + "learning_rate": 1.6547975523530075e-05, + "loss": 0.6212, + "step": 3431 + }, + { + "epoch": 0.6101333333333333, + "grad_norm": 0.29109230637550354, + "learning_rate": 1.6534836369862806e-05, + "loss": 0.5424, + "step": 3432 + }, + { + "epoch": 0.6103111111111111, + "grad_norm": 0.25402164459228516, + "learning_rate": 1.6521699856719062e-05, + "loss": 0.4296, + "step": 3433 + }, + { + "epoch": 0.6104888888888889, + "grad_norm": 0.2694256603717804, + "learning_rate": 1.650856598819646e-05, + "loss": 0.4375, + "step": 3434 + }, + { + "epoch": 0.6106666666666667, + "grad_norm": 0.33077505230903625, + "learning_rate": 1.6495434768391855e-05, + "loss": 0.6915, + "step": 3435 + }, + { + "epoch": 0.6108444444444444, + "grad_norm": 0.2692342698574066, + "learning_rate": 1.648230620140121e-05, + "loss": 0.6162, + "step": 3436 + }, + { + "epoch": 0.6110222222222222, + "grad_norm": 0.28206002712249756, + "learning_rate": 1.6469180291319723e-05, + "loss": 0.4879, + "step": 3437 + }, + { + "epoch": 0.6112, + "grad_norm": 0.2316645085811615, + "learning_rate": 1.645605704224172e-05, + "loss": 0.38, + "step": 3438 + }, + { + "epoch": 0.6113777777777778, + "grad_norm": 0.3595908582210541, + "learning_rate": 1.644293645826072e-05, + "loss": 0.6005, + "step": 3439 + }, + { + "epoch": 0.6115555555555555, + "grad_norm": 0.2596064507961273, + "learning_rate": 1.6429818543469406e-05, + "loss": 0.4065, + "step": 3440 + }, + { + "epoch": 0.6117333333333334, + "grad_norm": 0.2699691951274872, + "learning_rate": 1.6416703301959622e-05, + "loss": 0.5314, + "step": 3441 + }, + { + "epoch": 0.6119111111111111, + "grad_norm": 0.30650588870048523, + "learning_rate": 1.6403590737822376e-05, + "loss": 0.4807, + "step": 3442 + }, + { + "epoch": 0.6120888888888889, + "grad_norm": 0.24779197573661804, + "learning_rate": 1.6390480855147862e-05, + "loss": 0.3649, + "step": 3443 + }, + { + "epoch": 0.6122666666666666, + "grad_norm": 0.26482099294662476, + "learning_rate": 1.637737365802541e-05, + "loss": 0.4287, + "step": 3444 + }, + { + "epoch": 0.6124444444444445, + "grad_norm": 0.35267460346221924, + "learning_rate": 1.6364269150543532e-05, + "loss": 0.3749, + "step": 3445 + }, + { + "epoch": 0.6126222222222222, + "grad_norm": 0.2792567014694214, + "learning_rate": 1.635116733678988e-05, + "loss": 0.355, + "step": 3446 + }, + { + "epoch": 0.6128, + "grad_norm": 0.31379714608192444, + "learning_rate": 1.6338068220851306e-05, + "loss": 0.3531, + "step": 3447 + }, + { + "epoch": 0.6129777777777777, + "grad_norm": 0.37978580594062805, + "learning_rate": 1.6324971806813767e-05, + "loss": 0.4066, + "step": 3448 + }, + { + "epoch": 0.6131555555555556, + "grad_norm": 0.37624213099479675, + "learning_rate": 1.6311878098762417e-05, + "loss": 0.3799, + "step": 3449 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 0.44553759694099426, + "learning_rate": 1.629878710078155e-05, + "loss": 0.4443, + "step": 3450 + }, + { + "epoch": 0.6135111111111111, + "grad_norm": 0.29710549116134644, + "learning_rate": 1.6285698816954624e-05, + "loss": 0.4636, + "step": 3451 + }, + { + "epoch": 0.6136888888888888, + "grad_norm": 0.2545873522758484, + "learning_rate": 1.6272613251364237e-05, + "loss": 0.5195, + "step": 3452 + }, + { + "epoch": 0.6138666666666667, + "grad_norm": 0.2200932502746582, + "learning_rate": 1.6259530408092154e-05, + "loss": 0.4408, + "step": 3453 + }, + { + "epoch": 0.6140444444444444, + "grad_norm": 0.2815747559070587, + "learning_rate": 1.6246450291219266e-05, + "loss": 0.4137, + "step": 3454 + }, + { + "epoch": 0.6142222222222222, + "grad_norm": 0.3393726944923401, + "learning_rate": 1.6233372904825656e-05, + "loss": 0.3547, + "step": 3455 + }, + { + "epoch": 0.6144, + "grad_norm": 0.24042043089866638, + "learning_rate": 1.6220298252990502e-05, + "loss": 0.4145, + "step": 3456 + }, + { + "epoch": 0.6145777777777778, + "grad_norm": 0.289188951253891, + "learning_rate": 1.620722633979219e-05, + "loss": 0.5269, + "step": 3457 + }, + { + "epoch": 0.6147555555555556, + "grad_norm": 0.3192743957042694, + "learning_rate": 1.6194157169308182e-05, + "loss": 0.5333, + "step": 3458 + }, + { + "epoch": 0.6149333333333333, + "grad_norm": 0.2537321448326111, + "learning_rate": 1.6181090745615147e-05, + "loss": 0.3614, + "step": 3459 + }, + { + "epoch": 0.6151111111111112, + "grad_norm": 0.2596038281917572, + "learning_rate": 1.6168027072788867e-05, + "loss": 0.3871, + "step": 3460 + }, + { + "epoch": 0.6152888888888889, + "grad_norm": 0.26865798234939575, + "learning_rate": 1.6154966154904265e-05, + "loss": 0.5413, + "step": 3461 + }, + { + "epoch": 0.6154666666666667, + "grad_norm": 0.2327553629875183, + "learning_rate": 1.6141907996035415e-05, + "loss": 0.4561, + "step": 3462 + }, + { + "epoch": 0.6156444444444444, + "grad_norm": 0.2254241704940796, + "learning_rate": 1.612885260025552e-05, + "loss": 0.5027, + "step": 3463 + }, + { + "epoch": 0.6158222222222223, + "grad_norm": 0.246908038854599, + "learning_rate": 1.611579997163693e-05, + "loss": 0.3706, + "step": 3464 + }, + { + "epoch": 0.616, + "grad_norm": 0.2608049809932709, + "learning_rate": 1.610275011425113e-05, + "loss": 0.3916, + "step": 3465 + }, + { + "epoch": 0.6161777777777778, + "grad_norm": 0.24750645458698273, + "learning_rate": 1.6089703032168733e-05, + "loss": 0.3673, + "step": 3466 + }, + { + "epoch": 0.6163555555555555, + "grad_norm": 0.3097095191478729, + "learning_rate": 1.60766587294595e-05, + "loss": 0.6258, + "step": 3467 + }, + { + "epoch": 0.6165333333333334, + "grad_norm": 0.27370399236679077, + "learning_rate": 1.606361721019231e-05, + "loss": 0.4784, + "step": 3468 + }, + { + "epoch": 0.6167111111111111, + "grad_norm": 0.2685700058937073, + "learning_rate": 1.605057847843518e-05, + "loss": 0.4599, + "step": 3469 + }, + { + "epoch": 0.6168888888888889, + "grad_norm": 0.25664037466049194, + "learning_rate": 1.6037542538255274e-05, + "loss": 0.4026, + "step": 3470 + }, + { + "epoch": 0.6170666666666667, + "grad_norm": 0.2589212954044342, + "learning_rate": 1.6024509393718844e-05, + "loss": 0.387, + "step": 3471 + }, + { + "epoch": 0.6172444444444445, + "grad_norm": 0.27300047874450684, + "learning_rate": 1.6011479048891324e-05, + "loss": 0.4278, + "step": 3472 + }, + { + "epoch": 0.6174222222222222, + "grad_norm": 0.29986175894737244, + "learning_rate": 1.5998451507837216e-05, + "loss": 0.5843, + "step": 3473 + }, + { + "epoch": 0.6176, + "grad_norm": 0.308949738740921, + "learning_rate": 1.598542677462021e-05, + "loss": 0.5381, + "step": 3474 + }, + { + "epoch": 0.6177777777777778, + "grad_norm": 0.2335803061723709, + "learning_rate": 1.5972404853303062e-05, + "loss": 0.4949, + "step": 3475 + }, + { + "epoch": 0.6179555555555556, + "grad_norm": 0.34124496579170227, + "learning_rate": 1.5959385747947698e-05, + "loss": 0.5642, + "step": 3476 + }, + { + "epoch": 0.6181333333333333, + "grad_norm": 0.27719154953956604, + "learning_rate": 1.5946369462615117e-05, + "loss": 0.504, + "step": 3477 + }, + { + "epoch": 0.6183111111111111, + "grad_norm": 0.21705426275730133, + "learning_rate": 1.59333560013655e-05, + "loss": 0.4315, + "step": 3478 + }, + { + "epoch": 0.6184888888888889, + "grad_norm": 0.2715508043766022, + "learning_rate": 1.5920345368258084e-05, + "loss": 0.571, + "step": 3479 + }, + { + "epoch": 0.6186666666666667, + "grad_norm": 0.22464250028133392, + "learning_rate": 1.5907337567351264e-05, + "loss": 0.4339, + "step": 3480 + }, + { + "epoch": 0.6188444444444444, + "grad_norm": 0.30078834295272827, + "learning_rate": 1.5894332602702545e-05, + "loss": 0.435, + "step": 3481 + }, + { + "epoch": 0.6190222222222223, + "grad_norm": 0.23706455528736115, + "learning_rate": 1.588133047836854e-05, + "loss": 0.55, + "step": 3482 + }, + { + "epoch": 0.6192, + "grad_norm": 0.2755063474178314, + "learning_rate": 1.586833119840497e-05, + "loss": 0.5109, + "step": 3483 + }, + { + "epoch": 0.6193777777777778, + "grad_norm": 0.232285276055336, + "learning_rate": 1.585533476686669e-05, + "loss": 0.442, + "step": 3484 + }, + { + "epoch": 0.6195555555555555, + "grad_norm": 0.2604234516620636, + "learning_rate": 1.584234118780764e-05, + "loss": 0.4841, + "step": 3485 + }, + { + "epoch": 0.6197333333333334, + "grad_norm": 0.23840339481830597, + "learning_rate": 1.58293504652809e-05, + "loss": 0.5414, + "step": 3486 + }, + { + "epoch": 0.6199111111111111, + "grad_norm": 0.3220157325267792, + "learning_rate": 1.581636260333863e-05, + "loss": 0.6238, + "step": 3487 + }, + { + "epoch": 0.6200888888888889, + "grad_norm": 0.22296661138534546, + "learning_rate": 1.580337760603212e-05, + "loss": 0.3318, + "step": 3488 + }, + { + "epoch": 0.6202666666666666, + "grad_norm": 0.29308021068573, + "learning_rate": 1.579039547741175e-05, + "loss": 0.5567, + "step": 3489 + }, + { + "epoch": 0.6204444444444445, + "grad_norm": 0.3049785792827606, + "learning_rate": 1.577741622152702e-05, + "loss": 0.4892, + "step": 3490 + }, + { + "epoch": 0.6206222222222222, + "grad_norm": 0.3373163342475891, + "learning_rate": 1.5764439842426515e-05, + "loss": 0.4142, + "step": 3491 + }, + { + "epoch": 0.6208, + "grad_norm": 0.3368525505065918, + "learning_rate": 1.5751466344157947e-05, + "loss": 0.3893, + "step": 3492 + }, + { + "epoch": 0.6209777777777777, + "grad_norm": 0.2850845158100128, + "learning_rate": 1.5738495730768105e-05, + "loss": 0.4022, + "step": 3493 + }, + { + "epoch": 0.6211555555555556, + "grad_norm": 0.35597696900367737, + "learning_rate": 1.5725528006302898e-05, + "loss": 0.3774, + "step": 3494 + }, + { + "epoch": 0.6213333333333333, + "grad_norm": 0.35393720865249634, + "learning_rate": 1.5712563174807317e-05, + "loss": 0.4362, + "step": 3495 + }, + { + "epoch": 0.6215111111111111, + "grad_norm": 0.35573136806488037, + "learning_rate": 1.5699601240325474e-05, + "loss": 0.3905, + "step": 3496 + }, + { + "epoch": 0.6216888888888888, + "grad_norm": 0.36656707525253296, + "learning_rate": 1.5686642206900538e-05, + "loss": 0.3849, + "step": 3497 + }, + { + "epoch": 0.6218666666666667, + "grad_norm": 0.4563030004501343, + "learning_rate": 1.5673686078574822e-05, + "loss": 0.3913, + "step": 3498 + }, + { + "epoch": 0.6220444444444444, + "grad_norm": 0.42437121272087097, + "learning_rate": 1.5660732859389686e-05, + "loss": 0.3411, + "step": 3499 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 0.482433944940567, + "learning_rate": 1.5647782553385636e-05, + "loss": 0.5558, + "step": 3500 + }, + { + "epoch": 0.6224, + "grad_norm": 0.2443191260099411, + "learning_rate": 1.56348351646022e-05, + "loss": 0.4034, + "step": 3501 + }, + { + "epoch": 0.6225777777777778, + "grad_norm": 0.23964877426624298, + "learning_rate": 1.562189069707807e-05, + "loss": 0.4474, + "step": 3502 + }, + { + "epoch": 0.6227555555555555, + "grad_norm": 0.2298581302165985, + "learning_rate": 1.560894915485095e-05, + "loss": 0.5241, + "step": 3503 + }, + { + "epoch": 0.6229333333333333, + "grad_norm": 0.27036231756210327, + "learning_rate": 1.5596010541957712e-05, + "loss": 0.4822, + "step": 3504 + }, + { + "epoch": 0.6231111111111111, + "grad_norm": 0.290789395570755, + "learning_rate": 1.5583074862434255e-05, + "loss": 0.5118, + "step": 3505 + }, + { + "epoch": 0.6232888888888889, + "grad_norm": 0.2417406141757965, + "learning_rate": 1.557014212031559e-05, + "loss": 0.4837, + "step": 3506 + }, + { + "epoch": 0.6234666666666666, + "grad_norm": 0.2861023545265198, + "learning_rate": 1.555721231963579e-05, + "loss": 0.3431, + "step": 3507 + }, + { + "epoch": 0.6236444444444444, + "grad_norm": 0.2518467605113983, + "learning_rate": 1.5544285464428045e-05, + "loss": 0.5041, + "step": 3508 + }, + { + "epoch": 0.6238222222222222, + "grad_norm": 0.4318239688873291, + "learning_rate": 1.5531361558724587e-05, + "loss": 0.7202, + "step": 3509 + }, + { + "epoch": 0.624, + "grad_norm": 0.28938204050064087, + "learning_rate": 1.5518440606556766e-05, + "loss": 0.3595, + "step": 3510 + }, + { + "epoch": 0.6241777777777778, + "grad_norm": 0.2674754858016968, + "learning_rate": 1.5505522611954975e-05, + "loss": 0.4859, + "step": 3511 + }, + { + "epoch": 0.6243555555555556, + "grad_norm": 0.30254605412483215, + "learning_rate": 1.549260757894871e-05, + "loss": 0.4884, + "step": 3512 + }, + { + "epoch": 0.6245333333333334, + "grad_norm": 0.23700743913650513, + "learning_rate": 1.5479695511566534e-05, + "loss": 0.4125, + "step": 3513 + }, + { + "epoch": 0.6247111111111111, + "grad_norm": 0.3400924801826477, + "learning_rate": 1.5466786413836077e-05, + "loss": 0.6042, + "step": 3514 + }, + { + "epoch": 0.6248888888888889, + "grad_norm": 0.2733568251132965, + "learning_rate": 1.5453880289784066e-05, + "loss": 0.5067, + "step": 3515 + }, + { + "epoch": 0.6250666666666667, + "grad_norm": 0.25780993700027466, + "learning_rate": 1.544097714343627e-05, + "loss": 0.5733, + "step": 3516 + }, + { + "epoch": 0.6252444444444445, + "grad_norm": 0.3310803174972534, + "learning_rate": 1.5428076978817562e-05, + "loss": 0.5408, + "step": 3517 + }, + { + "epoch": 0.6254222222222222, + "grad_norm": 0.19981937110424042, + "learning_rate": 1.5415179799951844e-05, + "loss": 0.4298, + "step": 3518 + }, + { + "epoch": 0.6256, + "grad_norm": 0.24123001098632812, + "learning_rate": 1.5402285610862142e-05, + "loss": 0.3339, + "step": 3519 + }, + { + "epoch": 0.6257777777777778, + "grad_norm": 0.29009774327278137, + "learning_rate": 1.538939441557048e-05, + "loss": 0.5873, + "step": 3520 + }, + { + "epoch": 0.6259555555555556, + "grad_norm": 0.23727940022945404, + "learning_rate": 1.5376506218098015e-05, + "loss": 0.4402, + "step": 3521 + }, + { + "epoch": 0.6261333333333333, + "grad_norm": 0.21850895881652832, + "learning_rate": 1.5363621022464924e-05, + "loss": 0.3306, + "step": 3522 + }, + { + "epoch": 0.6263111111111112, + "grad_norm": 0.22471006214618683, + "learning_rate": 1.535073883269048e-05, + "loss": 0.3565, + "step": 3523 + }, + { + "epoch": 0.6264888888888889, + "grad_norm": 0.26791492104530334, + "learning_rate": 1.533785965279298e-05, + "loss": 0.582, + "step": 3524 + }, + { + "epoch": 0.6266666666666667, + "grad_norm": 0.2948598861694336, + "learning_rate": 1.5324983486789818e-05, + "loss": 0.5376, + "step": 3525 + }, + { + "epoch": 0.6268444444444444, + "grad_norm": 0.24018436670303345, + "learning_rate": 1.5312110338697426e-05, + "loss": 0.5029, + "step": 3526 + }, + { + "epoch": 0.6270222222222223, + "grad_norm": 0.29089438915252686, + "learning_rate": 1.5299240212531314e-05, + "loss": 0.4862, + "step": 3527 + }, + { + "epoch": 0.6272, + "grad_norm": 0.2648610472679138, + "learning_rate": 1.5286373112306018e-05, + "loss": 0.5493, + "step": 3528 + }, + { + "epoch": 0.6273777777777778, + "grad_norm": 0.30678480863571167, + "learning_rate": 1.5273509042035172e-05, + "loss": 0.5334, + "step": 3529 + }, + { + "epoch": 0.6275555555555555, + "grad_norm": 0.2817789614200592, + "learning_rate": 1.5260648005731427e-05, + "loss": 0.4773, + "step": 3530 + }, + { + "epoch": 0.6277333333333334, + "grad_norm": 0.33746954798698425, + "learning_rate": 1.524779000740651e-05, + "loss": 0.4933, + "step": 3531 + }, + { + "epoch": 0.6279111111111111, + "grad_norm": 0.25532791018486023, + "learning_rate": 1.5234935051071192e-05, + "loss": 0.5798, + "step": 3532 + }, + { + "epoch": 0.6280888888888889, + "grad_norm": 0.2670576870441437, + "learning_rate": 1.52220831407353e-05, + "loss": 0.4848, + "step": 3533 + }, + { + "epoch": 0.6282666666666666, + "grad_norm": 0.22281870245933533, + "learning_rate": 1.5209234280407697e-05, + "loss": 0.3885, + "step": 3534 + }, + { + "epoch": 0.6284444444444445, + "grad_norm": 0.2752179503440857, + "learning_rate": 1.5196388474096319e-05, + "loss": 0.3617, + "step": 3535 + }, + { + "epoch": 0.6286222222222222, + "grad_norm": 0.24865487217903137, + "learning_rate": 1.5183545725808127e-05, + "loss": 0.439, + "step": 3536 + }, + { + "epoch": 0.6288, + "grad_norm": 0.23080642521381378, + "learning_rate": 1.5170706039549142e-05, + "loss": 0.4813, + "step": 3537 + }, + { + "epoch": 0.6289777777777777, + "grad_norm": 0.296745628118515, + "learning_rate": 1.515786941932441e-05, + "loss": 0.5631, + "step": 3538 + }, + { + "epoch": 0.6291555555555556, + "grad_norm": 0.2646890878677368, + "learning_rate": 1.5145035869138067e-05, + "loss": 0.3949, + "step": 3539 + }, + { + "epoch": 0.6293333333333333, + "grad_norm": 0.2304856926202774, + "learning_rate": 1.513220539299322e-05, + "loss": 0.4227, + "step": 3540 + }, + { + "epoch": 0.6295111111111111, + "grad_norm": 0.3167457580566406, + "learning_rate": 1.5119377994892094e-05, + "loss": 0.5353, + "step": 3541 + }, + { + "epoch": 0.6296888888888889, + "grad_norm": 0.26865383982658386, + "learning_rate": 1.5106553678835884e-05, + "loss": 0.3973, + "step": 3542 + }, + { + "epoch": 0.6298666666666667, + "grad_norm": 0.3043624758720398, + "learning_rate": 1.5093732448824888e-05, + "loss": 0.3615, + "step": 3543 + }, + { + "epoch": 0.6300444444444444, + "grad_norm": 0.3228950798511505, + "learning_rate": 1.5080914308858374e-05, + "loss": 0.555, + "step": 3544 + }, + { + "epoch": 0.6302222222222222, + "grad_norm": 0.3670639097690582, + "learning_rate": 1.506809926293471e-05, + "loss": 0.3384, + "step": 3545 + }, + { + "epoch": 0.6304, + "grad_norm": 0.3196782171726227, + "learning_rate": 1.505528731505126e-05, + "loss": 0.3457, + "step": 3546 + }, + { + "epoch": 0.6305777777777778, + "grad_norm": 0.34762653708457947, + "learning_rate": 1.5042478469204435e-05, + "loss": 0.4191, + "step": 3547 + }, + { + "epoch": 0.6307555555555555, + "grad_norm": 0.41670212149620056, + "learning_rate": 1.5029672729389669e-05, + "loss": 0.4866, + "step": 3548 + }, + { + "epoch": 0.6309333333333333, + "grad_norm": 0.3807736933231354, + "learning_rate": 1.5016870099601444e-05, + "loss": 0.376, + "step": 3549 + }, + { + "epoch": 0.6311111111111111, + "grad_norm": 0.534493625164032, + "learning_rate": 1.5004070583833251e-05, + "loss": 0.4595, + "step": 3550 + }, + { + "epoch": 0.6312888888888889, + "grad_norm": 0.23593223094940186, + "learning_rate": 1.4991274186077632e-05, + "loss": 0.3885, + "step": 3551 + }, + { + "epoch": 0.6314666666666666, + "grad_norm": 0.2329351007938385, + "learning_rate": 1.4978480910326132e-05, + "loss": 0.3756, + "step": 3552 + }, + { + "epoch": 0.6316444444444445, + "grad_norm": 0.28859513998031616, + "learning_rate": 1.4965690760569346e-05, + "loss": 0.6125, + "step": 3553 + }, + { + "epoch": 0.6318222222222222, + "grad_norm": 0.2809862494468689, + "learning_rate": 1.4952903740796873e-05, + "loss": 0.4294, + "step": 3554 + }, + { + "epoch": 0.632, + "grad_norm": 0.2988995313644409, + "learning_rate": 1.4940119854997354e-05, + "loss": 0.5165, + "step": 3555 + }, + { + "epoch": 0.6321777777777777, + "grad_norm": 0.3111002445220947, + "learning_rate": 1.4927339107158437e-05, + "loss": 0.5201, + "step": 3556 + }, + { + "epoch": 0.6323555555555556, + "grad_norm": 0.30978119373321533, + "learning_rate": 1.4914561501266805e-05, + "loss": 0.5249, + "step": 3557 + }, + { + "epoch": 0.6325333333333333, + "grad_norm": 0.2904992699623108, + "learning_rate": 1.4901787041308146e-05, + "loss": 0.5185, + "step": 3558 + }, + { + "epoch": 0.6327111111111111, + "grad_norm": 0.29443278908729553, + "learning_rate": 1.4889015731267186e-05, + "loss": 0.5009, + "step": 3559 + }, + { + "epoch": 0.6328888888888888, + "grad_norm": 0.2625446915626526, + "learning_rate": 1.4876247575127641e-05, + "loss": 0.447, + "step": 3560 + }, + { + "epoch": 0.6330666666666667, + "grad_norm": 0.301430344581604, + "learning_rate": 1.4863482576872275e-05, + "loss": 0.576, + "step": 3561 + }, + { + "epoch": 0.6332444444444445, + "grad_norm": 0.2658097743988037, + "learning_rate": 1.485072074048284e-05, + "loss": 0.4526, + "step": 3562 + }, + { + "epoch": 0.6334222222222222, + "grad_norm": 0.2660572826862335, + "learning_rate": 1.4837962069940114e-05, + "loss": 0.5995, + "step": 3563 + }, + { + "epoch": 0.6336, + "grad_norm": 0.318282812833786, + "learning_rate": 1.4825206569223899e-05, + "loss": 0.649, + "step": 3564 + }, + { + "epoch": 0.6337777777777778, + "grad_norm": 0.30754390358924866, + "learning_rate": 1.4812454242312979e-05, + "loss": 0.5575, + "step": 3565 + }, + { + "epoch": 0.6339555555555556, + "grad_norm": 0.22760820388793945, + "learning_rate": 1.4799705093185181e-05, + "loss": 0.3546, + "step": 3566 + }, + { + "epoch": 0.6341333333333333, + "grad_norm": 0.3868652582168579, + "learning_rate": 1.4786959125817312e-05, + "loss": 0.551, + "step": 3567 + }, + { + "epoch": 0.6343111111111112, + "grad_norm": 0.2678442597389221, + "learning_rate": 1.4774216344185205e-05, + "loss": 0.4576, + "step": 3568 + }, + { + "epoch": 0.6344888888888889, + "grad_norm": 0.33745184540748596, + "learning_rate": 1.476147675226369e-05, + "loss": 0.6076, + "step": 3569 + }, + { + "epoch": 0.6346666666666667, + "grad_norm": 0.26735594868659973, + "learning_rate": 1.474874035402661e-05, + "loss": 0.4059, + "step": 3570 + }, + { + "epoch": 0.6348444444444444, + "grad_norm": 0.3119068145751953, + "learning_rate": 1.4736007153446801e-05, + "loss": 0.5051, + "step": 3571 + }, + { + "epoch": 0.6350222222222223, + "grad_norm": 0.32012316584587097, + "learning_rate": 1.4723277154496111e-05, + "loss": 0.4705, + "step": 3572 + }, + { + "epoch": 0.6352, + "grad_norm": 0.2335209846496582, + "learning_rate": 1.4710550361145386e-05, + "loss": 0.5993, + "step": 3573 + }, + { + "epoch": 0.6353777777777778, + "grad_norm": 0.3181397020816803, + "learning_rate": 1.4697826777364477e-05, + "loss": 0.7151, + "step": 3574 + }, + { + "epoch": 0.6355555555555555, + "grad_norm": 0.2864576280117035, + "learning_rate": 1.4685106407122218e-05, + "loss": 0.4293, + "step": 3575 + }, + { + "epoch": 0.6357333333333334, + "grad_norm": 0.2603675127029419, + "learning_rate": 1.467238925438646e-05, + "loss": 0.5728, + "step": 3576 + }, + { + "epoch": 0.6359111111111111, + "grad_norm": 0.26273441314697266, + "learning_rate": 1.4659675323124036e-05, + "loss": 0.3883, + "step": 3577 + }, + { + "epoch": 0.6360888888888889, + "grad_norm": 0.2464042454957962, + "learning_rate": 1.464696461730079e-05, + "loss": 0.4741, + "step": 3578 + }, + { + "epoch": 0.6362666666666666, + "grad_norm": 0.28392016887664795, + "learning_rate": 1.4634257140881536e-05, + "loss": 0.5801, + "step": 3579 + }, + { + "epoch": 0.6364444444444445, + "grad_norm": 0.28749316930770874, + "learning_rate": 1.462155289783011e-05, + "loss": 0.4557, + "step": 3580 + }, + { + "epoch": 0.6366222222222222, + "grad_norm": 0.30617669224739075, + "learning_rate": 1.4608851892109304e-05, + "loss": 0.4567, + "step": 3581 + }, + { + "epoch": 0.6368, + "grad_norm": 0.2467452436685562, + "learning_rate": 1.4596154127680947e-05, + "loss": 0.4446, + "step": 3582 + }, + { + "epoch": 0.6369777777777778, + "grad_norm": 0.3463928997516632, + "learning_rate": 1.4583459608505801e-05, + "loss": 0.6071, + "step": 3583 + }, + { + "epoch": 0.6371555555555556, + "grad_norm": 0.29073402285575867, + "learning_rate": 1.4570768338543672e-05, + "loss": 0.4254, + "step": 3584 + }, + { + "epoch": 0.6373333333333333, + "grad_norm": 0.23615197837352753, + "learning_rate": 1.455808032175331e-05, + "loss": 0.4756, + "step": 3585 + }, + { + "epoch": 0.6375111111111111, + "grad_norm": 0.2662949562072754, + "learning_rate": 1.4545395562092468e-05, + "loss": 0.5345, + "step": 3586 + }, + { + "epoch": 0.6376888888888889, + "grad_norm": 0.26360878348350525, + "learning_rate": 1.4532714063517871e-05, + "loss": 0.4514, + "step": 3587 + }, + { + "epoch": 0.6378666666666667, + "grad_norm": 0.28474000096321106, + "learning_rate": 1.452003582998526e-05, + "loss": 0.4954, + "step": 3588 + }, + { + "epoch": 0.6380444444444444, + "grad_norm": 0.2530825734138489, + "learning_rate": 1.4507360865449319e-05, + "loss": 0.4094, + "step": 3589 + }, + { + "epoch": 0.6382222222222222, + "grad_norm": 0.2904086112976074, + "learning_rate": 1.4494689173863726e-05, + "loss": 0.5997, + "step": 3590 + }, + { + "epoch": 0.6384, + "grad_norm": 0.308840811252594, + "learning_rate": 1.4482020759181135e-05, + "loss": 0.391, + "step": 3591 + }, + { + "epoch": 0.6385777777777778, + "grad_norm": 0.3390917181968689, + "learning_rate": 1.4469355625353198e-05, + "loss": 0.4488, + "step": 3592 + }, + { + "epoch": 0.6387555555555555, + "grad_norm": 0.44907519221305847, + "learning_rate": 1.445669377633051e-05, + "loss": 0.3841, + "step": 3593 + }, + { + "epoch": 0.6389333333333334, + "grad_norm": 0.26853013038635254, + "learning_rate": 1.4444035216062684e-05, + "loss": 0.4086, + "step": 3594 + }, + { + "epoch": 0.6391111111111111, + "grad_norm": 0.32772859930992126, + "learning_rate": 1.4431379948498253e-05, + "loss": 0.3669, + "step": 3595 + }, + { + "epoch": 0.6392888888888889, + "grad_norm": 0.3255805969238281, + "learning_rate": 1.4418727977584774e-05, + "loss": 0.4992, + "step": 3596 + }, + { + "epoch": 0.6394666666666666, + "grad_norm": 0.32332366704940796, + "learning_rate": 1.4406079307268734e-05, + "loss": 0.4384, + "step": 3597 + }, + { + "epoch": 0.6396444444444445, + "grad_norm": 0.40644580125808716, + "learning_rate": 1.4393433941495637e-05, + "loss": 0.464, + "step": 3598 + }, + { + "epoch": 0.6398222222222222, + "grad_norm": 0.3871169984340668, + "learning_rate": 1.43807918842099e-05, + "loss": 0.32, + "step": 3599 + }, + { + "epoch": 0.64, + "grad_norm": 0.5102043151855469, + "learning_rate": 1.4368153139354962e-05, + "loss": 0.5066, + "step": 3600 + }, + { + "epoch": 0.6401777777777777, + "grad_norm": 0.2648928165435791, + "learning_rate": 1.4355517710873184e-05, + "loss": 0.5369, + "step": 3601 + }, + { + "epoch": 0.6403555555555556, + "grad_norm": 0.2976272702217102, + "learning_rate": 1.434288560270593e-05, + "loss": 0.515, + "step": 3602 + }, + { + "epoch": 0.6405333333333333, + "grad_norm": 0.27733874320983887, + "learning_rate": 1.4330256818793508e-05, + "loss": 0.5381, + "step": 3603 + }, + { + "epoch": 0.6407111111111111, + "grad_norm": 0.2524182200431824, + "learning_rate": 1.4317631363075184e-05, + "loss": 0.451, + "step": 3604 + }, + { + "epoch": 0.6408888888888888, + "grad_norm": 0.3096165955066681, + "learning_rate": 1.4305009239489192e-05, + "loss": 0.4473, + "step": 3605 + }, + { + "epoch": 0.6410666666666667, + "grad_norm": 0.2737775444984436, + "learning_rate": 1.4292390451972745e-05, + "loss": 0.5543, + "step": 3606 + }, + { + "epoch": 0.6412444444444444, + "grad_norm": 0.305505633354187, + "learning_rate": 1.427977500446199e-05, + "loss": 0.5438, + "step": 3607 + }, + { + "epoch": 0.6414222222222222, + "grad_norm": 0.2638792395591736, + "learning_rate": 1.426716290089204e-05, + "loss": 0.4717, + "step": 3608 + }, + { + "epoch": 0.6416, + "grad_norm": 0.2731456160545349, + "learning_rate": 1.4254554145196953e-05, + "loss": 0.4266, + "step": 3609 + }, + { + "epoch": 0.6417777777777778, + "grad_norm": 0.3034774959087372, + "learning_rate": 1.4241948741309782e-05, + "loss": 0.437, + "step": 3610 + }, + { + "epoch": 0.6419555555555555, + "grad_norm": 0.21936370432376862, + "learning_rate": 1.42293466931625e-05, + "loss": 0.455, + "step": 3611 + }, + { + "epoch": 0.6421333333333333, + "grad_norm": 0.2472326159477234, + "learning_rate": 1.421674800468603e-05, + "loss": 0.4134, + "step": 3612 + }, + { + "epoch": 0.6423111111111112, + "grad_norm": 0.3099437654018402, + "learning_rate": 1.4204152679810258e-05, + "loss": 0.5684, + "step": 3613 + }, + { + "epoch": 0.6424888888888889, + "grad_norm": 0.2899812161922455, + "learning_rate": 1.4191560722464032e-05, + "loss": 0.3793, + "step": 3614 + }, + { + "epoch": 0.6426666666666667, + "grad_norm": 0.28951987624168396, + "learning_rate": 1.4178972136575153e-05, + "loss": 0.4825, + "step": 3615 + }, + { + "epoch": 0.6428444444444444, + "grad_norm": 0.24252907931804657, + "learning_rate": 1.4166386926070322e-05, + "loss": 0.4685, + "step": 3616 + }, + { + "epoch": 0.6430222222222223, + "grad_norm": 0.30897241830825806, + "learning_rate": 1.4153805094875248e-05, + "loss": 0.491, + "step": 3617 + }, + { + "epoch": 0.6432, + "grad_norm": 0.2750794291496277, + "learning_rate": 1.4141226646914541e-05, + "loss": 0.4469, + "step": 3618 + }, + { + "epoch": 0.6433777777777778, + "grad_norm": 0.22074298560619354, + "learning_rate": 1.412865158611179e-05, + "loss": 0.4107, + "step": 3619 + }, + { + "epoch": 0.6435555555555555, + "grad_norm": 0.26437100768089294, + "learning_rate": 1.4116079916389507e-05, + "loss": 0.4277, + "step": 3620 + }, + { + "epoch": 0.6437333333333334, + "grad_norm": 0.288970947265625, + "learning_rate": 1.4103511641669152e-05, + "loss": 0.4564, + "step": 3621 + }, + { + "epoch": 0.6439111111111111, + "grad_norm": 0.2344079166650772, + "learning_rate": 1.4090946765871104e-05, + "loss": 0.3158, + "step": 3622 + }, + { + "epoch": 0.6440888888888889, + "grad_norm": 0.32060837745666504, + "learning_rate": 1.4078385292914736e-05, + "loss": 0.412, + "step": 3623 + }, + { + "epoch": 0.6442666666666667, + "grad_norm": 0.29774701595306396, + "learning_rate": 1.406582722671831e-05, + "loss": 0.4337, + "step": 3624 + }, + { + "epoch": 0.6444444444444445, + "grad_norm": 0.26115119457244873, + "learning_rate": 1.4053272571199036e-05, + "loss": 0.4082, + "step": 3625 + }, + { + "epoch": 0.6446222222222222, + "grad_norm": 0.26321181654930115, + "learning_rate": 1.4040721330273062e-05, + "loss": 0.5051, + "step": 3626 + }, + { + "epoch": 0.6448, + "grad_norm": 0.288730263710022, + "learning_rate": 1.4028173507855493e-05, + "loss": 0.5353, + "step": 3627 + }, + { + "epoch": 0.6449777777777778, + "grad_norm": 0.2653726041316986, + "learning_rate": 1.401562910786034e-05, + "loss": 0.4388, + "step": 3628 + }, + { + "epoch": 0.6451555555555556, + "grad_norm": 0.2973995506763458, + "learning_rate": 1.4003088134200553e-05, + "loss": 0.4968, + "step": 3629 + }, + { + "epoch": 0.6453333333333333, + "grad_norm": 0.2280983030796051, + "learning_rate": 1.399055059078801e-05, + "loss": 0.3877, + "step": 3630 + }, + { + "epoch": 0.6455111111111111, + "grad_norm": 0.22726783156394958, + "learning_rate": 1.397801648153354e-05, + "loss": 0.545, + "step": 3631 + }, + { + "epoch": 0.6456888888888889, + "grad_norm": 0.29215335845947266, + "learning_rate": 1.3965485810346874e-05, + "loss": 0.4247, + "step": 3632 + }, + { + "epoch": 0.6458666666666667, + "grad_norm": 0.32735562324523926, + "learning_rate": 1.3952958581136688e-05, + "loss": 0.5571, + "step": 3633 + }, + { + "epoch": 0.6460444444444444, + "grad_norm": 0.21734178066253662, + "learning_rate": 1.3940434797810568e-05, + "loss": 0.4084, + "step": 3634 + }, + { + "epoch": 0.6462222222222223, + "grad_norm": 0.3073517680168152, + "learning_rate": 1.392791446427505e-05, + "loss": 0.5069, + "step": 3635 + }, + { + "epoch": 0.6464, + "grad_norm": 0.2676813006401062, + "learning_rate": 1.3915397584435563e-05, + "loss": 0.4057, + "step": 3636 + }, + { + "epoch": 0.6465777777777778, + "grad_norm": 0.21096782386302948, + "learning_rate": 1.3902884162196508e-05, + "loss": 0.4146, + "step": 3637 + }, + { + "epoch": 0.6467555555555555, + "grad_norm": 0.2558507025241852, + "learning_rate": 1.3890374201461132e-05, + "loss": 0.5424, + "step": 3638 + }, + { + "epoch": 0.6469333333333334, + "grad_norm": 0.27405086159706116, + "learning_rate": 1.387786770613167e-05, + "loss": 0.4401, + "step": 3639 + }, + { + "epoch": 0.6471111111111111, + "grad_norm": 0.28879252076148987, + "learning_rate": 1.386536468010924e-05, + "loss": 0.4553, + "step": 3640 + }, + { + "epoch": 0.6472888888888889, + "grad_norm": 0.29778018593788147, + "learning_rate": 1.3852865127293902e-05, + "loss": 0.2955, + "step": 3641 + }, + { + "epoch": 0.6474666666666666, + "grad_norm": 0.28298091888427734, + "learning_rate": 1.3840369051584612e-05, + "loss": 0.3915, + "step": 3642 + }, + { + "epoch": 0.6476444444444445, + "grad_norm": 0.3052406311035156, + "learning_rate": 1.3827876456879246e-05, + "loss": 0.4773, + "step": 3643 + }, + { + "epoch": 0.6478222222222222, + "grad_norm": 0.3781207799911499, + "learning_rate": 1.3815387347074594e-05, + "loss": 0.3687, + "step": 3644 + }, + { + "epoch": 0.648, + "grad_norm": 0.3979189693927765, + "learning_rate": 1.3802901726066375e-05, + "loss": 0.3973, + "step": 3645 + }, + { + "epoch": 0.6481777777777777, + "grad_norm": 0.3300996720790863, + "learning_rate": 1.3790419597749199e-05, + "loss": 0.3711, + "step": 3646 + }, + { + "epoch": 0.6483555555555556, + "grad_norm": 0.6666622161865234, + "learning_rate": 1.3777940966016592e-05, + "loss": 0.5463, + "step": 3647 + }, + { + "epoch": 0.6485333333333333, + "grad_norm": 0.4219915568828583, + "learning_rate": 1.3765465834760988e-05, + "loss": 0.4281, + "step": 3648 + }, + { + "epoch": 0.6487111111111111, + "grad_norm": 0.3818369507789612, + "learning_rate": 1.3752994207873743e-05, + "loss": 0.5037, + "step": 3649 + }, + { + "epoch": 0.6488888888888888, + "grad_norm": 0.5252047777175903, + "learning_rate": 1.3740526089245109e-05, + "loss": 0.4474, + "step": 3650 + }, + { + "epoch": 0.6490666666666667, + "grad_norm": 0.3042842149734497, + "learning_rate": 1.3728061482764238e-05, + "loss": 0.525, + "step": 3651 + }, + { + "epoch": 0.6492444444444444, + "grad_norm": 0.293026864528656, + "learning_rate": 1.3715600392319186e-05, + "loss": 0.6196, + "step": 3652 + }, + { + "epoch": 0.6494222222222222, + "grad_norm": 0.2554410398006439, + "learning_rate": 1.3703142821796938e-05, + "loss": 0.4497, + "step": 3653 + }, + { + "epoch": 0.6496, + "grad_norm": 0.28548961877822876, + "learning_rate": 1.369068877508335e-05, + "loss": 0.5183, + "step": 3654 + }, + { + "epoch": 0.6497777777777778, + "grad_norm": 0.2873605191707611, + "learning_rate": 1.367823825606319e-05, + "loss": 0.6106, + "step": 3655 + }, + { + "epoch": 0.6499555555555555, + "grad_norm": 0.3186854422092438, + "learning_rate": 1.366579126862012e-05, + "loss": 0.7776, + "step": 3656 + }, + { + "epoch": 0.6501333333333333, + "grad_norm": 0.2968122065067291, + "learning_rate": 1.3653347816636727e-05, + "loss": 0.4679, + "step": 3657 + }, + { + "epoch": 0.6503111111111111, + "grad_norm": 0.2600296139717102, + "learning_rate": 1.3640907903994454e-05, + "loss": 0.5622, + "step": 3658 + }, + { + "epoch": 0.6504888888888889, + "grad_norm": 0.30393871665000916, + "learning_rate": 1.3628471534573686e-05, + "loss": 0.5214, + "step": 3659 + }, + { + "epoch": 0.6506666666666666, + "grad_norm": 0.2889195680618286, + "learning_rate": 1.3616038712253646e-05, + "loss": 0.4259, + "step": 3660 + }, + { + "epoch": 0.6508444444444444, + "grad_norm": 0.29727956652641296, + "learning_rate": 1.3603609440912507e-05, + "loss": 0.5057, + "step": 3661 + }, + { + "epoch": 0.6510222222222222, + "grad_norm": 0.28211915493011475, + "learning_rate": 1.3591183724427286e-05, + "loss": 0.5616, + "step": 3662 + }, + { + "epoch": 0.6512, + "grad_norm": 0.2475895881652832, + "learning_rate": 1.3578761566673954e-05, + "loss": 0.4202, + "step": 3663 + }, + { + "epoch": 0.6513777777777778, + "grad_norm": 0.28747761249542236, + "learning_rate": 1.3566342971527291e-05, + "loss": 0.5493, + "step": 3664 + }, + { + "epoch": 0.6515555555555556, + "grad_norm": 0.3396277129650116, + "learning_rate": 1.3553927942861016e-05, + "loss": 0.5443, + "step": 3665 + }, + { + "epoch": 0.6517333333333334, + "grad_norm": 0.335657000541687, + "learning_rate": 1.3541516484547753e-05, + "loss": 0.4237, + "step": 3666 + }, + { + "epoch": 0.6519111111111111, + "grad_norm": 0.2873026430606842, + "learning_rate": 1.3529108600458967e-05, + "loss": 0.5352, + "step": 3667 + }, + { + "epoch": 0.6520888888888889, + "grad_norm": 0.2761879861354828, + "learning_rate": 1.3516704294465027e-05, + "loss": 0.5863, + "step": 3668 + }, + { + "epoch": 0.6522666666666667, + "grad_norm": 0.3050605356693268, + "learning_rate": 1.3504303570435179e-05, + "loss": 0.6293, + "step": 3669 + }, + { + "epoch": 0.6524444444444445, + "grad_norm": 0.3679588735103607, + "learning_rate": 1.3491906432237577e-05, + "loss": 0.486, + "step": 3670 + }, + { + "epoch": 0.6526222222222222, + "grad_norm": 0.4235539734363556, + "learning_rate": 1.3479512883739232e-05, + "loss": 0.5246, + "step": 3671 + }, + { + "epoch": 0.6528, + "grad_norm": 0.25813600420951843, + "learning_rate": 1.3467122928806037e-05, + "loss": 0.4855, + "step": 3672 + }, + { + "epoch": 0.6529777777777778, + "grad_norm": 0.2842148542404175, + "learning_rate": 1.3454736571302763e-05, + "loss": 0.5106, + "step": 3673 + }, + { + "epoch": 0.6531555555555556, + "grad_norm": 0.2375819832086563, + "learning_rate": 1.3442353815093078e-05, + "loss": 0.4382, + "step": 3674 + }, + { + "epoch": 0.6533333333333333, + "grad_norm": 0.333158403635025, + "learning_rate": 1.3429974664039501e-05, + "loss": 0.5415, + "step": 3675 + }, + { + "epoch": 0.6535111111111112, + "grad_norm": 0.2537843883037567, + "learning_rate": 1.3417599122003464e-05, + "loss": 0.5797, + "step": 3676 + }, + { + "epoch": 0.6536888888888889, + "grad_norm": 0.28055626153945923, + "learning_rate": 1.340522719284521e-05, + "loss": 0.549, + "step": 3677 + }, + { + "epoch": 0.6538666666666667, + "grad_norm": 0.2653438150882721, + "learning_rate": 1.339285888042392e-05, + "loss": 0.4191, + "step": 3678 + }, + { + "epoch": 0.6540444444444444, + "grad_norm": 0.314916729927063, + "learning_rate": 1.3380494188597603e-05, + "loss": 0.7298, + "step": 3679 + }, + { + "epoch": 0.6542222222222223, + "grad_norm": 0.24185356497764587, + "learning_rate": 1.3368133121223187e-05, + "loss": 0.4542, + "step": 3680 + }, + { + "epoch": 0.6544, + "grad_norm": 0.3319655954837799, + "learning_rate": 1.3355775682156393e-05, + "loss": 0.5532, + "step": 3681 + }, + { + "epoch": 0.6545777777777778, + "grad_norm": 0.2659103274345398, + "learning_rate": 1.3343421875251888e-05, + "loss": 0.5087, + "step": 3682 + }, + { + "epoch": 0.6547555555555555, + "grad_norm": 0.35448548197746277, + "learning_rate": 1.3331071704363152e-05, + "loss": 0.5299, + "step": 3683 + }, + { + "epoch": 0.6549333333333334, + "grad_norm": 0.2732556164264679, + "learning_rate": 1.3318725173342572e-05, + "loss": 0.509, + "step": 3684 + }, + { + "epoch": 0.6551111111111111, + "grad_norm": 0.26744773983955383, + "learning_rate": 1.3306382286041369e-05, + "loss": 0.4271, + "step": 3685 + }, + { + "epoch": 0.6552888888888889, + "grad_norm": 0.23292432725429535, + "learning_rate": 1.329404304630964e-05, + "loss": 0.456, + "step": 3686 + }, + { + "epoch": 0.6554666666666666, + "grad_norm": 0.2885597348213196, + "learning_rate": 1.3281707457996335e-05, + "loss": 0.4234, + "step": 3687 + }, + { + "epoch": 0.6556444444444445, + "grad_norm": 0.2795572578907013, + "learning_rate": 1.3269375524949284e-05, + "loss": 0.5902, + "step": 3688 + }, + { + "epoch": 0.6558222222222222, + "grad_norm": 0.2856482267379761, + "learning_rate": 1.3257047251015158e-05, + "loss": 0.3293, + "step": 3689 + }, + { + "epoch": 0.656, + "grad_norm": 0.2692379951477051, + "learning_rate": 1.3244722640039497e-05, + "loss": 0.4182, + "step": 3690 + }, + { + "epoch": 0.6561777777777777, + "grad_norm": 0.26897457242012024, + "learning_rate": 1.3232401695866687e-05, + "loss": 0.3734, + "step": 3691 + }, + { + "epoch": 0.6563555555555556, + "grad_norm": 0.281819611787796, + "learning_rate": 1.322008442233999e-05, + "loss": 0.4514, + "step": 3692 + }, + { + "epoch": 0.6565333333333333, + "grad_norm": 0.33315202593803406, + "learning_rate": 1.3207770823301505e-05, + "loss": 0.3516, + "step": 3693 + }, + { + "epoch": 0.6567111111111111, + "grad_norm": 0.34838998317718506, + "learning_rate": 1.3195460902592195e-05, + "loss": 0.439, + "step": 3694 + }, + { + "epoch": 0.6568888888888889, + "grad_norm": 0.30764952301979065, + "learning_rate": 1.3183154664051855e-05, + "loss": 0.3998, + "step": 3695 + }, + { + "epoch": 0.6570666666666667, + "grad_norm": 0.3267304301261902, + "learning_rate": 1.3170852111519175e-05, + "loss": 0.3488, + "step": 3696 + }, + { + "epoch": 0.6572444444444444, + "grad_norm": 0.34809061884880066, + "learning_rate": 1.3158553248831657e-05, + "loss": 0.3854, + "step": 3697 + }, + { + "epoch": 0.6574222222222222, + "grad_norm": 0.47183486819267273, + "learning_rate": 1.3146258079825657e-05, + "loss": 0.4662, + "step": 3698 + }, + { + "epoch": 0.6576, + "grad_norm": 0.47761884331703186, + "learning_rate": 1.3133966608336385e-05, + "loss": 0.3672, + "step": 3699 + }, + { + "epoch": 0.6577777777777778, + "grad_norm": 0.4577677845954895, + "learning_rate": 1.3121678838197909e-05, + "loss": 0.468, + "step": 3700 + }, + { + "epoch": 0.6579555555555555, + "grad_norm": 0.30990877747535706, + "learning_rate": 1.3109394773243117e-05, + "loss": 0.4535, + "step": 3701 + }, + { + "epoch": 0.6581333333333333, + "grad_norm": 0.25815117359161377, + "learning_rate": 1.3097114417303782e-05, + "loss": 0.3632, + "step": 3702 + }, + { + "epoch": 0.6583111111111111, + "grad_norm": 0.27762022614479065, + "learning_rate": 1.308483777421046e-05, + "loss": 0.4664, + "step": 3703 + }, + { + "epoch": 0.6584888888888889, + "grad_norm": 0.2904873490333557, + "learning_rate": 1.3072564847792606e-05, + "loss": 0.4908, + "step": 3704 + }, + { + "epoch": 0.6586666666666666, + "grad_norm": 0.23394836485385895, + "learning_rate": 1.3060295641878473e-05, + "loss": 0.4202, + "step": 3705 + }, + { + "epoch": 0.6588444444444445, + "grad_norm": 0.2244613915681839, + "learning_rate": 1.3048030160295196e-05, + "loss": 0.4849, + "step": 3706 + }, + { + "epoch": 0.6590222222222222, + "grad_norm": 0.2783906161785126, + "learning_rate": 1.3035768406868714e-05, + "loss": 0.5783, + "step": 3707 + }, + { + "epoch": 0.6592, + "grad_norm": 0.27435386180877686, + "learning_rate": 1.302351038542381e-05, + "loss": 0.5487, + "step": 3708 + }, + { + "epoch": 0.6593777777777777, + "grad_norm": 0.25050902366638184, + "learning_rate": 1.3011256099784103e-05, + "loss": 0.5768, + "step": 3709 + }, + { + "epoch": 0.6595555555555556, + "grad_norm": 0.2641248106956482, + "learning_rate": 1.2999005553772068e-05, + "loss": 0.4147, + "step": 3710 + }, + { + "epoch": 0.6597333333333333, + "grad_norm": 0.2778720259666443, + "learning_rate": 1.2986758751208983e-05, + "loss": 0.542, + "step": 3711 + }, + { + "epoch": 0.6599111111111111, + "grad_norm": 0.2245502918958664, + "learning_rate": 1.297451569591498e-05, + "loss": 0.5583, + "step": 3712 + }, + { + "epoch": 0.6600888888888888, + "grad_norm": 0.22290337085723877, + "learning_rate": 1.2962276391708995e-05, + "loss": 0.5295, + "step": 3713 + }, + { + "epoch": 0.6602666666666667, + "grad_norm": 0.26890265941619873, + "learning_rate": 1.2950040842408834e-05, + "loss": 0.5159, + "step": 3714 + }, + { + "epoch": 0.6604444444444444, + "grad_norm": 0.2981129586696625, + "learning_rate": 1.2937809051831101e-05, + "loss": 0.534, + "step": 3715 + }, + { + "epoch": 0.6606222222222222, + "grad_norm": 0.261539101600647, + "learning_rate": 1.292558102379124e-05, + "loss": 0.4535, + "step": 3716 + }, + { + "epoch": 0.6608, + "grad_norm": 0.28450867533683777, + "learning_rate": 1.2913356762103502e-05, + "loss": 0.4279, + "step": 3717 + }, + { + "epoch": 0.6609777777777778, + "grad_norm": 0.2977967858314514, + "learning_rate": 1.2901136270580993e-05, + "loss": 0.4838, + "step": 3718 + }, + { + "epoch": 0.6611555555555556, + "grad_norm": 0.2609885632991791, + "learning_rate": 1.2888919553035653e-05, + "loss": 0.3574, + "step": 3719 + }, + { + "epoch": 0.6613333333333333, + "grad_norm": 0.28258606791496277, + "learning_rate": 1.2876706613278178e-05, + "loss": 0.5733, + "step": 3720 + }, + { + "epoch": 0.6615111111111112, + "grad_norm": 0.28239452838897705, + "learning_rate": 1.2864497455118152e-05, + "loss": 0.4997, + "step": 3721 + }, + { + "epoch": 0.6616888888888889, + "grad_norm": 0.2931699752807617, + "learning_rate": 1.2852292082363948e-05, + "loss": 0.3521, + "step": 3722 + }, + { + "epoch": 0.6618666666666667, + "grad_norm": 0.3117409944534302, + "learning_rate": 1.2840090498822788e-05, + "loss": 0.4556, + "step": 3723 + }, + { + "epoch": 0.6620444444444444, + "grad_norm": 0.31448081135749817, + "learning_rate": 1.2827892708300649e-05, + "loss": 0.4889, + "step": 3724 + }, + { + "epoch": 0.6622222222222223, + "grad_norm": 0.25680407881736755, + "learning_rate": 1.2815698714602403e-05, + "loss": 0.4939, + "step": 3725 + }, + { + "epoch": 0.6624, + "grad_norm": 0.21393144130706787, + "learning_rate": 1.280350852153168e-05, + "loss": 0.4222, + "step": 3726 + }, + { + "epoch": 0.6625777777777778, + "grad_norm": 0.28916069865226746, + "learning_rate": 1.279132213289096e-05, + "loss": 0.5149, + "step": 3727 + }, + { + "epoch": 0.6627555555555555, + "grad_norm": 0.30744144320487976, + "learning_rate": 1.2779139552481517e-05, + "loss": 0.4206, + "step": 3728 + }, + { + "epoch": 0.6629333333333334, + "grad_norm": 0.27311044931411743, + "learning_rate": 1.2766960784103438e-05, + "loss": 0.4847, + "step": 3729 + }, + { + "epoch": 0.6631111111111111, + "grad_norm": 0.27114731073379517, + "learning_rate": 1.2754785831555615e-05, + "loss": 0.4611, + "step": 3730 + }, + { + "epoch": 0.6632888888888889, + "grad_norm": 0.33003324270248413, + "learning_rate": 1.2742614698635782e-05, + "loss": 0.547, + "step": 3731 + }, + { + "epoch": 0.6634666666666666, + "grad_norm": 0.2770994305610657, + "learning_rate": 1.2730447389140449e-05, + "loss": 0.3976, + "step": 3732 + }, + { + "epoch": 0.6636444444444445, + "grad_norm": 0.28405457735061646, + "learning_rate": 1.2718283906864939e-05, + "loss": 0.6457, + "step": 3733 + }, + { + "epoch": 0.6638222222222222, + "grad_norm": 0.3116143047809601, + "learning_rate": 1.270612425560338e-05, + "loss": 0.4857, + "step": 3734 + }, + { + "epoch": 0.664, + "grad_norm": 0.30108338594436646, + "learning_rate": 1.2693968439148726e-05, + "loss": 0.5515, + "step": 3735 + }, + { + "epoch": 0.6641777777777778, + "grad_norm": 0.269810289144516, + "learning_rate": 1.2681816461292715e-05, + "loss": 0.471, + "step": 3736 + }, + { + "epoch": 0.6643555555555556, + "grad_norm": 0.2827768325805664, + "learning_rate": 1.266966832582589e-05, + "loss": 0.5727, + "step": 3737 + }, + { + "epoch": 0.6645333333333333, + "grad_norm": 0.22445344924926758, + "learning_rate": 1.2657524036537582e-05, + "loss": 0.5158, + "step": 3738 + }, + { + "epoch": 0.6647111111111111, + "grad_norm": 0.296317994594574, + "learning_rate": 1.2645383597215964e-05, + "loss": 0.4524, + "step": 3739 + }, + { + "epoch": 0.6648888888888889, + "grad_norm": 0.2978474497795105, + "learning_rate": 1.263324701164797e-05, + "loss": 0.4436, + "step": 3740 + }, + { + "epoch": 0.6650666666666667, + "grad_norm": 0.27710801362991333, + "learning_rate": 1.2621114283619345e-05, + "loss": 0.3271, + "step": 3741 + }, + { + "epoch": 0.6652444444444444, + "grad_norm": 0.32322874665260315, + "learning_rate": 1.2608985416914616e-05, + "loss": 0.5243, + "step": 3742 + }, + { + "epoch": 0.6654222222222222, + "grad_norm": 0.34060606360435486, + "learning_rate": 1.259686041531714e-05, + "loss": 0.4424, + "step": 3743 + }, + { + "epoch": 0.6656, + "grad_norm": 0.32810261845588684, + "learning_rate": 1.2584739282609031e-05, + "loss": 0.4825, + "step": 3744 + }, + { + "epoch": 0.6657777777777778, + "grad_norm": 0.38095948100090027, + "learning_rate": 1.257262202257124e-05, + "loss": 0.3541, + "step": 3745 + }, + { + "epoch": 0.6659555555555555, + "grad_norm": 0.27892521023750305, + "learning_rate": 1.2560508638983437e-05, + "loss": 0.3036, + "step": 3746 + }, + { + "epoch": 0.6661333333333334, + "grad_norm": 0.33980947732925415, + "learning_rate": 1.2548399135624167e-05, + "loss": 0.4225, + "step": 3747 + }, + { + "epoch": 0.6663111111111111, + "grad_norm": 0.3240998387336731, + "learning_rate": 1.2536293516270703e-05, + "loss": 0.3694, + "step": 3748 + }, + { + "epoch": 0.6664888888888889, + "grad_norm": 0.44670820236206055, + "learning_rate": 1.2524191784699147e-05, + "loss": 0.4797, + "step": 3749 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.5575885772705078, + "learning_rate": 1.2512093944684361e-05, + "loss": 0.4969, + "step": 3750 + }, + { + "epoch": 0.6668444444444445, + "grad_norm": 0.22958840429782867, + "learning_rate": 1.2500000000000006e-05, + "loss": 0.4522, + "step": 3751 + }, + { + "epoch": 0.6670222222222222, + "grad_norm": 0.2590342164039612, + "learning_rate": 1.2487909954418511e-05, + "loss": 0.4772, + "step": 3752 + }, + { + "epoch": 0.6672, + "grad_norm": 0.3410946726799011, + "learning_rate": 1.2475823811711126e-05, + "loss": 0.5314, + "step": 3753 + }, + { + "epoch": 0.6673777777777777, + "grad_norm": 0.268044650554657, + "learning_rate": 1.246374157564785e-05, + "loss": 0.3748, + "step": 3754 + }, + { + "epoch": 0.6675555555555556, + "grad_norm": 0.24918286502361298, + "learning_rate": 1.245166324999747e-05, + "loss": 0.5083, + "step": 3755 + }, + { + "epoch": 0.6677333333333333, + "grad_norm": 0.29544925689697266, + "learning_rate": 1.243958883852755e-05, + "loss": 0.6156, + "step": 3756 + }, + { + "epoch": 0.6679111111111111, + "grad_norm": 0.26616764068603516, + "learning_rate": 1.2427518345004458e-05, + "loss": 0.4468, + "step": 3757 + }, + { + "epoch": 0.6680888888888888, + "grad_norm": 0.24368666112422943, + "learning_rate": 1.241545177319331e-05, + "loss": 0.4613, + "step": 3758 + }, + { + "epoch": 0.6682666666666667, + "grad_norm": 0.3137027621269226, + "learning_rate": 1.2403389126858014e-05, + "loss": 0.5144, + "step": 3759 + }, + { + "epoch": 0.6684444444444444, + "grad_norm": 0.288210928440094, + "learning_rate": 1.239133040976124e-05, + "loss": 0.5745, + "step": 3760 + }, + { + "epoch": 0.6686222222222222, + "grad_norm": 0.304063081741333, + "learning_rate": 1.2379275625664461e-05, + "loss": 0.5395, + "step": 3761 + }, + { + "epoch": 0.6688, + "grad_norm": 0.3440232574939728, + "learning_rate": 1.2367224778327894e-05, + "loss": 0.6002, + "step": 3762 + }, + { + "epoch": 0.6689777777777778, + "grad_norm": 0.31900396943092346, + "learning_rate": 1.2355177871510538e-05, + "loss": 0.5787, + "step": 3763 + }, + { + "epoch": 0.6691555555555555, + "grad_norm": 0.26778024435043335, + "learning_rate": 1.2343134908970153e-05, + "loss": 0.5582, + "step": 3764 + }, + { + "epoch": 0.6693333333333333, + "grad_norm": 0.28850457072257996, + "learning_rate": 1.2331095894463302e-05, + "loss": 0.5564, + "step": 3765 + }, + { + "epoch": 0.6695111111111111, + "grad_norm": 0.27700814604759216, + "learning_rate": 1.2319060831745272e-05, + "loss": 0.3899, + "step": 3766 + }, + { + "epoch": 0.6696888888888889, + "grad_norm": 0.2851391136646271, + "learning_rate": 1.2307029724570166e-05, + "loss": 0.4866, + "step": 3767 + }, + { + "epoch": 0.6698666666666667, + "grad_norm": 0.29823037981987, + "learning_rate": 1.229500257669079e-05, + "loss": 0.458, + "step": 3768 + }, + { + "epoch": 0.6700444444444444, + "grad_norm": 0.28471851348876953, + "learning_rate": 1.2282979391858768e-05, + "loss": 0.4589, + "step": 3769 + }, + { + "epoch": 0.6702222222222223, + "grad_norm": 0.3492095470428467, + "learning_rate": 1.2270960173824483e-05, + "loss": 0.4394, + "step": 3770 + }, + { + "epoch": 0.6704, + "grad_norm": 0.2824592590332031, + "learning_rate": 1.2258944926337057e-05, + "loss": 0.4296, + "step": 3771 + }, + { + "epoch": 0.6705777777777778, + "grad_norm": 0.30075711011886597, + "learning_rate": 1.2246933653144385e-05, + "loss": 0.5613, + "step": 3772 + }, + { + "epoch": 0.6707555555555555, + "grad_norm": 0.2932722568511963, + "learning_rate": 1.2234926357993115e-05, + "loss": 0.4561, + "step": 3773 + }, + { + "epoch": 0.6709333333333334, + "grad_norm": 0.2685048282146454, + "learning_rate": 1.2222923044628676e-05, + "loss": 0.5374, + "step": 3774 + }, + { + "epoch": 0.6711111111111111, + "grad_norm": 0.3064698874950409, + "learning_rate": 1.2210923716795233e-05, + "loss": 0.57, + "step": 3775 + }, + { + "epoch": 0.6712888888888889, + "grad_norm": 0.28305307030677795, + "learning_rate": 1.2198928378235716e-05, + "loss": 0.4692, + "step": 3776 + }, + { + "epoch": 0.6714666666666667, + "grad_norm": 0.3337193727493286, + "learning_rate": 1.21869370326918e-05, + "loss": 0.5497, + "step": 3777 + }, + { + "epoch": 0.6716444444444445, + "grad_norm": 0.23489388823509216, + "learning_rate": 1.2174949683903941e-05, + "loss": 0.4302, + "step": 3778 + }, + { + "epoch": 0.6718222222222222, + "grad_norm": 0.29271256923675537, + "learning_rate": 1.2162966335611326e-05, + "loss": 0.5776, + "step": 3779 + }, + { + "epoch": 0.672, + "grad_norm": 0.272230327129364, + "learning_rate": 1.2150986991551897e-05, + "loss": 0.3446, + "step": 3780 + }, + { + "epoch": 0.6721777777777778, + "grad_norm": 0.3177034258842468, + "learning_rate": 1.2139011655462337e-05, + "loss": 0.7109, + "step": 3781 + }, + { + "epoch": 0.6723555555555556, + "grad_norm": 0.31202468276023865, + "learning_rate": 1.2127040331078115e-05, + "loss": 0.4923, + "step": 3782 + }, + { + "epoch": 0.6725333333333333, + "grad_norm": 0.24914774298667908, + "learning_rate": 1.2115073022133408e-05, + "loss": 0.4049, + "step": 3783 + }, + { + "epoch": 0.6727111111111111, + "grad_norm": 0.2841534912586212, + "learning_rate": 1.2103109732361179e-05, + "loss": 0.4446, + "step": 3784 + }, + { + "epoch": 0.6728888888888889, + "grad_norm": 0.26708075404167175, + "learning_rate": 1.2091150465493084e-05, + "loss": 0.5042, + "step": 3785 + }, + { + "epoch": 0.6730666666666667, + "grad_norm": 0.2697168290615082, + "learning_rate": 1.2079195225259579e-05, + "loss": 0.4903, + "step": 3786 + }, + { + "epoch": 0.6732444444444444, + "grad_norm": 0.26974692940711975, + "learning_rate": 1.2067244015389829e-05, + "loss": 0.3418, + "step": 3787 + }, + { + "epoch": 0.6734222222222223, + "grad_norm": 0.3019174635410309, + "learning_rate": 1.2055296839611773e-05, + "loss": 0.5222, + "step": 3788 + }, + { + "epoch": 0.6736, + "grad_norm": 0.32124119997024536, + "learning_rate": 1.2043353701652045e-05, + "loss": 0.5975, + "step": 3789 + }, + { + "epoch": 0.6737777777777778, + "grad_norm": 0.2736112177371979, + "learning_rate": 1.2031414605236066e-05, + "loss": 0.5762, + "step": 3790 + }, + { + "epoch": 0.6739555555555555, + "grad_norm": 0.34176111221313477, + "learning_rate": 1.2019479554087964e-05, + "loss": 0.5687, + "step": 3791 + }, + { + "epoch": 0.6741333333333334, + "grad_norm": 0.5681228041648865, + "learning_rate": 1.2007548551930634e-05, + "loss": 0.3991, + "step": 3792 + }, + { + "epoch": 0.6743111111111111, + "grad_norm": 0.31165456771850586, + "learning_rate": 1.1995621602485685e-05, + "loss": 0.4434, + "step": 3793 + }, + { + "epoch": 0.6744888888888889, + "grad_norm": 0.35658934712409973, + "learning_rate": 1.1983698709473468e-05, + "loss": 0.3794, + "step": 3794 + }, + { + "epoch": 0.6746666666666666, + "grad_norm": 0.3436378240585327, + "learning_rate": 1.1971779876613063e-05, + "loss": 0.3575, + "step": 3795 + }, + { + "epoch": 0.6748444444444445, + "grad_norm": 0.3623034656047821, + "learning_rate": 1.1959865107622307e-05, + "loss": 0.3865, + "step": 3796 + }, + { + "epoch": 0.6750222222222222, + "grad_norm": 0.38479626178741455, + "learning_rate": 1.194795440621774e-05, + "loss": 0.4245, + "step": 3797 + }, + { + "epoch": 0.6752, + "grad_norm": 0.40485680103302, + "learning_rate": 1.1936047776114654e-05, + "loss": 0.4821, + "step": 3798 + }, + { + "epoch": 0.6753777777777777, + "grad_norm": 0.5116071701049805, + "learning_rate": 1.1924145221027047e-05, + "loss": 0.3986, + "step": 3799 + }, + { + "epoch": 0.6755555555555556, + "grad_norm": 0.4739973843097687, + "learning_rate": 1.1912246744667686e-05, + "loss": 0.4343, + "step": 3800 + }, + { + "epoch": 0.6757333333333333, + "grad_norm": 0.2875535488128662, + "learning_rate": 1.1900352350748026e-05, + "loss": 0.4868, + "step": 3801 + }, + { + "epoch": 0.6759111111111111, + "grad_norm": 0.3556734323501587, + "learning_rate": 1.1888462042978268e-05, + "loss": 0.6428, + "step": 3802 + }, + { + "epoch": 0.6760888888888889, + "grad_norm": 0.22967340052127838, + "learning_rate": 1.1876575825067327e-05, + "loss": 0.3634, + "step": 3803 + }, + { + "epoch": 0.6762666666666667, + "grad_norm": 0.280965119600296, + "learning_rate": 1.1864693700722865e-05, + "loss": 0.5466, + "step": 3804 + }, + { + "epoch": 0.6764444444444444, + "grad_norm": 0.2734106183052063, + "learning_rate": 1.1852815673651246e-05, + "loss": 0.507, + "step": 3805 + }, + { + "epoch": 0.6766222222222222, + "grad_norm": 0.3374417722225189, + "learning_rate": 1.1840941747557558e-05, + "loss": 0.4295, + "step": 3806 + }, + { + "epoch": 0.6768, + "grad_norm": 0.28618279099464417, + "learning_rate": 1.1829071926145607e-05, + "loss": 0.5167, + "step": 3807 + }, + { + "epoch": 0.6769777777777778, + "grad_norm": 0.31252652406692505, + "learning_rate": 1.1817206213117942e-05, + "loss": 0.4678, + "step": 3808 + }, + { + "epoch": 0.6771555555555555, + "grad_norm": 0.2891975939273834, + "learning_rate": 1.18053446121758e-05, + "loss": 0.597, + "step": 3809 + }, + { + "epoch": 0.6773333333333333, + "grad_norm": 0.23467566072940826, + "learning_rate": 1.1793487127019173e-05, + "loss": 0.446, + "step": 3810 + }, + { + "epoch": 0.6775111111111111, + "grad_norm": 0.29486486315727234, + "learning_rate": 1.1781633761346707e-05, + "loss": 0.5508, + "step": 3811 + }, + { + "epoch": 0.6776888888888889, + "grad_norm": 0.27178841829299927, + "learning_rate": 1.1769784518855836e-05, + "loss": 0.445, + "step": 3812 + }, + { + "epoch": 0.6778666666666666, + "grad_norm": 0.3029109537601471, + "learning_rate": 1.1757939403242647e-05, + "loss": 0.5707, + "step": 3813 + }, + { + "epoch": 0.6780444444444444, + "grad_norm": 0.2756340205669403, + "learning_rate": 1.1746098418201986e-05, + "loss": 0.5961, + "step": 3814 + }, + { + "epoch": 0.6782222222222222, + "grad_norm": 0.3047228157520294, + "learning_rate": 1.1734261567427385e-05, + "loss": 0.4688, + "step": 3815 + }, + { + "epoch": 0.6784, + "grad_norm": 0.25551119446754456, + "learning_rate": 1.172242885461109e-05, + "loss": 0.4503, + "step": 3816 + }, + { + "epoch": 0.6785777777777777, + "grad_norm": 0.3651791512966156, + "learning_rate": 1.1710600283444047e-05, + "loss": 0.6738, + "step": 3817 + }, + { + "epoch": 0.6787555555555556, + "grad_norm": 0.313495397567749, + "learning_rate": 1.1698775857615943e-05, + "loss": 0.6173, + "step": 3818 + }, + { + "epoch": 0.6789333333333334, + "grad_norm": 0.2680607736110687, + "learning_rate": 1.1686955580815137e-05, + "loss": 0.5231, + "step": 3819 + }, + { + "epoch": 0.6791111111111111, + "grad_norm": 0.3124810457229614, + "learning_rate": 1.1675139456728701e-05, + "loss": 0.4955, + "step": 3820 + }, + { + "epoch": 0.6792888888888889, + "grad_norm": 0.26599523425102234, + "learning_rate": 1.1663327489042435e-05, + "loss": 0.4248, + "step": 3821 + }, + { + "epoch": 0.6794666666666667, + "grad_norm": 0.26010963320732117, + "learning_rate": 1.1651519681440817e-05, + "loss": 0.5021, + "step": 3822 + }, + { + "epoch": 0.6796444444444445, + "grad_norm": 0.20134033262729645, + "learning_rate": 1.1639716037607035e-05, + "loss": 0.4678, + "step": 3823 + }, + { + "epoch": 0.6798222222222222, + "grad_norm": 0.28365206718444824, + "learning_rate": 1.1627916561222968e-05, + "loss": 0.5323, + "step": 3824 + }, + { + "epoch": 0.68, + "grad_norm": 0.25659167766571045, + "learning_rate": 1.1616121255969226e-05, + "loss": 0.3852, + "step": 3825 + }, + { + "epoch": 0.6801777777777778, + "grad_norm": 0.3166353106498718, + "learning_rate": 1.1604330125525079e-05, + "loss": 0.5197, + "step": 3826 + }, + { + "epoch": 0.6803555555555556, + "grad_norm": 0.30156680941581726, + "learning_rate": 1.159254317356854e-05, + "loss": 0.5858, + "step": 3827 + }, + { + "epoch": 0.6805333333333333, + "grad_norm": 0.3877135217189789, + "learning_rate": 1.1580760403776255e-05, + "loss": 0.5837, + "step": 3828 + }, + { + "epoch": 0.6807111111111112, + "grad_norm": 0.2418241947889328, + "learning_rate": 1.1568981819823635e-05, + "loss": 0.4639, + "step": 3829 + }, + { + "epoch": 0.6808888888888889, + "grad_norm": 0.3416629731655121, + "learning_rate": 1.1557207425384728e-05, + "loss": 0.622, + "step": 3830 + }, + { + "epoch": 0.6810666666666667, + "grad_norm": 0.29652127623558044, + "learning_rate": 1.1545437224132318e-05, + "loss": 0.5851, + "step": 3831 + }, + { + "epoch": 0.6812444444444444, + "grad_norm": 0.32427823543548584, + "learning_rate": 1.153367121973786e-05, + "loss": 0.3783, + "step": 3832 + }, + { + "epoch": 0.6814222222222223, + "grad_norm": 0.3264455199241638, + "learning_rate": 1.1521909415871502e-05, + "loss": 0.3677, + "step": 3833 + }, + { + "epoch": 0.6816, + "grad_norm": 0.28872057795524597, + "learning_rate": 1.1510151816202067e-05, + "loss": 0.4233, + "step": 3834 + }, + { + "epoch": 0.6817777777777778, + "grad_norm": 0.23662762343883514, + "learning_rate": 1.1498398424397106e-05, + "loss": 0.3451, + "step": 3835 + }, + { + "epoch": 0.6819555555555555, + "grad_norm": 0.3008701503276825, + "learning_rate": 1.1486649244122824e-05, + "loss": 0.5659, + "step": 3836 + }, + { + "epoch": 0.6821333333333334, + "grad_norm": 0.3152478039264679, + "learning_rate": 1.1474904279044122e-05, + "loss": 0.509, + "step": 3837 + }, + { + "epoch": 0.6823111111111111, + "grad_norm": 0.24948373436927795, + "learning_rate": 1.1463163532824572e-05, + "loss": 0.487, + "step": 3838 + }, + { + "epoch": 0.6824888888888889, + "grad_norm": 0.24704550206661224, + "learning_rate": 1.1451427009126472e-05, + "loss": 0.3945, + "step": 3839 + }, + { + "epoch": 0.6826666666666666, + "grad_norm": 0.37283769249916077, + "learning_rate": 1.1439694711610756e-05, + "loss": 0.4606, + "step": 3840 + }, + { + "epoch": 0.6828444444444445, + "grad_norm": 0.2884368300437927, + "learning_rate": 1.1427966643937069e-05, + "loss": 0.505, + "step": 3841 + }, + { + "epoch": 0.6830222222222222, + "grad_norm": 0.3158597946166992, + "learning_rate": 1.1416242809763708e-05, + "loss": 0.4207, + "step": 3842 + }, + { + "epoch": 0.6832, + "grad_norm": 0.3262021541595459, + "learning_rate": 1.1404523212747692e-05, + "loss": 0.4389, + "step": 3843 + }, + { + "epoch": 0.6833777777777778, + "grad_norm": 0.40961745381355286, + "learning_rate": 1.1392807856544683e-05, + "loss": 0.3301, + "step": 3844 + }, + { + "epoch": 0.6835555555555556, + "grad_norm": 0.34743255376815796, + "learning_rate": 1.1381096744809028e-05, + "loss": 0.4311, + "step": 3845 + }, + { + "epoch": 0.6837333333333333, + "grad_norm": 0.33275002241134644, + "learning_rate": 1.1369389881193749e-05, + "loss": 0.5035, + "step": 3846 + }, + { + "epoch": 0.6839111111111111, + "grad_norm": 0.42534899711608887, + "learning_rate": 1.1357687269350564e-05, + "loss": 0.3949, + "step": 3847 + }, + { + "epoch": 0.6840888888888889, + "grad_norm": 0.3521302342414856, + "learning_rate": 1.1345988912929839e-05, + "loss": 0.3998, + "step": 3848 + }, + { + "epoch": 0.6842666666666667, + "grad_norm": 0.6208592057228088, + "learning_rate": 1.1334294815580623e-05, + "loss": 0.3799, + "step": 3849 + }, + { + "epoch": 0.6844444444444444, + "grad_norm": 0.4304450750350952, + "learning_rate": 1.1322604980950621e-05, + "loss": 0.4651, + "step": 3850 + }, + { + "epoch": 0.6846222222222222, + "grad_norm": 0.25129497051239014, + "learning_rate": 1.1310919412686247e-05, + "loss": 0.3831, + "step": 3851 + }, + { + "epoch": 0.6848, + "grad_norm": 0.29035142064094543, + "learning_rate": 1.129923811443254e-05, + "loss": 0.4465, + "step": 3852 + }, + { + "epoch": 0.6849777777777778, + "grad_norm": 0.2642446756362915, + "learning_rate": 1.1287561089833248e-05, + "loss": 0.3919, + "step": 3853 + }, + { + "epoch": 0.6851555555555555, + "grad_norm": 0.27215635776519775, + "learning_rate": 1.1275888342530736e-05, + "loss": 0.4977, + "step": 3854 + }, + { + "epoch": 0.6853333333333333, + "grad_norm": 0.37257617712020874, + "learning_rate": 1.1264219876166085e-05, + "loss": 0.665, + "step": 3855 + }, + { + "epoch": 0.6855111111111111, + "grad_norm": 0.2865580916404724, + "learning_rate": 1.1252555694379006e-05, + "loss": 0.4658, + "step": 3856 + }, + { + "epoch": 0.6856888888888889, + "grad_norm": 0.29447847604751587, + "learning_rate": 1.1240895800807899e-05, + "loss": 0.4554, + "step": 3857 + }, + { + "epoch": 0.6858666666666666, + "grad_norm": 0.28941842913627625, + "learning_rate": 1.1229240199089807e-05, + "loss": 0.4038, + "step": 3858 + }, + { + "epoch": 0.6860444444444445, + "grad_norm": 0.29001617431640625, + "learning_rate": 1.1217588892860445e-05, + "loss": 0.7321, + "step": 3859 + }, + { + "epoch": 0.6862222222222222, + "grad_norm": 0.2791994512081146, + "learning_rate": 1.120594188575417e-05, + "loss": 0.5576, + "step": 3860 + }, + { + "epoch": 0.6864, + "grad_norm": 0.27800828218460083, + "learning_rate": 1.1194299181404036e-05, + "loss": 0.4253, + "step": 3861 + }, + { + "epoch": 0.6865777777777777, + "grad_norm": 0.2804970443248749, + "learning_rate": 1.1182660783441718e-05, + "loss": 0.4371, + "step": 3862 + }, + { + "epoch": 0.6867555555555556, + "grad_norm": 0.26960620284080505, + "learning_rate": 1.1171026695497558e-05, + "loss": 0.4828, + "step": 3863 + }, + { + "epoch": 0.6869333333333333, + "grad_norm": 0.27683693170547485, + "learning_rate": 1.1159396921200554e-05, + "loss": 0.3689, + "step": 3864 + }, + { + "epoch": 0.6871111111111111, + "grad_norm": 0.24399888515472412, + "learning_rate": 1.1147771464178378e-05, + "loss": 0.5386, + "step": 3865 + }, + { + "epoch": 0.6872888888888888, + "grad_norm": 0.37429189682006836, + "learning_rate": 1.1136150328057324e-05, + "loss": 0.6165, + "step": 3866 + }, + { + "epoch": 0.6874666666666667, + "grad_norm": 0.2687307894229889, + "learning_rate": 1.1124533516462356e-05, + "loss": 0.4125, + "step": 3867 + }, + { + "epoch": 0.6876444444444444, + "grad_norm": 0.31186121702194214, + "learning_rate": 1.1112921033017079e-05, + "loss": 0.5019, + "step": 3868 + }, + { + "epoch": 0.6878222222222222, + "grad_norm": 0.22518765926361084, + "learning_rate": 1.1101312881343768e-05, + "loss": 0.3973, + "step": 3869 + }, + { + "epoch": 0.688, + "grad_norm": 0.295976459980011, + "learning_rate": 1.1089709065063324e-05, + "loss": 0.5375, + "step": 3870 + }, + { + "epoch": 0.6881777777777778, + "grad_norm": 0.2932175397872925, + "learning_rate": 1.107810958779531e-05, + "loss": 0.4747, + "step": 3871 + }, + { + "epoch": 0.6883555555555556, + "grad_norm": 0.26988255977630615, + "learning_rate": 1.1066514453157914e-05, + "loss": 0.5588, + "step": 3872 + }, + { + "epoch": 0.6885333333333333, + "grad_norm": 0.2832370400428772, + "learning_rate": 1.1054923664768002e-05, + "loss": 0.4237, + "step": 3873 + }, + { + "epoch": 0.6887111111111112, + "grad_norm": 0.30757683515548706, + "learning_rate": 1.1043337226241073e-05, + "loss": 0.47, + "step": 3874 + }, + { + "epoch": 0.6888888888888889, + "grad_norm": 0.30981534719467163, + "learning_rate": 1.1031755141191258e-05, + "loss": 0.5936, + "step": 3875 + }, + { + "epoch": 0.6890666666666667, + "grad_norm": 0.26662352681159973, + "learning_rate": 1.1020177413231334e-05, + "loss": 0.4165, + "step": 3876 + }, + { + "epoch": 0.6892444444444444, + "grad_norm": 0.2750694453716278, + "learning_rate": 1.1008604045972709e-05, + "loss": 0.4738, + "step": 3877 + }, + { + "epoch": 0.6894222222222223, + "grad_norm": 0.3151184618473053, + "learning_rate": 1.099703504302547e-05, + "loss": 0.4888, + "step": 3878 + }, + { + "epoch": 0.6896, + "grad_norm": 0.34451162815093994, + "learning_rate": 1.0985470407998296e-05, + "loss": 0.5971, + "step": 3879 + }, + { + "epoch": 0.6897777777777778, + "grad_norm": 0.2798691987991333, + "learning_rate": 1.0973910144498534e-05, + "loss": 0.3761, + "step": 3880 + }, + { + "epoch": 0.6899555555555555, + "grad_norm": 0.2597362697124481, + "learning_rate": 1.0962354256132141e-05, + "loss": 0.3171, + "step": 3881 + }, + { + "epoch": 0.6901333333333334, + "grad_norm": 0.2765495479106903, + "learning_rate": 1.095080274650374e-05, + "loss": 0.5342, + "step": 3882 + }, + { + "epoch": 0.6903111111111111, + "grad_norm": 0.3069227337837219, + "learning_rate": 1.093925561921657e-05, + "loss": 0.7408, + "step": 3883 + }, + { + "epoch": 0.6904888888888889, + "grad_norm": 0.2968656122684479, + "learning_rate": 1.0927712877872504e-05, + "loss": 0.4841, + "step": 3884 + }, + { + "epoch": 0.6906666666666667, + "grad_norm": 0.3325310945510864, + "learning_rate": 1.091617452607204e-05, + "loss": 0.4677, + "step": 3885 + }, + { + "epoch": 0.6908444444444445, + "grad_norm": 0.345062255859375, + "learning_rate": 1.0904640567414332e-05, + "loss": 0.4899, + "step": 3886 + }, + { + "epoch": 0.6910222222222222, + "grad_norm": 0.27838656306266785, + "learning_rate": 1.089311100549714e-05, + "loss": 0.5489, + "step": 3887 + }, + { + "epoch": 0.6912, + "grad_norm": 0.3260872960090637, + "learning_rate": 1.0881585843916859e-05, + "loss": 0.5351, + "step": 3888 + }, + { + "epoch": 0.6913777777777778, + "grad_norm": 0.31912434101104736, + "learning_rate": 1.0870065086268505e-05, + "loss": 0.4479, + "step": 3889 + }, + { + "epoch": 0.6915555555555556, + "grad_norm": 0.2957444190979004, + "learning_rate": 1.0858548736145744e-05, + "loss": 0.3537, + "step": 3890 + }, + { + "epoch": 0.6917333333333333, + "grad_norm": 0.26471593976020813, + "learning_rate": 1.0847036797140831e-05, + "loss": 0.2969, + "step": 3891 + }, + { + "epoch": 0.6919111111111111, + "grad_norm": 0.30726662278175354, + "learning_rate": 1.0835529272844694e-05, + "loss": 0.3737, + "step": 3892 + }, + { + "epoch": 0.6920888888888889, + "grad_norm": 0.35442402958869934, + "learning_rate": 1.0824026166846818e-05, + "loss": 0.467, + "step": 3893 + }, + { + "epoch": 0.6922666666666667, + "grad_norm": 0.435217946767807, + "learning_rate": 1.0812527482735377e-05, + "loss": 0.4258, + "step": 3894 + }, + { + "epoch": 0.6924444444444444, + "grad_norm": 0.3302832543849945, + "learning_rate": 1.0801033224097109e-05, + "loss": 0.3952, + "step": 3895 + }, + { + "epoch": 0.6926222222222223, + "grad_norm": 0.49825379252433777, + "learning_rate": 1.0789543394517435e-05, + "loss": 0.4785, + "step": 3896 + }, + { + "epoch": 0.6928, + "grad_norm": 0.49600350856781006, + "learning_rate": 1.0778057997580313e-05, + "loss": 0.4827, + "step": 3897 + }, + { + "epoch": 0.6929777777777778, + "grad_norm": 0.3542730212211609, + "learning_rate": 1.0766577036868395e-05, + "loss": 0.4403, + "step": 3898 + }, + { + "epoch": 0.6931555555555555, + "grad_norm": 0.4241696298122406, + "learning_rate": 1.0755100515962893e-05, + "loss": 0.4236, + "step": 3899 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 0.49910980463027954, + "learning_rate": 1.074362843844368e-05, + "loss": 0.4648, + "step": 3900 + }, + { + "epoch": 0.6935111111111111, + "grad_norm": 0.3297288715839386, + "learning_rate": 1.0732160807889211e-05, + "loss": 0.6461, + "step": 3901 + }, + { + "epoch": 0.6936888888888889, + "grad_norm": 0.2836495041847229, + "learning_rate": 1.0720697627876564e-05, + "loss": 0.5754, + "step": 3902 + }, + { + "epoch": 0.6938666666666666, + "grad_norm": 0.262487530708313, + "learning_rate": 1.070923890198142e-05, + "loss": 0.495, + "step": 3903 + }, + { + "epoch": 0.6940444444444445, + "grad_norm": 0.2773379981517792, + "learning_rate": 1.0697784633778094e-05, + "loss": 0.4715, + "step": 3904 + }, + { + "epoch": 0.6942222222222222, + "grad_norm": 0.2876657247543335, + "learning_rate": 1.068633482683949e-05, + "loss": 0.476, + "step": 3905 + }, + { + "epoch": 0.6944, + "grad_norm": 0.30171671509742737, + "learning_rate": 1.0674889484737125e-05, + "loss": 0.4014, + "step": 3906 + }, + { + "epoch": 0.6945777777777777, + "grad_norm": 0.2897906005382538, + "learning_rate": 1.0663448611041113e-05, + "loss": 0.3261, + "step": 3907 + }, + { + "epoch": 0.6947555555555556, + "grad_norm": 0.2666492462158203, + "learning_rate": 1.0652012209320205e-05, + "loss": 0.4927, + "step": 3908 + }, + { + "epoch": 0.6949333333333333, + "grad_norm": 0.3371738791465759, + "learning_rate": 1.0640580283141731e-05, + "loss": 0.5, + "step": 3909 + }, + { + "epoch": 0.6951111111111111, + "grad_norm": 0.3997240662574768, + "learning_rate": 1.0629152836071631e-05, + "loss": 0.5473, + "step": 3910 + }, + { + "epoch": 0.6952888888888888, + "grad_norm": 0.2384890913963318, + "learning_rate": 1.0617729871674436e-05, + "loss": 0.4659, + "step": 3911 + }, + { + "epoch": 0.6954666666666667, + "grad_norm": 0.2533833384513855, + "learning_rate": 1.0606311393513314e-05, + "loss": 0.5839, + "step": 3912 + }, + { + "epoch": 0.6956444444444444, + "grad_norm": 0.2785697281360626, + "learning_rate": 1.0594897405149995e-05, + "loss": 0.5385, + "step": 3913 + }, + { + "epoch": 0.6958222222222222, + "grad_norm": 0.271124005317688, + "learning_rate": 1.0583487910144829e-05, + "loss": 0.5092, + "step": 3914 + }, + { + "epoch": 0.696, + "grad_norm": 0.2684361934661865, + "learning_rate": 1.057208291205675e-05, + "loss": 0.508, + "step": 3915 + }, + { + "epoch": 0.6961777777777778, + "grad_norm": 0.30014467239379883, + "learning_rate": 1.0560682414443315e-05, + "loss": 0.3673, + "step": 3916 + }, + { + "epoch": 0.6963555555555555, + "grad_norm": 0.25053468346595764, + "learning_rate": 1.0549286420860643e-05, + "loss": 0.5815, + "step": 3917 + }, + { + "epoch": 0.6965333333333333, + "grad_norm": 0.2919447124004364, + "learning_rate": 1.05378949348635e-05, + "loss": 0.5893, + "step": 3918 + }, + { + "epoch": 0.6967111111111111, + "grad_norm": 0.26793912053108215, + "learning_rate": 1.0526507960005164e-05, + "loss": 0.4098, + "step": 3919 + }, + { + "epoch": 0.6968888888888889, + "grad_norm": 0.2784053683280945, + "learning_rate": 1.0515125499837592e-05, + "loss": 0.5059, + "step": 3920 + }, + { + "epoch": 0.6970666666666666, + "grad_norm": 0.27602267265319824, + "learning_rate": 1.050374755791127e-05, + "loss": 0.414, + "step": 3921 + }, + { + "epoch": 0.6972444444444444, + "grad_norm": 0.2508735954761505, + "learning_rate": 1.0492374137775318e-05, + "loss": 0.532, + "step": 3922 + }, + { + "epoch": 0.6974222222222223, + "grad_norm": 0.30002692341804504, + "learning_rate": 1.048100524297742e-05, + "loss": 0.5193, + "step": 3923 + }, + { + "epoch": 0.6976, + "grad_norm": 0.273262083530426, + "learning_rate": 1.0469640877063846e-05, + "loss": 0.4748, + "step": 3924 + }, + { + "epoch": 0.6977777777777778, + "grad_norm": 0.21585173904895782, + "learning_rate": 1.0458281043579482e-05, + "loss": 0.4785, + "step": 3925 + }, + { + "epoch": 0.6979555555555556, + "grad_norm": 0.30065637826919556, + "learning_rate": 1.0446925746067768e-05, + "loss": 0.4107, + "step": 3926 + }, + { + "epoch": 0.6981333333333334, + "grad_norm": 0.26831141114234924, + "learning_rate": 1.0435574988070742e-05, + "loss": 0.4368, + "step": 3927 + }, + { + "epoch": 0.6983111111111111, + "grad_norm": 0.3368549346923828, + "learning_rate": 1.0424228773129019e-05, + "loss": 0.5007, + "step": 3928 + }, + { + "epoch": 0.6984888888888889, + "grad_norm": 0.2814876437187195, + "learning_rate": 1.041288710478182e-05, + "loss": 0.6592, + "step": 3929 + }, + { + "epoch": 0.6986666666666667, + "grad_norm": 0.24441933631896973, + "learning_rate": 1.0401549986566927e-05, + "loss": 0.5207, + "step": 3930 + }, + { + "epoch": 0.6988444444444445, + "grad_norm": 0.22328375279903412, + "learning_rate": 1.03902174220207e-05, + "loss": 0.4438, + "step": 3931 + }, + { + "epoch": 0.6990222222222222, + "grad_norm": 0.2711698114871979, + "learning_rate": 1.0378889414678086e-05, + "loss": 0.4922, + "step": 3932 + }, + { + "epoch": 0.6992, + "grad_norm": 0.27595457434654236, + "learning_rate": 1.0367565968072619e-05, + "loss": 0.4841, + "step": 3933 + }, + { + "epoch": 0.6993777777777778, + "grad_norm": 0.3372577428817749, + "learning_rate": 1.0356247085736386e-05, + "loss": 0.5467, + "step": 3934 + }, + { + "epoch": 0.6995555555555556, + "grad_norm": 0.3112560212612152, + "learning_rate": 1.0344932771200097e-05, + "loss": 0.458, + "step": 3935 + }, + { + "epoch": 0.6997333333333333, + "grad_norm": 0.27395960688591003, + "learning_rate": 1.033362302799297e-05, + "loss": 0.4949, + "step": 3936 + }, + { + "epoch": 0.6999111111111112, + "grad_norm": 0.23405636847019196, + "learning_rate": 1.0322317859642851e-05, + "loss": 0.475, + "step": 3937 + }, + { + "epoch": 0.7000888888888889, + "grad_norm": 0.2661612629890442, + "learning_rate": 1.0311017269676135e-05, + "loss": 0.4796, + "step": 3938 + }, + { + "epoch": 0.7002666666666667, + "grad_norm": 0.2639073133468628, + "learning_rate": 1.029972126161781e-05, + "loss": 0.4593, + "step": 3939 + }, + { + "epoch": 0.7004444444444444, + "grad_norm": 0.28875625133514404, + "learning_rate": 1.0288429838991403e-05, + "loss": 0.5401, + "step": 3940 + }, + { + "epoch": 0.7006222222222223, + "grad_norm": 0.3352855145931244, + "learning_rate": 1.0277143005319038e-05, + "loss": 0.4109, + "step": 3941 + }, + { + "epoch": 0.7008, + "grad_norm": 0.24676311016082764, + "learning_rate": 1.026586076412138e-05, + "loss": 0.3773, + "step": 3942 + }, + { + "epoch": 0.7009777777777778, + "grad_norm": 0.26245036721229553, + "learning_rate": 1.0254583118917698e-05, + "loss": 0.4175, + "step": 3943 + }, + { + "epoch": 0.7011555555555555, + "grad_norm": 0.3014986515045166, + "learning_rate": 1.0243310073225807e-05, + "loss": 0.4087, + "step": 3944 + }, + { + "epoch": 0.7013333333333334, + "grad_norm": 0.3380795121192932, + "learning_rate": 1.0232041630562078e-05, + "loss": 0.4941, + "step": 3945 + }, + { + "epoch": 0.7015111111111111, + "grad_norm": 0.27835503220558167, + "learning_rate": 1.022077779444145e-05, + "loss": 0.4396, + "step": 3946 + }, + { + "epoch": 0.7016888888888889, + "grad_norm": 0.33500486612319946, + "learning_rate": 1.0209518568377451e-05, + "loss": 0.3222, + "step": 3947 + }, + { + "epoch": 0.7018666666666666, + "grad_norm": 0.3301418125629425, + "learning_rate": 1.0198263955882145e-05, + "loss": 0.3641, + "step": 3948 + }, + { + "epoch": 0.7020444444444445, + "grad_norm": 0.33927637338638306, + "learning_rate": 1.018701396046616e-05, + "loss": 0.4656, + "step": 3949 + }, + { + "epoch": 0.7022222222222222, + "grad_norm": 0.5447875261306763, + "learning_rate": 1.0175768585638675e-05, + "loss": 0.5048, + "step": 3950 + }, + { + "epoch": 0.7024, + "grad_norm": 0.3373758792877197, + "learning_rate": 1.0164527834907467e-05, + "loss": 0.5523, + "step": 3951 + }, + { + "epoch": 0.7025777777777777, + "grad_norm": 0.2564699351787567, + "learning_rate": 1.0153291711778826e-05, + "loss": 0.3867, + "step": 3952 + }, + { + "epoch": 0.7027555555555556, + "grad_norm": 0.2382597029209137, + "learning_rate": 1.0142060219757619e-05, + "loss": 0.4174, + "step": 3953 + }, + { + "epoch": 0.7029333333333333, + "grad_norm": 0.25260260701179504, + "learning_rate": 1.0130833362347256e-05, + "loss": 0.4516, + "step": 3954 + }, + { + "epoch": 0.7031111111111111, + "grad_norm": 0.3911687433719635, + "learning_rate": 1.0119611143049732e-05, + "loss": 0.668, + "step": 3955 + }, + { + "epoch": 0.7032888888888889, + "grad_norm": 0.2767343819141388, + "learning_rate": 1.0108393565365551e-05, + "loss": 0.4035, + "step": 3956 + }, + { + "epoch": 0.7034666666666667, + "grad_norm": 0.28357967734336853, + "learning_rate": 1.0097180632793821e-05, + "loss": 0.6356, + "step": 3957 + }, + { + "epoch": 0.7036444444444444, + "grad_norm": 0.30080461502075195, + "learning_rate": 1.0085972348832137e-05, + "loss": 0.6762, + "step": 3958 + }, + { + "epoch": 0.7038222222222222, + "grad_norm": 0.29303720593452454, + "learning_rate": 1.0074768716976704e-05, + "loss": 0.519, + "step": 3959 + }, + { + "epoch": 0.704, + "grad_norm": 0.22744840383529663, + "learning_rate": 1.0063569740722237e-05, + "loss": 0.4098, + "step": 3960 + }, + { + "epoch": 0.7041777777777778, + "grad_norm": 0.3323523998260498, + "learning_rate": 1.0052375423562038e-05, + "loss": 0.3523, + "step": 3961 + }, + { + "epoch": 0.7043555555555555, + "grad_norm": 0.319697767496109, + "learning_rate": 1.0041185768987893e-05, + "loss": 0.4433, + "step": 3962 + }, + { + "epoch": 0.7045333333333333, + "grad_norm": 0.2916536033153534, + "learning_rate": 1.0030000780490199e-05, + "loss": 0.5115, + "step": 3963 + }, + { + "epoch": 0.7047111111111111, + "grad_norm": 0.2669242322444916, + "learning_rate": 1.0018820461557851e-05, + "loss": 0.4193, + "step": 3964 + }, + { + "epoch": 0.7048888888888889, + "grad_norm": 0.2510354518890381, + "learning_rate": 1.0007644815678326e-05, + "loss": 0.5273, + "step": 3965 + }, + { + "epoch": 0.7050666666666666, + "grad_norm": 0.2935733497142792, + "learning_rate": 9.996473846337614e-06, + "loss": 0.4204, + "step": 3966 + }, + { + "epoch": 0.7052444444444445, + "grad_norm": 0.2379063367843628, + "learning_rate": 9.985307557020257e-06, + "loss": 0.4766, + "step": 3967 + }, + { + "epoch": 0.7054222222222222, + "grad_norm": 0.27737000584602356, + "learning_rate": 9.974145951209324e-06, + "loss": 0.4928, + "step": 3968 + }, + { + "epoch": 0.7056, + "grad_norm": 0.3073188364505768, + "learning_rate": 9.962989032386453e-06, + "loss": 0.4416, + "step": 3969 + }, + { + "epoch": 0.7057777777777777, + "grad_norm": 0.3411184251308441, + "learning_rate": 9.951836804031794e-06, + "loss": 0.5395, + "step": 3970 + }, + { + "epoch": 0.7059555555555556, + "grad_norm": 0.3046908378601074, + "learning_rate": 9.94068926962404e-06, + "loss": 0.6235, + "step": 3971 + }, + { + "epoch": 0.7061333333333333, + "grad_norm": 0.2705841660499573, + "learning_rate": 9.929546432640419e-06, + "loss": 0.4344, + "step": 3972 + }, + { + "epoch": 0.7063111111111111, + "grad_norm": 0.2895147204399109, + "learning_rate": 9.918408296556706e-06, + "loss": 0.5561, + "step": 3973 + }, + { + "epoch": 0.7064888888888889, + "grad_norm": 0.29335781931877136, + "learning_rate": 9.907274864847197e-06, + "loss": 0.3912, + "step": 3974 + }, + { + "epoch": 0.7066666666666667, + "grad_norm": 0.30020835995674133, + "learning_rate": 9.89614614098471e-06, + "loss": 0.352, + "step": 3975 + }, + { + "epoch": 0.7068444444444445, + "grad_norm": 0.27374789118766785, + "learning_rate": 9.88502212844063e-06, + "loss": 0.399, + "step": 3976 + }, + { + "epoch": 0.7070222222222222, + "grad_norm": 0.26845628023147583, + "learning_rate": 9.873902830684831e-06, + "loss": 0.5093, + "step": 3977 + }, + { + "epoch": 0.7072, + "grad_norm": 0.3367484509944916, + "learning_rate": 9.862788251185764e-06, + "loss": 0.4553, + "step": 3978 + }, + { + "epoch": 0.7073777777777778, + "grad_norm": 0.3098890781402588, + "learning_rate": 9.851678393410343e-06, + "loss": 0.4598, + "step": 3979 + }, + { + "epoch": 0.7075555555555556, + "grad_norm": 0.26454684138298035, + "learning_rate": 9.840573260824074e-06, + "loss": 0.4783, + "step": 3980 + }, + { + "epoch": 0.7077333333333333, + "grad_norm": 0.26476532220840454, + "learning_rate": 9.829472856890942e-06, + "loss": 0.4008, + "step": 3981 + }, + { + "epoch": 0.7079111111111112, + "grad_norm": 0.28903499245643616, + "learning_rate": 9.818377185073493e-06, + "loss": 0.6409, + "step": 3982 + }, + { + "epoch": 0.7080888888888889, + "grad_norm": 0.24806906282901764, + "learning_rate": 9.807286248832778e-06, + "loss": 0.4024, + "step": 3983 + }, + { + "epoch": 0.7082666666666667, + "grad_norm": 0.28311246633529663, + "learning_rate": 9.796200051628365e-06, + "loss": 0.5686, + "step": 3984 + }, + { + "epoch": 0.7084444444444444, + "grad_norm": 0.23541559278964996, + "learning_rate": 9.78511859691835e-06, + "loss": 0.4724, + "step": 3985 + }, + { + "epoch": 0.7086222222222223, + "grad_norm": 0.2656582295894623, + "learning_rate": 9.774041888159364e-06, + "loss": 0.5124, + "step": 3986 + }, + { + "epoch": 0.7088, + "grad_norm": 0.29616108536720276, + "learning_rate": 9.762969928806536e-06, + "loss": 0.49, + "step": 3987 + }, + { + "epoch": 0.7089777777777778, + "grad_norm": 0.2778891324996948, + "learning_rate": 9.751902722313527e-06, + "loss": 0.3985, + "step": 3988 + }, + { + "epoch": 0.7091555555555555, + "grad_norm": 0.3022860288619995, + "learning_rate": 9.740840272132498e-06, + "loss": 0.6038, + "step": 3989 + }, + { + "epoch": 0.7093333333333334, + "grad_norm": 0.28284740447998047, + "learning_rate": 9.729782581714161e-06, + "loss": 0.4811, + "step": 3990 + }, + { + "epoch": 0.7095111111111111, + "grad_norm": 0.3301031291484833, + "learning_rate": 9.718729654507713e-06, + "loss": 0.5589, + "step": 3991 + }, + { + "epoch": 0.7096888888888889, + "grad_norm": 0.24755598604679108, + "learning_rate": 9.707681493960868e-06, + "loss": 0.5294, + "step": 3992 + }, + { + "epoch": 0.7098666666666666, + "grad_norm": 0.2882211208343506, + "learning_rate": 9.696638103519859e-06, + "loss": 0.3641, + "step": 3993 + }, + { + "epoch": 0.7100444444444445, + "grad_norm": 0.2764646112918854, + "learning_rate": 9.685599486629444e-06, + "loss": 0.5061, + "step": 3994 + }, + { + "epoch": 0.7102222222222222, + "grad_norm": 0.2578766345977783, + "learning_rate": 9.674565646732867e-06, + "loss": 0.3381, + "step": 3995 + }, + { + "epoch": 0.7104, + "grad_norm": 0.3789452910423279, + "learning_rate": 9.663536587271902e-06, + "loss": 0.4141, + "step": 3996 + }, + { + "epoch": 0.7105777777777778, + "grad_norm": 0.31487780809402466, + "learning_rate": 9.652512311686809e-06, + "loss": 0.4444, + "step": 3997 + }, + { + "epoch": 0.7107555555555556, + "grad_norm": 0.3755534291267395, + "learning_rate": 9.64149282341639e-06, + "loss": 0.567, + "step": 3998 + }, + { + "epoch": 0.7109333333333333, + "grad_norm": 0.3619266748428345, + "learning_rate": 9.630478125897919e-06, + "loss": 0.4207, + "step": 3999 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 0.44275200366973877, + "learning_rate": 9.619468222567215e-06, + "loss": 0.3985, + "step": 4000 + } + ], + "logging_steps": 1, + "max_steps": 5625, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.555555625273098e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}