diff --git "a/adapter_info/trainer_state.json" "b/adapter_info/trainer_state.json" new file mode 100644--- /dev/null +++ "b/adapter_info/trainer_state.json" @@ -0,0 +1,10766 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1532, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008071025020177562, + "grad_norm": 0.09123530238866806, + "learning_rate": 5.263157894736843e-07, + "loss": 2.2797, + "step": 1 + }, + { + "epoch": 0.0016142050040355124, + "grad_norm": 0.09823115170001984, + "learning_rate": 1.0526315789473685e-06, + "loss": 2.2867, + "step": 2 + }, + { + "epoch": 0.002421307506053269, + "grad_norm": 0.09137655049562454, + "learning_rate": 1.5789473684210526e-06, + "loss": 2.2867, + "step": 3 + }, + { + "epoch": 0.003228410008071025, + "grad_norm": 0.10211791098117828, + "learning_rate": 2.105263157894737e-06, + "loss": 2.4699, + "step": 4 + }, + { + "epoch": 0.004035512510088781, + "grad_norm": 0.09525001049041748, + "learning_rate": 2.631578947368421e-06, + "loss": 2.4167, + "step": 5 + }, + { + "epoch": 0.004842615012106538, + "grad_norm": 0.10257679224014282, + "learning_rate": 3.157894736842105e-06, + "loss": 2.505, + "step": 6 + }, + { + "epoch": 0.005649717514124294, + "grad_norm": 0.10152263939380646, + "learning_rate": 3.6842105263157896e-06, + "loss": 2.2791, + "step": 7 + }, + { + "epoch": 0.00645682001614205, + "grad_norm": 0.10086437314748764, + "learning_rate": 4.210526315789474e-06, + "loss": 2.3387, + "step": 8 + }, + { + "epoch": 0.007263922518159807, + "grad_norm": 0.10246654599905014, + "learning_rate": 4.736842105263158e-06, + "loss": 2.4762, + "step": 9 + }, + { + "epoch": 0.008071025020177562, + "grad_norm": 0.09224504977464676, + "learning_rate": 5.263157894736842e-06, + "loss": 2.1899, + "step": 10 + }, + { + "epoch": 0.00887812752219532, + "grad_norm": 0.09916076809167862, + "learning_rate": 5.789473684210527e-06, + "loss": 2.3367, + "step": 11 + }, + { + "epoch": 0.009685230024213076, + "grad_norm": 0.09999067336320877, + "learning_rate": 6.31578947368421e-06, + "loss": 2.3378, + "step": 12 + }, + { + "epoch": 0.010492332526230832, + "grad_norm": 0.0847872793674469, + "learning_rate": 6.842105263157896e-06, + "loss": 2.1473, + "step": 13 + }, + { + "epoch": 0.011299435028248588, + "grad_norm": 0.09087894856929779, + "learning_rate": 7.368421052631579e-06, + "loss": 2.2322, + "step": 14 + }, + { + "epoch": 0.012106537530266344, + "grad_norm": 0.09416953474283218, + "learning_rate": 7.894736842105265e-06, + "loss": 2.2214, + "step": 15 + }, + { + "epoch": 0.0129136400322841, + "grad_norm": 0.0900375097990036, + "learning_rate": 8.421052631578948e-06, + "loss": 2.0069, + "step": 16 + }, + { + "epoch": 0.013720742534301856, + "grad_norm": 0.09621618688106537, + "learning_rate": 8.947368421052632e-06, + "loss": 2.0504, + "step": 17 + }, + { + "epoch": 0.014527845036319613, + "grad_norm": 0.08563732355833054, + "learning_rate": 9.473684210526315e-06, + "loss": 1.8789, + "step": 18 + }, + { + "epoch": 0.01533494753833737, + "grad_norm": 0.09274489432573318, + "learning_rate": 1e-05, + "loss": 1.8559, + "step": 19 + }, + { + "epoch": 0.016142050040355124, + "grad_norm": 0.08004423975944519, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.7343, + "step": 20 + }, + { + "epoch": 0.01694915254237288, + "grad_norm": 0.07666365802288055, + "learning_rate": 1.105263157894737e-05, + "loss": 1.6508, + "step": 21 + }, + { + "epoch": 0.01775625504439064, + "grad_norm": 0.07832133024930954, + "learning_rate": 1.1578947368421053e-05, + "loss": 1.4855, + "step": 22 + }, + { + "epoch": 0.018563357546408393, + "grad_norm": 0.09107667207717896, + "learning_rate": 1.2105263157894737e-05, + "loss": 1.8072, + "step": 23 + }, + { + "epoch": 0.01937046004842615, + "grad_norm": 0.07449404150247574, + "learning_rate": 1.263157894736842e-05, + "loss": 1.4041, + "step": 24 + }, + { + "epoch": 0.020177562550443905, + "grad_norm": 0.0739971473813057, + "learning_rate": 1.3157894736842108e-05, + "loss": 1.4815, + "step": 25 + }, + { + "epoch": 0.020984665052461663, + "grad_norm": 0.06932147592306137, + "learning_rate": 1.3684210526315791e-05, + "loss": 1.2558, + "step": 26 + }, + { + "epoch": 0.021791767554479417, + "grad_norm": 0.07036187499761581, + "learning_rate": 1.4210526315789475e-05, + "loss": 1.1721, + "step": 27 + }, + { + "epoch": 0.022598870056497175, + "grad_norm": 0.0687093511223793, + "learning_rate": 1.4736842105263159e-05, + "loss": 1.2257, + "step": 28 + }, + { + "epoch": 0.023405972558514933, + "grad_norm": 0.07659783214330673, + "learning_rate": 1.5263157894736846e-05, + "loss": 1.1707, + "step": 29 + }, + { + "epoch": 0.024213075060532687, + "grad_norm": 0.06630632281303406, + "learning_rate": 1.578947368421053e-05, + "loss": 1.0853, + "step": 30 + }, + { + "epoch": 0.025020177562550445, + "grad_norm": 0.08422867953777313, + "learning_rate": 1.6315789473684213e-05, + "loss": 1.1875, + "step": 31 + }, + { + "epoch": 0.0258272800645682, + "grad_norm": 0.06504432111978531, + "learning_rate": 1.6842105263157896e-05, + "loss": 1.0025, + "step": 32 + }, + { + "epoch": 0.026634382566585957, + "grad_norm": 0.06000803783535957, + "learning_rate": 1.736842105263158e-05, + "loss": 0.9418, + "step": 33 + }, + { + "epoch": 0.02744148506860371, + "grad_norm": 0.06218458712100983, + "learning_rate": 1.7894736842105264e-05, + "loss": 0.9268, + "step": 34 + }, + { + "epoch": 0.02824858757062147, + "grad_norm": 0.05695388466119766, + "learning_rate": 1.8421052631578947e-05, + "loss": 0.9855, + "step": 35 + }, + { + "epoch": 0.029055690072639227, + "grad_norm": 0.05278893560171127, + "learning_rate": 1.894736842105263e-05, + "loss": 0.9472, + "step": 36 + }, + { + "epoch": 0.02986279257465698, + "grad_norm": 0.05200784653425217, + "learning_rate": 1.9473684210526318e-05, + "loss": 0.8339, + "step": 37 + }, + { + "epoch": 0.03066989507667474, + "grad_norm": 0.05492912605404854, + "learning_rate": 2e-05, + "loss": 0.9994, + "step": 38 + }, + { + "epoch": 0.031476997578692496, + "grad_norm": 0.049852605909109116, + "learning_rate": 1.9999965787604234e-05, + "loss": 0.8292, + "step": 39 + }, + { + "epoch": 0.03228410008071025, + "grad_norm": 0.05501826852560043, + "learning_rate": 1.999986315065103e-05, + "loss": 0.8816, + "step": 40 + }, + { + "epoch": 0.033091202582728005, + "grad_norm": 0.055790241807699203, + "learning_rate": 1.9999692089842683e-05, + "loss": 0.8772, + "step": 41 + }, + { + "epoch": 0.03389830508474576, + "grad_norm": 0.05849922075867653, + "learning_rate": 1.9999452606349666e-05, + "loss": 0.8292, + "step": 42 + }, + { + "epoch": 0.03470540758676352, + "grad_norm": 0.04576743021607399, + "learning_rate": 1.999914470181065e-05, + "loss": 0.8168, + "step": 43 + }, + { + "epoch": 0.03551251008878128, + "grad_norm": 0.0529194213449955, + "learning_rate": 1.9998768378332455e-05, + "loss": 0.8172, + "step": 44 + }, + { + "epoch": 0.03631961259079903, + "grad_norm": 0.08031924813985825, + "learning_rate": 1.9998323638490072e-05, + "loss": 0.7076, + "step": 45 + }, + { + "epoch": 0.03712671509281679, + "grad_norm": 0.05479391664266586, + "learning_rate": 1.9997810485326627e-05, + "loss": 0.8023, + "step": 46 + }, + { + "epoch": 0.037933817594834544, + "grad_norm": 0.04970843344926834, + "learning_rate": 1.9997228922353352e-05, + "loss": 0.779, + "step": 47 + }, + { + "epoch": 0.0387409200968523, + "grad_norm": 0.03704531863331795, + "learning_rate": 1.9996578953549584e-05, + "loss": 0.7064, + "step": 48 + }, + { + "epoch": 0.03954802259887006, + "grad_norm": 0.04975225776433945, + "learning_rate": 1.9995860583362724e-05, + "loss": 0.7547, + "step": 49 + }, + { + "epoch": 0.04035512510088781, + "grad_norm": 0.043880075216293335, + "learning_rate": 1.9995073816708203e-05, + "loss": 0.6839, + "step": 50 + }, + { + "epoch": 0.04116222760290557, + "grad_norm": 0.053028006106615067, + "learning_rate": 1.999421865896945e-05, + "loss": 0.8114, + "step": 51 + }, + { + "epoch": 0.041969330104923326, + "grad_norm": 0.0408385694026947, + "learning_rate": 1.999329511599787e-05, + "loss": 0.72, + "step": 52 + }, + { + "epoch": 0.042776432606941084, + "grad_norm": 0.040051594376564026, + "learning_rate": 1.9992303194112782e-05, + "loss": 0.7465, + "step": 53 + }, + { + "epoch": 0.043583535108958835, + "grad_norm": 0.03900357335805893, + "learning_rate": 1.99912429001014e-05, + "loss": 0.8, + "step": 54 + }, + { + "epoch": 0.04439063761097659, + "grad_norm": 0.03680373355746269, + "learning_rate": 1.9990114241218753e-05, + "loss": 0.6754, + "step": 55 + }, + { + "epoch": 0.04519774011299435, + "grad_norm": 0.039670154452323914, + "learning_rate": 1.998891722518767e-05, + "loss": 0.744, + "step": 56 + }, + { + "epoch": 0.04600484261501211, + "grad_norm": 0.04500587657094002, + "learning_rate": 1.998765186019871e-05, + "loss": 0.7246, + "step": 57 + }, + { + "epoch": 0.046811945117029866, + "grad_norm": 0.035050880163908005, + "learning_rate": 1.9986318154910106e-05, + "loss": 0.7131, + "step": 58 + }, + { + "epoch": 0.047619047619047616, + "grad_norm": 0.03428708761930466, + "learning_rate": 1.9984916118447704e-05, + "loss": 0.718, + "step": 59 + }, + { + "epoch": 0.048426150121065374, + "grad_norm": 0.03644852712750435, + "learning_rate": 1.9983445760404915e-05, + "loss": 0.7542, + "step": 60 + }, + { + "epoch": 0.04923325262308313, + "grad_norm": 0.032885193824768066, + "learning_rate": 1.998190709084263e-05, + "loss": 0.6672, + "step": 61 + }, + { + "epoch": 0.05004035512510089, + "grad_norm": 0.037364475429058075, + "learning_rate": 1.9980300120289164e-05, + "loss": 0.7374, + "step": 62 + }, + { + "epoch": 0.05084745762711865, + "grad_norm": 0.0386485680937767, + "learning_rate": 1.997862485974018e-05, + "loss": 0.7086, + "step": 63 + }, + { + "epoch": 0.0516545601291364, + "grad_norm": 0.04049423336982727, + "learning_rate": 1.9976881320658617e-05, + "loss": 0.7577, + "step": 64 + }, + { + "epoch": 0.052461662631154156, + "grad_norm": 0.03641749545931816, + "learning_rate": 1.99750695149746e-05, + "loss": 0.6173, + "step": 65 + }, + { + "epoch": 0.053268765133171914, + "grad_norm": 0.04646143317222595, + "learning_rate": 1.9973189455085368e-05, + "loss": 0.682, + "step": 66 + }, + { + "epoch": 0.05407586763518967, + "grad_norm": 0.03728644922375679, + "learning_rate": 1.9971241153855202e-05, + "loss": 0.6916, + "step": 67 + }, + { + "epoch": 0.05488297013720742, + "grad_norm": 0.03582463040947914, + "learning_rate": 1.9969224624615306e-05, + "loss": 0.5859, + "step": 68 + }, + { + "epoch": 0.05569007263922518, + "grad_norm": 0.038537513464689255, + "learning_rate": 1.996713988116374e-05, + "loss": 0.6288, + "step": 69 + }, + { + "epoch": 0.05649717514124294, + "grad_norm": 0.04025536775588989, + "learning_rate": 1.996498693776532e-05, + "loss": 0.6208, + "step": 70 + }, + { + "epoch": 0.057304277643260695, + "grad_norm": 0.04114150255918503, + "learning_rate": 1.996276580915151e-05, + "loss": 0.669, + "step": 71 + }, + { + "epoch": 0.05811138014527845, + "grad_norm": 0.03860090300440788, + "learning_rate": 1.996047651052034e-05, + "loss": 0.603, + "step": 72 + }, + { + "epoch": 0.058918482647296204, + "grad_norm": 0.041348908096551895, + "learning_rate": 1.995811905753629e-05, + "loss": 0.648, + "step": 73 + }, + { + "epoch": 0.05972558514931396, + "grad_norm": 0.048039790242910385, + "learning_rate": 1.9955693466330184e-05, + "loss": 0.7023, + "step": 74 + }, + { + "epoch": 0.06053268765133172, + "grad_norm": 0.037045303732156754, + "learning_rate": 1.9953199753499074e-05, + "loss": 0.6015, + "step": 75 + }, + { + "epoch": 0.06133979015334948, + "grad_norm": 0.042891182005405426, + "learning_rate": 1.9950637936106142e-05, + "loss": 0.6587, + "step": 76 + }, + { + "epoch": 0.062146892655367235, + "grad_norm": 0.03565942123532295, + "learning_rate": 1.994800803168057e-05, + "loss": 0.6205, + "step": 77 + }, + { + "epoch": 0.06295399515738499, + "grad_norm": 0.03730818256735802, + "learning_rate": 1.9945310058217422e-05, + "loss": 0.6147, + "step": 78 + }, + { + "epoch": 0.06376109765940274, + "grad_norm": 0.04279996454715729, + "learning_rate": 1.9942544034177523e-05, + "loss": 0.6845, + "step": 79 + }, + { + "epoch": 0.0645682001614205, + "grad_norm": 0.03682834282517433, + "learning_rate": 1.9939709978487344e-05, + "loss": 0.6147, + "step": 80 + }, + { + "epoch": 0.06537530266343826, + "grad_norm": 0.04337656497955322, + "learning_rate": 1.9936807910538844e-05, + "loss": 0.681, + "step": 81 + }, + { + "epoch": 0.06618240516545601, + "grad_norm": 0.0396178774535656, + "learning_rate": 1.9933837850189365e-05, + "loss": 0.5515, + "step": 82 + }, + { + "epoch": 0.06698950766747377, + "grad_norm": 0.04068434238433838, + "learning_rate": 1.993079981776148e-05, + "loss": 0.6166, + "step": 83 + }, + { + "epoch": 0.06779661016949153, + "grad_norm": 0.04743474721908569, + "learning_rate": 1.9927693834042867e-05, + "loss": 0.6416, + "step": 84 + }, + { + "epoch": 0.06860371267150928, + "grad_norm": 0.04356124997138977, + "learning_rate": 1.9924519920286153e-05, + "loss": 0.5854, + "step": 85 + }, + { + "epoch": 0.06941081517352704, + "grad_norm": 0.039751145988702774, + "learning_rate": 1.992127809820878e-05, + "loss": 0.5932, + "step": 86 + }, + { + "epoch": 0.07021791767554479, + "grad_norm": 0.038546741008758545, + "learning_rate": 1.9917968389992838e-05, + "loss": 0.6388, + "step": 87 + }, + { + "epoch": 0.07102502017756256, + "grad_norm": 0.0414685420691967, + "learning_rate": 1.9914590818284947e-05, + "loss": 0.6816, + "step": 88 + }, + { + "epoch": 0.07183212267958031, + "grad_norm": 0.038934748619794846, + "learning_rate": 1.991114540619607e-05, + "loss": 0.6178, + "step": 89 + }, + { + "epoch": 0.07263922518159806, + "grad_norm": 0.037200141698122025, + "learning_rate": 1.990763217730136e-05, + "loss": 0.5509, + "step": 90 + }, + { + "epoch": 0.07344632768361582, + "grad_norm": 0.03767746314406395, + "learning_rate": 1.9904051155640018e-05, + "loss": 0.6043, + "step": 91 + }, + { + "epoch": 0.07425343018563357, + "grad_norm": 0.03699817880988121, + "learning_rate": 1.990040236571511e-05, + "loss": 0.518, + "step": 92 + }, + { + "epoch": 0.07506053268765134, + "grad_norm": 0.04023592919111252, + "learning_rate": 1.9896685832493404e-05, + "loss": 0.5704, + "step": 93 + }, + { + "epoch": 0.07586763518966909, + "grad_norm": 0.03699226304888725, + "learning_rate": 1.98929015814052e-05, + "loss": 0.5447, + "step": 94 + }, + { + "epoch": 0.07667473769168684, + "grad_norm": 0.04335412383079529, + "learning_rate": 1.988904963834416e-05, + "loss": 0.5441, + "step": 95 + }, + { + "epoch": 0.0774818401937046, + "grad_norm": 0.04365302622318268, + "learning_rate": 1.988513002966712e-05, + "loss": 0.597, + "step": 96 + }, + { + "epoch": 0.07828894269572235, + "grad_norm": 0.040038615465164185, + "learning_rate": 1.9881142782193924e-05, + "loss": 0.5233, + "step": 97 + }, + { + "epoch": 0.07909604519774012, + "grad_norm": 0.03829483687877655, + "learning_rate": 1.987708792320723e-05, + "loss": 0.5462, + "step": 98 + }, + { + "epoch": 0.07990314769975787, + "grad_norm": 0.03859161585569382, + "learning_rate": 1.9872965480452326e-05, + "loss": 0.5514, + "step": 99 + }, + { + "epoch": 0.08071025020177562, + "grad_norm": 0.043581772595644, + "learning_rate": 1.986877548213694e-05, + "loss": 0.5961, + "step": 100 + }, + { + "epoch": 0.08151735270379339, + "grad_norm": 0.045486994087696075, + "learning_rate": 1.9864517956931046e-05, + "loss": 0.5943, + "step": 101 + }, + { + "epoch": 0.08232445520581114, + "grad_norm": 0.04024514555931091, + "learning_rate": 1.9860192933966674e-05, + "loss": 0.5931, + "step": 102 + }, + { + "epoch": 0.08313155770782889, + "grad_norm": 0.040237389504909515, + "learning_rate": 1.98558004428377e-05, + "loss": 0.5814, + "step": 103 + }, + { + "epoch": 0.08393866020984665, + "grad_norm": 0.045958422124385834, + "learning_rate": 1.9851340513599658e-05, + "loss": 0.6151, + "step": 104 + }, + { + "epoch": 0.0847457627118644, + "grad_norm": 0.04062805697321892, + "learning_rate": 1.9846813176769518e-05, + "loss": 0.539, + "step": 105 + }, + { + "epoch": 0.08555286521388217, + "grad_norm": 0.03963112458586693, + "learning_rate": 1.9842218463325488e-05, + "loss": 0.5272, + "step": 106 + }, + { + "epoch": 0.08635996771589992, + "grad_norm": 0.041944652795791626, + "learning_rate": 1.98375564047068e-05, + "loss": 0.6423, + "step": 107 + }, + { + "epoch": 0.08716707021791767, + "grad_norm": 0.03997455909848213, + "learning_rate": 1.9832827032813492e-05, + "loss": 0.6484, + "step": 108 + }, + { + "epoch": 0.08797417271993543, + "grad_norm": 0.03940528258681297, + "learning_rate": 1.9828030380006193e-05, + "loss": 0.5202, + "step": 109 + }, + { + "epoch": 0.08878127522195318, + "grad_norm": 0.03852485492825508, + "learning_rate": 1.9823166479105898e-05, + "loss": 0.4952, + "step": 110 + }, + { + "epoch": 0.08958837772397095, + "grad_norm": 0.04255465418100357, + "learning_rate": 1.9818235363393748e-05, + "loss": 0.5827, + "step": 111 + }, + { + "epoch": 0.0903954802259887, + "grad_norm": 0.038469936698675156, + "learning_rate": 1.9813237066610803e-05, + "loss": 0.5568, + "step": 112 + }, + { + "epoch": 0.09120258272800645, + "grad_norm": 0.04193663224577904, + "learning_rate": 1.9808171622957803e-05, + "loss": 0.5405, + "step": 113 + }, + { + "epoch": 0.09200968523002422, + "grad_norm": 0.04573473334312439, + "learning_rate": 1.9803039067094937e-05, + "loss": 0.5158, + "step": 114 + }, + { + "epoch": 0.09281678773204197, + "grad_norm": 0.04189401865005493, + "learning_rate": 1.9797839434141616e-05, + "loss": 0.5968, + "step": 115 + }, + { + "epoch": 0.09362389023405973, + "grad_norm": 0.04444560781121254, + "learning_rate": 1.9792572759676218e-05, + "loss": 0.5191, + "step": 116 + }, + { + "epoch": 0.09443099273607748, + "grad_norm": 0.04850579798221588, + "learning_rate": 1.9787239079735852e-05, + "loss": 0.563, + "step": 117 + }, + { + "epoch": 0.09523809523809523, + "grad_norm": 0.03827857971191406, + "learning_rate": 1.9781838430816117e-05, + "loss": 0.4721, + "step": 118 + }, + { + "epoch": 0.096045197740113, + "grad_norm": 0.041206639260053635, + "learning_rate": 1.9776370849870837e-05, + "loss": 0.5149, + "step": 119 + }, + { + "epoch": 0.09685230024213075, + "grad_norm": 0.041229840368032455, + "learning_rate": 1.9770836374311817e-05, + "loss": 0.5404, + "step": 120 + }, + { + "epoch": 0.09765940274414851, + "grad_norm": 0.046815477311611176, + "learning_rate": 1.9765235042008596e-05, + "loss": 0.5211, + "step": 121 + }, + { + "epoch": 0.09846650524616626, + "grad_norm": 0.043892793357372284, + "learning_rate": 1.9759566891288172e-05, + "loss": 0.5504, + "step": 122 + }, + { + "epoch": 0.09927360774818401, + "grad_norm": 0.041209544986486435, + "learning_rate": 1.9753831960934747e-05, + "loss": 0.531, + "step": 123 + }, + { + "epoch": 0.10008071025020178, + "grad_norm": 0.03901658579707146, + "learning_rate": 1.9748030290189466e-05, + "loss": 0.5191, + "step": 124 + }, + { + "epoch": 0.10088781275221953, + "grad_norm": 0.03842341527342796, + "learning_rate": 1.9742161918750135e-05, + "loss": 0.5228, + "step": 125 + }, + { + "epoch": 0.1016949152542373, + "grad_norm": 0.049568090587854385, + "learning_rate": 1.9736226886770966e-05, + "loss": 0.4958, + "step": 126 + }, + { + "epoch": 0.10250201775625505, + "grad_norm": 0.04007592052221298, + "learning_rate": 1.973022523486229e-05, + "loss": 0.5539, + "step": 127 + }, + { + "epoch": 0.1033091202582728, + "grad_norm": 0.0374893881380558, + "learning_rate": 1.972415700409029e-05, + "loss": 0.5441, + "step": 128 + }, + { + "epoch": 0.10411622276029056, + "grad_norm": 0.046349216252565384, + "learning_rate": 1.97180222359767e-05, + "loss": 0.5518, + "step": 129 + }, + { + "epoch": 0.10492332526230831, + "grad_norm": 0.04060063138604164, + "learning_rate": 1.9711820972498552e-05, + "loss": 0.4912, + "step": 130 + }, + { + "epoch": 0.10573042776432606, + "grad_norm": 0.03931838274002075, + "learning_rate": 1.9705553256087856e-05, + "loss": 0.4939, + "step": 131 + }, + { + "epoch": 0.10653753026634383, + "grad_norm": 0.04112625494599342, + "learning_rate": 1.9699219129631336e-05, + "loss": 0.557, + "step": 132 + }, + { + "epoch": 0.10734463276836158, + "grad_norm": 0.046433791518211365, + "learning_rate": 1.9692818636470113e-05, + "loss": 0.621, + "step": 133 + }, + { + "epoch": 0.10815173527037934, + "grad_norm": 0.04291560500860214, + "learning_rate": 1.968635182039943e-05, + "loss": 0.4939, + "step": 134 + }, + { + "epoch": 0.1089588377723971, + "grad_norm": 0.044043783098459244, + "learning_rate": 1.967981872566835e-05, + "loss": 0.5557, + "step": 135 + }, + { + "epoch": 0.10976594027441484, + "grad_norm": 0.043234121054410934, + "learning_rate": 1.9673219396979428e-05, + "loss": 0.477, + "step": 136 + }, + { + "epoch": 0.11057304277643261, + "grad_norm": 0.037246223539114, + "learning_rate": 1.9666553879488435e-05, + "loss": 0.4863, + "step": 137 + }, + { + "epoch": 0.11138014527845036, + "grad_norm": 0.04165706783533096, + "learning_rate": 1.9659822218804034e-05, + "loss": 0.5986, + "step": 138 + }, + { + "epoch": 0.11218724778046812, + "grad_norm": 0.037833321839571, + "learning_rate": 1.965302446098748e-05, + "loss": 0.5649, + "step": 139 + }, + { + "epoch": 0.11299435028248588, + "grad_norm": 0.04190123453736305, + "learning_rate": 1.964616065255228e-05, + "loss": 0.5044, + "step": 140 + }, + { + "epoch": 0.11380145278450363, + "grad_norm": 0.04529823362827301, + "learning_rate": 1.9639230840463907e-05, + "loss": 0.627, + "step": 141 + }, + { + "epoch": 0.11460855528652139, + "grad_norm": 0.03832566738128662, + "learning_rate": 1.963223507213945e-05, + "loss": 0.5422, + "step": 142 + }, + { + "epoch": 0.11541565778853914, + "grad_norm": 0.04111456498503685, + "learning_rate": 1.9625173395447315e-05, + "loss": 0.548, + "step": 143 + }, + { + "epoch": 0.1162227602905569, + "grad_norm": 0.039997123181819916, + "learning_rate": 1.961804585870687e-05, + "loss": 0.5015, + "step": 144 + }, + { + "epoch": 0.11702986279257466, + "grad_norm": 0.041700903326272964, + "learning_rate": 1.9610852510688142e-05, + "loss": 0.5675, + "step": 145 + }, + { + "epoch": 0.11783696529459241, + "grad_norm": 0.03820529580116272, + "learning_rate": 1.9603593400611463e-05, + "loss": 0.5111, + "step": 146 + }, + { + "epoch": 0.11864406779661017, + "grad_norm": 0.043333180248737335, + "learning_rate": 1.9596268578147142e-05, + "loss": 0.584, + "step": 147 + }, + { + "epoch": 0.11945117029862792, + "grad_norm": 0.03793581947684288, + "learning_rate": 1.9588878093415125e-05, + "loss": 0.5232, + "step": 148 + }, + { + "epoch": 0.12025827280064569, + "grad_norm": 0.04295019060373306, + "learning_rate": 1.958142199698465e-05, + "loss": 0.533, + "step": 149 + }, + { + "epoch": 0.12106537530266344, + "grad_norm": 0.04009397327899933, + "learning_rate": 1.9573900339873897e-05, + "loss": 0.5095, + "step": 150 + }, + { + "epoch": 0.12187247780468119, + "grad_norm": 0.04774903878569603, + "learning_rate": 1.9566313173549653e-05, + "loss": 0.5519, + "step": 151 + }, + { + "epoch": 0.12267958030669895, + "grad_norm": 0.0475604310631752, + "learning_rate": 1.9558660549926946e-05, + "loss": 0.5511, + "step": 152 + }, + { + "epoch": 0.1234866828087167, + "grad_norm": 0.037444114685058594, + "learning_rate": 1.9550942521368692e-05, + "loss": 0.5134, + "step": 153 + }, + { + "epoch": 0.12429378531073447, + "grad_norm": 0.0871221199631691, + "learning_rate": 1.9543159140685336e-05, + "loss": 0.5256, + "step": 154 + }, + { + "epoch": 0.12510088781275222, + "grad_norm": 0.03306428715586662, + "learning_rate": 1.9535310461134504e-05, + "loss": 0.4591, + "step": 155 + }, + { + "epoch": 0.12590799031476999, + "grad_norm": 0.03849384933710098, + "learning_rate": 1.9527396536420624e-05, + "loss": 0.536, + "step": 156 + }, + { + "epoch": 0.12671509281678772, + "grad_norm": 0.03767150267958641, + "learning_rate": 1.951941742069455e-05, + "loss": 0.5601, + "step": 157 + }, + { + "epoch": 0.1275221953188055, + "grad_norm": 0.03878191486001015, + "learning_rate": 1.9511373168553228e-05, + "loss": 0.4948, + "step": 158 + }, + { + "epoch": 0.12832929782082325, + "grad_norm": 0.038342781364917755, + "learning_rate": 1.9503263835039275e-05, + "loss": 0.5839, + "step": 159 + }, + { + "epoch": 0.129136400322841, + "grad_norm": 0.04736362025141716, + "learning_rate": 1.9495089475640644e-05, + "loss": 0.5203, + "step": 160 + }, + { + "epoch": 0.12994350282485875, + "grad_norm": 0.04264182969927788, + "learning_rate": 1.9486850146290218e-05, + "loss": 0.5562, + "step": 161 + }, + { + "epoch": 0.13075060532687652, + "grad_norm": 0.036652226001024246, + "learning_rate": 1.9478545903365432e-05, + "loss": 0.5224, + "step": 162 + }, + { + "epoch": 0.13155770782889428, + "grad_norm": 0.037250448018312454, + "learning_rate": 1.9470176803687896e-05, + "loss": 0.4876, + "step": 163 + }, + { + "epoch": 0.13236481033091202, + "grad_norm": 0.05088205635547638, + "learning_rate": 1.9461742904523003e-05, + "loss": 0.487, + "step": 164 + }, + { + "epoch": 0.13317191283292978, + "grad_norm": 0.04238941892981529, + "learning_rate": 1.9453244263579532e-05, + "loss": 0.4689, + "step": 165 + }, + { + "epoch": 0.13397901533494755, + "grad_norm": 0.04009280726313591, + "learning_rate": 1.9444680939009255e-05, + "loss": 0.5043, + "step": 166 + }, + { + "epoch": 0.13478611783696529, + "grad_norm": 0.04117881506681442, + "learning_rate": 1.9436052989406538e-05, + "loss": 0.5367, + "step": 167 + }, + { + "epoch": 0.13559322033898305, + "grad_norm": 0.04030587896704674, + "learning_rate": 1.9427360473807957e-05, + "loss": 0.4733, + "step": 168 + }, + { + "epoch": 0.13640032284100082, + "grad_norm": 0.04532231390476227, + "learning_rate": 1.941860345169186e-05, + "loss": 0.4976, + "step": 169 + }, + { + "epoch": 0.13720742534301855, + "grad_norm": 0.03877172991633415, + "learning_rate": 1.9409781982977988e-05, + "loss": 0.5249, + "step": 170 + }, + { + "epoch": 0.13801452784503632, + "grad_norm": 0.039575811475515366, + "learning_rate": 1.940089612802706e-05, + "loss": 0.534, + "step": 171 + }, + { + "epoch": 0.13882163034705408, + "grad_norm": 0.04503751918673515, + "learning_rate": 1.9391945947640352e-05, + "loss": 0.552, + "step": 172 + }, + { + "epoch": 0.13962873284907182, + "grad_norm": 0.041354332119226456, + "learning_rate": 1.938293150305929e-05, + "loss": 0.521, + "step": 173 + }, + { + "epoch": 0.14043583535108958, + "grad_norm": 0.03722424805164337, + "learning_rate": 1.937385285596502e-05, + "loss": 0.3982, + "step": 174 + }, + { + "epoch": 0.14124293785310735, + "grad_norm": 0.044452231377363205, + "learning_rate": 1.9364710068477994e-05, + "loss": 0.4525, + "step": 175 + }, + { + "epoch": 0.1420500403551251, + "grad_norm": 0.0437353290617466, + "learning_rate": 1.9355503203157545e-05, + "loss": 0.5436, + "step": 176 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 0.036104194819927216, + "learning_rate": 1.934623232300146e-05, + "loss": 0.463, + "step": 177 + }, + { + "epoch": 0.14366424535916061, + "grad_norm": 0.03944510221481323, + "learning_rate": 1.9336897491445544e-05, + "loss": 0.5538, + "step": 178 + }, + { + "epoch": 0.14447134786117838, + "grad_norm": 0.039386969059705734, + "learning_rate": 1.9327498772363187e-05, + "loss": 0.5193, + "step": 179 + }, + { + "epoch": 0.14527845036319612, + "grad_norm": 0.04991620406508446, + "learning_rate": 1.931803623006492e-05, + "loss": 0.5429, + "step": 180 + }, + { + "epoch": 0.14608555286521388, + "grad_norm": 0.03600313887000084, + "learning_rate": 1.9308509929298e-05, + "loss": 0.506, + "step": 181 + }, + { + "epoch": 0.14689265536723164, + "grad_norm": 0.03998609259724617, + "learning_rate": 1.9298919935245944e-05, + "loss": 0.5426, + "step": 182 + }, + { + "epoch": 0.14769975786924938, + "grad_norm": 0.040966298431158066, + "learning_rate": 1.9289266313528075e-05, + "loss": 0.5069, + "step": 183 + }, + { + "epoch": 0.14850686037126715, + "grad_norm": 0.04375036060810089, + "learning_rate": 1.927954913019911e-05, + "loss": 0.505, + "step": 184 + }, + { + "epoch": 0.1493139628732849, + "grad_norm": 0.038769107311964035, + "learning_rate": 1.926976845174867e-05, + "loss": 0.5215, + "step": 185 + }, + { + "epoch": 0.15012106537530268, + "grad_norm": 0.03845265135169029, + "learning_rate": 1.9259924345100837e-05, + "loss": 0.5043, + "step": 186 + }, + { + "epoch": 0.1509281678773204, + "grad_norm": 0.03340338170528412, + "learning_rate": 1.9250016877613714e-05, + "loss": 0.4641, + "step": 187 + }, + { + "epoch": 0.15173527037933818, + "grad_norm": 0.042416080832481384, + "learning_rate": 1.9240046117078935e-05, + "loss": 0.528, + "step": 188 + }, + { + "epoch": 0.15254237288135594, + "grad_norm": 0.04447726532816887, + "learning_rate": 1.9230012131721223e-05, + "loss": 0.541, + "step": 189 + }, + { + "epoch": 0.15334947538337368, + "grad_norm": 0.0382893830537796, + "learning_rate": 1.9219914990197914e-05, + "loss": 0.5028, + "step": 190 + }, + { + "epoch": 0.15415657788539144, + "grad_norm": 0.039434779435396194, + "learning_rate": 1.9209754761598488e-05, + "loss": 0.5406, + "step": 191 + }, + { + "epoch": 0.1549636803874092, + "grad_norm": 0.03525169566273689, + "learning_rate": 1.91995315154441e-05, + "loss": 0.4859, + "step": 192 + }, + { + "epoch": 0.15577078288942695, + "grad_norm": 0.0399198979139328, + "learning_rate": 1.9189245321687093e-05, + "loss": 0.563, + "step": 193 + }, + { + "epoch": 0.1565778853914447, + "grad_norm": 0.040046997368335724, + "learning_rate": 1.917889625071054e-05, + "loss": 0.5029, + "step": 194 + }, + { + "epoch": 0.15738498789346247, + "grad_norm": 0.04070790857076645, + "learning_rate": 1.916848437332774e-05, + "loss": 0.4785, + "step": 195 + }, + { + "epoch": 0.15819209039548024, + "grad_norm": 0.03891390189528465, + "learning_rate": 1.9158009760781744e-05, + "loss": 0.4973, + "step": 196 + }, + { + "epoch": 0.15899919289749798, + "grad_norm": 0.03878718987107277, + "learning_rate": 1.9147472484744873e-05, + "loss": 0.5465, + "step": 197 + }, + { + "epoch": 0.15980629539951574, + "grad_norm": 0.03733861818909645, + "learning_rate": 1.913687261731822e-05, + "loss": 0.5212, + "step": 198 + }, + { + "epoch": 0.1606133979015335, + "grad_norm": 0.03874503821134567, + "learning_rate": 1.9126210231031158e-05, + "loss": 0.4533, + "step": 199 + }, + { + "epoch": 0.16142050040355124, + "grad_norm": 0.03465745225548744, + "learning_rate": 1.9115485398840838e-05, + "loss": 0.445, + "step": 200 + }, + { + "epoch": 0.162227602905569, + "grad_norm": 0.04122417792677879, + "learning_rate": 1.9104698194131703e-05, + "loss": 0.5264, + "step": 201 + }, + { + "epoch": 0.16303470540758677, + "grad_norm": 0.04614901915192604, + "learning_rate": 1.9093848690714977e-05, + "loss": 0.5669, + "step": 202 + }, + { + "epoch": 0.1638418079096045, + "grad_norm": 0.03678954392671585, + "learning_rate": 1.908293696282816e-05, + "loss": 0.4039, + "step": 203 + }, + { + "epoch": 0.16464891041162227, + "grad_norm": 0.03784651309251785, + "learning_rate": 1.907196308513452e-05, + "loss": 0.4659, + "step": 204 + }, + { + "epoch": 0.16545601291364004, + "grad_norm": 0.04650195315480232, + "learning_rate": 1.906092713272259e-05, + "loss": 0.4867, + "step": 205 + }, + { + "epoch": 0.16626311541565778, + "grad_norm": 0.04255447909235954, + "learning_rate": 1.904982918110565e-05, + "loss": 0.44, + "step": 206 + }, + { + "epoch": 0.16707021791767554, + "grad_norm": 0.0433458648622036, + "learning_rate": 1.9038669306221194e-05, + "loss": 0.4586, + "step": 207 + }, + { + "epoch": 0.1678773204196933, + "grad_norm": 0.04138747602701187, + "learning_rate": 1.9027447584430435e-05, + "loss": 0.5412, + "step": 208 + }, + { + "epoch": 0.16868442292171107, + "grad_norm": 0.05272183194756508, + "learning_rate": 1.9016164092517767e-05, + "loss": 0.514, + "step": 209 + }, + { + "epoch": 0.1694915254237288, + "grad_norm": 0.04436493664979935, + "learning_rate": 1.9004818907690255e-05, + "loss": 0.4419, + "step": 210 + }, + { + "epoch": 0.17029862792574657, + "grad_norm": 0.03898143395781517, + "learning_rate": 1.899341210757709e-05, + "loss": 0.4747, + "step": 211 + }, + { + "epoch": 0.17110573042776434, + "grad_norm": 0.04165184497833252, + "learning_rate": 1.8981943770229058e-05, + "loss": 0.5105, + "step": 212 + }, + { + "epoch": 0.17191283292978207, + "grad_norm": 0.03903055191040039, + "learning_rate": 1.897041397411802e-05, + "loss": 0.4984, + "step": 213 + }, + { + "epoch": 0.17271993543179984, + "grad_norm": 0.04989544302225113, + "learning_rate": 1.8958822798136367e-05, + "loss": 0.4596, + "step": 214 + }, + { + "epoch": 0.1735270379338176, + "grad_norm": 0.040278587490320206, + "learning_rate": 1.8947170321596477e-05, + "loss": 0.5342, + "step": 215 + }, + { + "epoch": 0.17433414043583534, + "grad_norm": 0.04341874271631241, + "learning_rate": 1.893545662423018e-05, + "loss": 0.4712, + "step": 216 + }, + { + "epoch": 0.1751412429378531, + "grad_norm": 0.04516846686601639, + "learning_rate": 1.8923681786188207e-05, + "loss": 0.476, + "step": 217 + }, + { + "epoch": 0.17594834543987087, + "grad_norm": 0.039414409548044205, + "learning_rate": 1.891184588803964e-05, + "loss": 0.4865, + "step": 218 + }, + { + "epoch": 0.17675544794188863, + "grad_norm": 0.042853113263845444, + "learning_rate": 1.8899949010771365e-05, + "loss": 0.5758, + "step": 219 + }, + { + "epoch": 0.17756255044390637, + "grad_norm": 0.04502491280436516, + "learning_rate": 1.888799123578752e-05, + "loss": 0.5365, + "step": 220 + }, + { + "epoch": 0.17836965294592413, + "grad_norm": 0.03714486211538315, + "learning_rate": 1.8875972644908924e-05, + "loss": 0.5428, + "step": 221 + }, + { + "epoch": 0.1791767554479419, + "grad_norm": 0.0374261736869812, + "learning_rate": 1.886389332037254e-05, + "loss": 0.4923, + "step": 222 + }, + { + "epoch": 0.17998385794995964, + "grad_norm": 0.04436585307121277, + "learning_rate": 1.8851753344830897e-05, + "loss": 0.5548, + "step": 223 + }, + { + "epoch": 0.1807909604519774, + "grad_norm": 0.037220027297735214, + "learning_rate": 1.8839552801351515e-05, + "loss": 0.4781, + "step": 224 + }, + { + "epoch": 0.18159806295399517, + "grad_norm": 0.03513636440038681, + "learning_rate": 1.882729177341637e-05, + "loss": 0.4831, + "step": 225 + }, + { + "epoch": 0.1824051654560129, + "grad_norm": 0.04300914704799652, + "learning_rate": 1.881497034492128e-05, + "loss": 0.4867, + "step": 226 + }, + { + "epoch": 0.18321226795803067, + "grad_norm": 0.03557848930358887, + "learning_rate": 1.8802588600175365e-05, + "loss": 0.4856, + "step": 227 + }, + { + "epoch": 0.18401937046004843, + "grad_norm": 0.03743339329957962, + "learning_rate": 1.879014662390046e-05, + "loss": 0.5118, + "step": 228 + }, + { + "epoch": 0.1848264729620662, + "grad_norm": 0.037632618099451065, + "learning_rate": 1.877764450123053e-05, + "loss": 0.5256, + "step": 229 + }, + { + "epoch": 0.18563357546408393, + "grad_norm": 0.03996562957763672, + "learning_rate": 1.8765082317711076e-05, + "loss": 0.4748, + "step": 230 + }, + { + "epoch": 0.1864406779661017, + "grad_norm": 0.03809904307126999, + "learning_rate": 1.875246015929859e-05, + "loss": 0.5161, + "step": 231 + }, + { + "epoch": 0.18724778046811946, + "grad_norm": 0.037256382405757904, + "learning_rate": 1.8739778112359923e-05, + "loss": 0.4922, + "step": 232 + }, + { + "epoch": 0.1880548829701372, + "grad_norm": 0.03946573659777641, + "learning_rate": 1.8727036263671714e-05, + "loss": 0.4865, + "step": 233 + }, + { + "epoch": 0.18886198547215496, + "grad_norm": 0.03847169876098633, + "learning_rate": 1.8714234700419802e-05, + "loss": 0.4714, + "step": 234 + }, + { + "epoch": 0.18966908797417273, + "grad_norm": 0.04428127035498619, + "learning_rate": 1.8701373510198613e-05, + "loss": 0.5162, + "step": 235 + }, + { + "epoch": 0.19047619047619047, + "grad_norm": 0.03315884992480278, + "learning_rate": 1.8688452781010576e-05, + "loss": 0.4288, + "step": 236 + }, + { + "epoch": 0.19128329297820823, + "grad_norm": 0.044353123754262924, + "learning_rate": 1.867547260126551e-05, + "loss": 0.5049, + "step": 237 + }, + { + "epoch": 0.192090395480226, + "grad_norm": 0.041326623409986496, + "learning_rate": 1.8662433059780017e-05, + "loss": 0.5128, + "step": 238 + }, + { + "epoch": 0.19289749798224373, + "grad_norm": 0.04205748811364174, + "learning_rate": 1.8649334245776904e-05, + "loss": 0.4981, + "step": 239 + }, + { + "epoch": 0.1937046004842615, + "grad_norm": 0.04195690155029297, + "learning_rate": 1.8636176248884518e-05, + "loss": 0.4683, + "step": 240 + }, + { + "epoch": 0.19451170298627926, + "grad_norm": 0.0392894521355629, + "learning_rate": 1.8622959159136182e-05, + "loss": 0.4387, + "step": 241 + }, + { + "epoch": 0.19531880548829703, + "grad_norm": 0.04331105947494507, + "learning_rate": 1.860968306696956e-05, + "loss": 0.5649, + "step": 242 + }, + { + "epoch": 0.19612590799031476, + "grad_norm": 0.03682456538081169, + "learning_rate": 1.8596348063226037e-05, + "loss": 0.5376, + "step": 243 + }, + { + "epoch": 0.19693301049233253, + "grad_norm": 0.042542796581983566, + "learning_rate": 1.8582954239150095e-05, + "loss": 0.4451, + "step": 244 + }, + { + "epoch": 0.1977401129943503, + "grad_norm": 0.038883309811353683, + "learning_rate": 1.85695016863887e-05, + "loss": 0.4738, + "step": 245 + }, + { + "epoch": 0.19854721549636803, + "grad_norm": 0.040823884308338165, + "learning_rate": 1.8555990496990655e-05, + "loss": 0.4148, + "step": 246 + }, + { + "epoch": 0.1993543179983858, + "grad_norm": 0.03684435412287712, + "learning_rate": 1.8542420763406e-05, + "loss": 0.5052, + "step": 247 + }, + { + "epoch": 0.20016142050040356, + "grad_norm": 0.04214322566986084, + "learning_rate": 1.8528792578485352e-05, + "loss": 0.5138, + "step": 248 + }, + { + "epoch": 0.2009685230024213, + "grad_norm": 0.042726051062345505, + "learning_rate": 1.851510603547928e-05, + "loss": 0.4969, + "step": 249 + }, + { + "epoch": 0.20177562550443906, + "grad_norm": 0.034282345324754715, + "learning_rate": 1.8501361228037673e-05, + "loss": 0.4237, + "step": 250 + }, + { + "epoch": 0.20258272800645682, + "grad_norm": 0.038208913058042526, + "learning_rate": 1.8487558250209088e-05, + "loss": 0.4203, + "step": 251 + }, + { + "epoch": 0.2033898305084746, + "grad_norm": 0.04460599273443222, + "learning_rate": 1.847369719644011e-05, + "loss": 0.5293, + "step": 252 + }, + { + "epoch": 0.20419693301049233, + "grad_norm": 0.04102775827050209, + "learning_rate": 1.8459778161574715e-05, + "loss": 0.3967, + "step": 253 + }, + { + "epoch": 0.2050040355125101, + "grad_norm": 0.04214366525411606, + "learning_rate": 1.8445801240853603e-05, + "loss": 0.5049, + "step": 254 + }, + { + "epoch": 0.20581113801452786, + "grad_norm": 0.04023770987987518, + "learning_rate": 1.8431766529913572e-05, + "loss": 0.4902, + "step": 255 + }, + { + "epoch": 0.2066182405165456, + "grad_norm": 0.04067148268222809, + "learning_rate": 1.8417674124786832e-05, + "loss": 0.4539, + "step": 256 + }, + { + "epoch": 0.20742534301856336, + "grad_norm": 0.05563601106405258, + "learning_rate": 1.840352412190037e-05, + "loss": 0.5169, + "step": 257 + }, + { + "epoch": 0.20823244552058112, + "grad_norm": 0.03937901183962822, + "learning_rate": 1.8389316618075294e-05, + "loss": 0.5009, + "step": 258 + }, + { + "epoch": 0.20903954802259886, + "grad_norm": 0.04437664896249771, + "learning_rate": 1.8375051710526143e-05, + "loss": 0.499, + "step": 259 + }, + { + "epoch": 0.20984665052461662, + "grad_norm": 0.04327564314007759, + "learning_rate": 1.8360729496860255e-05, + "loss": 0.5357, + "step": 260 + }, + { + "epoch": 0.2106537530266344, + "grad_norm": 0.0612650103867054, + "learning_rate": 1.8346350075077078e-05, + "loss": 0.5325, + "step": 261 + }, + { + "epoch": 0.21146085552865213, + "grad_norm": 0.04520949721336365, + "learning_rate": 1.8331913543567502e-05, + "loss": 0.428, + "step": 262 + }, + { + "epoch": 0.2122679580306699, + "grad_norm": 0.04141777381300926, + "learning_rate": 1.83174200011132e-05, + "loss": 0.45, + "step": 263 + }, + { + "epoch": 0.21307506053268765, + "grad_norm": 0.03921473026275635, + "learning_rate": 1.8302869546885927e-05, + "loss": 0.4547, + "step": 264 + }, + { + "epoch": 0.21388216303470542, + "grad_norm": 0.040408920496702194, + "learning_rate": 1.8288262280446868e-05, + "loss": 0.5301, + "step": 265 + }, + { + "epoch": 0.21468926553672316, + "grad_norm": 0.038676533848047256, + "learning_rate": 1.8273598301745935e-05, + "loss": 0.4751, + "step": 266 + }, + { + "epoch": 0.21549636803874092, + "grad_norm": 0.04012975096702576, + "learning_rate": 1.8258877711121103e-05, + "loss": 0.428, + "step": 267 + }, + { + "epoch": 0.21630347054075869, + "grad_norm": 0.035383693873882294, + "learning_rate": 1.8244100609297698e-05, + "loss": 0.462, + "step": 268 + }, + { + "epoch": 0.21711057304277642, + "grad_norm": 0.034487176686525345, + "learning_rate": 1.8229267097387737e-05, + "loss": 0.4795, + "step": 269 + }, + { + "epoch": 0.2179176755447942, + "grad_norm": 0.03842625394463539, + "learning_rate": 1.8214377276889212e-05, + "loss": 0.5321, + "step": 270 + }, + { + "epoch": 0.21872477804681195, + "grad_norm": 0.03516872972249985, + "learning_rate": 1.8199431249685413e-05, + "loss": 0.4355, + "step": 271 + }, + { + "epoch": 0.2195318805488297, + "grad_norm": 0.03586851432919502, + "learning_rate": 1.818442911804422e-05, + "loss": 0.4521, + "step": 272 + }, + { + "epoch": 0.22033898305084745, + "grad_norm": 0.03677456080913544, + "learning_rate": 1.81693709846174e-05, + "loss": 0.4225, + "step": 273 + }, + { + "epoch": 0.22114608555286522, + "grad_norm": 0.04374263435602188, + "learning_rate": 1.8154256952439926e-05, + "loss": 0.529, + "step": 274 + }, + { + "epoch": 0.22195318805488298, + "grad_norm": 0.04119790717959404, + "learning_rate": 1.813908712492924e-05, + "loss": 0.5778, + "step": 275 + }, + { + "epoch": 0.22276029055690072, + "grad_norm": 0.03965405747294426, + "learning_rate": 1.8123861605884572e-05, + "loss": 0.4413, + "step": 276 + }, + { + "epoch": 0.22356739305891848, + "grad_norm": 0.038712743669748306, + "learning_rate": 1.8108580499486225e-05, + "loss": 0.4621, + "step": 277 + }, + { + "epoch": 0.22437449556093625, + "grad_norm": 0.03554076701402664, + "learning_rate": 1.8093243910294845e-05, + "loss": 0.4627, + "step": 278 + }, + { + "epoch": 0.22518159806295399, + "grad_norm": 0.03951288387179375, + "learning_rate": 1.807785194325072e-05, + "loss": 0.4631, + "step": 279 + }, + { + "epoch": 0.22598870056497175, + "grad_norm": 0.04191136360168457, + "learning_rate": 1.8062404703673078e-05, + "loss": 0.4426, + "step": 280 + }, + { + "epoch": 0.22679580306698952, + "grad_norm": 0.04739841818809509, + "learning_rate": 1.804690229725932e-05, + "loss": 0.4568, + "step": 281 + }, + { + "epoch": 0.22760290556900725, + "grad_norm": 0.03690197318792343, + "learning_rate": 1.803134483008434e-05, + "loss": 0.3947, + "step": 282 + }, + { + "epoch": 0.22841000807102502, + "grad_norm": 0.039233967661857605, + "learning_rate": 1.801573240859979e-05, + "loss": 0.4882, + "step": 283 + }, + { + "epoch": 0.22921711057304278, + "grad_norm": 0.04171024635434151, + "learning_rate": 1.8000065139633336e-05, + "loss": 0.4821, + "step": 284 + }, + { + "epoch": 0.23002421307506055, + "grad_norm": 0.0385863296687603, + "learning_rate": 1.7984343130387936e-05, + "loss": 0.4326, + "step": 285 + }, + { + "epoch": 0.23083131557707828, + "grad_norm": 0.042283158749341965, + "learning_rate": 1.7968566488441115e-05, + "loss": 0.4834, + "step": 286 + }, + { + "epoch": 0.23163841807909605, + "grad_norm": 0.033981919288635254, + "learning_rate": 1.795273532174421e-05, + "loss": 0.4169, + "step": 287 + }, + { + "epoch": 0.2324455205811138, + "grad_norm": 0.034205395728349686, + "learning_rate": 1.7936849738621656e-05, + "loss": 0.4283, + "step": 288 + }, + { + "epoch": 0.23325262308313155, + "grad_norm": 0.03619087114930153, + "learning_rate": 1.7920909847770223e-05, + "loss": 0.4406, + "step": 289 + }, + { + "epoch": 0.23405972558514931, + "grad_norm": 0.04226299375295639, + "learning_rate": 1.790491575825828e-05, + "loss": 0.5038, + "step": 290 + }, + { + "epoch": 0.23486682808716708, + "grad_norm": 0.04093988239765167, + "learning_rate": 1.7888867579525045e-05, + "loss": 0.508, + "step": 291 + }, + { + "epoch": 0.23567393058918482, + "grad_norm": 0.040389351546764374, + "learning_rate": 1.787276542137986e-05, + "loss": 0.5127, + "step": 292 + }, + { + "epoch": 0.23648103309120258, + "grad_norm": 0.03728820011019707, + "learning_rate": 1.78566093940014e-05, + "loss": 0.4442, + "step": 293 + }, + { + "epoch": 0.23728813559322035, + "grad_norm": 0.0355042964220047, + "learning_rate": 1.784039960793694e-05, + "loss": 0.4448, + "step": 294 + }, + { + "epoch": 0.23809523809523808, + "grad_norm": 0.039183955639600754, + "learning_rate": 1.7824136174101615e-05, + "loss": 0.4753, + "step": 295 + }, + { + "epoch": 0.23890234059725585, + "grad_norm": 0.039957690984010696, + "learning_rate": 1.780781920377762e-05, + "loss": 0.4083, + "step": 296 + }, + { + "epoch": 0.2397094430992736, + "grad_norm": 0.03782647103071213, + "learning_rate": 1.779144880861349e-05, + "loss": 0.4701, + "step": 297 + }, + { + "epoch": 0.24051654560129138, + "grad_norm": 0.04110096022486687, + "learning_rate": 1.777502510062331e-05, + "loss": 0.3975, + "step": 298 + }, + { + "epoch": 0.2413236481033091, + "grad_norm": 0.039122823625802994, + "learning_rate": 1.7758548192185966e-05, + "loss": 0.4414, + "step": 299 + }, + { + "epoch": 0.24213075060532688, + "grad_norm": 0.041810158640146255, + "learning_rate": 1.7742018196044352e-05, + "loss": 0.4951, + "step": 300 + }, + { + "epoch": 0.24293785310734464, + "grad_norm": 0.04400990530848503, + "learning_rate": 1.7725435225304632e-05, + "loss": 0.5081, + "step": 301 + }, + { + "epoch": 0.24374495560936238, + "grad_norm": 0.036244601011276245, + "learning_rate": 1.7708799393435426e-05, + "loss": 0.4776, + "step": 302 + }, + { + "epoch": 0.24455205811138014, + "grad_norm": 0.04246852546930313, + "learning_rate": 1.7692110814267077e-05, + "loss": 0.409, + "step": 303 + }, + { + "epoch": 0.2453591606133979, + "grad_norm": 0.03885439783334732, + "learning_rate": 1.7675369601990834e-05, + "loss": 0.4556, + "step": 304 + }, + { + "epoch": 0.24616626311541565, + "grad_norm": 0.04391565918922424, + "learning_rate": 1.765857587115809e-05, + "loss": 0.5051, + "step": 305 + }, + { + "epoch": 0.2469733656174334, + "grad_norm": 0.044587794691324234, + "learning_rate": 1.7641729736679608e-05, + "loss": 0.525, + "step": 306 + }, + { + "epoch": 0.24778046811945117, + "grad_norm": 0.03848964348435402, + "learning_rate": 1.7624831313824707e-05, + "loss": 0.4648, + "step": 307 + }, + { + "epoch": 0.24858757062146894, + "grad_norm": 0.04077491536736488, + "learning_rate": 1.7607880718220493e-05, + "loss": 0.4774, + "step": 308 + }, + { + "epoch": 0.24939467312348668, + "grad_norm": 0.03753182664513588, + "learning_rate": 1.759087806585106e-05, + "loss": 0.4698, + "step": 309 + }, + { + "epoch": 0.25020177562550444, + "grad_norm": 0.038406360894441605, + "learning_rate": 1.7573823473056704e-05, + "loss": 0.4519, + "step": 310 + }, + { + "epoch": 0.2510088781275222, + "grad_norm": 0.03918207436800003, + "learning_rate": 1.7556717056533124e-05, + "loss": 0.4447, + "step": 311 + }, + { + "epoch": 0.25181598062953997, + "grad_norm": 0.04120977967977524, + "learning_rate": 1.7539558933330618e-05, + "loss": 0.4791, + "step": 312 + }, + { + "epoch": 0.25262308313155774, + "grad_norm": 0.043005820363759995, + "learning_rate": 1.7522349220853284e-05, + "loss": 0.4385, + "step": 313 + }, + { + "epoch": 0.25343018563357544, + "grad_norm": 0.043990328907966614, + "learning_rate": 1.750508803685822e-05, + "loss": 0.4184, + "step": 314 + }, + { + "epoch": 0.2542372881355932, + "grad_norm": 0.0447077676653862, + "learning_rate": 1.748777549945472e-05, + "loss": 0.4524, + "step": 315 + }, + { + "epoch": 0.255044390637611, + "grad_norm": 0.046113114804029465, + "learning_rate": 1.747041172710346e-05, + "loss": 0.4221, + "step": 316 + }, + { + "epoch": 0.25585149313962874, + "grad_norm": 0.0372101292014122, + "learning_rate": 1.745299683861569e-05, + "loss": 0.439, + "step": 317 + }, + { + "epoch": 0.2566585956416465, + "grad_norm": 0.040746718645095825, + "learning_rate": 1.743553095315242e-05, + "loss": 0.4499, + "step": 318 + }, + { + "epoch": 0.25746569814366427, + "grad_norm": 0.03624715283513069, + "learning_rate": 1.741801419022361e-05, + "loss": 0.4495, + "step": 319 + }, + { + "epoch": 0.258272800645682, + "grad_norm": 0.04276509955525398, + "learning_rate": 1.740044666968734e-05, + "loss": 0.4393, + "step": 320 + }, + { + "epoch": 0.25907990314769974, + "grad_norm": 0.041294898837804794, + "learning_rate": 1.738282851174901e-05, + "loss": 0.4413, + "step": 321 + }, + { + "epoch": 0.2598870056497175, + "grad_norm": 0.042043715715408325, + "learning_rate": 1.7365159836960494e-05, + "loss": 0.5303, + "step": 322 + }, + { + "epoch": 0.26069410815173527, + "grad_norm": 0.04050707444548607, + "learning_rate": 1.734744076621933e-05, + "loss": 0.5062, + "step": 323 + }, + { + "epoch": 0.26150121065375304, + "grad_norm": 0.03927404060959816, + "learning_rate": 1.7329671420767898e-05, + "loss": 0.4519, + "step": 324 + }, + { + "epoch": 0.2623083131557708, + "grad_norm": 0.04352013021707535, + "learning_rate": 1.7311851922192568e-05, + "loss": 0.4835, + "step": 325 + }, + { + "epoch": 0.26311541565778856, + "grad_norm": 0.04132748767733574, + "learning_rate": 1.729398239242288e-05, + "loss": 0.4179, + "step": 326 + }, + { + "epoch": 0.2639225181598063, + "grad_norm": 0.04499141499400139, + "learning_rate": 1.727606295373073e-05, + "loss": 0.4907, + "step": 327 + }, + { + "epoch": 0.26472962066182404, + "grad_norm": 0.0411851666867733, + "learning_rate": 1.7258093728729503e-05, + "loss": 0.4438, + "step": 328 + }, + { + "epoch": 0.2655367231638418, + "grad_norm": 0.045802779495716095, + "learning_rate": 1.724007484037324e-05, + "loss": 0.4276, + "step": 329 + }, + { + "epoch": 0.26634382566585957, + "grad_norm": 0.04081873223185539, + "learning_rate": 1.722200641195581e-05, + "loss": 0.4514, + "step": 330 + }, + { + "epoch": 0.26715092816787733, + "grad_norm": 0.04145602881908417, + "learning_rate": 1.7203888567110066e-05, + "loss": 0.5665, + "step": 331 + }, + { + "epoch": 0.2679580306698951, + "grad_norm": 0.03948275372385979, + "learning_rate": 1.7185721429806975e-05, + "loss": 0.4196, + "step": 332 + }, + { + "epoch": 0.2687651331719128, + "grad_norm": 0.04905907064676285, + "learning_rate": 1.7167505124354803e-05, + "loss": 0.4886, + "step": 333 + }, + { + "epoch": 0.26957223567393057, + "grad_norm": 0.04699491709470749, + "learning_rate": 1.714923977539823e-05, + "loss": 0.4767, + "step": 334 + }, + { + "epoch": 0.27037933817594834, + "grad_norm": 0.04403752461075783, + "learning_rate": 1.7130925507917534e-05, + "loss": 0.5452, + "step": 335 + }, + { + "epoch": 0.2711864406779661, + "grad_norm": 0.03853682428598404, + "learning_rate": 1.7112562447227703e-05, + "loss": 0.4439, + "step": 336 + }, + { + "epoch": 0.27199354317998387, + "grad_norm": 0.040106069296598434, + "learning_rate": 1.70941507189776e-05, + "loss": 0.4426, + "step": 337 + }, + { + "epoch": 0.27280064568200163, + "grad_norm": 0.04464896023273468, + "learning_rate": 1.7075690449149092e-05, + "loss": 0.4732, + "step": 338 + }, + { + "epoch": 0.2736077481840194, + "grad_norm": 0.0417223796248436, + "learning_rate": 1.7057181764056188e-05, + "loss": 0.4296, + "step": 339 + }, + { + "epoch": 0.2744148506860371, + "grad_norm": 0.03915628045797348, + "learning_rate": 1.7038624790344185e-05, + "loss": 0.4623, + "step": 340 + }, + { + "epoch": 0.27522195318805487, + "grad_norm": 0.035377517342567444, + "learning_rate": 1.7020019654988784e-05, + "loss": 0.3977, + "step": 341 + }, + { + "epoch": 0.27602905569007263, + "grad_norm": 0.04356204345822334, + "learning_rate": 1.7001366485295235e-05, + "loss": 0.4423, + "step": 342 + }, + { + "epoch": 0.2768361581920904, + "grad_norm": 0.03903350234031677, + "learning_rate": 1.6982665408897468e-05, + "loss": 0.4568, + "step": 343 + }, + { + "epoch": 0.27764326069410816, + "grad_norm": 0.045728739351034164, + "learning_rate": 1.6963916553757204e-05, + "loss": 0.4824, + "step": 344 + }, + { + "epoch": 0.2784503631961259, + "grad_norm": 0.03853655606508255, + "learning_rate": 1.6945120048163092e-05, + "loss": 0.4289, + "step": 345 + }, + { + "epoch": 0.27925746569814364, + "grad_norm": 0.037414539605379105, + "learning_rate": 1.6926276020729836e-05, + "loss": 0.4266, + "step": 346 + }, + { + "epoch": 0.2800645682001614, + "grad_norm": 0.042327504605054855, + "learning_rate": 1.6907384600397295e-05, + "loss": 0.472, + "step": 347 + }, + { + "epoch": 0.28087167070217917, + "grad_norm": 0.039189938455820084, + "learning_rate": 1.6888445916429624e-05, + "loss": 0.5246, + "step": 348 + }, + { + "epoch": 0.28167877320419693, + "grad_norm": 0.04332740232348442, + "learning_rate": 1.686946009841437e-05, + "loss": 0.4247, + "step": 349 + }, + { + "epoch": 0.2824858757062147, + "grad_norm": 0.03479583561420441, + "learning_rate": 1.6850427276261596e-05, + "loss": 0.4145, + "step": 350 + }, + { + "epoch": 0.28329297820823246, + "grad_norm": 0.03648196533322334, + "learning_rate": 1.683134758020299e-05, + "loss": 0.4035, + "step": 351 + }, + { + "epoch": 0.2841000807102502, + "grad_norm": 0.03672903776168823, + "learning_rate": 1.681222114079098e-05, + "loss": 0.4395, + "step": 352 + }, + { + "epoch": 0.28490718321226793, + "grad_norm": 0.04137750715017319, + "learning_rate": 1.6793048088897827e-05, + "loss": 0.4235, + "step": 353 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.03800482675433159, + "learning_rate": 1.677382855571473e-05, + "loss": 0.4711, + "step": 354 + }, + { + "epoch": 0.28652138821630346, + "grad_norm": 0.03724050521850586, + "learning_rate": 1.675456267275096e-05, + "loss": 0.474, + "step": 355 + }, + { + "epoch": 0.28732849071832123, + "grad_norm": 0.040064744651317596, + "learning_rate": 1.6735250571832907e-05, + "loss": 0.467, + "step": 356 + }, + { + "epoch": 0.288135593220339, + "grad_norm": 0.038956187665462494, + "learning_rate": 1.6715892385103223e-05, + "loss": 0.4088, + "step": 357 + }, + { + "epoch": 0.28894269572235676, + "grad_norm": 0.03888452798128128, + "learning_rate": 1.6696488245019898e-05, + "loss": 0.4535, + "step": 358 + }, + { + "epoch": 0.2897497982243745, + "grad_norm": 0.040293607860803604, + "learning_rate": 1.6677038284355358e-05, + "loss": 0.4577, + "step": 359 + }, + { + "epoch": 0.29055690072639223, + "grad_norm": 0.03922054544091225, + "learning_rate": 1.6657542636195546e-05, + "loss": 0.4294, + "step": 360 + }, + { + "epoch": 0.29136400322841, + "grad_norm": 0.039267491549253464, + "learning_rate": 1.6638001433939036e-05, + "loss": 0.4304, + "step": 361 + }, + { + "epoch": 0.29217110573042776, + "grad_norm": 0.03777645528316498, + "learning_rate": 1.6618414811296094e-05, + "loss": 0.4243, + "step": 362 + }, + { + "epoch": 0.2929782082324455, + "grad_norm": 0.038406968116760254, + "learning_rate": 1.6598782902287775e-05, + "loss": 0.4831, + "step": 363 + }, + { + "epoch": 0.2937853107344633, + "grad_norm": 0.039704568684101105, + "learning_rate": 1.657910584124501e-05, + "loss": 0.4482, + "step": 364 + }, + { + "epoch": 0.29459241323648105, + "grad_norm": 0.04083388298749924, + "learning_rate": 1.655938376280768e-05, + "loss": 0.4234, + "step": 365 + }, + { + "epoch": 0.29539951573849876, + "grad_norm": 0.037917107343673706, + "learning_rate": 1.653961680192369e-05, + "loss": 0.3928, + "step": 366 + }, + { + "epoch": 0.29620661824051653, + "grad_norm": 0.0368562787771225, + "learning_rate": 1.6519805093848063e-05, + "loss": 0.439, + "step": 367 + }, + { + "epoch": 0.2970137207425343, + "grad_norm": 0.040610842406749725, + "learning_rate": 1.6499948774142e-05, + "loss": 0.4654, + "step": 368 + }, + { + "epoch": 0.29782082324455206, + "grad_norm": 0.040698591619729996, + "learning_rate": 1.648004797867195e-05, + "loss": 0.4465, + "step": 369 + }, + { + "epoch": 0.2986279257465698, + "grad_norm": 0.0374905951321125, + "learning_rate": 1.6460102843608692e-05, + "loss": 0.4665, + "step": 370 + }, + { + "epoch": 0.2994350282485876, + "grad_norm": 0.04037656635046005, + "learning_rate": 1.6440113505426397e-05, + "loss": 0.4234, + "step": 371 + }, + { + "epoch": 0.30024213075060535, + "grad_norm": 0.038281265646219254, + "learning_rate": 1.6420080100901693e-05, + "loss": 0.4879, + "step": 372 + }, + { + "epoch": 0.30104923325262306, + "grad_norm": 0.043232664465904236, + "learning_rate": 1.640000276711274e-05, + "loss": 0.4534, + "step": 373 + }, + { + "epoch": 0.3018563357546408, + "grad_norm": 0.039622269570827484, + "learning_rate": 1.637988164143827e-05, + "loss": 0.4077, + "step": 374 + }, + { + "epoch": 0.3026634382566586, + "grad_norm": 0.03671490401029587, + "learning_rate": 1.6359716861556668e-05, + "loss": 0.4214, + "step": 375 + }, + { + "epoch": 0.30347054075867635, + "grad_norm": 0.051524847745895386, + "learning_rate": 1.6339508565445016e-05, + "loss": 0.4914, + "step": 376 + }, + { + "epoch": 0.3042776432606941, + "grad_norm": 0.04071008041501045, + "learning_rate": 1.6319256891378166e-05, + "loss": 0.4077, + "step": 377 + }, + { + "epoch": 0.3050847457627119, + "grad_norm": 0.03799833729863167, + "learning_rate": 1.629896197792777e-05, + "loss": 0.3997, + "step": 378 + }, + { + "epoch": 0.3058918482647296, + "grad_norm": 0.03393981233239174, + "learning_rate": 1.627862396396135e-05, + "loss": 0.3734, + "step": 379 + }, + { + "epoch": 0.30669895076674736, + "grad_norm": 0.04095921292901039, + "learning_rate": 1.6258242988641346e-05, + "loss": 0.5031, + "step": 380 + }, + { + "epoch": 0.3075060532687651, + "grad_norm": 0.04322320595383644, + "learning_rate": 1.6237819191424155e-05, + "loss": 0.3861, + "step": 381 + }, + { + "epoch": 0.3083131557707829, + "grad_norm": 0.039408132433891296, + "learning_rate": 1.6217352712059186e-05, + "loss": 0.4431, + "step": 382 + }, + { + "epoch": 0.30912025827280065, + "grad_norm": 0.04281940311193466, + "learning_rate": 1.619684369058789e-05, + "loss": 0.4247, + "step": 383 + }, + { + "epoch": 0.3099273607748184, + "grad_norm": 0.03588446229696274, + "learning_rate": 1.617629226734283e-05, + "loss": 0.4107, + "step": 384 + }, + { + "epoch": 0.3107344632768362, + "grad_norm": 0.04118945822119713, + "learning_rate": 1.6155698582946678e-05, + "loss": 0.3958, + "step": 385 + }, + { + "epoch": 0.3115415657788539, + "grad_norm": 0.039396982640028, + "learning_rate": 1.6135062778311304e-05, + "loss": 0.4786, + "step": 386 + }, + { + "epoch": 0.31234866828087166, + "grad_norm": 0.03626495227217674, + "learning_rate": 1.6114384994636765e-05, + "loss": 0.3843, + "step": 387 + }, + { + "epoch": 0.3131557707828894, + "grad_norm": 0.04274310916662216, + "learning_rate": 1.6093665373410357e-05, + "loss": 0.488, + "step": 388 + }, + { + "epoch": 0.3139628732849072, + "grad_norm": 0.040091730654239655, + "learning_rate": 1.6072904056405673e-05, + "loss": 0.4844, + "step": 389 + }, + { + "epoch": 0.31476997578692495, + "grad_norm": 0.041715774685144424, + "learning_rate": 1.6052101185681576e-05, + "loss": 0.457, + "step": 390 + }, + { + "epoch": 0.3155770782889427, + "grad_norm": 0.03776617348194122, + "learning_rate": 1.6031256903581283e-05, + "loss": 0.4419, + "step": 391 + }, + { + "epoch": 0.3163841807909605, + "grad_norm": 0.03789287060499191, + "learning_rate": 1.601037135273136e-05, + "loss": 0.4598, + "step": 392 + }, + { + "epoch": 0.3171912832929782, + "grad_norm": 0.036457955837249756, + "learning_rate": 1.598944467604075e-05, + "loss": 0.3921, + "step": 393 + }, + { + "epoch": 0.31799838579499595, + "grad_norm": 0.03638232499361038, + "learning_rate": 1.5968477016699802e-05, + "loss": 0.4188, + "step": 394 + }, + { + "epoch": 0.3188054882970137, + "grad_norm": 0.03561306744813919, + "learning_rate": 1.594746851817929e-05, + "loss": 0.3882, + "step": 395 + }, + { + "epoch": 0.3196125907990315, + "grad_norm": 0.037340566515922546, + "learning_rate": 1.592641932422943e-05, + "loss": 0.4307, + "step": 396 + }, + { + "epoch": 0.32041969330104925, + "grad_norm": 0.037597768008708954, + "learning_rate": 1.5905329578878885e-05, + "loss": 0.4076, + "step": 397 + }, + { + "epoch": 0.321226795803067, + "grad_norm": 0.03577461093664169, + "learning_rate": 1.5884199426433804e-05, + "loss": 0.404, + "step": 398 + }, + { + "epoch": 0.3220338983050847, + "grad_norm": 0.05212230980396271, + "learning_rate": 1.5863029011476815e-05, + "loss": 0.5017, + "step": 399 + }, + { + "epoch": 0.3228410008071025, + "grad_norm": 0.03799346089363098, + "learning_rate": 1.584181847886604e-05, + "loss": 0.4341, + "step": 400 + }, + { + "epoch": 0.32364810330912025, + "grad_norm": 0.042107388377189636, + "learning_rate": 1.5820567973734103e-05, + "loss": 0.4576, + "step": 401 + }, + { + "epoch": 0.324455205811138, + "grad_norm": 0.03835953027009964, + "learning_rate": 1.579927764148714e-05, + "loss": 0.4717, + "step": 402 + }, + { + "epoch": 0.3252623083131558, + "grad_norm": 0.04675079137086868, + "learning_rate": 1.5777947627803818e-05, + "loss": 0.4853, + "step": 403 + }, + { + "epoch": 0.32606941081517354, + "grad_norm": 0.03846751153469086, + "learning_rate": 1.57565780786343e-05, + "loss": 0.4379, + "step": 404 + }, + { + "epoch": 0.3268765133171913, + "grad_norm": 0.04810432344675064, + "learning_rate": 1.5735169140199284e-05, + "loss": 0.4313, + "step": 405 + }, + { + "epoch": 0.327683615819209, + "grad_norm": 0.04163535684347153, + "learning_rate": 1.5713720958988985e-05, + "loss": 0.4422, + "step": 406 + }, + { + "epoch": 0.3284907183212268, + "grad_norm": 0.042189642786979675, + "learning_rate": 1.5692233681762137e-05, + "loss": 0.4573, + "step": 407 + }, + { + "epoch": 0.32929782082324455, + "grad_norm": 0.038153890520334244, + "learning_rate": 1.5670707455544983e-05, + "loss": 0.4377, + "step": 408 + }, + { + "epoch": 0.3301049233252623, + "grad_norm": 0.040447358042001724, + "learning_rate": 1.564914242763028e-05, + "loss": 0.4143, + "step": 409 + }, + { + "epoch": 0.3309120258272801, + "grad_norm": 0.03961403667926788, + "learning_rate": 1.562753874557628e-05, + "loss": 0.4291, + "step": 410 + }, + { + "epoch": 0.33171912832929784, + "grad_norm": 0.04563172534108162, + "learning_rate": 1.560589655720573e-05, + "loss": 0.4549, + "step": 411 + }, + { + "epoch": 0.33252623083131555, + "grad_norm": 0.04365930333733559, + "learning_rate": 1.5584216010604852e-05, + "loss": 0.4165, + "step": 412 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.03692984953522682, + "learning_rate": 1.5562497254122333e-05, + "loss": 0.3979, + "step": 413 + }, + { + "epoch": 0.3341404358353511, + "grad_norm": 0.039555348455905914, + "learning_rate": 1.5540740436368313e-05, + "loss": 0.4813, + "step": 414 + }, + { + "epoch": 0.33494753833736884, + "grad_norm": 0.03751777485013008, + "learning_rate": 1.551894570621336e-05, + "loss": 0.4167, + "step": 415 + }, + { + "epoch": 0.3357546408393866, + "grad_norm": 0.03755151107907295, + "learning_rate": 1.5497113212787463e-05, + "loss": 0.4185, + "step": 416 + }, + { + "epoch": 0.3365617433414044, + "grad_norm": 0.0399174690246582, + "learning_rate": 1.5475243105479002e-05, + "loss": 0.3922, + "step": 417 + }, + { + "epoch": 0.33736884584342214, + "grad_norm": 0.03419012576341629, + "learning_rate": 1.5453335533933733e-05, + "loss": 0.4386, + "step": 418 + }, + { + "epoch": 0.33817594834543985, + "grad_norm": 0.04388118535280228, + "learning_rate": 1.543139064805376e-05, + "loss": 0.5092, + "step": 419 + }, + { + "epoch": 0.3389830508474576, + "grad_norm": 0.04407074674963951, + "learning_rate": 1.54094085979965e-05, + "loss": 0.4565, + "step": 420 + }, + { + "epoch": 0.3397901533494754, + "grad_norm": 0.04343196749687195, + "learning_rate": 1.538738953417368e-05, + "loss": 0.4412, + "step": 421 + }, + { + "epoch": 0.34059725585149314, + "grad_norm": 0.03644689917564392, + "learning_rate": 1.5365333607250277e-05, + "loss": 0.4272, + "step": 422 + }, + { + "epoch": 0.3414043583535109, + "grad_norm": 0.04378410428762436, + "learning_rate": 1.5343240968143516e-05, + "loss": 0.4221, + "step": 423 + }, + { + "epoch": 0.34221146085552867, + "grad_norm": 0.03766017034649849, + "learning_rate": 1.532111176802182e-05, + "loss": 0.4441, + "step": 424 + }, + { + "epoch": 0.34301856335754644, + "grad_norm": 0.03888798505067825, + "learning_rate": 1.5298946158303776e-05, + "loss": 0.4014, + "step": 425 + }, + { + "epoch": 0.34382566585956414, + "grad_norm": 0.042303815484046936, + "learning_rate": 1.527674429065711e-05, + "loss": 0.4548, + "step": 426 + }, + { + "epoch": 0.3446327683615819, + "grad_norm": 0.040103986859321594, + "learning_rate": 1.5254506316997638e-05, + "loss": 0.4539, + "step": 427 + }, + { + "epoch": 0.3454398708635997, + "grad_norm": 0.04512703791260719, + "learning_rate": 1.5232232389488228e-05, + "loss": 0.4667, + "step": 428 + }, + { + "epoch": 0.34624697336561744, + "grad_norm": 0.0437651164829731, + "learning_rate": 1.520992266053777e-05, + "loss": 0.4823, + "step": 429 + }, + { + "epoch": 0.3470540758676352, + "grad_norm": 0.044296592473983765, + "learning_rate": 1.5187577282800114e-05, + "loss": 0.5237, + "step": 430 + }, + { + "epoch": 0.34786117836965297, + "grad_norm": 0.048969656229019165, + "learning_rate": 1.5165196409173044e-05, + "loss": 0.4724, + "step": 431 + }, + { + "epoch": 0.3486682808716707, + "grad_norm": 0.04471556469798088, + "learning_rate": 1.5142780192797222e-05, + "loss": 0.4207, + "step": 432 + }, + { + "epoch": 0.34947538337368844, + "grad_norm": 0.04149989038705826, + "learning_rate": 1.512032878705514e-05, + "loss": 0.483, + "step": 433 + }, + { + "epoch": 0.3502824858757062, + "grad_norm": 0.041714008897542953, + "learning_rate": 1.5097842345570075e-05, + "loss": 0.4601, + "step": 434 + }, + { + "epoch": 0.35108958837772397, + "grad_norm": 0.03949635103344917, + "learning_rate": 1.5075321022205032e-05, + "loss": 0.3904, + "step": 435 + }, + { + "epoch": 0.35189669087974174, + "grad_norm": 0.03717697039246559, + "learning_rate": 1.5052764971061696e-05, + "loss": 0.3682, + "step": 436 + }, + { + "epoch": 0.3527037933817595, + "grad_norm": 0.042058829218149185, + "learning_rate": 1.503017434647938e-05, + "loss": 0.4627, + "step": 437 + }, + { + "epoch": 0.35351089588377727, + "grad_norm": 0.03921369090676308, + "learning_rate": 1.5007549303033959e-05, + "loss": 0.4001, + "step": 438 + }, + { + "epoch": 0.354317998385795, + "grad_norm": 0.06680601090192795, + "learning_rate": 1.4984889995536825e-05, + "loss": 0.4442, + "step": 439 + }, + { + "epoch": 0.35512510088781274, + "grad_norm": 0.048763904720544815, + "learning_rate": 1.4962196579033813e-05, + "loss": 0.4623, + "step": 440 + }, + { + "epoch": 0.3559322033898305, + "grad_norm": 0.042305104434490204, + "learning_rate": 1.4939469208804152e-05, + "loss": 0.4807, + "step": 441 + }, + { + "epoch": 0.35673930589184827, + "grad_norm": 0.039217207580804825, + "learning_rate": 1.4916708040359403e-05, + "loss": 0.4262, + "step": 442 + }, + { + "epoch": 0.35754640839386603, + "grad_norm": 0.04164797067642212, + "learning_rate": 1.4893913229442383e-05, + "loss": 0.4513, + "step": 443 + }, + { + "epoch": 0.3583535108958838, + "grad_norm": 0.04174966737627983, + "learning_rate": 1.4871084932026114e-05, + "loss": 0.4474, + "step": 444 + }, + { + "epoch": 0.3591606133979015, + "grad_norm": 0.04466141760349274, + "learning_rate": 1.4848223304312738e-05, + "loss": 0.4859, + "step": 445 + }, + { + "epoch": 0.35996771589991927, + "grad_norm": 0.04579891636967659, + "learning_rate": 1.4825328502732477e-05, + "loss": 0.4946, + "step": 446 + }, + { + "epoch": 0.36077481840193704, + "grad_norm": 0.04515714570879936, + "learning_rate": 1.4802400683942526e-05, + "loss": 0.3916, + "step": 447 + }, + { + "epoch": 0.3615819209039548, + "grad_norm": 0.0440857857465744, + "learning_rate": 1.4779440004826006e-05, + "loss": 0.4027, + "step": 448 + }, + { + "epoch": 0.36238902340597257, + "grad_norm": 0.044258441776037216, + "learning_rate": 1.4756446622490894e-05, + "loss": 0.4781, + "step": 449 + }, + { + "epoch": 0.36319612590799033, + "grad_norm": 0.0432918481528759, + "learning_rate": 1.4733420694268917e-05, + "loss": 0.3359, + "step": 450 + }, + { + "epoch": 0.3640032284100081, + "grad_norm": 0.03601154685020447, + "learning_rate": 1.4710362377714516e-05, + "loss": 0.3822, + "step": 451 + }, + { + "epoch": 0.3648103309120258, + "grad_norm": 0.04147439822554588, + "learning_rate": 1.4687271830603744e-05, + "loss": 0.4054, + "step": 452 + }, + { + "epoch": 0.36561743341404357, + "grad_norm": 0.03952731937170029, + "learning_rate": 1.466414921093318e-05, + "loss": 0.4001, + "step": 453 + }, + { + "epoch": 0.36642453591606133, + "grad_norm": 0.04244634881615639, + "learning_rate": 1.4640994676918873e-05, + "loss": 0.4463, + "step": 454 + }, + { + "epoch": 0.3672316384180791, + "grad_norm": 0.03925754129886627, + "learning_rate": 1.4617808386995243e-05, + "loss": 0.3938, + "step": 455 + }, + { + "epoch": 0.36803874092009686, + "grad_norm": 0.03985142707824707, + "learning_rate": 1.4594590499813985e-05, + "loss": 0.4186, + "step": 456 + }, + { + "epoch": 0.3688458434221146, + "grad_norm": 0.04152598977088928, + "learning_rate": 1.4571341174243017e-05, + "loss": 0.4217, + "step": 457 + }, + { + "epoch": 0.3696529459241324, + "grad_norm": 0.043532952666282654, + "learning_rate": 1.4548060569365358e-05, + "loss": 0.4714, + "step": 458 + }, + { + "epoch": 0.3704600484261501, + "grad_norm": 0.04257139936089516, + "learning_rate": 1.4524748844478065e-05, + "loss": 0.4244, + "step": 459 + }, + { + "epoch": 0.37126715092816787, + "grad_norm": 0.04078930243849754, + "learning_rate": 1.4501406159091132e-05, + "loss": 0.3825, + "step": 460 + }, + { + "epoch": 0.37207425343018563, + "grad_norm": 0.04355788230895996, + "learning_rate": 1.4478032672926389e-05, + "loss": 0.4495, + "step": 461 + }, + { + "epoch": 0.3728813559322034, + "grad_norm": 0.04207145422697067, + "learning_rate": 1.445462854591644e-05, + "loss": 0.4162, + "step": 462 + }, + { + "epoch": 0.37368845843422116, + "grad_norm": 0.04463491588830948, + "learning_rate": 1.4431193938203524e-05, + "loss": 0.4389, + "step": 463 + }, + { + "epoch": 0.3744955609362389, + "grad_norm": 0.045574747025966644, + "learning_rate": 1.4407729010138463e-05, + "loss": 0.4531, + "step": 464 + }, + { + "epoch": 0.37530266343825663, + "grad_norm": 0.0455825999379158, + "learning_rate": 1.4384233922279538e-05, + "loss": 0.4312, + "step": 465 + }, + { + "epoch": 0.3761097659402744, + "grad_norm": 0.04084734618663788, + "learning_rate": 1.4360708835391392e-05, + "loss": 0.4295, + "step": 466 + }, + { + "epoch": 0.37691686844229216, + "grad_norm": 0.04858178272843361, + "learning_rate": 1.433715391044395e-05, + "loss": 0.519, + "step": 467 + }, + { + "epoch": 0.37772397094430993, + "grad_norm": 0.03454524278640747, + "learning_rate": 1.4313569308611289e-05, + "loss": 0.3733, + "step": 468 + }, + { + "epoch": 0.3785310734463277, + "grad_norm": 0.03903524577617645, + "learning_rate": 1.4289955191270555e-05, + "loss": 0.3699, + "step": 469 + }, + { + "epoch": 0.37933817594834546, + "grad_norm": 0.05262704938650131, + "learning_rate": 1.4266311720000857e-05, + "loss": 0.5171, + "step": 470 + }, + { + "epoch": 0.3801452784503632, + "grad_norm": 0.0390932559967041, + "learning_rate": 1.4242639056582155e-05, + "loss": 0.3953, + "step": 471 + }, + { + "epoch": 0.38095238095238093, + "grad_norm": 0.04563722386956215, + "learning_rate": 1.4218937362994152e-05, + "loss": 0.502, + "step": 472 + }, + { + "epoch": 0.3817594834543987, + "grad_norm": 0.03626189008355141, + "learning_rate": 1.419520680141519e-05, + "loss": 0.4058, + "step": 473 + }, + { + "epoch": 0.38256658595641646, + "grad_norm": 0.04210507124662399, + "learning_rate": 1.4171447534221147e-05, + "loss": 0.4653, + "step": 474 + }, + { + "epoch": 0.3833736884584342, + "grad_norm": 0.046014368534088135, + "learning_rate": 1.4147659723984313e-05, + "loss": 0.4791, + "step": 475 + }, + { + "epoch": 0.384180790960452, + "grad_norm": 0.041533246636390686, + "learning_rate": 1.412384353347228e-05, + "loss": 0.402, + "step": 476 + }, + { + "epoch": 0.38498789346246975, + "grad_norm": 0.041039034724235535, + "learning_rate": 1.4099999125646839e-05, + "loss": 0.4235, + "step": 477 + }, + { + "epoch": 0.38579499596448746, + "grad_norm": 0.04687739536166191, + "learning_rate": 1.407612666366285e-05, + "loss": 0.4187, + "step": 478 + }, + { + "epoch": 0.38660209846650523, + "grad_norm": 0.04226434975862503, + "learning_rate": 1.4052226310867138e-05, + "loss": 0.437, + "step": 479 + }, + { + "epoch": 0.387409200968523, + "grad_norm": 0.03916895389556885, + "learning_rate": 1.4028298230797371e-05, + "loss": 0.3641, + "step": 480 + }, + { + "epoch": 0.38821630347054076, + "grad_norm": 0.03749072924256325, + "learning_rate": 1.4004342587180934e-05, + "loss": 0.4067, + "step": 481 + }, + { + "epoch": 0.3890234059725585, + "grad_norm": 0.03737153112888336, + "learning_rate": 1.3980359543933822e-05, + "loss": 0.3473, + "step": 482 + }, + { + "epoch": 0.3898305084745763, + "grad_norm": 0.04256528615951538, + "learning_rate": 1.3956349265159508e-05, + "loss": 0.3997, + "step": 483 + }, + { + "epoch": 0.39063761097659405, + "grad_norm": 0.0409972257912159, + "learning_rate": 1.3932311915147822e-05, + "loss": 0.4516, + "step": 484 + }, + { + "epoch": 0.39144471347861176, + "grad_norm": 0.04119865968823433, + "learning_rate": 1.3908247658373835e-05, + "loss": 0.4282, + "step": 485 + }, + { + "epoch": 0.3922518159806295, + "grad_norm": 0.041595458984375, + "learning_rate": 1.3884156659496715e-05, + "loss": 0.4043, + "step": 486 + }, + { + "epoch": 0.3930589184826473, + "grad_norm": 0.035765454173088074, + "learning_rate": 1.3860039083358628e-05, + "loss": 0.4139, + "step": 487 + }, + { + "epoch": 0.39386602098466506, + "grad_norm": 0.03818071261048317, + "learning_rate": 1.3835895094983578e-05, + "loss": 0.3724, + "step": 488 + }, + { + "epoch": 0.3946731234866828, + "grad_norm": 0.03883254528045654, + "learning_rate": 1.3811724859576305e-05, + "loss": 0.446, + "step": 489 + }, + { + "epoch": 0.3954802259887006, + "grad_norm": 0.044025968760252, + "learning_rate": 1.3787528542521145e-05, + "loss": 0.4644, + "step": 490 + }, + { + "epoch": 0.3962873284907183, + "grad_norm": 0.03932966664433479, + "learning_rate": 1.376330630938089e-05, + "loss": 0.4721, + "step": 491 + }, + { + "epoch": 0.39709443099273606, + "grad_norm": 0.039546482264995575, + "learning_rate": 1.3739058325895663e-05, + "loss": 0.4095, + "step": 492 + }, + { + "epoch": 0.3979015334947538, + "grad_norm": 0.0403420627117157, + "learning_rate": 1.371478475798179e-05, + "loss": 0.4073, + "step": 493 + }, + { + "epoch": 0.3987086359967716, + "grad_norm": 0.042821839451789856, + "learning_rate": 1.3690485771730649e-05, + "loss": 0.5032, + "step": 494 + }, + { + "epoch": 0.39951573849878935, + "grad_norm": 0.04345512017607689, + "learning_rate": 1.3666161533407551e-05, + "loss": 0.4399, + "step": 495 + }, + { + "epoch": 0.4003228410008071, + "grad_norm": 0.04715168476104736, + "learning_rate": 1.3641812209450585e-05, + "loss": 0.4391, + "step": 496 + }, + { + "epoch": 0.4011299435028249, + "grad_norm": 0.04834922403097153, + "learning_rate": 1.3617437966469496e-05, + "loss": 0.4231, + "step": 497 + }, + { + "epoch": 0.4019370460048426, + "grad_norm": 0.04127978906035423, + "learning_rate": 1.3593038971244534e-05, + "loss": 0.4376, + "step": 498 + }, + { + "epoch": 0.40274414850686036, + "grad_norm": 0.049879882484674454, + "learning_rate": 1.3568615390725311e-05, + "loss": 0.4751, + "step": 499 + }, + { + "epoch": 0.4035512510088781, + "grad_norm": 0.04311080649495125, + "learning_rate": 1.3544167392029675e-05, + "loss": 0.4184, + "step": 500 + }, + { + "epoch": 0.4043583535108959, + "grad_norm": 0.04278581589460373, + "learning_rate": 1.3519695142442539e-05, + "loss": 0.3598, + "step": 501 + }, + { + "epoch": 0.40516545601291365, + "grad_norm": 0.03896728530526161, + "learning_rate": 1.3495198809414764e-05, + "loss": 0.3933, + "step": 502 + }, + { + "epoch": 0.4059725585149314, + "grad_norm": 0.03846316412091255, + "learning_rate": 1.3470678560562003e-05, + "loss": 0.3793, + "step": 503 + }, + { + "epoch": 0.4067796610169492, + "grad_norm": 0.04504529759287834, + "learning_rate": 1.344613456366354e-05, + "loss": 0.4258, + "step": 504 + }, + { + "epoch": 0.4075867635189669, + "grad_norm": 0.04523047059774399, + "learning_rate": 1.3421566986661166e-05, + "loss": 0.382, + "step": 505 + }, + { + "epoch": 0.40839386602098465, + "grad_norm": 0.0470433384180069, + "learning_rate": 1.3396975997658013e-05, + "loss": 0.4093, + "step": 506 + }, + { + "epoch": 0.4092009685230024, + "grad_norm": 0.041309718042612076, + "learning_rate": 1.337236176491741e-05, + "loss": 0.4026, + "step": 507 + }, + { + "epoch": 0.4100080710250202, + "grad_norm": 0.04193636029958725, + "learning_rate": 1.3347724456861735e-05, + "loss": 0.384, + "step": 508 + }, + { + "epoch": 0.41081517352703795, + "grad_norm": 0.04549890384078026, + "learning_rate": 1.3323064242071248e-05, + "loss": 0.4228, + "step": 509 + }, + { + "epoch": 0.4116222760290557, + "grad_norm": 0.0428113155066967, + "learning_rate": 1.3298381289282964e-05, + "loss": 0.4309, + "step": 510 + }, + { + "epoch": 0.4124293785310734, + "grad_norm": 0.04721170291304588, + "learning_rate": 1.327367576738947e-05, + "loss": 0.41, + "step": 511 + }, + { + "epoch": 0.4132364810330912, + "grad_norm": 0.047412823885679245, + "learning_rate": 1.3248947845437776e-05, + "loss": 0.5009, + "step": 512 + }, + { + "epoch": 0.41404358353510895, + "grad_norm": 0.04840272292494774, + "learning_rate": 1.3224197692628185e-05, + "loss": 0.4325, + "step": 513 + }, + { + "epoch": 0.4148506860371267, + "grad_norm": 0.04272138699889183, + "learning_rate": 1.3199425478313093e-05, + "loss": 0.3978, + "step": 514 + }, + { + "epoch": 0.4156577885391445, + "grad_norm": 0.04556628689169884, + "learning_rate": 1.3174631371995861e-05, + "loss": 0.447, + "step": 515 + }, + { + "epoch": 0.41646489104116224, + "grad_norm": 0.040329255163669586, + "learning_rate": 1.3149815543329652e-05, + "loss": 0.4256, + "step": 516 + }, + { + "epoch": 0.41727199354318, + "grad_norm": 0.039529670029878616, + "learning_rate": 1.3124978162116245e-05, + "loss": 0.4201, + "step": 517 + }, + { + "epoch": 0.4180790960451977, + "grad_norm": 0.03994109109044075, + "learning_rate": 1.3100119398304911e-05, + "loss": 0.3687, + "step": 518 + }, + { + "epoch": 0.4188861985472155, + "grad_norm": 0.04244811832904816, + "learning_rate": 1.3075239421991221e-05, + "loss": 0.4158, + "step": 519 + }, + { + "epoch": 0.41969330104923325, + "grad_norm": 0.03751133754849434, + "learning_rate": 1.3050338403415892e-05, + "loss": 0.4107, + "step": 520 + }, + { + "epoch": 0.420500403551251, + "grad_norm": 0.043357253074645996, + "learning_rate": 1.3025416512963628e-05, + "loss": 0.4352, + "step": 521 + }, + { + "epoch": 0.4213075060532688, + "grad_norm": 0.038147930055856705, + "learning_rate": 1.3000473921161947e-05, + "loss": 0.3219, + "step": 522 + }, + { + "epoch": 0.42211460855528654, + "grad_norm": 0.04303905740380287, + "learning_rate": 1.297551079868001e-05, + "loss": 0.5032, + "step": 523 + }, + { + "epoch": 0.42292171105730425, + "grad_norm": 0.042353641241788864, + "learning_rate": 1.2950527316327457e-05, + "loss": 0.4332, + "step": 524 + }, + { + "epoch": 0.423728813559322, + "grad_norm": 0.04321341589093208, + "learning_rate": 1.2925523645053253e-05, + "loss": 0.4201, + "step": 525 + }, + { + "epoch": 0.4245359160613398, + "grad_norm": 0.04340353608131409, + "learning_rate": 1.2900499955944495e-05, + "loss": 0.4224, + "step": 526 + }, + { + "epoch": 0.42534301856335754, + "grad_norm": 0.04402042180299759, + "learning_rate": 1.2875456420225251e-05, + "loss": 0.4612, + "step": 527 + }, + { + "epoch": 0.4261501210653753, + "grad_norm": 0.0438569076359272, + "learning_rate": 1.28503932092554e-05, + "loss": 0.3924, + "step": 528 + }, + { + "epoch": 0.4269572235673931, + "grad_norm": 0.042652543634176254, + "learning_rate": 1.2825310494529435e-05, + "loss": 0.4295, + "step": 529 + }, + { + "epoch": 0.42776432606941084, + "grad_norm": 0.03889980539679527, + "learning_rate": 1.2800208447675307e-05, + "loss": 0.4072, + "step": 530 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 0.039839811623096466, + "learning_rate": 1.2775087240453252e-05, + "loss": 0.4294, + "step": 531 + }, + { + "epoch": 0.4293785310734463, + "grad_norm": 0.042306624352931976, + "learning_rate": 1.2749947044754605e-05, + "loss": 0.3922, + "step": 532 + }, + { + "epoch": 0.4301856335754641, + "grad_norm": 0.045233551412820816, + "learning_rate": 1.272478803260063e-05, + "loss": 0.3839, + "step": 533 + }, + { + "epoch": 0.43099273607748184, + "grad_norm": 0.04435190185904503, + "learning_rate": 1.2699610376141345e-05, + "loss": 0.4093, + "step": 534 + }, + { + "epoch": 0.4317998385794996, + "grad_norm": 0.04015007242560387, + "learning_rate": 1.2674414247654338e-05, + "loss": 0.3753, + "step": 535 + }, + { + "epoch": 0.43260694108151737, + "grad_norm": 0.04248462989926338, + "learning_rate": 1.2649199819543595e-05, + "loss": 0.416, + "step": 536 + }, + { + "epoch": 0.43341404358353514, + "grad_norm": 0.04055071994662285, + "learning_rate": 1.2623967264338315e-05, + "loss": 0.3541, + "step": 537 + }, + { + "epoch": 0.43422114608555284, + "grad_norm": 0.04720915108919144, + "learning_rate": 1.2598716754691728e-05, + "loss": 0.4218, + "step": 538 + }, + { + "epoch": 0.4350282485875706, + "grad_norm": 0.0534139946103096, + "learning_rate": 1.2573448463379923e-05, + "loss": 0.42, + "step": 539 + }, + { + "epoch": 0.4358353510895884, + "grad_norm": 0.04554564505815506, + "learning_rate": 1.2548162563300653e-05, + "loss": 0.4565, + "step": 540 + }, + { + "epoch": 0.43664245359160614, + "grad_norm": 0.04438880831003189, + "learning_rate": 1.2522859227472165e-05, + "loss": 0.4409, + "step": 541 + }, + { + "epoch": 0.4374495560936239, + "grad_norm": 0.046392470598220825, + "learning_rate": 1.2497538629032004e-05, + "loss": 0.3867, + "step": 542 + }, + { + "epoch": 0.43825665859564167, + "grad_norm": 0.045731160789728165, + "learning_rate": 1.2472200941235839e-05, + "loss": 0.428, + "step": 543 + }, + { + "epoch": 0.4390637610976594, + "grad_norm": 0.04255305975675583, + "learning_rate": 1.2446846337456273e-05, + "loss": 0.4047, + "step": 544 + }, + { + "epoch": 0.43987086359967714, + "grad_norm": 0.03939209133386612, + "learning_rate": 1.242147499118165e-05, + "loss": 0.3873, + "step": 545 + }, + { + "epoch": 0.4406779661016949, + "grad_norm": 0.045134663581848145, + "learning_rate": 1.2396087076014882e-05, + "loss": 0.4079, + "step": 546 + }, + { + "epoch": 0.44148506860371267, + "grad_norm": 0.04972568526864052, + "learning_rate": 1.2370682765672242e-05, + "loss": 0.418, + "step": 547 + }, + { + "epoch": 0.44229217110573044, + "grad_norm": 0.041813429445028305, + "learning_rate": 1.23452622339822e-05, + "loss": 0.388, + "step": 548 + }, + { + "epoch": 0.4430992736077482, + "grad_norm": 0.040732573717832565, + "learning_rate": 1.2319825654884215e-05, + "loss": 0.4179, + "step": 549 + }, + { + "epoch": 0.44390637610976597, + "grad_norm": 0.047029539942741394, + "learning_rate": 1.2294373202427545e-05, + "loss": 0.4755, + "step": 550 + }, + { + "epoch": 0.4447134786117837, + "grad_norm": 0.042257849127054214, + "learning_rate": 1.226890505077007e-05, + "loss": 0.411, + "step": 551 + }, + { + "epoch": 0.44552058111380144, + "grad_norm": 0.042869675904512405, + "learning_rate": 1.224342137417708e-05, + "loss": 0.4055, + "step": 552 + }, + { + "epoch": 0.4463276836158192, + "grad_norm": 0.049996279180049896, + "learning_rate": 1.2217922347020104e-05, + "loss": 0.4589, + "step": 553 + }, + { + "epoch": 0.44713478611783697, + "grad_norm": 0.045039620250463486, + "learning_rate": 1.2192408143775708e-05, + "loss": 0.3789, + "step": 554 + }, + { + "epoch": 0.44794188861985473, + "grad_norm": 0.045354608446359634, + "learning_rate": 1.2166878939024292e-05, + "loss": 0.4314, + "step": 555 + }, + { + "epoch": 0.4487489911218725, + "grad_norm": 0.043889421969652176, + "learning_rate": 1.2141334907448907e-05, + "loss": 0.4507, + "step": 556 + }, + { + "epoch": 0.4495560936238902, + "grad_norm": 0.0434400737285614, + "learning_rate": 1.2115776223834056e-05, + "loss": 0.3997, + "step": 557 + }, + { + "epoch": 0.45036319612590797, + "grad_norm": 0.04490221291780472, + "learning_rate": 1.2090203063064503e-05, + "loss": 0.4415, + "step": 558 + }, + { + "epoch": 0.45117029862792574, + "grad_norm": 0.0410829521715641, + "learning_rate": 1.2064615600124064e-05, + "loss": 0.4138, + "step": 559 + }, + { + "epoch": 0.4519774011299435, + "grad_norm": 0.04448908194899559, + "learning_rate": 1.2039014010094418e-05, + "loss": 0.3875, + "step": 560 + }, + { + "epoch": 0.45278450363196127, + "grad_norm": 0.04140767827630043, + "learning_rate": 1.2013398468153921e-05, + "loss": 0.3939, + "step": 561 + }, + { + "epoch": 0.45359160613397903, + "grad_norm": 0.04112355411052704, + "learning_rate": 1.1987769149576372e-05, + "loss": 0.3972, + "step": 562 + }, + { + "epoch": 0.4543987086359968, + "grad_norm": 0.04010258987545967, + "learning_rate": 1.1962126229729856e-05, + "loss": 0.4379, + "step": 563 + }, + { + "epoch": 0.4552058111380145, + "grad_norm": 0.04502004757523537, + "learning_rate": 1.193646988407552e-05, + "loss": 0.3902, + "step": 564 + }, + { + "epoch": 0.45601291364003227, + "grad_norm": 0.04058624431490898, + "learning_rate": 1.1910800288166366e-05, + "loss": 0.3886, + "step": 565 + }, + { + "epoch": 0.45682001614205003, + "grad_norm": 0.04509864002466202, + "learning_rate": 1.1885117617646078e-05, + "loss": 0.4397, + "step": 566 + }, + { + "epoch": 0.4576271186440678, + "grad_norm": 0.04298222064971924, + "learning_rate": 1.1859422048247788e-05, + "loss": 0.4143, + "step": 567 + }, + { + "epoch": 0.45843422114608556, + "grad_norm": 0.04298047721385956, + "learning_rate": 1.1833713755792895e-05, + "loss": 0.459, + "step": 568 + }, + { + "epoch": 0.45924132364810333, + "grad_norm": 0.046091094613075256, + "learning_rate": 1.1807992916189856e-05, + "loss": 0.4352, + "step": 569 + }, + { + "epoch": 0.4600484261501211, + "grad_norm": 0.0470380075275898, + "learning_rate": 1.178225970543298e-05, + "loss": 0.4522, + "step": 570 + }, + { + "epoch": 0.4608555286521388, + "grad_norm": 0.04680854454636574, + "learning_rate": 1.1756514299601218e-05, + "loss": 0.4675, + "step": 571 + }, + { + "epoch": 0.46166263115415657, + "grad_norm": 0.04647412523627281, + "learning_rate": 1.1730756874856983e-05, + "loss": 0.4437, + "step": 572 + }, + { + "epoch": 0.46246973365617433, + "grad_norm": 0.041261084377765656, + "learning_rate": 1.1704987607444911e-05, + "loss": 0.4302, + "step": 573 + }, + { + "epoch": 0.4632768361581921, + "grad_norm": 0.04350990429520607, + "learning_rate": 1.167920667369068e-05, + "loss": 0.3811, + "step": 574 + }, + { + "epoch": 0.46408393866020986, + "grad_norm": 0.045072976499795914, + "learning_rate": 1.1653414249999786e-05, + "loss": 0.3791, + "step": 575 + }, + { + "epoch": 0.4648910411622276, + "grad_norm": 0.039517756551504135, + "learning_rate": 1.1627610512856355e-05, + "loss": 0.3528, + "step": 576 + }, + { + "epoch": 0.46569814366424533, + "grad_norm": 0.0447382777929306, + "learning_rate": 1.1601795638821923e-05, + "loss": 0.408, + "step": 577 + }, + { + "epoch": 0.4665052461662631, + "grad_norm": 0.041687775403261185, + "learning_rate": 1.1575969804534222e-05, + "loss": 0.3981, + "step": 578 + }, + { + "epoch": 0.46731234866828086, + "grad_norm": 0.041715968400239944, + "learning_rate": 1.1550133186705987e-05, + "loss": 0.3914, + "step": 579 + }, + { + "epoch": 0.46811945117029863, + "grad_norm": 0.04036974161863327, + "learning_rate": 1.1524285962123738e-05, + "loss": 0.3387, + "step": 580 + }, + { + "epoch": 0.4689265536723164, + "grad_norm": 0.04490208625793457, + "learning_rate": 1.1498428307646568e-05, + "loss": 0.4202, + "step": 581 + }, + { + "epoch": 0.46973365617433416, + "grad_norm": 0.041817329823970795, + "learning_rate": 1.1472560400204943e-05, + "loss": 0.3995, + "step": 582 + }, + { + "epoch": 0.4705407586763519, + "grad_norm": 0.042354173958301544, + "learning_rate": 1.1446682416799474e-05, + "loss": 0.4135, + "step": 583 + }, + { + "epoch": 0.47134786117836963, + "grad_norm": 0.04079662635922432, + "learning_rate": 1.1420794534499729e-05, + "loss": 0.3493, + "step": 584 + }, + { + "epoch": 0.4721549636803874, + "grad_norm": 0.039188142865896225, + "learning_rate": 1.1394896930442998e-05, + "loss": 0.3186, + "step": 585 + }, + { + "epoch": 0.47296206618240516, + "grad_norm": 0.04252387955784798, + "learning_rate": 1.1368989781833102e-05, + "loss": 0.3874, + "step": 586 + }, + { + "epoch": 0.4737691686844229, + "grad_norm": 0.041413117200136185, + "learning_rate": 1.1343073265939161e-05, + "loss": 0.4033, + "step": 587 + }, + { + "epoch": 0.4745762711864407, + "grad_norm": 0.04359855130314827, + "learning_rate": 1.1317147560094394e-05, + "loss": 0.4151, + "step": 588 + }, + { + "epoch": 0.47538337368845845, + "grad_norm": 0.04194974526762962, + "learning_rate": 1.1291212841694907e-05, + "loss": 0.3966, + "step": 589 + }, + { + "epoch": 0.47619047619047616, + "grad_norm": 0.051467522978782654, + "learning_rate": 1.126526928819847e-05, + "loss": 0.4678, + "step": 590 + }, + { + "epoch": 0.47699757869249393, + "grad_norm": 0.047667261213064194, + "learning_rate": 1.1239317077123301e-05, + "loss": 0.4456, + "step": 591 + }, + { + "epoch": 0.4778046811945117, + "grad_norm": 0.044262733310461044, + "learning_rate": 1.1213356386046871e-05, + "loss": 0.4349, + "step": 592 + }, + { + "epoch": 0.47861178369652946, + "grad_norm": 0.04664575308561325, + "learning_rate": 1.1187387392604665e-05, + "loss": 0.4318, + "step": 593 + }, + { + "epoch": 0.4794188861985472, + "grad_norm": 0.043433092534542084, + "learning_rate": 1.1161410274488975e-05, + "loss": 0.4258, + "step": 594 + }, + { + "epoch": 0.480225988700565, + "grad_norm": 0.04289019852876663, + "learning_rate": 1.1135425209447692e-05, + "loss": 0.4328, + "step": 595 + }, + { + "epoch": 0.48103309120258275, + "grad_norm": 0.04451136663556099, + "learning_rate": 1.1109432375283087e-05, + "loss": 0.4602, + "step": 596 + }, + { + "epoch": 0.48184019370460046, + "grad_norm": 0.04060007631778717, + "learning_rate": 1.108343194985058e-05, + "loss": 0.3685, + "step": 597 + }, + { + "epoch": 0.4826472962066182, + "grad_norm": 0.043877966701984406, + "learning_rate": 1.105742411105754e-05, + "loss": 0.4037, + "step": 598 + }, + { + "epoch": 0.483454398708636, + "grad_norm": 0.04378955066204071, + "learning_rate": 1.1031409036862066e-05, + "loss": 0.4241, + "step": 599 + }, + { + "epoch": 0.48426150121065376, + "grad_norm": 0.04598834738135338, + "learning_rate": 1.100538690527176e-05, + "loss": 0.4278, + "step": 600 + }, + { + "epoch": 0.4850686037126715, + "grad_norm": 0.04028361290693283, + "learning_rate": 1.097935789434251e-05, + "loss": 0.3873, + "step": 601 + }, + { + "epoch": 0.4858757062146893, + "grad_norm": 0.0447799488902092, + "learning_rate": 1.0953322182177289e-05, + "loss": 0.4208, + "step": 602 + }, + { + "epoch": 0.48668280871670705, + "grad_norm": 0.0401802696287632, + "learning_rate": 1.0927279946924907e-05, + "loss": 0.3745, + "step": 603 + }, + { + "epoch": 0.48748991121872476, + "grad_norm": 0.049999117851257324, + "learning_rate": 1.0901231366778817e-05, + "loss": 0.4353, + "step": 604 + }, + { + "epoch": 0.4882970137207425, + "grad_norm": 0.04383356496691704, + "learning_rate": 1.087517661997589e-05, + "loss": 0.3881, + "step": 605 + }, + { + "epoch": 0.4891041162227603, + "grad_norm": 0.039079997688531876, + "learning_rate": 1.0849115884795186e-05, + "loss": 0.3434, + "step": 606 + }, + { + "epoch": 0.48991121872477805, + "grad_norm": 0.04208359494805336, + "learning_rate": 1.0823049339556739e-05, + "loss": 0.4092, + "step": 607 + }, + { + "epoch": 0.4907183212267958, + "grad_norm": 0.041914671659469604, + "learning_rate": 1.0796977162620343e-05, + "loss": 0.3685, + "step": 608 + }, + { + "epoch": 0.4915254237288136, + "grad_norm": 0.0390278585255146, + "learning_rate": 1.0770899532384329e-05, + "loss": 0.3857, + "step": 609 + }, + { + "epoch": 0.4923325262308313, + "grad_norm": 0.04671277478337288, + "learning_rate": 1.0744816627284337e-05, + "loss": 0.4768, + "step": 610 + }, + { + "epoch": 0.49313962873284906, + "grad_norm": 0.045115381479263306, + "learning_rate": 1.0718728625792095e-05, + "loss": 0.4087, + "step": 611 + }, + { + "epoch": 0.4939467312348668, + "grad_norm": 0.052573543041944504, + "learning_rate": 1.0692635706414217e-05, + "loss": 0.4743, + "step": 612 + }, + { + "epoch": 0.4947538337368846, + "grad_norm": 0.045512620359659195, + "learning_rate": 1.0666538047690956e-05, + "loss": 0.3968, + "step": 613 + }, + { + "epoch": 0.49556093623890235, + "grad_norm": 0.04206302762031555, + "learning_rate": 1.0640435828194995e-05, + "loss": 0.4065, + "step": 614 + }, + { + "epoch": 0.4963680387409201, + "grad_norm": 0.04784831777215004, + "learning_rate": 1.0614329226530238e-05, + "loss": 0.4117, + "step": 615 + }, + { + "epoch": 0.4971751412429379, + "grad_norm": 0.0442197360098362, + "learning_rate": 1.058821842133055e-05, + "loss": 0.387, + "step": 616 + }, + { + "epoch": 0.4979822437449556, + "grad_norm": 0.04519395902752876, + "learning_rate": 1.0562103591258577e-05, + "loss": 0.3652, + "step": 617 + }, + { + "epoch": 0.49878934624697335, + "grad_norm": 0.044820185750722885, + "learning_rate": 1.05359849150045e-05, + "loss": 0.4579, + "step": 618 + }, + { + "epoch": 0.4995964487489911, + "grad_norm": 0.04219202697277069, + "learning_rate": 1.0509862571284812e-05, + "loss": 0.4186, + "step": 619 + }, + { + "epoch": 0.5004035512510089, + "grad_norm": 0.03920457139611244, + "learning_rate": 1.0483736738841115e-05, + "loss": 0.3337, + "step": 620 + }, + { + "epoch": 0.5012106537530266, + "grad_norm": 0.04564947262406349, + "learning_rate": 1.0457607596438864e-05, + "loss": 0.4153, + "step": 621 + }, + { + "epoch": 0.5020177562550444, + "grad_norm": 0.04112798348069191, + "learning_rate": 1.0431475322866174e-05, + "loss": 0.4401, + "step": 622 + }, + { + "epoch": 0.5028248587570622, + "grad_norm": 0.0465184785425663, + "learning_rate": 1.040534009693258e-05, + "loss": 0.4448, + "step": 623 + }, + { + "epoch": 0.5036319612590799, + "grad_norm": 0.03798345848917961, + "learning_rate": 1.0379202097467825e-05, + "loss": 0.3517, + "step": 624 + }, + { + "epoch": 0.5044390637610977, + "grad_norm": 0.04183805361390114, + "learning_rate": 1.035306150332062e-05, + "loss": 0.4508, + "step": 625 + }, + { + "epoch": 0.5052461662631155, + "grad_norm": 0.04639915004372597, + "learning_rate": 1.032691849335744e-05, + "loss": 0.3802, + "step": 626 + }, + { + "epoch": 0.5060532687651331, + "grad_norm": 0.044055815786123276, + "learning_rate": 1.0300773246461286e-05, + "loss": 0.4868, + "step": 627 + }, + { + "epoch": 0.5068603712671509, + "grad_norm": 0.03922991082072258, + "learning_rate": 1.0274625941530464e-05, + "loss": 0.3864, + "step": 628 + }, + { + "epoch": 0.5076674737691687, + "grad_norm": 0.04367651045322418, + "learning_rate": 1.024847675747736e-05, + "loss": 0.4214, + "step": 629 + }, + { + "epoch": 0.5084745762711864, + "grad_norm": 0.04028239846229553, + "learning_rate": 1.0222325873227223e-05, + "loss": 0.3886, + "step": 630 + }, + { + "epoch": 0.5092816787732042, + "grad_norm": 0.04197577387094498, + "learning_rate": 1.0196173467716935e-05, + "loss": 0.3542, + "step": 631 + }, + { + "epoch": 0.510088781275222, + "grad_norm": 0.0527801550924778, + "learning_rate": 1.017001971989378e-05, + "loss": 0.4785, + "step": 632 + }, + { + "epoch": 0.5108958837772397, + "grad_norm": 0.04306379333138466, + "learning_rate": 1.0143864808714238e-05, + "loss": 0.4334, + "step": 633 + }, + { + "epoch": 0.5117029862792575, + "grad_norm": 0.03770716115832329, + "learning_rate": 1.0117708913142744e-05, + "loss": 0.3406, + "step": 634 + }, + { + "epoch": 0.5125100887812752, + "grad_norm": 0.04461565613746643, + "learning_rate": 1.0091552212150464e-05, + "loss": 0.3929, + "step": 635 + }, + { + "epoch": 0.513317191283293, + "grad_norm": 0.04541825130581856, + "learning_rate": 1.0065394884714077e-05, + "loss": 0.4846, + "step": 636 + }, + { + "epoch": 0.5141242937853108, + "grad_norm": 0.04394349828362465, + "learning_rate": 1.0039237109814559e-05, + "loss": 0.4096, + "step": 637 + }, + { + "epoch": 0.5149313962873285, + "grad_norm": 0.04815034940838814, + "learning_rate": 1.0013079066435933e-05, + "loss": 0.4932, + "step": 638 + }, + { + "epoch": 0.5157384987893463, + "grad_norm": 0.04465879872441292, + "learning_rate": 9.986920933564069e-06, + "loss": 0.4452, + "step": 639 + }, + { + "epoch": 0.516545601291364, + "grad_norm": 0.046694882214069366, + "learning_rate": 9.960762890185443e-06, + "loss": 0.4554, + "step": 640 + }, + { + "epoch": 0.5173527037933817, + "grad_norm": 0.039234355092048645, + "learning_rate": 9.934605115285924e-06, + "loss": 0.3147, + "step": 641 + }, + { + "epoch": 0.5181598062953995, + "grad_norm": 0.0512404665350914, + "learning_rate": 9.908447787849542e-06, + "loss": 0.4644, + "step": 642 + }, + { + "epoch": 0.5189669087974172, + "grad_norm": 0.04253727197647095, + "learning_rate": 9.88229108685726e-06, + "loss": 0.3808, + "step": 643 + }, + { + "epoch": 0.519774011299435, + "grad_norm": 0.03961801528930664, + "learning_rate": 9.856135191285763e-06, + "loss": 0.3379, + "step": 644 + }, + { + "epoch": 0.5205811138014528, + "grad_norm": 0.04661339893937111, + "learning_rate": 9.82998028010622e-06, + "loss": 0.3963, + "step": 645 + }, + { + "epoch": 0.5213882163034705, + "grad_norm": 0.04584723711013794, + "learning_rate": 9.803826532283068e-06, + "loss": 0.4102, + "step": 646 + }, + { + "epoch": 0.5221953188054883, + "grad_norm": 0.043115437030792236, + "learning_rate": 9.777674126772782e-06, + "loss": 0.4193, + "step": 647 + }, + { + "epoch": 0.5230024213075061, + "grad_norm": 0.0466901957988739, + "learning_rate": 9.751523242522644e-06, + "loss": 0.4218, + "step": 648 + }, + { + "epoch": 0.5238095238095238, + "grad_norm": 0.04835409298539162, + "learning_rate": 9.725374058469538e-06, + "loss": 0.4366, + "step": 649 + }, + { + "epoch": 0.5246166263115416, + "grad_norm": 0.04258834943175316, + "learning_rate": 9.699226753538716e-06, + "loss": 0.3321, + "step": 650 + }, + { + "epoch": 0.5254237288135594, + "grad_norm": 0.04763965308666229, + "learning_rate": 9.67308150664256e-06, + "loss": 0.3591, + "step": 651 + }, + { + "epoch": 0.5262308313155771, + "grad_norm": 0.04827116057276726, + "learning_rate": 9.646938496679383e-06, + "loss": 0.4874, + "step": 652 + }, + { + "epoch": 0.5270379338175948, + "grad_norm": 0.04092332348227501, + "learning_rate": 9.62079790253218e-06, + "loss": 0.4025, + "step": 653 + }, + { + "epoch": 0.5278450363196125, + "grad_norm": 0.04576878994703293, + "learning_rate": 9.594659903067424e-06, + "loss": 0.463, + "step": 654 + }, + { + "epoch": 0.5286521388216303, + "grad_norm": 0.044727567583322525, + "learning_rate": 9.56852467713383e-06, + "loss": 0.3952, + "step": 655 + }, + { + "epoch": 0.5294592413236481, + "grad_norm": 0.0517955981194973, + "learning_rate": 9.54239240356114e-06, + "loss": 0.4618, + "step": 656 + }, + { + "epoch": 0.5302663438256658, + "grad_norm": 0.046104107052087784, + "learning_rate": 9.516263261158892e-06, + "loss": 0.4143, + "step": 657 + }, + { + "epoch": 0.5310734463276836, + "grad_norm": 0.048121001571416855, + "learning_rate": 9.49013742871519e-06, + "loss": 0.4183, + "step": 658 + }, + { + "epoch": 0.5318805488297014, + "grad_norm": 0.0463542602956295, + "learning_rate": 9.464015084995503e-06, + "loss": 0.4279, + "step": 659 + }, + { + "epoch": 0.5326876513317191, + "grad_norm": 0.04264969006180763, + "learning_rate": 9.437896408741426e-06, + "loss": 0.3497, + "step": 660 + }, + { + "epoch": 0.5334947538337369, + "grad_norm": 0.04616716876626015, + "learning_rate": 9.411781578669452e-06, + "loss": 0.3653, + "step": 661 + }, + { + "epoch": 0.5343018563357547, + "grad_norm": 0.04366273805499077, + "learning_rate": 9.385670773469767e-06, + "loss": 0.3905, + "step": 662 + }, + { + "epoch": 0.5351089588377724, + "grad_norm": 0.04917929694056511, + "learning_rate": 9.359564171805006e-06, + "loss": 0.4314, + "step": 663 + }, + { + "epoch": 0.5359160613397902, + "grad_norm": 0.042748548090457916, + "learning_rate": 9.333461952309049e-06, + "loss": 0.3719, + "step": 664 + }, + { + "epoch": 0.536723163841808, + "grad_norm": 0.042014606297016144, + "learning_rate": 9.307364293585785e-06, + "loss": 0.377, + "step": 665 + }, + { + "epoch": 0.5375302663438256, + "grad_norm": 0.047551706433296204, + "learning_rate": 9.281271374207907e-06, + "loss": 0.4342, + "step": 666 + }, + { + "epoch": 0.5383373688458434, + "grad_norm": 0.04877143353223801, + "learning_rate": 9.25518337271567e-06, + "loss": 0.4899, + "step": 667 + }, + { + "epoch": 0.5391444713478611, + "grad_norm": 0.043122414499521255, + "learning_rate": 9.229100467615673e-06, + "loss": 0.4008, + "step": 668 + }, + { + "epoch": 0.5399515738498789, + "grad_norm": 0.04934109374880791, + "learning_rate": 9.20302283737966e-06, + "loss": 0.4822, + "step": 669 + }, + { + "epoch": 0.5407586763518967, + "grad_norm": 0.04138447344303131, + "learning_rate": 9.176950660443263e-06, + "loss": 0.3791, + "step": 670 + }, + { + "epoch": 0.5415657788539144, + "grad_norm": 0.050151653587818146, + "learning_rate": 9.150884115204816e-06, + "loss": 0.3879, + "step": 671 + }, + { + "epoch": 0.5423728813559322, + "grad_norm": 0.03810346871614456, + "learning_rate": 9.124823380024114e-06, + "loss": 0.3253, + "step": 672 + }, + { + "epoch": 0.54317998385795, + "grad_norm": 0.04531979188323021, + "learning_rate": 9.098768633221187e-06, + "loss": 0.4136, + "step": 673 + }, + { + "epoch": 0.5439870863599677, + "grad_norm": 0.04374850168824196, + "learning_rate": 9.072720053075096e-06, + "loss": 0.413, + "step": 674 + }, + { + "epoch": 0.5447941888619855, + "grad_norm": 0.049856919795274734, + "learning_rate": 9.046677817822716e-06, + "loss": 0.4459, + "step": 675 + }, + { + "epoch": 0.5456012913640033, + "grad_norm": 0.04950174689292908, + "learning_rate": 9.02064210565749e-06, + "loss": 0.4244, + "step": 676 + }, + { + "epoch": 0.546408393866021, + "grad_norm": 0.04497706890106201, + "learning_rate": 8.994613094728246e-06, + "loss": 0.3811, + "step": 677 + }, + { + "epoch": 0.5472154963680388, + "grad_norm": 0.04163501039147377, + "learning_rate": 8.968590963137935e-06, + "loss": 0.3594, + "step": 678 + }, + { + "epoch": 0.5480225988700564, + "grad_norm": 0.050817154347896576, + "learning_rate": 8.942575888942462e-06, + "loss": 0.3757, + "step": 679 + }, + { + "epoch": 0.5488297013720742, + "grad_norm": 0.050537239760160446, + "learning_rate": 8.916568050149422e-06, + "loss": 0.4263, + "step": 680 + }, + { + "epoch": 0.549636803874092, + "grad_norm": 0.0474698431789875, + "learning_rate": 8.890567624716913e-06, + "loss": 0.357, + "step": 681 + }, + { + "epoch": 0.5504439063761097, + "grad_norm": 0.04469543695449829, + "learning_rate": 8.86457479055231e-06, + "loss": 0.3911, + "step": 682 + }, + { + "epoch": 0.5512510088781275, + "grad_norm": 0.044582296162843704, + "learning_rate": 8.83858972551103e-06, + "loss": 0.3657, + "step": 683 + }, + { + "epoch": 0.5520581113801453, + "grad_norm": 0.04360214248299599, + "learning_rate": 8.812612607395338e-06, + "loss": 0.4512, + "step": 684 + }, + { + "epoch": 0.552865213882163, + "grad_norm": 0.03942641615867615, + "learning_rate": 8.78664361395313e-06, + "loss": 0.3448, + "step": 685 + }, + { + "epoch": 0.5536723163841808, + "grad_norm": 0.04751434177160263, + "learning_rate": 8.760682922876697e-06, + "loss": 0.4069, + "step": 686 + }, + { + "epoch": 0.5544794188861986, + "grad_norm": 0.039195213466882706, + "learning_rate": 8.734730711801535e-06, + "loss": 0.3701, + "step": 687 + }, + { + "epoch": 0.5552865213882163, + "grad_norm": 0.049299318343400955, + "learning_rate": 8.708787158305094e-06, + "loss": 0.4405, + "step": 688 + }, + { + "epoch": 0.5560936238902341, + "grad_norm": 0.04242517799139023, + "learning_rate": 8.682852439905609e-06, + "loss": 0.3831, + "step": 689 + }, + { + "epoch": 0.5569007263922519, + "grad_norm": 0.039667051285505295, + "learning_rate": 8.656926734060842e-06, + "loss": 0.3735, + "step": 690 + }, + { + "epoch": 0.5577078288942696, + "grad_norm": 0.04560346528887749, + "learning_rate": 8.631010218166902e-06, + "loss": 0.3369, + "step": 691 + }, + { + "epoch": 0.5585149313962873, + "grad_norm": 0.04885312542319298, + "learning_rate": 8.605103069557007e-06, + "loss": 0.3539, + "step": 692 + }, + { + "epoch": 0.559322033898305, + "grad_norm": 0.04993152990937233, + "learning_rate": 8.579205465500276e-06, + "loss": 0.3958, + "step": 693 + }, + { + "epoch": 0.5601291364003228, + "grad_norm": 0.04083723947405815, + "learning_rate": 8.553317583200527e-06, + "loss": 0.4112, + "step": 694 + }, + { + "epoch": 0.5609362389023406, + "grad_norm": 0.04566939175128937, + "learning_rate": 8.52743959979506e-06, + "loss": 0.413, + "step": 695 + }, + { + "epoch": 0.5617433414043583, + "grad_norm": 0.043536778539419174, + "learning_rate": 8.501571692353432e-06, + "loss": 0.3832, + "step": 696 + }, + { + "epoch": 0.5625504439063761, + "grad_norm": 0.04263819009065628, + "learning_rate": 8.475714037876263e-06, + "loss": 0.4028, + "step": 697 + }, + { + "epoch": 0.5633575464083939, + "grad_norm": 0.046215664595365524, + "learning_rate": 8.449866813294016e-06, + "loss": 0.3721, + "step": 698 + }, + { + "epoch": 0.5641646489104116, + "grad_norm": 0.04530710726976395, + "learning_rate": 8.424030195465781e-06, + "loss": 0.3834, + "step": 699 + }, + { + "epoch": 0.5649717514124294, + "grad_norm": 0.048315443098545074, + "learning_rate": 8.398204361178079e-06, + "loss": 0.4508, + "step": 700 + }, + { + "epoch": 0.5657788539144472, + "grad_norm": 0.04589349031448364, + "learning_rate": 8.372389487143647e-06, + "loss": 0.3801, + "step": 701 + }, + { + "epoch": 0.5665859564164649, + "grad_norm": 0.047280244529247284, + "learning_rate": 8.346585750000215e-06, + "loss": 0.4046, + "step": 702 + }, + { + "epoch": 0.5673930589184827, + "grad_norm": 0.043632179498672485, + "learning_rate": 8.320793326309327e-06, + "loss": 0.4125, + "step": 703 + }, + { + "epoch": 0.5682001614205004, + "grad_norm": 0.042842645198106766, + "learning_rate": 8.295012392555092e-06, + "loss": 0.3482, + "step": 704 + }, + { + "epoch": 0.5690072639225182, + "grad_norm": 0.04746723547577858, + "learning_rate": 8.26924312514302e-06, + "loss": 0.4518, + "step": 705 + }, + { + "epoch": 0.5698143664245359, + "grad_norm": 0.050085753202438354, + "learning_rate": 8.243485700398782e-06, + "loss": 0.4467, + "step": 706 + }, + { + "epoch": 0.5706214689265536, + "grad_norm": 0.049086228013038635, + "learning_rate": 8.217740294567024e-06, + "loss": 0.4415, + "step": 707 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.04610542580485344, + "learning_rate": 8.192007083810149e-06, + "loss": 0.3963, + "step": 708 + }, + { + "epoch": 0.5722356739305892, + "grad_norm": 0.04760580509901047, + "learning_rate": 8.166286244207109e-06, + "loss": 0.445, + "step": 709 + }, + { + "epoch": 0.5730427764326069, + "grad_norm": 0.04431818798184395, + "learning_rate": 8.140577951752213e-06, + "loss": 0.358, + "step": 710 + }, + { + "epoch": 0.5738498789346247, + "grad_norm": 0.04857053607702255, + "learning_rate": 8.114882382353925e-06, + "loss": 0.3933, + "step": 711 + }, + { + "epoch": 0.5746569814366425, + "grad_norm": 0.04636574909090996, + "learning_rate": 8.089199711833634e-06, + "loss": 0.4183, + "step": 712 + }, + { + "epoch": 0.5754640839386602, + "grad_norm": 0.04742918536067009, + "learning_rate": 8.063530115924486e-06, + "loss": 0.3524, + "step": 713 + }, + { + "epoch": 0.576271186440678, + "grad_norm": 0.043312814086675644, + "learning_rate": 8.037873770270149e-06, + "loss": 0.3452, + "step": 714 + }, + { + "epoch": 0.5770782889426957, + "grad_norm": 0.047119125723838806, + "learning_rate": 8.012230850423632e-06, + "loss": 0.3949, + "step": 715 + }, + { + "epoch": 0.5778853914447135, + "grad_norm": 0.04490087553858757, + "learning_rate": 7.986601531846084e-06, + "loss": 0.3413, + "step": 716 + }, + { + "epoch": 0.5786924939467313, + "grad_norm": 0.041443049907684326, + "learning_rate": 7.960985989905584e-06, + "loss": 0.3568, + "step": 717 + }, + { + "epoch": 0.579499596448749, + "grad_norm": 0.043406542390584946, + "learning_rate": 7.935384399875941e-06, + "loss": 0.33, + "step": 718 + }, + { + "epoch": 0.5803066989507667, + "grad_norm": 0.04098028689622879, + "learning_rate": 7.9097969369355e-06, + "loss": 0.3882, + "step": 719 + }, + { + "epoch": 0.5811138014527845, + "grad_norm": 0.04382213577628136, + "learning_rate": 7.884223776165947e-06, + "loss": 0.3984, + "step": 720 + }, + { + "epoch": 0.5819209039548022, + "grad_norm": 0.03980768844485283, + "learning_rate": 7.858665092551095e-06, + "loss": 0.355, + "step": 721 + }, + { + "epoch": 0.58272800645682, + "grad_norm": 0.04712940379977226, + "learning_rate": 7.833121060975708e-06, + "loss": 0.4196, + "step": 722 + }, + { + "epoch": 0.5835351089588378, + "grad_norm": 0.04403112828731537, + "learning_rate": 7.807591856224295e-06, + "loss": 0.3674, + "step": 723 + }, + { + "epoch": 0.5843422114608555, + "grad_norm": 0.04864363372325897, + "learning_rate": 7.782077652979898e-06, + "loss": 0.4704, + "step": 724 + }, + { + "epoch": 0.5851493139628733, + "grad_norm": 0.04410781338810921, + "learning_rate": 7.756578625822923e-06, + "loss": 0.4068, + "step": 725 + }, + { + "epoch": 0.585956416464891, + "grad_norm": 0.039659466594457626, + "learning_rate": 7.731094949229934e-06, + "loss": 0.3709, + "step": 726 + }, + { + "epoch": 0.5867635189669088, + "grad_norm": 0.041823580861091614, + "learning_rate": 7.705626797572456e-06, + "loss": 0.3826, + "step": 727 + }, + { + "epoch": 0.5875706214689266, + "grad_norm": 0.048205725848674774, + "learning_rate": 7.680174345115789e-06, + "loss": 0.4217, + "step": 728 + }, + { + "epoch": 0.5883777239709443, + "grad_norm": 0.04867953807115555, + "learning_rate": 7.654737766017801e-06, + "loss": 0.3801, + "step": 729 + }, + { + "epoch": 0.5891848264729621, + "grad_norm": 0.04925863444805145, + "learning_rate": 7.629317234327761e-06, + "loss": 0.3971, + "step": 730 + }, + { + "epoch": 0.5899919289749799, + "grad_norm": 0.04729580134153366, + "learning_rate": 7.603912923985122e-06, + "loss": 0.4304, + "step": 731 + }, + { + "epoch": 0.5907990314769975, + "grad_norm": 0.0465373694896698, + "learning_rate": 7.5785250088183495e-06, + "loss": 0.3699, + "step": 732 + }, + { + "epoch": 0.5916061339790153, + "grad_norm": 0.045501019805669785, + "learning_rate": 7.55315366254373e-06, + "loss": 0.3363, + "step": 733 + }, + { + "epoch": 0.5924132364810331, + "grad_norm": 0.05061681196093559, + "learning_rate": 7.527799058764163e-06, + "loss": 0.4148, + "step": 734 + }, + { + "epoch": 0.5932203389830508, + "grad_norm": 0.046648457646369934, + "learning_rate": 7.502461370967999e-06, + "loss": 0.389, + "step": 735 + }, + { + "epoch": 0.5940274414850686, + "grad_norm": 0.04624919220805168, + "learning_rate": 7.47714077252784e-06, + "loss": 0.3875, + "step": 736 + }, + { + "epoch": 0.5948345439870864, + "grad_norm": 0.04941301420331001, + "learning_rate": 7.45183743669935e-06, + "loss": 0.4064, + "step": 737 + }, + { + "epoch": 0.5956416464891041, + "grad_norm": 0.04847235605120659, + "learning_rate": 7.426551536620082e-06, + "loss": 0.3604, + "step": 738 + }, + { + "epoch": 0.5964487489911219, + "grad_norm": 0.04394169896841049, + "learning_rate": 7.401283245308275e-06, + "loss": 0.3765, + "step": 739 + }, + { + "epoch": 0.5972558514931396, + "grad_norm": 0.048312414437532425, + "learning_rate": 7.376032735661689e-06, + "loss": 0.4014, + "step": 740 + }, + { + "epoch": 0.5980629539951574, + "grad_norm": 0.045725490897893906, + "learning_rate": 7.350800180456405e-06, + "loss": 0.4155, + "step": 741 + }, + { + "epoch": 0.5988700564971752, + "grad_norm": 0.050338540226221085, + "learning_rate": 7.325585752345663e-06, + "loss": 0.4415, + "step": 742 + }, + { + "epoch": 0.5996771589991929, + "grad_norm": 0.04538509622216225, + "learning_rate": 7.30038962385866e-06, + "loss": 0.3874, + "step": 743 + }, + { + "epoch": 0.6004842615012107, + "grad_norm": 0.05074746534228325, + "learning_rate": 7.275211967399374e-06, + "loss": 0.3919, + "step": 744 + }, + { + "epoch": 0.6012913640032284, + "grad_norm": 0.04364052787423134, + "learning_rate": 7.250052955245399e-06, + "loss": 0.3886, + "step": 745 + }, + { + "epoch": 0.6020984665052461, + "grad_norm": 0.049053583294153214, + "learning_rate": 7.224912759546752e-06, + "loss": 0.4, + "step": 746 + }, + { + "epoch": 0.6029055690072639, + "grad_norm": 0.05290306359529495, + "learning_rate": 7.199791552324694e-06, + "loss": 0.4192, + "step": 747 + }, + { + "epoch": 0.6037126715092817, + "grad_norm": 0.05066477879881859, + "learning_rate": 7.1746895054705705e-06, + "loss": 0.4287, + "step": 748 + }, + { + "epoch": 0.6045197740112994, + "grad_norm": 0.04861418902873993, + "learning_rate": 7.149606790744602e-06, + "loss": 0.412, + "step": 749 + }, + { + "epoch": 0.6053268765133172, + "grad_norm": 0.04922877252101898, + "learning_rate": 7.12454357977475e-06, + "loss": 0.4198, + "step": 750 + }, + { + "epoch": 0.606133979015335, + "grad_norm": 0.039801307022571564, + "learning_rate": 7.099500044055507e-06, + "loss": 0.3603, + "step": 751 + }, + { + "epoch": 0.6069410815173527, + "grad_norm": 0.0413752906024456, + "learning_rate": 7.07447635494675e-06, + "loss": 0.3778, + "step": 752 + }, + { + "epoch": 0.6077481840193705, + "grad_norm": 0.05113828927278519, + "learning_rate": 7.049472683672544e-06, + "loss": 0.4032, + "step": 753 + }, + { + "epoch": 0.6085552865213882, + "grad_norm": 0.04697990417480469, + "learning_rate": 7.0244892013199976e-06, + "loss": 0.3934, + "step": 754 + }, + { + "epoch": 0.609362389023406, + "grad_norm": 0.04355325922369957, + "learning_rate": 6.999526078838056e-06, + "loss": 0.3863, + "step": 755 + }, + { + "epoch": 0.6101694915254238, + "grad_norm": 0.04471498355269432, + "learning_rate": 6.9745834870363725e-06, + "loss": 0.3622, + "step": 756 + }, + { + "epoch": 0.6109765940274415, + "grad_norm": 0.04962782561779022, + "learning_rate": 6.949661596584108e-06, + "loss": 0.402, + "step": 757 + }, + { + "epoch": 0.6117836965294592, + "grad_norm": 0.04287125542759895, + "learning_rate": 6.924760578008782e-06, + "loss": 0.3331, + "step": 758 + }, + { + "epoch": 0.612590799031477, + "grad_norm": 0.048510193824768066, + "learning_rate": 6.899880601695095e-06, + "loss": 0.4074, + "step": 759 + }, + { + "epoch": 0.6133979015334947, + "grad_norm": 0.045081108808517456, + "learning_rate": 6.875021837883758e-06, + "loss": 0.4115, + "step": 760 + }, + { + "epoch": 0.6142050040355125, + "grad_norm": 0.053174663335084915, + "learning_rate": 6.8501844566703525e-06, + "loss": 0.4534, + "step": 761 + }, + { + "epoch": 0.6150121065375302, + "grad_norm": 0.04487127065658569, + "learning_rate": 6.8253686280041395e-06, + "loss": 0.4223, + "step": 762 + }, + { + "epoch": 0.615819209039548, + "grad_norm": 0.050555501133203506, + "learning_rate": 6.800574521686908e-06, + "loss": 0.4536, + "step": 763 + }, + { + "epoch": 0.6166263115415658, + "grad_norm": 0.047019895166158676, + "learning_rate": 6.775802307371819e-06, + "loss": 0.3819, + "step": 764 + }, + { + "epoch": 0.6174334140435835, + "grad_norm": 0.04500943049788475, + "learning_rate": 6.751052154562227e-06, + "loss": 0.3923, + "step": 765 + }, + { + "epoch": 0.6182405165456013, + "grad_norm": 0.04650492966175079, + "learning_rate": 6.726324232610535e-06, + "loss": 0.3725, + "step": 766 + }, + { + "epoch": 0.6190476190476191, + "grad_norm": 0.04661106690764427, + "learning_rate": 6.701618710717036e-06, + "loss": 0.3785, + "step": 767 + }, + { + "epoch": 0.6198547215496368, + "grad_norm": 0.044705361127853394, + "learning_rate": 6.676935757928751e-06, + "loss": 0.3442, + "step": 768 + }, + { + "epoch": 0.6206618240516546, + "grad_norm": 0.04856501892209053, + "learning_rate": 6.652275543138271e-06, + "loss": 0.4079, + "step": 769 + }, + { + "epoch": 0.6214689265536724, + "grad_norm": 0.0447048656642437, + "learning_rate": 6.627638235082594e-06, + "loss": 0.3875, + "step": 770 + }, + { + "epoch": 0.6222760290556901, + "grad_norm": 0.04440969228744507, + "learning_rate": 6.603024002341991e-06, + "loss": 0.3647, + "step": 771 + }, + { + "epoch": 0.6230831315577078, + "grad_norm": 0.046474307775497437, + "learning_rate": 6.578433013338838e-06, + "loss": 0.4085, + "step": 772 + }, + { + "epoch": 0.6238902340597255, + "grad_norm": 0.04039683938026428, + "learning_rate": 6.553865436336461e-06, + "loss": 0.3918, + "step": 773 + }, + { + "epoch": 0.6246973365617433, + "grad_norm": 0.041126858443021774, + "learning_rate": 6.529321439438002e-06, + "loss": 0.3198, + "step": 774 + }, + { + "epoch": 0.6255044390637611, + "grad_norm": 0.047061994671821594, + "learning_rate": 6.504801190585239e-06, + "loss": 0.3589, + "step": 775 + }, + { + "epoch": 0.6263115415657788, + "grad_norm": 0.0462849922478199, + "learning_rate": 6.4803048575574635e-06, + "loss": 0.3766, + "step": 776 + }, + { + "epoch": 0.6271186440677966, + "grad_norm": 0.038880471140146255, + "learning_rate": 6.455832607970327e-06, + "loss": 0.338, + "step": 777 + }, + { + "epoch": 0.6279257465698144, + "grad_norm": 0.04801465570926666, + "learning_rate": 6.431384609274689e-06, + "loss": 0.3689, + "step": 778 + }, + { + "epoch": 0.6287328490718321, + "grad_norm": 0.04502476006746292, + "learning_rate": 6.406961028755471e-06, + "loss": 0.4114, + "step": 779 + }, + { + "epoch": 0.6295399515738499, + "grad_norm": 0.050989337265491486, + "learning_rate": 6.382562033530506e-06, + "loss": 0.4461, + "step": 780 + }, + { + "epoch": 0.6303470540758677, + "grad_norm": 0.04601307213306427, + "learning_rate": 6.358187790549418e-06, + "loss": 0.3419, + "step": 781 + }, + { + "epoch": 0.6311541565778854, + "grad_norm": 0.04786008596420288, + "learning_rate": 6.333838466592453e-06, + "loss": 0.4029, + "step": 782 + }, + { + "epoch": 0.6319612590799032, + "grad_norm": 0.049381598830223083, + "learning_rate": 6.309514228269351e-06, + "loss": 0.3908, + "step": 783 + }, + { + "epoch": 0.632768361581921, + "grad_norm": 0.04913535341620445, + "learning_rate": 6.285215242018214e-06, + "loss": 0.3768, + "step": 784 + }, + { + "epoch": 0.6335754640839386, + "grad_norm": 0.048182886093854904, + "learning_rate": 6.260941674104339e-06, + "loss": 0.3727, + "step": 785 + }, + { + "epoch": 0.6343825665859564, + "grad_norm": 0.04583975300192833, + "learning_rate": 6.236693690619112e-06, + "loss": 0.3501, + "step": 786 + }, + { + "epoch": 0.6351896690879741, + "grad_norm": 0.049178920686244965, + "learning_rate": 6.212471457478857e-06, + "loss": 0.4163, + "step": 787 + }, + { + "epoch": 0.6359967715899919, + "grad_norm": 0.057297833263874054, + "learning_rate": 6.188275140423694e-06, + "loss": 0.4326, + "step": 788 + }, + { + "epoch": 0.6368038740920097, + "grad_norm": 0.04626166820526123, + "learning_rate": 6.164104905016426e-06, + "loss": 0.4021, + "step": 789 + }, + { + "epoch": 0.6376109765940274, + "grad_norm": 0.04393228515982628, + "learning_rate": 6.1399609166413765e-06, + "loss": 0.3317, + "step": 790 + }, + { + "epoch": 0.6384180790960452, + "grad_norm": 0.041127268224954605, + "learning_rate": 6.115843340503288e-06, + "loss": 0.3409, + "step": 791 + }, + { + "epoch": 0.639225181598063, + "grad_norm": 0.04137645289301872, + "learning_rate": 6.091752341626168e-06, + "loss": 0.3631, + "step": 792 + }, + { + "epoch": 0.6400322841000807, + "grad_norm": 0.04440583661198616, + "learning_rate": 6.0676880848521794e-06, + "loss": 0.4272, + "step": 793 + }, + { + "epoch": 0.6408393866020985, + "grad_norm": 0.05209174379706383, + "learning_rate": 6.043650734840496e-06, + "loss": 0.4008, + "step": 794 + }, + { + "epoch": 0.6416464891041163, + "grad_norm": 0.045219723135232925, + "learning_rate": 6.019640456066181e-06, + "loss": 0.3482, + "step": 795 + }, + { + "epoch": 0.642453591606134, + "grad_norm": 0.05367840453982353, + "learning_rate": 5.995657412819068e-06, + "loss": 0.4266, + "step": 796 + }, + { + "epoch": 0.6432606941081518, + "grad_norm": 0.0441439189016819, + "learning_rate": 5.971701769202632e-06, + "loss": 0.3938, + "step": 797 + }, + { + "epoch": 0.6440677966101694, + "grad_norm": 0.046317096799612045, + "learning_rate": 5.947773689132863e-06, + "loss": 0.3681, + "step": 798 + }, + { + "epoch": 0.6448748991121872, + "grad_norm": 0.045697569847106934, + "learning_rate": 5.923873336337154e-06, + "loss": 0.4577, + "step": 799 + }, + { + "epoch": 0.645682001614205, + "grad_norm": 0.044494569301605225, + "learning_rate": 5.900000874353164e-06, + "loss": 0.4019, + "step": 800 + }, + { + "epoch": 0.6464891041162227, + "grad_norm": 0.048398248851299286, + "learning_rate": 5.876156466527723e-06, + "loss": 0.3927, + "step": 801 + }, + { + "epoch": 0.6472962066182405, + "grad_norm": 0.05004660412669182, + "learning_rate": 5.852340276015689e-06, + "loss": 0.3581, + "step": 802 + }, + { + "epoch": 0.6481033091202583, + "grad_norm": 0.04907209798693657, + "learning_rate": 5.828552465778855e-06, + "loss": 0.3857, + "step": 803 + }, + { + "epoch": 0.648910411622276, + "grad_norm": 0.045290783047676086, + "learning_rate": 5.804793198584815e-06, + "loss": 0.35, + "step": 804 + }, + { + "epoch": 0.6497175141242938, + "grad_norm": 0.049314212054014206, + "learning_rate": 5.781062637005853e-06, + "loss": 0.4207, + "step": 805 + }, + { + "epoch": 0.6505246166263116, + "grad_norm": 0.04956541955471039, + "learning_rate": 5.757360943417848e-06, + "loss": 0.4034, + "step": 806 + }, + { + "epoch": 0.6513317191283293, + "grad_norm": 0.047677140682935715, + "learning_rate": 5.7336882799991425e-06, + "loss": 0.3735, + "step": 807 + }, + { + "epoch": 0.6521388216303471, + "grad_norm": 0.04479856416583061, + "learning_rate": 5.710044808729447e-06, + "loss": 0.3895, + "step": 808 + }, + { + "epoch": 0.6529459241323649, + "grad_norm": 0.051077935844659805, + "learning_rate": 5.686430691388718e-06, + "loss": 0.4304, + "step": 809 + }, + { + "epoch": 0.6537530266343826, + "grad_norm": 0.04743655025959015, + "learning_rate": 5.662846089556054e-06, + "loss": 0.3646, + "step": 810 + }, + { + "epoch": 0.6545601291364003, + "grad_norm": 0.04958782345056534, + "learning_rate": 5.639291164608609e-06, + "loss": 0.3504, + "step": 811 + }, + { + "epoch": 0.655367231638418, + "grad_norm": 0.04840300232172012, + "learning_rate": 5.6157660777204635e-06, + "loss": 0.3677, + "step": 812 + }, + { + "epoch": 0.6561743341404358, + "grad_norm": 0.04449933022260666, + "learning_rate": 5.592270989861535e-06, + "loss": 0.3586, + "step": 813 + }, + { + "epoch": 0.6569814366424536, + "grad_norm": 0.048872679471969604, + "learning_rate": 5.568806061796478e-06, + "loss": 0.4055, + "step": 814 + }, + { + "epoch": 0.6577885391444713, + "grad_norm": 0.05295393988490105, + "learning_rate": 5.545371454083566e-06, + "loss": 0.4166, + "step": 815 + }, + { + "epoch": 0.6585956416464891, + "grad_norm": 0.050493236631155014, + "learning_rate": 5.521967327073612e-06, + "loss": 0.3925, + "step": 816 + }, + { + "epoch": 0.6594027441485069, + "grad_norm": 0.05255108326673508, + "learning_rate": 5.498593840908871e-06, + "loss": 0.4376, + "step": 817 + }, + { + "epoch": 0.6602098466505246, + "grad_norm": 0.04289624094963074, + "learning_rate": 5.475251155521935e-06, + "loss": 0.4391, + "step": 818 + }, + { + "epoch": 0.6610169491525424, + "grad_norm": 0.04436655342578888, + "learning_rate": 5.4519394306346415e-06, + "loss": 0.3855, + "step": 819 + }, + { + "epoch": 0.6618240516545602, + "grad_norm": 0.045538369566202164, + "learning_rate": 5.428658825756987e-06, + "loss": 0.383, + "step": 820 + }, + { + "epoch": 0.6626311541565779, + "grad_norm": 0.051048438996076584, + "learning_rate": 5.4054095001860196e-06, + "loss": 0.3938, + "step": 821 + }, + { + "epoch": 0.6634382566585957, + "grad_norm": 0.042728327214717865, + "learning_rate": 5.382191613004761e-06, + "loss": 0.3949, + "step": 822 + }, + { + "epoch": 0.6642453591606134, + "grad_norm": 0.0470639243721962, + "learning_rate": 5.359005323081127e-06, + "loss": 0.4268, + "step": 823 + }, + { + "epoch": 0.6650524616626311, + "grad_norm": 0.049070440232753754, + "learning_rate": 5.335850789066819e-06, + "loss": 0.3786, + "step": 824 + }, + { + "epoch": 0.6658595641646489, + "grad_norm": 0.05124800652265549, + "learning_rate": 5.31272816939626e-06, + "loss": 0.3951, + "step": 825 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.045142266899347305, + "learning_rate": 5.289637622285485e-06, + "loss": 0.3774, + "step": 826 + }, + { + "epoch": 0.6674737691686844, + "grad_norm": 0.04465753585100174, + "learning_rate": 5.266579305731088e-06, + "loss": 0.3376, + "step": 827 + }, + { + "epoch": 0.6682808716707022, + "grad_norm": 0.046953234821558, + "learning_rate": 5.2435533775091115e-06, + "loss": 0.3808, + "step": 828 + }, + { + "epoch": 0.6690879741727199, + "grad_norm": 0.048814088106155396, + "learning_rate": 5.220559995173995e-06, + "loss": 0.3921, + "step": 829 + }, + { + "epoch": 0.6698950766747377, + "grad_norm": 0.05050293728709221, + "learning_rate": 5.197599316057479e-06, + "loss": 0.4176, + "step": 830 + }, + { + "epoch": 0.6707021791767555, + "grad_norm": 0.041133660823106766, + "learning_rate": 5.174671497267525e-06, + "loss": 0.3142, + "step": 831 + }, + { + "epoch": 0.6715092816787732, + "grad_norm": 0.04762447252869606, + "learning_rate": 5.151776695687261e-06, + "loss": 0.3962, + "step": 832 + }, + { + "epoch": 0.672316384180791, + "grad_norm": 0.049833763390779495, + "learning_rate": 5.128915067973888e-06, + "loss": 0.3544, + "step": 833 + }, + { + "epoch": 0.6731234866828087, + "grad_norm": 0.04340628907084465, + "learning_rate": 5.106086770557619e-06, + "loss": 0.3583, + "step": 834 + }, + { + "epoch": 0.6739305891848265, + "grad_norm": 0.05325612425804138, + "learning_rate": 5.083291959640602e-06, + "loss": 0.4229, + "step": 835 + }, + { + "epoch": 0.6747376916868443, + "grad_norm": 0.04724089801311493, + "learning_rate": 5.0605307911958515e-06, + "loss": 0.3477, + "step": 836 + }, + { + "epoch": 0.6755447941888619, + "grad_norm": 0.04456155002117157, + "learning_rate": 5.03780342096619e-06, + "loss": 0.3177, + "step": 837 + }, + { + "epoch": 0.6763518966908797, + "grad_norm": 0.053026482462882996, + "learning_rate": 5.015110004463177e-06, + "loss": 0.4088, + "step": 838 + }, + { + "epoch": 0.6771589991928975, + "grad_norm": 0.040615472942590714, + "learning_rate": 4.992450696966039e-06, + "loss": 0.349, + "step": 839 + }, + { + "epoch": 0.6779661016949152, + "grad_norm": 0.051405586302280426, + "learning_rate": 4.969825653520621e-06, + "loss": 0.37, + "step": 840 + }, + { + "epoch": 0.678773204196933, + "grad_norm": 0.04546589031815529, + "learning_rate": 4.947235028938309e-06, + "loss": 0.3746, + "step": 841 + }, + { + "epoch": 0.6795803066989508, + "grad_norm": 0.04085971787571907, + "learning_rate": 4.9246789777949725e-06, + "loss": 0.3511, + "step": 842 + }, + { + "epoch": 0.6803874092009685, + "grad_norm": 0.050242677330970764, + "learning_rate": 4.902157654429928e-06, + "loss": 0.4374, + "step": 843 + }, + { + "epoch": 0.6811945117029863, + "grad_norm": 0.052028290927410126, + "learning_rate": 4.8796712129448596e-06, + "loss": 0.3637, + "step": 844 + }, + { + "epoch": 0.682001614205004, + "grad_norm": 0.04794751852750778, + "learning_rate": 4.857219807202781e-06, + "loss": 0.3421, + "step": 845 + }, + { + "epoch": 0.6828087167070218, + "grad_norm": 0.04805922508239746, + "learning_rate": 4.834803590826956e-06, + "loss": 0.3942, + "step": 846 + }, + { + "epoch": 0.6836158192090396, + "grad_norm": 0.05546120926737785, + "learning_rate": 4.8124227171998905e-06, + "loss": 0.4107, + "step": 847 + }, + { + "epoch": 0.6844229217110573, + "grad_norm": 0.052278514951467514, + "learning_rate": 4.790077339462234e-06, + "loss": 0.4613, + "step": 848 + }, + { + "epoch": 0.6852300242130751, + "grad_norm": 0.04965787008404732, + "learning_rate": 4.7677676105117734e-06, + "loss": 0.406, + "step": 849 + }, + { + "epoch": 0.6860371267150929, + "grad_norm": 0.04963855817914009, + "learning_rate": 4.745493683002368e-06, + "loss": 0.403, + "step": 850 + }, + { + "epoch": 0.6868442292171105, + "grad_norm": 0.03866651654243469, + "learning_rate": 4.723255709342893e-06, + "loss": 0.292, + "step": 851 + }, + { + "epoch": 0.6876513317191283, + "grad_norm": 0.04614201933145523, + "learning_rate": 4.701053841696226e-06, + "loss": 0.3667, + "step": 852 + }, + { + "epoch": 0.688458434221146, + "grad_norm": 0.051998917013406754, + "learning_rate": 4.678888231978186e-06, + "loss": 0.429, + "step": 853 + }, + { + "epoch": 0.6892655367231638, + "grad_norm": 0.044506240636110306, + "learning_rate": 4.6567590318564886e-06, + "loss": 0.3222, + "step": 854 + }, + { + "epoch": 0.6900726392251816, + "grad_norm": 0.047115735709667206, + "learning_rate": 4.634666392749729e-06, + "loss": 0.3772, + "step": 855 + }, + { + "epoch": 0.6908797417271993, + "grad_norm": 0.046469517052173615, + "learning_rate": 4.612610465826326e-06, + "loss": 0.3457, + "step": 856 + }, + { + "epoch": 0.6916868442292171, + "grad_norm": 0.05154654383659363, + "learning_rate": 4.590591402003501e-06, + "loss": 0.4502, + "step": 857 + }, + { + "epoch": 0.6924939467312349, + "grad_norm": 0.05142826586961746, + "learning_rate": 4.568609351946241e-06, + "loss": 0.3652, + "step": 858 + }, + { + "epoch": 0.6933010492332526, + "grad_norm": 0.056190911680459976, + "learning_rate": 4.546664466066268e-06, + "loss": 0.4137, + "step": 859 + }, + { + "epoch": 0.6941081517352704, + "grad_norm": 0.04742627218365669, + "learning_rate": 4.524756894521003e-06, + "loss": 0.3578, + "step": 860 + }, + { + "epoch": 0.6949152542372882, + "grad_norm": 0.053221605718135834, + "learning_rate": 4.502886787212543e-06, + "loss": 0.413, + "step": 861 + }, + { + "epoch": 0.6957223567393059, + "grad_norm": 0.05199446156620979, + "learning_rate": 4.481054293786645e-06, + "loss": 0.372, + "step": 862 + }, + { + "epoch": 0.6965294592413237, + "grad_norm": 0.050332196056842804, + "learning_rate": 4.4592595636316915e-06, + "loss": 0.365, + "step": 863 + }, + { + "epoch": 0.6973365617433414, + "grad_norm": 0.0501842126250267, + "learning_rate": 4.437502745877668e-06, + "loss": 0.4525, + "step": 864 + }, + { + "epoch": 0.6981436642453591, + "grad_norm": 0.049436986446380615, + "learning_rate": 4.415783989395151e-06, + "loss": 0.3934, + "step": 865 + }, + { + "epoch": 0.6989507667473769, + "grad_norm": 0.051288872957229614, + "learning_rate": 4.394103442794273e-06, + "loss": 0.316, + "step": 866 + }, + { + "epoch": 0.6997578692493946, + "grad_norm": 0.04671122506260872, + "learning_rate": 4.372461254423722e-06, + "loss": 0.3919, + "step": 867 + }, + { + "epoch": 0.7005649717514124, + "grad_norm": 0.0548025481402874, + "learning_rate": 4.350857572369722e-06, + "loss": 0.4481, + "step": 868 + }, + { + "epoch": 0.7013720742534302, + "grad_norm": 0.047147806733846664, + "learning_rate": 4.329292544455017e-06, + "loss": 0.3677, + "step": 869 + }, + { + "epoch": 0.7021791767554479, + "grad_norm": 0.04897439107298851, + "learning_rate": 4.3077663182378634e-06, + "loss": 0.3749, + "step": 870 + }, + { + "epoch": 0.7029862792574657, + "grad_norm": 0.05925610661506653, + "learning_rate": 4.286279041011017e-06, + "loss": 0.4326, + "step": 871 + }, + { + "epoch": 0.7037933817594835, + "grad_norm": 0.05080854520201683, + "learning_rate": 4.264830859800719e-06, + "loss": 0.3676, + "step": 872 + }, + { + "epoch": 0.7046004842615012, + "grad_norm": 0.05124906823039055, + "learning_rate": 4.243421921365702e-06, + "loss": 0.4365, + "step": 873 + }, + { + "epoch": 0.705407586763519, + "grad_norm": 0.046696025878190994, + "learning_rate": 4.222052372196184e-06, + "loss": 0.3863, + "step": 874 + }, + { + "epoch": 0.7062146892655368, + "grad_norm": 0.0472051203250885, + "learning_rate": 4.2007223585128576e-06, + "loss": 0.3862, + "step": 875 + }, + { + "epoch": 0.7070217917675545, + "grad_norm": 0.04693121090531349, + "learning_rate": 4.1794320262659015e-06, + "loss": 0.3594, + "step": 876 + }, + { + "epoch": 0.7078288942695722, + "grad_norm": 0.052231140434741974, + "learning_rate": 4.158181521133964e-06, + "loss": 0.38, + "step": 877 + }, + { + "epoch": 0.70863599677159, + "grad_norm": 0.04406065121293068, + "learning_rate": 4.136970988523189e-06, + "loss": 0.3882, + "step": 878 + }, + { + "epoch": 0.7094430992736077, + "grad_norm": 0.04371519759297371, + "learning_rate": 4.115800573566197e-06, + "loss": 0.3596, + "step": 879 + }, + { + "epoch": 0.7102502017756255, + "grad_norm": 0.042410142719745636, + "learning_rate": 4.0946704211211165e-06, + "loss": 0.3507, + "step": 880 + }, + { + "epoch": 0.7110573042776432, + "grad_norm": 0.04645742103457451, + "learning_rate": 4.073580675770575e-06, + "loss": 0.3602, + "step": 881 + }, + { + "epoch": 0.711864406779661, + "grad_norm": 0.05483156070113182, + "learning_rate": 4.052531481820713e-06, + "loss": 0.4491, + "step": 882 + }, + { + "epoch": 0.7126715092816788, + "grad_norm": 0.04915105551481247, + "learning_rate": 4.0315229833002e-06, + "loss": 0.3792, + "step": 883 + }, + { + "epoch": 0.7134786117836965, + "grad_norm": 0.05002758279442787, + "learning_rate": 4.010555323959253e-06, + "loss": 0.4236, + "step": 884 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 0.04625044763088226, + "learning_rate": 3.989628647268645e-06, + "loss": 0.4021, + "step": 885 + }, + { + "epoch": 0.7150928167877321, + "grad_norm": 0.050767894834280014, + "learning_rate": 3.9687430964187225e-06, + "loss": 0.4123, + "step": 886 + }, + { + "epoch": 0.7158999192897498, + "grad_norm": 0.04956914111971855, + "learning_rate": 3.9478988143184286e-06, + "loss": 0.4182, + "step": 887 + }, + { + "epoch": 0.7167070217917676, + "grad_norm": 0.044376108795404434, + "learning_rate": 3.927095943594331e-06, + "loss": 0.3106, + "step": 888 + }, + { + "epoch": 0.7175141242937854, + "grad_norm": 0.044406965374946594, + "learning_rate": 3.906334626589642e-06, + "loss": 0.3655, + "step": 889 + }, + { + "epoch": 0.718321226795803, + "grad_norm": 0.04712330549955368, + "learning_rate": 3.885615005363237e-06, + "loss": 0.3986, + "step": 890 + }, + { + "epoch": 0.7191283292978208, + "grad_norm": 0.04551446810364723, + "learning_rate": 3.864937221688697e-06, + "loss": 0.3642, + "step": 891 + }, + { + "epoch": 0.7199354317998385, + "grad_norm": 0.04825610667467117, + "learning_rate": 3.844301417053323e-06, + "loss": 0.4192, + "step": 892 + }, + { + "epoch": 0.7207425343018563, + "grad_norm": 0.05064934864640236, + "learning_rate": 3.823707732657175e-06, + "loss": 0.3854, + "step": 893 + }, + { + "epoch": 0.7215496368038741, + "grad_norm": 0.05061221495270729, + "learning_rate": 3.8031563094121104e-06, + "loss": 0.4301, + "step": 894 + }, + { + "epoch": 0.7223567393058918, + "grad_norm": 0.04634290561079979, + "learning_rate": 3.782647287940817e-06, + "loss": 0.3964, + "step": 895 + }, + { + "epoch": 0.7231638418079096, + "grad_norm": 0.049565527588129044, + "learning_rate": 3.762180808575848e-06, + "loss": 0.4221, + "step": 896 + }, + { + "epoch": 0.7239709443099274, + "grad_norm": 0.0462944395840168, + "learning_rate": 3.7417570113586553e-06, + "loss": 0.3673, + "step": 897 + }, + { + "epoch": 0.7247780468119451, + "grad_norm": 0.04157343879342079, + "learning_rate": 3.721376036038653e-06, + "loss": 0.3436, + "step": 898 + }, + { + "epoch": 0.7255851493139629, + "grad_norm": 0.05127215012907982, + "learning_rate": 3.701038022072234e-06, + "loss": 0.384, + "step": 899 + }, + { + "epoch": 0.7263922518159807, + "grad_norm": 0.0550130195915699, + "learning_rate": 3.680743108621836e-06, + "loss": 0.4338, + "step": 900 + }, + { + "epoch": 0.7271993543179984, + "grad_norm": 0.048612404614686966, + "learning_rate": 3.6604914345549857e-06, + "loss": 0.3708, + "step": 901 + }, + { + "epoch": 0.7280064568200162, + "grad_norm": 0.04515503719449043, + "learning_rate": 3.6402831384433346e-06, + "loss": 0.3574, + "step": 902 + }, + { + "epoch": 0.7288135593220338, + "grad_norm": 0.048535142093896866, + "learning_rate": 3.6201183585617304e-06, + "loss": 0.3614, + "step": 903 + }, + { + "epoch": 0.7296206618240516, + "grad_norm": 0.05240613594651222, + "learning_rate": 3.5999972328872623e-06, + "loss": 0.3595, + "step": 904 + }, + { + "epoch": 0.7304277643260694, + "grad_norm": 0.04590952768921852, + "learning_rate": 3.5799198990983063e-06, + "loss": 0.4138, + "step": 905 + }, + { + "epoch": 0.7312348668280871, + "grad_norm": 0.04791981354355812, + "learning_rate": 3.5598864945736077e-06, + "loss": 0.4026, + "step": 906 + }, + { + "epoch": 0.7320419693301049, + "grad_norm": 0.05078763887286186, + "learning_rate": 3.539897156391312e-06, + "loss": 0.3849, + "step": 907 + }, + { + "epoch": 0.7328490718321227, + "grad_norm": 0.04727828875184059, + "learning_rate": 3.5199520213280526e-06, + "loss": 0.3521, + "step": 908 + }, + { + "epoch": 0.7336561743341404, + "grad_norm": 0.045900169759988785, + "learning_rate": 3.500051225858001e-06, + "loss": 0.3558, + "step": 909 + }, + { + "epoch": 0.7344632768361582, + "grad_norm": 0.05100518465042114, + "learning_rate": 3.480194906151938e-06, + "loss": 0.3983, + "step": 910 + }, + { + "epoch": 0.735270379338176, + "grad_norm": 0.049115296453237534, + "learning_rate": 3.4603831980763136e-06, + "loss": 0.3813, + "step": 911 + }, + { + "epoch": 0.7360774818401937, + "grad_norm": 0.05351780727505684, + "learning_rate": 3.440616237192325e-06, + "loss": 0.4019, + "step": 912 + }, + { + "epoch": 0.7368845843422115, + "grad_norm": 0.04854034259915352, + "learning_rate": 3.4208941587549917e-06, + "loss": 0.4318, + "step": 913 + }, + { + "epoch": 0.7376916868442293, + "grad_norm": 0.04963256046175957, + "learning_rate": 3.401217097712226e-06, + "loss": 0.3967, + "step": 914 + }, + { + "epoch": 0.738498789346247, + "grad_norm": 0.048572007566690445, + "learning_rate": 3.3815851887039064e-06, + "loss": 0.3585, + "step": 915 + }, + { + "epoch": 0.7393058918482648, + "grad_norm": 0.05005783215165138, + "learning_rate": 3.3619985660609655e-06, + "loss": 0.3355, + "step": 916 + }, + { + "epoch": 0.7401129943502824, + "grad_norm": 0.04688713699579239, + "learning_rate": 3.342457363804458e-06, + "loss": 0.3541, + "step": 917 + }, + { + "epoch": 0.7409200968523002, + "grad_norm": 0.04656829312443733, + "learning_rate": 3.322961715644647e-06, + "loss": 0.3095, + "step": 918 + }, + { + "epoch": 0.741727199354318, + "grad_norm": 0.05018598213791847, + "learning_rate": 3.3035117549801034e-06, + "loss": 0.3648, + "step": 919 + }, + { + "epoch": 0.7425343018563357, + "grad_norm": 0.05106114596128464, + "learning_rate": 3.284107614896778e-06, + "loss": 0.4279, + "step": 920 + }, + { + "epoch": 0.7433414043583535, + "grad_norm": 0.045182712376117706, + "learning_rate": 3.2647494281670967e-06, + "loss": 0.394, + "step": 921 + }, + { + "epoch": 0.7441485068603713, + "grad_norm": 0.04775145649909973, + "learning_rate": 3.2454373272490436e-06, + "loss": 0.337, + "step": 922 + }, + { + "epoch": 0.744955609362389, + "grad_norm": 0.045995697379112244, + "learning_rate": 3.226171444285272e-06, + "loss": 0.3459, + "step": 923 + }, + { + "epoch": 0.7457627118644068, + "grad_norm": 0.0462590754032135, + "learning_rate": 3.20695191110218e-06, + "loss": 0.3684, + "step": 924 + }, + { + "epoch": 0.7465698143664246, + "grad_norm": 0.0479058213531971, + "learning_rate": 3.187778859209022e-06, + "loss": 0.366, + "step": 925 + }, + { + "epoch": 0.7473769168684423, + "grad_norm": 0.050555307418107986, + "learning_rate": 3.1686524197970103e-06, + "loss": 0.4058, + "step": 926 + }, + { + "epoch": 0.7481840193704601, + "grad_norm": 0.048881128430366516, + "learning_rate": 3.1495727237384087e-06, + "loss": 0.3504, + "step": 927 + }, + { + "epoch": 0.7489911218724778, + "grad_norm": 0.05036919191479683, + "learning_rate": 3.1305399015856332e-06, + "loss": 0.3647, + "step": 928 + }, + { + "epoch": 0.7497982243744956, + "grad_norm": 0.04874005541205406, + "learning_rate": 3.111554083570377e-06, + "loss": 0.3649, + "step": 929 + }, + { + "epoch": 0.7506053268765133, + "grad_norm": 0.05192394554615021, + "learning_rate": 3.0926153996027066e-06, + "loss": 0.3635, + "step": 930 + }, + { + "epoch": 0.751412429378531, + "grad_norm": 0.05109355226159096, + "learning_rate": 3.0737239792701656e-06, + "loss": 0.3651, + "step": 931 + }, + { + "epoch": 0.7522195318805488, + "grad_norm": 0.04810253158211708, + "learning_rate": 3.05487995183691e-06, + "loss": 0.3923, + "step": 932 + }, + { + "epoch": 0.7530266343825666, + "grad_norm": 0.044694848358631134, + "learning_rate": 3.0360834462428002e-06, + "loss": 0.3712, + "step": 933 + }, + { + "epoch": 0.7538337368845843, + "grad_norm": 0.04288363456726074, + "learning_rate": 3.0173345911025343e-06, + "loss": 0.3793, + "step": 934 + }, + { + "epoch": 0.7546408393866021, + "grad_norm": 0.05430062487721443, + "learning_rate": 2.9986335147047642e-06, + "loss": 0.3893, + "step": 935 + }, + { + "epoch": 0.7554479418886199, + "grad_norm": 0.04797254130244255, + "learning_rate": 2.9799803450112185e-06, + "loss": 0.3265, + "step": 936 + }, + { + "epoch": 0.7562550443906376, + "grad_norm": 0.0462198331952095, + "learning_rate": 2.961375209655819e-06, + "loss": 0.3466, + "step": 937 + }, + { + "epoch": 0.7570621468926554, + "grad_norm": 0.05069499462842941, + "learning_rate": 2.942818235943814e-06, + "loss": 0.4104, + "step": 938 + }, + { + "epoch": 0.7578692493946732, + "grad_norm": 0.05128340423107147, + "learning_rate": 2.92430955085091e-06, + "loss": 0.3697, + "step": 939 + }, + { + "epoch": 0.7586763518966909, + "grad_norm": 0.05035511404275894, + "learning_rate": 2.9058492810224003e-06, + "loss": 0.3698, + "step": 940 + }, + { + "epoch": 0.7594834543987087, + "grad_norm": 0.05182954668998718, + "learning_rate": 2.8874375527722975e-06, + "loss": 0.3887, + "step": 941 + }, + { + "epoch": 0.7602905569007264, + "grad_norm": 0.047854945063591, + "learning_rate": 2.8690744920824697e-06, + "loss": 0.3705, + "step": 942 + }, + { + "epoch": 0.7610976594027441, + "grad_norm": 0.05556565895676613, + "learning_rate": 2.850760224601774e-06, + "loss": 0.4228, + "step": 943 + }, + { + "epoch": 0.7619047619047619, + "grad_norm": 0.05028011277318001, + "learning_rate": 2.832494875645202e-06, + "loss": 0.399, + "step": 944 + }, + { + "epoch": 0.7627118644067796, + "grad_norm": 0.04954536259174347, + "learning_rate": 2.814278570193025e-06, + "loss": 0.3334, + "step": 945 + }, + { + "epoch": 0.7635189669087974, + "grad_norm": 0.0548592135310173, + "learning_rate": 2.796111432889934e-06, + "loss": 0.415, + "step": 946 + }, + { + "epoch": 0.7643260694108152, + "grad_norm": 0.043614424765110016, + "learning_rate": 2.77799358804419e-06, + "loss": 0.3448, + "step": 947 + }, + { + "epoch": 0.7651331719128329, + "grad_norm": 0.04365172237157822, + "learning_rate": 2.759925159626763e-06, + "loss": 0.3924, + "step": 948 + }, + { + "epoch": 0.7659402744148507, + "grad_norm": 0.050896864384412766, + "learning_rate": 2.7419062712705025e-06, + "loss": 0.3812, + "step": 949 + }, + { + "epoch": 0.7667473769168685, + "grad_norm": 0.05392688512802124, + "learning_rate": 2.7239370462692716e-06, + "loss": 0.4221, + "step": 950 + }, + { + "epoch": 0.7675544794188862, + "grad_norm": 0.050148848444223404, + "learning_rate": 2.7060176075771217e-06, + "loss": 0.3669, + "step": 951 + }, + { + "epoch": 0.768361581920904, + "grad_norm": 0.04388059303164482, + "learning_rate": 2.6881480778074396e-06, + "loss": 0.3798, + "step": 952 + }, + { + "epoch": 0.7691686844229217, + "grad_norm": 0.048120852559804916, + "learning_rate": 2.6703285792321045e-06, + "loss": 0.373, + "step": 953 + }, + { + "epoch": 0.7699757869249395, + "grad_norm": 0.04943971335887909, + "learning_rate": 2.6525592337806684e-06, + "loss": 0.3712, + "step": 954 + }, + { + "epoch": 0.7707828894269573, + "grad_norm": 0.04997150972485542, + "learning_rate": 2.634840163039508e-06, + "loss": 0.3958, + "step": 955 + }, + { + "epoch": 0.7715899919289749, + "grad_norm": 0.06133696436882019, + "learning_rate": 2.617171488250991e-06, + "loss": 0.4321, + "step": 956 + }, + { + "epoch": 0.7723970944309927, + "grad_norm": 0.05077798292040825, + "learning_rate": 2.599553330312663e-06, + "loss": 0.3904, + "step": 957 + }, + { + "epoch": 0.7732041969330105, + "grad_norm": 0.05077609792351723, + "learning_rate": 2.581985809776395e-06, + "loss": 0.361, + "step": 958 + }, + { + "epoch": 0.7740112994350282, + "grad_norm": 0.04809306561946869, + "learning_rate": 2.564469046847583e-06, + "loss": 0.3769, + "step": 959 + }, + { + "epoch": 0.774818401937046, + "grad_norm": 0.04891866073012352, + "learning_rate": 2.5470031613843127e-06, + "loss": 0.3927, + "step": 960 + }, + { + "epoch": 0.7756255044390638, + "grad_norm": 0.04797545447945595, + "learning_rate": 2.529588272896544e-06, + "loss": 0.3339, + "step": 961 + }, + { + "epoch": 0.7764326069410815, + "grad_norm": 0.05068304389715195, + "learning_rate": 2.512224500545285e-06, + "loss": 0.4047, + "step": 962 + }, + { + "epoch": 0.7772397094430993, + "grad_norm": 0.05368385463953018, + "learning_rate": 2.494911963141784e-06, + "loss": 0.3864, + "step": 963 + }, + { + "epoch": 0.778046811945117, + "grad_norm": 0.05426260456442833, + "learning_rate": 2.4776507791467195e-06, + "loss": 0.3241, + "step": 964 + }, + { + "epoch": 0.7788539144471348, + "grad_norm": 0.052247244864702225, + "learning_rate": 2.4604410666693846e-06, + "loss": 0.3536, + "step": 965 + }, + { + "epoch": 0.7796610169491526, + "grad_norm": 0.04787401109933853, + "learning_rate": 2.4432829434668757e-06, + "loss": 0.3588, + "step": 966 + }, + { + "epoch": 0.7804681194511703, + "grad_norm": 0.050289399921894073, + "learning_rate": 2.426176526943298e-06, + "loss": 0.4559, + "step": 967 + }, + { + "epoch": 0.7812752219531881, + "grad_norm": 0.048816096037626266, + "learning_rate": 2.4091219341489458e-06, + "loss": 0.3831, + "step": 968 + }, + { + "epoch": 0.7820823244552058, + "grad_norm": 0.05117584764957428, + "learning_rate": 2.3921192817795125e-06, + "loss": 0.3452, + "step": 969 + }, + { + "epoch": 0.7828894269572235, + "grad_norm": 0.04862622544169426, + "learning_rate": 2.3751686861752967e-06, + "loss": 0.4173, + "step": 970 + }, + { + "epoch": 0.7836965294592413, + "grad_norm": 0.0534110851585865, + "learning_rate": 2.3582702633203925e-06, + "loss": 0.4199, + "step": 971 + }, + { + "epoch": 0.784503631961259, + "grad_norm": 0.05284995958209038, + "learning_rate": 2.3414241288419114e-06, + "loss": 0.4122, + "step": 972 + }, + { + "epoch": 0.7853107344632768, + "grad_norm": 0.049155883491039276, + "learning_rate": 2.32463039800917e-06, + "loss": 0.4517, + "step": 973 + }, + { + "epoch": 0.7861178369652946, + "grad_norm": 0.050774216651916504, + "learning_rate": 2.307889185732928e-06, + "loss": 0.4045, + "step": 974 + }, + { + "epoch": 0.7869249394673123, + "grad_norm": 0.055596962571144104, + "learning_rate": 2.2912006065645763e-06, + "loss": 0.4097, + "step": 975 + }, + { + "epoch": 0.7877320419693301, + "grad_norm": 0.04935078322887421, + "learning_rate": 2.2745647746953714e-06, + "loss": 0.3764, + "step": 976 + }, + { + "epoch": 0.7885391444713479, + "grad_norm": 0.05432211980223656, + "learning_rate": 2.25798180395565e-06, + "loss": 0.3792, + "step": 977 + }, + { + "epoch": 0.7893462469733656, + "grad_norm": 0.05181829631328583, + "learning_rate": 2.2414518078140366e-06, + "loss": 0.3573, + "step": 978 + }, + { + "epoch": 0.7901533494753834, + "grad_norm": 0.04890815168619156, + "learning_rate": 2.2249748993766908e-06, + "loss": 0.3253, + "step": 979 + }, + { + "epoch": 0.7909604519774012, + "grad_norm": 0.051907122135162354, + "learning_rate": 2.208551191386512e-06, + "loss": 0.4315, + "step": 980 + }, + { + "epoch": 0.7917675544794189, + "grad_norm": 0.05192062631249428, + "learning_rate": 2.192180796222384e-06, + "loss": 0.3794, + "step": 981 + }, + { + "epoch": 0.7925746569814366, + "grad_norm": 0.05353526771068573, + "learning_rate": 2.17586382589839e-06, + "loss": 0.3847, + "step": 982 + }, + { + "epoch": 0.7933817594834544, + "grad_norm": 0.04733423516154289, + "learning_rate": 2.1596003920630626e-06, + "loss": 0.3336, + "step": 983 + }, + { + "epoch": 0.7941888619854721, + "grad_norm": 0.046822912991046906, + "learning_rate": 2.143390605998604e-06, + "loss": 0.3655, + "step": 984 + }, + { + "epoch": 0.7949959644874899, + "grad_norm": 0.05472195893526077, + "learning_rate": 2.1272345786201408e-06, + "loss": 0.4253, + "step": 985 + }, + { + "epoch": 0.7958030669895076, + "grad_norm": 0.04976249858736992, + "learning_rate": 2.111132420474953e-06, + "loss": 0.32, + "step": 986 + }, + { + "epoch": 0.7966101694915254, + "grad_norm": 0.04802384972572327, + "learning_rate": 2.095084241741725e-06, + "loss": 0.3397, + "step": 987 + }, + { + "epoch": 0.7974172719935432, + "grad_norm": 0.047920335084199905, + "learning_rate": 2.0790901522297822e-06, + "loss": 0.3472, + "step": 988 + }, + { + "epoch": 0.7982243744955609, + "grad_norm": 0.04877651855349541, + "learning_rate": 2.063150261378346e-06, + "loss": 0.3795, + "step": 989 + }, + { + "epoch": 0.7990314769975787, + "grad_norm": 0.05073659494519234, + "learning_rate": 2.0472646782557915e-06, + "loss": 0.378, + "step": 990 + }, + { + "epoch": 0.7998385794995965, + "grad_norm": 0.053008127957582474, + "learning_rate": 2.0314335115588866e-06, + "loss": 0.4238, + "step": 991 + }, + { + "epoch": 0.8006456820016142, + "grad_norm": 0.052719682455062866, + "learning_rate": 2.0156568696120636e-06, + "loss": 0.4257, + "step": 992 + }, + { + "epoch": 0.801452784503632, + "grad_norm": 0.048593856394290924, + "learning_rate": 1.9999348603666657e-06, + "loss": 0.3694, + "step": 993 + }, + { + "epoch": 0.8022598870056498, + "grad_norm": 0.04972432553768158, + "learning_rate": 1.9842675914002117e-06, + "loss": 0.3558, + "step": 994 + }, + { + "epoch": 0.8030669895076675, + "grad_norm": 0.04915817454457283, + "learning_rate": 1.968655169915661e-06, + "loss": 0.3779, + "step": 995 + }, + { + "epoch": 0.8038740920096852, + "grad_norm": 0.055468250066041946, + "learning_rate": 1.9530977027406828e-06, + "loss": 0.4878, + "step": 996 + }, + { + "epoch": 0.804681194511703, + "grad_norm": 0.052601367235183716, + "learning_rate": 1.937595296326924e-06, + "loss": 0.4093, + "step": 997 + }, + { + "epoch": 0.8054882970137207, + "grad_norm": 0.05105729401111603, + "learning_rate": 1.922148056749279e-06, + "loss": 0.4446, + "step": 998 + }, + { + "epoch": 0.8062953995157385, + "grad_norm": 0.05593395605683327, + "learning_rate": 1.9067560897051585e-06, + "loss": 0.3759, + "step": 999 + }, + { + "epoch": 0.8071025020177562, + "grad_norm": 0.048278238624334335, + "learning_rate": 1.8914195005137793e-06, + "loss": 0.4164, + "step": 1000 + }, + { + "epoch": 0.807909604519774, + "grad_norm": 0.04513949155807495, + "learning_rate": 1.8761383941154288e-06, + "loss": 0.336, + "step": 1001 + }, + { + "epoch": 0.8087167070217918, + "grad_norm": 0.051345016807317734, + "learning_rate": 1.8609128750707629e-06, + "loss": 0.3387, + "step": 1002 + }, + { + "epoch": 0.8095238095238095, + "grad_norm": 0.05032331123948097, + "learning_rate": 1.845743047560079e-06, + "loss": 0.3358, + "step": 1003 + }, + { + "epoch": 0.8103309120258273, + "grad_norm": 0.05130847543478012, + "learning_rate": 1.8306290153826012e-06, + "loss": 0.385, + "step": 1004 + }, + { + "epoch": 0.8111380145278451, + "grad_norm": 0.04798781871795654, + "learning_rate": 1.8155708819557826e-06, + "loss": 0.3949, + "step": 1005 + }, + { + "epoch": 0.8119451170298628, + "grad_norm": 0.0486295223236084, + "learning_rate": 1.8005687503145886e-06, + "loss": 0.3852, + "step": 1006 + }, + { + "epoch": 0.8127522195318806, + "grad_norm": 0.04694712907075882, + "learning_rate": 1.7856227231107892e-06, + "loss": 0.3438, + "step": 1007 + }, + { + "epoch": 0.8135593220338984, + "grad_norm": 0.05250077322125435, + "learning_rate": 1.7707329026122665e-06, + "loss": 0.4018, + "step": 1008 + }, + { + "epoch": 0.814366424535916, + "grad_norm": 0.04679161310195923, + "learning_rate": 1.7558993907023037e-06, + "loss": 0.3884, + "step": 1009 + }, + { + "epoch": 0.8151735270379338, + "grad_norm": 0.055237215012311935, + "learning_rate": 1.7411222888788993e-06, + "loss": 0.3961, + "step": 1010 + }, + { + "epoch": 0.8159806295399515, + "grad_norm": 0.05706053599715233, + "learning_rate": 1.7264016982540632e-06, + "loss": 0.3956, + "step": 1011 + }, + { + "epoch": 0.8167877320419693, + "grad_norm": 0.05254807323217392, + "learning_rate": 1.7117377195531337e-06, + "loss": 0.3668, + "step": 1012 + }, + { + "epoch": 0.8175948345439871, + "grad_norm": 0.050519250333309174, + "learning_rate": 1.6971304531140753e-06, + "loss": 0.4352, + "step": 1013 + }, + { + "epoch": 0.8184019370460048, + "grad_norm": 0.04889274388551712, + "learning_rate": 1.682579998886803e-06, + "loss": 0.3763, + "step": 1014 + }, + { + "epoch": 0.8192090395480226, + "grad_norm": 0.04828520119190216, + "learning_rate": 1.6680864564324984e-06, + "loss": 0.382, + "step": 1015 + }, + { + "epoch": 0.8200161420500404, + "grad_norm": 0.04728667438030243, + "learning_rate": 1.6536499249229243e-06, + "loss": 0.3385, + "step": 1016 + }, + { + "epoch": 0.8208232445520581, + "grad_norm": 0.0539923720061779, + "learning_rate": 1.639270503139746e-06, + "loss": 0.4052, + "step": 1017 + }, + { + "epoch": 0.8216303470540759, + "grad_norm": 0.04704679921269417, + "learning_rate": 1.62494828947386e-06, + "loss": 0.3622, + "step": 1018 + }, + { + "epoch": 0.8224374495560937, + "grad_norm": 0.054417211562395096, + "learning_rate": 1.6106833819247114e-06, + "loss": 0.3984, + "step": 1019 + }, + { + "epoch": 0.8232445520581114, + "grad_norm": 0.05438614636659622, + "learning_rate": 1.5964758780996314e-06, + "loss": 0.3658, + "step": 1020 + }, + { + "epoch": 0.8240516545601292, + "grad_norm": 0.04959606006741524, + "learning_rate": 1.5823258752131698e-06, + "loss": 0.351, + "step": 1021 + }, + { + "epoch": 0.8248587570621468, + "grad_norm": 0.04936448112130165, + "learning_rate": 1.5682334700864289e-06, + "loss": 0.3469, + "step": 1022 + }, + { + "epoch": 0.8256658595641646, + "grad_norm": 0.04924401640892029, + "learning_rate": 1.5541987591463969e-06, + "loss": 0.3566, + "step": 1023 + }, + { + "epoch": 0.8264729620661824, + "grad_norm": 0.051505424082279205, + "learning_rate": 1.5402218384252875e-06, + "loss": 0.3495, + "step": 1024 + }, + { + "epoch": 0.8272800645682001, + "grad_norm": 0.050205498933792114, + "learning_rate": 1.5263028035598903e-06, + "loss": 0.4093, + "step": 1025 + }, + { + "epoch": 0.8280871670702179, + "grad_norm": 0.04906272143125534, + "learning_rate": 1.512441749790916e-06, + "loss": 0.3906, + "step": 1026 + }, + { + "epoch": 0.8288942695722357, + "grad_norm": 0.05131586641073227, + "learning_rate": 1.4986387719623286e-06, + "loss": 0.4302, + "step": 1027 + }, + { + "epoch": 0.8297013720742534, + "grad_norm": 0.05708414316177368, + "learning_rate": 1.4848939645207227e-06, + "loss": 0.3953, + "step": 1028 + }, + { + "epoch": 0.8305084745762712, + "grad_norm": 0.05040213093161583, + "learning_rate": 1.471207421514651e-06, + "loss": 0.3571, + "step": 1029 + }, + { + "epoch": 0.831315577078289, + "grad_norm": 0.046975214034318924, + "learning_rate": 1.4575792365940023e-06, + "loss": 0.3597, + "step": 1030 + }, + { + "epoch": 0.8321226795803067, + "grad_norm": 0.04971659183502197, + "learning_rate": 1.4440095030093459e-06, + "loss": 0.3662, + "step": 1031 + }, + { + "epoch": 0.8329297820823245, + "grad_norm": 0.05251837521791458, + "learning_rate": 1.4304983136113048e-06, + "loss": 0.3641, + "step": 1032 + }, + { + "epoch": 0.8337368845843423, + "grad_norm": 0.058566804975271225, + "learning_rate": 1.4170457608499077e-06, + "loss": 0.3912, + "step": 1033 + }, + { + "epoch": 0.83454398708636, + "grad_norm": 0.052699752151966095, + "learning_rate": 1.4036519367739655e-06, + "loss": 0.3913, + "step": 1034 + }, + { + "epoch": 0.8353510895883777, + "grad_norm": 0.05296236276626587, + "learning_rate": 1.39031693303044e-06, + "loss": 0.3577, + "step": 1035 + }, + { + "epoch": 0.8361581920903954, + "grad_norm": 0.04882199317216873, + "learning_rate": 1.3770408408638193e-06, + "loss": 0.3931, + "step": 1036 + }, + { + "epoch": 0.8369652945924132, + "grad_norm": 0.049222007393836975, + "learning_rate": 1.3638237511154839e-06, + "loss": 0.3793, + "step": 1037 + }, + { + "epoch": 0.837772397094431, + "grad_norm": 0.046755850315093994, + "learning_rate": 1.3506657542231006e-06, + "loss": 0.3604, + "step": 1038 + }, + { + "epoch": 0.8385794995964487, + "grad_norm": 0.052193090319633484, + "learning_rate": 1.3375669402199842e-06, + "loss": 0.3871, + "step": 1039 + }, + { + "epoch": 0.8393866020984665, + "grad_norm": 0.052387386560440063, + "learning_rate": 1.3245273987344954e-06, + "loss": 0.3727, + "step": 1040 + }, + { + "epoch": 0.8401937046004843, + "grad_norm": 0.04449461027979851, + "learning_rate": 1.3115472189894275e-06, + "loss": 0.3227, + "step": 1041 + }, + { + "epoch": 0.841000807102502, + "grad_norm": 0.04822150245308876, + "learning_rate": 1.2986264898013878e-06, + "loss": 0.3887, + "step": 1042 + }, + { + "epoch": 0.8418079096045198, + "grad_norm": 0.049816031008958817, + "learning_rate": 1.2857652995801984e-06, + "loss": 0.3654, + "step": 1043 + }, + { + "epoch": 0.8426150121065376, + "grad_norm": 0.053092245012521744, + "learning_rate": 1.2729637363282876e-06, + "loss": 0.396, + "step": 1044 + }, + { + "epoch": 0.8434221146085553, + "grad_norm": 0.05084899067878723, + "learning_rate": 1.2602218876400807e-06, + "loss": 0.3845, + "step": 1045 + }, + { + "epoch": 0.8442292171105731, + "grad_norm": 0.052148766815662384, + "learning_rate": 1.2475398407014117e-06, + "loss": 0.4181, + "step": 1046 + }, + { + "epoch": 0.8450363196125908, + "grad_norm": 0.050065118819475174, + "learning_rate": 1.2349176822889242e-06, + "loss": 0.3919, + "step": 1047 + }, + { + "epoch": 0.8458434221146085, + "grad_norm": 0.04557253420352936, + "learning_rate": 1.222355498769473e-06, + "loss": 0.3882, + "step": 1048 + }, + { + "epoch": 0.8466505246166263, + "grad_norm": 0.05082574486732483, + "learning_rate": 1.20985337609954e-06, + "loss": 0.3537, + "step": 1049 + }, + { + "epoch": 0.847457627118644, + "grad_norm": 0.05619888752698898, + "learning_rate": 1.197411399824635e-06, + "loss": 0.4135, + "step": 1050 + }, + { + "epoch": 0.8482647296206618, + "grad_norm": 0.04628870263695717, + "learning_rate": 1.1850296550787243e-06, + "loss": 0.3351, + "step": 1051 + }, + { + "epoch": 0.8490718321226796, + "grad_norm": 0.04656703397631645, + "learning_rate": 1.1727082265836343e-06, + "loss": 0.392, + "step": 1052 + }, + { + "epoch": 0.8498789346246973, + "grad_norm": 0.046244919300079346, + "learning_rate": 1.1604471986484855e-06, + "loss": 0.3328, + "step": 1053 + }, + { + "epoch": 0.8506860371267151, + "grad_norm": 0.049990214407444, + "learning_rate": 1.1482466551691074e-06, + "loss": 0.3703, + "step": 1054 + }, + { + "epoch": 0.8514931396287329, + "grad_norm": 0.05336960032582283, + "learning_rate": 1.136106679627461e-06, + "loss": 0.4406, + "step": 1055 + }, + { + "epoch": 0.8523002421307506, + "grad_norm": 0.04921855032444, + "learning_rate": 1.1240273550910774e-06, + "loss": 0.3643, + "step": 1056 + }, + { + "epoch": 0.8531073446327684, + "grad_norm": 0.05234561115503311, + "learning_rate": 1.1120087642124843e-06, + "loss": 0.3505, + "step": 1057 + }, + { + "epoch": 0.8539144471347861, + "grad_norm": 0.04735950753092766, + "learning_rate": 1.100050989228636e-06, + "loss": 0.386, + "step": 1058 + }, + { + "epoch": 0.8547215496368039, + "grad_norm": 0.04945006221532822, + "learning_rate": 1.0881541119603633e-06, + "loss": 0.3505, + "step": 1059 + }, + { + "epoch": 0.8555286521388217, + "grad_norm": 0.04880194738507271, + "learning_rate": 1.0763182138117945e-06, + "loss": 0.3159, + "step": 1060 + }, + { + "epoch": 0.8563357546408394, + "grad_norm": 0.04775940626859665, + "learning_rate": 1.0645433757698209e-06, + "loss": 0.3387, + "step": 1061 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.05463748052716255, + "learning_rate": 1.0528296784035241e-06, + "loss": 0.4155, + "step": 1062 + }, + { + "epoch": 0.8579499596448749, + "grad_norm": 0.049621522426605225, + "learning_rate": 1.0411772018636368e-06, + "loss": 0.3434, + "step": 1063 + }, + { + "epoch": 0.8587570621468926, + "grad_norm": 0.051833219826221466, + "learning_rate": 1.0295860258819856e-06, + "loss": 0.4296, + "step": 1064 + }, + { + "epoch": 0.8595641646489104, + "grad_norm": 0.049275219440460205, + "learning_rate": 1.0180562297709473e-06, + "loss": 0.3551, + "step": 1065 + }, + { + "epoch": 0.8603712671509282, + "grad_norm": 0.051726870238780975, + "learning_rate": 1.0065878924229144e-06, + "loss": 0.3456, + "step": 1066 + }, + { + "epoch": 0.8611783696529459, + "grad_norm": 0.05007703974843025, + "learning_rate": 9.95181092309745e-07, + "loss": 0.3393, + "step": 1067 + }, + { + "epoch": 0.8619854721549637, + "grad_norm": 0.05635494366288185, + "learning_rate": 9.838359074822323e-07, + "loss": 0.4061, + "step": 1068 + }, + { + "epoch": 0.8627925746569814, + "grad_norm": 0.05516330897808075, + "learning_rate": 9.725524155695688e-07, + "loss": 0.4037, + "step": 1069 + }, + { + "epoch": 0.8635996771589992, + "grad_norm": 0.0452413484454155, + "learning_rate": 9.613306937788102e-07, + "loss": 0.3627, + "step": 1070 + }, + { + "epoch": 0.864406779661017, + "grad_norm": 0.044155534356832504, + "learning_rate": 9.501708188943514e-07, + "loss": 0.3338, + "step": 1071 + }, + { + "epoch": 0.8652138821630347, + "grad_norm": 0.05689741298556328, + "learning_rate": 9.390728672774074e-07, + "loss": 0.4419, + "step": 1072 + }, + { + "epoch": 0.8660209846650525, + "grad_norm": 0.06184333935379982, + "learning_rate": 9.280369148654789e-07, + "loss": 0.45, + "step": 1073 + }, + { + "epoch": 0.8668280871670703, + "grad_norm": 0.05039246752858162, + "learning_rate": 9.170630371718436e-07, + "loss": 0.3764, + "step": 1074 + }, + { + "epoch": 0.8676351896690879, + "grad_norm": 0.052961479872465134, + "learning_rate": 9.06151309285026e-07, + "loss": 0.4372, + "step": 1075 + }, + { + "epoch": 0.8684422921711057, + "grad_norm": 0.054415419697761536, + "learning_rate": 8.953018058682994e-07, + "loss": 0.4019, + "step": 1076 + }, + { + "epoch": 0.8692493946731235, + "grad_norm": 0.0524618923664093, + "learning_rate": 8.845146011591654e-07, + "loss": 0.3817, + "step": 1077 + }, + { + "epoch": 0.8700564971751412, + "grad_norm": 0.05585518106818199, + "learning_rate": 8.737897689688446e-07, + "loss": 0.3745, + "step": 1078 + }, + { + "epoch": 0.870863599677159, + "grad_norm": 0.04939413070678711, + "learning_rate": 8.631273826817821e-07, + "loss": 0.3667, + "step": 1079 + }, + { + "epoch": 0.8716707021791767, + "grad_norm": 0.05061611533164978, + "learning_rate": 8.525275152551282e-07, + "loss": 0.3755, + "step": 1080 + }, + { + "epoch": 0.8724778046811945, + "grad_norm": 0.051909856498241425, + "learning_rate": 8.419902392182588e-07, + "loss": 0.3544, + "step": 1081 + }, + { + "epoch": 0.8732849071832123, + "grad_norm": 0.0464894101023674, + "learning_rate": 8.315156266722635e-07, + "loss": 0.2752, + "step": 1082 + }, + { + "epoch": 0.87409200968523, + "grad_norm": 0.04502680152654648, + "learning_rate": 8.211037492894625e-07, + "loss": 0.3677, + "step": 1083 + }, + { + "epoch": 0.8748991121872478, + "grad_norm": 0.04734842851758003, + "learning_rate": 8.107546783129095e-07, + "loss": 0.298, + "step": 1084 + }, + { + "epoch": 0.8757062146892656, + "grad_norm": 0.04617322236299515, + "learning_rate": 8.00468484555904e-07, + "loss": 0.3164, + "step": 1085 + }, + { + "epoch": 0.8765133171912833, + "grad_norm": 0.052291661500930786, + "learning_rate": 7.902452384015136e-07, + "loss": 0.3937, + "step": 1086 + }, + { + "epoch": 0.8773204196933011, + "grad_norm": 0.047588132321834564, + "learning_rate": 7.800850098020874e-07, + "loss": 0.3512, + "step": 1087 + }, + { + "epoch": 0.8781275221953188, + "grad_norm": 0.052274979650974274, + "learning_rate": 7.699878682787787e-07, + "loss": 0.3795, + "step": 1088 + }, + { + "epoch": 0.8789346246973365, + "grad_norm": 0.052272357046604156, + "learning_rate": 7.599538829210684e-07, + "loss": 0.4079, + "step": 1089 + }, + { + "epoch": 0.8797417271993543, + "grad_norm": 0.05046515539288521, + "learning_rate": 7.499831223862907e-07, + "loss": 0.3683, + "step": 1090 + }, + { + "epoch": 0.880548829701372, + "grad_norm": 0.054393284022808075, + "learning_rate": 7.40075654899165e-07, + "loss": 0.3961, + "step": 1091 + }, + { + "epoch": 0.8813559322033898, + "grad_norm": 0.055753178894519806, + "learning_rate": 7.302315482513333e-07, + "loss": 0.4332, + "step": 1092 + }, + { + "epoch": 0.8821630347054076, + "grad_norm": 0.054667022079229355, + "learning_rate": 7.204508698008894e-07, + "loss": 0.3557, + "step": 1093 + }, + { + "epoch": 0.8829701372074253, + "grad_norm": 0.056577954441308975, + "learning_rate": 7.10733686471925e-07, + "loss": 0.4301, + "step": 1094 + }, + { + "epoch": 0.8837772397094431, + "grad_norm": 0.048220470547676086, + "learning_rate": 7.010800647540583e-07, + "loss": 0.3505, + "step": 1095 + }, + { + "epoch": 0.8845843422114609, + "grad_norm": 0.05123896524310112, + "learning_rate": 6.91490070702e-07, + "loss": 0.3813, + "step": 1096 + }, + { + "epoch": 0.8853914447134786, + "grad_norm": 0.05035271868109703, + "learning_rate": 6.819637699350811e-07, + "loss": 0.3497, + "step": 1097 + }, + { + "epoch": 0.8861985472154964, + "grad_norm": 0.05092376098036766, + "learning_rate": 6.72501227636817e-07, + "loss": 0.3873, + "step": 1098 + }, + { + "epoch": 0.8870056497175142, + "grad_norm": 0.047779954969882965, + "learning_rate": 6.631025085544563e-07, + "loss": 0.3953, + "step": 1099 + }, + { + "epoch": 0.8878127522195319, + "grad_norm": 0.05742986127734184, + "learning_rate": 6.53767676998539e-07, + "loss": 0.3918, + "step": 1100 + }, + { + "epoch": 0.8886198547215496, + "grad_norm": 0.04611702263355255, + "learning_rate": 6.444967968424543e-07, + "loss": 0.3518, + "step": 1101 + }, + { + "epoch": 0.8894269572235673, + "grad_norm": 0.04918667674064636, + "learning_rate": 6.352899315220085e-07, + "loss": 0.3576, + "step": 1102 + }, + { + "epoch": 0.8902340597255851, + "grad_norm": 0.05174895375967026, + "learning_rate": 6.261471440349831e-07, + "loss": 0.3991, + "step": 1103 + }, + { + "epoch": 0.8910411622276029, + "grad_norm": 0.05327307805418968, + "learning_rate": 6.170684969407115e-07, + "loss": 0.4019, + "step": 1104 + }, + { + "epoch": 0.8918482647296206, + "grad_norm": 0.05138659477233887, + "learning_rate": 6.08054052359649e-07, + "loss": 0.3321, + "step": 1105 + }, + { + "epoch": 0.8926553672316384, + "grad_norm": 0.053043756633996964, + "learning_rate": 5.991038719729414e-07, + "loss": 0.3577, + "step": 1106 + }, + { + "epoch": 0.8934624697336562, + "grad_norm": 0.05240008980035782, + "learning_rate": 5.902180170220129e-07, + "loss": 0.3996, + "step": 1107 + }, + { + "epoch": 0.8942695722356739, + "grad_norm": 0.04946550726890564, + "learning_rate": 5.81396548308144e-07, + "loss": 0.415, + "step": 1108 + }, + { + "epoch": 0.8950766747376917, + "grad_norm": 0.045649804174900055, + "learning_rate": 5.726395261920448e-07, + "loss": 0.3409, + "step": 1109 + }, + { + "epoch": 0.8958837772397095, + "grad_norm": 0.05500829964876175, + "learning_rate": 5.63947010593462e-07, + "loss": 0.4325, + "step": 1110 + }, + { + "epoch": 0.8966908797417272, + "grad_norm": 0.049944207072257996, + "learning_rate": 5.553190609907478e-07, + "loss": 0.3422, + "step": 1111 + }, + { + "epoch": 0.897497982243745, + "grad_norm": 0.049136772751808167, + "learning_rate": 5.467557364204701e-07, + "loss": 0.4173, + "step": 1112 + }, + { + "epoch": 0.8983050847457628, + "grad_norm": 0.04535511136054993, + "learning_rate": 5.382570954769972e-07, + "loss": 0.3397, + "step": 1113 + }, + { + "epoch": 0.8991121872477804, + "grad_norm": 0.048276886343955994, + "learning_rate": 5.298231963121059e-07, + "loss": 0.3849, + "step": 1114 + }, + { + "epoch": 0.8999192897497982, + "grad_norm": 0.05075502023100853, + "learning_rate": 5.214540966345727e-07, + "loss": 0.3805, + "step": 1115 + }, + { + "epoch": 0.9007263922518159, + "grad_norm": 0.049204349517822266, + "learning_rate": 5.131498537097857e-07, + "loss": 0.3482, + "step": 1116 + }, + { + "epoch": 0.9015334947538337, + "grad_norm": 0.04519697278738022, + "learning_rate": 5.049105243593577e-07, + "loss": 0.3455, + "step": 1117 + }, + { + "epoch": 0.9023405972558515, + "grad_norm": 0.04939783364534378, + "learning_rate": 4.967361649607261e-07, + "loss": 0.3688, + "step": 1118 + }, + { + "epoch": 0.9031476997578692, + "grad_norm": 0.04733356088399887, + "learning_rate": 4.886268314467746e-07, + "loss": 0.3951, + "step": 1119 + }, + { + "epoch": 0.903954802259887, + "grad_norm": 0.04528392478823662, + "learning_rate": 4.80582579305452e-07, + "loss": 0.3545, + "step": 1120 + }, + { + "epoch": 0.9047619047619048, + "grad_norm": 0.05273212119936943, + "learning_rate": 4.726034635793808e-07, + "loss": 0.4049, + "step": 1121 + }, + { + "epoch": 0.9055690072639225, + "grad_norm": 0.05001245066523552, + "learning_rate": 4.646895388654982e-07, + "loss": 0.3526, + "step": 1122 + }, + { + "epoch": 0.9063761097659403, + "grad_norm": 0.05269649997353554, + "learning_rate": 4.568408593146656e-07, + "loss": 0.3427, + "step": 1123 + }, + { + "epoch": 0.9071832122679581, + "grad_norm": 0.053200993686914444, + "learning_rate": 4.490574786313107e-07, + "loss": 0.3871, + "step": 1124 + }, + { + "epoch": 0.9079903147699758, + "grad_norm": 0.05451187118887901, + "learning_rate": 4.4133945007305614e-07, + "loss": 0.3682, + "step": 1125 + }, + { + "epoch": 0.9087974172719936, + "grad_norm": 0.05771080031991005, + "learning_rate": 4.3368682645034687e-07, + "loss": 0.3663, + "step": 1126 + }, + { + "epoch": 0.9096045197740112, + "grad_norm": 0.05234134569764137, + "learning_rate": 4.260996601261036e-07, + "loss": 0.3683, + "step": 1127 + }, + { + "epoch": 0.910411622276029, + "grad_norm": 0.05028250813484192, + "learning_rate": 4.18578003015353e-07, + "loss": 0.3965, + "step": 1128 + }, + { + "epoch": 0.9112187247780468, + "grad_norm": 0.053080759942531586, + "learning_rate": 4.1112190658487707e-07, + "loss": 0.4066, + "step": 1129 + }, + { + "epoch": 0.9120258272800645, + "grad_norm": 0.04984816908836365, + "learning_rate": 4.0373142185285984e-07, + "loss": 0.3632, + "step": 1130 + }, + { + "epoch": 0.9128329297820823, + "grad_norm": 0.04705459997057915, + "learning_rate": 3.964065993885391e-07, + "loss": 0.3467, + "step": 1131 + }, + { + "epoch": 0.9136400322841001, + "grad_norm": 0.054150789976119995, + "learning_rate": 3.8914748931185854e-07, + "loss": 0.3637, + "step": 1132 + }, + { + "epoch": 0.9144471347861178, + "grad_norm": 0.05248238146305084, + "learning_rate": 3.8195414129313046e-07, + "loss": 0.4039, + "step": 1133 + }, + { + "epoch": 0.9152542372881356, + "grad_norm": 0.047117725014686584, + "learning_rate": 3.7482660455268825e-07, + "loss": 0.3589, + "step": 1134 + }, + { + "epoch": 0.9160613397901534, + "grad_norm": 0.054256290197372437, + "learning_rate": 3.6776492786055105e-07, + "loss": 0.4435, + "step": 1135 + }, + { + "epoch": 0.9168684422921711, + "grad_norm": 0.054758455604314804, + "learning_rate": 3.6076915953609625e-07, + "loss": 0.3956, + "step": 1136 + }, + { + "epoch": 0.9176755447941889, + "grad_norm": 0.048094723373651505, + "learning_rate": 3.53839347447722e-07, + "loss": 0.3926, + "step": 1137 + }, + { + "epoch": 0.9184826472962067, + "grad_norm": 0.049094658344984055, + "learning_rate": 3.469755390125229e-07, + "loss": 0.3614, + "step": 1138 + }, + { + "epoch": 0.9192897497982244, + "grad_norm": 0.04998333007097244, + "learning_rate": 3.401777811959661e-07, + "loss": 0.3428, + "step": 1139 + }, + { + "epoch": 0.9200968523002422, + "grad_norm": 0.046912651509046555, + "learning_rate": 3.33446120511568e-07, + "loss": 0.318, + "step": 1140 + }, + { + "epoch": 0.9209039548022598, + "grad_norm": 0.049167439341545105, + "learning_rate": 3.267806030205756e-07, + "loss": 0.3459, + "step": 1141 + }, + { + "epoch": 0.9217110573042776, + "grad_norm": 0.05661160126328468, + "learning_rate": 3.201812743316524e-07, + "loss": 0.4099, + "step": 1142 + }, + { + "epoch": 0.9225181598062954, + "grad_norm": 0.05792146548628807, + "learning_rate": 3.136481796005686e-07, + "loss": 0.4039, + "step": 1143 + }, + { + "epoch": 0.9233252623083131, + "grad_norm": 0.05346622318029404, + "learning_rate": 3.0718136352988925e-07, + "loss": 0.352, + "step": 1144 + }, + { + "epoch": 0.9241323648103309, + "grad_norm": 0.05379043519496918, + "learning_rate": 3.0078087036866765e-07, + "loss": 0.364, + "step": 1145 + }, + { + "epoch": 0.9249394673123487, + "grad_norm": 0.04992745816707611, + "learning_rate": 2.9444674391214457e-07, + "loss": 0.4011, + "step": 1146 + }, + { + "epoch": 0.9257465698143664, + "grad_norm": 0.050753600895404816, + "learning_rate": 2.8817902750144953e-07, + "loss": 0.4096, + "step": 1147 + }, + { + "epoch": 0.9265536723163842, + "grad_norm": 0.045072466135025024, + "learning_rate": 2.819777640233001e-07, + "loss": 0.3485, + "step": 1148 + }, + { + "epoch": 0.927360774818402, + "grad_norm": 0.04854454845190048, + "learning_rate": 2.758429959097131e-07, + "loss": 0.3396, + "step": 1149 + }, + { + "epoch": 0.9281678773204197, + "grad_norm": 0.05049902945756912, + "learning_rate": 2.697747651377114e-07, + "loss": 0.37, + "step": 1150 + }, + { + "epoch": 0.9289749798224375, + "grad_norm": 0.05854698270559311, + "learning_rate": 2.6377311322903665e-07, + "loss": 0.4093, + "step": 1151 + }, + { + "epoch": 0.9297820823244553, + "grad_norm": 0.054229289293289185, + "learning_rate": 2.578380812498671e-07, + "loss": 0.401, + "step": 1152 + }, + { + "epoch": 0.930589184826473, + "grad_norm": 0.05177401751279831, + "learning_rate": 2.519697098105378e-07, + "loss": 0.3644, + "step": 1153 + }, + { + "epoch": 0.9313962873284907, + "grad_norm": 0.056719232350587845, + "learning_rate": 2.4616803906525433e-07, + "loss": 0.3965, + "step": 1154 + }, + { + "epoch": 0.9322033898305084, + "grad_norm": 0.050419602543115616, + "learning_rate": 2.4043310871182943e-07, + "loss": 0.4054, + "step": 1155 + }, + { + "epoch": 0.9330104923325262, + "grad_norm": 0.044479768723249435, + "learning_rate": 2.3476495799140574e-07, + "loss": 0.3401, + "step": 1156 + }, + { + "epoch": 0.933817594834544, + "grad_norm": 0.056795384734869, + "learning_rate": 2.291636256881846e-07, + "loss": 0.4476, + "step": 1157 + }, + { + "epoch": 0.9346246973365617, + "grad_norm": 0.05207651108503342, + "learning_rate": 2.2362915012916542e-07, + "loss": 0.3511, + "step": 1158 + }, + { + "epoch": 0.9354317998385795, + "grad_norm": 0.05466938018798828, + "learning_rate": 2.1816156918388454e-07, + "loss": 0.4056, + "step": 1159 + }, + { + "epoch": 0.9362389023405973, + "grad_norm": 0.050367191433906555, + "learning_rate": 2.1276092026414673e-07, + "loss": 0.434, + "step": 1160 + }, + { + "epoch": 0.937046004842615, + "grad_norm": 0.05178472772240639, + "learning_rate": 2.0742724032378315e-07, + "loss": 0.3642, + "step": 1161 + }, + { + "epoch": 0.9378531073446328, + "grad_norm": 0.05684222653508186, + "learning_rate": 2.0216056585838474e-07, + "loss": 0.399, + "step": 1162 + }, + { + "epoch": 0.9386602098466506, + "grad_norm": 0.05073128268122673, + "learning_rate": 1.9696093290506368e-07, + "loss": 0.4132, + "step": 1163 + }, + { + "epoch": 0.9394673123486683, + "grad_norm": 0.054002247750759125, + "learning_rate": 1.9182837704219914e-07, + "loss": 0.4296, + "step": 1164 + }, + { + "epoch": 0.9402744148506861, + "grad_norm": 0.048514340072870255, + "learning_rate": 1.867629333891985e-07, + "loss": 0.3644, + "step": 1165 + }, + { + "epoch": 0.9410815173527038, + "grad_norm": 0.05368981882929802, + "learning_rate": 1.817646366062531e-07, + "loss": 0.3647, + "step": 1166 + }, + { + "epoch": 0.9418886198547215, + "grad_norm": 0.05559367686510086, + "learning_rate": 1.7683352089410523e-07, + "loss": 0.3993, + "step": 1167 + }, + { + "epoch": 0.9426957223567393, + "grad_norm": 0.05872499570250511, + "learning_rate": 1.7196961999381147e-07, + "loss": 0.3889, + "step": 1168 + }, + { + "epoch": 0.943502824858757, + "grad_norm": 0.051862310618162155, + "learning_rate": 1.6717296718651078e-07, + "loss": 0.397, + "step": 1169 + }, + { + "epoch": 0.9443099273607748, + "grad_norm": 0.0534013956785202, + "learning_rate": 1.6244359529320242e-07, + "loss": 0.3978, + "step": 1170 + }, + { + "epoch": 0.9451170298627926, + "grad_norm": 0.05433657020330429, + "learning_rate": 1.577815366745139e-07, + "loss": 0.3812, + "step": 1171 + }, + { + "epoch": 0.9459241323648103, + "grad_norm": 0.0524270161986351, + "learning_rate": 1.531868232304845e-07, + "loss": 0.3357, + "step": 1172 + }, + { + "epoch": 0.9467312348668281, + "grad_norm": 0.054285310208797455, + "learning_rate": 1.4865948640034432e-07, + "loss": 0.4095, + "step": 1173 + }, + { + "epoch": 0.9475383373688459, + "grad_norm": 0.062304362654685974, + "learning_rate": 1.4419955716230116e-07, + "loss": 0.4376, + "step": 1174 + }, + { + "epoch": 0.9483454398708636, + "grad_norm": 0.05329901725053787, + "learning_rate": 1.398070660333295e-07, + "loss": 0.3448, + "step": 1175 + }, + { + "epoch": 0.9491525423728814, + "grad_norm": 0.04291558638215065, + "learning_rate": 1.3548204306895628e-07, + "loss": 0.3197, + "step": 1176 + }, + { + "epoch": 0.9499596448748991, + "grad_norm": 0.058099936693906784, + "learning_rate": 1.3122451786306223e-07, + "loss": 0.4031, + "step": 1177 + }, + { + "epoch": 0.9507667473769169, + "grad_norm": 0.05085314065217972, + "learning_rate": 1.270345195476741e-07, + "loss": 0.3729, + "step": 1178 + }, + { + "epoch": 0.9515738498789347, + "grad_norm": 0.048932112753391266, + "learning_rate": 1.229120767927694e-07, + "loss": 0.3084, + "step": 1179 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.05446094647049904, + "learning_rate": 1.188572178060754e-07, + "loss": 0.3768, + "step": 1180 + }, + { + "epoch": 0.9531880548829701, + "grad_norm": 0.057141583412885666, + "learning_rate": 1.1486997033288038e-07, + "loss": 0.3922, + "step": 1181 + }, + { + "epoch": 0.9539951573849879, + "grad_norm": 0.05461932346224785, + "learning_rate": 1.1095036165584272e-07, + "loss": 0.3816, + "step": 1182 + }, + { + "epoch": 0.9548022598870056, + "grad_norm": 0.052471328526735306, + "learning_rate": 1.0709841859480097e-07, + "loss": 0.3956, + "step": 1183 + }, + { + "epoch": 0.9556093623890234, + "grad_norm": 0.043329499661922455, + "learning_rate": 1.0331416750659851e-07, + "loss": 0.3205, + "step": 1184 + }, + { + "epoch": 0.9564164648910412, + "grad_norm": 0.04855290800333023, + "learning_rate": 9.959763428489256e-08, + "loss": 0.3544, + "step": 1185 + }, + { + "epoch": 0.9572235673930589, + "grad_norm": 0.05016739293932915, + "learning_rate": 9.59488443599843e-08, + "loss": 0.3534, + "step": 1186 + }, + { + "epoch": 0.9580306698950767, + "grad_norm": 0.05878947302699089, + "learning_rate": 9.236782269864242e-08, + "loss": 0.3834, + "step": 1187 + }, + { + "epoch": 0.9588377723970944, + "grad_norm": 0.051569633185863495, + "learning_rate": 8.885459380393313e-08, + "loss": 0.3874, + "step": 1188 + }, + { + "epoch": 0.9596448748991122, + "grad_norm": 0.05080967769026756, + "learning_rate": 8.540918171505374e-08, + "loss": 0.3476, + "step": 1189 + }, + { + "epoch": 0.96045197740113, + "grad_norm": 0.04376181587576866, + "learning_rate": 8.203161000716164e-08, + "loss": 0.3373, + "step": 1190 + }, + { + "epoch": 0.9612590799031477, + "grad_norm": 0.055172570049762726, + "learning_rate": 7.87219017912233e-08, + "loss": 0.3961, + "step": 1191 + }, + { + "epoch": 0.9620661824051655, + "grad_norm": 0.04730622097849846, + "learning_rate": 7.548007971384774e-08, + "loss": 0.3841, + "step": 1192 + }, + { + "epoch": 0.9628732849071832, + "grad_norm": 0.06012022867798805, + "learning_rate": 7.230616595713336e-08, + "loss": 0.4317, + "step": 1193 + }, + { + "epoch": 0.9636803874092009, + "grad_norm": 0.048860322684049606, + "learning_rate": 6.920018223852021e-08, + "loss": 0.3674, + "step": 1194 + }, + { + "epoch": 0.9644874899112187, + "grad_norm": 0.05125834792852402, + "learning_rate": 6.616214981063685e-08, + "loss": 0.361, + "step": 1195 + }, + { + "epoch": 0.9652945924132365, + "grad_norm": 0.048129141330718994, + "learning_rate": 6.319208946115817e-08, + "loss": 0.4051, + "step": 1196 + }, + { + "epoch": 0.9661016949152542, + "grad_norm": 0.05246327072381973, + "learning_rate": 6.029002151265673e-08, + "loss": 0.3963, + "step": 1197 + }, + { + "epoch": 0.966908797417272, + "grad_norm": 0.04887653514742851, + "learning_rate": 5.745596582247603e-08, + "loss": 0.4077, + "step": 1198 + }, + { + "epoch": 0.9677158999192897, + "grad_norm": 0.04455339536070824, + "learning_rate": 5.46899417825808e-08, + "loss": 0.2854, + "step": 1199 + }, + { + "epoch": 0.9685230024213075, + "grad_norm": 0.04976993426680565, + "learning_rate": 5.199196831943254e-08, + "loss": 0.3683, + "step": 1200 + }, + { + "epoch": 0.9693301049233253, + "grad_norm": 0.050257232040166855, + "learning_rate": 4.936206389385967e-08, + "loss": 0.3282, + "step": 1201 + }, + { + "epoch": 0.970137207425343, + "grad_norm": 0.05376894772052765, + "learning_rate": 4.680024650092652e-08, + "loss": 0.3623, + "step": 1202 + }, + { + "epoch": 0.9709443099273608, + "grad_norm": 0.048789188265800476, + "learning_rate": 4.430653366981785e-08, + "loss": 0.3376, + "step": 1203 + }, + { + "epoch": 0.9717514124293786, + "grad_norm": 0.04705846682190895, + "learning_rate": 4.1880942463710104e-08, + "loss": 0.3704, + "step": 1204 + }, + { + "epoch": 0.9725585149313963, + "grad_norm": 0.049898602068424225, + "learning_rate": 3.9523489479661447e-08, + "loss": 0.3926, + "step": 1205 + }, + { + "epoch": 0.9733656174334141, + "grad_norm": 0.05981026589870453, + "learning_rate": 3.7234190848493e-08, + "loss": 0.3665, + "step": 1206 + }, + { + "epoch": 0.9741727199354318, + "grad_norm": 0.04264838621020317, + "learning_rate": 3.501306223468448e-08, + "loss": 0.3608, + "step": 1207 + }, + { + "epoch": 0.9749798224374495, + "grad_norm": 0.052027132362127304, + "learning_rate": 3.286011883626206e-08, + "loss": 0.3859, + "step": 1208 + }, + { + "epoch": 0.9757869249394673, + "grad_norm": 0.05197995901107788, + "learning_rate": 3.077537538469622e-08, + "loss": 0.4028, + "step": 1209 + }, + { + "epoch": 0.976594027441485, + "grad_norm": 0.05613158270716667, + "learning_rate": 2.8758846144799624e-08, + "loss": 0.4116, + "step": 1210 + }, + { + "epoch": 0.9774011299435028, + "grad_norm": 0.05353546887636185, + "learning_rate": 2.6810544914631644e-08, + "loss": 0.3928, + "step": 1211 + }, + { + "epoch": 0.9782082324455206, + "grad_norm": 0.051475610584020615, + "learning_rate": 2.493048502540285e-08, + "loss": 0.3398, + "step": 1212 + }, + { + "epoch": 0.9790153349475383, + "grad_norm": 0.05530683323740959, + "learning_rate": 2.311867934138512e-08, + "loss": 0.4251, + "step": 1213 + }, + { + "epoch": 0.9798224374495561, + "grad_norm": 0.05283194035291672, + "learning_rate": 2.137514025981946e-08, + "loss": 0.3988, + "step": 1214 + }, + { + "epoch": 0.9806295399515739, + "grad_norm": 0.054351918399333954, + "learning_rate": 1.96998797108372e-08, + "loss": 0.4202, + "step": 1215 + }, + { + "epoch": 0.9814366424535916, + "grad_norm": 0.047337841242551804, + "learning_rate": 1.8092909157372272e-08, + "loss": 0.3381, + "step": 1216 + }, + { + "epoch": 0.9822437449556094, + "grad_norm": 0.05288921296596527, + "learning_rate": 1.655423959508795e-08, + "loss": 0.3813, + "step": 1217 + }, + { + "epoch": 0.9830508474576272, + "grad_norm": 0.056825585663318634, + "learning_rate": 1.5083881552299118e-08, + "loss": 0.4856, + "step": 1218 + }, + { + "epoch": 0.9838579499596449, + "grad_norm": 0.05358508601784706, + "learning_rate": 1.3681845089897895e-08, + "loss": 0.3537, + "step": 1219 + }, + { + "epoch": 0.9846650524616626, + "grad_norm": 0.052583999931812286, + "learning_rate": 1.234813980129257e-08, + "loss": 0.4317, + "step": 1220 + }, + { + "epoch": 0.9854721549636803, + "grad_norm": 0.04891924932599068, + "learning_rate": 1.1082774812330998e-08, + "loss": 0.3668, + "step": 1221 + }, + { + "epoch": 0.9862792574656981, + "grad_norm": 0.0513627864420414, + "learning_rate": 9.885758781249532e-09, + "loss": 0.3821, + "step": 1222 + }, + { + "epoch": 0.9870863599677159, + "grad_norm": 0.048382095992565155, + "learning_rate": 8.757099898603071e-09, + "loss": 0.326, + "step": 1223 + }, + { + "epoch": 0.9878934624697336, + "grad_norm": 0.04995129629969597, + "learning_rate": 7.696805887217329e-09, + "loss": 0.3843, + "step": 1224 + }, + { + "epoch": 0.9887005649717514, + "grad_norm": 0.04516911506652832, + "learning_rate": 6.704884002132206e-09, + "loss": 0.3553, + "step": 1225 + }, + { + "epoch": 0.9895076674737692, + "grad_norm": 0.05033162236213684, + "learning_rate": 5.781341030551835e-09, + "loss": 0.3908, + "step": 1226 + }, + { + "epoch": 0.9903147699757869, + "grad_norm": 0.05497167259454727, + "learning_rate": 4.926183291800169e-09, + "loss": 0.4237, + "step": 1227 + }, + { + "epoch": 0.9911218724778047, + "grad_norm": 0.050287820398807526, + "learning_rate": 4.139416637276572e-09, + "loss": 0.3525, + "step": 1228 + }, + { + "epoch": 0.9919289749798225, + "grad_norm": 0.054579704999923706, + "learning_rate": 3.4210464504158547e-09, + "loss": 0.4267, + "step": 1229 + }, + { + "epoch": 0.9927360774818402, + "grad_norm": 0.057175084948539734, + "learning_rate": 2.7710776466494114e-09, + "loss": 0.3782, + "step": 1230 + }, + { + "epoch": 0.993543179983858, + "grad_norm": 0.052831992506980896, + "learning_rate": 2.1895146733763583e-09, + "loss": 0.3787, + "step": 1231 + }, + { + "epoch": 0.9943502824858758, + "grad_norm": 0.04933362454175949, + "learning_rate": 1.6763615099291142e-09, + "loss": 0.3536, + "step": 1232 + }, + { + "epoch": 0.9951573849878934, + "grad_norm": 0.05213562771677971, + "learning_rate": 1.2316216675467563e-09, + "loss": 0.3125, + "step": 1233 + }, + { + "epoch": 0.9959644874899112, + "grad_norm": 0.051006123423576355, + "learning_rate": 8.552981893539259e-10, + "loss": 0.3783, + "step": 1234 + }, + { + "epoch": 0.9967715899919289, + "grad_norm": 0.05340098217129707, + "learning_rate": 5.473936503341826e-10, + "loss": 0.4189, + "step": 1235 + }, + { + "epoch": 0.9975786924939467, + "grad_norm": 0.05506238713860512, + "learning_rate": 3.0791015732001274e-10, + "loss": 0.3906, + "step": 1236 + }, + { + "epoch": 0.9983857949959645, + "grad_norm": 0.05131802707910538, + "learning_rate": 1.3684934897062462e-10, + "loss": 0.3888, + "step": 1237 + }, + { + "epoch": 0.9991928974979822, + "grad_norm": 0.051829464733600616, + "learning_rate": 3.421239576750779e-11, + "loss": 0.3569, + "step": 1238 + }, + { + "epoch": 1.0, + "grad_norm": 0.04889125004410744, + "learning_rate": 0.0, + "loss": 0.364, + "step": 1239 + }, + { + "epoch": 0.8093994778067886, + "grad_norm": 0.05489634722471237, + "learning_rate": 1.8457022888197873e-06, + "loss": 0.5747, + "step": 1240 + }, + { + "epoch": 0.8100522193211488, + "grad_norm": 0.05567055940628052, + "learning_rate": 1.8334827698003644e-06, + "loss": 0.5034, + "step": 1241 + }, + { + "epoch": 0.8107049608355091, + "grad_norm": 0.052047327160835266, + "learning_rate": 1.8212997512837815e-06, + "loss": 0.569, + "step": 1242 + }, + { + "epoch": 0.8113577023498695, + "grad_norm": 0.053265251219272614, + "learning_rate": 1.8091532877224194e-06, + "loss": 0.5673, + "step": 1243 + }, + { + "epoch": 0.8120104438642297, + "grad_norm": 0.05723145231604576, + "learning_rate": 1.7970434334052822e-06, + "loss": 0.5226, + "step": 1244 + }, + { + "epoch": 0.8126631853785901, + "grad_norm": 0.056260693818330765, + "learning_rate": 1.784970242457732e-06, + "loss": 0.5682, + "step": 1245 + }, + { + "epoch": 0.8133159268929504, + "grad_norm": 0.06144409254193306, + "learning_rate": 1.7729337688412772e-06, + "loss": 0.5818, + "step": 1246 + }, + { + "epoch": 0.8139686684073107, + "grad_norm": 0.05530359968543053, + "learning_rate": 1.7609340663533115e-06, + "loss": 0.5372, + "step": 1247 + }, + { + "epoch": 0.814621409921671, + "grad_norm": 0.052408602088689804, + "learning_rate": 1.7489711886268713e-06, + "loss": 0.5431, + "step": 1248 + }, + { + "epoch": 0.8152741514360313, + "grad_norm": 0.04931781440973282, + "learning_rate": 1.7370451891304208e-06, + "loss": 0.5209, + "step": 1249 + }, + { + "epoch": 0.8159268929503917, + "grad_norm": 0.052259039133787155, + "learning_rate": 1.725156121167576e-06, + "loss": 0.5618, + "step": 1250 + }, + { + "epoch": 0.816579634464752, + "grad_norm": 0.054095637053251266, + "learning_rate": 1.7133040378769039e-06, + "loss": 0.4872, + "step": 1251 + }, + { + "epoch": 0.8172323759791122, + "grad_norm": 0.05096259340643883, + "learning_rate": 1.701488992231658e-06, + "loss": 0.5254, + "step": 1252 + }, + { + "epoch": 0.8178851174934726, + "grad_norm": 0.04998144507408142, + "learning_rate": 1.6897110370395452e-06, + "loss": 0.4692, + "step": 1253 + }, + { + "epoch": 0.8185378590078329, + "grad_norm": 0.05502447485923767, + "learning_rate": 1.6779702249425122e-06, + "loss": 0.4983, + "step": 1254 + }, + { + "epoch": 0.8191906005221932, + "grad_norm": 0.05895658954977989, + "learning_rate": 1.6662666084164791e-06, + "loss": 0.5427, + "step": 1255 + }, + { + "epoch": 0.8198433420365535, + "grad_norm": 0.05542084947228432, + "learning_rate": 1.6546002397711247e-06, + "loss": 0.5164, + "step": 1256 + }, + { + "epoch": 0.8204960835509139, + "grad_norm": 0.053591158241033554, + "learning_rate": 1.6429711711496499e-06, + "loss": 0.491, + "step": 1257 + }, + { + "epoch": 0.8211488250652742, + "grad_norm": 0.051953479647636414, + "learning_rate": 1.6313794545285312e-06, + "loss": 0.5057, + "step": 1258 + }, + { + "epoch": 0.8218015665796344, + "grad_norm": 0.05032678321003914, + "learning_rate": 1.6198251417173138e-06, + "loss": 0.4555, + "step": 1259 + }, + { + "epoch": 0.8224543080939948, + "grad_norm": 0.054147832095623016, + "learning_rate": 1.6083082843583552e-06, + "loss": 0.5148, + "step": 1260 + }, + { + "epoch": 0.8231070496083551, + "grad_norm": 0.056531306356191635, + "learning_rate": 1.5968289339266084e-06, + "loss": 0.5749, + "step": 1261 + }, + { + "epoch": 0.8237597911227154, + "grad_norm": 0.04894256219267845, + "learning_rate": 1.5853871417293876e-06, + "loss": 0.4717, + "step": 1262 + }, + { + "epoch": 0.8244125326370757, + "grad_norm": 0.057365089654922485, + "learning_rate": 1.5739829589061384e-06, + "loss": 0.5384, + "step": 1263 + }, + { + "epoch": 0.825065274151436, + "grad_norm": 0.05126437544822693, + "learning_rate": 1.5626164364282103e-06, + "loss": 0.4903, + "step": 1264 + }, + { + "epoch": 0.8257180156657964, + "grad_norm": 0.06285741925239563, + "learning_rate": 1.5512876250986308e-06, + "loss": 0.6048, + "step": 1265 + }, + { + "epoch": 0.8263707571801566, + "grad_norm": 0.05392901971936226, + "learning_rate": 1.539996575551872e-06, + "loss": 0.5462, + "step": 1266 + }, + { + "epoch": 0.827023498694517, + "grad_norm": 0.053971629589796066, + "learning_rate": 1.5287433382536342e-06, + "loss": 0.5608, + "step": 1267 + }, + { + "epoch": 0.8276762402088773, + "grad_norm": 0.046124279499053955, + "learning_rate": 1.5175279635006102e-06, + "loss": 0.4852, + "step": 1268 + }, + { + "epoch": 0.8283289817232375, + "grad_norm": 0.056769512593746185, + "learning_rate": 1.5063505014202651e-06, + "loss": 0.5403, + "step": 1269 + }, + { + "epoch": 0.8289817232375979, + "grad_norm": 0.05687325447797775, + "learning_rate": 1.4952110019706156e-06, + "loss": 0.5591, + "step": 1270 + }, + { + "epoch": 0.8296344647519582, + "grad_norm": 0.05294881388545036, + "learning_rate": 1.4841095149399998e-06, + "loss": 0.4884, + "step": 1271 + }, + { + "epoch": 0.8302872062663186, + "grad_norm": 0.05597991868853569, + "learning_rate": 1.4730460899468601e-06, + "loss": 0.5337, + "step": 1272 + }, + { + "epoch": 0.8309399477806788, + "grad_norm": 0.05722576007246971, + "learning_rate": 1.4620207764395177e-06, + "loss": 0.4941, + "step": 1273 + }, + { + "epoch": 0.8315926892950392, + "grad_norm": 0.05292990803718567, + "learning_rate": 1.4510336236959554e-06, + "loss": 0.474, + "step": 1274 + }, + { + "epoch": 0.8322454308093995, + "grad_norm": 0.05963551253080368, + "learning_rate": 1.4400846808235946e-06, + "loss": 0.6022, + "step": 1275 + }, + { + "epoch": 0.8328981723237598, + "grad_norm": 0.0566948726773262, + "learning_rate": 1.4291739967590746e-06, + "loss": 0.5265, + "step": 1276 + }, + { + "epoch": 0.8335509138381201, + "grad_norm": 0.05305665358901024, + "learning_rate": 1.4183016202680378e-06, + "loss": 0.5592, + "step": 1277 + }, + { + "epoch": 0.8342036553524804, + "grad_norm": 0.058979522436857224, + "learning_rate": 1.4074675999449095e-06, + "loss": 0.5232, + "step": 1278 + }, + { + "epoch": 0.8348563968668408, + "grad_norm": 0.05601240321993828, + "learning_rate": 1.3966719842126808e-06, + "loss": 0.5869, + "step": 1279 + }, + { + "epoch": 0.835509138381201, + "grad_norm": 0.05249645933508873, + "learning_rate": 1.3859148213226903e-06, + "loss": 0.4805, + "step": 1280 + }, + { + "epoch": 0.8361618798955613, + "grad_norm": 0.058439917862415314, + "learning_rate": 1.3751961593544171e-06, + "loss": 0.5394, + "step": 1281 + }, + { + "epoch": 0.8368146214099217, + "grad_norm": 0.055136680603027344, + "learning_rate": 1.3645160462152495e-06, + "loss": 0.5072, + "step": 1282 + }, + { + "epoch": 0.837467362924282, + "grad_norm": 0.059607915580272675, + "learning_rate": 1.3538745296402867e-06, + "loss": 0.5557, + "step": 1283 + }, + { + "epoch": 0.8381201044386423, + "grad_norm": 0.0572134405374527, + "learning_rate": 1.3432716571921178e-06, + "loss": 0.51, + "step": 1284 + }, + { + "epoch": 0.8387728459530026, + "grad_norm": 0.053891804069280624, + "learning_rate": 1.3327074762606096e-06, + "loss": 0.5298, + "step": 1285 + }, + { + "epoch": 0.839425587467363, + "grad_norm": 0.0589805468916893, + "learning_rate": 1.3221820340627044e-06, + "loss": 0.6048, + "step": 1286 + }, + { + "epoch": 0.8400783289817232, + "grad_norm": 0.05279667675495148, + "learning_rate": 1.311695377642187e-06, + "loss": 0.4724, + "step": 1287 + }, + { + "epoch": 0.8407310704960835, + "grad_norm": 0.052640657871961594, + "learning_rate": 1.3012475538694935e-06, + "loss": 0.4407, + "step": 1288 + }, + { + "epoch": 0.8413838120104439, + "grad_norm": 0.0526045598089695, + "learning_rate": 1.2908386094415049e-06, + "loss": 0.5229, + "step": 1289 + }, + { + "epoch": 0.8420365535248042, + "grad_norm": 0.0544457770884037, + "learning_rate": 1.280468590881312e-06, + "loss": 0.4922, + "step": 1290 + }, + { + "epoch": 0.8426892950391645, + "grad_norm": 0.054909445345401764, + "learning_rate": 1.2701375445380459e-06, + "loss": 0.5311, + "step": 1291 + }, + { + "epoch": 0.8433420365535248, + "grad_norm": 0.056963834911584854, + "learning_rate": 1.2598455165866319e-06, + "loss": 0.5472, + "step": 1292 + }, + { + "epoch": 0.8439947780678851, + "grad_norm": 0.05432257801294327, + "learning_rate": 1.24959255302761e-06, + "loss": 0.4912, + "step": 1293 + }, + { + "epoch": 0.8446475195822454, + "grad_norm": 0.0577394962310791, + "learning_rate": 1.2393786996869262e-06, + "loss": 0.4972, + "step": 1294 + }, + { + "epoch": 0.8453002610966057, + "grad_norm": 0.05427301302552223, + "learning_rate": 1.229204002215706e-06, + "loss": 0.5027, + "step": 1295 + }, + { + "epoch": 0.8459530026109661, + "grad_norm": 0.052240319550037384, + "learning_rate": 1.2190685060900843e-06, + "loss": 0.5267, + "step": 1296 + }, + { + "epoch": 0.8466057441253264, + "grad_norm": 0.053541723638772964, + "learning_rate": 1.2089722566109752e-06, + "loss": 0.5539, + "step": 1297 + }, + { + "epoch": 0.8472584856396866, + "grad_norm": 0.054370395839214325, + "learning_rate": 1.1989152989038745e-06, + "loss": 0.5465, + "step": 1298 + }, + { + "epoch": 0.847911227154047, + "grad_norm": 0.052129361778497696, + "learning_rate": 1.1888976779186745e-06, + "loss": 0.5215, + "step": 1299 + }, + { + "epoch": 0.8485639686684073, + "grad_norm": 0.0562812015414238, + "learning_rate": 1.1789194384294377e-06, + "loss": 0.4823, + "step": 1300 + }, + { + "epoch": 0.8492167101827677, + "grad_norm": 0.05429472029209137, + "learning_rate": 1.1689806250342196e-06, + "loss": 0.5207, + "step": 1301 + }, + { + "epoch": 0.8498694516971279, + "grad_norm": 0.05133857950568199, + "learning_rate": 1.159081282154858e-06, + "loss": 0.4531, + "step": 1302 + }, + { + "epoch": 0.8505221932114883, + "grad_norm": 0.05682549253106117, + "learning_rate": 1.1492214540367674e-06, + "loss": 0.5547, + "step": 1303 + }, + { + "epoch": 0.8511749347258486, + "grad_norm": 0.05034679174423218, + "learning_rate": 1.1394011847487618e-06, + "loss": 0.5243, + "step": 1304 + }, + { + "epoch": 0.8518276762402088, + "grad_norm": 0.056760791689157486, + "learning_rate": 1.1296205181828378e-06, + "loss": 0.5835, + "step": 1305 + }, + { + "epoch": 0.8524804177545692, + "grad_norm": 0.054008033126592636, + "learning_rate": 1.1198794980539908e-06, + "loss": 0.4983, + "step": 1306 + }, + { + "epoch": 0.8531331592689295, + "grad_norm": 0.05667291581630707, + "learning_rate": 1.1101781679000135e-06, + "loss": 0.5211, + "step": 1307 + }, + { + "epoch": 0.8537859007832899, + "grad_norm": 0.05850447714328766, + "learning_rate": 1.1005165710812982e-06, + "loss": 0.5183, + "step": 1308 + }, + { + "epoch": 0.8544386422976501, + "grad_norm": 0.0570049025118351, + "learning_rate": 1.0908947507806567e-06, + "loss": 0.4721, + "step": 1309 + }, + { + "epoch": 0.8550913838120104, + "grad_norm": 0.05443601682782173, + "learning_rate": 1.081312750003114e-06, + "loss": 0.5579, + "step": 1310 + }, + { + "epoch": 0.8557441253263708, + "grad_norm": 0.05229130759835243, + "learning_rate": 1.071770611575721e-06, + "loss": 0.4809, + "step": 1311 + }, + { + "epoch": 0.856396866840731, + "grad_norm": 0.05782579630613327, + "learning_rate": 1.0622683781473596e-06, + "loss": 0.4737, + "step": 1312 + }, + { + "epoch": 0.8570496083550914, + "grad_norm": 0.05550665035843849, + "learning_rate": 1.0528060921885607e-06, + "loss": 0.5707, + "step": 1313 + }, + { + "epoch": 0.8577023498694517, + "grad_norm": 0.05818123370409012, + "learning_rate": 1.043383795991304e-06, + "loss": 0.5846, + "step": 1314 + }, + { + "epoch": 0.858355091383812, + "grad_norm": 0.05427548661828041, + "learning_rate": 1.0340015316688358e-06, + "loss": 0.5087, + "step": 1315 + }, + { + "epoch": 0.8590078328981723, + "grad_norm": 0.05885540693998337, + "learning_rate": 1.0246593411554796e-06, + "loss": 0.4413, + "step": 1316 + }, + { + "epoch": 0.8596605744125326, + "grad_norm": 0.05844886973500252, + "learning_rate": 1.0153572662064448e-06, + "loss": 0.5454, + "step": 1317 + }, + { + "epoch": 0.860313315926893, + "grad_norm": 0.054623618721961975, + "learning_rate": 1.0060953483976454e-06, + "loss": 0.5637, + "step": 1318 + }, + { + "epoch": 0.8609660574412533, + "grad_norm": 0.064220130443573, + "learning_rate": 9.968736291255122e-07, + "loss": 0.5217, + "step": 1319 + }, + { + "epoch": 0.8616187989556136, + "grad_norm": 0.05971851199865341, + "learning_rate": 9.87692149606806e-07, + "loss": 0.4555, + "step": 1320 + }, + { + "epoch": 0.8622715404699739, + "grad_norm": 0.053942758589982986, + "learning_rate": 9.78550950878433e-07, + "loss": 0.5939, + "step": 1321 + }, + { + "epoch": 0.8629242819843342, + "grad_norm": 0.05949598550796509, + "learning_rate": 9.69450073797269e-07, + "loss": 0.4948, + "step": 1322 + }, + { + "epoch": 0.8635770234986945, + "grad_norm": 0.05386824905872345, + "learning_rate": 9.603895590399648e-07, + "loss": 0.4731, + "step": 1323 + }, + { + "epoch": 0.8642297650130548, + "grad_norm": 0.05269710347056389, + "learning_rate": 9.513694471027735e-07, + "loss": 0.4558, + "step": 1324 + }, + { + "epoch": 0.8648825065274152, + "grad_norm": 0.05899108573794365, + "learning_rate": 9.423897783013658e-07, + "loss": 0.5939, + "step": 1325 + }, + { + "epoch": 0.8655352480417755, + "grad_norm": 0.055841196328401566, + "learning_rate": 9.334505927706516e-07, + "loss": 0.4845, + "step": 1326 + }, + { + "epoch": 0.8661879895561357, + "grad_norm": 0.055123697966337204, + "learning_rate": 9.245519304645978e-07, + "loss": 0.4934, + "step": 1327 + }, + { + "epoch": 0.8668407310704961, + "grad_norm": 0.053594909608364105, + "learning_rate": 9.156938311560526e-07, + "loss": 0.5103, + "step": 1328 + }, + { + "epoch": 0.8674934725848564, + "grad_norm": 0.053080540150403976, + "learning_rate": 9.068763344365683e-07, + "loss": 0.428, + "step": 1329 + }, + { + "epoch": 0.8681462140992167, + "grad_norm": 0.04971330985426903, + "learning_rate": 8.980994797162157e-07, + "loss": 0.4763, + "step": 1330 + }, + { + "epoch": 0.868798955613577, + "grad_norm": 0.056616924703121185, + "learning_rate": 8.893633062234285e-07, + "loss": 0.5279, + "step": 1331 + }, + { + "epoch": 0.8694516971279374, + "grad_norm": 0.05742453783750534, + "learning_rate": 8.806678530047985e-07, + "loss": 0.4932, + "step": 1332 + }, + { + "epoch": 0.8701044386422977, + "grad_norm": 0.05315115302801132, + "learning_rate": 8.720131589249281e-07, + "loss": 0.4897, + "step": 1333 + }, + { + "epoch": 0.8707571801566579, + "grad_norm": 0.057442694902420044, + "learning_rate": 8.633992626662402e-07, + "loss": 0.5205, + "step": 1334 + }, + { + "epoch": 0.8714099216710183, + "grad_norm": 0.05508160963654518, + "learning_rate": 8.548262027288101e-07, + "loss": 0.4926, + "step": 1335 + }, + { + "epoch": 0.8720626631853786, + "grad_norm": 0.05157114192843437, + "learning_rate": 8.462940174302026e-07, + "loss": 0.5125, + "step": 1336 + }, + { + "epoch": 0.8727154046997389, + "grad_norm": 0.05958451330661774, + "learning_rate": 8.378027449052784e-07, + "loss": 0.5025, + "step": 1337 + }, + { + "epoch": 0.8733681462140992, + "grad_norm": 0.055542390793561935, + "learning_rate": 8.293524231060468e-07, + "loss": 0.4989, + "step": 1338 + }, + { + "epoch": 0.8740208877284595, + "grad_norm": 0.06172487139701843, + "learning_rate": 8.20943089801487e-07, + "loss": 0.537, + "step": 1339 + }, + { + "epoch": 0.8746736292428199, + "grad_norm": 0.05982014536857605, + "learning_rate": 8.125747825773689e-07, + "loss": 0.5673, + "step": 1340 + }, + { + "epoch": 0.8753263707571801, + "grad_norm": 0.05253879725933075, + "learning_rate": 8.042475388361104e-07, + "loss": 0.4503, + "step": 1341 + }, + { + "epoch": 0.8759791122715405, + "grad_norm": 0.0581713505089283, + "learning_rate": 7.959613957965794e-07, + "loss": 0.5507, + "step": 1342 + }, + { + "epoch": 0.8766318537859008, + "grad_norm": 0.05596986040472984, + "learning_rate": 7.877163904939522e-07, + "loss": 0.5085, + "step": 1343 + }, + { + "epoch": 0.8772845953002611, + "grad_norm": 0.054406262934207916, + "learning_rate": 7.795125597795406e-07, + "loss": 0.5056, + "step": 1344 + }, + { + "epoch": 0.8779373368146214, + "grad_norm": 0.05836174637079239, + "learning_rate": 7.713499403206138e-07, + "loss": 0.5326, + "step": 1345 + }, + { + "epoch": 0.8785900783289817, + "grad_norm": 0.05304201692342758, + "learning_rate": 7.632285686002594e-07, + "loss": 0.4556, + "step": 1346 + }, + { + "epoch": 0.8792428198433421, + "grad_norm": 0.05944960191845894, + "learning_rate": 7.551484809172005e-07, + "loss": 0.5656, + "step": 1347 + }, + { + "epoch": 0.8798955613577023, + "grad_norm": 0.062106695026159286, + "learning_rate": 7.471097133856353e-07, + "loss": 0.562, + "step": 1348 + }, + { + "epoch": 0.8805483028720626, + "grad_norm": 0.05966848507523537, + "learning_rate": 7.391123019350932e-07, + "loss": 0.5701, + "step": 1349 + }, + { + "epoch": 0.881201044386423, + "grad_norm": 0.05123576894402504, + "learning_rate": 7.311562823102469e-07, + "loss": 0.5516, + "step": 1350 + }, + { + "epoch": 0.8818537859007833, + "grad_norm": 0.056662097573280334, + "learning_rate": 7.232416900707739e-07, + "loss": 0.5026, + "step": 1351 + }, + { + "epoch": 0.8825065274151436, + "grad_norm": 0.05869949981570244, + "learning_rate": 7.153685605911964e-07, + "loss": 0.5144, + "step": 1352 + }, + { + "epoch": 0.8831592689295039, + "grad_norm": 0.05815920606255531, + "learning_rate": 7.075369290607049e-07, + "loss": 0.5185, + "step": 1353 + }, + { + "epoch": 0.8838120104438643, + "grad_norm": 0.05506941303610802, + "learning_rate": 6.997468304830247e-07, + "loss": 0.579, + "step": 1354 + }, + { + "epoch": 0.8844647519582245, + "grad_norm": 0.058971211314201355, + "learning_rate": 6.919982996762431e-07, + "loss": 0.5332, + "step": 1355 + }, + { + "epoch": 0.8851174934725848, + "grad_norm": 0.05661641061306, + "learning_rate": 6.842913712726551e-07, + "loss": 0.5755, + "step": 1356 + }, + { + "epoch": 0.8857702349869452, + "grad_norm": 0.057749152183532715, + "learning_rate": 6.766260797186242e-07, + "loss": 0.4955, + "step": 1357 + }, + { + "epoch": 0.8864229765013055, + "grad_norm": 0.05507257953286171, + "learning_rate": 6.690024592744027e-07, + "loss": 0.4629, + "step": 1358 + }, + { + "epoch": 0.8870757180156658, + "grad_norm": 0.06188316270709038, + "learning_rate": 6.614205440140042e-07, + "loss": 0.5789, + "step": 1359 + }, + { + "epoch": 0.8877284595300261, + "grad_norm": 0.05622296407818794, + "learning_rate": 6.538803678250338e-07, + "loss": 0.5255, + "step": 1360 + }, + { + "epoch": 0.8883812010443864, + "grad_norm": 0.053286001086235046, + "learning_rate": 6.463819644085412e-07, + "loss": 0.4926, + "step": 1361 + }, + { + "epoch": 0.8890339425587467, + "grad_norm": 0.05363309010863304, + "learning_rate": 6.389253672788754e-07, + "loss": 0.5245, + "step": 1362 + }, + { + "epoch": 0.889686684073107, + "grad_norm": 0.05182503163814545, + "learning_rate": 6.315106097635304e-07, + "loss": 0.5641, + "step": 1363 + }, + { + "epoch": 0.8903394255874674, + "grad_norm": 0.054177477955818176, + "learning_rate": 6.241377250029934e-07, + "loss": 0.537, + "step": 1364 + }, + { + "epoch": 0.8909921671018277, + "grad_norm": 0.05669175460934639, + "learning_rate": 6.168067459506066e-07, + "loss": 0.4641, + "step": 1365 + }, + { + "epoch": 0.891644908616188, + "grad_norm": 0.058600328862667084, + "learning_rate": 6.095177053724011e-07, + "loss": 0.5456, + "step": 1366 + }, + { + "epoch": 0.8922976501305483, + "grad_norm": 0.05741265043616295, + "learning_rate": 6.022706358469776e-07, + "loss": 0.4892, + "step": 1367 + }, + { + "epoch": 0.8929503916449086, + "grad_norm": 0.06110072880983353, + "learning_rate": 5.950655697653363e-07, + "loss": 0.5218, + "step": 1368 + }, + { + "epoch": 0.893603133159269, + "grad_norm": 0.061403632164001465, + "learning_rate": 5.879025393307436e-07, + "loss": 0.5595, + "step": 1369 + }, + { + "epoch": 0.8942558746736292, + "grad_norm": 0.05400973930954933, + "learning_rate": 5.807815765585878e-07, + "loss": 0.5213, + "step": 1370 + }, + { + "epoch": 0.8949086161879896, + "grad_norm": 0.055990613996982574, + "learning_rate": 5.737027132762341e-07, + "loss": 0.5479, + "step": 1371 + }, + { + "epoch": 0.8955613577023499, + "grad_norm": 0.05797429010272026, + "learning_rate": 5.666659811228803e-07, + "loss": 0.5287, + "step": 1372 + }, + { + "epoch": 0.8962140992167101, + "grad_norm": 0.05289648100733757, + "learning_rate": 5.596714115494217e-07, + "loss": 0.4631, + "step": 1373 + }, + { + "epoch": 0.8968668407310705, + "grad_norm": 0.05488736554980278, + "learning_rate": 5.527190358183032e-07, + "loss": 0.5012, + "step": 1374 + }, + { + "epoch": 0.8975195822454308, + "grad_norm": 0.05579644441604614, + "learning_rate": 5.458088850033849e-07, + "loss": 0.488, + "step": 1375 + }, + { + "epoch": 0.8981723237597912, + "grad_norm": 0.06353001296520233, + "learning_rate": 5.389409899898013e-07, + "loss": 0.5669, + "step": 1376 + }, + { + "epoch": 0.8988250652741514, + "grad_norm": 0.06207989528775215, + "learning_rate": 5.321153814738222e-07, + "loss": 0.5379, + "step": 1377 + }, + { + "epoch": 0.8994778067885117, + "grad_norm": 0.04865063726902008, + "learning_rate": 5.253320899627179e-07, + "loss": 0.4628, + "step": 1378 + }, + { + "epoch": 0.9001305483028721, + "grad_norm": 0.05397767573595047, + "learning_rate": 5.185911457746207e-07, + "loss": 0.4546, + "step": 1379 + }, + { + "epoch": 0.9007832898172323, + "grad_norm": 0.0543336383998394, + "learning_rate": 5.1189257903839e-07, + "loss": 0.4474, + "step": 1380 + }, + { + "epoch": 0.9014360313315927, + "grad_norm": 0.055923014879226685, + "learning_rate": 5.052364196934779e-07, + "loss": 0.4864, + "step": 1381 + }, + { + "epoch": 0.902088772845953, + "grad_norm": 0.060891781002283096, + "learning_rate": 4.986226974897967e-07, + "loss": 0.5346, + "step": 1382 + }, + { + "epoch": 0.9027415143603134, + "grad_norm": 0.05753452330827713, + "learning_rate": 4.920514419875821e-07, + "loss": 0.5118, + "step": 1383 + }, + { + "epoch": 0.9033942558746736, + "grad_norm": 0.050467561930418015, + "learning_rate": 4.855226825572667e-07, + "loss": 0.4379, + "step": 1384 + }, + { + "epoch": 0.9040469973890339, + "grad_norm": 0.05872552469372749, + "learning_rate": 4.79036448379343e-07, + "loss": 0.4756, + "step": 1385 + }, + { + "epoch": 0.9046997389033943, + "grad_norm": 0.05431582033634186, + "learning_rate": 4.725927684442366e-07, + "loss": 0.4922, + "step": 1386 + }, + { + "epoch": 0.9053524804177546, + "grad_norm": 0.0574292428791523, + "learning_rate": 4.661916715521764e-07, + "loss": 0.5334, + "step": 1387 + }, + { + "epoch": 0.9060052219321149, + "grad_norm": 0.050154268741607666, + "learning_rate": 4.598331863130612e-07, + "loss": 0.4558, + "step": 1388 + }, + { + "epoch": 0.9066579634464752, + "grad_norm": 0.053724534809589386, + "learning_rate": 4.535173411463423e-07, + "loss": 0.4891, + "step": 1389 + }, + { + "epoch": 0.9073107049608355, + "grad_norm": 0.05234300717711449, + "learning_rate": 4.472441642808845e-07, + "loss": 0.4858, + "step": 1390 + }, + { + "epoch": 0.9079634464751958, + "grad_norm": 0.062360599637031555, + "learning_rate": 4.410136837548462e-07, + "loss": 0.5657, + "step": 1391 + }, + { + "epoch": 0.9086161879895561, + "grad_norm": 0.06348732858896255, + "learning_rate": 4.348259274155542e-07, + "loss": 0.5722, + "step": 1392 + }, + { + "epoch": 0.9092689295039165, + "grad_norm": 0.05668312683701515, + "learning_rate": 4.286809229193778e-07, + "loss": 0.4447, + "step": 1393 + }, + { + "epoch": 0.9099216710182768, + "grad_norm": 0.06089024245738983, + "learning_rate": 4.225786977316093e-07, + "loss": 0.4826, + "step": 1394 + }, + { + "epoch": 0.910574412532637, + "grad_norm": 0.05654263123869896, + "learning_rate": 4.165192791263295e-07, + "loss": 0.4942, + "step": 1395 + }, + { + "epoch": 0.9112271540469974, + "grad_norm": 0.058262892067432404, + "learning_rate": 4.1050269418629887e-07, + "loss": 0.5348, + "step": 1396 + }, + { + "epoch": 0.9118798955613577, + "grad_norm": 0.05608157068490982, + "learning_rate": 4.045289698028343e-07, + "loss": 0.5433, + "step": 1397 + }, + { + "epoch": 0.912532637075718, + "grad_norm": 0.05260564386844635, + "learning_rate": 3.985981326756794e-07, + "loss": 0.4732, + "step": 1398 + }, + { + "epoch": 0.9131853785900783, + "grad_norm": 0.056355442851781845, + "learning_rate": 3.927102093128976e-07, + "loss": 0.512, + "step": 1399 + }, + { + "epoch": 0.9138381201044387, + "grad_norm": 0.06535626202821732, + "learning_rate": 3.868652260307437e-07, + "loss": 0.6469, + "step": 1400 + }, + { + "epoch": 0.914490861618799, + "grad_norm": 0.05038583278656006, + "learning_rate": 3.810632089535526e-07, + "loss": 0.5062, + "step": 1401 + }, + { + "epoch": 0.9151436031331592, + "grad_norm": 0.05900431424379349, + "learning_rate": 3.753041840136218e-07, + "loss": 0.4974, + "step": 1402 + }, + { + "epoch": 0.9157963446475196, + "grad_norm": 0.05980895459651947, + "learning_rate": 3.6958817695109006e-07, + "loss": 0.5418, + "step": 1403 + }, + { + "epoch": 0.9164490861618799, + "grad_norm": 0.061525844037532806, + "learning_rate": 3.6391521331383126e-07, + "loss": 0.6017, + "step": 1404 + }, + { + "epoch": 0.9171018276762402, + "grad_norm": 0.053908564150333405, + "learning_rate": 3.5828531845733206e-07, + "loss": 0.4829, + "step": 1405 + }, + { + "epoch": 0.9177545691906005, + "grad_norm": 0.05936963111162186, + "learning_rate": 3.526985175445796e-07, + "loss": 0.6039, + "step": 1406 + }, + { + "epoch": 0.9184073107049608, + "grad_norm": 0.05012655258178711, + "learning_rate": 3.4715483554595974e-07, + "loss": 0.5029, + "step": 1407 + }, + { + "epoch": 0.9190600522193212, + "grad_norm": 0.05771825835108757, + "learning_rate": 3.416542972391268e-07, + "loss": 0.5213, + "step": 1408 + }, + { + "epoch": 0.9197127937336814, + "grad_norm": 0.05283350124955177, + "learning_rate": 3.361969272089116e-07, + "loss": 0.555, + "step": 1409 + }, + { + "epoch": 0.9203655352480418, + "grad_norm": 0.053370680660009384, + "learning_rate": 3.3078274984719825e-07, + "loss": 0.5161, + "step": 1410 + }, + { + "epoch": 0.9210182767624021, + "grad_norm": 0.054678335785865784, + "learning_rate": 3.254117893528186e-07, + "loss": 0.5444, + "step": 1411 + }, + { + "epoch": 0.9216710182767625, + "grad_norm": 0.05304795876145363, + "learning_rate": 3.2008406973145e-07, + "loss": 0.4347, + "step": 1412 + }, + { + "epoch": 0.9223237597911227, + "grad_norm": 0.052384186536073685, + "learning_rate": 3.147996147955012e-07, + "loss": 0.5218, + "step": 1413 + }, + { + "epoch": 0.922976501305483, + "grad_norm": 0.05954265594482422, + "learning_rate": 3.095584481640068e-07, + "loss": 0.5493, + "step": 1414 + }, + { + "epoch": 0.9236292428198434, + "grad_norm": 0.055753324180841446, + "learning_rate": 3.04360593262526e-07, + "loss": 0.4741, + "step": 1415 + }, + { + "epoch": 0.9242819843342036, + "grad_norm": 0.06900015473365784, + "learning_rate": 2.9920607332302844e-07, + "loss": 0.5573, + "step": 1416 + }, + { + "epoch": 0.924934725848564, + "grad_norm": 0.05286920815706253, + "learning_rate": 2.9409491138380655e-07, + "loss": 0.5286, + "step": 1417 + }, + { + "epoch": 0.9255874673629243, + "grad_norm": 0.055355388671159744, + "learning_rate": 2.8902713028935546e-07, + "loss": 0.5284, + "step": 1418 + }, + { + "epoch": 0.9262402088772846, + "grad_norm": 0.05673305317759514, + "learning_rate": 2.840027526902811e-07, + "loss": 0.5191, + "step": 1419 + }, + { + "epoch": 0.9268929503916449, + "grad_norm": 0.05061788111925125, + "learning_rate": 2.7902180104319443e-07, + "loss": 0.4535, + "step": 1420 + }, + { + "epoch": 0.9275456919060052, + "grad_norm": 0.059594254940748215, + "learning_rate": 2.7408429761061393e-07, + "loss": 0.5076, + "step": 1421 + }, + { + "epoch": 0.9281984334203656, + "grad_norm": 0.05334387719631195, + "learning_rate": 2.691902644608657e-07, + "loss": 0.5151, + "step": 1422 + }, + { + "epoch": 0.9288511749347258, + "grad_norm": 0.05792006850242615, + "learning_rate": 2.643397234679823e-07, + "loss": 0.5472, + "step": 1423 + }, + { + "epoch": 0.9295039164490861, + "grad_norm": 0.05704867094755173, + "learning_rate": 2.5953269631160847e-07, + "loss": 0.4783, + "step": 1424 + }, + { + "epoch": 0.9301566579634465, + "grad_norm": 0.05878971889615059, + "learning_rate": 2.547692044769012e-07, + "loss": 0.5099, + "step": 1425 + }, + { + "epoch": 0.9308093994778068, + "grad_norm": 0.05621445178985596, + "learning_rate": 2.500492692544354e-07, + "loss": 0.5338, + "step": 1426 + }, + { + "epoch": 0.9314621409921671, + "grad_norm": 0.058878008276224136, + "learning_rate": 2.453729117401082e-07, + "loss": 0.5617, + "step": 1427 + }, + { + "epoch": 0.9321148825065274, + "grad_norm": 0.061294734477996826, + "learning_rate": 2.4074015283504504e-07, + "loss": 0.5835, + "step": 1428 + }, + { + "epoch": 0.9327676240208878, + "grad_norm": 0.05842115730047226, + "learning_rate": 2.3615101324550694e-07, + "loss": 0.5393, + "step": 1429 + }, + { + "epoch": 0.933420365535248, + "grad_norm": 0.055717576295137405, + "learning_rate": 2.3160551348279438e-07, + "loss": 0.5517, + "step": 1430 + }, + { + "epoch": 0.9340731070496083, + "grad_norm": 0.054703693836927414, + "learning_rate": 2.2710367386316156e-07, + "loss": 0.5908, + "step": 1431 + }, + { + "epoch": 0.9347258485639687, + "grad_norm": 0.05739562585949898, + "learning_rate": 2.2264551450772e-07, + "loss": 0.5172, + "step": 1432 + }, + { + "epoch": 0.935378590078329, + "grad_norm": 0.05853067338466644, + "learning_rate": 2.1823105534235166e-07, + "loss": 0.6115, + "step": 1433 + }, + { + "epoch": 0.9360313315926893, + "grad_norm": 0.05493749678134918, + "learning_rate": 2.1386031609761937e-07, + "loss": 0.4931, + "step": 1434 + }, + { + "epoch": 0.9366840731070496, + "grad_norm": 0.056509993970394135, + "learning_rate": 2.095333163086777e-07, + "loss": 0.5375, + "step": 1435 + }, + { + "epoch": 0.9373368146214099, + "grad_norm": 0.05576891824603081, + "learning_rate": 2.052500753151876e-07, + "loss": 0.4706, + "step": 1436 + }, + { + "epoch": 0.9379895561357703, + "grad_norm": 0.05480458587408066, + "learning_rate": 2.0101061226122654e-07, + "loss": 0.5043, + "step": 1437 + }, + { + "epoch": 0.9386422976501305, + "grad_norm": 0.06164632365107536, + "learning_rate": 1.9681494609520735e-07, + "loss": 0.5588, + "step": 1438 + }, + { + "epoch": 0.9392950391644909, + "grad_norm": 0.05780002474784851, + "learning_rate": 1.926630955697917e-07, + "loss": 0.4877, + "step": 1439 + }, + { + "epoch": 0.9399477806788512, + "grad_norm": 0.059480633586645126, + "learning_rate": 1.8855507924180337e-07, + "loss": 0.5192, + "step": 1440 + }, + { + "epoch": 0.9406005221932114, + "grad_norm": 0.06673169136047363, + "learning_rate": 1.844909154721497e-07, + "loss": 0.5979, + "step": 1441 + }, + { + "epoch": 0.9412532637075718, + "grad_norm": 0.058235302567481995, + "learning_rate": 1.8047062242573576e-07, + "loss": 0.5195, + "step": 1442 + }, + { + "epoch": 0.9419060052219321, + "grad_norm": 0.057730354368686676, + "learning_rate": 1.764942180713869e-07, + "loss": 0.556, + "step": 1443 + }, + { + "epoch": 0.9425587467362925, + "grad_norm": 0.054570272564888, + "learning_rate": 1.7256172018176864e-07, + "loss": 0.5287, + "step": 1444 + }, + { + "epoch": 0.9432114882506527, + "grad_norm": 0.06531942635774612, + "learning_rate": 1.6867314633330023e-07, + "loss": 0.5772, + "step": 1445 + }, + { + "epoch": 0.943864229765013, + "grad_norm": 0.0553065724670887, + "learning_rate": 1.6482851390608235e-07, + "loss": 0.5504, + "step": 1446 + }, + { + "epoch": 0.9445169712793734, + "grad_norm": 0.057470377534627914, + "learning_rate": 1.6102784008382278e-07, + "loss": 0.5519, + "step": 1447 + }, + { + "epoch": 0.9451697127937336, + "grad_norm": 0.06070839986205101, + "learning_rate": 1.5727114185374758e-07, + "loss": 0.5528, + "step": 1448 + }, + { + "epoch": 0.945822454308094, + "grad_norm": 0.05690762773156166, + "learning_rate": 1.5355843600653896e-07, + "loss": 0.5234, + "step": 1449 + }, + { + "epoch": 0.9464751958224543, + "grad_norm": 0.05993395298719406, + "learning_rate": 1.4988973913625082e-07, + "loss": 0.5453, + "step": 1450 + }, + { + "epoch": 0.9471279373368147, + "grad_norm": 0.05665702000260353, + "learning_rate": 1.4626506764023663e-07, + "loss": 0.5268, + "step": 1451 + }, + { + "epoch": 0.9477806788511749, + "grad_norm": 0.05812077969312668, + "learning_rate": 1.4268443771908058e-07, + "loss": 0.5163, + "step": 1452 + }, + { + "epoch": 0.9484334203655352, + "grad_norm": 0.06277963519096375, + "learning_rate": 1.391478653765177e-07, + "loss": 0.5589, + "step": 1453 + }, + { + "epoch": 0.9490861618798956, + "grad_norm": 0.055003996938467026, + "learning_rate": 1.3565536641936827e-07, + "loss": 0.5688, + "step": 1454 + }, + { + "epoch": 0.9497389033942559, + "grad_norm": 0.055438101291656494, + "learning_rate": 1.3220695645746684e-07, + "loss": 0.5138, + "step": 1455 + }, + { + "epoch": 0.9503916449086162, + "grad_norm": 0.051719021052122116, + "learning_rate": 1.2880265090358668e-07, + "loss": 0.4777, + "step": 1456 + }, + { + "epoch": 0.9510443864229765, + "grad_norm": 0.05389731004834175, + "learning_rate": 1.2544246497337986e-07, + "loss": 0.4902, + "step": 1457 + }, + { + "epoch": 0.9516971279373369, + "grad_norm": 0.061336226761341095, + "learning_rate": 1.2212641368529842e-07, + "loss": 0.5617, + "step": 1458 + }, + { + "epoch": 0.9523498694516971, + "grad_norm": 0.05890428647398949, + "learning_rate": 1.1885451186053886e-07, + "loss": 0.5077, + "step": 1459 + }, + { + "epoch": 0.9530026109660574, + "grad_norm": 0.058837950229644775, + "learning_rate": 1.1562677412296996e-07, + "loss": 0.5116, + "step": 1460 + }, + { + "epoch": 0.9536553524804178, + "grad_norm": 0.05983196198940277, + "learning_rate": 1.1244321489906285e-07, + "loss": 0.5378, + "step": 1461 + }, + { + "epoch": 0.9543080939947781, + "grad_norm": 0.058248184621334076, + "learning_rate": 1.0930384841783548e-07, + "loss": 0.5436, + "step": 1462 + }, + { + "epoch": 0.9549608355091384, + "grad_norm": 0.05723857879638672, + "learning_rate": 1.0620868871078493e-07, + "loss": 0.5372, + "step": 1463 + }, + { + "epoch": 0.9556135770234987, + "grad_norm": 0.059107016772031784, + "learning_rate": 1.0315774961182412e-07, + "loss": 0.4748, + "step": 1464 + }, + { + "epoch": 0.956266318537859, + "grad_norm": 0.05467505007982254, + "learning_rate": 1.0015104475721848e-07, + "loss": 0.5476, + "step": 1465 + }, + { + "epoch": 0.9569190600522193, + "grad_norm": 0.060881637036800385, + "learning_rate": 9.71885875855294e-08, + "loss": 0.564, + "step": 1466 + }, + { + "epoch": 0.9575718015665796, + "grad_norm": 0.05629203841090202, + "learning_rate": 9.4270391337552e-08, + "loss": 0.5343, + "step": 1467 + }, + { + "epoch": 0.95822454308094, + "grad_norm": 0.055595796555280685, + "learning_rate": 9.13964690562552e-08, + "loss": 0.535, + "step": 1468 + }, + { + "epoch": 0.9588772845953003, + "grad_norm": 0.053988490253686905, + "learning_rate": 8.856683358672402e-08, + "loss": 0.5259, + "step": 1469 + }, + { + "epoch": 0.9595300261096605, + "grad_norm": 0.05572625994682312, + "learning_rate": 8.578149757610176e-08, + "loss": 0.5388, + "step": 1470 + }, + { + "epoch": 0.9601827676240209, + "grad_norm": 0.05242741480469704, + "learning_rate": 8.30404734735346e-08, + "loss": 0.4877, + "step": 1471 + }, + { + "epoch": 0.9608355091383812, + "grad_norm": 0.05756179615855217, + "learning_rate": 8.034377353011603e-08, + "loss": 0.5024, + "step": 1472 + }, + { + "epoch": 0.9614882506527415, + "grad_norm": 0.0551273413002491, + "learning_rate": 7.769140979882905e-08, + "loss": 0.558, + "step": 1473 + }, + { + "epoch": 0.9621409921671018, + "grad_norm": 0.056963566690683365, + "learning_rate": 7.508339413449528e-08, + "loss": 0.4535, + "step": 1474 + }, + { + "epoch": 0.9627937336814621, + "grad_norm": 0.05817368999123573, + "learning_rate": 7.251973819372371e-08, + "loss": 0.5771, + "step": 1475 + }, + { + "epoch": 0.9634464751958225, + "grad_norm": 0.057898059487342834, + "learning_rate": 7.000045343485306e-08, + "loss": 0.5658, + "step": 1476 + }, + { + "epoch": 0.9640992167101827, + "grad_norm": 0.05078176409006119, + "learning_rate": 6.752555111790515e-08, + "loss": 0.5092, + "step": 1477 + }, + { + "epoch": 0.9647519582245431, + "grad_norm": 0.05763321742415428, + "learning_rate": 6.509504230453379e-08, + "loss": 0.5376, + "step": 1478 + }, + { + "epoch": 0.9654046997389034, + "grad_norm": 0.055951688438653946, + "learning_rate": 6.270893785797261e-08, + "loss": 0.5519, + "step": 1479 + }, + { + "epoch": 0.9660574412532638, + "grad_norm": 0.055584829300642014, + "learning_rate": 6.036724844299069e-08, + "loss": 0.5564, + "step": 1480 + }, + { + "epoch": 0.966710182767624, + "grad_norm": 0.05595998466014862, + "learning_rate": 5.806998452584034e-08, + "loss": 0.3988, + "step": 1481 + }, + { + "epoch": 0.9673629242819843, + "grad_norm": 0.05466803163290024, + "learning_rate": 5.581715637421492e-08, + "loss": 0.5162, + "step": 1482 + }, + { + "epoch": 0.9680156657963447, + "grad_norm": 0.05414329841732979, + "learning_rate": 5.360877405720111e-08, + "loss": 0.4885, + "step": 1483 + }, + { + "epoch": 0.9686684073107049, + "grad_norm": 0.06734245270490646, + "learning_rate": 5.144484744523004e-08, + "loss": 0.6203, + "step": 1484 + }, + { + "epoch": 0.9693211488250653, + "grad_norm": 0.05423906818032265, + "learning_rate": 4.932538621004068e-08, + "loss": 0.4688, + "step": 1485 + }, + { + "epoch": 0.9699738903394256, + "grad_norm": 0.05275886133313179, + "learning_rate": 4.7250399824629867e-08, + "loss": 0.4565, + "step": 1486 + }, + { + "epoch": 0.970626631853786, + "grad_norm": 0.059688255190849304, + "learning_rate": 4.521989756321565e-08, + "loss": 0.5684, + "step": 1487 + }, + { + "epoch": 0.9712793733681462, + "grad_norm": 0.05637152120471001, + "learning_rate": 4.323388850118848e-08, + "loss": 0.5786, + "step": 1488 + }, + { + "epoch": 0.9719321148825065, + "grad_norm": 0.06485294550657272, + "learning_rate": 4.129238151508008e-08, + "loss": 0.592, + "step": 1489 + }, + { + "epoch": 0.9725848563968669, + "grad_norm": 0.04909993335604668, + "learning_rate": 3.939538528251463e-08, + "loss": 0.4946, + "step": 1490 + }, + { + "epoch": 0.9732375979112271, + "grad_norm": 0.058074407279491425, + "learning_rate": 3.754290828217655e-08, + "loss": 0.5951, + "step": 1491 + }, + { + "epoch": 0.9738903394255874, + "grad_norm": 0.056894298642873764, + "learning_rate": 3.5734958793769426e-08, + "loss": 0.4916, + "step": 1492 + }, + { + "epoch": 0.9745430809399478, + "grad_norm": 0.0597614087164402, + "learning_rate": 3.397154489798049e-08, + "loss": 0.5783, + "step": 1493 + }, + { + "epoch": 0.9751958224543081, + "grad_norm": 0.05714694410562515, + "learning_rate": 3.225267447644065e-08, + "loss": 0.5015, + "step": 1494 + }, + { + "epoch": 0.9758485639686684, + "grad_norm": 0.05466402322053909, + "learning_rate": 3.057835521169783e-08, + "loss": 0.5161, + "step": 1495 + }, + { + "epoch": 0.9765013054830287, + "grad_norm": 0.06516537070274353, + "learning_rate": 2.8948594587170366e-08, + "loss": 0.5422, + "step": 1496 + }, + { + "epoch": 0.9771540469973891, + "grad_norm": 0.055320855230093, + "learning_rate": 2.7363399887128105e-08, + "loss": 0.4968, + "step": 1497 + }, + { + "epoch": 0.9778067885117493, + "grad_norm": 0.062362801283597946, + "learning_rate": 2.5822778196645804e-08, + "loss": 0.6666, + "step": 1498 + }, + { + "epoch": 0.9784595300261096, + "grad_norm": 0.0544227734208107, + "learning_rate": 2.4326736401579788e-08, + "loss": 0.5162, + "step": 1499 + }, + { + "epoch": 0.97911227154047, + "grad_norm": 0.061307311058044434, + "learning_rate": 2.287528118853688e-08, + "loss": 0.5837, + "step": 1500 + }, + { + "epoch": 0.9797650130548303, + "grad_norm": 0.06065037474036217, + "learning_rate": 2.1468419044839984e-08, + "loss": 0.5936, + "step": 1501 + }, + { + "epoch": 0.9804177545691906, + "grad_norm": 0.05570589005947113, + "learning_rate": 2.010615625850365e-08, + "loss": 0.5298, + "step": 1502 + }, + { + "epoch": 0.9810704960835509, + "grad_norm": 0.06140119209885597, + "learning_rate": 1.8788498918204112e-08, + "loss": 0.5371, + "step": 1503 + }, + { + "epoch": 0.9817232375979112, + "grad_norm": 0.05893724039196968, + "learning_rate": 1.7515452913250407e-08, + "loss": 0.5344, + "step": 1504 + }, + { + "epoch": 0.9823759791122716, + "grad_norm": 0.05744578316807747, + "learning_rate": 1.6287023933564407e-08, + "loss": 0.4739, + "step": 1505 + }, + { + "epoch": 0.9830287206266318, + "grad_norm": 0.05734417587518692, + "learning_rate": 1.510321746964416e-08, + "loss": 0.5262, + "step": 1506 + }, + { + "epoch": 0.9836814621409922, + "grad_norm": 0.06392619013786316, + "learning_rate": 1.3964038812551706e-08, + "loss": 0.62, + "step": 1507 + }, + { + "epoch": 0.9843342036553525, + "grad_norm": 0.04519110545516014, + "learning_rate": 1.2869493053880855e-08, + "loss": 0.432, + "step": 1508 + }, + { + "epoch": 0.9849869451697127, + "grad_norm": 0.052288834005594254, + "learning_rate": 1.1819585085737217e-08, + "loss": 0.4645, + "step": 1509 + }, + { + "epoch": 0.9856396866840731, + "grad_norm": 0.06511126458644867, + "learning_rate": 1.0814319600718216e-08, + "loss": 0.5786, + "step": 1510 + }, + { + "epoch": 0.9862924281984334, + "grad_norm": 0.055628255009651184, + "learning_rate": 9.853701091888656e-09, + "loss": 0.4908, + "step": 1511 + }, + { + "epoch": 0.9869451697127938, + "grad_norm": 0.05760030820965767, + "learning_rate": 8.937733852764086e-09, + "loss": 0.6044, + "step": 1512 + }, + { + "epoch": 0.987597911227154, + "grad_norm": 0.048606064170598984, + "learning_rate": 8.066421977286355e-09, + "loss": 0.4827, + "step": 1513 + }, + { + "epoch": 0.9882506527415144, + "grad_norm": 0.05585504323244095, + "learning_rate": 7.239769359811411e-09, + "loss": 0.5, + "step": 1514 + }, + { + "epoch": 0.9889033942558747, + "grad_norm": 0.05974135175347328, + "learning_rate": 6.457779695090427e-09, + "loss": 0.5583, + "step": 1515 + }, + { + "epoch": 0.9895561357702349, + "grad_norm": 0.05551174655556679, + "learning_rate": 5.7204564782498136e-09, + "loss": 0.5376, + "step": 1516 + }, + { + "epoch": 0.9902088772845953, + "grad_norm": 0.05592876672744751, + "learning_rate": 5.027803004779008e-09, + "loss": 0.5015, + "step": 1517 + }, + { + "epoch": 0.9908616187989556, + "grad_norm": 0.058642178773880005, + "learning_rate": 4.37982237051271e-09, + "loss": 0.6064, + "step": 1518 + }, + { + "epoch": 0.991514360313316, + "grad_norm": 0.0588836707174778, + "learning_rate": 3.776517471621999e-09, + "loss": 0.4826, + "step": 1519 + }, + { + "epoch": 0.9921671018276762, + "grad_norm": 0.05897413566708565, + "learning_rate": 3.217891004596574e-09, + "loss": 0.5246, + "step": 1520 + }, + { + "epoch": 0.9928198433420365, + "grad_norm": 0.058347541838884354, + "learning_rate": 2.7039454662336484e-09, + "loss": 0.5419, + "step": 1521 + }, + { + "epoch": 0.9934725848563969, + "grad_norm": 0.06031772866845131, + "learning_rate": 2.2346831536312895e-09, + "loss": 0.5225, + "step": 1522 + }, + { + "epoch": 0.9941253263707572, + "grad_norm": 0.05832480639219284, + "learning_rate": 1.810106164169545e-09, + "loss": 0.5059, + "step": 1523 + }, + { + "epoch": 0.9947780678851175, + "grad_norm": 0.05937207490205765, + "learning_rate": 1.4302163955093317e-09, + "loss": 0.5684, + "step": 1524 + }, + { + "epoch": 0.9954308093994778, + "grad_norm": 0.05283202975988388, + "learning_rate": 1.0950155455802247e-09, + "loss": 0.514, + "step": 1525 + }, + { + "epoch": 0.9960835509138382, + "grad_norm": 0.061174239963293076, + "learning_rate": 8.045051125726844e-10, + "loss": 0.5251, + "step": 1526 + }, + { + "epoch": 0.9967362924281984, + "grad_norm": 0.06188638508319855, + "learning_rate": 5.586863949325061e-10, + "loss": 0.5051, + "step": 1527 + }, + { + "epoch": 0.9973890339425587, + "grad_norm": 0.054148390889167786, + "learning_rate": 3.575604913530484e-10, + "loss": 0.5091, + "step": 1528 + }, + { + "epoch": 0.9980417754569191, + "grad_norm": 0.0549580417573452, + "learning_rate": 2.011283007730125e-10, + "loss": 0.5188, + "step": 1529 + }, + { + "epoch": 0.9986945169712794, + "grad_norm": 0.05601628124713898, + "learning_rate": 8.93905223708913e-11, + "loss": 0.5068, + "step": 1530 + }, + { + "epoch": 0.9993472584856397, + "grad_norm": 0.056755177676677704, + "learning_rate": 2.2347655563859096e-11, + "loss": 0.5351, + "step": 1531 + }, + { + "epoch": 1.0, + "grad_norm": 0.06067736819386482, + "learning_rate": 0.0, + "loss": 0.5493, + "step": 1532 + }, + { + "epoch": 1.0, + "step": 1532, + "total_flos": 0.0, + "train_loss": 0.10018481526222303, + "train_runtime": 5590.0623, + "train_samples_per_second": 280.582, + "train_steps_per_second": 0.274 + } + ], + "logging_steps": 1.0, + "max_steps": 1532, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}