diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8029 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.0, + "eval_steps": 500, + "global_step": 1141, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006134969325153374, + "grad_norm": 0.14804252982139587, + "learning_rate": 8.333333333333333e-07, + "loss": 0.5673, + "step": 1 + }, + { + "epoch": 0.012269938650306749, + "grad_norm": 0.1421855241060257, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.5703, + "step": 2 + }, + { + "epoch": 0.018404907975460124, + "grad_norm": 0.14651049673557281, + "learning_rate": 2.5e-06, + "loss": 0.5759, + "step": 3 + }, + { + "epoch": 0.024539877300613498, + "grad_norm": 0.14899852871894836, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.6024, + "step": 4 + }, + { + "epoch": 0.03067484662576687, + "grad_norm": 0.1509060114622116, + "learning_rate": 4.166666666666667e-06, + "loss": 0.5794, + "step": 5 + }, + { + "epoch": 0.03680981595092025, + "grad_norm": 0.14855755865573883, + "learning_rate": 5e-06, + "loss": 0.5726, + "step": 6 + }, + { + "epoch": 0.04294478527607362, + "grad_norm": 0.161569744348526, + "learning_rate": 5.833333333333334e-06, + "loss": 0.5992, + "step": 7 + }, + { + "epoch": 0.049079754601226995, + "grad_norm": 0.1378004103899002, + "learning_rate": 6.666666666666667e-06, + "loss": 0.5801, + "step": 8 + }, + { + "epoch": 0.05521472392638037, + "grad_norm": 0.1505780816078186, + "learning_rate": 7.500000000000001e-06, + "loss": 0.5923, + "step": 9 + }, + { + "epoch": 0.06134969325153374, + "grad_norm": 0.15210111439228058, + "learning_rate": 8.333333333333334e-06, + "loss": 0.5794, + "step": 10 + }, + { + "epoch": 0.06748466257668712, + "grad_norm": 0.16127604246139526, + "learning_rate": 9.166666666666666e-06, + "loss": 0.5925, + "step": 11 + }, + { + "epoch": 0.0736196319018405, + "grad_norm": 0.15838903188705444, + "learning_rate": 1e-05, + "loss": 0.5879, + "step": 12 + }, + { + "epoch": 0.07975460122699386, + "grad_norm": 0.15082281827926636, + "learning_rate": 9.999980642396502e-06, + "loss": 0.6217, + "step": 13 + }, + { + "epoch": 0.08588957055214724, + "grad_norm": 0.16154126822948456, + "learning_rate": 9.999922569735891e-06, + "loss": 0.5725, + "step": 14 + }, + { + "epoch": 0.09202453987730061, + "grad_norm": 0.16344521939754486, + "learning_rate": 9.999825782467827e-06, + "loss": 0.6094, + "step": 15 + }, + { + "epoch": 0.09815950920245399, + "grad_norm": 0.16911642253398895, + "learning_rate": 9.99969028134174e-06, + "loss": 0.5627, + "step": 16 + }, + { + "epoch": 0.10429447852760736, + "grad_norm": 0.16826483607292175, + "learning_rate": 9.999516067406818e-06, + "loss": 0.5996, + "step": 17 + }, + { + "epoch": 0.11042944785276074, + "grad_norm": 0.16453488171100616, + "learning_rate": 9.999303142012008e-06, + "loss": 0.5984, + "step": 18 + }, + { + "epoch": 0.1165644171779141, + "grad_norm": 0.16882814466953278, + "learning_rate": 9.999051506806e-06, + "loss": 0.5684, + "step": 19 + }, + { + "epoch": 0.12269938650306748, + "grad_norm": 0.15942800045013428, + "learning_rate": 9.998761163737217e-06, + "loss": 0.5879, + "step": 20 + }, + { + "epoch": 0.12883435582822086, + "grad_norm": 0.17792896926403046, + "learning_rate": 9.998432115053796e-06, + "loss": 0.5572, + "step": 21 + }, + { + "epoch": 0.13496932515337423, + "grad_norm": 0.1712748408317566, + "learning_rate": 9.998064363303573e-06, + "loss": 0.5693, + "step": 22 + }, + { + "epoch": 0.1411042944785276, + "grad_norm": 0.1689537614583969, + "learning_rate": 9.997657911334068e-06, + "loss": 0.5584, + "step": 23 + }, + { + "epoch": 0.147239263803681, + "grad_norm": 0.17206093668937683, + "learning_rate": 9.997212762292453e-06, + "loss": 0.5406, + "step": 24 + }, + { + "epoch": 0.15337423312883436, + "grad_norm": 0.1795872151851654, + "learning_rate": 9.996728919625538e-06, + "loss": 0.5767, + "step": 25 + }, + { + "epoch": 0.15950920245398773, + "grad_norm": 0.16666671633720398, + "learning_rate": 9.996206387079736e-06, + "loss": 0.5792, + "step": 26 + }, + { + "epoch": 0.1656441717791411, + "grad_norm": 0.15964794158935547, + "learning_rate": 9.995645168701038e-06, + "loss": 0.5771, + "step": 27 + }, + { + "epoch": 0.17177914110429449, + "grad_norm": 0.1661137342453003, + "learning_rate": 9.995045268834979e-06, + "loss": 0.5695, + "step": 28 + }, + { + "epoch": 0.17791411042944785, + "grad_norm": 0.16640233993530273, + "learning_rate": 9.99440669212661e-06, + "loss": 0.6123, + "step": 29 + }, + { + "epoch": 0.18404907975460122, + "grad_norm": 0.14831523597240448, + "learning_rate": 9.99372944352046e-06, + "loss": 0.5309, + "step": 30 + }, + { + "epoch": 0.1901840490797546, + "grad_norm": 0.13072659075260162, + "learning_rate": 9.993013528260486e-06, + "loss": 0.5782, + "step": 31 + }, + { + "epoch": 0.19631901840490798, + "grad_norm": 0.12631100416183472, + "learning_rate": 9.992258951890057e-06, + "loss": 0.5361, + "step": 32 + }, + { + "epoch": 0.20245398773006135, + "grad_norm": 0.1304839849472046, + "learning_rate": 9.991465720251885e-06, + "loss": 0.5673, + "step": 33 + }, + { + "epoch": 0.2085889570552147, + "grad_norm": 0.12783069908618927, + "learning_rate": 9.990633839487997e-06, + "loss": 0.5662, + "step": 34 + }, + { + "epoch": 0.2147239263803681, + "grad_norm": 0.12577638030052185, + "learning_rate": 9.989763316039678e-06, + "loss": 0.5328, + "step": 35 + }, + { + "epoch": 0.22085889570552147, + "grad_norm": 0.12679477035999298, + "learning_rate": 9.988854156647428e-06, + "loss": 0.5758, + "step": 36 + }, + { + "epoch": 0.22699386503067484, + "grad_norm": 0.1201774999499321, + "learning_rate": 9.987906368350908e-06, + "loss": 0.5598, + "step": 37 + }, + { + "epoch": 0.2331288343558282, + "grad_norm": 0.1210893765091896, + "learning_rate": 9.98691995848888e-06, + "loss": 0.5902, + "step": 38 + }, + { + "epoch": 0.2392638036809816, + "grad_norm": 0.10759458690881729, + "learning_rate": 9.985894934699154e-06, + "loss": 0.524, + "step": 39 + }, + { + "epoch": 0.24539877300613497, + "grad_norm": 0.10544677078723907, + "learning_rate": 9.984831304918537e-06, + "loss": 0.5535, + "step": 40 + }, + { + "epoch": 0.25153374233128833, + "grad_norm": 0.1004803329706192, + "learning_rate": 9.983729077382755e-06, + "loss": 0.5204, + "step": 41 + }, + { + "epoch": 0.25766871165644173, + "grad_norm": 0.10377446562051773, + "learning_rate": 9.982588260626402e-06, + "loss": 0.5319, + "step": 42 + }, + { + "epoch": 0.26380368098159507, + "grad_norm": 0.09945333003997803, + "learning_rate": 9.981408863482872e-06, + "loss": 0.5473, + "step": 43 + }, + { + "epoch": 0.26993865030674846, + "grad_norm": 0.09462588280439377, + "learning_rate": 9.98019089508428e-06, + "loss": 0.5455, + "step": 44 + }, + { + "epoch": 0.27607361963190186, + "grad_norm": 0.09673507511615753, + "learning_rate": 9.97893436486141e-06, + "loss": 0.5175, + "step": 45 + }, + { + "epoch": 0.2822085889570552, + "grad_norm": 0.09385982155799866, + "learning_rate": 9.977639282543627e-06, + "loss": 0.5519, + "step": 46 + }, + { + "epoch": 0.2883435582822086, + "grad_norm": 0.10252804309129715, + "learning_rate": 9.976305658158806e-06, + "loss": 0.5197, + "step": 47 + }, + { + "epoch": 0.294478527607362, + "grad_norm": 0.09260332584381104, + "learning_rate": 9.97493350203326e-06, + "loss": 0.5454, + "step": 48 + }, + { + "epoch": 0.3006134969325153, + "grad_norm": 0.09458579868078232, + "learning_rate": 9.973522824791643e-06, + "loss": 0.5376, + "step": 49 + }, + { + "epoch": 0.3067484662576687, + "grad_norm": 0.10349428653717041, + "learning_rate": 9.972073637356894e-06, + "loss": 0.5747, + "step": 50 + }, + { + "epoch": 0.3128834355828221, + "grad_norm": 0.09267648309469223, + "learning_rate": 9.970585950950129e-06, + "loss": 0.525, + "step": 51 + }, + { + "epoch": 0.31901840490797545, + "grad_norm": 0.08669942617416382, + "learning_rate": 9.969059777090564e-06, + "loss": 0.5121, + "step": 52 + }, + { + "epoch": 0.32515337423312884, + "grad_norm": 0.09225864708423615, + "learning_rate": 9.967495127595427e-06, + "loss": 0.523, + "step": 53 + }, + { + "epoch": 0.3312883435582822, + "grad_norm": 0.09258433431386948, + "learning_rate": 9.965892014579867e-06, + "loss": 0.5275, + "step": 54 + }, + { + "epoch": 0.3374233128834356, + "grad_norm": 0.08944110572338104, + "learning_rate": 9.96425045045685e-06, + "loss": 0.5338, + "step": 55 + }, + { + "epoch": 0.34355828220858897, + "grad_norm": 0.09073130786418915, + "learning_rate": 9.962570447937077e-06, + "loss": 0.5379, + "step": 56 + }, + { + "epoch": 0.3496932515337423, + "grad_norm": 0.10125018656253815, + "learning_rate": 9.960852020028877e-06, + "loss": 0.5216, + "step": 57 + }, + { + "epoch": 0.3558282208588957, + "grad_norm": 0.09612809121608734, + "learning_rate": 9.95909518003811e-06, + "loss": 0.5076, + "step": 58 + }, + { + "epoch": 0.3619631901840491, + "grad_norm": 0.09233148396015167, + "learning_rate": 9.957299941568058e-06, + "loss": 0.5443, + "step": 59 + }, + { + "epoch": 0.36809815950920244, + "grad_norm": 0.14108285307884216, + "learning_rate": 9.955466318519327e-06, + "loss": 0.5588, + "step": 60 + }, + { + "epoch": 0.37423312883435583, + "grad_norm": 0.09588061273097992, + "learning_rate": 9.953594325089738e-06, + "loss": 0.506, + "step": 61 + }, + { + "epoch": 0.3803680981595092, + "grad_norm": 0.08795749396085739, + "learning_rate": 9.951683975774213e-06, + "loss": 0.5158, + "step": 62 + }, + { + "epoch": 0.38650306748466257, + "grad_norm": 0.0903969258069992, + "learning_rate": 9.949735285364666e-06, + "loss": 0.5337, + "step": 63 + }, + { + "epoch": 0.39263803680981596, + "grad_norm": 0.09337311238050461, + "learning_rate": 9.947748268949885e-06, + "loss": 0.5313, + "step": 64 + }, + { + "epoch": 0.3987730061349693, + "grad_norm": 0.08544723689556122, + "learning_rate": 9.945722941915424e-06, + "loss": 0.5398, + "step": 65 + }, + { + "epoch": 0.4049079754601227, + "grad_norm": 0.08584097027778625, + "learning_rate": 9.943659319943472e-06, + "loss": 0.5116, + "step": 66 + }, + { + "epoch": 0.4110429447852761, + "grad_norm": 0.08828586339950562, + "learning_rate": 9.941557419012742e-06, + "loss": 0.5004, + "step": 67 + }, + { + "epoch": 0.4171779141104294, + "grad_norm": 0.0884728655219078, + "learning_rate": 9.939417255398336e-06, + "loss": 0.5463, + "step": 68 + }, + { + "epoch": 0.4233128834355828, + "grad_norm": 0.08237382769584656, + "learning_rate": 9.93723884567163e-06, + "loss": 0.4999, + "step": 69 + }, + { + "epoch": 0.4294478527607362, + "grad_norm": 0.07927916198968887, + "learning_rate": 9.935022206700145e-06, + "loss": 0.5056, + "step": 70 + }, + { + "epoch": 0.43558282208588955, + "grad_norm": 0.08695581555366516, + "learning_rate": 9.932767355647404e-06, + "loss": 0.5327, + "step": 71 + }, + { + "epoch": 0.44171779141104295, + "grad_norm": 0.0842549055814743, + "learning_rate": 9.930474309972813e-06, + "loss": 0.5371, + "step": 72 + }, + { + "epoch": 0.44785276073619634, + "grad_norm": 0.09977614879608154, + "learning_rate": 9.92814308743152e-06, + "loss": 0.5404, + "step": 73 + }, + { + "epoch": 0.4539877300613497, + "grad_norm": 0.08078821003437042, + "learning_rate": 9.925773706074278e-06, + "loss": 0.5223, + "step": 74 + }, + { + "epoch": 0.4601226993865031, + "grad_norm": 0.08402088284492493, + "learning_rate": 9.923366184247306e-06, + "loss": 0.4808, + "step": 75 + }, + { + "epoch": 0.4662576687116564, + "grad_norm": 0.08832000941038132, + "learning_rate": 9.920920540592141e-06, + "loss": 0.5187, + "step": 76 + }, + { + "epoch": 0.4723926380368098, + "grad_norm": 0.0865439772605896, + "learning_rate": 9.918436794045507e-06, + "loss": 0.5013, + "step": 77 + }, + { + "epoch": 0.4785276073619632, + "grad_norm": 0.0843573585152626, + "learning_rate": 9.915914963839154e-06, + "loss": 0.5203, + "step": 78 + }, + { + "epoch": 0.48466257668711654, + "grad_norm": 0.08602918684482574, + "learning_rate": 9.91335506949972e-06, + "loss": 0.509, + "step": 79 + }, + { + "epoch": 0.49079754601226994, + "grad_norm": 0.11076867580413818, + "learning_rate": 9.910757130848571e-06, + "loss": 0.5022, + "step": 80 + }, + { + "epoch": 0.49693251533742333, + "grad_norm": 0.08907414972782135, + "learning_rate": 9.908121168001657e-06, + "loss": 0.524, + "step": 81 + }, + { + "epoch": 0.5030674846625767, + "grad_norm": 0.09341124445199966, + "learning_rate": 9.90544720136934e-06, + "loss": 0.4973, + "step": 82 + }, + { + "epoch": 0.50920245398773, + "grad_norm": 0.10275765508413315, + "learning_rate": 9.902735251656263e-06, + "loss": 0.5031, + "step": 83 + }, + { + "epoch": 0.5153374233128835, + "grad_norm": 0.08170262724161148, + "learning_rate": 9.89998533986116e-06, + "loss": 0.4829, + "step": 84 + }, + { + "epoch": 0.5214723926380368, + "grad_norm": 0.08169631659984589, + "learning_rate": 9.897197487276712e-06, + "loss": 0.4893, + "step": 85 + }, + { + "epoch": 0.5276073619631901, + "grad_norm": 0.07817935198545456, + "learning_rate": 9.894371715489376e-06, + "loss": 0.5155, + "step": 86 + }, + { + "epoch": 0.5337423312883436, + "grad_norm": 0.0947134792804718, + "learning_rate": 9.891508046379225e-06, + "loss": 0.5513, + "step": 87 + }, + { + "epoch": 0.5398773006134969, + "grad_norm": 0.08149490505456924, + "learning_rate": 9.888606502119763e-06, + "loss": 0.4797, + "step": 88 + }, + { + "epoch": 0.5460122699386503, + "grad_norm": 0.09330648183822632, + "learning_rate": 9.885667105177769e-06, + "loss": 0.4906, + "step": 89 + }, + { + "epoch": 0.5521472392638037, + "grad_norm": 0.07798902690410614, + "learning_rate": 9.882689878313114e-06, + "loss": 0.4751, + "step": 90 + }, + { + "epoch": 0.558282208588957, + "grad_norm": 0.08369090408086777, + "learning_rate": 9.879674844578588e-06, + "loss": 0.5378, + "step": 91 + }, + { + "epoch": 0.5644171779141104, + "grad_norm": 0.09182888269424438, + "learning_rate": 9.876622027319726e-06, + "loss": 0.5062, + "step": 92 + }, + { + "epoch": 0.5705521472392638, + "grad_norm": 0.07969052344560623, + "learning_rate": 9.873531450174616e-06, + "loss": 0.5108, + "step": 93 + }, + { + "epoch": 0.5766871165644172, + "grad_norm": 0.1103825718164444, + "learning_rate": 9.870403137073723e-06, + "loss": 0.5127, + "step": 94 + }, + { + "epoch": 0.5828220858895705, + "grad_norm": 0.0967809185385704, + "learning_rate": 9.867237112239708e-06, + "loss": 0.4884, + "step": 95 + }, + { + "epoch": 0.588957055214724, + "grad_norm": 0.07554975152015686, + "learning_rate": 9.86403340018723e-06, + "loss": 0.4743, + "step": 96 + }, + { + "epoch": 0.5950920245398773, + "grad_norm": 0.09686005860567093, + "learning_rate": 9.860792025722768e-06, + "loss": 0.5033, + "step": 97 + }, + { + "epoch": 0.6012269938650306, + "grad_norm": 0.07932725548744202, + "learning_rate": 9.857513013944413e-06, + "loss": 0.5121, + "step": 98 + }, + { + "epoch": 0.6073619631901841, + "grad_norm": 0.08431375026702881, + "learning_rate": 9.854196390241691e-06, + "loss": 0.5058, + "step": 99 + }, + { + "epoch": 0.6134969325153374, + "grad_norm": 0.081431545317173, + "learning_rate": 9.85084218029536e-06, + "loss": 0.5138, + "step": 100 + }, + { + "epoch": 0.6196319018404908, + "grad_norm": 0.08625893294811249, + "learning_rate": 9.847450410077202e-06, + "loss": 0.5052, + "step": 101 + }, + { + "epoch": 0.6257668711656442, + "grad_norm": 0.08018495887517929, + "learning_rate": 9.844021105849837e-06, + "loss": 0.5018, + "step": 102 + }, + { + "epoch": 0.6319018404907976, + "grad_norm": 0.07922230660915375, + "learning_rate": 9.840554294166507e-06, + "loss": 0.4949, + "step": 103 + }, + { + "epoch": 0.6380368098159509, + "grad_norm": 0.07988546043634415, + "learning_rate": 9.83705000187088e-06, + "loss": 0.4774, + "step": 104 + }, + { + "epoch": 0.6441717791411042, + "grad_norm": 0.08108440041542053, + "learning_rate": 9.833508256096837e-06, + "loss": 0.508, + "step": 105 + }, + { + "epoch": 0.6503067484662577, + "grad_norm": 0.07891630381345749, + "learning_rate": 9.829929084268262e-06, + "loss": 0.4729, + "step": 106 + }, + { + "epoch": 0.656441717791411, + "grad_norm": 0.07818285375833511, + "learning_rate": 9.82631251409883e-06, + "loss": 0.4535, + "step": 107 + }, + { + "epoch": 0.6625766871165644, + "grad_norm": 0.07954879850149155, + "learning_rate": 9.822658573591794e-06, + "loss": 0.5095, + "step": 108 + }, + { + "epoch": 0.6687116564417178, + "grad_norm": 0.08702922612428665, + "learning_rate": 9.818967291039767e-06, + "loss": 0.5049, + "step": 109 + }, + { + "epoch": 0.6748466257668712, + "grad_norm": 0.07859531790018082, + "learning_rate": 9.8152386950245e-06, + "loss": 0.4565, + "step": 110 + }, + { + "epoch": 0.6809815950920245, + "grad_norm": 0.09203047305345535, + "learning_rate": 9.811472814416669e-06, + "loss": 0.487, + "step": 111 + }, + { + "epoch": 0.6871165644171779, + "grad_norm": 0.08309823274612427, + "learning_rate": 9.807669678375643e-06, + "loss": 0.4731, + "step": 112 + }, + { + "epoch": 0.6932515337423313, + "grad_norm": 0.0878622755408287, + "learning_rate": 9.803829316349262e-06, + "loss": 0.5039, + "step": 113 + }, + { + "epoch": 0.6993865030674846, + "grad_norm": 0.09177032113075256, + "learning_rate": 9.799951758073607e-06, + "loss": 0.4605, + "step": 114 + }, + { + "epoch": 0.7055214723926381, + "grad_norm": 0.08374316245317459, + "learning_rate": 9.796037033572771e-06, + "loss": 0.4926, + "step": 115 + }, + { + "epoch": 0.7116564417177914, + "grad_norm": 0.08500220626592636, + "learning_rate": 9.792085173158633e-06, + "loss": 0.5156, + "step": 116 + }, + { + "epoch": 0.7177914110429447, + "grad_norm": 0.10756376385688782, + "learning_rate": 9.788096207430608e-06, + "loss": 0.4892, + "step": 117 + }, + { + "epoch": 0.7239263803680982, + "grad_norm": 0.08716326951980591, + "learning_rate": 9.784070167275422e-06, + "loss": 0.488, + "step": 118 + }, + { + "epoch": 0.7300613496932515, + "grad_norm": 0.08646374195814133, + "learning_rate": 9.780007083866872e-06, + "loss": 0.5176, + "step": 119 + }, + { + "epoch": 0.7361963190184049, + "grad_norm": 0.08787130564451218, + "learning_rate": 9.775906988665583e-06, + "loss": 0.5049, + "step": 120 + }, + { + "epoch": 0.7423312883435583, + "grad_norm": 0.11348209530115128, + "learning_rate": 9.771769913418758e-06, + "loss": 0.4736, + "step": 121 + }, + { + "epoch": 0.7484662576687117, + "grad_norm": 0.09234916418790817, + "learning_rate": 9.767595890159944e-06, + "loss": 0.4943, + "step": 122 + }, + { + "epoch": 0.754601226993865, + "grad_norm": 0.08790959417819977, + "learning_rate": 9.763384951208776e-06, + "loss": 0.5097, + "step": 123 + }, + { + "epoch": 0.7607361963190185, + "grad_norm": 0.08863342553377151, + "learning_rate": 9.759137129170728e-06, + "loss": 0.5241, + "step": 124 + }, + { + "epoch": 0.7668711656441718, + "grad_norm": 0.08124402910470963, + "learning_rate": 9.754852456936862e-06, + "loss": 0.4759, + "step": 125 + }, + { + "epoch": 0.7730061349693251, + "grad_norm": 0.08492821455001831, + "learning_rate": 9.750530967683573e-06, + "loss": 0.478, + "step": 126 + }, + { + "epoch": 0.7791411042944786, + "grad_norm": 0.09666764736175537, + "learning_rate": 9.746172694872332e-06, + "loss": 0.4842, + "step": 127 + }, + { + "epoch": 0.7852760736196319, + "grad_norm": 0.08827357739210129, + "learning_rate": 9.741777672249424e-06, + "loss": 0.5038, + "step": 128 + }, + { + "epoch": 0.7914110429447853, + "grad_norm": 0.09018061310052872, + "learning_rate": 9.737345933845692e-06, + "loss": 0.5357, + "step": 129 + }, + { + "epoch": 0.7975460122699386, + "grad_norm": 0.09164313971996307, + "learning_rate": 9.732877513976269e-06, + "loss": 0.5167, + "step": 130 + }, + { + "epoch": 0.803680981595092, + "grad_norm": 0.08618041127920151, + "learning_rate": 9.728372447240315e-06, + "loss": 0.4807, + "step": 131 + }, + { + "epoch": 0.8098159509202454, + "grad_norm": 0.10933379828929901, + "learning_rate": 9.72383076852075e-06, + "loss": 0.5073, + "step": 132 + }, + { + "epoch": 0.8159509202453987, + "grad_norm": 0.08988262712955475, + "learning_rate": 9.71925251298398e-06, + "loss": 0.5095, + "step": 133 + }, + { + "epoch": 0.8220858895705522, + "grad_norm": 0.08735327422618866, + "learning_rate": 9.714637716079627e-06, + "loss": 0.4838, + "step": 134 + }, + { + "epoch": 0.8282208588957055, + "grad_norm": 0.08958423137664795, + "learning_rate": 9.709986413540254e-06, + "loss": 0.4931, + "step": 135 + }, + { + "epoch": 0.8343558282208589, + "grad_norm": 0.08985975384712219, + "learning_rate": 9.705298641381089e-06, + "loss": 0.4776, + "step": 136 + }, + { + "epoch": 0.8404907975460123, + "grad_norm": 0.10067565739154816, + "learning_rate": 9.700574435899745e-06, + "loss": 0.4672, + "step": 137 + }, + { + "epoch": 0.8466257668711656, + "grad_norm": 0.09510832279920578, + "learning_rate": 9.695813833675943e-06, + "loss": 0.4497, + "step": 138 + }, + { + "epoch": 0.852760736196319, + "grad_norm": 0.09438912570476532, + "learning_rate": 9.691016871571219e-06, + "loss": 0.4935, + "step": 139 + }, + { + "epoch": 0.8588957055214724, + "grad_norm": 0.0871388390660286, + "learning_rate": 9.686183586728654e-06, + "loss": 0.5168, + "step": 140 + }, + { + "epoch": 0.8650306748466258, + "grad_norm": 0.08199909329414368, + "learning_rate": 9.681314016572572e-06, + "loss": 0.4852, + "step": 141 + }, + { + "epoch": 0.8711656441717791, + "grad_norm": 0.08726981282234192, + "learning_rate": 9.676408198808253e-06, + "loss": 0.468, + "step": 142 + }, + { + "epoch": 0.8773006134969326, + "grad_norm": 0.09949006140232086, + "learning_rate": 9.671466171421651e-06, + "loss": 0.4706, + "step": 143 + }, + { + "epoch": 0.8834355828220859, + "grad_norm": 0.0843021422624588, + "learning_rate": 9.666487972679085e-06, + "loss": 0.4729, + "step": 144 + }, + { + "epoch": 0.8895705521472392, + "grad_norm": 0.0966368094086647, + "learning_rate": 9.661473641126954e-06, + "loss": 0.4628, + "step": 145 + }, + { + "epoch": 0.8957055214723927, + "grad_norm": 0.09247634559869766, + "learning_rate": 9.65642321559144e-06, + "loss": 0.4907, + "step": 146 + }, + { + "epoch": 0.901840490797546, + "grad_norm": 0.09630908071994781, + "learning_rate": 9.651336735178191e-06, + "loss": 0.4878, + "step": 147 + }, + { + "epoch": 0.9079754601226994, + "grad_norm": 0.09485550224781036, + "learning_rate": 9.646214239272038e-06, + "loss": 0.4735, + "step": 148 + }, + { + "epoch": 0.9141104294478528, + "grad_norm": 0.09002482891082764, + "learning_rate": 9.64105576753668e-06, + "loss": 0.4755, + "step": 149 + }, + { + "epoch": 0.9202453987730062, + "grad_norm": 0.09443383663892746, + "learning_rate": 9.635861359914374e-06, + "loss": 0.486, + "step": 150 + }, + { + "epoch": 0.9263803680981595, + "grad_norm": 0.08977822959423065, + "learning_rate": 9.630631056625635e-06, + "loss": 0.4966, + "step": 151 + }, + { + "epoch": 0.9325153374233128, + "grad_norm": 0.09370667487382889, + "learning_rate": 9.62536489816892e-06, + "loss": 0.4792, + "step": 152 + }, + { + "epoch": 0.9386503067484663, + "grad_norm": 0.09303581714630127, + "learning_rate": 9.620062925320309e-06, + "loss": 0.4648, + "step": 153 + }, + { + "epoch": 0.9447852760736196, + "grad_norm": 0.08958807587623596, + "learning_rate": 9.614725179133197e-06, + "loss": 0.4786, + "step": 154 + }, + { + "epoch": 0.950920245398773, + "grad_norm": 0.10791140049695969, + "learning_rate": 9.609351700937976e-06, + "loss": 0.472, + "step": 155 + }, + { + "epoch": 0.9570552147239264, + "grad_norm": 0.09399349987506866, + "learning_rate": 9.60394253234171e-06, + "loss": 0.4529, + "step": 156 + }, + { + "epoch": 0.9631901840490797, + "grad_norm": 0.09015163034200668, + "learning_rate": 9.598497715227815e-06, + "loss": 0.4839, + "step": 157 + }, + { + "epoch": 0.9693251533742331, + "grad_norm": 0.09602297842502594, + "learning_rate": 9.593017291755733e-06, + "loss": 0.4806, + "step": 158 + }, + { + "epoch": 0.9754601226993865, + "grad_norm": 0.097842738032341, + "learning_rate": 9.587501304360612e-06, + "loss": 0.4518, + "step": 159 + }, + { + "epoch": 0.9815950920245399, + "grad_norm": 0.11291999369859695, + "learning_rate": 9.581949795752972e-06, + "loss": 0.4375, + "step": 160 + }, + { + "epoch": 0.9877300613496932, + "grad_norm": 0.0913916602730751, + "learning_rate": 9.576362808918368e-06, + "loss": 0.4771, + "step": 161 + }, + { + "epoch": 0.9938650306748467, + "grad_norm": 0.10012573003768921, + "learning_rate": 9.570740387117078e-06, + "loss": 0.4775, + "step": 162 + }, + { + "epoch": 1.0, + "grad_norm": 0.09484586119651794, + "learning_rate": 9.565082573883745e-06, + "loss": 0.4859, + "step": 163 + }, + { + "epoch": 1.0061349693251533, + "grad_norm": 0.09999288618564606, + "learning_rate": 9.559389413027048e-06, + "loss": 0.4723, + "step": 164 + }, + { + "epoch": 1.0122699386503067, + "grad_norm": 0.09577009081840515, + "learning_rate": 9.553660948629369e-06, + "loss": 0.492, + "step": 165 + }, + { + "epoch": 1.01840490797546, + "grad_norm": 0.1010693684220314, + "learning_rate": 9.547897225046445e-06, + "loss": 0.489, + "step": 166 + }, + { + "epoch": 1.0245398773006136, + "grad_norm": 0.10461648553609848, + "learning_rate": 9.542098286907024e-06, + "loss": 0.4795, + "step": 167 + }, + { + "epoch": 1.030674846625767, + "grad_norm": 0.10396389663219452, + "learning_rate": 9.536264179112529e-06, + "loss": 0.464, + "step": 168 + }, + { + "epoch": 1.0368098159509203, + "grad_norm": 0.09160111844539642, + "learning_rate": 9.530394946836694e-06, + "loss": 0.4997, + "step": 169 + }, + { + "epoch": 1.0429447852760736, + "grad_norm": 0.09714648127555847, + "learning_rate": 9.524490635525228e-06, + "loss": 0.4724, + "step": 170 + }, + { + "epoch": 1.049079754601227, + "grad_norm": 0.10137296468019485, + "learning_rate": 9.51855129089546e-06, + "loss": 0.4802, + "step": 171 + }, + { + "epoch": 1.0552147239263803, + "grad_norm": 0.09933356940746307, + "learning_rate": 9.51257695893598e-06, + "loss": 0.4478, + "step": 172 + }, + { + "epoch": 1.0613496932515338, + "grad_norm": 0.0967121571302414, + "learning_rate": 9.506567685906289e-06, + "loss": 0.4496, + "step": 173 + }, + { + "epoch": 1.0674846625766872, + "grad_norm": 0.10460793972015381, + "learning_rate": 9.500523518336435e-06, + "loss": 0.4569, + "step": 174 + }, + { + "epoch": 1.0736196319018405, + "grad_norm": 0.11570947617292404, + "learning_rate": 9.494444503026656e-06, + "loss": 0.4808, + "step": 175 + }, + { + "epoch": 1.0797546012269938, + "grad_norm": 0.1063118651509285, + "learning_rate": 9.488330687047025e-06, + "loss": 0.4889, + "step": 176 + }, + { + "epoch": 1.0858895705521472, + "grad_norm": 0.09846626967191696, + "learning_rate": 9.482182117737066e-06, + "loss": 0.4876, + "step": 177 + }, + { + "epoch": 1.0920245398773005, + "grad_norm": 0.09458671510219574, + "learning_rate": 9.475998842705412e-06, + "loss": 0.4633, + "step": 178 + }, + { + "epoch": 1.098159509202454, + "grad_norm": 0.0982397273182869, + "learning_rate": 9.469780909829411e-06, + "loss": 0.462, + "step": 179 + }, + { + "epoch": 1.1042944785276074, + "grad_norm": 0.09949040412902832, + "learning_rate": 9.46352836725478e-06, + "loss": 0.4677, + "step": 180 + }, + { + "epoch": 1.1104294478527608, + "grad_norm": 0.10287673771381378, + "learning_rate": 9.457241263395212e-06, + "loss": 0.4743, + "step": 181 + }, + { + "epoch": 1.116564417177914, + "grad_norm": 0.10356439650058746, + "learning_rate": 9.450919646932013e-06, + "loss": 0.483, + "step": 182 + }, + { + "epoch": 1.1226993865030674, + "grad_norm": 0.09517747908830643, + "learning_rate": 9.44456356681372e-06, + "loss": 0.4462, + "step": 183 + }, + { + "epoch": 1.1288343558282208, + "grad_norm": 0.12741521000862122, + "learning_rate": 9.438173072255727e-06, + "loss": 0.4577, + "step": 184 + }, + { + "epoch": 1.1349693251533743, + "grad_norm": 0.10354242473840714, + "learning_rate": 9.431748212739897e-06, + "loss": 0.4308, + "step": 185 + }, + { + "epoch": 1.1411042944785277, + "grad_norm": 0.11123675853013992, + "learning_rate": 9.425289038014184e-06, + "loss": 0.4937, + "step": 186 + }, + { + "epoch": 1.147239263803681, + "grad_norm": 0.09957747161388397, + "learning_rate": 9.418795598092243e-06, + "loss": 0.4646, + "step": 187 + }, + { + "epoch": 1.1533742331288344, + "grad_norm": 0.10711025446653366, + "learning_rate": 9.41226794325305e-06, + "loss": 0.5061, + "step": 188 + }, + { + "epoch": 1.1595092024539877, + "grad_norm": 0.09933678060770035, + "learning_rate": 9.405706124040506e-06, + "loss": 0.4725, + "step": 189 + }, + { + "epoch": 1.165644171779141, + "grad_norm": 0.10218477994203568, + "learning_rate": 9.399110191263048e-06, + "loss": 0.4745, + "step": 190 + }, + { + "epoch": 1.1717791411042944, + "grad_norm": 0.10867977887392044, + "learning_rate": 9.392480195993258e-06, + "loss": 0.4866, + "step": 191 + }, + { + "epoch": 1.177914110429448, + "grad_norm": 0.10019568353891373, + "learning_rate": 9.385816189567462e-06, + "loss": 0.4638, + "step": 192 + }, + { + "epoch": 1.1840490797546013, + "grad_norm": 0.0998578816652298, + "learning_rate": 9.379118223585342e-06, + "loss": 0.4609, + "step": 193 + }, + { + "epoch": 1.1901840490797546, + "grad_norm": 0.10544786602258682, + "learning_rate": 9.372386349909521e-06, + "loss": 0.4702, + "step": 194 + }, + { + "epoch": 1.196319018404908, + "grad_norm": 0.11477886140346527, + "learning_rate": 9.365620620665176e-06, + "loss": 0.4664, + "step": 195 + }, + { + "epoch": 1.2024539877300613, + "grad_norm": 0.09951955080032349, + "learning_rate": 9.358821088239632e-06, + "loss": 0.4672, + "step": 196 + }, + { + "epoch": 1.2085889570552146, + "grad_norm": 0.10202579945325851, + "learning_rate": 9.351987805281949e-06, + "loss": 0.4728, + "step": 197 + }, + { + "epoch": 1.2147239263803682, + "grad_norm": 0.11044152826070786, + "learning_rate": 9.345120824702515e-06, + "loss": 0.4306, + "step": 198 + }, + { + "epoch": 1.2208588957055215, + "grad_norm": 0.10835869610309601, + "learning_rate": 9.338220199672652e-06, + "loss": 0.4879, + "step": 199 + }, + { + "epoch": 1.2269938650306749, + "grad_norm": 0.10329335927963257, + "learning_rate": 9.331285983624182e-06, + "loss": 0.4728, + "step": 200 + }, + { + "epoch": 1.2331288343558282, + "grad_norm": 0.10831797868013382, + "learning_rate": 9.324318230249026e-06, + "loss": 0.4789, + "step": 201 + }, + { + "epoch": 1.2392638036809815, + "grad_norm": 0.10910773277282715, + "learning_rate": 9.317316993498788e-06, + "loss": 0.4642, + "step": 202 + }, + { + "epoch": 1.2453987730061349, + "grad_norm": 0.10746899992227554, + "learning_rate": 9.310282327584335e-06, + "loss": 0.4659, + "step": 203 + }, + { + "epoch": 1.2515337423312882, + "grad_norm": 0.11725542694330215, + "learning_rate": 9.303214286975373e-06, + "loss": 0.4629, + "step": 204 + }, + { + "epoch": 1.2576687116564418, + "grad_norm": 0.10740689188241959, + "learning_rate": 9.296112926400038e-06, + "loss": 0.4835, + "step": 205 + }, + { + "epoch": 1.2638036809815951, + "grad_norm": 0.10859764367341995, + "learning_rate": 9.288978300844456e-06, + "loss": 0.4555, + "step": 206 + }, + { + "epoch": 1.2699386503067485, + "grad_norm": 0.11524678766727448, + "learning_rate": 9.281810465552327e-06, + "loss": 0.4597, + "step": 207 + }, + { + "epoch": 1.2760736196319018, + "grad_norm": 0.11158965528011322, + "learning_rate": 9.274609476024499e-06, + "loss": 0.4903, + "step": 208 + }, + { + "epoch": 1.2822085889570551, + "grad_norm": 0.10941608250141144, + "learning_rate": 9.26737538801853e-06, + "loss": 0.4778, + "step": 209 + }, + { + "epoch": 1.2883435582822087, + "grad_norm": 0.10840465873479843, + "learning_rate": 9.260108257548264e-06, + "loss": 0.4635, + "step": 210 + }, + { + "epoch": 1.294478527607362, + "grad_norm": 0.11878825724124908, + "learning_rate": 9.252808140883393e-06, + "loss": 0.4668, + "step": 211 + }, + { + "epoch": 1.3006134969325154, + "grad_norm": 0.11123567819595337, + "learning_rate": 9.24547509454902e-06, + "loss": 0.4587, + "step": 212 + }, + { + "epoch": 1.3067484662576687, + "grad_norm": 0.1218094527721405, + "learning_rate": 9.238109175325232e-06, + "loss": 0.4741, + "step": 213 + }, + { + "epoch": 1.312883435582822, + "grad_norm": 0.11062923818826675, + "learning_rate": 9.230710440246642e-06, + "loss": 0.4712, + "step": 214 + }, + { + "epoch": 1.3190184049079754, + "grad_norm": 0.11379806697368622, + "learning_rate": 9.223278946601963e-06, + "loss": 0.4822, + "step": 215 + }, + { + "epoch": 1.3251533742331287, + "grad_norm": 0.11029627174139023, + "learning_rate": 9.215814751933559e-06, + "loss": 0.4479, + "step": 216 + }, + { + "epoch": 1.331288343558282, + "grad_norm": 0.10827028751373291, + "learning_rate": 9.208317914036997e-06, + "loss": 0.4707, + "step": 217 + }, + { + "epoch": 1.3374233128834356, + "grad_norm": 0.11715540289878845, + "learning_rate": 9.200788490960605e-06, + "loss": 0.4576, + "step": 218 + }, + { + "epoch": 1.343558282208589, + "grad_norm": 0.11236132681369781, + "learning_rate": 9.193226541005015e-06, + "loss": 0.4646, + "step": 219 + }, + { + "epoch": 1.3496932515337423, + "grad_norm": 0.1052820086479187, + "learning_rate": 9.185632122722719e-06, + "loss": 0.4761, + "step": 220 + }, + { + "epoch": 1.3558282208588956, + "grad_norm": 0.12960059940814972, + "learning_rate": 9.178005294917615e-06, + "loss": 0.4741, + "step": 221 + }, + { + "epoch": 1.3619631901840492, + "grad_norm": 0.11204280704259872, + "learning_rate": 9.170346116644545e-06, + "loss": 0.4752, + "step": 222 + }, + { + "epoch": 1.3680981595092025, + "grad_norm": 0.11414473503828049, + "learning_rate": 9.16265464720884e-06, + "loss": 0.479, + "step": 223 + }, + { + "epoch": 1.3742331288343559, + "grad_norm": 0.12588758766651154, + "learning_rate": 9.154930946165872e-06, + "loss": 0.4299, + "step": 224 + }, + { + "epoch": 1.3803680981595092, + "grad_norm": 0.12927886843681335, + "learning_rate": 9.147175073320574e-06, + "loss": 0.4669, + "step": 225 + }, + { + "epoch": 1.3865030674846626, + "grad_norm": 0.1076217070221901, + "learning_rate": 9.13938708872699e-06, + "loss": 0.4788, + "step": 226 + }, + { + "epoch": 1.392638036809816, + "grad_norm": 0.11360085755586624, + "learning_rate": 9.131567052687811e-06, + "loss": 0.4374, + "step": 227 + }, + { + "epoch": 1.3987730061349692, + "grad_norm": 0.11873716115951538, + "learning_rate": 9.123715025753896e-06, + "loss": 0.4401, + "step": 228 + }, + { + "epoch": 1.4049079754601226, + "grad_norm": 0.11451287567615509, + "learning_rate": 9.115831068723816e-06, + "loss": 0.4811, + "step": 229 + }, + { + "epoch": 1.4110429447852761, + "grad_norm": 0.11967512965202332, + "learning_rate": 9.10791524264338e-06, + "loss": 0.4767, + "step": 230 + }, + { + "epoch": 1.4171779141104295, + "grad_norm": 0.11932694166898727, + "learning_rate": 9.099967608805152e-06, + "loss": 0.4567, + "step": 231 + }, + { + "epoch": 1.4233128834355828, + "grad_norm": 0.11423443257808685, + "learning_rate": 9.091988228747992e-06, + "loss": 0.4549, + "step": 232 + }, + { + "epoch": 1.4294478527607362, + "grad_norm": 0.11714685708284378, + "learning_rate": 9.08397716425657e-06, + "loss": 0.4259, + "step": 233 + }, + { + "epoch": 1.4355828220858895, + "grad_norm": 0.1315395087003708, + "learning_rate": 9.07593447736089e-06, + "loss": 0.502, + "step": 234 + }, + { + "epoch": 1.441717791411043, + "grad_norm": 0.1216348186135292, + "learning_rate": 9.06786023033581e-06, + "loss": 0.4622, + "step": 235 + }, + { + "epoch": 1.4478527607361964, + "grad_norm": 0.11694086343050003, + "learning_rate": 9.059754485700557e-06, + "loss": 0.4413, + "step": 236 + }, + { + "epoch": 1.4539877300613497, + "grad_norm": 0.12045416980981827, + "learning_rate": 9.05161730621825e-06, + "loss": 0.4579, + "step": 237 + }, + { + "epoch": 1.460122699386503, + "grad_norm": 0.11791419237852097, + "learning_rate": 9.043448754895405e-06, + "loss": 0.4712, + "step": 238 + }, + { + "epoch": 1.4662576687116564, + "grad_norm": 0.12033417075872421, + "learning_rate": 9.035248894981454e-06, + "loss": 0.4787, + "step": 239 + }, + { + "epoch": 1.4723926380368098, + "grad_norm": 0.1140502542257309, + "learning_rate": 9.02701778996825e-06, + "loss": 0.4909, + "step": 240 + }, + { + "epoch": 1.478527607361963, + "grad_norm": 0.1263529509305954, + "learning_rate": 9.018755503589582e-06, + "loss": 0.4606, + "step": 241 + }, + { + "epoch": 1.4846625766871164, + "grad_norm": 0.11217690259218216, + "learning_rate": 9.010462099820674e-06, + "loss": 0.4455, + "step": 242 + }, + { + "epoch": 1.49079754601227, + "grad_norm": 0.12041988223791122, + "learning_rate": 9.002137642877696e-06, + "loss": 0.4317, + "step": 243 + }, + { + "epoch": 1.4969325153374233, + "grad_norm": 0.128048375248909, + "learning_rate": 8.993782197217262e-06, + "loss": 0.4843, + "step": 244 + }, + { + "epoch": 1.5030674846625767, + "grad_norm": 0.12902449071407318, + "learning_rate": 8.985395827535934e-06, + "loss": 0.4224, + "step": 245 + }, + { + "epoch": 1.50920245398773, + "grad_norm": 0.1286553293466568, + "learning_rate": 8.976978598769719e-06, + "loss": 0.4613, + "step": 246 + }, + { + "epoch": 1.5153374233128836, + "grad_norm": 0.11924152076244354, + "learning_rate": 8.96853057609357e-06, + "loss": 0.4288, + "step": 247 + }, + { + "epoch": 1.521472392638037, + "grad_norm": 0.12467361986637115, + "learning_rate": 8.960051824920873e-06, + "loss": 0.4631, + "step": 248 + }, + { + "epoch": 1.5276073619631902, + "grad_norm": 0.12638317048549652, + "learning_rate": 8.951542410902949e-06, + "loss": 0.4331, + "step": 249 + }, + { + "epoch": 1.5337423312883436, + "grad_norm": 0.11676699668169022, + "learning_rate": 8.943002399928547e-06, + "loss": 0.4565, + "step": 250 + }, + { + "epoch": 1.539877300613497, + "grad_norm": 0.1417926698923111, + "learning_rate": 8.934431858123324e-06, + "loss": 0.4796, + "step": 251 + }, + { + "epoch": 1.5460122699386503, + "grad_norm": 0.1345888376235962, + "learning_rate": 8.925830851849338e-06, + "loss": 0.4658, + "step": 252 + }, + { + "epoch": 1.5521472392638036, + "grad_norm": 0.11821699142456055, + "learning_rate": 8.917199447704538e-06, + "loss": 0.4376, + "step": 253 + }, + { + "epoch": 1.558282208588957, + "grad_norm": 0.12282978743314743, + "learning_rate": 8.908537712522246e-06, + "loss": 0.4799, + "step": 254 + }, + { + "epoch": 1.5644171779141103, + "grad_norm": 0.1122986450791359, + "learning_rate": 8.899845713370632e-06, + "loss": 0.4423, + "step": 255 + }, + { + "epoch": 1.5705521472392638, + "grad_norm": 0.1364981234073639, + "learning_rate": 8.891123517552208e-06, + "loss": 0.4559, + "step": 256 + }, + { + "epoch": 1.5766871165644172, + "grad_norm": 0.14287355542182922, + "learning_rate": 8.882371192603297e-06, + "loss": 0.4446, + "step": 257 + }, + { + "epoch": 1.5828220858895705, + "grad_norm": 0.13158780336380005, + "learning_rate": 8.87358880629351e-06, + "loss": 0.4805, + "step": 258 + }, + { + "epoch": 1.588957055214724, + "grad_norm": 0.14378562569618225, + "learning_rate": 8.864776426625231e-06, + "loss": 0.4656, + "step": 259 + }, + { + "epoch": 1.5950920245398774, + "grad_norm": 0.1245376318693161, + "learning_rate": 8.855934121833083e-06, + "loss": 0.4425, + "step": 260 + }, + { + "epoch": 1.6012269938650308, + "grad_norm": 0.12470446527004242, + "learning_rate": 8.847061960383395e-06, + "loss": 0.4495, + "step": 261 + }, + { + "epoch": 1.607361963190184, + "grad_norm": 0.11994479596614838, + "learning_rate": 8.83816001097368e-06, + "loss": 0.4169, + "step": 262 + }, + { + "epoch": 1.6134969325153374, + "grad_norm": 0.13416039943695068, + "learning_rate": 8.8292283425321e-06, + "loss": 0.4749, + "step": 263 + }, + { + "epoch": 1.6196319018404908, + "grad_norm": 0.12816861271858215, + "learning_rate": 8.820267024216937e-06, + "loss": 0.4464, + "step": 264 + }, + { + "epoch": 1.6257668711656441, + "grad_norm": 0.1400822103023529, + "learning_rate": 8.811276125416048e-06, + "loss": 0.464, + "step": 265 + }, + { + "epoch": 1.6319018404907975, + "grad_norm": 0.1271728277206421, + "learning_rate": 8.802255715746333e-06, + "loss": 0.4798, + "step": 266 + }, + { + "epoch": 1.6380368098159508, + "grad_norm": 0.11546501517295837, + "learning_rate": 8.7932058650532e-06, + "loss": 0.4716, + "step": 267 + }, + { + "epoch": 1.6441717791411041, + "grad_norm": 0.12347917258739471, + "learning_rate": 8.784126643410015e-06, + "loss": 0.446, + "step": 268 + }, + { + "epoch": 1.6503067484662577, + "grad_norm": 0.1298205405473709, + "learning_rate": 8.775018121117569e-06, + "loss": 0.4569, + "step": 269 + }, + { + "epoch": 1.656441717791411, + "grad_norm": 0.12920261919498444, + "learning_rate": 8.765880368703527e-06, + "loss": 0.4946, + "step": 270 + }, + { + "epoch": 1.6625766871165644, + "grad_norm": 0.11939296126365662, + "learning_rate": 8.756713456921885e-06, + "loss": 0.4703, + "step": 271 + }, + { + "epoch": 1.668711656441718, + "grad_norm": 0.12618795037269592, + "learning_rate": 8.747517456752419e-06, + "loss": 0.4665, + "step": 272 + }, + { + "epoch": 1.6748466257668713, + "grad_norm": 0.1314852237701416, + "learning_rate": 8.73829243940014e-06, + "loss": 0.4577, + "step": 273 + }, + { + "epoch": 1.6809815950920246, + "grad_norm": 0.14323115348815918, + "learning_rate": 8.729038476294737e-06, + "loss": 0.428, + "step": 274 + }, + { + "epoch": 1.687116564417178, + "grad_norm": 0.1325852870941162, + "learning_rate": 8.719755639090032e-06, + "loss": 0.443, + "step": 275 + }, + { + "epoch": 1.6932515337423313, + "grad_norm": 0.12698835134506226, + "learning_rate": 8.710443999663417e-06, + "loss": 0.5045, + "step": 276 + }, + { + "epoch": 1.6993865030674846, + "grad_norm": 0.12905320525169373, + "learning_rate": 8.701103630115303e-06, + "loss": 0.4664, + "step": 277 + }, + { + "epoch": 1.705521472392638, + "grad_norm": 0.12309697270393372, + "learning_rate": 8.691734602768554e-06, + "loss": 0.4227, + "step": 278 + }, + { + "epoch": 1.7116564417177913, + "grad_norm": 0.12686602771282196, + "learning_rate": 8.68233699016794e-06, + "loss": 0.4878, + "step": 279 + }, + { + "epoch": 1.7177914110429446, + "grad_norm": 0.13946817815303802, + "learning_rate": 8.672910865079564e-06, + "loss": 0.4487, + "step": 280 + }, + { + "epoch": 1.7239263803680982, + "grad_norm": 0.13267400860786438, + "learning_rate": 8.663456300490302e-06, + "loss": 0.4481, + "step": 281 + }, + { + "epoch": 1.7300613496932515, + "grad_norm": 0.12559852004051208, + "learning_rate": 8.65397336960724e-06, + "loss": 0.449, + "step": 282 + }, + { + "epoch": 1.7361963190184049, + "grad_norm": 0.1440388262271881, + "learning_rate": 8.644462145857104e-06, + "loss": 0.4466, + "step": 283 + }, + { + "epoch": 1.7423312883435584, + "grad_norm": 0.1397363543510437, + "learning_rate": 8.634922702885693e-06, + "loss": 0.4113, + "step": 284 + }, + { + "epoch": 1.7484662576687118, + "grad_norm": 0.13909780979156494, + "learning_rate": 8.62535511455731e-06, + "loss": 0.4504, + "step": 285 + }, + { + "epoch": 1.7546012269938651, + "grad_norm": 0.128562331199646, + "learning_rate": 8.615759454954187e-06, + "loss": 0.4593, + "step": 286 + }, + { + "epoch": 1.7607361963190185, + "grad_norm": 0.12467021495103836, + "learning_rate": 8.60613579837591e-06, + "loss": 0.4728, + "step": 287 + }, + { + "epoch": 1.7668711656441718, + "grad_norm": 0.14602142572402954, + "learning_rate": 8.596484219338856e-06, + "loss": 0.4732, + "step": 288 + }, + { + "epoch": 1.7730061349693251, + "grad_norm": 0.14744794368743896, + "learning_rate": 8.586804792575596e-06, + "loss": 0.4308, + "step": 289 + }, + { + "epoch": 1.7791411042944785, + "grad_norm": 0.12893109023571014, + "learning_rate": 8.577097593034338e-06, + "loss": 0.4589, + "step": 290 + }, + { + "epoch": 1.7852760736196318, + "grad_norm": 0.1237974613904953, + "learning_rate": 8.567362695878325e-06, + "loss": 0.477, + "step": 291 + }, + { + "epoch": 1.7914110429447851, + "grad_norm": 0.12805530428886414, + "learning_rate": 8.55760017648527e-06, + "loss": 0.4544, + "step": 292 + }, + { + "epoch": 1.7975460122699385, + "grad_norm": 0.1318630576133728, + "learning_rate": 8.547810110446766e-06, + "loss": 0.4428, + "step": 293 + }, + { + "epoch": 1.803680981595092, + "grad_norm": 0.13845506310462952, + "learning_rate": 8.537992573567698e-06, + "loss": 0.4704, + "step": 294 + }, + { + "epoch": 1.8098159509202454, + "grad_norm": 0.14310558140277863, + "learning_rate": 8.528147641865661e-06, + "loss": 0.4534, + "step": 295 + }, + { + "epoch": 1.8159509202453987, + "grad_norm": 0.14174222946166992, + "learning_rate": 8.518275391570368e-06, + "loss": 0.435, + "step": 296 + }, + { + "epoch": 1.8220858895705523, + "grad_norm": 0.14490178227424622, + "learning_rate": 8.508375899123064e-06, + "loss": 0.4255, + "step": 297 + }, + { + "epoch": 1.8282208588957056, + "grad_norm": 0.13574454188346863, + "learning_rate": 8.498449241175927e-06, + "loss": 0.4384, + "step": 298 + }, + { + "epoch": 1.834355828220859, + "grad_norm": 0.12258850038051605, + "learning_rate": 8.488495494591482e-06, + "loss": 0.4114, + "step": 299 + }, + { + "epoch": 1.8404907975460123, + "grad_norm": 0.13615848124027252, + "learning_rate": 8.478514736441998e-06, + "loss": 0.4566, + "step": 300 + }, + { + "epoch": 1.8466257668711656, + "grad_norm": 0.1377963125705719, + "learning_rate": 8.468507044008902e-06, + "loss": 0.4452, + "step": 301 + }, + { + "epoch": 1.852760736196319, + "grad_norm": 0.13099685311317444, + "learning_rate": 8.458472494782169e-06, + "loss": 0.432, + "step": 302 + }, + { + "epoch": 1.8588957055214723, + "grad_norm": 0.13411100208759308, + "learning_rate": 8.44841116645973e-06, + "loss": 0.4551, + "step": 303 + }, + { + "epoch": 1.8650306748466257, + "grad_norm": 0.13292278349399567, + "learning_rate": 8.438323136946865e-06, + "loss": 0.4757, + "step": 304 + }, + { + "epoch": 1.871165644171779, + "grad_norm": 0.12634415924549103, + "learning_rate": 8.428208484355606e-06, + "loss": 0.4427, + "step": 305 + }, + { + "epoch": 1.8773006134969326, + "grad_norm": 0.147501140832901, + "learning_rate": 8.418067287004125e-06, + "loss": 0.4714, + "step": 306 + }, + { + "epoch": 1.883435582822086, + "grad_norm": 0.14923636615276337, + "learning_rate": 8.407899623416136e-06, + "loss": 0.4752, + "step": 307 + }, + { + "epoch": 1.8895705521472392, + "grad_norm": 0.126709446310997, + "learning_rate": 8.397705572320275e-06, + "loss": 0.476, + "step": 308 + }, + { + "epoch": 1.8957055214723928, + "grad_norm": 0.12852588295936584, + "learning_rate": 8.387485212649505e-06, + "loss": 0.4335, + "step": 309 + }, + { + "epoch": 1.9018404907975461, + "grad_norm": 0.13321667909622192, + "learning_rate": 8.377238623540491e-06, + "loss": 0.4517, + "step": 310 + }, + { + "epoch": 1.9079754601226995, + "grad_norm": 0.14450277388095856, + "learning_rate": 8.366965884333001e-06, + "loss": 0.4354, + "step": 311 + }, + { + "epoch": 1.9141104294478528, + "grad_norm": 0.12820766866207123, + "learning_rate": 8.356667074569274e-06, + "loss": 0.4088, + "step": 312 + }, + { + "epoch": 1.9202453987730062, + "grad_norm": 0.13799835741519928, + "learning_rate": 8.346342273993427e-06, + "loss": 0.47, + "step": 313 + }, + { + "epoch": 1.9263803680981595, + "grad_norm": 0.1558215469121933, + "learning_rate": 8.335991562550813e-06, + "loss": 0.4682, + "step": 314 + }, + { + "epoch": 1.9325153374233128, + "grad_norm": 0.14499524235725403, + "learning_rate": 8.325615020387422e-06, + "loss": 0.4896, + "step": 315 + }, + { + "epoch": 1.9386503067484662, + "grad_norm": 0.1416790634393692, + "learning_rate": 8.31521272784925e-06, + "loss": 0.4531, + "step": 316 + }, + { + "epoch": 1.9447852760736195, + "grad_norm": 0.1342548131942749, + "learning_rate": 8.304784765481676e-06, + "loss": 0.4403, + "step": 317 + }, + { + "epoch": 1.9509202453987728, + "grad_norm": 0.1562686264514923, + "learning_rate": 8.294331214028845e-06, + "loss": 0.4244, + "step": 318 + }, + { + "epoch": 1.9570552147239264, + "grad_norm": 0.13778996467590332, + "learning_rate": 8.283852154433042e-06, + "loss": 0.484, + "step": 319 + }, + { + "epoch": 1.9631901840490797, + "grad_norm": 0.14441350102424622, + "learning_rate": 8.273347667834057e-06, + "loss": 0.4534, + "step": 320 + }, + { + "epoch": 1.969325153374233, + "grad_norm": 0.15606369078159332, + "learning_rate": 8.262817835568563e-06, + "loss": 0.4403, + "step": 321 + }, + { + "epoch": 1.9754601226993866, + "grad_norm": 0.12662853300571442, + "learning_rate": 8.25226273916949e-06, + "loss": 0.4673, + "step": 322 + }, + { + "epoch": 1.98159509202454, + "grad_norm": 0.13923388719558716, + "learning_rate": 8.241682460365383e-06, + "loss": 0.4613, + "step": 323 + }, + { + "epoch": 1.9877300613496933, + "grad_norm": 0.13272419571876526, + "learning_rate": 8.231077081079781e-06, + "loss": 0.4716, + "step": 324 + }, + { + "epoch": 1.9938650306748467, + "grad_norm": 0.146401509642601, + "learning_rate": 8.220446683430577e-06, + "loss": 0.4427, + "step": 325 + }, + { + "epoch": 2.0, + "grad_norm": 0.15548086166381836, + "learning_rate": 8.209791349729376e-06, + "loss": 0.4837, + "step": 326 + }, + { + "epoch": 2.0061349693251533, + "grad_norm": 0.14179112017154694, + "learning_rate": 8.199111162480871e-06, + "loss": 0.4326, + "step": 327 + }, + { + "epoch": 2.0122699386503067, + "grad_norm": 0.15060554444789886, + "learning_rate": 8.188406204382192e-06, + "loss": 0.4513, + "step": 328 + }, + { + "epoch": 2.01840490797546, + "grad_norm": 0.1416303813457489, + "learning_rate": 8.177676558322274e-06, + "loss": 0.4639, + "step": 329 + }, + { + "epoch": 2.0245398773006134, + "grad_norm": 0.15585218369960785, + "learning_rate": 8.16692230738121e-06, + "loss": 0.4315, + "step": 330 + }, + { + "epoch": 2.0306748466257667, + "grad_norm": 0.1324591487646103, + "learning_rate": 8.15614353482961e-06, + "loss": 0.4231, + "step": 331 + }, + { + "epoch": 2.03680981595092, + "grad_norm": 0.1416635811328888, + "learning_rate": 8.145340324127958e-06, + "loss": 0.4459, + "step": 332 + }, + { + "epoch": 2.042944785276074, + "grad_norm": 0.13037635385990143, + "learning_rate": 8.134512758925958e-06, + "loss": 0.4439, + "step": 333 + }, + { + "epoch": 2.049079754601227, + "grad_norm": 0.1375626176595688, + "learning_rate": 8.123660923061902e-06, + "loss": 0.4516, + "step": 334 + }, + { + "epoch": 2.0552147239263805, + "grad_norm": 0.1339251846075058, + "learning_rate": 8.112784900561997e-06, + "loss": 0.4357, + "step": 335 + }, + { + "epoch": 2.061349693251534, + "grad_norm": 0.14214186370372772, + "learning_rate": 8.10188477563974e-06, + "loss": 0.4511, + "step": 336 + }, + { + "epoch": 2.067484662576687, + "grad_norm": 0.1451665312051773, + "learning_rate": 8.090960632695246e-06, + "loss": 0.4122, + "step": 337 + }, + { + "epoch": 2.0736196319018405, + "grad_norm": 0.1382775753736496, + "learning_rate": 8.080012556314611e-06, + "loss": 0.4119, + "step": 338 + }, + { + "epoch": 2.079754601226994, + "grad_norm": 0.13937808573246002, + "learning_rate": 8.069040631269239e-06, + "loss": 0.4312, + "step": 339 + }, + { + "epoch": 2.085889570552147, + "grad_norm": 0.15605536103248596, + "learning_rate": 8.058044942515204e-06, + "loss": 0.4578, + "step": 340 + }, + { + "epoch": 2.0920245398773005, + "grad_norm": 0.13876931369304657, + "learning_rate": 8.047025575192576e-06, + "loss": 0.4444, + "step": 341 + }, + { + "epoch": 2.098159509202454, + "grad_norm": 0.1334417313337326, + "learning_rate": 8.035982614624774e-06, + "loss": 0.4576, + "step": 342 + }, + { + "epoch": 2.104294478527607, + "grad_norm": 0.14770197868347168, + "learning_rate": 8.024916146317896e-06, + "loss": 0.4584, + "step": 343 + }, + { + "epoch": 2.1104294478527605, + "grad_norm": 0.1419200599193573, + "learning_rate": 8.013826255960069e-06, + "loss": 0.4213, + "step": 344 + }, + { + "epoch": 2.116564417177914, + "grad_norm": 0.14547352492809296, + "learning_rate": 8.00271302942077e-06, + "loss": 0.4589, + "step": 345 + }, + { + "epoch": 2.1226993865030677, + "grad_norm": 0.15418393909931183, + "learning_rate": 7.991576552750173e-06, + "loss": 0.4393, + "step": 346 + }, + { + "epoch": 2.128834355828221, + "grad_norm": 0.1524142473936081, + "learning_rate": 7.980416912178478e-06, + "loss": 0.4594, + "step": 347 + }, + { + "epoch": 2.1349693251533743, + "grad_norm": 0.21241053938865662, + "learning_rate": 7.969234194115245e-06, + "loss": 0.4578, + "step": 348 + }, + { + "epoch": 2.1411042944785277, + "grad_norm": 0.15679003298282623, + "learning_rate": 7.95802848514872e-06, + "loss": 0.4427, + "step": 349 + }, + { + "epoch": 2.147239263803681, + "grad_norm": 0.14540907740592957, + "learning_rate": 7.946799872045173e-06, + "loss": 0.4388, + "step": 350 + }, + { + "epoch": 2.1533742331288344, + "grad_norm": 0.14929597079753876, + "learning_rate": 7.935548441748221e-06, + "loss": 0.4787, + "step": 351 + }, + { + "epoch": 2.1595092024539877, + "grad_norm": 0.1697789877653122, + "learning_rate": 7.924274281378153e-06, + "loss": 0.4433, + "step": 352 + }, + { + "epoch": 2.165644171779141, + "grad_norm": 0.15002626180648804, + "learning_rate": 7.912977478231262e-06, + "loss": 0.4689, + "step": 353 + }, + { + "epoch": 2.1717791411042944, + "grad_norm": 0.15629152953624725, + "learning_rate": 7.90165811977916e-06, + "loss": 0.445, + "step": 354 + }, + { + "epoch": 2.1779141104294477, + "grad_norm": 0.14952926337718964, + "learning_rate": 7.890316293668108e-06, + "loss": 0.452, + "step": 355 + }, + { + "epoch": 2.184049079754601, + "grad_norm": 0.18062762916088104, + "learning_rate": 7.878952087718336e-06, + "loss": 0.4675, + "step": 356 + }, + { + "epoch": 2.190184049079755, + "grad_norm": 0.13083772361278534, + "learning_rate": 7.867565589923364e-06, + "loss": 0.4529, + "step": 357 + }, + { + "epoch": 2.196319018404908, + "grad_norm": 0.3461189270019531, + "learning_rate": 7.856156888449312e-06, + "loss": 0.4596, + "step": 358 + }, + { + "epoch": 2.2024539877300615, + "grad_norm": 0.1565181016921997, + "learning_rate": 7.844726071634228e-06, + "loss": 0.4523, + "step": 359 + }, + { + "epoch": 2.208588957055215, + "grad_norm": 0.1486126184463501, + "learning_rate": 7.8332732279874e-06, + "loss": 0.4408, + "step": 360 + }, + { + "epoch": 2.214723926380368, + "grad_norm": 0.161222904920578, + "learning_rate": 7.82179844618867e-06, + "loss": 0.4455, + "step": 361 + }, + { + "epoch": 2.2208588957055215, + "grad_norm": 0.14370033144950867, + "learning_rate": 7.810301815087753e-06, + "loss": 0.4628, + "step": 362 + }, + { + "epoch": 2.226993865030675, + "grad_norm": 0.1402759701013565, + "learning_rate": 7.798783423703535e-06, + "loss": 0.4103, + "step": 363 + }, + { + "epoch": 2.233128834355828, + "grad_norm": 0.16180723905563354, + "learning_rate": 7.787243361223397e-06, + "loss": 0.4501, + "step": 364 + }, + { + "epoch": 2.2392638036809815, + "grad_norm": 0.14314652979373932, + "learning_rate": 7.775681717002523e-06, + "loss": 0.4504, + "step": 365 + }, + { + "epoch": 2.245398773006135, + "grad_norm": 0.14083176851272583, + "learning_rate": 7.764098580563203e-06, + "loss": 0.4505, + "step": 366 + }, + { + "epoch": 2.2515337423312882, + "grad_norm": 0.14769653975963593, + "learning_rate": 7.75249404159414e-06, + "loss": 0.4289, + "step": 367 + }, + { + "epoch": 2.2576687116564416, + "grad_norm": 0.17489399015903473, + "learning_rate": 7.740868189949762e-06, + "loss": 0.4586, + "step": 368 + }, + { + "epoch": 2.263803680981595, + "grad_norm": 0.15733091533184052, + "learning_rate": 7.729221115649516e-06, + "loss": 0.4499, + "step": 369 + }, + { + "epoch": 2.2699386503067487, + "grad_norm": 0.1377793848514557, + "learning_rate": 7.717552908877185e-06, + "loss": 0.441, + "step": 370 + }, + { + "epoch": 2.276073619631902, + "grad_norm": 0.13807876408100128, + "learning_rate": 7.705863659980175e-06, + "loss": 0.4457, + "step": 371 + }, + { + "epoch": 2.2822085889570554, + "grad_norm": 0.14367403090000153, + "learning_rate": 7.694153459468822e-06, + "loss": 0.4598, + "step": 372 + }, + { + "epoch": 2.2883435582822087, + "grad_norm": 0.1540713906288147, + "learning_rate": 7.682422398015696e-06, + "loss": 0.4124, + "step": 373 + }, + { + "epoch": 2.294478527607362, + "grad_norm": 0.15163351595401764, + "learning_rate": 7.67067056645489e-06, + "loss": 0.4608, + "step": 374 + }, + { + "epoch": 2.3006134969325154, + "grad_norm": 0.13935914635658264, + "learning_rate": 7.658898055781326e-06, + "loss": 0.4198, + "step": 375 + }, + { + "epoch": 2.3067484662576687, + "grad_norm": 0.15032264590263367, + "learning_rate": 7.647104957150037e-06, + "loss": 0.4414, + "step": 376 + }, + { + "epoch": 2.312883435582822, + "grad_norm": 0.16133847832679749, + "learning_rate": 7.635291361875474e-06, + "loss": 0.4228, + "step": 377 + }, + { + "epoch": 2.3190184049079754, + "grad_norm": 0.16057458519935608, + "learning_rate": 7.623457361430798e-06, + "loss": 0.4763, + "step": 378 + }, + { + "epoch": 2.3251533742331287, + "grad_norm": 0.1559583991765976, + "learning_rate": 7.611603047447161e-06, + "loss": 0.4191, + "step": 379 + }, + { + "epoch": 2.331288343558282, + "grad_norm": 0.16082173585891724, + "learning_rate": 7.5997285117130095e-06, + "loss": 0.4454, + "step": 380 + }, + { + "epoch": 2.3374233128834354, + "grad_norm": 0.14822718501091003, + "learning_rate": 7.587833846173363e-06, + "loss": 0.433, + "step": 381 + }, + { + "epoch": 2.3435582822085887, + "grad_norm": 0.16007304191589355, + "learning_rate": 7.57591914292911e-06, + "loss": 0.4177, + "step": 382 + }, + { + "epoch": 2.3496932515337425, + "grad_norm": 0.15621638298034668, + "learning_rate": 7.5639844942362915e-06, + "loss": 0.4429, + "step": 383 + }, + { + "epoch": 2.355828220858896, + "grad_norm": 0.1551215648651123, + "learning_rate": 7.552029992505385e-06, + "loss": 0.4545, + "step": 384 + }, + { + "epoch": 2.361963190184049, + "grad_norm": 0.15754824876785278, + "learning_rate": 7.540055730300595e-06, + "loss": 0.4292, + "step": 385 + }, + { + "epoch": 2.3680981595092025, + "grad_norm": 0.14232178032398224, + "learning_rate": 7.528061800339127e-06, + "loss": 0.4494, + "step": 386 + }, + { + "epoch": 2.374233128834356, + "grad_norm": 0.1486847698688507, + "learning_rate": 7.516048295490479e-06, + "loss": 0.4478, + "step": 387 + }, + { + "epoch": 2.3803680981595092, + "grad_norm": 0.1655314713716507, + "learning_rate": 7.504015308775714e-06, + "loss": 0.4309, + "step": 388 + }, + { + "epoch": 2.3865030674846626, + "grad_norm": 0.14602237939834595, + "learning_rate": 7.491962933366748e-06, + "loss": 0.4472, + "step": 389 + }, + { + "epoch": 2.392638036809816, + "grad_norm": 0.14621604979038239, + "learning_rate": 7.479891262585623e-06, + "loss": 0.4567, + "step": 390 + }, + { + "epoch": 2.3987730061349692, + "grad_norm": 0.1583264321088791, + "learning_rate": 7.467800389903786e-06, + "loss": 0.45, + "step": 391 + }, + { + "epoch": 2.4049079754601226, + "grad_norm": 0.17010322213172913, + "learning_rate": 7.455690408941363e-06, + "loss": 0.4426, + "step": 392 + }, + { + "epoch": 2.411042944785276, + "grad_norm": 0.16225053369998932, + "learning_rate": 7.443561413466439e-06, + "loss": 0.421, + "step": 393 + }, + { + "epoch": 2.4171779141104293, + "grad_norm": 0.20231634378433228, + "learning_rate": 7.431413497394328e-06, + "loss": 0.447, + "step": 394 + }, + { + "epoch": 2.4233128834355826, + "grad_norm": 0.1540592461824417, + "learning_rate": 7.419246754786847e-06, + "loss": 0.463, + "step": 395 + }, + { + "epoch": 2.4294478527607364, + "grad_norm": 0.15883290767669678, + "learning_rate": 7.407061279851589e-06, + "loss": 0.4636, + "step": 396 + }, + { + "epoch": 2.4355828220858897, + "grad_norm": 0.15072090923786163, + "learning_rate": 7.394857166941187e-06, + "loss": 0.4539, + "step": 397 + }, + { + "epoch": 2.441717791411043, + "grad_norm": 0.14263494312763214, + "learning_rate": 7.382634510552596e-06, + "loss": 0.44, + "step": 398 + }, + { + "epoch": 2.4478527607361964, + "grad_norm": 0.15457823872566223, + "learning_rate": 7.370393405326351e-06, + "loss": 0.4534, + "step": 399 + }, + { + "epoch": 2.4539877300613497, + "grad_norm": 0.1526721864938736, + "learning_rate": 7.358133946045834e-06, + "loss": 0.4551, + "step": 400 + }, + { + "epoch": 2.460122699386503, + "grad_norm": 0.14920799434185028, + "learning_rate": 7.345856227636548e-06, + "loss": 0.4593, + "step": 401 + }, + { + "epoch": 2.4662576687116564, + "grad_norm": 0.1798851191997528, + "learning_rate": 7.333560345165371e-06, + "loss": 0.4074, + "step": 402 + }, + { + "epoch": 2.4723926380368098, + "grad_norm": 0.1653115600347519, + "learning_rate": 7.321246393839836e-06, + "loss": 0.44, + "step": 403 + }, + { + "epoch": 2.478527607361963, + "grad_norm": 0.13865536451339722, + "learning_rate": 7.308914469007372e-06, + "loss": 0.4464, + "step": 404 + }, + { + "epoch": 2.4846625766871164, + "grad_norm": 0.16312679648399353, + "learning_rate": 7.296564666154589e-06, + "loss": 0.4617, + "step": 405 + }, + { + "epoch": 2.4907975460122698, + "grad_norm": 0.1640763282775879, + "learning_rate": 7.284197080906517e-06, + "loss": 0.4744, + "step": 406 + }, + { + "epoch": 2.4969325153374236, + "grad_norm": 0.17815761268138885, + "learning_rate": 7.271811809025882e-06, + "loss": 0.4404, + "step": 407 + }, + { + "epoch": 2.5030674846625764, + "grad_norm": 0.15390528738498688, + "learning_rate": 7.259408946412359e-06, + "loss": 0.4353, + "step": 408 + }, + { + "epoch": 2.5092024539877302, + "grad_norm": 0.15139776468276978, + "learning_rate": 7.246988589101825e-06, + "loss": 0.4763, + "step": 409 + }, + { + "epoch": 2.5153374233128836, + "grad_norm": 0.15306057035923004, + "learning_rate": 7.234550833265621e-06, + "loss": 0.4415, + "step": 410 + }, + { + "epoch": 2.521472392638037, + "grad_norm": 0.1511690318584442, + "learning_rate": 7.222095775209805e-06, + "loss": 0.4316, + "step": 411 + }, + { + "epoch": 2.5276073619631902, + "grad_norm": 0.15759505331516266, + "learning_rate": 7.209623511374407e-06, + "loss": 0.4321, + "step": 412 + }, + { + "epoch": 2.5337423312883436, + "grad_norm": 0.15876421332359314, + "learning_rate": 7.197134138332684e-06, + "loss": 0.4431, + "step": 413 + }, + { + "epoch": 2.539877300613497, + "grad_norm": 0.15190748870372772, + "learning_rate": 7.184627752790368e-06, + "loss": 0.4498, + "step": 414 + }, + { + "epoch": 2.5460122699386503, + "grad_norm": 0.1476791799068451, + "learning_rate": 7.1721044515849165e-06, + "loss": 0.4314, + "step": 415 + }, + { + "epoch": 2.5521472392638036, + "grad_norm": 0.1838875710964203, + "learning_rate": 7.159564331684774e-06, + "loss": 0.4257, + "step": 416 + }, + { + "epoch": 2.558282208588957, + "grad_norm": 0.15660858154296875, + "learning_rate": 7.1470074901886065e-06, + "loss": 0.4575, + "step": 417 + }, + { + "epoch": 2.5644171779141103, + "grad_norm": 0.15932102501392365, + "learning_rate": 7.134434024324557e-06, + "loss": 0.4338, + "step": 418 + }, + { + "epoch": 2.5705521472392636, + "grad_norm": 0.16551874577999115, + "learning_rate": 7.121844031449491e-06, + "loss": 0.4746, + "step": 419 + }, + { + "epoch": 2.5766871165644174, + "grad_norm": 0.16659101843833923, + "learning_rate": 7.109237609048247e-06, + "loss": 0.4983, + "step": 420 + }, + { + "epoch": 2.5828220858895703, + "grad_norm": 0.14108383655548096, + "learning_rate": 7.096614854732873e-06, + "loss": 0.4698, + "step": 421 + }, + { + "epoch": 2.588957055214724, + "grad_norm": 0.14742513000965118, + "learning_rate": 7.083975866241881e-06, + "loss": 0.4466, + "step": 422 + }, + { + "epoch": 2.5950920245398774, + "grad_norm": 0.15716223418712616, + "learning_rate": 7.071320741439481e-06, + "loss": 0.4671, + "step": 423 + }, + { + "epoch": 2.6012269938650308, + "grad_norm": 0.1822831928730011, + "learning_rate": 7.058649578314828e-06, + "loss": 0.4133, + "step": 424 + }, + { + "epoch": 2.607361963190184, + "grad_norm": 0.16716191172599792, + "learning_rate": 7.045962474981261e-06, + "loss": 0.4084, + "step": 425 + }, + { + "epoch": 2.6134969325153374, + "grad_norm": 0.1516677588224411, + "learning_rate": 7.03325952967555e-06, + "loss": 0.4342, + "step": 426 + }, + { + "epoch": 2.6196319018404908, + "grad_norm": 0.15387603640556335, + "learning_rate": 7.020540840757124e-06, + "loss": 0.4211, + "step": 427 + }, + { + "epoch": 2.625766871165644, + "grad_norm": 0.1543998122215271, + "learning_rate": 7.007806506707319e-06, + "loss": 0.4273, + "step": 428 + }, + { + "epoch": 2.6319018404907975, + "grad_norm": 0.15574420988559723, + "learning_rate": 6.995056626128609e-06, + "loss": 0.4458, + "step": 429 + }, + { + "epoch": 2.638036809815951, + "grad_norm": 0.1659959852695465, + "learning_rate": 6.982291297743848e-06, + "loss": 0.4592, + "step": 430 + }, + { + "epoch": 2.644171779141104, + "grad_norm": 0.18856996297836304, + "learning_rate": 6.969510620395503e-06, + "loss": 0.4552, + "step": 431 + }, + { + "epoch": 2.6503067484662575, + "grad_norm": 0.14291857182979584, + "learning_rate": 6.956714693044888e-06, + "loss": 0.4204, + "step": 432 + }, + { + "epoch": 2.6564417177914113, + "grad_norm": 0.17281392216682434, + "learning_rate": 6.943903614771397e-06, + "loss": 0.4571, + "step": 433 + }, + { + "epoch": 2.662576687116564, + "grad_norm": 0.16019487380981445, + "learning_rate": 6.931077484771739e-06, + "loss": 0.4563, + "step": 434 + }, + { + "epoch": 2.668711656441718, + "grad_norm": 0.16260547935962677, + "learning_rate": 6.9182364023591706e-06, + "loss": 0.425, + "step": 435 + }, + { + "epoch": 2.6748466257668713, + "grad_norm": 0.1593744456768036, + "learning_rate": 6.905380466962726e-06, + "loss": 0.4211, + "step": 436 + }, + { + "epoch": 2.6809815950920246, + "grad_norm": 0.16132207214832306, + "learning_rate": 6.892509778126442e-06, + "loss": 0.4377, + "step": 437 + }, + { + "epoch": 2.687116564417178, + "grad_norm": 0.15323421359062195, + "learning_rate": 6.879624435508596e-06, + "loss": 0.4284, + "step": 438 + }, + { + "epoch": 2.6932515337423313, + "grad_norm": 0.1697331666946411, + "learning_rate": 6.866724538880931e-06, + "loss": 0.4341, + "step": 439 + }, + { + "epoch": 2.6993865030674846, + "grad_norm": 0.1691737174987793, + "learning_rate": 6.85381018812788e-06, + "loss": 0.4907, + "step": 440 + }, + { + "epoch": 2.705521472392638, + "grad_norm": 0.1564476490020752, + "learning_rate": 6.840881483245797e-06, + "loss": 0.4133, + "step": 441 + }, + { + "epoch": 2.7116564417177913, + "grad_norm": 0.15074561536312103, + "learning_rate": 6.827938524342175e-06, + "loss": 0.4424, + "step": 442 + }, + { + "epoch": 2.7177914110429446, + "grad_norm": 0.15723863244056702, + "learning_rate": 6.814981411634885e-06, + "loss": 0.4469, + "step": 443 + }, + { + "epoch": 2.7239263803680984, + "grad_norm": 0.15343016386032104, + "learning_rate": 6.802010245451382e-06, + "loss": 0.4495, + "step": 444 + }, + { + "epoch": 2.7300613496932513, + "grad_norm": 0.14901430904865265, + "learning_rate": 6.789025126227948e-06, + "loss": 0.4332, + "step": 445 + }, + { + "epoch": 2.736196319018405, + "grad_norm": 0.18850690126419067, + "learning_rate": 6.7760261545088955e-06, + "loss": 0.4741, + "step": 446 + }, + { + "epoch": 2.7423312883435584, + "grad_norm": 0.18473871052265167, + "learning_rate": 6.763013430945803e-06, + "loss": 0.4579, + "step": 447 + }, + { + "epoch": 2.7484662576687118, + "grad_norm": 0.16771845519542694, + "learning_rate": 6.749987056296728e-06, + "loss": 0.4281, + "step": 448 + }, + { + "epoch": 2.754601226993865, + "grad_norm": 0.15645870566368103, + "learning_rate": 6.736947131425423e-06, + "loss": 0.4399, + "step": 449 + }, + { + "epoch": 2.7607361963190185, + "grad_norm": 0.15851353108882904, + "learning_rate": 6.723893757300572e-06, + "loss": 0.4245, + "step": 450 + }, + { + "epoch": 2.766871165644172, + "grad_norm": 0.15412819385528564, + "learning_rate": 6.710827034994991e-06, + "loss": 0.457, + "step": 451 + }, + { + "epoch": 2.773006134969325, + "grad_norm": 0.21733322739601135, + "learning_rate": 6.697747065684851e-06, + "loss": 0.4235, + "step": 452 + }, + { + "epoch": 2.7791411042944785, + "grad_norm": 0.1558465212583542, + "learning_rate": 6.684653950648893e-06, + "loss": 0.3966, + "step": 453 + }, + { + "epoch": 2.785276073619632, + "grad_norm": 0.1692306250333786, + "learning_rate": 6.671547791267652e-06, + "loss": 0.469, + "step": 454 + }, + { + "epoch": 2.791411042944785, + "grad_norm": 0.1569252610206604, + "learning_rate": 6.658428689022661e-06, + "loss": 0.4432, + "step": 455 + }, + { + "epoch": 2.7975460122699385, + "grad_norm": 0.17490547895431519, + "learning_rate": 6.6452967454956744e-06, + "loss": 0.454, + "step": 456 + }, + { + "epoch": 2.8036809815950923, + "grad_norm": 0.16125169396400452, + "learning_rate": 6.632152062367871e-06, + "loss": 0.4699, + "step": 457 + }, + { + "epoch": 2.809815950920245, + "grad_norm": 0.16368430852890015, + "learning_rate": 6.618994741419078e-06, + "loss": 0.4194, + "step": 458 + }, + { + "epoch": 2.815950920245399, + "grad_norm": 0.1626632660627365, + "learning_rate": 6.605824884526978e-06, + "loss": 0.4289, + "step": 459 + }, + { + "epoch": 2.8220858895705523, + "grad_norm": 0.17192265391349792, + "learning_rate": 6.592642593666316e-06, + "loss": 0.4629, + "step": 460 + }, + { + "epoch": 2.8282208588957056, + "grad_norm": 0.15331393480300903, + "learning_rate": 6.579447970908115e-06, + "loss": 0.4261, + "step": 461 + }, + { + "epoch": 2.834355828220859, + "grad_norm": 0.16751109063625336, + "learning_rate": 6.566241118418888e-06, + "loss": 0.4398, + "step": 462 + }, + { + "epoch": 2.8404907975460123, + "grad_norm": 0.20704393088817596, + "learning_rate": 6.553022138459839e-06, + "loss": 0.4196, + "step": 463 + }, + { + "epoch": 2.8466257668711656, + "grad_norm": 0.17874981462955475, + "learning_rate": 6.539791133386077e-06, + "loss": 0.4542, + "step": 464 + }, + { + "epoch": 2.852760736196319, + "grad_norm": 0.1628667265176773, + "learning_rate": 6.526548205645823e-06, + "loss": 0.4278, + "step": 465 + }, + { + "epoch": 2.8588957055214723, + "grad_norm": 0.18325841426849365, + "learning_rate": 6.513293457779614e-06, + "loss": 0.4448, + "step": 466 + }, + { + "epoch": 2.8650306748466257, + "grad_norm": 0.17128658294677734, + "learning_rate": 6.50002699241951e-06, + "loss": 0.4778, + "step": 467 + }, + { + "epoch": 2.871165644171779, + "grad_norm": 0.18761861324310303, + "learning_rate": 6.486748912288305e-06, + "loss": 0.4313, + "step": 468 + }, + { + "epoch": 2.8773006134969323, + "grad_norm": 0.16829295456409454, + "learning_rate": 6.4734593201987205e-06, + "loss": 0.4589, + "step": 469 + }, + { + "epoch": 2.883435582822086, + "grad_norm": 0.16941910982131958, + "learning_rate": 6.46015831905262e-06, + "loss": 0.4844, + "step": 470 + }, + { + "epoch": 2.889570552147239, + "grad_norm": 0.18299958109855652, + "learning_rate": 6.446846011840204e-06, + "loss": 0.4204, + "step": 471 + }, + { + "epoch": 2.895705521472393, + "grad_norm": 0.1755492389202118, + "learning_rate": 6.43352250163922e-06, + "loss": 0.4265, + "step": 472 + }, + { + "epoch": 2.901840490797546, + "grad_norm": 0.17342810332775116, + "learning_rate": 6.420187891614158e-06, + "loss": 0.4314, + "step": 473 + }, + { + "epoch": 2.9079754601226995, + "grad_norm": 0.15891548991203308, + "learning_rate": 6.406842285015455e-06, + "loss": 0.4188, + "step": 474 + }, + { + "epoch": 2.914110429447853, + "grad_norm": 0.17201867699623108, + "learning_rate": 6.393485785178699e-06, + "loss": 0.4459, + "step": 475 + }, + { + "epoch": 2.920245398773006, + "grad_norm": 0.16666850447654724, + "learning_rate": 6.380118495523816e-06, + "loss": 0.437, + "step": 476 + }, + { + "epoch": 2.9263803680981595, + "grad_norm": 0.17189852893352509, + "learning_rate": 6.366740519554286e-06, + "loss": 0.4263, + "step": 477 + }, + { + "epoch": 2.932515337423313, + "grad_norm": 0.16016732156276703, + "learning_rate": 6.353351960856332e-06, + "loss": 0.4358, + "step": 478 + }, + { + "epoch": 2.938650306748466, + "grad_norm": 0.15281671285629272, + "learning_rate": 6.339952923098117e-06, + "loss": 0.4314, + "step": 479 + }, + { + "epoch": 2.9447852760736195, + "grad_norm": 0.19255416095256805, + "learning_rate": 6.326543510028943e-06, + "loss": 0.42, + "step": 480 + }, + { + "epoch": 2.950920245398773, + "grad_norm": 0.1635408252477646, + "learning_rate": 6.3131238254784534e-06, + "loss": 0.4247, + "step": 481 + }, + { + "epoch": 2.957055214723926, + "grad_norm": 0.16226807236671448, + "learning_rate": 6.299693973355821e-06, + "loss": 0.4348, + "step": 482 + }, + { + "epoch": 2.96319018404908, + "grad_norm": 0.17341448366641998, + "learning_rate": 6.286254057648945e-06, + "loss": 0.4174, + "step": 483 + }, + { + "epoch": 2.969325153374233, + "grad_norm": 0.16853678226470947, + "learning_rate": 6.27280418242365e-06, + "loss": 0.4529, + "step": 484 + }, + { + "epoch": 2.9754601226993866, + "grad_norm": 0.16225695610046387, + "learning_rate": 6.259344451822877e-06, + "loss": 0.4685, + "step": 485 + }, + { + "epoch": 2.98159509202454, + "grad_norm": 0.16942046582698822, + "learning_rate": 6.245874970065877e-06, + "loss": 0.4335, + "step": 486 + }, + { + "epoch": 2.9877300613496933, + "grad_norm": 0.1746375411748886, + "learning_rate": 6.2323958414474065e-06, + "loss": 0.4304, + "step": 487 + }, + { + "epoch": 2.9938650306748467, + "grad_norm": 0.19197514653205872, + "learning_rate": 6.218907170336912e-06, + "loss": 0.4651, + "step": 488 + }, + { + "epoch": 3.0, + "grad_norm": 0.16628479957580566, + "learning_rate": 6.2054090611777385e-06, + "loss": 0.4484, + "step": 489 + }, + { + "epoch": 3.0061349693251533, + "grad_norm": 0.1807141751050949, + "learning_rate": 6.191901618486299e-06, + "loss": 0.4403, + "step": 490 + }, + { + "epoch": 3.0122699386503067, + "grad_norm": 0.18243259191513062, + "learning_rate": 6.178384946851284e-06, + "loss": 0.4427, + "step": 491 + }, + { + "epoch": 3.01840490797546, + "grad_norm": 0.17769359052181244, + "learning_rate": 6.164859150932839e-06, + "loss": 0.4177, + "step": 492 + }, + { + "epoch": 3.0245398773006134, + "grad_norm": 0.17457637190818787, + "learning_rate": 6.151324335461766e-06, + "loss": 0.4623, + "step": 493 + }, + { + "epoch": 3.0306748466257667, + "grad_norm": 0.1725994348526001, + "learning_rate": 6.137780605238698e-06, + "loss": 0.4638, + "step": 494 + }, + { + "epoch": 3.03680981595092, + "grad_norm": 0.17166265845298767, + "learning_rate": 6.1242280651332995e-06, + "loss": 0.446, + "step": 495 + }, + { + "epoch": 3.042944785276074, + "grad_norm": 0.17501436173915863, + "learning_rate": 6.11066682008345e-06, + "loss": 0.4405, + "step": 496 + }, + { + "epoch": 3.049079754601227, + "grad_norm": 0.1676449030637741, + "learning_rate": 6.097096975094432e-06, + "loss": 0.4718, + "step": 497 + }, + { + "epoch": 3.0552147239263805, + "grad_norm": 0.1620589643716812, + "learning_rate": 6.083518635238117e-06, + "loss": 0.4353, + "step": 498 + }, + { + "epoch": 3.061349693251534, + "grad_norm": 0.17251572012901306, + "learning_rate": 6.069931905652151e-06, + "loss": 0.4409, + "step": 499 + }, + { + "epoch": 3.067484662576687, + "grad_norm": 0.1733006089925766, + "learning_rate": 6.056336891539144e-06, + "loss": 0.4323, + "step": 500 + }, + { + "epoch": 3.0736196319018405, + "grad_norm": 0.1714855581521988, + "learning_rate": 6.042733698165855e-06, + "loss": 0.4297, + "step": 501 + }, + { + "epoch": 3.079754601226994, + "grad_norm": 0.15556511282920837, + "learning_rate": 6.029122430862373e-06, + "loss": 0.4546, + "step": 502 + }, + { + "epoch": 3.085889570552147, + "grad_norm": 0.18687503039836884, + "learning_rate": 6.015503195021303e-06, + "loss": 0.4586, + "step": 503 + }, + { + "epoch": 3.0920245398773005, + "grad_norm": 0.1955818086862564, + "learning_rate": 6.001876096096951e-06, + "loss": 0.4405, + "step": 504 + }, + { + "epoch": 3.098159509202454, + "grad_norm": 0.18032221496105194, + "learning_rate": 5.988241239604511e-06, + "loss": 0.4621, + "step": 505 + }, + { + "epoch": 3.104294478527607, + "grad_norm": 0.1692107915878296, + "learning_rate": 5.97459873111924e-06, + "loss": 0.4557, + "step": 506 + }, + { + "epoch": 3.1104294478527605, + "grad_norm": 0.18335744738578796, + "learning_rate": 5.9609486762756465e-06, + "loss": 0.43, + "step": 507 + }, + { + "epoch": 3.116564417177914, + "grad_norm": 0.16377264261245728, + "learning_rate": 5.947291180766668e-06, + "loss": 0.4473, + "step": 508 + }, + { + "epoch": 3.1226993865030677, + "grad_norm": 0.1665698140859604, + "learning_rate": 5.933626350342858e-06, + "loss": 0.4262, + "step": 509 + }, + { + "epoch": 3.128834355828221, + "grad_norm": 0.1706293821334839, + "learning_rate": 5.9199542908115694e-06, + "loss": 0.4118, + "step": 510 + }, + { + "epoch": 3.1349693251533743, + "grad_norm": 0.17646320164203644, + "learning_rate": 5.906275108036119e-06, + "loss": 0.4503, + "step": 511 + }, + { + "epoch": 3.1411042944785277, + "grad_norm": 0.18442606925964355, + "learning_rate": 5.892588907934988e-06, + "loss": 0.448, + "step": 512 + }, + { + "epoch": 3.147239263803681, + "grad_norm": 0.16845177114009857, + "learning_rate": 5.87889579648099e-06, + "loss": 0.4533, + "step": 513 + }, + { + "epoch": 3.1533742331288344, + "grad_norm": 0.17161841690540314, + "learning_rate": 5.865195879700454e-06, + "loss": 0.4358, + "step": 514 + }, + { + "epoch": 3.1595092024539877, + "grad_norm": 0.1597752720117569, + "learning_rate": 5.8514892636724005e-06, + "loss": 0.4444, + "step": 515 + }, + { + "epoch": 3.165644171779141, + "grad_norm": 0.1626145988702774, + "learning_rate": 5.83777605452773e-06, + "loss": 0.4317, + "step": 516 + }, + { + "epoch": 3.1717791411042944, + "grad_norm": 0.17902852594852448, + "learning_rate": 5.8240563584483855e-06, + "loss": 0.4604, + "step": 517 + }, + { + "epoch": 3.1779141104294477, + "grad_norm": 0.1681181639432907, + "learning_rate": 5.810330281666542e-06, + "loss": 0.4577, + "step": 518 + }, + { + "epoch": 3.184049079754601, + "grad_norm": 0.18032066524028778, + "learning_rate": 5.796597930463776e-06, + "loss": 0.4214, + "step": 519 + }, + { + "epoch": 3.190184049079755, + "grad_norm": 0.18553897738456726, + "learning_rate": 5.782859411170261e-06, + "loss": 0.3931, + "step": 520 + }, + { + "epoch": 3.196319018404908, + "grad_norm": 0.16626308858394623, + "learning_rate": 5.769114830163913e-06, + "loss": 0.4307, + "step": 521 + }, + { + "epoch": 3.2024539877300615, + "grad_norm": 0.17122527956962585, + "learning_rate": 5.7553642938695945e-06, + "loss": 0.4582, + "step": 522 + }, + { + "epoch": 3.208588957055215, + "grad_norm": 0.1699482649564743, + "learning_rate": 5.741607908758275e-06, + "loss": 0.4575, + "step": 523 + }, + { + "epoch": 3.214723926380368, + "grad_norm": 0.185217946767807, + "learning_rate": 5.727845781346217e-06, + "loss": 0.4063, + "step": 524 + }, + { + "epoch": 3.2208588957055215, + "grad_norm": 0.17612098157405853, + "learning_rate": 5.714078018194141e-06, + "loss": 0.4555, + "step": 525 + }, + { + "epoch": 3.226993865030675, + "grad_norm": 0.17292365431785583, + "learning_rate": 5.7003047259064095e-06, + "loss": 0.4402, + "step": 526 + }, + { + "epoch": 3.233128834355828, + "grad_norm": 0.18183457851409912, + "learning_rate": 5.68652601113019e-06, + "loss": 0.4694, + "step": 527 + }, + { + "epoch": 3.2392638036809815, + "grad_norm": 0.1628892868757248, + "learning_rate": 5.672741980554646e-06, + "loss": 0.4472, + "step": 528 + }, + { + "epoch": 3.245398773006135, + "grad_norm": 0.17642362415790558, + "learning_rate": 5.658952740910094e-06, + "loss": 0.4239, + "step": 529 + }, + { + "epoch": 3.2515337423312882, + "grad_norm": 0.1806386113166809, + "learning_rate": 5.645158398967191e-06, + "loss": 0.4282, + "step": 530 + }, + { + "epoch": 3.2576687116564416, + "grad_norm": 0.1763850599527359, + "learning_rate": 5.6313590615360935e-06, + "loss": 0.4193, + "step": 531 + }, + { + "epoch": 3.263803680981595, + "grad_norm": 0.16797596216201782, + "learning_rate": 5.617554835465646e-06, + "loss": 0.4358, + "step": 532 + }, + { + "epoch": 3.2699386503067487, + "grad_norm": 0.16996163129806519, + "learning_rate": 5.6037458276425394e-06, + "loss": 0.4048, + "step": 533 + }, + { + "epoch": 3.276073619631902, + "grad_norm": 0.16143372654914856, + "learning_rate": 5.589932144990495e-06, + "loss": 0.4421, + "step": 534 + }, + { + "epoch": 3.2822085889570554, + "grad_norm": 0.16983363032341003, + "learning_rate": 5.5761138944694295e-06, + "loss": 0.4253, + "step": 535 + }, + { + "epoch": 3.2883435582822087, + "grad_norm": 0.17256614565849304, + "learning_rate": 5.562291183074627e-06, + "loss": 0.4216, + "step": 536 + }, + { + "epoch": 3.294478527607362, + "grad_norm": 0.18795952200889587, + "learning_rate": 5.548464117835917e-06, + "loss": 0.4246, + "step": 537 + }, + { + "epoch": 3.3006134969325154, + "grad_norm": 0.18537187576293945, + "learning_rate": 5.534632805816835e-06, + "loss": 0.4124, + "step": 538 + }, + { + "epoch": 3.3067484662576687, + "grad_norm": 0.19903261959552765, + "learning_rate": 5.520797354113804e-06, + "loss": 0.46, + "step": 539 + }, + { + "epoch": 3.312883435582822, + "grad_norm": 0.1933709681034088, + "learning_rate": 5.5069578698553e-06, + "loss": 0.431, + "step": 540 + }, + { + "epoch": 3.3190184049079754, + "grad_norm": 0.17167270183563232, + "learning_rate": 5.4931144602010224e-06, + "loss": 0.4773, + "step": 541 + }, + { + "epoch": 3.3251533742331287, + "grad_norm": 0.16057482361793518, + "learning_rate": 5.479267232341064e-06, + "loss": 0.4089, + "step": 542 + }, + { + "epoch": 3.331288343558282, + "grad_norm": 0.17433694005012512, + "learning_rate": 5.465416293495083e-06, + "loss": 0.4502, + "step": 543 + }, + { + "epoch": 3.3374233128834354, + "grad_norm": 0.1779199242591858, + "learning_rate": 5.451561750911475e-06, + "loss": 0.4048, + "step": 544 + }, + { + "epoch": 3.3435582822085887, + "grad_norm": 0.17335927486419678, + "learning_rate": 5.437703711866534e-06, + "loss": 0.4341, + "step": 545 + }, + { + "epoch": 3.3496932515337425, + "grad_norm": 0.17468775808811188, + "learning_rate": 5.4238422836636315e-06, + "loss": 0.4448, + "step": 546 + }, + { + "epoch": 3.355828220858896, + "grad_norm": 0.18919560313224792, + "learning_rate": 5.40997757363238e-06, + "loss": 0.4489, + "step": 547 + }, + { + "epoch": 3.361963190184049, + "grad_norm": 0.17396590113639832, + "learning_rate": 5.3961096891278035e-06, + "loss": 0.4325, + "step": 548 + }, + { + "epoch": 3.3680981595092025, + "grad_norm": 0.16672684252262115, + "learning_rate": 5.382238737529505e-06, + "loss": 0.4077, + "step": 549 + }, + { + "epoch": 3.374233128834356, + "grad_norm": 0.1844366043806076, + "learning_rate": 5.368364826240836e-06, + "loss": 0.4196, + "step": 550 + }, + { + "epoch": 3.3803680981595092, + "grad_norm": 0.18350937962532043, + "learning_rate": 5.354488062688068e-06, + "loss": 0.4523, + "step": 551 + }, + { + "epoch": 3.3865030674846626, + "grad_norm": 0.26154661178588867, + "learning_rate": 5.3406085543195555e-06, + "loss": 0.4761, + "step": 552 + }, + { + "epoch": 3.392638036809816, + "grad_norm": 0.1603960394859314, + "learning_rate": 5.3267264086049054e-06, + "loss": 0.4433, + "step": 553 + }, + { + "epoch": 3.3987730061349692, + "grad_norm": 0.19381873309612274, + "learning_rate": 5.312841733034147e-06, + "loss": 0.4031, + "step": 554 + }, + { + "epoch": 3.4049079754601226, + "grad_norm": 0.21427270770072937, + "learning_rate": 5.2989546351168985e-06, + "loss": 0.4391, + "step": 555 + }, + { + "epoch": 3.411042944785276, + "grad_norm": 0.1873149424791336, + "learning_rate": 5.285065222381533e-06, + "loss": 0.4409, + "step": 556 + }, + { + "epoch": 3.4171779141104293, + "grad_norm": 0.1799328476190567, + "learning_rate": 5.27117360237435e-06, + "loss": 0.4381, + "step": 557 + }, + { + "epoch": 3.4233128834355826, + "grad_norm": 0.1705305278301239, + "learning_rate": 5.257279882658737e-06, + "loss": 0.3905, + "step": 558 + }, + { + "epoch": 3.4294478527607364, + "grad_norm": 0.1969294250011444, + "learning_rate": 5.2433841708143405e-06, + "loss": 0.4264, + "step": 559 + }, + { + "epoch": 3.4355828220858897, + "grad_norm": 0.17147500813007355, + "learning_rate": 5.229486574436236e-06, + "loss": 0.4592, + "step": 560 + }, + { + "epoch": 3.441717791411043, + "grad_norm": 0.1796891838312149, + "learning_rate": 5.215587201134081e-06, + "loss": 0.4223, + "step": 561 + }, + { + "epoch": 3.4478527607361964, + "grad_norm": 0.17482995986938477, + "learning_rate": 5.201686158531304e-06, + "loss": 0.4143, + "step": 562 + }, + { + "epoch": 3.4539877300613497, + "grad_norm": 0.18087799847126007, + "learning_rate": 5.187783554264253e-06, + "loss": 0.4611, + "step": 563 + }, + { + "epoch": 3.460122699386503, + "grad_norm": 0.1888713240623474, + "learning_rate": 5.173879495981367e-06, + "loss": 0.4569, + "step": 564 + }, + { + "epoch": 3.4662576687116564, + "grad_norm": 0.20657221972942352, + "learning_rate": 5.1599740913423435e-06, + "loss": 0.4762, + "step": 565 + }, + { + "epoch": 3.4723926380368098, + "grad_norm": 0.18362171947956085, + "learning_rate": 5.146067448017308e-06, + "loss": 0.4268, + "step": 566 + }, + { + "epoch": 3.478527607361963, + "grad_norm": 0.18593919277191162, + "learning_rate": 5.132159673685976e-06, + "loss": 0.4506, + "step": 567 + }, + { + "epoch": 3.4846625766871164, + "grad_norm": 0.1854875087738037, + "learning_rate": 5.1182508760368195e-06, + "loss": 0.4209, + "step": 568 + }, + { + "epoch": 3.4907975460122698, + "grad_norm": 0.18463847041130066, + "learning_rate": 5.104341162766234e-06, + "loss": 0.4349, + "step": 569 + }, + { + "epoch": 3.4969325153374236, + "grad_norm": 0.18153201043605804, + "learning_rate": 5.090430641577705e-06, + "loss": 0.459, + "step": 570 + }, + { + "epoch": 3.5030674846625764, + "grad_norm": 0.20472465455532074, + "learning_rate": 5.0765194201809755e-06, + "loss": 0.4294, + "step": 571 + }, + { + "epoch": 3.5092024539877302, + "grad_norm": 0.16046775877475739, + "learning_rate": 5.062607606291208e-06, + "loss": 0.4442, + "step": 572 + }, + { + "epoch": 3.5153374233128836, + "grad_norm": 0.17173658311367035, + "learning_rate": 5.048695307628152e-06, + "loss": 0.4155, + "step": 573 + }, + { + "epoch": 3.521472392638037, + "grad_norm": 0.18764469027519226, + "learning_rate": 5.034782631915314e-06, + "loss": 0.4081, + "step": 574 + }, + { + "epoch": 3.5276073619631902, + "grad_norm": 0.18143120408058167, + "learning_rate": 5.020869686879115e-06, + "loss": 0.4273, + "step": 575 + }, + { + "epoch": 3.5337423312883436, + "grad_norm": 0.16679701209068298, + "learning_rate": 5.006956580248069e-06, + "loss": 0.445, + "step": 576 + }, + { + "epoch": 3.539877300613497, + "grad_norm": 0.1785566657781601, + "learning_rate": 4.993043419751933e-06, + "loss": 0.4325, + "step": 577 + }, + { + "epoch": 3.5460122699386503, + "grad_norm": 0.18561996519565582, + "learning_rate": 4.979130313120885e-06, + "loss": 0.4432, + "step": 578 + }, + { + "epoch": 3.5521472392638036, + "grad_norm": 0.17507800459861755, + "learning_rate": 4.965217368084688e-06, + "loss": 0.4358, + "step": 579 + }, + { + "epoch": 3.558282208588957, + "grad_norm": 0.17406027019023895, + "learning_rate": 4.95130469237185e-06, + "loss": 0.4551, + "step": 580 + }, + { + "epoch": 3.5644171779141103, + "grad_norm": 0.18630944192409515, + "learning_rate": 4.937392393708794e-06, + "loss": 0.4398, + "step": 581 + }, + { + "epoch": 3.5705521472392636, + "grad_norm": 0.15710729360580444, + "learning_rate": 4.923480579819025e-06, + "loss": 0.428, + "step": 582 + }, + { + "epoch": 3.5766871165644174, + "grad_norm": 0.17362196743488312, + "learning_rate": 4.909569358422296e-06, + "loss": 0.4388, + "step": 583 + }, + { + "epoch": 3.5828220858895703, + "grad_norm": 0.1887211948633194, + "learning_rate": 4.895658837233767e-06, + "loss": 0.4241, + "step": 584 + }, + { + "epoch": 3.588957055214724, + "grad_norm": 0.18522702157497406, + "learning_rate": 4.881749123963183e-06, + "loss": 0.443, + "step": 585 + }, + { + "epoch": 3.5950920245398774, + "grad_norm": 0.17625677585601807, + "learning_rate": 4.867840326314024e-06, + "loss": 0.4345, + "step": 586 + }, + { + "epoch": 3.6012269938650308, + "grad_norm": 0.17641369998455048, + "learning_rate": 4.853932551982692e-06, + "loss": 0.4018, + "step": 587 + }, + { + "epoch": 3.607361963190184, + "grad_norm": 0.18613062798976898, + "learning_rate": 4.840025908657658e-06, + "loss": 0.4787, + "step": 588 + }, + { + "epoch": 3.6134969325153374, + "grad_norm": 0.17531706392765045, + "learning_rate": 4.826120504018635e-06, + "loss": 0.4197, + "step": 589 + }, + { + "epoch": 3.6196319018404908, + "grad_norm": 0.18549470603466034, + "learning_rate": 4.812216445735749e-06, + "loss": 0.4228, + "step": 590 + }, + { + "epoch": 3.625766871165644, + "grad_norm": 0.17408756911754608, + "learning_rate": 4.798313841468697e-06, + "loss": 0.4054, + "step": 591 + }, + { + "epoch": 3.6319018404907975, + "grad_norm": 0.19779986143112183, + "learning_rate": 4.7844127988659204e-06, + "loss": 0.4277, + "step": 592 + }, + { + "epoch": 3.638036809815951, + "grad_norm": 0.16971039772033691, + "learning_rate": 4.7705134255637676e-06, + "loss": 0.4098, + "step": 593 + }, + { + "epoch": 3.644171779141104, + "grad_norm": 0.18103785812854767, + "learning_rate": 4.756615829185661e-06, + "loss": 0.4254, + "step": 594 + }, + { + "epoch": 3.6503067484662575, + "grad_norm": 0.18030977249145508, + "learning_rate": 4.742720117341265e-06, + "loss": 0.4203, + "step": 595 + }, + { + "epoch": 3.6564417177914113, + "grad_norm": 0.17600463330745697, + "learning_rate": 4.728826397625651e-06, + "loss": 0.429, + "step": 596 + }, + { + "epoch": 3.662576687116564, + "grad_norm": 0.18706922233104706, + "learning_rate": 4.714934777618468e-06, + "loss": 0.4562, + "step": 597 + }, + { + "epoch": 3.668711656441718, + "grad_norm": 0.18239618837833405, + "learning_rate": 4.701045364883103e-06, + "loss": 0.4327, + "step": 598 + }, + { + "epoch": 3.6748466257668713, + "grad_norm": 0.20267638564109802, + "learning_rate": 4.6871582669658545e-06, + "loss": 0.4167, + "step": 599 + }, + { + "epoch": 3.6809815950920246, + "grad_norm": 0.1839340478181839, + "learning_rate": 4.673273591395095e-06, + "loss": 0.4224, + "step": 600 + }, + { + "epoch": 3.687116564417178, + "grad_norm": 0.18033233284950256, + "learning_rate": 4.659391445680446e-06, + "loss": 0.4043, + "step": 601 + }, + { + "epoch": 3.6932515337423313, + "grad_norm": 0.20073480904102325, + "learning_rate": 4.645511937311934e-06, + "loss": 0.4511, + "step": 602 + }, + { + "epoch": 3.6993865030674846, + "grad_norm": 0.20318840444087982, + "learning_rate": 4.631635173759165e-06, + "loss": 0.4279, + "step": 603 + }, + { + "epoch": 3.705521472392638, + "grad_norm": 0.17758074402809143, + "learning_rate": 4.6177612624704975e-06, + "loss": 0.4735, + "step": 604 + }, + { + "epoch": 3.7116564417177913, + "grad_norm": 0.17162683606147766, + "learning_rate": 4.603890310872197e-06, + "loss": 0.4514, + "step": 605 + }, + { + "epoch": 3.7177914110429446, + "grad_norm": 0.19262997806072235, + "learning_rate": 4.590022426367621e-06, + "loss": 0.4313, + "step": 606 + }, + { + "epoch": 3.7239263803680984, + "grad_norm": 0.19909274578094482, + "learning_rate": 4.576157716336369e-06, + "loss": 0.4347, + "step": 607 + }, + { + "epoch": 3.7300613496932513, + "grad_norm": 0.17881116271018982, + "learning_rate": 4.5622962881334666e-06, + "loss": 0.4581, + "step": 608 + }, + { + "epoch": 3.736196319018405, + "grad_norm": 0.18698081374168396, + "learning_rate": 4.5484382490885265e-06, + "loss": 0.4273, + "step": 609 + }, + { + "epoch": 3.7423312883435584, + "grad_norm": 0.17165279388427734, + "learning_rate": 4.534583706504919e-06, + "loss": 0.4009, + "step": 610 + }, + { + "epoch": 3.7484662576687118, + "grad_norm": 0.17795486748218536, + "learning_rate": 4.520732767658938e-06, + "loss": 0.4287, + "step": 611 + }, + { + "epoch": 3.754601226993865, + "grad_norm": 0.18492695689201355, + "learning_rate": 4.50688553979898e-06, + "loss": 0.4405, + "step": 612 + }, + { + "epoch": 3.7607361963190185, + "grad_norm": 0.20490054786205292, + "learning_rate": 4.493042130144702e-06, + "loss": 0.4344, + "step": 613 + }, + { + "epoch": 3.766871165644172, + "grad_norm": 0.1859641969203949, + "learning_rate": 4.479202645886196e-06, + "loss": 0.4283, + "step": 614 + }, + { + "epoch": 3.773006134969325, + "grad_norm": 0.20046532154083252, + "learning_rate": 4.4653671941831665e-06, + "loss": 0.444, + "step": 615 + }, + { + "epoch": 3.7791411042944785, + "grad_norm": 0.1931602507829666, + "learning_rate": 4.451535882164084e-06, + "loss": 0.4278, + "step": 616 + }, + { + "epoch": 3.785276073619632, + "grad_norm": 0.19650715589523315, + "learning_rate": 4.437708816925374e-06, + "loss": 0.4441, + "step": 617 + }, + { + "epoch": 3.791411042944785, + "grad_norm": 0.18395523726940155, + "learning_rate": 4.423886105530573e-06, + "loss": 0.4286, + "step": 618 + }, + { + "epoch": 3.7975460122699385, + "grad_norm": 0.19610300660133362, + "learning_rate": 4.410067855009506e-06, + "loss": 0.4051, + "step": 619 + }, + { + "epoch": 3.8036809815950923, + "grad_norm": 0.18188579380512238, + "learning_rate": 4.396254172357462e-06, + "loss": 0.4446, + "step": 620 + }, + { + "epoch": 3.809815950920245, + "grad_norm": 0.19300203025341034, + "learning_rate": 4.382445164534357e-06, + "loss": 0.4438, + "step": 621 + }, + { + "epoch": 3.815950920245399, + "grad_norm": 0.1908637285232544, + "learning_rate": 4.368640938463909e-06, + "loss": 0.4448, + "step": 622 + }, + { + "epoch": 3.8220858895705523, + "grad_norm": 0.17132696509361267, + "learning_rate": 4.354841601032811e-06, + "loss": 0.4108, + "step": 623 + }, + { + "epoch": 3.8282208588957056, + "grad_norm": 0.16772066056728363, + "learning_rate": 4.341047259089906e-06, + "loss": 0.4376, + "step": 624 + }, + { + "epoch": 3.834355828220859, + "grad_norm": 0.20349860191345215, + "learning_rate": 4.327258019445355e-06, + "loss": 0.4809, + "step": 625 + }, + { + "epoch": 3.8404907975460123, + "grad_norm": 0.1808754950761795, + "learning_rate": 4.313473988869811e-06, + "loss": 0.4217, + "step": 626 + }, + { + "epoch": 3.8466257668711656, + "grad_norm": 0.17573867738246918, + "learning_rate": 4.299695274093593e-06, + "loss": 0.4149, + "step": 627 + }, + { + "epoch": 3.852760736196319, + "grad_norm": 0.1589432954788208, + "learning_rate": 4.28592198180586e-06, + "loss": 0.4216, + "step": 628 + }, + { + "epoch": 3.8588957055214723, + "grad_norm": 0.17543958127498627, + "learning_rate": 4.272154218653784e-06, + "loss": 0.474, + "step": 629 + }, + { + "epoch": 3.8650306748466257, + "grad_norm": 0.19895213842391968, + "learning_rate": 4.258392091241727e-06, + "loss": 0.3786, + "step": 630 + }, + { + "epoch": 3.871165644171779, + "grad_norm": 0.2045029103755951, + "learning_rate": 4.244635706130408e-06, + "loss": 0.4593, + "step": 631 + }, + { + "epoch": 3.8773006134969323, + "grad_norm": 0.2224407196044922, + "learning_rate": 4.23088516983609e-06, + "loss": 0.4171, + "step": 632 + }, + { + "epoch": 3.883435582822086, + "grad_norm": 0.17593076825141907, + "learning_rate": 4.21714058882974e-06, + "loss": 0.4178, + "step": 633 + }, + { + "epoch": 3.889570552147239, + "grad_norm": 0.18816006183624268, + "learning_rate": 4.203402069536224e-06, + "loss": 0.4484, + "step": 634 + }, + { + "epoch": 3.895705521472393, + "grad_norm": 0.18676427006721497, + "learning_rate": 4.18966971833346e-06, + "loss": 0.4609, + "step": 635 + }, + { + "epoch": 3.901840490797546, + "grad_norm": 0.18539687991142273, + "learning_rate": 4.175943641551616e-06, + "loss": 0.4407, + "step": 636 + }, + { + "epoch": 3.9079754601226995, + "grad_norm": 0.18633869290351868, + "learning_rate": 4.162223945472271e-06, + "loss": 0.4665, + "step": 637 + }, + { + "epoch": 3.914110429447853, + "grad_norm": 0.1992420107126236, + "learning_rate": 4.1485107363276e-06, + "loss": 0.4493, + "step": 638 + }, + { + "epoch": 3.920245398773006, + "grad_norm": 0.1835312396287918, + "learning_rate": 4.1348041202995484e-06, + "loss": 0.4172, + "step": 639 + }, + { + "epoch": 3.9263803680981595, + "grad_norm": 0.1969442069530487, + "learning_rate": 4.121104203519012e-06, + "loss": 0.4374, + "step": 640 + }, + { + "epoch": 3.932515337423313, + "grad_norm": 0.20216475427150726, + "learning_rate": 4.107411092065015e-06, + "loss": 0.4207, + "step": 641 + }, + { + "epoch": 3.938650306748466, + "grad_norm": 0.17508967220783234, + "learning_rate": 4.093724891963882e-06, + "loss": 0.4164, + "step": 642 + }, + { + "epoch": 3.9447852760736195, + "grad_norm": 0.19727467000484467, + "learning_rate": 4.080045709188431e-06, + "loss": 0.4335, + "step": 643 + }, + { + "epoch": 3.950920245398773, + "grad_norm": 0.1708287000656128, + "learning_rate": 4.066373649657142e-06, + "loss": 0.4187, + "step": 644 + }, + { + "epoch": 3.957055214723926, + "grad_norm": 0.21632403135299683, + "learning_rate": 4.052708819233334e-06, + "loss": 0.4237, + "step": 645 + }, + { + "epoch": 3.96319018404908, + "grad_norm": 0.17434976994991302, + "learning_rate": 4.039051323724355e-06, + "loss": 0.4238, + "step": 646 + }, + { + "epoch": 3.969325153374233, + "grad_norm": 0.18432791531085968, + "learning_rate": 4.025401268880762e-06, + "loss": 0.4483, + "step": 647 + }, + { + "epoch": 3.9754601226993866, + "grad_norm": 0.1773991733789444, + "learning_rate": 4.011758760395491e-06, + "loss": 0.3934, + "step": 648 + }, + { + "epoch": 3.98159509202454, + "grad_norm": 0.19982725381851196, + "learning_rate": 3.998123903903051e-06, + "loss": 0.4381, + "step": 649 + }, + { + "epoch": 3.9877300613496933, + "grad_norm": 0.21029439568519592, + "learning_rate": 3.9844968049786995e-06, + "loss": 0.4267, + "step": 650 + }, + { + "epoch": 3.9938650306748467, + "grad_norm": 0.21180294454097748, + "learning_rate": 3.97087756913763e-06, + "loss": 0.4185, + "step": 651 + }, + { + "epoch": 4.0, + "grad_norm": 0.17993085086345673, + "learning_rate": 3.957266301834145e-06, + "loss": 0.4382, + "step": 652 + }, + { + "epoch": 4.006134969325154, + "grad_norm": 0.2000538408756256, + "learning_rate": 3.943663108460857e-06, + "loss": 0.4189, + "step": 653 + }, + { + "epoch": 4.012269938650307, + "grad_norm": 0.1886722296476364, + "learning_rate": 3.93006809434785e-06, + "loss": 0.4464, + "step": 654 + }, + { + "epoch": 4.0184049079754605, + "grad_norm": 0.1883607655763626, + "learning_rate": 3.916481364761885e-06, + "loss": 0.417, + "step": 655 + }, + { + "epoch": 4.024539877300613, + "grad_norm": 0.1941026896238327, + "learning_rate": 3.90290302490557e-06, + "loss": 0.4515, + "step": 656 + }, + { + "epoch": 4.030674846625767, + "grad_norm": 0.1868050992488861, + "learning_rate": 3.889333179916552e-06, + "loss": 0.415, + "step": 657 + }, + { + "epoch": 4.03680981595092, + "grad_norm": 0.21741467714309692, + "learning_rate": 3.875771934866702e-06, + "loss": 0.3978, + "step": 658 + }, + { + "epoch": 4.042944785276074, + "grad_norm": 0.18438977003097534, + "learning_rate": 3.862219394761305e-06, + "loss": 0.4197, + "step": 659 + }, + { + "epoch": 4.049079754601227, + "grad_norm": 0.21055185794830322, + "learning_rate": 3.848675664538238e-06, + "loss": 0.4022, + "step": 660 + }, + { + "epoch": 4.0552147239263805, + "grad_norm": 0.1822698414325714, + "learning_rate": 3.8351408490671614e-06, + "loss": 0.4707, + "step": 661 + }, + { + "epoch": 4.061349693251533, + "grad_norm": 0.19663125276565552, + "learning_rate": 3.821615053148717e-06, + "loss": 0.4332, + "step": 662 + }, + { + "epoch": 4.067484662576687, + "grad_norm": 0.20445053279399872, + "learning_rate": 3.8080983815137017e-06, + "loss": 0.4197, + "step": 663 + }, + { + "epoch": 4.07361963190184, + "grad_norm": 0.17627210915088654, + "learning_rate": 3.7945909388222636e-06, + "loss": 0.4325, + "step": 664 + }, + { + "epoch": 4.079754601226994, + "grad_norm": 0.17431975901126862, + "learning_rate": 3.781092829663089e-06, + "loss": 0.4354, + "step": 665 + }, + { + "epoch": 4.085889570552148, + "grad_norm": 0.1871926486492157, + "learning_rate": 3.7676041585525956e-06, + "loss": 0.4155, + "step": 666 + }, + { + "epoch": 4.0920245398773005, + "grad_norm": 0.19517648220062256, + "learning_rate": 3.7541250299341243e-06, + "loss": 0.4327, + "step": 667 + }, + { + "epoch": 4.098159509202454, + "grad_norm": 0.1866348683834076, + "learning_rate": 3.740655548177125e-06, + "loss": 0.4042, + "step": 668 + }, + { + "epoch": 4.104294478527607, + "grad_norm": 0.17768587172031403, + "learning_rate": 3.7271958175763518e-06, + "loss": 0.4258, + "step": 669 + }, + { + "epoch": 4.110429447852761, + "grad_norm": 0.17544397711753845, + "learning_rate": 3.713745942351056e-06, + "loss": 0.4033, + "step": 670 + }, + { + "epoch": 4.116564417177914, + "grad_norm": 0.18574115633964539, + "learning_rate": 3.7003060266441804e-06, + "loss": 0.4229, + "step": 671 + }, + { + "epoch": 4.122699386503068, + "grad_norm": 0.1867496371269226, + "learning_rate": 3.6868761745215474e-06, + "loss": 0.432, + "step": 672 + }, + { + "epoch": 4.128834355828221, + "grad_norm": 0.19182687997817993, + "learning_rate": 3.6734564899710577e-06, + "loss": 0.4408, + "step": 673 + }, + { + "epoch": 4.134969325153374, + "grad_norm": 0.18049779534339905, + "learning_rate": 3.660047076901885e-06, + "loss": 0.4351, + "step": 674 + }, + { + "epoch": 4.141104294478527, + "grad_norm": 0.19350336492061615, + "learning_rate": 3.646648039143669e-06, + "loss": 0.4494, + "step": 675 + }, + { + "epoch": 4.147239263803681, + "grad_norm": 0.20113109052181244, + "learning_rate": 3.633259480445715e-06, + "loss": 0.4387, + "step": 676 + }, + { + "epoch": 4.153374233128835, + "grad_norm": 0.18117064237594604, + "learning_rate": 3.6198815044761847e-06, + "loss": 0.4792, + "step": 677 + }, + { + "epoch": 4.159509202453988, + "grad_norm": 0.1871654987335205, + "learning_rate": 3.6065142148213033e-06, + "loss": 0.4623, + "step": 678 + }, + { + "epoch": 4.1656441717791415, + "grad_norm": 0.17341911792755127, + "learning_rate": 3.5931577149845465e-06, + "loss": 0.4431, + "step": 679 + }, + { + "epoch": 4.171779141104294, + "grad_norm": 0.1755349338054657, + "learning_rate": 3.579812108385843e-06, + "loss": 0.4163, + "step": 680 + }, + { + "epoch": 4.177914110429448, + "grad_norm": 0.20305219292640686, + "learning_rate": 3.566477498360782e-06, + "loss": 0.4365, + "step": 681 + }, + { + "epoch": 4.184049079754601, + "grad_norm": 0.1874455362558365, + "learning_rate": 3.5531539881597967e-06, + "loss": 0.4727, + "step": 682 + }, + { + "epoch": 4.190184049079755, + "grad_norm": 0.20131394267082214, + "learning_rate": 3.5398416809473813e-06, + "loss": 0.4145, + "step": 683 + }, + { + "epoch": 4.196319018404908, + "grad_norm": 0.18988259136676788, + "learning_rate": 3.5265406798012804e-06, + "loss": 0.4457, + "step": 684 + }, + { + "epoch": 4.2024539877300615, + "grad_norm": 0.1912226378917694, + "learning_rate": 3.5132510877116953e-06, + "loss": 0.4358, + "step": 685 + }, + { + "epoch": 4.208588957055214, + "grad_norm": 0.17754784226417542, + "learning_rate": 3.4999730075804907e-06, + "loss": 0.4116, + "step": 686 + }, + { + "epoch": 4.214723926380368, + "grad_norm": 0.1884073168039322, + "learning_rate": 3.4867065422203885e-06, + "loss": 0.4366, + "step": 687 + }, + { + "epoch": 4.220858895705521, + "grad_norm": 0.18211857974529266, + "learning_rate": 3.473451794354179e-06, + "loss": 0.4276, + "step": 688 + }, + { + "epoch": 4.226993865030675, + "grad_norm": 0.20465627312660217, + "learning_rate": 3.460208866613923e-06, + "loss": 0.4266, + "step": 689 + }, + { + "epoch": 4.233128834355828, + "grad_norm": 0.26603659987449646, + "learning_rate": 3.4469778615401616e-06, + "loss": 0.4693, + "step": 690 + }, + { + "epoch": 4.2392638036809815, + "grad_norm": 0.19085724651813507, + "learning_rate": 3.4337588815811128e-06, + "loss": 0.4322, + "step": 691 + }, + { + "epoch": 4.245398773006135, + "grad_norm": 0.19165652990341187, + "learning_rate": 3.420552029091886e-06, + "loss": 0.4401, + "step": 692 + }, + { + "epoch": 4.251533742331288, + "grad_norm": 0.21447667479515076, + "learning_rate": 3.4073574063336857e-06, + "loss": 0.4327, + "step": 693 + }, + { + "epoch": 4.257668711656442, + "grad_norm": 0.21556028723716736, + "learning_rate": 3.394175115473024e-06, + "loss": 0.4507, + "step": 694 + }, + { + "epoch": 4.263803680981595, + "grad_norm": 0.16888567805290222, + "learning_rate": 3.3810052585809233e-06, + "loss": 0.4292, + "step": 695 + }, + { + "epoch": 4.269938650306749, + "grad_norm": 0.19774934649467468, + "learning_rate": 3.3678479376321304e-06, + "loss": 0.4345, + "step": 696 + }, + { + "epoch": 4.276073619631902, + "grad_norm": 0.18974056839942932, + "learning_rate": 3.354703254504328e-06, + "loss": 0.401, + "step": 697 + }, + { + "epoch": 4.282208588957055, + "grad_norm": 0.18433569371700287, + "learning_rate": 3.3415713109773386e-06, + "loss": 0.4327, + "step": 698 + }, + { + "epoch": 4.288343558282208, + "grad_norm": 0.1922151893377304, + "learning_rate": 3.328452208732349e-06, + "loss": 0.4239, + "step": 699 + }, + { + "epoch": 4.294478527607362, + "grad_norm": 0.18833968043327332, + "learning_rate": 3.3153460493511086e-06, + "loss": 0.4287, + "step": 700 + }, + { + "epoch": 4.300613496932515, + "grad_norm": 0.18338987231254578, + "learning_rate": 3.302252934315151e-06, + "loss": 0.4467, + "step": 701 + }, + { + "epoch": 4.306748466257669, + "grad_norm": 0.19972403347492218, + "learning_rate": 3.2891729650050096e-06, + "loss": 0.4153, + "step": 702 + }, + { + "epoch": 4.3128834355828225, + "grad_norm": 0.19376607239246368, + "learning_rate": 3.276106242699429e-06, + "loss": 0.4117, + "step": 703 + }, + { + "epoch": 4.319018404907975, + "grad_norm": 0.18517455458641052, + "learning_rate": 3.263052868574578e-06, + "loss": 0.4168, + "step": 704 + }, + { + "epoch": 4.325153374233129, + "grad_norm": 0.18251436948776245, + "learning_rate": 3.2500129437032756e-06, + "loss": 0.4552, + "step": 705 + }, + { + "epoch": 4.331288343558282, + "grad_norm": 0.1943109780550003, + "learning_rate": 3.236986569054199e-06, + "loss": 0.431, + "step": 706 + }, + { + "epoch": 4.337423312883436, + "grad_norm": 0.18778099119663239, + "learning_rate": 3.2239738454911057e-06, + "loss": 0.413, + "step": 707 + }, + { + "epoch": 4.343558282208589, + "grad_norm": 0.18466182053089142, + "learning_rate": 3.2109748737720537e-06, + "loss": 0.4332, + "step": 708 + }, + { + "epoch": 4.3496932515337425, + "grad_norm": 0.19483985006809235, + "learning_rate": 3.197989754548618e-06, + "loss": 0.436, + "step": 709 + }, + { + "epoch": 4.355828220858895, + "grad_norm": 0.17666223645210266, + "learning_rate": 3.1850185883651175e-06, + "loss": 0.4102, + "step": 710 + }, + { + "epoch": 4.361963190184049, + "grad_norm": 0.19580869376659393, + "learning_rate": 3.1720614756578267e-06, + "loss": 0.4151, + "step": 711 + }, + { + "epoch": 4.368098159509202, + "grad_norm": 0.20442509651184082, + "learning_rate": 3.1591185167542047e-06, + "loss": 0.4353, + "step": 712 + }, + { + "epoch": 4.374233128834356, + "grad_norm": 0.19967563450336456, + "learning_rate": 3.14618981187212e-06, + "loss": 0.4239, + "step": 713 + }, + { + "epoch": 4.38036809815951, + "grad_norm": 0.18093548715114594, + "learning_rate": 3.1332754611190695e-06, + "loss": 0.4312, + "step": 714 + }, + { + "epoch": 4.386503067484663, + "grad_norm": 0.2043561488389969, + "learning_rate": 3.1203755644914046e-06, + "loss": 0.4263, + "step": 715 + }, + { + "epoch": 4.392638036809816, + "grad_norm": 0.18950863182544708, + "learning_rate": 3.1074902218735602e-06, + "loss": 0.4453, + "step": 716 + }, + { + "epoch": 4.398773006134969, + "grad_norm": 0.19419826567173004, + "learning_rate": 3.0946195330372754e-06, + "loss": 0.4252, + "step": 717 + }, + { + "epoch": 4.404907975460123, + "grad_norm": 0.1931648850440979, + "learning_rate": 3.08176359764083e-06, + "loss": 0.422, + "step": 718 + }, + { + "epoch": 4.411042944785276, + "grad_norm": 0.19961263239383698, + "learning_rate": 3.0689225152282627e-06, + "loss": 0.4541, + "step": 719 + }, + { + "epoch": 4.41717791411043, + "grad_norm": 0.1944529414176941, + "learning_rate": 3.0560963852286046e-06, + "loss": 0.424, + "step": 720 + }, + { + "epoch": 4.423312883435583, + "grad_norm": 0.19636866450309753, + "learning_rate": 3.043285306955114e-06, + "loss": 0.4994, + "step": 721 + }, + { + "epoch": 4.429447852760736, + "grad_norm": 0.18336038291454315, + "learning_rate": 3.0304893796044988e-06, + "loss": 0.4109, + "step": 722 + }, + { + "epoch": 4.435582822085889, + "grad_norm": 0.18142063915729523, + "learning_rate": 3.017708702256153e-06, + "loss": 0.4464, + "step": 723 + }, + { + "epoch": 4.441717791411043, + "grad_norm": 0.17481929063796997, + "learning_rate": 3.004943373871393e-06, + "loss": 0.4341, + "step": 724 + }, + { + "epoch": 4.447852760736196, + "grad_norm": 0.19967781007289886, + "learning_rate": 2.9921934932926837e-06, + "loss": 0.3999, + "step": 725 + }, + { + "epoch": 4.45398773006135, + "grad_norm": 0.19884563982486725, + "learning_rate": 2.9794591592428767e-06, + "loss": 0.4417, + "step": 726 + }, + { + "epoch": 4.460122699386503, + "grad_norm": 0.1929159313440323, + "learning_rate": 2.966740470324451e-06, + "loss": 0.4371, + "step": 727 + }, + { + "epoch": 4.466257668711656, + "grad_norm": 0.19332391023635864, + "learning_rate": 2.954037525018739e-06, + "loss": 0.4668, + "step": 728 + }, + { + "epoch": 4.47239263803681, + "grad_norm": 0.18918125331401825, + "learning_rate": 2.9413504216851742e-06, + "loss": 0.4179, + "step": 729 + }, + { + "epoch": 4.478527607361963, + "grad_norm": 0.19312560558319092, + "learning_rate": 2.9286792585605206e-06, + "loss": 0.4045, + "step": 730 + }, + { + "epoch": 4.484662576687117, + "grad_norm": 0.19809098541736603, + "learning_rate": 2.9160241337581198e-06, + "loss": 0.4417, + "step": 731 + }, + { + "epoch": 4.49079754601227, + "grad_norm": 0.19697564840316772, + "learning_rate": 2.903385145267129e-06, + "loss": 0.4272, + "step": 732 + }, + { + "epoch": 4.4969325153374236, + "grad_norm": 0.2234029769897461, + "learning_rate": 2.8907623909517555e-06, + "loss": 0.4365, + "step": 733 + }, + { + "epoch": 4.5030674846625764, + "grad_norm": 0.2001408040523529, + "learning_rate": 2.8781559685505106e-06, + "loss": 0.4427, + "step": 734 + }, + { + "epoch": 4.50920245398773, + "grad_norm": 0.18446412682533264, + "learning_rate": 2.8655659756754474e-06, + "loss": 0.4511, + "step": 735 + }, + { + "epoch": 4.515337423312883, + "grad_norm": 0.20659488439559937, + "learning_rate": 2.8529925098113943e-06, + "loss": 0.4249, + "step": 736 + }, + { + "epoch": 4.521472392638037, + "grad_norm": 0.20220941305160522, + "learning_rate": 2.8404356683152256e-06, + "loss": 0.4228, + "step": 737 + }, + { + "epoch": 4.52760736196319, + "grad_norm": 0.21185429394245148, + "learning_rate": 2.827895548415084e-06, + "loss": 0.402, + "step": 738 + }, + { + "epoch": 4.533742331288344, + "grad_norm": 0.189244344830513, + "learning_rate": 2.8153722472096334e-06, + "loss": 0.4236, + "step": 739 + }, + { + "epoch": 4.539877300613497, + "grad_norm": 0.1980522722005844, + "learning_rate": 2.8028658616673184e-06, + "loss": 0.4302, + "step": 740 + }, + { + "epoch": 4.54601226993865, + "grad_norm": 0.19940122961997986, + "learning_rate": 2.7903764886255942e-06, + "loss": 0.408, + "step": 741 + }, + { + "epoch": 4.552147239263804, + "grad_norm": 0.19964726269245148, + "learning_rate": 2.777904224790197e-06, + "loss": 0.4316, + "step": 742 + }, + { + "epoch": 4.558282208588957, + "grad_norm": 0.20940682291984558, + "learning_rate": 2.765449166734382e-06, + "loss": 0.4208, + "step": 743 + }, + { + "epoch": 4.564417177914111, + "grad_norm": 0.1969623565673828, + "learning_rate": 2.7530114108981775e-06, + "loss": 0.4035, + "step": 744 + }, + { + "epoch": 4.570552147239264, + "grad_norm": 0.20081190764904022, + "learning_rate": 2.7405910535876407e-06, + "loss": 0.4469, + "step": 745 + }, + { + "epoch": 4.576687116564417, + "grad_norm": 0.1978064477443695, + "learning_rate": 2.7281881909741185e-06, + "loss": 0.4336, + "step": 746 + }, + { + "epoch": 4.58282208588957, + "grad_norm": 0.2098206877708435, + "learning_rate": 2.715802919093484e-06, + "loss": 0.4359, + "step": 747 + }, + { + "epoch": 4.588957055214724, + "grad_norm": 0.20042365789413452, + "learning_rate": 2.7034353338454142e-06, + "loss": 0.417, + "step": 748 + }, + { + "epoch": 4.595092024539877, + "grad_norm": 0.2288142442703247, + "learning_rate": 2.691085530992629e-06, + "loss": 0.4272, + "step": 749 + }, + { + "epoch": 4.601226993865031, + "grad_norm": 0.2015533745288849, + "learning_rate": 2.678753606160166e-06, + "loss": 0.407, + "step": 750 + }, + { + "epoch": 4.6073619631901845, + "grad_norm": 0.19977515935897827, + "learning_rate": 2.6664396548346303e-06, + "loss": 0.4324, + "step": 751 + }, + { + "epoch": 4.613496932515337, + "grad_norm": 0.1808634102344513, + "learning_rate": 2.654143772363455e-06, + "loss": 0.3987, + "step": 752 + }, + { + "epoch": 4.61963190184049, + "grad_norm": 0.20292384922504425, + "learning_rate": 2.6418660539541674e-06, + "loss": 0.405, + "step": 753 + }, + { + "epoch": 4.625766871165644, + "grad_norm": 0.19244877994060516, + "learning_rate": 2.6296065946736506e-06, + "loss": 0.4307, + "step": 754 + }, + { + "epoch": 4.631901840490798, + "grad_norm": 0.20017580687999725, + "learning_rate": 2.617365489447404e-06, + "loss": 0.4162, + "step": 755 + }, + { + "epoch": 4.638036809815951, + "grad_norm": 0.18756996095180511, + "learning_rate": 2.6051428330588147e-06, + "loss": 0.4307, + "step": 756 + }, + { + "epoch": 4.644171779141105, + "grad_norm": 0.1955227255821228, + "learning_rate": 2.5929387201484133e-06, + "loss": 0.4546, + "step": 757 + }, + { + "epoch": 4.6503067484662575, + "grad_norm": 0.1984601467847824, + "learning_rate": 2.5807532452131533e-06, + "loss": 0.4017, + "step": 758 + }, + { + "epoch": 4.656441717791411, + "grad_norm": 0.19388411939144135, + "learning_rate": 2.5685865026056745e-06, + "loss": 0.4706, + "step": 759 + }, + { + "epoch": 4.662576687116564, + "grad_norm": 0.18463459610939026, + "learning_rate": 2.5564385865335628e-06, + "loss": 0.4108, + "step": 760 + }, + { + "epoch": 4.668711656441718, + "grad_norm": 0.18664699792861938, + "learning_rate": 2.544309591058638e-06, + "loss": 0.4299, + "step": 761 + }, + { + "epoch": 4.674846625766871, + "grad_norm": 0.18623848259449005, + "learning_rate": 2.5321996100962163e-06, + "loss": 0.4277, + "step": 762 + }, + { + "epoch": 4.680981595092025, + "grad_norm": 0.20391374826431274, + "learning_rate": 2.5201087374143783e-06, + "loss": 0.4436, + "step": 763 + }, + { + "epoch": 4.6871165644171775, + "grad_norm": 0.20646578073501587, + "learning_rate": 2.5080370666332532e-06, + "loss": 0.4502, + "step": 764 + }, + { + "epoch": 4.693251533742331, + "grad_norm": 0.20507745444774628, + "learning_rate": 2.495984691224287e-06, + "loss": 0.4164, + "step": 765 + }, + { + "epoch": 4.699386503067485, + "grad_norm": 0.19352522492408752, + "learning_rate": 2.4839517045095225e-06, + "loss": 0.412, + "step": 766 + }, + { + "epoch": 4.705521472392638, + "grad_norm": 0.19516626000404358, + "learning_rate": 2.4719381996608748e-06, + "loss": 0.4253, + "step": 767 + }, + { + "epoch": 4.711656441717792, + "grad_norm": 0.19005592167377472, + "learning_rate": 2.459944269699407e-06, + "loss": 0.4253, + "step": 768 + }, + { + "epoch": 4.717791411042945, + "grad_norm": 0.18863846361637115, + "learning_rate": 2.4479700074946154e-06, + "loss": 0.4063, + "step": 769 + }, + { + "epoch": 4.723926380368098, + "grad_norm": 0.2115408331155777, + "learning_rate": 2.4360155057637115e-06, + "loss": 0.439, + "step": 770 + }, + { + "epoch": 4.730061349693251, + "grad_norm": 0.1989760547876358, + "learning_rate": 2.4240808570708926e-06, + "loss": 0.4358, + "step": 771 + }, + { + "epoch": 4.736196319018405, + "grad_norm": 0.18984295427799225, + "learning_rate": 2.412166153826639e-06, + "loss": 0.4262, + "step": 772 + }, + { + "epoch": 4.742331288343558, + "grad_norm": 0.17575299739837646, + "learning_rate": 2.400271488286992e-06, + "loss": 0.4263, + "step": 773 + }, + { + "epoch": 4.748466257668712, + "grad_norm": 0.18755771219730377, + "learning_rate": 2.3883969525528396e-06, + "loss": 0.4089, + "step": 774 + }, + { + "epoch": 4.754601226993865, + "grad_norm": 0.2058447301387787, + "learning_rate": 2.3765426385692044e-06, + "loss": 0.4427, + "step": 775 + }, + { + "epoch": 4.7607361963190185, + "grad_norm": 0.20019108057022095, + "learning_rate": 2.3647086381245267e-06, + "loss": 0.4548, + "step": 776 + }, + { + "epoch": 4.766871165644172, + "grad_norm": 0.17807462811470032, + "learning_rate": 2.352895042849965e-06, + "loss": 0.4172, + "step": 777 + }, + { + "epoch": 4.773006134969325, + "grad_norm": 0.18490706384181976, + "learning_rate": 2.3411019442186766e-06, + "loss": 0.4319, + "step": 778 + }, + { + "epoch": 4.779141104294479, + "grad_norm": 0.1955891102552414, + "learning_rate": 2.3293294335451104e-06, + "loss": 0.4516, + "step": 779 + }, + { + "epoch": 4.785276073619632, + "grad_norm": 0.19553668797016144, + "learning_rate": 2.317577601984305e-06, + "loss": 0.4211, + "step": 780 + }, + { + "epoch": 4.791411042944786, + "grad_norm": 0.1864728480577469, + "learning_rate": 2.30584654053118e-06, + "loss": 0.3994, + "step": 781 + }, + { + "epoch": 4.7975460122699385, + "grad_norm": 0.1869369000196457, + "learning_rate": 2.294136340019826e-06, + "loss": 0.4526, + "step": 782 + }, + { + "epoch": 4.803680981595092, + "grad_norm": 0.19105935096740723, + "learning_rate": 2.282447091122816e-06, + "loss": 0.473, + "step": 783 + }, + { + "epoch": 4.809815950920245, + "grad_norm": 0.18605674803256989, + "learning_rate": 2.2707788843504836e-06, + "loss": 0.4634, + "step": 784 + }, + { + "epoch": 4.815950920245399, + "grad_norm": 0.18209435045719147, + "learning_rate": 2.2591318100502385e-06, + "loss": 0.4566, + "step": 785 + }, + { + "epoch": 4.822085889570552, + "grad_norm": 0.19514133036136627, + "learning_rate": 2.2475059584058612e-06, + "loss": 0.4082, + "step": 786 + }, + { + "epoch": 4.828220858895706, + "grad_norm": 0.192851722240448, + "learning_rate": 2.2359014194367986e-06, + "loss": 0.4739, + "step": 787 + }, + { + "epoch": 4.8343558282208585, + "grad_norm": 0.2023897022008896, + "learning_rate": 2.2243182829974775e-06, + "loss": 0.4416, + "step": 788 + }, + { + "epoch": 4.840490797546012, + "grad_norm": 0.18871212005615234, + "learning_rate": 2.2127566387766045e-06, + "loss": 0.4408, + "step": 789 + }, + { + "epoch": 4.846625766871165, + "grad_norm": 0.20089678466320038, + "learning_rate": 2.2012165762964677e-06, + "loss": 0.4236, + "step": 790 + }, + { + "epoch": 4.852760736196319, + "grad_norm": 0.2131776064634323, + "learning_rate": 2.189698184912249e-06, + "loss": 0.4368, + "step": 791 + }, + { + "epoch": 4.858895705521473, + "grad_norm": 0.2049061506986618, + "learning_rate": 2.17820155381133e-06, + "loss": 0.4444, + "step": 792 + }, + { + "epoch": 4.865030674846626, + "grad_norm": 0.18744398653507233, + "learning_rate": 2.1667267720126014e-06, + "loss": 0.4292, + "step": 793 + }, + { + "epoch": 4.871165644171779, + "grad_norm": 0.20928369462490082, + "learning_rate": 2.1552739283657753e-06, + "loss": 0.4321, + "step": 794 + }, + { + "epoch": 4.877300613496932, + "grad_norm": 0.18477670848369598, + "learning_rate": 2.1438431115506908e-06, + "loss": 0.4629, + "step": 795 + }, + { + "epoch": 4.883435582822086, + "grad_norm": 0.17708535492420197, + "learning_rate": 2.1324344100766376e-06, + "loss": 0.4647, + "step": 796 + }, + { + "epoch": 4.889570552147239, + "grad_norm": 0.1869410127401352, + "learning_rate": 2.1210479122816646e-06, + "loss": 0.4141, + "step": 797 + }, + { + "epoch": 4.895705521472393, + "grad_norm": 0.18549315631389618, + "learning_rate": 2.109683706331893e-06, + "loss": 0.4649, + "step": 798 + }, + { + "epoch": 4.901840490797546, + "grad_norm": 0.21219757199287415, + "learning_rate": 2.0983418802208416e-06, + "loss": 0.4252, + "step": 799 + }, + { + "epoch": 4.9079754601226995, + "grad_norm": 0.1829107105731964, + "learning_rate": 2.0870225217687408e-06, + "loss": 0.4095, + "step": 800 + }, + { + "epoch": 4.914110429447852, + "grad_norm": 0.2121373414993286, + "learning_rate": 2.0757257186218465e-06, + "loss": 0.3889, + "step": 801 + }, + { + "epoch": 4.920245398773006, + "grad_norm": 0.19195012748241425, + "learning_rate": 2.0644515582517803e-06, + "loss": 0.4324, + "step": 802 + }, + { + "epoch": 4.92638036809816, + "grad_norm": 0.20068559050559998, + "learning_rate": 2.053200127954828e-06, + "loss": 0.4484, + "step": 803 + }, + { + "epoch": 4.932515337423313, + "grad_norm": 0.18420425057411194, + "learning_rate": 2.0419715148512807e-06, + "loss": 0.4237, + "step": 804 + }, + { + "epoch": 4.938650306748467, + "grad_norm": 0.2045467048883438, + "learning_rate": 2.0307658058847577e-06, + "loss": 0.4114, + "step": 805 + }, + { + "epoch": 4.9447852760736195, + "grad_norm": 0.19473262131214142, + "learning_rate": 2.0195830878215236e-06, + "loss": 0.4305, + "step": 806 + }, + { + "epoch": 4.950920245398773, + "grad_norm": 0.1820230484008789, + "learning_rate": 2.0084234472498274e-06, + "loss": 0.4232, + "step": 807 + }, + { + "epoch": 4.957055214723926, + "grad_norm": 0.20162828266620636, + "learning_rate": 1.997286970579232e-06, + "loss": 0.3802, + "step": 808 + }, + { + "epoch": 4.96319018404908, + "grad_norm": 0.194149449467659, + "learning_rate": 1.9861737440399327e-06, + "loss": 0.4432, + "step": 809 + }, + { + "epoch": 4.969325153374233, + "grad_norm": 0.21044090390205383, + "learning_rate": 1.9750838536821048e-06, + "loss": 0.4031, + "step": 810 + }, + { + "epoch": 4.975460122699387, + "grad_norm": 0.18757662177085876, + "learning_rate": 1.964017385375228e-06, + "loss": 0.4047, + "step": 811 + }, + { + "epoch": 4.9815950920245395, + "grad_norm": 0.2228487730026245, + "learning_rate": 1.952974424807425e-06, + "loss": 0.432, + "step": 812 + }, + { + "epoch": 4.987730061349693, + "grad_norm": 0.21380779147148132, + "learning_rate": 1.9419550574847986e-06, + "loss": 0.4277, + "step": 813 + }, + { + "epoch": 4.993865030674847, + "grad_norm": 0.21621406078338623, + "learning_rate": 1.9309593687307622e-06, + "loss": 0.4616, + "step": 814 + }, + { + "epoch": 5.0, + "grad_norm": 0.19357597827911377, + "learning_rate": 1.9199874436853904e-06, + "loss": 0.4202, + "step": 815 + }, + { + "epoch": 5.006134969325154, + "grad_norm": 0.18243946135044098, + "learning_rate": 1.9090393673047557e-06, + "loss": 0.4184, + "step": 816 + }, + { + "epoch": 5.012269938650307, + "grad_norm": 0.21097031235694885, + "learning_rate": 1.898115224360263e-06, + "loss": 0.4323, + "step": 817 + }, + { + "epoch": 5.0184049079754605, + "grad_norm": 0.19240233302116394, + "learning_rate": 1.8872150994380045e-06, + "loss": 0.4077, + "step": 818 + }, + { + "epoch": 5.024539877300613, + "grad_norm": 0.20225709676742554, + "learning_rate": 1.8763390769381017e-06, + "loss": 0.4164, + "step": 819 + }, + { + "epoch": 5.030674846625767, + "grad_norm": 0.22095705568790436, + "learning_rate": 1.865487241074041e-06, + "loss": 0.417, + "step": 820 + }, + { + "epoch": 5.03680981595092, + "grad_norm": 0.18732763826847076, + "learning_rate": 1.8546596758720437e-06, + "loss": 0.406, + "step": 821 + }, + { + "epoch": 5.042944785276074, + "grad_norm": 0.19568726420402527, + "learning_rate": 1.84385646517039e-06, + "loss": 0.4335, + "step": 822 + }, + { + "epoch": 5.049079754601227, + "grad_norm": 0.22073768079280853, + "learning_rate": 1.8330776926187904e-06, + "loss": 0.4485, + "step": 823 + }, + { + "epoch": 5.0552147239263805, + "grad_norm": 0.22986707091331482, + "learning_rate": 1.8223234416777275e-06, + "loss": 0.4347, + "step": 824 + }, + { + "epoch": 5.061349693251533, + "grad_norm": 0.17984724044799805, + "learning_rate": 1.8115937956178093e-06, + "loss": 0.4037, + "step": 825 + }, + { + "epoch": 5.067484662576687, + "grad_norm": 0.2089032679796219, + "learning_rate": 1.8008888375191302e-06, + "loss": 0.4223, + "step": 826 + }, + { + "epoch": 5.07361963190184, + "grad_norm": 0.18969732522964478, + "learning_rate": 1.7902086502706256e-06, + "loss": 0.4375, + "step": 827 + }, + { + "epoch": 5.079754601226994, + "grad_norm": 0.203078955411911, + "learning_rate": 1.779553316569425e-06, + "loss": 0.4424, + "step": 828 + }, + { + "epoch": 5.085889570552148, + "grad_norm": 0.20583704113960266, + "learning_rate": 1.7689229189202196e-06, + "loss": 0.4161, + "step": 829 + }, + { + "epoch": 5.0920245398773005, + "grad_norm": 0.20547014474868774, + "learning_rate": 1.758317539634618e-06, + "loss": 0.4291, + "step": 830 + }, + { + "epoch": 5.098159509202454, + "grad_norm": 0.20702853798866272, + "learning_rate": 1.747737260830512e-06, + "loss": 0.3902, + "step": 831 + }, + { + "epoch": 5.104294478527607, + "grad_norm": 0.20404289662837982, + "learning_rate": 1.7371821644314392e-06, + "loss": 0.4249, + "step": 832 + }, + { + "epoch": 5.110429447852761, + "grad_norm": 0.19528424739837646, + "learning_rate": 1.726652332165945e-06, + "loss": 0.4356, + "step": 833 + }, + { + "epoch": 5.116564417177914, + "grad_norm": 0.18722812831401825, + "learning_rate": 1.716147845566959e-06, + "loss": 0.4146, + "step": 834 + }, + { + "epoch": 5.122699386503068, + "grad_norm": 0.20289267599582672, + "learning_rate": 1.7056687859711563e-06, + "loss": 0.4343, + "step": 835 + }, + { + "epoch": 5.128834355828221, + "grad_norm": 0.19126549363136292, + "learning_rate": 1.695215234518326e-06, + "loss": 0.4834, + "step": 836 + }, + { + "epoch": 5.134969325153374, + "grad_norm": 0.20215460658073425, + "learning_rate": 1.6847872721507525e-06, + "loss": 0.4272, + "step": 837 + }, + { + "epoch": 5.141104294478527, + "grad_norm": 0.200180321931839, + "learning_rate": 1.674384979612579e-06, + "loss": 0.4332, + "step": 838 + }, + { + "epoch": 5.147239263803681, + "grad_norm": 0.20032553374767303, + "learning_rate": 1.6640084374491872e-06, + "loss": 0.459, + "step": 839 + }, + { + "epoch": 5.153374233128835, + "grad_norm": 0.20815366506576538, + "learning_rate": 1.653657726006575e-06, + "loss": 0.4564, + "step": 840 + }, + { + "epoch": 5.159509202453988, + "grad_norm": 0.19371084868907928, + "learning_rate": 1.6433329254307261e-06, + "loss": 0.4595, + "step": 841 + }, + { + "epoch": 5.1656441717791415, + "grad_norm": 0.179207444190979, + "learning_rate": 1.633034115667001e-06, + "loss": 0.4564, + "step": 842 + }, + { + "epoch": 5.171779141104294, + "grad_norm": 0.19510361552238464, + "learning_rate": 1.6227613764595107e-06, + "loss": 0.4207, + "step": 843 + }, + { + "epoch": 5.177914110429448, + "grad_norm": 0.19818256795406342, + "learning_rate": 1.6125147873504971e-06, + "loss": 0.4274, + "step": 844 + }, + { + "epoch": 5.184049079754601, + "grad_norm": 0.18541868031024933, + "learning_rate": 1.6022944276797265e-06, + "loss": 0.4103, + "step": 845 + }, + { + "epoch": 5.190184049079755, + "grad_norm": 0.19620756804943085, + "learning_rate": 1.5921003765838673e-06, + "loss": 0.4019, + "step": 846 + }, + { + "epoch": 5.196319018404908, + "grad_norm": 0.19282202422618866, + "learning_rate": 1.5819327129958762e-06, + "loss": 0.4135, + "step": 847 + }, + { + "epoch": 5.2024539877300615, + "grad_norm": 0.20788726210594177, + "learning_rate": 1.5717915156443953e-06, + "loss": 0.3979, + "step": 848 + }, + { + "epoch": 5.208588957055214, + "grad_norm": 0.20662984251976013, + "learning_rate": 1.5616768630531353e-06, + "loss": 0.4236, + "step": 849 + }, + { + "epoch": 5.214723926380368, + "grad_norm": 0.1801910698413849, + "learning_rate": 1.5515888335402706e-06, + "loss": 0.4094, + "step": 850 + }, + { + "epoch": 5.220858895705521, + "grad_norm": 0.21069598197937012, + "learning_rate": 1.5415275052178318e-06, + "loss": 0.4229, + "step": 851 + }, + { + "epoch": 5.226993865030675, + "grad_norm": 0.18792353570461273, + "learning_rate": 1.5314929559910985e-06, + "loss": 0.3851, + "step": 852 + }, + { + "epoch": 5.233128834355828, + "grad_norm": 0.21205176413059235, + "learning_rate": 1.5214852635580018e-06, + "loss": 0.4524, + "step": 853 + }, + { + "epoch": 5.2392638036809815, + "grad_norm": 0.20375634729862213, + "learning_rate": 1.5115045054085204e-06, + "loss": 0.4293, + "step": 854 + }, + { + "epoch": 5.245398773006135, + "grad_norm": 0.19179388880729675, + "learning_rate": 1.5015507588240742e-06, + "loss": 0.4438, + "step": 855 + }, + { + "epoch": 5.251533742331288, + "grad_norm": 0.1827925592660904, + "learning_rate": 1.4916241008769372e-06, + "loss": 0.4183, + "step": 856 + }, + { + "epoch": 5.257668711656442, + "grad_norm": 0.21141961216926575, + "learning_rate": 1.4817246084296327e-06, + "loss": 0.4388, + "step": 857 + }, + { + "epoch": 5.263803680981595, + "grad_norm": 0.18689335882663727, + "learning_rate": 1.4718523581343403e-06, + "loss": 0.4312, + "step": 858 + }, + { + "epoch": 5.269938650306749, + "grad_norm": 0.195315420627594, + "learning_rate": 1.4620074264323048e-06, + "loss": 0.4287, + "step": 859 + }, + { + "epoch": 5.276073619631902, + "grad_norm": 0.1855771541595459, + "learning_rate": 1.452189889553236e-06, + "loss": 0.4252, + "step": 860 + }, + { + "epoch": 5.282208588957055, + "grad_norm": 0.18047569692134857, + "learning_rate": 1.4423998235147307e-06, + "loss": 0.3753, + "step": 861 + }, + { + "epoch": 5.288343558282208, + "grad_norm": 0.19961078464984894, + "learning_rate": 1.4326373041216774e-06, + "loss": 0.4277, + "step": 862 + }, + { + "epoch": 5.294478527607362, + "grad_norm": 0.19070571660995483, + "learning_rate": 1.422902406965664e-06, + "loss": 0.4258, + "step": 863 + }, + { + "epoch": 5.300613496932515, + "grad_norm": 0.18950672447681427, + "learning_rate": 1.4131952074244037e-06, + "loss": 0.4223, + "step": 864 + }, + { + "epoch": 5.306748466257669, + "grad_norm": 0.1713619977235794, + "learning_rate": 1.4035157806611465e-06, + "loss": 0.4282, + "step": 865 + }, + { + "epoch": 5.3128834355828225, + "grad_norm": 0.18861691653728485, + "learning_rate": 1.39386420162409e-06, + "loss": 0.4332, + "step": 866 + }, + { + "epoch": 5.319018404907975, + "grad_norm": 0.20134612917900085, + "learning_rate": 1.3842405450458158e-06, + "loss": 0.431, + "step": 867 + }, + { + "epoch": 5.325153374233129, + "grad_norm": 0.19348467886447906, + "learning_rate": 1.3746448854426908e-06, + "loss": 0.3873, + "step": 868 + }, + { + "epoch": 5.331288343558282, + "grad_norm": 0.20825737714767456, + "learning_rate": 1.3650772971143067e-06, + "loss": 0.409, + "step": 869 + }, + { + "epoch": 5.337423312883436, + "grad_norm": 0.20485500991344452, + "learning_rate": 1.355537854142897e-06, + "loss": 0.4514, + "step": 870 + }, + { + "epoch": 5.343558282208589, + "grad_norm": 0.19826869666576385, + "learning_rate": 1.3460266303927604e-06, + "loss": 0.4406, + "step": 871 + }, + { + "epoch": 5.3496932515337425, + "grad_norm": 0.1876692771911621, + "learning_rate": 1.336543699509698e-06, + "loss": 0.445, + "step": 872 + }, + { + "epoch": 5.355828220858895, + "grad_norm": 0.19540998339653015, + "learning_rate": 1.3270891349204378e-06, + "loss": 0.4144, + "step": 873 + }, + { + "epoch": 5.361963190184049, + "grad_norm": 0.20348355174064636, + "learning_rate": 1.3176630098320615e-06, + "loss": 0.4201, + "step": 874 + }, + { + "epoch": 5.368098159509202, + "grad_norm": 0.1816159337759018, + "learning_rate": 1.3082653972314475e-06, + "loss": 0.4181, + "step": 875 + }, + { + "epoch": 5.374233128834356, + "grad_norm": 0.19366130232810974, + "learning_rate": 1.2988963698846997e-06, + "loss": 0.4246, + "step": 876 + }, + { + "epoch": 5.38036809815951, + "grad_norm": 0.1955079287290573, + "learning_rate": 1.2895560003365837e-06, + "loss": 0.4378, + "step": 877 + }, + { + "epoch": 5.386503067484663, + "grad_norm": 0.20214815437793732, + "learning_rate": 1.2802443609099696e-06, + "loss": 0.4306, + "step": 878 + }, + { + "epoch": 5.392638036809816, + "grad_norm": 0.1942247748374939, + "learning_rate": 1.270961523705264e-06, + "loss": 0.4158, + "step": 879 + }, + { + "epoch": 5.398773006134969, + "grad_norm": 0.19706179201602936, + "learning_rate": 1.2617075605998618e-06, + "loss": 0.4227, + "step": 880 + }, + { + "epoch": 5.404907975460123, + "grad_norm": 0.19230398535728455, + "learning_rate": 1.2524825432475828e-06, + "loss": 0.4289, + "step": 881 + }, + { + "epoch": 5.411042944785276, + "grad_norm": 0.1848897784948349, + "learning_rate": 1.2432865430781166e-06, + "loss": 0.4216, + "step": 882 + }, + { + "epoch": 5.41717791411043, + "grad_norm": 0.20822712779045105, + "learning_rate": 1.234119631296473e-06, + "loss": 0.4419, + "step": 883 + }, + { + "epoch": 5.423312883435583, + "grad_norm": 0.20798738300800323, + "learning_rate": 1.2249818788824324e-06, + "loss": 0.4567, + "step": 884 + }, + { + "epoch": 5.429447852760736, + "grad_norm": 0.2089272290468216, + "learning_rate": 1.2158733565899855e-06, + "loss": 0.4499, + "step": 885 + }, + { + "epoch": 5.435582822085889, + "grad_norm": 0.18762832880020142, + "learning_rate": 1.2067941349468021e-06, + "loss": 0.4365, + "step": 886 + }, + { + "epoch": 5.441717791411043, + "grad_norm": 0.22340965270996094, + "learning_rate": 1.1977442842536685e-06, + "loss": 0.4296, + "step": 887 + }, + { + "epoch": 5.447852760736196, + "grad_norm": 0.18473917245864868, + "learning_rate": 1.1887238745839536e-06, + "loss": 0.4347, + "step": 888 + }, + { + "epoch": 5.45398773006135, + "grad_norm": 0.2267259657382965, + "learning_rate": 1.179732975783065e-06, + "loss": 0.4137, + "step": 889 + }, + { + "epoch": 5.460122699386503, + "grad_norm": 0.20931611955165863, + "learning_rate": 1.1707716574679007e-06, + "loss": 0.3982, + "step": 890 + }, + { + "epoch": 5.466257668711656, + "grad_norm": 0.19619382917881012, + "learning_rate": 1.1618399890263215e-06, + "loss": 0.4455, + "step": 891 + }, + { + "epoch": 5.47239263803681, + "grad_norm": 0.18555289506912231, + "learning_rate": 1.1529380396166074e-06, + "loss": 0.4423, + "step": 892 + }, + { + "epoch": 5.478527607361963, + "grad_norm": 0.18679551780223846, + "learning_rate": 1.1440658781669179e-06, + "loss": 0.4392, + "step": 893 + }, + { + "epoch": 5.484662576687117, + "grad_norm": 0.19644379615783691, + "learning_rate": 1.1352235733747685e-06, + "loss": 0.4744, + "step": 894 + }, + { + "epoch": 5.49079754601227, + "grad_norm": 0.18453319370746613, + "learning_rate": 1.1264111937064902e-06, + "loss": 0.4172, + "step": 895 + }, + { + "epoch": 5.4969325153374236, + "grad_norm": 0.19223150610923767, + "learning_rate": 1.117628807396705e-06, + "loss": 0.435, + "step": 896 + }, + { + "epoch": 5.5030674846625764, + "grad_norm": 0.1897895187139511, + "learning_rate": 1.1088764824477938e-06, + "loss": 0.4312, + "step": 897 + }, + { + "epoch": 5.50920245398773, + "grad_norm": 0.20507152378559113, + "learning_rate": 1.100154286629369e-06, + "loss": 0.4133, + "step": 898 + }, + { + "epoch": 5.515337423312883, + "grad_norm": 0.183609277009964, + "learning_rate": 1.0914622874777547e-06, + "loss": 0.4109, + "step": 899 + }, + { + "epoch": 5.521472392638037, + "grad_norm": 0.18945349752902985, + "learning_rate": 1.0828005522954626e-06, + "loss": 0.4336, + "step": 900 + }, + { + "epoch": 5.52760736196319, + "grad_norm": 0.22751018404960632, + "learning_rate": 1.0741691481506627e-06, + "loss": 0.4164, + "step": 901 + }, + { + "epoch": 5.533742331288344, + "grad_norm": 0.21165773272514343, + "learning_rate": 1.0655681418766772e-06, + "loss": 0.4538, + "step": 902 + }, + { + "epoch": 5.539877300613497, + "grad_norm": 0.18855099380016327, + "learning_rate": 1.0569976000714544e-06, + "loss": 0.4405, + "step": 903 + }, + { + "epoch": 5.54601226993865, + "grad_norm": 0.19720859825611115, + "learning_rate": 1.0484575890970505e-06, + "loss": 0.4143, + "step": 904 + }, + { + "epoch": 5.552147239263804, + "grad_norm": 0.19412867724895477, + "learning_rate": 1.0399481750791291e-06, + "loss": 0.4443, + "step": 905 + }, + { + "epoch": 5.558282208588957, + "grad_norm": 0.19512735307216644, + "learning_rate": 1.0314694239064315e-06, + "loss": 0.4435, + "step": 906 + }, + { + "epoch": 5.564417177914111, + "grad_norm": 0.19326624274253845, + "learning_rate": 1.0230214012302807e-06, + "loss": 0.4268, + "step": 907 + }, + { + "epoch": 5.570552147239264, + "grad_norm": 0.18657195568084717, + "learning_rate": 1.014604172464067e-06, + "loss": 0.4158, + "step": 908 + }, + { + "epoch": 5.576687116564417, + "grad_norm": 0.20302285254001617, + "learning_rate": 1.0062178027827385e-06, + "loss": 0.4361, + "step": 909 + }, + { + "epoch": 5.58282208588957, + "grad_norm": 0.18828590214252472, + "learning_rate": 9.978623571223045e-07, + "loss": 0.439, + "step": 910 + }, + { + "epoch": 5.588957055214724, + "grad_norm": 0.1946876496076584, + "learning_rate": 9.89537900179327e-07, + "loss": 0.4277, + "step": 911 + }, + { + "epoch": 5.595092024539877, + "grad_norm": 0.19324910640716553, + "learning_rate": 9.812444964104195e-07, + "loss": 0.4745, + "step": 912 + }, + { + "epoch": 5.601226993865031, + "grad_norm": 0.21109642088413239, + "learning_rate": 9.72982210031751e-07, + "loss": 0.4066, + "step": 913 + }, + { + "epoch": 5.6073619631901845, + "grad_norm": 0.2006876915693283, + "learning_rate": 9.647511050185475e-07, + "loss": 0.459, + "step": 914 + }, + { + "epoch": 5.613496932515337, + "grad_norm": 0.21445654332637787, + "learning_rate": 9.56551245104596e-07, + "loss": 0.4202, + "step": 915 + }, + { + "epoch": 5.61963190184049, + "grad_norm": 0.2019462138414383, + "learning_rate": 9.48382693781752e-07, + "loss": 0.4487, + "step": 916 + }, + { + "epoch": 5.625766871165644, + "grad_norm": 0.21154631674289703, + "learning_rate": 9.402455142994443e-07, + "loss": 0.4411, + "step": 917 + }, + { + "epoch": 5.631901840490798, + "grad_norm": 0.2066999077796936, + "learning_rate": 9.321397696641916e-07, + "loss": 0.4351, + "step": 918 + }, + { + "epoch": 5.638036809815951, + "grad_norm": 0.21491585671901703, + "learning_rate": 9.240655226391121e-07, + "loss": 0.4178, + "step": 919 + }, + { + "epoch": 5.644171779141105, + "grad_norm": 0.20312045514583588, + "learning_rate": 9.160228357434314e-07, + "loss": 0.4321, + "step": 920 + }, + { + "epoch": 5.6503067484662575, + "grad_norm": 0.20171020925045013, + "learning_rate": 9.080117712520087e-07, + "loss": 0.4243, + "step": 921 + }, + { + "epoch": 5.656441717791411, + "grad_norm": 0.19424133002758026, + "learning_rate": 9.000323911948483e-07, + "loss": 0.4096, + "step": 922 + }, + { + "epoch": 5.662576687116564, + "grad_norm": 0.20163863897323608, + "learning_rate": 8.920847573566204e-07, + "loss": 0.4381, + "step": 923 + }, + { + "epoch": 5.668711656441718, + "grad_norm": 0.1946740597486496, + "learning_rate": 8.841689312761837e-07, + "loss": 0.4318, + "step": 924 + }, + { + "epoch": 5.674846625766871, + "grad_norm": 0.19821976125240326, + "learning_rate": 8.762849742461044e-07, + "loss": 0.4143, + "step": 925 + }, + { + "epoch": 5.680981595092025, + "grad_norm": 0.1881324201822281, + "learning_rate": 8.6843294731219e-07, + "loss": 0.4343, + "step": 926 + }, + { + "epoch": 5.6871165644171775, + "grad_norm": 0.1956545114517212, + "learning_rate": 8.60612911273011e-07, + "loss": 0.4057, + "step": 927 + }, + { + "epoch": 5.693251533742331, + "grad_norm": 0.17811179161071777, + "learning_rate": 8.528249266794286e-07, + "loss": 0.4045, + "step": 928 + }, + { + "epoch": 5.699386503067485, + "grad_norm": 0.21344789862632751, + "learning_rate": 8.450690538341299e-07, + "loss": 0.3894, + "step": 929 + }, + { + "epoch": 5.705521472392638, + "grad_norm": 0.20089948177337646, + "learning_rate": 8.373453527911618e-07, + "loss": 0.4302, + "step": 930 + }, + { + "epoch": 5.711656441717792, + "grad_norm": 0.19986341893672943, + "learning_rate": 8.29653883355458e-07, + "loss": 0.4502, + "step": 931 + }, + { + "epoch": 5.717791411042945, + "grad_norm": 0.21868270635604858, + "learning_rate": 8.219947050823862e-07, + "loss": 0.4134, + "step": 932 + }, + { + "epoch": 5.723926380368098, + "grad_norm": 0.17949172854423523, + "learning_rate": 8.143678772772811e-07, + "loss": 0.4315, + "step": 933 + }, + { + "epoch": 5.730061349693251, + "grad_norm": 0.20468908548355103, + "learning_rate": 8.06773458994986e-07, + "loss": 0.4446, + "step": 934 + }, + { + "epoch": 5.736196319018405, + "grad_norm": 0.21158595383167267, + "learning_rate": 7.99211509039397e-07, + "loss": 0.4719, + "step": 935 + }, + { + "epoch": 5.742331288343558, + "grad_norm": 0.22054874897003174, + "learning_rate": 7.916820859630031e-07, + "loss": 0.4244, + "step": 936 + }, + { + "epoch": 5.748466257668712, + "grad_norm": 0.2146049439907074, + "learning_rate": 7.841852480664414e-07, + "loss": 0.419, + "step": 937 + }, + { + "epoch": 5.754601226993865, + "grad_norm": 0.20754463970661163, + "learning_rate": 7.767210533980373e-07, + "loss": 0.4288, + "step": 938 + }, + { + "epoch": 5.7607361963190185, + "grad_norm": 0.2121291607618332, + "learning_rate": 7.692895597533584e-07, + "loss": 0.4149, + "step": 939 + }, + { + "epoch": 5.766871165644172, + "grad_norm": 0.21436981856822968, + "learning_rate": 7.618908246747686e-07, + "loss": 0.4888, + "step": 940 + }, + { + "epoch": 5.773006134969325, + "grad_norm": 0.20958930253982544, + "learning_rate": 7.54524905450979e-07, + "loss": 0.423, + "step": 941 + }, + { + "epoch": 5.779141104294479, + "grad_norm": 0.20395760238170624, + "learning_rate": 7.471918591166078e-07, + "loss": 0.4115, + "step": 942 + }, + { + "epoch": 5.785276073619632, + "grad_norm": 0.18619965016841888, + "learning_rate": 7.398917424517377e-07, + "loss": 0.4395, + "step": 943 + }, + { + "epoch": 5.791411042944786, + "grad_norm": 0.18316420912742615, + "learning_rate": 7.326246119814712e-07, + "loss": 0.4201, + "step": 944 + }, + { + "epoch": 5.7975460122699385, + "grad_norm": 0.18815039098262787, + "learning_rate": 7.253905239755021e-07, + "loss": 0.4224, + "step": 945 + }, + { + "epoch": 5.803680981595092, + "grad_norm": 0.2220267653465271, + "learning_rate": 7.181895344476747e-07, + "loss": 0.4289, + "step": 946 + }, + { + "epoch": 5.809815950920245, + "grad_norm": 0.20367726683616638, + "learning_rate": 7.110216991555457e-07, + "loss": 0.4518, + "step": 947 + }, + { + "epoch": 5.815950920245399, + "grad_norm": 0.2034081518650055, + "learning_rate": 7.038870735999631e-07, + "loss": 0.4344, + "step": 948 + }, + { + "epoch": 5.822085889570552, + "grad_norm": 0.1995425820350647, + "learning_rate": 6.96785713024628e-07, + "loss": 0.4201, + "step": 949 + }, + { + "epoch": 5.828220858895706, + "grad_norm": 0.2018972784280777, + "learning_rate": 6.897176724156663e-07, + "loss": 0.4358, + "step": 950 + }, + { + "epoch": 5.8343558282208585, + "grad_norm": 0.20849905908107758, + "learning_rate": 6.82683006501213e-07, + "loss": 0.4192, + "step": 951 + }, + { + "epoch": 5.840490797546012, + "grad_norm": 0.18639759719371796, + "learning_rate": 6.756817697509755e-07, + "loss": 0.4372, + "step": 952 + }, + { + "epoch": 5.846625766871165, + "grad_norm": 0.19128410518169403, + "learning_rate": 6.687140163758194e-07, + "loss": 0.4123, + "step": 953 + }, + { + "epoch": 5.852760736196319, + "grad_norm": 0.21019329130649567, + "learning_rate": 6.617798003273496e-07, + "loss": 0.4193, + "step": 954 + }, + { + "epoch": 5.858895705521473, + "grad_norm": 0.17338204383850098, + "learning_rate": 6.548791752974853e-07, + "loss": 0.4051, + "step": 955 + }, + { + "epoch": 5.865030674846626, + "grad_norm": 0.20529429614543915, + "learning_rate": 6.480121947180534e-07, + "loss": 0.4616, + "step": 956 + }, + { + "epoch": 5.871165644171779, + "grad_norm": 0.19057132303714752, + "learning_rate": 6.411789117603701e-07, + "loss": 0.423, + "step": 957 + }, + { + "epoch": 5.877300613496932, + "grad_norm": 0.1914457529783249, + "learning_rate": 6.343793793348247e-07, + "loss": 0.4017, + "step": 958 + }, + { + "epoch": 5.883435582822086, + "grad_norm": 0.20379380881786346, + "learning_rate": 6.276136500904823e-07, + "loss": 0.4191, + "step": 959 + }, + { + "epoch": 5.889570552147239, + "grad_norm": 0.19549894332885742, + "learning_rate": 6.208817764146596e-07, + "loss": 0.4201, + "step": 960 + }, + { + "epoch": 5.895705521472393, + "grad_norm": 0.2176375687122345, + "learning_rate": 6.141838104325376e-07, + "loss": 0.443, + "step": 961 + }, + { + "epoch": 5.901840490797546, + "grad_norm": 0.18703949451446533, + "learning_rate": 6.075198040067432e-07, + "loss": 0.4184, + "step": 962 + }, + { + "epoch": 5.9079754601226995, + "grad_norm": 0.19391953945159912, + "learning_rate": 6.00889808736953e-07, + "loss": 0.4547, + "step": 963 + }, + { + "epoch": 5.914110429447852, + "grad_norm": 0.21159960329532623, + "learning_rate": 5.942938759594952e-07, + "loss": 0.4461, + "step": 964 + }, + { + "epoch": 5.920245398773006, + "grad_norm": 0.17359314858913422, + "learning_rate": 5.877320567469514e-07, + "loss": 0.3967, + "step": 965 + }, + { + "epoch": 5.92638036809816, + "grad_norm": 0.2088865041732788, + "learning_rate": 5.812044019077578e-07, + "loss": 0.4448, + "step": 966 + }, + { + "epoch": 5.932515337423313, + "grad_norm": 0.20267169177532196, + "learning_rate": 5.747109619858176e-07, + "loss": 0.4304, + "step": 967 + }, + { + "epoch": 5.938650306748467, + "grad_norm": 0.22262024879455566, + "learning_rate": 5.682517872601034e-07, + "loss": 0.44, + "step": 968 + }, + { + "epoch": 5.9447852760736195, + "grad_norm": 0.20060719549655914, + "learning_rate": 5.618269277442723e-07, + "loss": 0.4505, + "step": 969 + }, + { + "epoch": 5.950920245398773, + "grad_norm": 0.20760180056095123, + "learning_rate": 5.554364331862799e-07, + "loss": 0.4116, + "step": 970 + }, + { + "epoch": 5.957055214723926, + "grad_norm": 0.19337566196918488, + "learning_rate": 5.490803530679883e-07, + "loss": 0.4193, + "step": 971 + }, + { + "epoch": 5.96319018404908, + "grad_norm": 0.17379286885261536, + "learning_rate": 5.427587366047893e-07, + "loss": 0.428, + "step": 972 + }, + { + "epoch": 5.969325153374233, + "grad_norm": 0.19079700112342834, + "learning_rate": 5.364716327452219e-07, + "loss": 0.4439, + "step": 973 + }, + { + "epoch": 5.975460122699387, + "grad_norm": 0.21255679428577423, + "learning_rate": 5.3021909017059e-07, + "loss": 0.4143, + "step": 974 + }, + { + "epoch": 5.9815950920245395, + "grad_norm": 0.2059745490550995, + "learning_rate": 5.240011572945896e-07, + "loss": 0.4334, + "step": 975 + }, + { + "epoch": 5.987730061349693, + "grad_norm": 0.19311602413654327, + "learning_rate": 5.178178822629348e-07, + "loss": 0.421, + "step": 976 + }, + { + "epoch": 5.993865030674847, + "grad_norm": 0.20050852000713348, + "learning_rate": 5.11669312952977e-07, + "loss": 0.4219, + "step": 977 + }, + { + "epoch": 6.0, + "grad_norm": 0.19952666759490967, + "learning_rate": 5.05555496973344e-07, + "loss": 0.4253, + "step": 978 + }, + { + "epoch": 6.006134969325154, + "grad_norm": 0.19700153172016144, + "learning_rate": 4.994764816635666e-07, + "loss": 0.4002, + "step": 979 + }, + { + "epoch": 6.012269938650307, + "grad_norm": 0.19736388325691223, + "learning_rate": 4.934323140937125e-07, + "loss": 0.4436, + "step": 980 + }, + { + "epoch": 6.0184049079754605, + "grad_norm": 0.20198658108711243, + "learning_rate": 4.874230410640207e-07, + "loss": 0.4454, + "step": 981 + }, + { + "epoch": 6.024539877300613, + "grad_norm": 0.18840666115283966, + "learning_rate": 4.814487091045405e-07, + "loss": 0.448, + "step": 982 + }, + { + "epoch": 6.030674846625767, + "grad_norm": 0.18371936678886414, + "learning_rate": 4.7550936447477215e-07, + "loss": 0.3951, + "step": 983 + }, + { + "epoch": 6.03680981595092, + "grad_norm": 0.20922499895095825, + "learning_rate": 4.6960505316330783e-07, + "loss": 0.4124, + "step": 984 + }, + { + "epoch": 6.042944785276074, + "grad_norm": 0.1890077441930771, + "learning_rate": 4.637358208874726e-07, + "loss": 0.4318, + "step": 985 + }, + { + "epoch": 6.049079754601227, + "grad_norm": 0.1936153620481491, + "learning_rate": 4.579017130929775e-07, + "loss": 0.4279, + "step": 986 + }, + { + "epoch": 6.0552147239263805, + "grad_norm": 0.19301314651966095, + "learning_rate": 4.521027749535578e-07, + "loss": 0.4398, + "step": 987 + }, + { + "epoch": 6.061349693251533, + "grad_norm": 0.18920563161373138, + "learning_rate": 4.463390513706317e-07, + "loss": 0.4367, + "step": 988 + }, + { + "epoch": 6.067484662576687, + "grad_norm": 0.20188719034194946, + "learning_rate": 4.406105869729532e-07, + "loss": 0.4287, + "step": 989 + }, + { + "epoch": 6.07361963190184, + "grad_norm": 0.19266493618488312, + "learning_rate": 4.3491742611625587e-07, + "loss": 0.4405, + "step": 990 + }, + { + "epoch": 6.079754601226994, + "grad_norm": 0.1787448674440384, + "learning_rate": 4.292596128829207e-07, + "loss": 0.3901, + "step": 991 + }, + { + "epoch": 6.085889570552148, + "grad_norm": 0.20086398720741272, + "learning_rate": 4.2363719108163113e-07, + "loss": 0.3748, + "step": 992 + }, + { + "epoch": 6.0920245398773005, + "grad_norm": 0.2082991749048233, + "learning_rate": 4.1805020424703024e-07, + "loss": 0.4626, + "step": 993 + }, + { + "epoch": 6.098159509202454, + "grad_norm": 0.20025362074375153, + "learning_rate": 4.124986956393895e-07, + "loss": 0.466, + "step": 994 + }, + { + "epoch": 6.104294478527607, + "grad_norm": 0.18952858448028564, + "learning_rate": 4.0698270824426846e-07, + "loss": 0.4022, + "step": 995 + }, + { + "epoch": 6.110429447852761, + "grad_norm": 0.19780947268009186, + "learning_rate": 4.0150228477218664e-07, + "loss": 0.4298, + "step": 996 + }, + { + "epoch": 6.116564417177914, + "grad_norm": 0.19643534719944, + "learning_rate": 3.960574676582901e-07, + "loss": 0.4248, + "step": 997 + }, + { + "epoch": 6.122699386503068, + "grad_norm": 0.1984051913022995, + "learning_rate": 3.906482990620236e-07, + "loss": 0.4615, + "step": 998 + }, + { + "epoch": 6.128834355828221, + "grad_norm": 0.20450861752033234, + "learning_rate": 3.8527482086680277e-07, + "loss": 0.4138, + "step": 999 + }, + { + "epoch": 6.134969325153374, + "grad_norm": 0.1938575804233551, + "learning_rate": 3.7993707467969267e-07, + "loss": 0.4107, + "step": 1000 + }, + { + "epoch": 6.141104294478527, + "grad_norm": 0.19877435266971588, + "learning_rate": 3.746351018310812e-07, + "loss": 0.4304, + "step": 1001 + }, + { + "epoch": 6.147239263803681, + "grad_norm": 0.19607821106910706, + "learning_rate": 3.693689433743658e-07, + "loss": 0.4031, + "step": 1002 + }, + { + "epoch": 6.153374233128835, + "grad_norm": 0.208278626203537, + "learning_rate": 3.6413864008562785e-07, + "loss": 0.4144, + "step": 1003 + }, + { + "epoch": 6.159509202453988, + "grad_norm": 0.19486194849014282, + "learning_rate": 3.589442324633224e-07, + "loss": 0.4729, + "step": 1004 + }, + { + "epoch": 6.1656441717791415, + "grad_norm": 0.1959027498960495, + "learning_rate": 3.537857607279638e-07, + "loss": 0.4195, + "step": 1005 + }, + { + "epoch": 6.171779141104294, + "grad_norm": 0.2013474851846695, + "learning_rate": 3.4866326482181025e-07, + "loss": 0.4364, + "step": 1006 + }, + { + "epoch": 6.177914110429448, + "grad_norm": 0.19486162066459656, + "learning_rate": 3.4357678440856136e-07, + "loss": 0.413, + "step": 1007 + }, + { + "epoch": 6.184049079754601, + "grad_norm": 0.18348956108093262, + "learning_rate": 3.385263588730464e-07, + "loss": 0.427, + "step": 1008 + }, + { + "epoch": 6.190184049079755, + "grad_norm": 0.18834654986858368, + "learning_rate": 3.3351202732091693e-07, + "loss": 0.4004, + "step": 1009 + }, + { + "epoch": 6.196319018404908, + "grad_norm": 0.22560544312000275, + "learning_rate": 3.2853382857835124e-07, + "loss": 0.382, + "step": 1010 + }, + { + "epoch": 6.2024539877300615, + "grad_norm": 0.19283907115459442, + "learning_rate": 3.235918011917477e-07, + "loss": 0.4918, + "step": 1011 + }, + { + "epoch": 6.208588957055214, + "grad_norm": 0.21347446739673615, + "learning_rate": 3.186859834274292e-07, + "loss": 0.4492, + "step": 1012 + }, + { + "epoch": 6.214723926380368, + "grad_norm": 0.2025686800479889, + "learning_rate": 3.1381641327134616e-07, + "loss": 0.4489, + "step": 1013 + }, + { + "epoch": 6.220858895705521, + "grad_norm": 0.23214280605316162, + "learning_rate": 3.0898312842878144e-07, + "loss": 0.4648, + "step": 1014 + }, + { + "epoch": 6.226993865030675, + "grad_norm": 0.19760558009147644, + "learning_rate": 3.041861663240592e-07, + "loss": 0.4261, + "step": 1015 + }, + { + "epoch": 6.233128834355828, + "grad_norm": 0.18486978113651276, + "learning_rate": 2.994255641002564e-07, + "loss": 0.443, + "step": 1016 + }, + { + "epoch": 6.2392638036809815, + "grad_norm": 0.2192317545413971, + "learning_rate": 2.947013586189124e-07, + "loss": 0.4466, + "step": 1017 + }, + { + "epoch": 6.245398773006135, + "grad_norm": 0.22236081957817078, + "learning_rate": 2.9001358645974696e-07, + "loss": 0.401, + "step": 1018 + }, + { + "epoch": 6.251533742331288, + "grad_norm": 0.1953679621219635, + "learning_rate": 2.85362283920374e-07, + "loss": 0.403, + "step": 1019 + }, + { + "epoch": 6.257668711656442, + "grad_norm": 0.19780360162258148, + "learning_rate": 2.8074748701601984e-07, + "loss": 0.4384, + "step": 1020 + }, + { + "epoch": 6.263803680981595, + "grad_norm": 0.20466431975364685, + "learning_rate": 2.761692314792502e-07, + "loss": 0.4312, + "step": 1021 + }, + { + "epoch": 6.269938650306749, + "grad_norm": 0.18782320618629456, + "learning_rate": 2.7162755275968513e-07, + "loss": 0.4042, + "step": 1022 + }, + { + "epoch": 6.276073619631902, + "grad_norm": 0.1981232464313507, + "learning_rate": 2.6712248602373205e-07, + "loss": 0.4474, + "step": 1023 + }, + { + "epoch": 6.282208588957055, + "grad_norm": 0.2031874656677246, + "learning_rate": 2.626540661543103e-07, + "loss": 0.4364, + "step": 1024 + }, + { + "epoch": 6.288343558282208, + "grad_norm": 0.18200956284999847, + "learning_rate": 2.582223277505769e-07, + "loss": 0.4289, + "step": 1025 + }, + { + "epoch": 6.294478527607362, + "grad_norm": 0.18988868594169617, + "learning_rate": 2.538273051276685e-07, + "loss": 0.4263, + "step": 1026 + }, + { + "epoch": 6.300613496932515, + "grad_norm": 0.1987670212984085, + "learning_rate": 2.4946903231642727e-07, + "loss": 0.4316, + "step": 1027 + }, + { + "epoch": 6.306748466257669, + "grad_norm": 0.21598701179027557, + "learning_rate": 2.451475430631384e-07, + "loss": 0.4416, + "step": 1028 + }, + { + "epoch": 6.3128834355828225, + "grad_norm": 0.20041204988956451, + "learning_rate": 2.408628708292732e-07, + "loss": 0.4592, + "step": 1029 + }, + { + "epoch": 6.319018404907975, + "grad_norm": 0.22675997018814087, + "learning_rate": 2.3661504879122554e-07, + "loss": 0.4562, + "step": 1030 + }, + { + "epoch": 6.325153374233129, + "grad_norm": 0.2256935089826584, + "learning_rate": 2.3240410984005701e-07, + "loss": 0.4044, + "step": 1031 + }, + { + "epoch": 6.331288343558282, + "grad_norm": 0.21728554368019104, + "learning_rate": 2.2823008658124425e-07, + "loss": 0.3976, + "step": 1032 + }, + { + "epoch": 6.337423312883436, + "grad_norm": 0.19935831427574158, + "learning_rate": 2.2409301133441918e-07, + "loss": 0.4648, + "step": 1033 + }, + { + "epoch": 6.343558282208589, + "grad_norm": 0.1918664127588272, + "learning_rate": 2.1999291613312824e-07, + "loss": 0.4083, + "step": 1034 + }, + { + "epoch": 6.3496932515337425, + "grad_norm": 0.18631911277770996, + "learning_rate": 2.15929832724579e-07, + "loss": 0.4585, + "step": 1035 + }, + { + "epoch": 6.355828220858895, + "grad_norm": 0.20234107971191406, + "learning_rate": 2.1190379256939342e-07, + "loss": 0.4341, + "step": 1036 + }, + { + "epoch": 6.361963190184049, + "grad_norm": 0.20566298067569733, + "learning_rate": 2.0791482684136833e-07, + "loss": 0.4054, + "step": 1037 + }, + { + "epoch": 6.368098159509202, + "grad_norm": 0.19845980405807495, + "learning_rate": 2.0396296642722856e-07, + "loss": 0.4592, + "step": 1038 + }, + { + "epoch": 6.374233128834356, + "grad_norm": 0.19033271074295044, + "learning_rate": 2.0004824192639437e-07, + "loss": 0.4201, + "step": 1039 + }, + { + "epoch": 6.38036809815951, + "grad_norm": 0.20748300850391388, + "learning_rate": 1.9617068365073987e-07, + "loss": 0.4349, + "step": 1040 + }, + { + "epoch": 6.386503067484663, + "grad_norm": 0.19087129831314087, + "learning_rate": 1.9233032162435828e-07, + "loss": 0.4122, + "step": 1041 + }, + { + "epoch": 6.392638036809816, + "grad_norm": 0.2218639850616455, + "learning_rate": 1.885271855833315e-07, + "loss": 0.4133, + "step": 1042 + }, + { + "epoch": 6.398773006134969, + "grad_norm": 0.19954140484333038, + "learning_rate": 1.847613049755015e-07, + "loss": 0.4433, + "step": 1043 + }, + { + "epoch": 6.404907975460123, + "grad_norm": 0.18886399269104004, + "learning_rate": 1.8103270896023427e-07, + "loss": 0.3899, + "step": 1044 + }, + { + "epoch": 6.411042944785276, + "grad_norm": 0.20953959226608276, + "learning_rate": 1.7734142640820684e-07, + "loss": 0.4357, + "step": 1045 + }, + { + "epoch": 6.41717791411043, + "grad_norm": 0.20415045320987701, + "learning_rate": 1.7368748590117058e-07, + "loss": 0.434, + "step": 1046 + }, + { + "epoch": 6.423312883435583, + "grad_norm": 0.2486049383878708, + "learning_rate": 1.7007091573173818e-07, + "loss": 0.4239, + "step": 1047 + }, + { + "epoch": 6.429447852760736, + "grad_norm": 0.19526998698711395, + "learning_rate": 1.6649174390316325e-07, + "loss": 0.3995, + "step": 1048 + }, + { + "epoch": 6.435582822085889, + "grad_norm": 0.19858838617801666, + "learning_rate": 1.629499981291205e-07, + "loss": 0.4451, + "step": 1049 + }, + { + "epoch": 6.441717791411043, + "grad_norm": 0.1742226481437683, + "learning_rate": 1.5944570583349416e-07, + "loss": 0.4165, + "step": 1050 + }, + { + "epoch": 6.447852760736196, + "grad_norm": 0.20162968337535858, + "learning_rate": 1.5597889415016609e-07, + "loss": 0.4152, + "step": 1051 + }, + { + "epoch": 6.45398773006135, + "grad_norm": 0.22722604870796204, + "learning_rate": 1.5254958992280022e-07, + "loss": 0.4472, + "step": 1052 + }, + { + "epoch": 6.460122699386503, + "grad_norm": 0.20080749690532684, + "learning_rate": 1.4915781970464226e-07, + "loss": 0.4206, + "step": 1053 + }, + { + "epoch": 6.466257668711656, + "grad_norm": 0.20352134108543396, + "learning_rate": 1.4580360975830988e-07, + "loss": 0.4739, + "step": 1054 + }, + { + "epoch": 6.47239263803681, + "grad_norm": 0.2271064966917038, + "learning_rate": 1.4248698605558887e-07, + "loss": 0.4051, + "step": 1055 + }, + { + "epoch": 6.478527607361963, + "grad_norm": 0.21113221347332, + "learning_rate": 1.3920797427723454e-07, + "loss": 0.4652, + "step": 1056 + }, + { + "epoch": 6.484662576687117, + "grad_norm": 0.19622337818145752, + "learning_rate": 1.3596659981277016e-07, + "loss": 0.4079, + "step": 1057 + }, + { + "epoch": 6.49079754601227, + "grad_norm": 0.20004568994045258, + "learning_rate": 1.3276288776029267e-07, + "loss": 0.437, + "step": 1058 + }, + { + "epoch": 6.4969325153374236, + "grad_norm": 0.19560450315475464, + "learning_rate": 1.2959686292627782e-07, + "loss": 0.4172, + "step": 1059 + }, + { + "epoch": 6.5030674846625764, + "grad_norm": 0.20478709042072296, + "learning_rate": 1.2646854982538593e-07, + "loss": 0.4426, + "step": 1060 + }, + { + "epoch": 6.50920245398773, + "grad_norm": 0.20099417865276337, + "learning_rate": 1.2337797268027475e-07, + "loss": 0.4246, + "step": 1061 + }, + { + "epoch": 6.515337423312883, + "grad_norm": 0.19599393010139465, + "learning_rate": 1.2032515542141188e-07, + "loss": 0.4089, + "step": 1062 + }, + { + "epoch": 6.521472392638037, + "grad_norm": 0.20901450514793396, + "learning_rate": 1.1731012168688715e-07, + "loss": 0.4031, + "step": 1063 + }, + { + "epoch": 6.52760736196319, + "grad_norm": 0.20133835077285767, + "learning_rate": 1.1433289482223276e-07, + "loss": 0.429, + "step": 1064 + }, + { + "epoch": 6.533742331288344, + "grad_norm": 0.2047884315252304, + "learning_rate": 1.1139349788023779e-07, + "loss": 0.4024, + "step": 1065 + }, + { + "epoch": 6.539877300613497, + "grad_norm": 0.18399380147457123, + "learning_rate": 1.084919536207757e-07, + "loss": 0.4377, + "step": 1066 + }, + { + "epoch": 6.54601226993865, + "grad_norm": 0.18939992785453796, + "learning_rate": 1.0562828451062323e-07, + "loss": 0.4141, + "step": 1067 + }, + { + "epoch": 6.552147239263804, + "grad_norm": 0.21095708012580872, + "learning_rate": 1.0280251272328956e-07, + "loss": 0.4187, + "step": 1068 + }, + { + "epoch": 6.558282208588957, + "grad_norm": 0.20305517315864563, + "learning_rate": 1.0001466013884131e-07, + "loss": 0.4238, + "step": 1069 + }, + { + "epoch": 6.564417177914111, + "grad_norm": 0.20672884583473206, + "learning_rate": 9.726474834373833e-08, + "loss": 0.3993, + "step": 1070 + }, + { + "epoch": 6.570552147239264, + "grad_norm": 0.19539766013622284, + "learning_rate": 9.455279863065936e-08, + "loss": 0.422, + "step": 1071 + }, + { + "epoch": 6.576687116564417, + "grad_norm": 0.2007475346326828, + "learning_rate": 9.187883199834491e-08, + "loss": 0.3905, + "step": 1072 + }, + { + "epoch": 6.58282208588957, + "grad_norm": 0.2838771641254425, + "learning_rate": 8.924286915142854e-08, + "loss": 0.4672, + "step": 1073 + }, + { + "epoch": 6.588957055214724, + "grad_norm": 0.19780975580215454, + "learning_rate": 8.664493050028033e-08, + "loss": 0.4534, + "step": 1074 + }, + { + "epoch": 6.595092024539877, + "grad_norm": 0.18463049829006195, + "learning_rate": 8.40850361608464e-08, + "loss": 0.4187, + "step": 1075 + }, + { + "epoch": 6.601226993865031, + "grad_norm": 0.20436058938503265, + "learning_rate": 8.156320595449463e-08, + "loss": 0.4615, + "step": 1076 + }, + { + "epoch": 6.6073619631901845, + "grad_norm": 0.1944579929113388, + "learning_rate": 7.907945940786033e-08, + "loss": 0.4576, + "step": 1077 + }, + { + "epoch": 6.613496932515337, + "grad_norm": 0.20546551048755646, + "learning_rate": 7.66338157526969e-08, + "loss": 0.4644, + "step": 1078 + }, + { + "epoch": 6.61963190184049, + "grad_norm": 0.1846093386411667, + "learning_rate": 7.422629392572323e-08, + "loss": 0.4321, + "step": 1079 + }, + { + "epoch": 6.625766871165644, + "grad_norm": 0.1991308182477951, + "learning_rate": 7.185691256848093e-08, + "loss": 0.441, + "step": 1080 + }, + { + "epoch": 6.631901840490798, + "grad_norm": 0.1854790449142456, + "learning_rate": 6.95256900271879e-08, + "loss": 0.4384, + "step": 1081 + }, + { + "epoch": 6.638036809815951, + "grad_norm": 0.20065605640411377, + "learning_rate": 6.723264435259725e-08, + "loss": 0.4157, + "step": 1082 + }, + { + "epoch": 6.644171779141105, + "grad_norm": 0.1937716007232666, + "learning_rate": 6.497779329985631e-08, + "loss": 0.4157, + "step": 1083 + }, + { + "epoch": 6.6503067484662575, + "grad_norm": 0.18098995089530945, + "learning_rate": 6.27611543283696e-08, + "loss": 0.4376, + "step": 1084 + }, + { + "epoch": 6.656441717791411, + "grad_norm": 0.1942816972732544, + "learning_rate": 6.058274460166547e-08, + "loss": 0.4482, + "step": 1085 + }, + { + "epoch": 6.662576687116564, + "grad_norm": 0.1918404996395111, + "learning_rate": 5.8442580987259637e-08, + "loss": 0.41, + "step": 1086 + }, + { + "epoch": 6.668711656441718, + "grad_norm": 0.20013810694217682, + "learning_rate": 5.634068005652804e-08, + "loss": 0.383, + "step": 1087 + }, + { + "epoch": 6.674846625766871, + "grad_norm": 0.1878250539302826, + "learning_rate": 5.4277058084576394e-08, + "loss": 0.4637, + "step": 1088 + }, + { + "epoch": 6.680981595092025, + "grad_norm": 0.20763535797595978, + "learning_rate": 5.225173105011583e-08, + "loss": 0.4177, + "step": 1089 + }, + { + "epoch": 6.6871165644171775, + "grad_norm": 0.19868603348731995, + "learning_rate": 5.026471463533578e-08, + "loss": 0.405, + "step": 1090 + }, + { + "epoch": 6.693251533742331, + "grad_norm": 0.19065415859222412, + "learning_rate": 4.831602422578852e-08, + "loss": 0.3965, + "step": 1091 + }, + { + "epoch": 6.699386503067485, + "grad_norm": 0.2003355175256729, + "learning_rate": 4.640567491026316e-08, + "loss": 0.4255, + "step": 1092 + }, + { + "epoch": 6.705521472392638, + "grad_norm": 0.22053086757659912, + "learning_rate": 4.453368148067405e-08, + "loss": 0.4219, + "step": 1093 + }, + { + "epoch": 6.711656441717792, + "grad_norm": 0.18970713019371033, + "learning_rate": 4.2700058431943694e-08, + "loss": 0.445, + "step": 1094 + }, + { + "epoch": 6.717791411042945, + "grad_norm": 0.19756844639778137, + "learning_rate": 4.090481996189166e-08, + "loss": 0.4183, + "step": 1095 + }, + { + "epoch": 6.723926380368098, + "grad_norm": 0.20232686400413513, + "learning_rate": 3.914797997112307e-08, + "loss": 0.42, + "step": 1096 + }, + { + "epoch": 6.730061349693251, + "grad_norm": 0.1890253722667694, + "learning_rate": 3.7429552062923644e-08, + "loss": 0.4202, + "step": 1097 + }, + { + "epoch": 6.736196319018405, + "grad_norm": 0.18478119373321533, + "learning_rate": 3.574954954315091e-08, + "loss": 0.3997, + "step": 1098 + }, + { + "epoch": 6.742331288343558, + "grad_norm": 0.18402665853500366, + "learning_rate": 3.410798542013483e-08, + "loss": 0.4306, + "step": 1099 + }, + { + "epoch": 6.748466257668712, + "grad_norm": 0.2112373560667038, + "learning_rate": 3.25048724045729e-08, + "loss": 0.4423, + "step": 1100 + }, + { + "epoch": 6.754601226993865, + "grad_norm": 0.18628022074699402, + "learning_rate": 3.0940222909437434e-08, + "loss": 0.4172, + "step": 1101 + }, + { + "epoch": 6.7607361963190185, + "grad_norm": 0.18921811878681183, + "learning_rate": 2.9414049049872883e-08, + "loss": 0.4129, + "step": 1102 + }, + { + "epoch": 6.766871165644172, + "grad_norm": 0.20240849256515503, + "learning_rate": 2.7926362643106997e-08, + "loss": 0.4621, + "step": 1103 + }, + { + "epoch": 6.773006134969325, + "grad_norm": 0.18727080523967743, + "learning_rate": 2.647717520835702e-08, + "loss": 0.4166, + "step": 1104 + }, + { + "epoch": 6.779141104294479, + "grad_norm": 0.19762536883354187, + "learning_rate": 2.5066497966741987e-08, + "loss": 0.4223, + "step": 1105 + }, + { + "epoch": 6.785276073619632, + "grad_norm": 0.2059212028980255, + "learning_rate": 2.3694341841193902e-08, + "loss": 0.4323, + "step": 1106 + }, + { + "epoch": 6.791411042944786, + "grad_norm": 0.21039064228534698, + "learning_rate": 2.236071745637336e-08, + "loss": 0.4219, + "step": 1107 + }, + { + "epoch": 6.7975460122699385, + "grad_norm": 0.1902393400669098, + "learning_rate": 2.1065635138590724e-08, + "loss": 0.4157, + "step": 1108 + }, + { + "epoch": 6.803680981595092, + "grad_norm": 0.20540878176689148, + "learning_rate": 1.980910491572119e-08, + "loss": 0.4339, + "step": 1109 + }, + { + "epoch": 6.809815950920245, + "grad_norm": 0.22091785073280334, + "learning_rate": 1.8591136517130404e-08, + "loss": 0.4306, + "step": 1110 + }, + { + "epoch": 6.815950920245399, + "grad_norm": 0.20457226037979126, + "learning_rate": 1.7411739373598413e-08, + "loss": 0.4332, + "step": 1111 + }, + { + "epoch": 6.822085889570552, + "grad_norm": 0.23524275422096252, + "learning_rate": 1.627092261724583e-08, + "loss": 0.4358, + "step": 1112 + }, + { + "epoch": 6.828220858895706, + "grad_norm": 0.18031160533428192, + "learning_rate": 1.5168695081463904e-08, + "loss": 0.4402, + "step": 1113 + }, + { + "epoch": 6.8343558282208585, + "grad_norm": 0.21345287561416626, + "learning_rate": 1.410506530084621e-08, + "loss": 0.4317, + "step": 1114 + }, + { + "epoch": 6.840490797546012, + "grad_norm": 0.2095663696527481, + "learning_rate": 1.3080041511122077e-08, + "loss": 0.4239, + "step": 1115 + }, + { + "epoch": 6.846625766871165, + "grad_norm": 0.2007758915424347, + "learning_rate": 1.2093631649093828e-08, + "loss": 0.3961, + "step": 1116 + }, + { + "epoch": 6.852760736196319, + "grad_norm": 0.20119042694568634, + "learning_rate": 1.1145843352572406e-08, + "loss": 0.441, + "step": 1117 + }, + { + "epoch": 6.858895705521473, + "grad_norm": 0.18511272966861725, + "learning_rate": 1.0236683960323512e-08, + "loss": 0.4392, + "step": 1118 + }, + { + "epoch": 6.865030674846626, + "grad_norm": 0.198281928896904, + "learning_rate": 9.36616051200434e-09, + "loss": 0.4244, + "step": 1119 + }, + { + "epoch": 6.871165644171779, + "grad_norm": 0.18932987749576569, + "learning_rate": 8.53427974811527e-09, + "loss": 0.4387, + "step": 1120 + }, + { + "epoch": 6.877300613496932, + "grad_norm": 0.19044747948646545, + "learning_rate": 7.74104810994325e-09, + "loss": 0.4252, + "step": 1121 + }, + { + "epoch": 6.883435582822086, + "grad_norm": 0.1831924319267273, + "learning_rate": 6.986471739513501e-09, + "loss": 0.4281, + "step": 1122 + }, + { + "epoch": 6.889570552147239, + "grad_norm": 0.18994773924350739, + "learning_rate": 6.270556479541778e-09, + "loss": 0.4423, + "step": 1123 + }, + { + "epoch": 6.895705521472393, + "grad_norm": 0.20347394049167633, + "learning_rate": 5.593307873389963e-09, + "loss": 0.4119, + "step": 1124 + }, + { + "epoch": 6.901840490797546, + "grad_norm": 0.19256287813186646, + "learning_rate": 4.954731165022209e-09, + "loss": 0.4218, + "step": 1125 + }, + { + "epoch": 6.9079754601226995, + "grad_norm": 0.2047315537929535, + "learning_rate": 4.354831298963858e-09, + "loss": 0.428, + "step": 1126 + }, + { + "epoch": 6.914110429447852, + "grad_norm": 0.2375705987215042, + "learning_rate": 3.7936129202648106e-09, + "loss": 0.4402, + "step": 1127 + }, + { + "epoch": 6.920245398773006, + "grad_norm": 0.19552327692508698, + "learning_rate": 3.271080374462332e-09, + "loss": 0.4127, + "step": 1128 + }, + { + "epoch": 6.92638036809816, + "grad_norm": 0.22805239260196686, + "learning_rate": 2.7872377075471856e-09, + "loss": 0.4411, + "step": 1129 + }, + { + "epoch": 6.932515337423313, + "grad_norm": 0.2000674456357956, + "learning_rate": 2.3420886659331067e-09, + "loss": 0.4226, + "step": 1130 + }, + { + "epoch": 6.938650306748467, + "grad_norm": 0.22863265872001648, + "learning_rate": 1.9356366964279338e-09, + "loss": 0.4253, + "step": 1131 + }, + { + "epoch": 6.9447852760736195, + "grad_norm": 0.20869435369968414, + "learning_rate": 1.5678849462058554e-09, + "loss": 0.4374, + "step": 1132 + }, + { + "epoch": 6.950920245398773, + "grad_norm": 0.20107249915599823, + "learning_rate": 1.2388362627840934e-09, + "loss": 0.4414, + "step": 1133 + }, + { + "epoch": 6.957055214723926, + "grad_norm": 0.1913885623216629, + "learning_rate": 9.484931940001442e-10, + "loss": 0.4179, + "step": 1134 + }, + { + "epoch": 6.96319018404908, + "grad_norm": 0.19177880883216858, + "learning_rate": 6.968579879923498e-10, + "loss": 0.438, + "step": 1135 + }, + { + "epoch": 6.969325153374233, + "grad_norm": 0.21289180219173431, + "learning_rate": 4.8393259318269e-10, + "loss": 0.4331, + "step": 1136 + }, + { + "epoch": 6.975460122699387, + "grad_norm": 0.20402106642723083, + "learning_rate": 3.097186582606826e-10, + "loss": 0.4162, + "step": 1137 + }, + { + "epoch": 6.9815950920245395, + "grad_norm": 0.19780924916267395, + "learning_rate": 1.7421753217283788e-10, + "loss": 0.4331, + "step": 1138 + }, + { + "epoch": 6.987730061349693, + "grad_norm": 0.19273407757282257, + "learning_rate": 7.743026410989007e-11, + "loss": 0.3837, + "step": 1139 + }, + { + "epoch": 6.993865030674847, + "grad_norm": 0.19189181923866272, + "learning_rate": 1.9357603499026157e-11, + "loss": 0.4353, + "step": 1140 + }, + { + "epoch": 7.0, + "grad_norm": 0.21704550087451935, + "learning_rate": 0.0, + "loss": 0.4444, + "step": 1141 + }, + { + "epoch": 7.0, + "step": 1141, + "total_flos": 2.996752297507999e+19, + "train_loss": 0.44955052823272323, + "train_runtime": 45593.8747, + "train_samples_per_second": 3.202, + "train_steps_per_second": 0.025 + } + ], + "logging_steps": 1, + "max_steps": 1141, + "num_input_tokens_seen": 0, + "num_train_epochs": 7, + "save_steps": 1000000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.996752297507999e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}