diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,3826 +1,3805 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 3.0, + "epoch": 2.988929889298893, "eval_steps": 500, - "global_step": 543, + "global_step": 540, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0055248618784530384, - "grad_norm": 4.8501728653187755, - "learning_rate": 1.8181818181818183e-07, - "loss": 0.882, + "epoch": 0.005535055350553505, + "grad_norm": 4.88795566772026, + "learning_rate": 1.8518518518518518e-07, + "loss": 0.8842, "step": 1 }, { - "epoch": 0.011049723756906077, - "grad_norm": 5.284618547033595, - "learning_rate": 3.6363636363636366e-07, - "loss": 0.9499, + "epoch": 0.01107011070110701, + "grad_norm": 5.294943049287627, + "learning_rate": 3.7037037037037036e-07, + "loss": 0.9582, "step": 2 }, { - "epoch": 0.016574585635359115, - "grad_norm": 4.71655301867202, - "learning_rate": 5.454545454545455e-07, - "loss": 0.8375, + "epoch": 0.016605166051660517, + "grad_norm": 4.721450383705418, + "learning_rate": 5.555555555555555e-07, + "loss": 0.8326, "step": 3 }, { - "epoch": 0.022099447513812154, - "grad_norm": 4.691610018967258, - "learning_rate": 7.272727272727273e-07, - "loss": 0.8694, + "epoch": 0.02214022140221402, + "grad_norm": 4.702865900802319, + "learning_rate": 7.407407407407407e-07, + "loss": 0.8705, "step": 4 }, { - "epoch": 0.027624309392265192, - "grad_norm": 4.525811293283286, - "learning_rate": 9.090909090909091e-07, - "loss": 0.7922, + "epoch": 0.027675276752767528, + "grad_norm": 4.588661299548798, + "learning_rate": 9.259259259259259e-07, + "loss": 0.7979, "step": 5 }, { - "epoch": 0.03314917127071823, - "grad_norm": 4.515684677465201, - "learning_rate": 1.090909090909091e-06, - "loss": 0.8158, + "epoch": 0.033210332103321034, + "grad_norm": 4.514875582905285, + "learning_rate": 1.111111111111111e-06, + "loss": 0.8133, "step": 6 }, { - "epoch": 0.03867403314917127, - "grad_norm": 4.521464462348852, - "learning_rate": 1.2727272727272728e-06, - "loss": 0.8463, + "epoch": 0.03874538745387454, + "grad_norm": 4.3889371427334725, + "learning_rate": 1.2962962962962962e-06, + "loss": 0.8289, "step": 7 }, { - "epoch": 0.04419889502762431, - "grad_norm": 4.01317039922344, - "learning_rate": 1.4545454545454546e-06, - "loss": 0.7804, + "epoch": 0.04428044280442804, + "grad_norm": 3.681708725528932, + "learning_rate": 1.4814814814814815e-06, + "loss": 0.7494, "step": 8 }, { - "epoch": 0.049723756906077346, - "grad_norm": 3.725743569896934, - "learning_rate": 1.6363636363636365e-06, - "loss": 0.8386, + "epoch": 0.04981549815498155, + "grad_norm": 3.7696517468375483, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.8392, "step": 9 }, { - "epoch": 0.055248618784530384, - "grad_norm": 3.547034380075145, - "learning_rate": 1.8181818181818183e-06, - "loss": 0.7804, + "epoch": 0.055350553505535055, + "grad_norm": 3.5736119140088474, + "learning_rate": 1.8518518518518519e-06, + "loss": 0.7802, "step": 10 }, { - "epoch": 0.06077348066298342, - "grad_norm": 2.385734305081293, - "learning_rate": 2.0000000000000003e-06, - "loss": 0.7536, + "epoch": 0.06088560885608856, + "grad_norm": 2.352449321846944, + "learning_rate": 2.037037037037037e-06, + "loss": 0.7531, "step": 11 }, { - "epoch": 0.06629834254143646, - "grad_norm": 2.118903283570385, - "learning_rate": 2.181818181818182e-06, - "loss": 0.8003, + "epoch": 0.06642066420664207, + "grad_norm": 2.1396669501274728, + "learning_rate": 2.222222222222222e-06, + "loss": 0.8033, "step": 12 }, { - "epoch": 0.0718232044198895, - "grad_norm": 2.071463502085266, - "learning_rate": 2.363636363636364e-06, - "loss": 0.7728, + "epoch": 0.07195571955719557, + "grad_norm": 2.0686046715471744, + "learning_rate": 2.4074074074074075e-06, + "loss": 0.7739, "step": 13 }, { - "epoch": 0.07734806629834254, - "grad_norm": 2.11975896906294, - "learning_rate": 2.5454545454545456e-06, - "loss": 0.7816, + "epoch": 0.07749077490774908, + "grad_norm": 2.112065821246985, + "learning_rate": 2.5925925925925925e-06, + "loss": 0.7789, "step": 14 }, { - "epoch": 0.08287292817679558, - "grad_norm": 1.6109945627440467, - "learning_rate": 2.7272727272727272e-06, - "loss": 0.7138, + "epoch": 0.08302583025830258, + "grad_norm": 1.6986741641118395, + "learning_rate": 2.7777777777777783e-06, + "loss": 0.7151, "step": 15 }, { - "epoch": 0.08839779005524862, - "grad_norm": 1.946567883491615, - "learning_rate": 2.9090909090909093e-06, - "loss": 0.7254, + "epoch": 0.08856088560885608, + "grad_norm": 1.9671299367169393, + "learning_rate": 2.962962962962963e-06, + "loss": 0.7321, "step": 16 }, { - "epoch": 0.09392265193370165, - "grad_norm": 1.7266080007457147, - "learning_rate": 3.090909090909091e-06, - "loss": 0.7294, + "epoch": 0.0940959409594096, + "grad_norm": 1.727327433279914, + "learning_rate": 3.1481481481481483e-06, + "loss": 0.7381, "step": 17 }, { - "epoch": 0.09944751381215469, - "grad_norm": 1.6539369196579166, - "learning_rate": 3.272727272727273e-06, - "loss": 0.711, + "epoch": 0.0996309963099631, + "grad_norm": 1.65624237671333, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.7159, "step": 18 }, { - "epoch": 0.10497237569060773, - "grad_norm": 1.3760121270687458, - "learning_rate": 3.454545454545455e-06, - "loss": 0.6706, + "epoch": 0.10516605166051661, + "grad_norm": 1.3535125694848185, + "learning_rate": 3.5185185185185187e-06, + "loss": 0.6755, "step": 19 }, { - "epoch": 0.11049723756906077, - "grad_norm": 1.2322332262531337, - "learning_rate": 3.6363636363636366e-06, - "loss": 0.714, + "epoch": 0.11070110701107011, + "grad_norm": 1.2125088936587491, + "learning_rate": 3.7037037037037037e-06, + "loss": 0.7032, "step": 20 }, { - "epoch": 0.11602209944751381, - "grad_norm": 1.092460838605022, - "learning_rate": 3.818181818181819e-06, - "loss": 0.6221, + "epoch": 0.11623616236162361, + "grad_norm": 1.107066250339579, + "learning_rate": 3.88888888888889e-06, + "loss": 0.6217, "step": 21 }, { - "epoch": 0.12154696132596685, - "grad_norm": 1.3847134225952789, - "learning_rate": 4.000000000000001e-06, - "loss": 0.6874, + "epoch": 0.12177121771217712, + "grad_norm": 1.3653673904628112, + "learning_rate": 4.074074074074074e-06, + "loss": 0.6824, "step": 22 }, { - "epoch": 0.1270718232044199, - "grad_norm": 1.2895226334792846, - "learning_rate": 4.181818181818182e-06, - "loss": 0.6741, + "epoch": 0.12730627306273062, + "grad_norm": 1.289828643250104, + "learning_rate": 4.2592592592592596e-06, + "loss": 0.6796, "step": 23 }, { - "epoch": 0.13259668508287292, - "grad_norm": 1.3973784329721743, - "learning_rate": 4.363636363636364e-06, - "loss": 0.8114, + "epoch": 0.13284132841328414, + "grad_norm": 1.379866304591633, + "learning_rate": 4.444444444444444e-06, + "loss": 0.8102, "step": 24 }, { - "epoch": 0.13812154696132597, - "grad_norm": 1.1221564987656045, - "learning_rate": 4.5454545454545455e-06, - "loss": 0.6904, + "epoch": 0.13837638376383765, + "grad_norm": 1.1200972001221432, + "learning_rate": 4.62962962962963e-06, + "loss": 0.6953, "step": 25 }, { - "epoch": 0.143646408839779, - "grad_norm": 1.0813494118759077, - "learning_rate": 4.727272727272728e-06, - "loss": 0.6887, + "epoch": 0.14391143911439114, + "grad_norm": 1.071016299230562, + "learning_rate": 4.814814814814815e-06, + "loss": 0.6914, "step": 26 }, { - "epoch": 0.14917127071823205, - "grad_norm": 1.039002428401238, - "learning_rate": 4.90909090909091e-06, - "loss": 0.7305, + "epoch": 0.14944649446494465, + "grad_norm": 1.007816252436763, + "learning_rate": 5e-06, + "loss": 0.7266, "step": 27 }, { - "epoch": 0.15469613259668508, - "grad_norm": 0.9005740059397435, - "learning_rate": 5.090909090909091e-06, - "loss": 0.7216, + "epoch": 0.15498154981549817, + "grad_norm": 0.8824750556078126, + "learning_rate": 5.185185185185185e-06, + "loss": 0.716, "step": 28 }, { - "epoch": 0.16022099447513813, - "grad_norm": 1.0434084181940664, - "learning_rate": 5.272727272727273e-06, - "loss": 0.7262, + "epoch": 0.16051660516605165, + "grad_norm": 1.0612669135866166, + "learning_rate": 5.370370370370371e-06, + "loss": 0.7289, "step": 29 }, { - "epoch": 0.16574585635359115, - "grad_norm": 0.8640462038859854, - "learning_rate": 5.4545454545454545e-06, - "loss": 0.6818, + "epoch": 0.16605166051660517, + "grad_norm": 0.8626830347073103, + "learning_rate": 5.555555555555557e-06, + "loss": 0.6848, "step": 30 }, { - "epoch": 0.1712707182320442, - "grad_norm": 0.8711602353776259, - "learning_rate": 5.636363636363636e-06, - "loss": 0.6709, + "epoch": 0.17158671586715868, + "grad_norm": 0.8679382464145016, + "learning_rate": 5.740740740740741e-06, + "loss": 0.6615, "step": 31 }, { - "epoch": 0.17679558011049723, - "grad_norm": 0.7874062745697313, - "learning_rate": 5.8181818181818185e-06, - "loss": 0.7056, + "epoch": 0.17712177121771217, + "grad_norm": 0.8023350481646084, + "learning_rate": 5.925925925925926e-06, + "loss": 0.708, "step": 32 }, { - "epoch": 0.18232044198895028, - "grad_norm": 0.910787056447978, - "learning_rate": 6e-06, - "loss": 0.7321, + "epoch": 0.18265682656826568, + "grad_norm": 0.9095885057238613, + "learning_rate": 6.111111111111112e-06, + "loss": 0.7367, "step": 33 }, { - "epoch": 0.1878453038674033, - "grad_norm": 0.8834639779382965, - "learning_rate": 6.181818181818182e-06, - "loss": 0.6943, + "epoch": 0.1881918819188192, + "grad_norm": 0.8887858125212083, + "learning_rate": 6.296296296296297e-06, + "loss": 0.7039, "step": 34 }, { - "epoch": 0.19337016574585636, - "grad_norm": 0.9011491907153977, - "learning_rate": 6.363636363636364e-06, - "loss": 0.7474, + "epoch": 0.1937269372693727, + "grad_norm": 0.9119816001938974, + "learning_rate": 6.481481481481482e-06, + "loss": 0.7479, "step": 35 }, { - "epoch": 0.19889502762430938, - "grad_norm": 0.9863965225958002, - "learning_rate": 6.545454545454546e-06, - "loss": 0.7448, + "epoch": 0.1992619926199262, + "grad_norm": 0.9750213405228529, + "learning_rate": 6.666666666666667e-06, + "loss": 0.7354, "step": 36 }, { - "epoch": 0.20441988950276244, - "grad_norm": 0.804971042531312, - "learning_rate": 6.7272727272727275e-06, - "loss": 0.6026, + "epoch": 0.2047970479704797, + "grad_norm": 0.7893115852707117, + "learning_rate": 6.851851851851853e-06, + "loss": 0.5972, "step": 37 }, { - "epoch": 0.20994475138121546, - "grad_norm": 0.8495022527176822, - "learning_rate": 6.90909090909091e-06, - "loss": 0.686, + "epoch": 0.21033210332103322, + "grad_norm": 0.8249712148416332, + "learning_rate": 7.0370370370370375e-06, + "loss": 0.6766, "step": 38 }, { - "epoch": 0.2154696132596685, - "grad_norm": 0.8536642393909392, - "learning_rate": 7.0909090909090916e-06, - "loss": 0.6863, + "epoch": 0.2158671586715867, + "grad_norm": 0.8548009416827502, + "learning_rate": 7.222222222222223e-06, + "loss": 0.6979, "step": 39 }, { - "epoch": 0.22099447513812154, - "grad_norm": 0.7505296486129153, - "learning_rate": 7.272727272727273e-06, - "loss": 0.6495, + "epoch": 0.22140221402214022, + "grad_norm": 0.7430327142800517, + "learning_rate": 7.4074074074074075e-06, + "loss": 0.6364, "step": 40 }, { - "epoch": 0.2265193370165746, - "grad_norm": 0.8613764113585566, - "learning_rate": 7.454545454545456e-06, - "loss": 0.6903, + "epoch": 0.22693726937269373, + "grad_norm": 0.8804985241917266, + "learning_rate": 7.592592592592594e-06, + "loss": 0.7009, "step": 41 }, { - "epoch": 0.23204419889502761, - "grad_norm": 0.9071577576208497, - "learning_rate": 7.636363636363638e-06, - "loss": 0.6681, + "epoch": 0.23247232472324722, + "grad_norm": 0.8685723187944967, + "learning_rate": 7.77777777777778e-06, + "loss": 0.665, "step": 42 }, { - "epoch": 0.23756906077348067, - "grad_norm": 0.7806943160679269, - "learning_rate": 7.81818181818182e-06, - "loss": 0.6454, + "epoch": 0.23800738007380073, + "grad_norm": 0.7863830014618772, + "learning_rate": 7.962962962962963e-06, + "loss": 0.6506, "step": 43 }, { - "epoch": 0.2430939226519337, - "grad_norm": 0.8542316753334585, - "learning_rate": 8.000000000000001e-06, - "loss": 0.6943, + "epoch": 0.24354243542435425, + "grad_norm": 0.89002673850871, + "learning_rate": 8.148148148148148e-06, + "loss": 0.704, "step": 44 }, { - "epoch": 0.24861878453038674, - "grad_norm": 0.8806155598957247, - "learning_rate": 8.181818181818183e-06, - "loss": 0.6376, + "epoch": 0.24907749077490776, + "grad_norm": 0.9353559189317285, + "learning_rate": 8.333333333333334e-06, + "loss": 0.6407, "step": 45 }, { - "epoch": 0.2541436464088398, - "grad_norm": 0.9140053110890461, - "learning_rate": 8.363636363636365e-06, - "loss": 0.6873, + "epoch": 0.25461254612546125, + "grad_norm": 0.8860584109544853, + "learning_rate": 8.518518518518519e-06, + "loss": 0.6877, "step": 46 }, { - "epoch": 0.2596685082872928, - "grad_norm": 0.9696816209283575, - "learning_rate": 8.545454545454546e-06, - "loss": 0.6275, + "epoch": 0.26014760147601473, + "grad_norm": 1.0345385393930384, + "learning_rate": 8.703703703703705e-06, + "loss": 0.6195, "step": 47 }, { - "epoch": 0.26519337016574585, - "grad_norm": 0.8011554339035484, - "learning_rate": 8.727272727272728e-06, - "loss": 0.7049, + "epoch": 0.2656826568265683, + "grad_norm": 0.8390822845074132, + "learning_rate": 8.888888888888888e-06, + "loss": 0.7143, "step": 48 }, { - "epoch": 0.27071823204419887, - "grad_norm": 0.8214450369964688, - "learning_rate": 8.90909090909091e-06, - "loss": 0.6914, + "epoch": 0.27121771217712176, + "grad_norm": 0.7829416679007993, + "learning_rate": 9.074074074074075e-06, + "loss": 0.6848, "step": 49 }, { - "epoch": 0.27624309392265195, - "grad_norm": 0.9615553631195245, - "learning_rate": 9.090909090909091e-06, - "loss": 0.6982, + "epoch": 0.2767527675276753, + "grad_norm": 1.0206245762170603, + "learning_rate": 9.25925925925926e-06, + "loss": 0.7, "step": 50 }, { - "epoch": 0.281767955801105, - "grad_norm": 0.8354942020446344, - "learning_rate": 9.272727272727273e-06, - "loss": 0.7023, + "epoch": 0.2822878228782288, + "grad_norm": 0.8735389450158357, + "learning_rate": 9.444444444444445e-06, + "loss": 0.7061, "step": 51 }, { - "epoch": 0.287292817679558, - "grad_norm": 0.8299384272303495, - "learning_rate": 9.454545454545456e-06, - "loss": 0.695, + "epoch": 0.2878228782287823, + "grad_norm": 0.813152245836957, + "learning_rate": 9.62962962962963e-06, + "loss": 0.6943, "step": 52 }, { - "epoch": 0.292817679558011, - "grad_norm": 0.919798873510162, - "learning_rate": 9.636363636363638e-06, - "loss": 0.7185, + "epoch": 0.2933579335793358, + "grad_norm": 0.9984159950425506, + "learning_rate": 9.814814814814815e-06, + "loss": 0.736, "step": 53 }, { - "epoch": 0.2983425414364641, - "grad_norm": 0.8232925581988623, - "learning_rate": 9.81818181818182e-06, - "loss": 0.6864, + "epoch": 0.2988929889298893, + "grad_norm": 0.8289001717630332, + "learning_rate": 1e-05, + "loss": 0.6843, "step": 54 }, { - "epoch": 0.30386740331491713, - "grad_norm": 0.7920084479497609, - "learning_rate": 1e-05, - "loss": 0.6191, + "epoch": 0.3044280442804428, + "grad_norm": 0.7804823508929742, + "learning_rate": 9.999895536228031e-06, + "loss": 0.6178, "step": 55 }, { - "epoch": 0.30939226519337015, - "grad_norm": 0.8758601266592275, - "learning_rate": 9.999896390730872e-06, - "loss": 0.6469, + "epoch": 0.30996309963099633, + "grad_norm": 0.8747433453571559, + "learning_rate": 9.999582149277188e-06, + "loss": 0.6482, "step": 56 }, { - "epoch": 0.3149171270718232, - "grad_norm": 1.0142576779420058, - "learning_rate": 9.99958556721744e-06, - "loss": 0.7147, + "epoch": 0.3154981549815498, + "grad_norm": 0.9957208410673493, + "learning_rate": 9.999059852242508e-06, + "loss": 0.7054, "step": 57 }, { - "epoch": 0.32044198895027626, - "grad_norm": 0.788012902837433, - "learning_rate": 9.99906754234138e-06, - "loss": 0.6935, + "epoch": 0.3210332103321033, + "grad_norm": 0.7818825969466411, + "learning_rate": 9.998328666948437e-06, + "loss": 0.6968, "step": 58 }, { - "epoch": 0.3259668508287293, - "grad_norm": 0.8574442174017383, - "learning_rate": 9.998342337571566e-06, - "loss": 0.6852, + "epoch": 0.32656826568265684, + "grad_norm": 0.8999544343447169, + "learning_rate": 9.997388623947927e-06, + "loss": 0.6979, "step": 59 }, { - "epoch": 0.3314917127071823, - "grad_norm": 0.8678503120132645, - "learning_rate": 9.997409982963173e-06, - "loss": 0.594, + "epoch": 0.33210332103321033, + "grad_norm": 0.877731722321392, + "learning_rate": 9.996239762521152e-06, + "loss": 0.5952, "step": 60 }, { - "epoch": 0.3370165745856354, - "grad_norm": 0.7829472641187216, - "learning_rate": 9.996270517156431e-06, - "loss": 0.6359, + "epoch": 0.3376383763837638, + "grad_norm": 0.7490764317219278, + "learning_rate": 9.994882130673869e-06, + "loss": 0.6225, "step": 61 }, { - "epoch": 0.3425414364640884, - "grad_norm": 0.9057620068251797, - "learning_rate": 9.994923987375029e-06, - "loss": 0.6948, + "epoch": 0.34317343173431736, + "grad_norm": 0.8743296067498224, + "learning_rate": 9.993315785135417e-06, + "loss": 0.6924, "step": 62 }, { - "epoch": 0.34806629834254144, - "grad_norm": 0.865674232498531, - "learning_rate": 9.993370449424153e-06, - "loss": 0.73, + "epoch": 0.34870848708487084, + "grad_norm": 0.8559368435923252, + "learning_rate": 9.991540791356342e-06, + "loss": 0.7266, "step": 63 }, { - "epoch": 0.35359116022099446, - "grad_norm": 0.9444683215382766, - "learning_rate": 9.991609967688177e-06, - "loss": 0.7042, + "epoch": 0.35424354243542433, + "grad_norm": 0.8920516831324726, + "learning_rate": 9.989557223505661e-06, + "loss": 0.705, "step": 64 }, { - "epoch": 0.35911602209944754, - "grad_norm": 0.8179346757831395, - "learning_rate": 9.98964261512799e-06, - "loss": 0.7034, + "epoch": 0.35977859778597787, + "grad_norm": 0.8193570293903212, + "learning_rate": 9.987365164467767e-06, + "loss": 0.6945, "step": 65 }, { - "epoch": 0.36464088397790057, - "grad_norm": 0.8424333161772681, - "learning_rate": 9.987468473277975e-06, - "loss": 0.6368, + "epoch": 0.36531365313653136, + "grad_norm": 0.8473504309187051, + "learning_rate": 9.98496470583896e-06, + "loss": 0.6412, "step": 66 }, { - "epoch": 0.3701657458563536, - "grad_norm": 1.1453016585540032, - "learning_rate": 9.985087632242634e-06, - "loss": 0.7059, + "epoch": 0.37084870848708484, + "grad_norm": 1.14566744243091, + "learning_rate": 9.98235594792363e-06, + "loss": 0.7258, "step": 67 }, { - "epoch": 0.3756906077348066, - "grad_norm": 0.8910424821188626, - "learning_rate": 9.982500190692846e-06, - "loss": 0.7011, + "epoch": 0.3763837638376384, + "grad_norm": 0.8763668658056738, + "learning_rate": 9.979538999730047e-06, + "loss": 0.7084, "step": 68 }, { - "epoch": 0.3812154696132597, - "grad_norm": 0.8430209673516222, - "learning_rate": 9.97970625586178e-06, - "loss": 0.7038, + "epoch": 0.38191881918819187, + "grad_norm": 0.8849767742762311, + "learning_rate": 9.976513978965829e-06, + "loss": 0.6984, "step": 69 }, { - "epoch": 0.3867403314917127, - "grad_norm": 1.0961889806167662, - "learning_rate": 9.976705943540458e-06, - "loss": 0.7167, + "epoch": 0.3874538745387454, + "grad_norm": 1.0687474160518997, + "learning_rate": 9.973281012033009e-06, + "loss": 0.7183, "step": 70 }, { - "epoch": 0.39226519337016574, - "grad_norm": 0.776673389768424, - "learning_rate": 9.973499378072947e-06, - "loss": 0.6383, + "epoch": 0.3929889298892989, + "grad_norm": 0.781213177981436, + "learning_rate": 9.96984023402275e-06, + "loss": 0.6504, "step": 71 }, { - "epoch": 0.39779005524861877, - "grad_norm": 0.7677036148013285, - "learning_rate": 9.970086692351204e-06, - "loss": 0.6711, + "epoch": 0.3985239852398524, + "grad_norm": 0.7379835559329746, + "learning_rate": 9.966191788709716e-06, + "loss": 0.6661, "step": 72 }, { - "epoch": 0.40331491712707185, - "grad_norm": 0.7769311932254055, - "learning_rate": 9.966468027809582e-06, - "loss": 0.6631, + "epoch": 0.4040590405904059, + "grad_norm": 0.7984703551016957, + "learning_rate": 9.962335828546049e-06, + "loss": 0.6613, "step": 73 }, { - "epoch": 0.4088397790055249, - "grad_norm": 1.345391434823482, - "learning_rate": 9.962643534418954e-06, - "loss": 0.7296, + "epoch": 0.4095940959409594, + "grad_norm": 1.2794348232710167, + "learning_rate": 9.958272514655006e-06, + "loss": 0.7307, "step": 74 }, { - "epoch": 0.4143646408839779, - "grad_norm": 0.8085666056467051, - "learning_rate": 9.958613370680507e-06, - "loss": 0.6619, + "epoch": 0.4151291512915129, + "grad_norm": 0.7630430424338107, + "learning_rate": 9.954002016824226e-06, + "loss": 0.665, "step": 75 }, { - "epoch": 0.4198895027624309, - "grad_norm": 0.726842812492243, - "learning_rate": 9.954377703619171e-06, - "loss": 0.6449, + "epoch": 0.42066420664206644, + "grad_norm": 0.7527869433219629, + "learning_rate": 9.949524513498636e-06, + "loss": 0.6424, "step": 76 }, { - "epoch": 0.425414364640884, - "grad_norm": 1.0071149507052566, - "learning_rate": 9.949936708776692e-06, - "loss": 0.761, + "epoch": 0.4261992619926199, + "grad_norm": 1.0051926115967615, + "learning_rate": 9.944840191772987e-06, + "loss": 0.7625, "step": 77 }, { - "epoch": 0.430939226519337, - "grad_norm": 0.8013700509400903, - "learning_rate": 9.945290570204361e-06, - "loss": 0.6456, + "epoch": 0.4317343173431734, + "grad_norm": 0.7754322119272198, + "learning_rate": 9.939949247384046e-06, + "loss": 0.648, "step": 78 }, { - "epoch": 0.43646408839779005, - "grad_norm": 0.7153613260752815, - "learning_rate": 9.940439480455386e-06, - "loss": 0.6883, + "epoch": 0.43726937269372695, + "grad_norm": 0.7146858924414144, + "learning_rate": 9.934851884702415e-06, + "loss": 0.6895, "step": 79 }, { - "epoch": 0.4419889502762431, - "grad_norm": 0.8042269533559475, - "learning_rate": 9.935383640576915e-06, - "loss": 0.7192, + "epoch": 0.44280442804428044, + "grad_norm": 0.8084465018612776, + "learning_rate": 9.929548316723983e-06, + "loss": 0.7189, "step": 80 }, { - "epoch": 0.44751381215469616, - "grad_norm": 0.8007389578297808, - "learning_rate": 9.930123260101697e-06, - "loss": 0.7022, + "epoch": 0.4483394833948339, + "grad_norm": 0.8066377981893613, + "learning_rate": 9.924038765061042e-06, + "loss": 0.6995, "step": 81 }, { - "epoch": 0.4530386740331492, - "grad_norm": 0.7451865462622602, - "learning_rate": 9.9246585570394e-06, - "loss": 0.653, + "epoch": 0.45387453874538747, + "grad_norm": 0.738107027929258, + "learning_rate": 9.918323459933006e-06, + "loss": 0.6529, "step": 82 }, { - "epoch": 0.4585635359116022, - "grad_norm": 0.7629901737313624, - "learning_rate": 9.918989757867584e-06, - "loss": 0.7316, + "epoch": 0.45940959409594095, + "grad_norm": 0.7741808681072159, + "learning_rate": 9.912402640156812e-06, + "loss": 0.7351, "step": 83 }, { - "epoch": 0.46408839779005523, - "grad_norm": 0.694181698027841, - "learning_rate": 9.9131170975223e-06, - "loss": 0.632, + "epoch": 0.46494464944649444, + "grad_norm": 0.7180376057707722, + "learning_rate": 9.906276553136924e-06, + "loss": 0.6305, "step": 84 }, { - "epoch": 0.4696132596685083, - "grad_norm": 0.8031579121005685, - "learning_rate": 9.907040819388372e-06, - "loss": 0.7125, + "epoch": 0.470479704797048, + "grad_norm": 0.8145618675350315, + "learning_rate": 9.899945454855007e-06, + "loss": 0.7026, "step": 85 }, { - "epoch": 0.47513812154696133, - "grad_norm": 0.8027715808171925, - "learning_rate": 9.90076117528929e-06, - "loss": 0.7054, + "epoch": 0.47601476014760147, + "grad_norm": 0.8245175153559692, + "learning_rate": 9.893409609859221e-06, + "loss": 0.7033, "step": 86 }, { - "epoch": 0.48066298342541436, - "grad_norm": 0.684436413306382, - "learning_rate": 9.89427842547679e-06, - "loss": 0.6908, + "epoch": 0.48154981549815495, + "grad_norm": 0.7026182893077407, + "learning_rate": 9.886669291253178e-06, + "loss": 0.6886, "step": 87 }, { - "epoch": 0.4861878453038674, - "grad_norm": 0.7492065837893762, - "learning_rate": 9.88759283862006e-06, - "loss": 0.6951, + "epoch": 0.4870848708487085, + "grad_norm": 0.7364819802578174, + "learning_rate": 9.879724780684518e-06, + "loss": 0.6942, "step": 88 }, { - "epoch": 0.49171270718232046, - "grad_norm": 0.815556456770822, - "learning_rate": 9.880704691794608e-06, - "loss": 0.6829, + "epoch": 0.492619926199262, + "grad_norm": 0.7981620114825012, + "learning_rate": 9.872576368333152e-06, + "loss": 0.6716, "step": 89 }, { - "epoch": 0.4972375690607735, - "grad_norm": 0.848254246152851, - "learning_rate": 9.873614270470778e-06, - "loss": 0.704, + "epoch": 0.4981549815498155, + "grad_norm": 0.8836384710800025, + "learning_rate": 9.86522435289912e-06, + "loss": 0.7237, "step": 90 }, { - "epoch": 0.5027624309392266, - "grad_norm": 0.7783965679247608, - "learning_rate": 9.866321868501914e-06, - "loss": 0.7535, + "epoch": 0.503690036900369, + "grad_norm": 0.7914423436808887, + "learning_rate": 9.857669041590135e-06, + "loss": 0.7616, "step": 91 }, { - "epoch": 0.5082872928176796, - "grad_norm": 0.7186199448098722, - "learning_rate": 9.858827788112195e-06, - "loss": 0.6771, + "epoch": 0.5092250922509225, + "grad_norm": 0.7232583216067142, + "learning_rate": 9.849910750108718e-06, + "loss": 0.6715, "step": 92 }, { - "epoch": 0.5138121546961326, - "grad_norm": 0.8382444185720348, - "learning_rate": 9.851132339884097e-06, - "loss": 0.6821, + "epoch": 0.514760147601476, + "grad_norm": 0.8941702082274658, + "learning_rate": 9.841949802639031e-06, + "loss": 0.6854, "step": 93 }, { - "epoch": 0.5193370165745856, - "grad_norm": 0.7135375433267422, - "learning_rate": 9.843235842745527e-06, - "loss": 0.6985, + "epoch": 0.5202952029520295, + "grad_norm": 0.7537078088279118, + "learning_rate": 9.833786531833311e-06, + "loss": 0.6934, "step": 94 }, { - "epoch": 0.5248618784530387, - "grad_norm": 0.7467452705314399, - "learning_rate": 9.835138623956603e-06, - "loss": 0.6875, + "epoch": 0.525830258302583, + "grad_norm": 0.7496363805903822, + "learning_rate": 9.825421278797984e-06, + "loss": 0.6757, "step": 95 }, { - "epoch": 0.5303867403314917, - "grad_norm": 0.6945947858932874, - "learning_rate": 9.826841019096095e-06, - "loss": 0.6488, + "epoch": 0.5313653136531366, + "grad_norm": 0.7811094162416035, + "learning_rate": 9.816854393079402e-06, + "loss": 0.6575, "step": 96 }, { - "epoch": 0.5359116022099447, - "grad_norm": 0.7817121402690996, - "learning_rate": 9.818343372047509e-06, - "loss": 0.6749, + "epoch": 0.5369003690036901, + "grad_norm": 0.80064135608355, + "learning_rate": 9.808086232649246e-06, + "loss": 0.6753, "step": 97 }, { - "epoch": 0.5414364640883977, - "grad_norm": 0.7141735093592115, - "learning_rate": 9.80964603498485e-06, - "loss": 0.6468, + "epoch": 0.5424354243542435, + "grad_norm": 0.7536916723498005, + "learning_rate": 9.79911716388956e-06, + "loss": 0.6473, "step": 98 }, { - "epoch": 0.5469613259668509, - "grad_norm": 0.7962920109310585, - "learning_rate": 9.80074936835801e-06, - "loss": 0.7068, + "epoch": 0.5479704797047971, + "grad_norm": 0.8008444136665656, + "learning_rate": 9.789947561577445e-06, + "loss": 0.6942, "step": 99 }, { - "epoch": 0.5524861878453039, - "grad_norm": 0.7317304477983744, - "learning_rate": 9.79165374087784e-06, - "loss": 0.7422, + "epoch": 0.5535055350553506, + "grad_norm": 0.7175301808160046, + "learning_rate": 9.7805778088694e-06, + "loss": 0.7231, "step": 100 }, { - "epoch": 0.5580110497237569, - "grad_norm": 0.7000495326190369, - "learning_rate": 9.782359529500867e-06, - "loss": 0.6541, + "epoch": 0.559040590405904, + "grad_norm": 0.6865056427054099, + "learning_rate": 9.771008297285307e-06, + "loss": 0.6522, "step": 101 }, { - "epoch": 0.56353591160221, - "grad_norm": 0.669926861217796, - "learning_rate": 9.772867119413667e-06, - "loss": 0.6509, + "epoch": 0.5645756457564576, + "grad_norm": 0.6945540437205031, + "learning_rate": 9.761239426692077e-06, + "loss": 0.659, "step": 102 }, { - "epoch": 0.569060773480663, - "grad_norm": 0.7233355042750117, - "learning_rate": 9.763176904016914e-06, - "loss": 0.7281, + "epoch": 0.5701107011070111, + "grad_norm": 0.7631536227470039, + "learning_rate": 9.75127160528694e-06, + "loss": 0.7251, "step": 103 }, { - "epoch": 0.574585635359116, - "grad_norm": 0.7297609103522801, - "learning_rate": 9.753289284909058e-06, - "loss": 0.6853, + "epoch": 0.5756457564575646, + "grad_norm": 0.7528731350152382, + "learning_rate": 9.741105249580383e-06, + "loss": 0.6988, "step": 104 }, { - "epoch": 0.580110497237569, - "grad_norm": 0.7837111907679791, - "learning_rate": 9.743204671869694e-06, - "loss": 0.7378, + "epoch": 0.5811808118081181, + "grad_norm": 0.7900410030089848, + "learning_rate": 9.730740784378755e-06, + "loss": 0.7346, "step": 105 }, { - "epoch": 0.585635359116022, - "grad_norm": 0.695348976767688, - "learning_rate": 9.73292348284258e-06, - "loss": 0.6548, + "epoch": 0.5867158671586716, + "grad_norm": 0.7179179917128015, + "learning_rate": 9.7201786427665e-06, + "loss": 0.6467, "step": 106 }, { - "epoch": 0.5911602209944752, - "grad_norm": 0.7062726297235884, - "learning_rate": 9.722446143918307e-06, - "loss": 0.7006, + "epoch": 0.5922509225092251, + "grad_norm": 0.754739804813151, + "learning_rate": 9.709419266088086e-06, + "loss": 0.7126, "step": 107 }, { - "epoch": 0.5966850828729282, - "grad_norm": 0.6884204999018269, - "learning_rate": 9.711773089316645e-06, - "loss": 0.6197, + "epoch": 0.5977859778597786, + "grad_norm": 0.7167700794769842, + "learning_rate": 9.698463103929542e-06, + "loss": 0.6259, "step": 108 }, { - "epoch": 0.6022099447513812, - "grad_norm": 0.7974626566246549, - "learning_rate": 9.70090476136855e-06, - "loss": 0.7452, + "epoch": 0.6033210332103321, + "grad_norm": 0.862393241288747, + "learning_rate": 9.687310614099676e-06, + "loss": 0.7462, "step": 109 }, { - "epoch": 0.6077348066298343, - "grad_norm": 0.7519249005086731, - "learning_rate": 9.689841610497828e-06, - "loss": 0.687, + "epoch": 0.6088560885608856, + "grad_norm": 0.7999243808596918, + "learning_rate": 9.67596226261095e-06, + "loss": 0.6879, "step": 110 }, { - "epoch": 0.6132596685082873, - "grad_norm": 0.7530757425183137, - "learning_rate": 9.678584095202468e-06, - "loss": 0.7223, + "epoch": 0.6143911439114391, + "grad_norm": 0.7608890684314967, + "learning_rate": 9.664418523660004e-06, + "loss": 0.7107, "step": 111 }, { - "epoch": 0.6187845303867403, - "grad_norm": 0.8292920525912827, - "learning_rate": 9.667132682035646e-06, - "loss": 0.6301, + "epoch": 0.6199261992619927, + "grad_norm": 0.8642130795032539, + "learning_rate": 9.652679879607843e-06, + "loss": 0.6217, "step": 112 }, { - "epoch": 0.6243093922651933, - "grad_norm": 0.7985728833994651, - "learning_rate": 9.655487845586378e-06, - "loss": 0.6117, + "epoch": 0.6254612546125461, + "grad_norm": 0.8331393122517589, + "learning_rate": 9.640746820959684e-06, + "loss": 0.621, "step": 113 }, { - "epoch": 0.6298342541436464, - "grad_norm": 0.675379570131735, - "learning_rate": 9.643650068459863e-06, - "loss": 0.5902, + "epoch": 0.6309963099630996, + "grad_norm": 0.6800899888090565, + "learning_rate": 9.628619846344453e-06, + "loss": 0.5866, "step": 114 }, { - "epoch": 0.6353591160220995, - "grad_norm": 0.7621705816706639, - "learning_rate": 9.631619841257477e-06, - "loss": 0.6144, + "epoch": 0.6365313653136532, + "grad_norm": 0.7683010600627519, + "learning_rate": 9.616299462493952e-06, + "loss": 0.6161, "step": 115 }, { - "epoch": 0.6408839779005525, - "grad_norm": 0.7372229704093334, - "learning_rate": 9.619397662556434e-06, - "loss": 0.6444, + "epoch": 0.6420664206642066, + "grad_norm": 0.8092930286332166, + "learning_rate": 9.603786184221693e-06, + "loss": 0.652, "step": 116 }, { - "epoch": 0.6464088397790055, - "grad_norm": 0.7171400121498231, - "learning_rate": 9.606984038889142e-06, - "loss": 0.6478, + "epoch": 0.6476014760147601, + "grad_norm": 0.75053691727633, + "learning_rate": 9.591080534401371e-06, + "loss": 0.6423, "step": 117 }, { - "epoch": 0.6519337016574586, - "grad_norm": 0.7162391776467386, - "learning_rate": 9.594379484722185e-06, - "loss": 0.6368, + "epoch": 0.6531365313653137, + "grad_norm": 0.713112544635858, + "learning_rate": 9.578183043945031e-06, + "loss": 0.6363, "step": 118 }, { - "epoch": 0.6574585635359116, - "grad_norm": 0.8119977538487404, - "learning_rate": 9.581584522435025e-06, - "loss": 0.6971, + "epoch": 0.6586715867158671, + "grad_norm": 0.8815831434837667, + "learning_rate": 9.565094251780872e-06, + "loss": 0.7031, "step": 119 }, { - "epoch": 0.6629834254143646, - "grad_norm": 0.7697231468364938, - "learning_rate": 9.568599682298337e-06, - "loss": 0.6323, + "epoch": 0.6642066420664207, + "grad_norm": 0.8001162241305896, + "learning_rate": 9.551814704830734e-06, + "loss": 0.6202, "step": 120 }, { - "epoch": 0.6685082872928176, - "grad_norm": 0.7222511824646166, - "learning_rate": 9.555425502452038e-06, - "loss": 0.7068, + "epoch": 0.6697416974169742, + "grad_norm": 0.7135742572247368, + "learning_rate": 9.538344957987245e-06, + "loss": 0.704, "step": 121 }, { - "epoch": 0.6740331491712708, - "grad_norm": 0.7059581814243575, - "learning_rate": 9.542062528882989e-06, - "loss": 0.6758, + "epoch": 0.6752767527675276, + "grad_norm": 0.7687650494748194, + "learning_rate": 9.524685574090627e-06, + "loss": 0.6892, "step": 122 }, { - "epoch": 0.6795580110497238, - "grad_norm": 0.7911262183983974, - "learning_rate": 9.528511315402358e-06, - "loss": 0.6844, + "epoch": 0.6808118081180812, + "grad_norm": 0.8827181678396219, + "learning_rate": 9.51083712390519e-06, + "loss": 0.685, "step": 123 }, { - "epoch": 0.6850828729281768, - "grad_norm": 0.9567455662688182, - "learning_rate": 9.514772423622675e-06, - "loss": 0.7468, + "epoch": 0.6863468634686347, + "grad_norm": 0.8882151233390665, + "learning_rate": 9.496800186095466e-06, + "loss": 0.7506, "step": 124 }, { - "epoch": 0.6906077348066298, - "grad_norm": 0.8075170793709282, - "learning_rate": 9.500846422934557e-06, - "loss": 0.6884, + "epoch": 0.6918819188191881, + "grad_norm": 0.7930642149741711, + "learning_rate": 9.482575347202047e-06, + "loss": 0.6835, "step": 125 }, { - "epoch": 0.6961325966850829, - "grad_norm": 0.6617434418531425, - "learning_rate": 9.4867338904831e-06, - "loss": 0.6807, + "epoch": 0.6974169741697417, + "grad_norm": 0.6933167090473802, + "learning_rate": 9.468163201617063e-06, + "loss": 0.6992, "step": 126 }, { - "epoch": 0.7016574585635359, - "grad_norm": 0.6852663876058741, - "learning_rate": 9.472435411143979e-06, - "loss": 0.631, + "epoch": 0.7029520295202952, + "grad_norm": 0.715214953281436, + "learning_rate": 9.453564351559348e-06, + "loss": 0.6442, "step": 127 }, { - "epoch": 0.7071823204419889, - "grad_norm": 0.8062476288399656, - "learning_rate": 9.457951577499187e-06, - "loss": 0.6678, + "epoch": 0.7084870848708487, + "grad_norm": 0.7554434464632439, + "learning_rate": 9.438779407049282e-06, + "loss": 0.6666, "step": 128 }, { - "epoch": 0.712707182320442, - "grad_norm": 0.6969129316123458, - "learning_rate": 9.443282989812494e-06, - "loss": 0.6413, + "epoch": 0.7140221402214022, + "grad_norm": 0.7346261351044595, + "learning_rate": 9.423808985883289e-06, + "loss": 0.6314, "step": 129 }, { - "epoch": 0.7182320441988951, - "grad_norm": 0.7139049384459147, - "learning_rate": 9.428430256004558e-06, - "loss": 0.6523, + "epoch": 0.7195571955719557, + "grad_norm": 0.7394564505170546, + "learning_rate": 9.40865371360804e-06, + "loss": 0.6532, "step": 130 }, { - "epoch": 0.7237569060773481, - "grad_norm": 0.7008590174042524, - "learning_rate": 9.413393991627737e-06, - "loss": 0.6826, + "epoch": 0.7250922509225092, + "grad_norm": 0.6920823973121119, + "learning_rate": 9.393314223494297e-06, + "loss": 0.6871, "step": 131 }, { - "epoch": 0.7292817679558011, - "grad_norm": 0.8833945053457845, - "learning_rate": 9.398174819840577e-06, - "loss": 0.6405, + "epoch": 0.7306273062730627, + "grad_norm": 0.8019568716365827, + "learning_rate": 9.377791156510456e-06, + "loss": 0.6313, "step": 132 }, { - "epoch": 0.7348066298342542, - "grad_norm": 0.7572193126349432, - "learning_rate": 9.382773371381986e-06, - "loss": 0.6689, + "epoch": 0.7361623616236163, + "grad_norm": 0.731865215320956, + "learning_rate": 9.362085161295768e-06, + "loss": 0.6757, "step": 133 }, { - "epoch": 0.7403314917127072, - "grad_norm": 0.7524950122671795, - "learning_rate": 9.367190284545087e-06, - "loss": 0.7109, + "epoch": 0.7416974169741697, + "grad_norm": 0.7331269825104397, + "learning_rate": 9.346196894133239e-06, + "loss": 0.7192, "step": 134 }, { - "epoch": 0.7458563535911602, - "grad_norm": 0.6769451685165733, - "learning_rate": 9.351426205150778e-06, - "loss": 0.66, + "epoch": 0.7472324723247232, + "grad_norm": 0.6689559430615435, + "learning_rate": 9.330127018922195e-06, + "loss": 0.6578, "step": 135 }, { - "epoch": 0.7513812154696132, - "grad_norm": 0.790773238492282, - "learning_rate": 9.335481786520955e-06, - "loss": 0.7216, + "epoch": 0.7527675276752768, + "grad_norm": 0.8011088236436561, + "learning_rate": 9.313876207150544e-06, + "loss": 0.7185, "step": 136 }, { - "epoch": 0.7569060773480663, - "grad_norm": 0.7100066068783747, - "learning_rate": 9.319357689451444e-06, - "loss": 0.6562, + "epoch": 0.7583025830258303, + "grad_norm": 0.7497062827280815, + "learning_rate": 9.297445137866726e-06, + "loss": 0.6652, "step": 137 }, { - "epoch": 0.7624309392265194, - "grad_norm": 0.7853719538210431, - "learning_rate": 9.30305458218461e-06, - "loss": 0.6882, + "epoch": 0.7638376383763837, + "grad_norm": 0.7999274641250289, + "learning_rate": 9.280834497651334e-06, + "loss": 0.6987, "step": 138 }, { - "epoch": 0.7679558011049724, - "grad_norm": 0.6638407553470856, - "learning_rate": 9.286573140381663e-06, - "loss": 0.6541, + "epoch": 0.7693726937269373, + "grad_norm": 0.6443825206939584, + "learning_rate": 9.264044980588415e-06, + "loss": 0.6537, "step": 139 }, { - "epoch": 0.7734806629834254, - "grad_norm": 0.6834239770243532, - "learning_rate": 9.26991404709466e-06, - "loss": 0.6289, + "epoch": 0.7749077490774908, + "grad_norm": 0.6830543309432857, + "learning_rate": 9.247077288236488e-06, + "loss": 0.6389, "step": 140 }, { - "epoch": 0.7790055248618785, - "grad_norm": 0.8713388179210387, - "learning_rate": 9.253077992738193e-06, - "loss": 0.73, + "epoch": 0.7804428044280443, + "grad_norm": 0.7912659097565806, + "learning_rate": 9.229932129599206e-06, + "loss": 0.7265, "step": 141 }, { - "epoch": 0.7845303867403315, - "grad_norm": 0.7629447190554447, - "learning_rate": 9.236065675060775e-06, - "loss": 0.693, + "epoch": 0.7859778597785978, + "grad_norm": 0.7608099518203012, + "learning_rate": 9.212610221095748e-06, + "loss": 0.6902, "step": 142 }, { - "epoch": 0.7900552486187845, - "grad_norm": 0.7544036306635357, - "learning_rate": 9.218877799115929e-06, - "loss": 0.7252, + "epoch": 0.7915129151291513, + "grad_norm": 0.7598999658417419, + "learning_rate": 9.195112286530874e-06, + "loss": 0.7237, "step": 143 }, { - "epoch": 0.7955801104972375, - "grad_norm": 0.6847405196212653, - "learning_rate": 9.201515077232958e-06, - "loss": 0.703, + "epoch": 0.7970479704797048, + "grad_norm": 0.6790407377272779, + "learning_rate": 9.177439057064684e-06, + "loss": 0.689, "step": 144 }, { - "epoch": 0.8011049723756906, - "grad_norm": 0.7322209407647546, - "learning_rate": 9.183978228987436e-06, - "loss": 0.6511, + "epoch": 0.8025830258302583, + "grad_norm": 0.7024267843652401, + "learning_rate": 9.159591271182058e-06, + "loss": 0.6356, "step": 145 }, { - "epoch": 0.8066298342541437, - "grad_norm": 0.7306655102140895, - "learning_rate": 9.166267981171369e-06, - "loss": 0.6477, + "epoch": 0.8081180811808119, + "grad_norm": 0.704433084066074, + "learning_rate": 9.141569674661816e-06, + "loss": 0.6453, "step": 146 }, { - "epoch": 0.8121546961325967, - "grad_norm": 0.7206398058794518, - "learning_rate": 9.148385067763094e-06, - "loss": 0.6407, + "epoch": 0.8136531365313653, + "grad_norm": 0.7169550452222242, + "learning_rate": 9.123375020545534e-06, + "loss": 0.6373, "step": 147 }, { - "epoch": 0.8176795580110497, - "grad_norm": 0.7036158657538799, - "learning_rate": 9.130330229896846e-06, - "loss": 0.6096, + "epoch": 0.8191881918819188, + "grad_norm": 0.694690233125659, + "learning_rate": 9.105008069106093e-06, + "loss": 0.6091, "step": 148 }, { - "epoch": 0.8232044198895028, - "grad_norm": 0.7807874970263721, - "learning_rate": 9.112104215832047e-06, - "loss": 0.658, + "epoch": 0.8247232472324724, + "grad_norm": 0.7335227368162949, + "learning_rate": 9.086469587815904e-06, + "loss": 0.6582, "step": 149 }, { - "epoch": 0.8287292817679558, - "grad_norm": 0.6887926406142375, - "learning_rate": 9.093707780922293e-06, - "loss": 0.6985, + "epoch": 0.8302583025830258, + "grad_norm": 0.6841111000042602, + "learning_rate": 9.067760351314838e-06, + "loss": 0.7072, "step": 150 }, { - "epoch": 0.8342541436464088, - "grad_norm": 0.7163709674127419, - "learning_rate": 9.075141687584056e-06, - "loss": 0.6801, + "epoch": 0.8357933579335793, + "grad_norm": 0.7083136755269921, + "learning_rate": 9.048881141377863e-06, + "loss": 0.6677, "step": 151 }, { - "epoch": 0.8397790055248618, - "grad_norm": 0.6837817476434366, - "learning_rate": 9.056406705265084e-06, - "loss": 0.6442, + "epoch": 0.8413284132841329, + "grad_norm": 0.6982890610414855, + "learning_rate": 9.029832746882372e-06, + "loss": 0.6526, "step": 152 }, { - "epoch": 0.8453038674033149, - "grad_norm": 0.6843857902663828, - "learning_rate": 9.037503610412502e-06, - "loss": 0.6446, + "epoch": 0.8468634686346863, + "grad_norm": 0.6860196076239153, + "learning_rate": 9.01061596377522e-06, + "loss": 0.6461, "step": 153 }, { - "epoch": 0.850828729281768, - "grad_norm": 0.7547759124038085, - "learning_rate": 9.018433186440648e-06, - "loss": 0.7106, + "epoch": 0.8523985239852399, + "grad_norm": 0.7376158689862853, + "learning_rate": 8.991231595039464e-06, + "loss": 0.7101, "step": 154 }, { - "epoch": 0.856353591160221, - "grad_norm": 0.6923617352622536, - "learning_rate": 8.999196223698599e-06, - "loss": 0.7418, + "epoch": 0.8579335793357934, + "grad_norm": 0.6975228556068289, + "learning_rate": 8.97168045066082e-06, + "loss": 0.7475, "step": 155 }, { - "epoch": 0.861878453038674, - "grad_norm": 0.7589388571367478, - "learning_rate": 8.979793519437413e-06, - "loss": 0.7064, + "epoch": 0.8634686346863468, + "grad_norm": 0.7486491127857194, + "learning_rate": 8.951963347593797e-06, + "loss": 0.6996, "step": 156 }, { - "epoch": 0.8674033149171271, - "grad_norm": 0.8111254921639891, - "learning_rate": 8.960225877777095e-06, - "loss": 0.7153, + "epoch": 0.8690036900369004, + "grad_norm": 0.7912254693906845, + "learning_rate": 8.932081109727582e-06, + "loss": 0.6985, "step": 157 }, { - "epoch": 0.8729281767955801, - "grad_norm": 0.6405807331541313, - "learning_rate": 8.940494109673266e-06, - "loss": 0.6121, + "epoch": 0.8745387453874539, + "grad_norm": 0.6439570726845703, + "learning_rate": 8.9120345678516e-06, + "loss": 0.625, "step": 158 }, { - "epoch": 0.8784530386740331, - "grad_norm": 0.6742570632281877, - "learning_rate": 8.920599032883553e-06, - "loss": 0.6781, + "epoch": 0.8800738007380073, + "grad_norm": 0.6572694602060432, + "learning_rate": 8.891824559620801e-06, + "loss": 0.6784, "step": 159 }, { - "epoch": 0.8839779005524862, - "grad_norm": 0.7843439776633434, - "learning_rate": 8.900541471933703e-06, - "loss": 0.6979, + "epoch": 0.8856088560885609, + "grad_norm": 0.7971159900336966, + "learning_rate": 8.871451929520662e-06, + "loss": 0.6988, "step": 160 }, { - "epoch": 0.8895027624309392, - "grad_norm": 0.725960679493056, - "learning_rate": 8.880322258083408e-06, - "loss": 0.6557, + "epoch": 0.8911439114391144, + "grad_norm": 0.7050522383893928, + "learning_rate": 8.8509175288319e-06, + "loss": 0.6445, "step": 161 }, { - "epoch": 0.8950276243093923, - "grad_norm": 0.6898678148433862, - "learning_rate": 8.859942229291856e-06, - "loss": 0.662, + "epoch": 0.8966789667896679, + "grad_norm": 0.6897534640630667, + "learning_rate": 8.83022221559489e-06, + "loss": 0.6698, "step": 162 }, { - "epoch": 0.9005524861878453, - "grad_norm": 0.7710968128854336, - "learning_rate": 8.839402230183e-06, - "loss": 0.662, + "epoch": 0.9022140221402214, + "grad_norm": 0.7286182755253351, + "learning_rate": 8.80936685457383e-06, + "loss": 0.6527, "step": 163 }, { - "epoch": 0.9060773480662984, - "grad_norm": 0.741382073637584, - "learning_rate": 8.818703112010562e-06, - "loss": 0.6935, + "epoch": 0.9077490774907749, + "grad_norm": 0.7250443386250167, + "learning_rate": 8.78835231722059e-06, + "loss": 0.6785, "step": 164 }, { - "epoch": 0.9116022099447514, - "grad_norm": 0.6831369278396727, - "learning_rate": 8.797845732622742e-06, - "loss": 0.7031, + "epoch": 0.9132841328413284, + "grad_norm": 0.6795202817087662, + "learning_rate": 8.767179481638303e-06, + "loss": 0.712, "step": 165 }, { - "epoch": 0.9171270718232044, - "grad_norm": 0.7492006743864014, - "learning_rate": 8.776830956426674e-06, - "loss": 0.708, + "epoch": 0.9188191881918819, + "grad_norm": 0.7673545156991856, + "learning_rate": 8.74584923254468e-06, + "loss": 0.7073, "step": 166 }, { - "epoch": 0.9226519337016574, - "grad_norm": 0.7613459925623692, - "learning_rate": 8.755659654352599e-06, - "loss": 0.7169, + "epoch": 0.9243542435424354, + "grad_norm": 0.7100681832870371, + "learning_rate": 8.72436246123503e-06, + "loss": 0.7097, "step": 167 }, { - "epoch": 0.9281767955801105, - "grad_norm": 0.6791670574304287, - "learning_rate": 8.734332703817771e-06, - "loss": 0.6991, + "epoch": 0.9298892988929889, + "grad_norm": 0.6750568200393077, + "learning_rate": 8.702720065545024e-06, + "loss": 0.6976, "step": 168 }, { - "epoch": 0.9337016574585635, - "grad_norm": 0.6681861340818701, - "learning_rate": 8.712850988690094e-06, - "loss": 0.7032, + "epoch": 0.9354243542435424, + "grad_norm": 0.6760810557529758, + "learning_rate": 8.680922949813177e-06, + "loss": 0.6883, "step": 169 }, { - "epoch": 0.9392265193370166, - "grad_norm": 0.6569011217941949, - "learning_rate": 8.691215399251489e-06, - "loss": 0.6439, + "epoch": 0.940959409594096, + "grad_norm": 0.6506834973761045, + "learning_rate": 8.658972024843063e-06, + "loss": 0.6434, "step": 170 }, { - "epoch": 0.9447513812154696, - "grad_norm": 0.7086161196938886, - "learning_rate": 8.669426832160997e-06, - "loss": 0.6193, + "epoch": 0.9464944649446494, + "grad_norm": 0.668877599120538, + "learning_rate": 8.636868207865244e-06, + "loss": 0.6198, "step": 171 }, { - "epoch": 0.9502762430939227, - "grad_norm": 0.654556823801651, - "learning_rate": 8.647486190417624e-06, - "loss": 0.6929, + "epoch": 0.9520295202952029, + "grad_norm": 0.6532330641471769, + "learning_rate": 8.614612422498965e-06, + "loss": 0.6806, "step": 172 }, { - "epoch": 0.9558011049723757, - "grad_norm": 0.7440213169432797, - "learning_rate": 8.625394383322914e-06, - "loss": 0.6779, + "epoch": 0.9575645756457565, + "grad_norm": 0.7178919179692898, + "learning_rate": 8.592205598713539e-06, + "loss": 0.6887, "step": 173 }, { - "epoch": 0.9613259668508287, - "grad_norm": 0.7345541693491946, - "learning_rate": 8.603152326443262e-06, - "loss": 0.6412, + "epoch": 0.9630996309963099, + "grad_norm": 0.6999250045861761, + "learning_rate": 8.569648672789496e-06, + "loss": 0.6442, "step": 174 }, { - "epoch": 0.9668508287292817, - "grad_norm": 0.7017136954703048, - "learning_rate": 8.580760941571968e-06, - "loss": 0.7249, + "epoch": 0.9686346863468634, + "grad_norm": 0.7075371495307748, + "learning_rate": 8.546942587279465e-06, + "loss": 0.7256, "step": 175 }, { - "epoch": 0.9723756906077348, - "grad_norm": 0.7533884250226044, - "learning_rate": 8.55822115669104e-06, - "loss": 0.6725, + "epoch": 0.974169741697417, + "grad_norm": 0.7139921209949371, + "learning_rate": 8.524088290968781e-06, + "loss": 0.669, "step": 176 }, { - "epoch": 0.9779005524861878, - "grad_norm": 0.8635896869993843, - "learning_rate": 8.535533905932739e-06, - "loss": 0.6778, + "epoch": 0.9797047970479705, + "grad_norm": 0.7843465216757487, + "learning_rate": 8.501086738835843e-06, + "loss": 0.679, "step": 177 }, { - "epoch": 0.9834254143646409, - "grad_norm": 0.7046176922723474, - "learning_rate": 8.512700129540847e-06, - "loss": 0.646, + "epoch": 0.985239852398524, + "grad_norm": 0.7078117075067463, + "learning_rate": 8.477938892012209e-06, + "loss": 0.6443, "step": 178 }, { - "epoch": 0.988950276243094, - "grad_norm": 0.7469626256845687, - "learning_rate": 8.489720773831717e-06, - "loss": 0.6742, + "epoch": 0.9907749077490775, + "grad_norm": 0.7191146480817366, + "learning_rate": 8.45464571774244e-06, + "loss": 0.6752, "step": 179 }, { - "epoch": 0.994475138121547, - "grad_norm": 0.8073184194204243, - "learning_rate": 8.466596791155055e-06, - "loss": 0.6745, + "epoch": 0.996309963099631, + "grad_norm": 0.7793217733052689, + "learning_rate": 8.43120818934367e-06, + "loss": 0.6809, "step": 180 }, { - "epoch": 1.0, - "grad_norm": 0.6469115640516496, - "learning_rate": 8.443329139854434e-06, - "loss": 0.635, + "epoch": 1.0018450184501846, + "grad_norm": 0.9535816156617457, + "learning_rate": 8.407627286164948e-06, + "loss": 0.8581, "step": 181 }, { - "epoch": 1.0055248618784531, - "grad_norm": 0.7680249419301477, - "learning_rate": 8.419918784227592e-06, - "loss": 0.6638, + "epoch": 1.007380073800738, + "grad_norm": 0.7904687166212481, + "learning_rate": 8.38390399354631e-06, + "loss": 0.6589, "step": 182 }, { - "epoch": 1.011049723756906, - "grad_norm": 0.7714771127607805, - "learning_rate": 8.396366694486466e-06, - "loss": 0.6176, + "epoch": 1.0129151291512914, + "grad_norm": 0.7871113082768205, + "learning_rate": 8.360039302777614e-06, + "loss": 0.5974, "step": 183 }, { - "epoch": 1.0165745856353592, - "grad_norm": 0.6836633389900814, - "learning_rate": 8.372673846716977e-06, - "loss": 0.5742, + "epoch": 1.018450184501845, + "grad_norm": 0.7159872827730034, + "learning_rate": 8.336034211057098e-06, + "loss": 0.6009, "step": 184 }, { - "epoch": 1.022099447513812, - "grad_norm": 0.671339247626321, - "learning_rate": 8.348841222838579e-06, - "loss": 0.5965, + "epoch": 1.0239852398523985, + "grad_norm": 0.6616682950001487, + "learning_rate": 8.31188972144974e-06, + "loss": 0.5881, "step": 185 }, { - "epoch": 1.0276243093922652, - "grad_norm": 0.6818502606595346, - "learning_rate": 8.324869810563573e-06, - "loss": 0.6031, + "epoch": 1.029520295202952, + "grad_norm": 0.6850474409430557, + "learning_rate": 8.28760684284532e-06, + "loss": 0.5939, "step": 186 }, { - "epoch": 1.0331491712707181, - "grad_norm": 0.7679534342017695, - "learning_rate": 8.30076060335616e-06, - "loss": 0.6378, + "epoch": 1.0350553505535056, + "grad_norm": 0.7461860778494099, + "learning_rate": 8.263186589916273e-06, + "loss": 0.6383, "step": 187 }, { - "epoch": 1.0386740331491713, - "grad_norm": 0.6610817415773053, - "learning_rate": 8.276514600391272e-06, - "loss": 0.5828, + "epoch": 1.040590405904059, + "grad_norm": 0.7089074245925379, + "learning_rate": 8.238629983075296e-06, + "loss": 0.618, "step": 188 }, { - "epoch": 1.0441988950276242, - "grad_norm": 0.7265666171607357, - "learning_rate": 8.25213280651317e-06, - "loss": 0.5991, + "epoch": 1.0461254612546125, + "grad_norm": 0.6738261402996001, + "learning_rate": 8.213938048432697e-06, + "loss": 0.5554, "step": 189 }, { - "epoch": 1.0497237569060773, - "grad_norm": 0.7356973203535483, - "learning_rate": 8.227616232193794e-06, - "loss": 0.5877, + "epoch": 1.051660516605166, + "grad_norm": 0.7206220611502636, + "learning_rate": 8.18911181775353e-06, + "loss": 0.6136, "step": 190 }, { - "epoch": 1.0552486187845305, - "grad_norm": 0.7865650351374615, - "learning_rate": 8.202965893490877e-06, - "loss": 0.6476, + "epoch": 1.0571955719557196, + "grad_norm": 0.7352230176592326, + "learning_rate": 8.164152328414476e-06, + "loss": 0.6188, "step": 191 }, { - "epoch": 1.0607734806629834, - "grad_norm": 0.7194978525901626, - "learning_rate": 8.178182812005853e-06, - "loss": 0.5147, + "epoch": 1.062730627306273, + "grad_norm": 0.6943757541874687, + "learning_rate": 8.139060623360494e-06, + "loss": 0.5167, "step": 192 }, { - "epoch": 1.0662983425414365, - "grad_norm": 0.70366807919922, - "learning_rate": 8.153268014841507e-06, - "loss": 0.6128, + "epoch": 1.0682656826568266, + "grad_norm": 0.7778412839253838, + "learning_rate": 8.113837751061246e-06, + "loss": 0.6154, "step": 193 }, { - "epoch": 1.0718232044198894, - "grad_norm": 0.6751029207263034, - "learning_rate": 8.128222534559406e-06, - "loss": 0.5743, + "epoch": 1.07380073800738, + "grad_norm": 0.6551339832525483, + "learning_rate": 8.088484765467286e-06, + "loss": 0.5878, "step": 194 }, { - "epoch": 1.0773480662983426, - "grad_norm": 0.769584710702697, - "learning_rate": 8.103047409137114e-06, - "loss": 0.6736, + "epoch": 1.0793357933579335, + "grad_norm": 0.8149186205476209, + "learning_rate": 8.063002725966014e-06, + "loss": 0.7048, "step": 195 }, { - "epoch": 1.0828729281767955, - "grad_norm": 0.7906391664348668, - "learning_rate": 8.07774368192517e-06, - "loss": 0.6658, + "epoch": 1.084870848708487, + "grad_norm": 0.6880593641520117, + "learning_rate": 8.037392697337418e-06, + "loss": 0.6041, "step": 196 }, { - "epoch": 1.0883977900552486, - "grad_norm": 0.6510299748322192, - "learning_rate": 8.052312401603848e-06, - "loss": 0.5765, + "epoch": 1.0904059040590406, + "grad_norm": 0.7040695710194661, + "learning_rate": 8.011655749709575e-06, + "loss": 0.5979, "step": 197 }, { - "epoch": 1.0939226519337018, - "grad_norm": 0.8547974979381732, - "learning_rate": 8.026754622139691e-06, - "loss": 0.5903, + "epoch": 1.0959409594095941, + "grad_norm": 0.7965545246789066, + "learning_rate": 7.985792958513932e-06, + "loss": 0.5892, "step": 198 }, { - "epoch": 1.0994475138121547, - "grad_norm": 0.8338835673156341, - "learning_rate": 8.001071402741843e-06, - "loss": 0.6027, + "epoch": 1.1014760147601477, + "grad_norm": 0.7621766315488933, + "learning_rate": 7.95980540444038e-06, + "loss": 0.594, "step": 199 }, { - "epoch": 1.1049723756906078, - "grad_norm": 0.7681594836262089, - "learning_rate": 7.975263807818136e-06, - "loss": 0.6428, + "epoch": 1.1070110701107012, + "grad_norm": 0.7643147205973972, + "learning_rate": 7.93369417339209e-06, + "loss": 0.6597, "step": 200 }, { - "epoch": 1.1104972375690607, - "grad_norm": 0.7716286060753629, - "learning_rate": 7.949332906930995e-06, - "loss": 0.6502, + "epoch": 1.1125461254612545, + "grad_norm": 0.8216988466319578, + "learning_rate": 7.907460356440133e-06, + "loss": 0.6398, "step": 201 }, { - "epoch": 1.1160220994475138, - "grad_norm": 0.8404857249117871, - "learning_rate": 7.923279774753092e-06, - "loss": 0.5865, + "epoch": 1.118081180811808, + "grad_norm": 0.7718237762248213, + "learning_rate": 7.881105049777902e-06, + "loss": 0.5818, "step": 202 }, { - "epoch": 1.1215469613259668, - "grad_norm": 0.6784714640462896, - "learning_rate": 7.897105491022819e-06, - "loss": 0.5737, + "epoch": 1.1236162361623616, + "grad_norm": 0.6351521048330241, + "learning_rate": 7.854629354675292e-06, + "loss": 0.5774, "step": 203 }, { - "epoch": 1.12707182320442, - "grad_norm": 0.6611561272377011, - "learning_rate": 7.870811140499543e-06, - "loss": 0.5916, + "epoch": 1.1291512915129152, + "grad_norm": 0.6908992052540243, + "learning_rate": 7.828034377432694e-06, + "loss": 0.557, "step": 204 }, { - "epoch": 1.132596685082873, - "grad_norm": 0.6694407525681161, - "learning_rate": 7.844397812918637e-06, - "loss": 0.5629, + "epoch": 1.1346863468634687, + "grad_norm": 0.7232308696829104, + "learning_rate": 7.801321229334764e-06, + "loss": 0.5865, "step": 205 }, { - "epoch": 1.138121546961326, - "grad_norm": 0.6628808525691233, - "learning_rate": 7.817866602946326e-06, - "loss": 0.6174, + "epoch": 1.140221402214022, + "grad_norm": 0.6705064929908767, + "learning_rate": 7.774491026603985e-06, + "loss": 0.5934, "step": 206 }, { - "epoch": 1.143646408839779, - "grad_norm": 0.6433613204625658, - "learning_rate": 7.791218610134324e-06, - "loss": 0.5891, + "epoch": 1.1457564575645756, + "grad_norm": 0.6880567256154578, + "learning_rate": 7.747544890354031e-06, + "loss": 0.6281, "step": 207 }, { - "epoch": 1.149171270718232, - "grad_norm": 0.7083946072498601, - "learning_rate": 7.764454938874252e-06, - "loss": 0.5973, + "epoch": 1.151291512915129, + "grad_norm": 0.7043185315494912, + "learning_rate": 7.720483946542913e-06, + "loss": 0.5789, "step": 208 }, { - "epoch": 1.1546961325966851, - "grad_norm": 0.6920035749512741, - "learning_rate": 7.737576698351878e-06, - "loss": 0.6368, + "epoch": 1.1568265682656826, + "grad_norm": 0.749312030144366, + "learning_rate": 7.69330932592594e-06, + "loss": 0.6413, "step": 209 }, { - "epoch": 1.160220994475138, - "grad_norm": 0.77090065904851, - "learning_rate": 7.710585002501145e-06, - "loss": 0.6458, + "epoch": 1.1623616236162362, + "grad_norm": 0.8155713214886554, + "learning_rate": 7.666022164008458e-06, + "loss": 0.6249, "step": 210 }, { - "epoch": 1.1657458563535912, - "grad_norm": 0.7373547338916707, - "learning_rate": 7.683480969958005e-06, - "loss": 0.5674, + "epoch": 1.1678966789667897, + "grad_norm": 0.736980019422596, + "learning_rate": 7.638623600998409e-06, + "loss": 0.5835, "step": 211 }, { - "epoch": 1.1712707182320443, - "grad_norm": 0.6775847537528304, - "learning_rate": 7.656265724014054e-06, - "loss": 0.6039, + "epoch": 1.1734317343173433, + "grad_norm": 0.6987882968304525, + "learning_rate": 7.6111147817586925e-06, + "loss": 0.6169, "step": 212 }, { - "epoch": 1.1767955801104972, - "grad_norm": 0.7080491269295629, - "learning_rate": 7.628940392569995e-06, - "loss": 0.6395, + "epoch": 1.1789667896678966, + "grad_norm": 0.7052697317791319, + "learning_rate": 7.5834968557593155e-06, + "loss": 0.5993, "step": 213 }, { - "epoch": 1.1823204419889504, - "grad_norm": 0.6741064097723004, - "learning_rate": 7.601506108088874e-06, - "loss": 0.5882, + "epoch": 1.1845018450184501, + "grad_norm": 0.7070482187757918, + "learning_rate": 7.5557709770293664e-06, + "loss": 0.5973, "step": 214 }, { - "epoch": 1.1878453038674033, - "grad_norm": 0.657603362174331, - "learning_rate": 7.5739640075491546e-06, - "loss": 0.6184, + "epoch": 1.1900369003690037, + "grad_norm": 0.6930393067005186, + "learning_rate": 7.527938304108795e-06, + "loss": 0.6242, "step": 215 }, { - "epoch": 1.1933701657458564, - "grad_norm": 0.6791394246768295, - "learning_rate": 7.546315232397601e-06, - "loss": 0.6138, + "epoch": 1.1955719557195572, + "grad_norm": 0.6851172667466551, + "learning_rate": 7.500000000000001e-06, + "loss": 0.6435, "step": 216 }, { - "epoch": 1.1988950276243093, - "grad_norm": 0.6894132654756862, - "learning_rate": 7.518560928501969e-06, - "loss": 0.6252, + "epoch": 1.2011070110701108, + "grad_norm": 0.7259877682356197, + "learning_rate": 7.471957232119235e-06, + "loss": 0.6212, "step": 217 }, { - "epoch": 1.2044198895027625, - "grad_norm": 0.697165533998509, - "learning_rate": 7.4907022461035125e-06, - "loss": 0.6002, + "epoch": 1.2066420664206643, + "grad_norm": 0.640453551277484, + "learning_rate": 7.443811172247822e-06, + "loss": 0.5481, "step": 218 }, { - "epoch": 1.2099447513812154, - "grad_norm": 0.6444399979671767, - "learning_rate": 7.462740339769323e-06, - "loss": 0.5868, + "epoch": 1.2121771217712176, + "grad_norm": 0.6922785890990849, + "learning_rate": 7.415562996483193e-06, + "loss": 0.6075, "step": 219 }, { - "epoch": 1.2154696132596685, - "grad_norm": 0.7259698993482928, - "learning_rate": 7.434676368344469e-06, - "loss": 0.655, + "epoch": 1.2177121771217712, + "grad_norm": 0.7252156376468682, + "learning_rate": 7.387213885189746e-06, + "loss": 0.6357, "step": 220 }, { - "epoch": 1.2209944751381214, - "grad_norm": 0.7245469567674288, - "learning_rate": 7.406511494903982e-06, - "loss": 0.5975, + "epoch": 1.2232472324723247, + "grad_norm": 0.6646431209628268, + "learning_rate": 7.358765022949519e-06, + "loss": 0.6337, "step": 221 }, { - "epoch": 1.2265193370165746, - "grad_norm": 0.6743396575371344, - "learning_rate": 7.378246886704638e-06, - "loss": 0.6729, + "epoch": 1.2287822878228782, + "grad_norm": 0.6874785057119748, + "learning_rate": 7.330217598512696e-06, + "loss": 0.6359, "step": 222 }, { - "epoch": 1.2320441988950277, - "grad_norm": 0.7206318344181539, - "learning_rate": 7.349883715136601e-06, - "loss": 0.6123, + "epoch": 1.2343173431734318, + "grad_norm": 0.7165999600358128, + "learning_rate": 7.30157280474793e-06, + "loss": 0.5538, "step": 223 }, { - "epoch": 1.2375690607734806, - "grad_norm": 0.602101018107306, - "learning_rate": 7.321423155674858e-06, - "loss": 0.5287, + "epoch": 1.2398523985239853, + "grad_norm": 0.6714722516214744, + "learning_rate": 7.2728318385925035e-06, + "loss": 0.6, "step": 224 }, { - "epoch": 1.2430939226519337, - "grad_norm": 0.6466479963244331, - "learning_rate": 7.292866387830515e-06, - "loss": 0.5927, + "epoch": 1.2453874538745389, + "grad_norm": 0.7107004304709478, + "learning_rate": 7.243995901002312e-06, + "loss": 0.6217, "step": 225 }, { - "epoch": 1.2486187845303867, - "grad_norm": 0.7693210833084159, - "learning_rate": 7.264214595101913e-06, - "loss": 0.6321, + "epoch": 1.2509225092250922, + "grad_norm": 0.7303659741053162, + "learning_rate": 7.215066196901676e-06, + "loss": 0.6022, "step": 226 }, { - "epoch": 1.2541436464088398, - "grad_norm": 0.6789133468563491, - "learning_rate": 7.235468964925571e-06, - "loss": 0.5794, + "epoch": 1.2564575645756457, + "grad_norm": 0.6912084957079973, + "learning_rate": 7.186043935133005e-06, + "loss": 0.6017, "step": 227 }, { - "epoch": 1.2596685082872927, - "grad_norm": 0.7739752082859613, - "learning_rate": 7.206630688626981e-06, - "loss": 0.6563, + "epoch": 1.2619926199261993, + "grad_norm": 0.8016568540326536, + "learning_rate": 7.156930328406268e-06, + "loss": 0.6542, "step": 228 }, { - "epoch": 1.2651933701657458, - "grad_norm": 0.7370007253011229, - "learning_rate": 7.177700961371239e-06, - "loss": 0.6075, + "epoch": 1.2675276752767528, + "grad_norm": 0.743350369247089, + "learning_rate": 7.127726593248337e-06, + "loss": 0.5978, "step": 229 }, { - "epoch": 1.270718232044199, - "grad_norm": 0.7982040126875192, - "learning_rate": 7.148680982113502e-06, - "loss": 0.6422, + "epoch": 1.2730627306273063, + "grad_norm": 0.7404143977218907, + "learning_rate": 7.098433949952146e-06, + "loss": 0.6437, "step": 230 }, { - "epoch": 1.276243093922652, - "grad_norm": 0.6629426029965108, - "learning_rate": 7.119571953549305e-06, - "loss": 0.645, + "epoch": 1.2785977859778597, + "grad_norm": 0.684639141766216, + "learning_rate": 7.069053622525697e-06, + "loss": 0.6318, "step": 231 }, { - "epoch": 1.281767955801105, - "grad_norm": 0.6597217484497837, - "learning_rate": 7.0903750820647175e-06, - "loss": 0.6046, + "epoch": 1.2841328413284132, + "grad_norm": 0.6506218001874067, + "learning_rate": 7.039586838640918e-06, + "loss": 0.5983, "step": 232 }, { - "epoch": 1.287292817679558, - "grad_norm": 0.6171612521609076, - "learning_rate": 7.061091577686349e-06, - "loss": 0.5663, + "epoch": 1.2896678966789668, + "grad_norm": 0.6249292386050818, + "learning_rate": 7.0100348295823706e-06, + "loss": 0.5439, "step": 233 }, { - "epoch": 1.292817679558011, - "grad_norm": 0.6930974359608171, - "learning_rate": 7.031722654031192e-06, - "loss": 0.5901, + "epoch": 1.2952029520295203, + "grad_norm": 0.7671687660019585, + "learning_rate": 6.980398830195785e-06, + "loss": 0.6191, "step": 234 }, { - "epoch": 1.298342541436464, - "grad_norm": 0.6340608907226989, - "learning_rate": 7.002269528256334e-06, - "loss": 0.6084, + "epoch": 1.3007380073800738, + "grad_norm": 0.6567756554372846, + "learning_rate": 6.950680078836475e-06, + "loss": 0.6097, "step": 235 }, { - "epoch": 1.3038674033149171, - "grad_norm": 0.6695648424102065, - "learning_rate": 6.972733421008505e-06, - "loss": 0.5938, + "epoch": 1.3062730627306274, + "grad_norm": 0.7256041165241631, + "learning_rate": 6.920879817317588e-06, + "loss": 0.6237, "step": 236 }, { - "epoch": 1.3093922651933703, - "grad_norm": 0.7162730447160397, - "learning_rate": 6.943115556373503e-06, - "loss": 0.606, + "epoch": 1.311808118081181, + "grad_norm": 0.7062531126646335, + "learning_rate": 6.890999290858213e-06, + "loss": 0.5881, "step": 237 }, { - "epoch": 1.3149171270718232, - "grad_norm": 0.7136879200055026, - "learning_rate": 6.913417161825449e-06, - "loss": 0.6491, + "epoch": 1.3173431734317342, + "grad_norm": 0.7074515896572966, + "learning_rate": 6.861039748031351e-06, + "loss": 0.6399, "step": 238 }, { - "epoch": 1.3204419889502763, - "grad_norm": 0.696786721330877, - "learning_rate": 6.883639468175926e-06, - "loss": 0.6072, + "epoch": 1.3228782287822878, + "grad_norm": 0.7132720107486031, + "learning_rate": 6.8310024407117405e-06, + "loss": 0.6356, "step": 239 }, { - "epoch": 1.3259668508287292, - "grad_norm": 0.753245759137927, - "learning_rate": 6.853783709522963e-06, - "loss": 0.6018, + "epoch": 1.3284132841328413, + "grad_norm": 0.6998988120144802, + "learning_rate": 6.800888624023552e-06, + "loss": 0.6086, "step": 240 }, { - "epoch": 1.3314917127071824, - "grad_norm": 0.6968519374034413, - "learning_rate": 6.823851123199894e-06, - "loss": 0.6434, + "epoch": 1.3339483394833949, + "grad_norm": 0.7003663999513423, + "learning_rate": 6.770699556287939e-06, + "loss": 0.6242, "step": 241 }, { - "epoch": 1.3370165745856353, - "grad_norm": 0.6761342537400503, - "learning_rate": 6.793842949724074e-06, - "loss": 0.6225, + "epoch": 1.3394833948339484, + "grad_norm": 0.698672108839009, + "learning_rate": 6.740436498970453e-06, + "loss": 0.6583, "step": 242 }, { - "epoch": 1.3425414364640884, - "grad_norm": 0.7322253178916728, - "learning_rate": 6.763760432745475e-06, - "loss": 0.6131, + "epoch": 1.3450184501845017, + "grad_norm": 0.6864478737367914, + "learning_rate": 6.710100716628345e-06, + "loss": 0.5885, "step": 243 }, { - "epoch": 1.3480662983425415, - "grad_norm": 0.7699522982913194, - "learning_rate": 6.733604818995133e-06, - "loss": 0.6366, + "epoch": 1.3505535055350553, + "grad_norm": 0.7196144827279044, + "learning_rate": 6.679693476857712e-06, + "loss": 0.6075, "step": 244 }, { - "epoch": 1.3535911602209945, - "grad_norm": 0.6196932838702959, - "learning_rate": 6.703377358233489e-06, - "loss": 0.595, + "epoch": 1.3560885608856088, + "grad_norm": 0.6437948099966094, + "learning_rate": 6.649216050240539e-06, + "loss": 0.6097, "step": 245 }, { - "epoch": 1.3591160220994476, - "grad_norm": 0.7047536218041381, - "learning_rate": 6.673079303198591e-06, - "loss": 0.5941, + "epoch": 1.3616236162361623, + "grad_norm": 0.7620305140396721, + "learning_rate": 6.618669710291607e-06, + "loss": 0.5926, "step": 246 }, { - "epoch": 1.3646408839779005, - "grad_norm": 0.8335370354195214, - "learning_rate": 6.6427119095541745e-06, - "loss": 0.5967, + "epoch": 1.367158671586716, + "grad_norm": 0.7830266879556336, + "learning_rate": 6.588055733405266e-06, + "loss": 0.6033, "step": 247 }, { - "epoch": 1.3701657458563536, - "grad_norm": 0.7706739113368778, - "learning_rate": 6.612276435837622e-06, - "loss": 0.5917, + "epoch": 1.3726937269372694, + "grad_norm": 0.7123926132432568, + "learning_rate": 6.557375398802124e-06, + "loss": 0.6074, "step": 248 }, { - "epoch": 1.3756906077348066, - "grad_norm": 0.7446603300489363, - "learning_rate": 6.58177414340781e-06, - "loss": 0.6342, + "epoch": 1.378228782287823, + "grad_norm": 0.6724440991233431, + "learning_rate": 6.526629988475567e-06, + "loss": 0.6154, "step": 249 }, { - "epoch": 1.3812154696132597, - "grad_norm": 0.6592546584880928, - "learning_rate": 6.551206296392827e-06, - "loss": 0.5791, + "epoch": 1.3837638376383765, + "grad_norm": 0.705678216248485, + "learning_rate": 6.495820787138209e-06, + "loss": 0.5891, "step": 250 }, { - "epoch": 1.3867403314917128, - "grad_norm": 0.8080309844429614, - "learning_rate": 6.520574161637591e-06, - "loss": 0.6555, + "epoch": 1.3892988929889298, + "grad_norm": 0.7606972010978245, + "learning_rate": 6.4649490821682035e-06, + "loss": 0.6586, "step": 251 }, { - "epoch": 1.3922651933701657, - "grad_norm": 0.6757755218068, - "learning_rate": 6.4898790086513366e-06, - "loss": 0.6002, + "epoch": 1.3948339483394834, + "grad_norm": 0.627422187069653, + "learning_rate": 6.434016163555452e-06, + "loss": 0.5917, "step": 252 }, { - "epoch": 1.3977900552486187, - "grad_norm": 0.6540398994889906, - "learning_rate": 6.459122109555011e-06, - "loss": 0.5968, + "epoch": 1.400369003690037, + "grad_norm": 0.619815027726477, + "learning_rate": 6.403023323847695e-06, + "loss": 0.5878, "step": 253 }, { - "epoch": 1.4033149171270718, - "grad_norm": 0.6515476627729059, - "learning_rate": 6.42830473902855e-06, - "loss": 0.5915, + "epoch": 1.4059040590405905, + "grad_norm": 0.689330526944227, + "learning_rate": 6.371971858096509e-06, + "loss": 0.5972, "step": 254 }, { - "epoch": 1.408839779005525, - "grad_norm": 0.768393574101539, - "learning_rate": 6.397428174258048e-06, - "loss": 0.6129, + "epoch": 1.4114391143911438, + "grad_norm": 0.7173239755647481, + "learning_rate": 6.340863063803187e-06, + "loss": 0.6063, "step": 255 }, { - "epoch": 1.4143646408839778, - "grad_norm": 0.6602226522160092, - "learning_rate": 6.3664936948828296e-06, - "loss": 0.618, + "epoch": 1.4169741697416973, + "grad_norm": 0.6961778167779811, + "learning_rate": 6.30969824086453e-06, + "loss": 0.5807, "step": 256 }, { - "epoch": 1.419889502762431, - "grad_norm": 0.7186347942888126, - "learning_rate": 6.335502582942409e-06, - "loss": 0.58, + "epoch": 1.4225092250922509, + "grad_norm": 0.6879266863201914, + "learning_rate": 6.278478691518519e-06, + "loss": 0.5948, "step": 257 }, { - "epoch": 1.4254143646408841, - "grad_norm": 0.6867146462508941, - "learning_rate": 6.304456122823377e-06, - "loss": 0.6284, + "epoch": 1.4280442804428044, + "grad_norm": 0.7552082447717653, + "learning_rate": 6.247205720289907e-06, + "loss": 0.6406, "step": 258 }, { - "epoch": 1.430939226519337, - "grad_norm": 0.6640537281896691, - "learning_rate": 6.273355601206143e-06, - "loss": 0.628, + "epoch": 1.433579335793358, + "grad_norm": 0.6434001937152157, + "learning_rate": 6.215880633935709e-06, + "loss": 0.5952, "step": 259 }, { - "epoch": 1.43646408839779, - "grad_norm": 0.680561183563057, - "learning_rate": 6.24220230701164e-06, - "loss": 0.6601, + "epoch": 1.4391143911439115, + "grad_norm": 0.662148188293842, + "learning_rate": 6.184504741390596e-06, + "loss": 0.6319, "step": 260 }, { - "epoch": 1.441988950276243, - "grad_norm": 0.6266987287558203, - "learning_rate": 6.210997531347879e-06, - "loss": 0.5342, + "epoch": 1.444649446494465, + "grad_norm": 0.6153480539853726, + "learning_rate": 6.153079353712201e-06, + "loss": 0.5907, "step": 261 }, { - "epoch": 1.4475138121546962, - "grad_norm": 0.6970171405272507, - "learning_rate": 6.179742567456464e-06, - "loss": 0.6339, + "epoch": 1.4501845018450186, + "grad_norm": 0.6846732208662787, + "learning_rate": 6.121605784026339e-06, + "loss": 0.575, "step": 262 }, { - "epoch": 1.4530386740331491, - "grad_norm": 0.6699218536254724, - "learning_rate": 6.148438710658979e-06, - "loss": 0.5782, + "epoch": 1.455719557195572, + "grad_norm": 0.7039311923510397, + "learning_rate": 6.09008534747213e-06, + "loss": 0.6249, "step": 263 }, { - "epoch": 1.4585635359116023, - "grad_norm": 0.7406730671514048, - "learning_rate": 6.117087258303314e-06, - "loss": 0.6392, + "epoch": 1.4612546125461254, + "grad_norm": 0.7025974163318355, + "learning_rate": 6.058519361147055e-06, + "loss": 0.6097, "step": 264 }, { - "epoch": 1.4640883977900552, - "grad_norm": 0.6496784259742956, - "learning_rate": 6.085689509709893e-06, - "loss": 0.5848, + "epoch": 1.466789667896679, + "grad_norm": 0.6897172606752235, + "learning_rate": 6.02690914405191e-06, + "loss": 0.6459, "step": 265 }, { - "epoch": 1.4696132596685083, - "grad_norm": 0.6630461769430164, - "learning_rate": 6.0542467661178325e-06, - "loss": 0.6243, + "epoch": 1.4723247232472325, + "grad_norm": 0.6775134380343946, + "learning_rate": 5.995256017035703e-06, + "loss": 0.5369, "step": 266 }, { - "epoch": 1.4751381215469612, - "grad_norm": 0.7024087077842354, - "learning_rate": 6.022760330631006e-06, - "loss": 0.6081, + "epoch": 1.477859778597786, + "grad_norm": 0.7387800906697658, + "learning_rate": 5.9635613027404495e-06, + "loss": 0.6659, "step": 267 }, { - "epoch": 1.4806629834254144, - "grad_norm": 0.644335689792361, - "learning_rate": 5.991231508164037e-06, - "loss": 0.6315, + "epoch": 1.4833948339483394, + "grad_norm": 0.5992142954740902, + "learning_rate": 5.931826325545912e-06, + "loss": 0.6023, "step": 268 }, { - "epoch": 1.4861878453038675, - "grad_norm": 0.6011260710575211, - "learning_rate": 5.959661605388229e-06, - "loss": 0.5479, + "epoch": 1.488929889298893, + "grad_norm": 0.6869613178503449, + "learning_rate": 5.900052411514257e-06, + "loss": 0.5711, "step": 269 }, { - "epoch": 1.4917127071823204, - "grad_norm": 0.6735133008955199, - "learning_rate": 5.928051930677404e-06, - "loss": 0.6074, + "epoch": 1.4944649446494465, + "grad_norm": 0.6727011895310602, + "learning_rate": 5.8682408883346535e-06, + "loss": 0.5675, "step": 270 }, { - "epoch": 1.4972375690607735, - "grad_norm": 0.724897423389795, - "learning_rate": 5.896403794053679e-06, - "loss": 0.6117, + "epoch": 1.5, + "grad_norm": 0.6728450572952694, + "learning_rate": 5.836393085267777e-06, + "loss": 0.6195, "step": 271 }, { - "epoch": 1.5027624309392267, - "grad_norm": 0.6432683628472128, - "learning_rate": 5.864718507133176e-06, - "loss": 0.6015, + "epoch": 1.5055350553505535, + "grad_norm": 0.6846152979341545, + "learning_rate": 5.804510333090287e-06, + "loss": 0.6213, "step": 272 }, { - "epoch": 1.5082872928176796, - "grad_norm": 0.6345537221217502, - "learning_rate": 5.83299738307166e-06, - "loss": 0.6491, + "epoch": 1.511070110701107, + "grad_norm": 0.6507562063592163, + "learning_rate": 5.772593964039203e-06, + "loss": 0.6403, "step": 273 }, { - "epoch": 1.5138121546961325, - "grad_norm": 0.6218808254180483, - "learning_rate": 5.801241736510128e-06, - "loss": 0.6134, + "epoch": 1.5166051660516606, + "grad_norm": 0.6432647166128416, + "learning_rate": 5.740645311756246e-06, + "loss": 0.6248, "step": 274 }, { - "epoch": 1.5193370165745856, - "grad_norm": 0.6782578788601987, - "learning_rate": 5.76945288352031e-06, - "loss": 0.6308, + "epoch": 1.5221402214022142, + "grad_norm": 0.6415893639295898, + "learning_rate": 5.708665711232103e-06, + "loss": 0.5917, "step": 275 }, { - "epoch": 1.5248618784530388, - "grad_norm": 0.5841892465281848, - "learning_rate": 5.7376321415501356e-06, - "loss": 0.5434, + "epoch": 1.5276752767527675, + "grad_norm": 0.6033176569216087, + "learning_rate": 5.6766564987506564e-06, + "loss": 0.5589, "step": 276 }, { - "epoch": 1.5303867403314917, - "grad_norm": 0.6650299495850774, - "learning_rate": 5.7057808293691305e-06, - "loss": 0.6293, + "epoch": 1.533210332103321, + "grad_norm": 0.7017492010981081, + "learning_rate": 5.644619011833134e-06, + "loss": 0.6301, "step": 277 }, { - "epoch": 1.5359116022099446, - "grad_norm": 0.752419119034265, - "learning_rate": 5.67390026701377e-06, - "loss": 0.6223, + "epoch": 1.5387453874538746, + "grad_norm": 0.7655426961892206, + "learning_rate": 5.612554589182228e-06, + "loss": 0.6593, "step": 278 }, { - "epoch": 1.5414364640883977, - "grad_norm": 0.8282947869336723, - "learning_rate": 5.641991775732756e-06, - "loss": 0.7056, + "epoch": 1.5442804428044279, + "grad_norm": 0.7237077018760665, + "learning_rate": 5.5804645706261515e-06, + "loss": 0.6946, "step": 279 }, { - "epoch": 1.5469613259668509, - "grad_norm": 0.7173511510171976, - "learning_rate": 5.610056677932274e-06, - "loss": 0.6027, + "epoch": 1.5498154981549814, + "grad_norm": 0.6848993007754327, + "learning_rate": 5.548350297062659e-06, + "loss": 0.6106, "step": 280 }, { - "epoch": 1.5524861878453038, - "grad_norm": 0.6564750666347287, - "learning_rate": 5.5780962971211795e-06, - "loss": 0.6608, + "epoch": 1.555350553505535, + "grad_norm": 0.6816964703469334, + "learning_rate": 5.516213110403009e-06, + "loss": 0.6959, "step": 281 }, { - "epoch": 1.558011049723757, - "grad_norm": 0.686798968992177, - "learning_rate": 5.546111957856155e-06, - "loss": 0.6637, + "epoch": 1.5608856088560885, + "grad_norm": 0.7121757712642759, + "learning_rate": 5.484054353515896e-06, + "loss": 0.6304, "step": 282 }, { - "epoch": 1.56353591160221, - "grad_norm": 0.8467673992557097, - "learning_rate": 5.514104985686802e-06, - "loss": 0.6916, + "epoch": 1.566420664206642, + "grad_norm": 0.785585840187818, + "learning_rate": 5.451875370171341e-06, + "loss": 0.6969, "step": 283 }, { - "epoch": 1.569060773480663, - "grad_norm": 0.7454256879275183, - "learning_rate": 5.482076707100723e-06, - "loss": 0.6741, + "epoch": 1.5719557195571956, + "grad_norm": 0.6929061668579762, + "learning_rate": 5.419677504984534e-06, + "loss": 0.6396, "step": 284 }, { - "epoch": 1.5745856353591159, - "grad_norm": 0.7068205882972726, - "learning_rate": 5.4500284494685275e-06, - "loss": 0.6049, + "epoch": 1.5774907749077491, + "grad_norm": 0.6711279274727765, + "learning_rate": 5.387462103359655e-06, + "loss": 0.6007, "step": 285 }, { - "epoch": 1.580110497237569, - "grad_norm": 0.6123366279350007, - "learning_rate": 5.417961540988837e-06, - "loss": 0.5804, + "epoch": 1.5830258302583027, + "grad_norm": 0.6361660730682954, + "learning_rate": 5.3552305114336515e-06, + "loss": 0.6057, "step": 286 }, { - "epoch": 1.5856353591160222, - "grad_norm": 0.6753783255596223, - "learning_rate": 5.385877310633233e-06, - "loss": 0.6144, + "epoch": 1.5885608856088562, + "grad_norm": 0.6937146439997242, + "learning_rate": 5.32298407601999e-06, + "loss": 0.5955, "step": 287 }, { - "epoch": 1.591160220994475, - "grad_norm": 0.6787303073540277, - "learning_rate": 5.353777088091177e-06, - "loss": 0.6109, + "epoch": 1.5940959409594095, + "grad_norm": 0.6641404122154957, + "learning_rate": 5.290724144552379e-06, + "loss": 0.5694, "step": 288 }, { - "epoch": 1.5966850828729282, - "grad_norm": 0.6689374877161456, - "learning_rate": 5.321662203714909e-06, - "loss": 0.5701, + "epoch": 1.599630996309963, + "grad_norm": 0.6925213117648781, + "learning_rate": 5.258452065028473e-06, + "loss": 0.578, "step": 289 }, { - "epoch": 1.6022099447513813, - "grad_norm": 0.7341945035228687, - "learning_rate": 5.289533988464307e-06, - "loss": 0.6451, + "epoch": 1.6051660516605166, + "grad_norm": 0.6749065542362322, + "learning_rate": 5.2261691859535325e-06, + "loss": 0.6517, "step": 290 }, { - "epoch": 1.6077348066298343, - "grad_norm": 0.7003036927886831, - "learning_rate": 5.257393773851733e-06, - "loss": 0.6009, + "epoch": 1.6107011070110702, + "grad_norm": 0.6569945077124647, + "learning_rate": 5.193876856284085e-06, + "loss": 0.5928, "step": 291 }, { - "epoch": 1.6132596685082872, - "grad_norm": 0.6798929226991296, - "learning_rate": 5.2252428918868446e-06, - "loss": 0.5811, + "epoch": 1.6162361623616235, + "grad_norm": 0.6611834974919497, + "learning_rate": 5.161576425371554e-06, + "loss": 0.5757, "step": 292 }, { - "epoch": 1.6187845303867403, - "grad_norm": 0.603212047810036, - "learning_rate": 5.193082675021393e-06, - "loss": 0.5346, + "epoch": 1.621771217712177, + "grad_norm": 0.6443267893510465, + "learning_rate": 5.1292692429058824e-06, + "loss": 0.567, "step": 293 }, { - "epoch": 1.6243093922651934, - "grad_norm": 0.6908051872037912, - "learning_rate": 5.160914456094005e-06, - "loss": 0.6487, + "epoch": 1.6273062730627306, + "grad_norm": 0.7370247594136543, + "learning_rate": 5.096956658859122e-06, + "loss": 0.6723, "step": 294 }, { - "epoch": 1.6298342541436464, - "grad_norm": 0.759496464512415, - "learning_rate": 5.1287395682749444e-06, - "loss": 0.6885, + "epoch": 1.632841328413284, + "grad_norm": 0.6633312016915573, + "learning_rate": 5.064640023429042e-06, + "loss": 0.6907, "step": 295 }, { - "epoch": 1.6353591160220995, - "grad_norm": 0.7182799104547218, - "learning_rate": 5.0965593450108495e-06, - "loss": 0.6172, + "epoch": 1.6383763837638377, + "grad_norm": 0.6796625855678173, + "learning_rate": 5.032320686982697e-06, + "loss": 0.6061, "step": 296 }, { - "epoch": 1.6408839779005526, - "grad_norm": 0.6463166442166366, - "learning_rate": 5.064375119969491e-06, - "loss": 0.6075, + "epoch": 1.6439114391143912, + "grad_norm": 0.6703488843908327, + "learning_rate": 5e-06, + "loss": 0.5949, "step": 297 }, { - "epoch": 1.6464088397790055, - "grad_norm": 0.7000976763577735, - "learning_rate": 5.03218822698448e-06, - "loss": 0.6416, + "epoch": 1.6494464944649447, + "grad_norm": 0.6879324468041449, + "learning_rate": 4.967679313017304e-06, + "loss": 0.6459, "step": 298 }, { - "epoch": 1.6519337016574585, - "grad_norm": 0.7020061467617188, - "learning_rate": 5e-06, - "loss": 0.5947, + "epoch": 1.6549815498154983, + "grad_norm": 0.6993913963211266, + "learning_rate": 4.9353599765709585e-06, + "loss": 0.5606, "step": 299 }, { - "epoch": 1.6574585635359116, - "grad_norm": 0.7488808415617535, - "learning_rate": 4.967811773015521e-06, - "loss": 0.5809, + "epoch": 1.6605166051660518, + "grad_norm": 0.6842342047928159, + "learning_rate": 4.903043341140879e-06, + "loss": 0.5859, "step": 300 }, { - "epoch": 1.6629834254143647, - "grad_norm": 0.634571743970242, - "learning_rate": 4.93562488003051e-06, - "loss": 0.5944, + "epoch": 1.6660516605166051, + "grad_norm": 0.6414031000352342, + "learning_rate": 4.870730757094121e-06, + "loss": 0.6028, "step": 301 }, { - "epoch": 1.6685082872928176, - "grad_norm": 0.6445574283924204, - "learning_rate": 4.90344065498915e-06, - "loss": 0.5984, + "epoch": 1.6715867158671587, + "grad_norm": 0.6730213374583364, + "learning_rate": 4.838423574628447e-06, + "loss": 0.5758, "step": 302 }, { - "epoch": 1.6740331491712708, - "grad_norm": 0.7137775145837433, - "learning_rate": 4.871260431725058e-06, - "loss": 0.5448, + "epoch": 1.6771217712177122, + "grad_norm": 0.685094126165947, + "learning_rate": 4.806123143715916e-06, + "loss": 0.5783, "step": 303 }, { - "epoch": 1.679558011049724, - "grad_norm": 0.6620727258550688, - "learning_rate": 4.8390855439059955e-06, - "loss": 0.5916, + "epoch": 1.6826568265682655, + "grad_norm": 0.6832171407137465, + "learning_rate": 4.773830814046469e-06, + "loss": 0.5928, "step": 304 }, { - "epoch": 1.6850828729281768, - "grad_norm": 0.648075981508642, - "learning_rate": 4.806917324978608e-06, - "loss": 0.5546, + "epoch": 1.688191881918819, + "grad_norm": 0.6333752235654893, + "learning_rate": 4.741547934971528e-06, + "loss": 0.5476, "step": 305 }, { - "epoch": 1.6906077348066297, - "grad_norm": 0.7077774255913108, - "learning_rate": 4.774757108113156e-06, - "loss": 0.5546, + "epoch": 1.6937269372693726, + "grad_norm": 0.7064803499084308, + "learning_rate": 4.7092758554476215e-06, + "loss": 0.6216, "step": 306 }, { - "epoch": 1.6961325966850829, - "grad_norm": 0.7237233387785816, - "learning_rate": 4.742606226148268e-06, - "loss": 0.6898, + "epoch": 1.6992619926199262, + "grad_norm": 0.6392420306364016, + "learning_rate": 4.677015923980012e-06, + "loss": 0.6679, "step": 307 }, { - "epoch": 1.701657458563536, - "grad_norm": 0.6284936804624925, - "learning_rate": 4.710466011535695e-06, - "loss": 0.6397, + "epoch": 1.7047970479704797, + "grad_norm": 0.6494731361510496, + "learning_rate": 4.644769488566351e-06, + "loss": 0.5886, "step": 308 }, { - "epoch": 1.707182320441989, - "grad_norm": 0.7927972591279655, - "learning_rate": 4.678337796285093e-06, - "loss": 0.6436, + "epoch": 1.7103321033210332, + "grad_norm": 0.6913018221886911, + "learning_rate": 4.6125378966403465e-06, + "loss": 0.6472, "step": 309 }, { - "epoch": 1.7127071823204418, - "grad_norm": 0.6592299730654411, - "learning_rate": 4.6462229119088234e-06, - "loss": 0.5714, + "epoch": 1.7158671586715868, + "grad_norm": 0.640802770083508, + "learning_rate": 4.580322495015466e-06, + "loss": 0.5561, "step": 310 }, { - "epoch": 1.7182320441988952, - "grad_norm": 0.6876331470178856, - "learning_rate": 4.614122689366769e-06, - "loss": 0.6147, + "epoch": 1.7214022140221403, + "grad_norm": 0.6960948975174029, + "learning_rate": 4.548124629828661e-06, + "loss": 0.6236, "step": 311 }, { - "epoch": 1.723756906077348, - "grad_norm": 0.6820420879873159, - "learning_rate": 4.582038459011165e-06, - "loss": 0.5625, + "epoch": 1.7269372693726939, + "grad_norm": 0.6389521774827648, + "learning_rate": 4.515945646484105e-06, + "loss": 0.5764, "step": 312 }, { - "epoch": 1.729281767955801, - "grad_norm": 0.7107222428969042, - "learning_rate": 4.549971550531474e-06, - "loss": 0.6362, + "epoch": 1.7324723247232472, + "grad_norm": 0.6505284331128637, + "learning_rate": 4.483786889596993e-06, + "loss": 0.6177, "step": 313 }, { - "epoch": 1.7348066298342542, - "grad_norm": 0.6991332889693773, - "learning_rate": 4.51792329289928e-06, - "loss": 0.6662, + "epoch": 1.7380073800738007, + "grad_norm": 0.7322008377502368, + "learning_rate": 4.451649702937343e-06, + "loss": 0.7031, "step": 314 }, { - "epoch": 1.7403314917127073, - "grad_norm": 0.6509307723816805, - "learning_rate": 4.485895014313198e-06, - "loss": 0.6239, + "epoch": 1.7435424354243543, + "grad_norm": 0.667856136908068, + "learning_rate": 4.4195354293738484e-06, + "loss": 0.6017, "step": 315 }, { - "epoch": 1.7458563535911602, - "grad_norm": 0.6475315801384613, - "learning_rate": 4.453888042143847e-06, - "loss": 0.6175, + "epoch": 1.7490774907749076, + "grad_norm": 0.6167961116394701, + "learning_rate": 4.387445410817774e-06, + "loss": 0.5948, "step": 316 }, { - "epoch": 1.7513812154696131, - "grad_norm": 0.6737553368323486, - "learning_rate": 4.421903702878822e-06, - "loss": 0.5807, + "epoch": 1.7546125461254611, + "grad_norm": 0.6488335556890084, + "learning_rate": 4.355380988166867e-06, + "loss": 0.5784, "step": 317 }, { - "epoch": 1.7569060773480663, - "grad_norm": 0.6579280198374517, - "learning_rate": 4.389943322067728e-06, - "loss": 0.5769, + "epoch": 1.7601476014760147, + "grad_norm": 0.6433975581483212, + "learning_rate": 4.323343501249346e-06, + "loss": 0.5302, "step": 318 }, { - "epoch": 1.7624309392265194, - "grad_norm": 0.6747867888004441, - "learning_rate": 4.358008224267245e-06, - "loss": 0.5521, + "epoch": 1.7656826568265682, + "grad_norm": 0.6706524043913139, + "learning_rate": 4.291334288767899e-06, + "loss": 0.5922, "step": 319 }, { - "epoch": 1.7679558011049723, - "grad_norm": 0.5916747700224341, - "learning_rate": 4.326099732986231e-06, - "loss": 0.533, + "epoch": 1.7712177121771218, + "grad_norm": 0.6591469335328922, + "learning_rate": 4.259354688243758e-06, + "loss": 0.5898, "step": 320 }, { - "epoch": 1.7734806629834254, - "grad_norm": 0.6766138160854878, - "learning_rate": 4.29421917063087e-06, - "loss": 0.6057, + "epoch": 1.7767527675276753, + "grad_norm": 0.6481007609954463, + "learning_rate": 4.227406035960798e-06, + "loss": 0.5922, "step": 321 }, { - "epoch": 1.7790055248618786, - "grad_norm": 0.6880698365575919, - "learning_rate": 4.262367858449867e-06, - "loss": 0.6024, + "epoch": 1.7822878228782288, + "grad_norm": 0.6650428282720707, + "learning_rate": 4.195489666909714e-06, + "loss": 0.6023, "step": 322 }, { - "epoch": 1.7845303867403315, - "grad_norm": 0.6506755980460356, - "learning_rate": 4.230547116479691e-06, - "loss": 0.5889, + "epoch": 1.7878228782287824, + "grad_norm": 0.636497135234757, + "learning_rate": 4.163606914732224e-06, + "loss": 0.5807, "step": 323 }, { - "epoch": 1.7900552486187844, - "grad_norm": 0.6597664588077232, - "learning_rate": 4.1987582634898724e-06, - "loss": 0.5664, + "epoch": 1.793357933579336, + "grad_norm": 0.6574211931288638, + "learning_rate": 4.131759111665349e-06, + "loss": 0.5459, "step": 324 }, { - "epoch": 1.7955801104972375, - "grad_norm": 0.6385623465686507, - "learning_rate": 4.167002616928341e-06, - "loss": 0.5943, + "epoch": 1.7988929889298892, + "grad_norm": 0.6684303900701675, + "learning_rate": 4.099947588485744e-06, + "loss": 0.6271, "step": 325 }, { - "epoch": 1.8011049723756907, - "grad_norm": 0.6967677614778646, - "learning_rate": 4.135281492866826e-06, - "loss": 0.6442, + "epoch": 1.8044280442804428, + "grad_norm": 0.7115325195567614, + "learning_rate": 4.06817367445409e-06, + "loss": 0.6444, "step": 326 }, { - "epoch": 1.8066298342541436, - "grad_norm": 0.6482473642073888, - "learning_rate": 4.103596205946323e-06, - "loss": 0.6201, + "epoch": 1.8099630996309963, + "grad_norm": 0.6896195168622968, + "learning_rate": 4.036438697259551e-06, + "loss": 0.5971, "step": 327 }, { - "epoch": 1.8121546961325967, - "grad_norm": 0.7193129213261211, - "learning_rate": 4.0719480693225964e-06, - "loss": 0.6096, + "epoch": 1.8154981549815496, + "grad_norm": 0.6687990794813583, + "learning_rate": 4.004743982964298e-06, + "loss": 0.6449, "step": 328 }, { - "epoch": 1.8176795580110499, - "grad_norm": 0.6888287876327789, - "learning_rate": 4.040338394611772e-06, - "loss": 0.6086, + "epoch": 1.8210332103321032, + "grad_norm": 0.6668553440261493, + "learning_rate": 3.9730908559480904e-06, + "loss": 0.5765, "step": 329 }, { - "epoch": 1.8232044198895028, - "grad_norm": 0.6553298757478724, - "learning_rate": 4.0087684918359646e-06, - "loss": 0.6059, + "epoch": 1.8265682656826567, + "grad_norm": 0.658154260938912, + "learning_rate": 3.941480638852948e-06, + "loss": 0.6424, "step": 330 }, { - "epoch": 1.8287292817679557, - "grad_norm": 0.6945457240360784, - "learning_rate": 3.977239669368998e-06, - "loss": 0.6322, + "epoch": 1.8321033210332103, + "grad_norm": 0.7676795703743602, + "learning_rate": 3.909914652527872e-06, + "loss": 0.5954, "step": 331 }, { - "epoch": 1.8342541436464088, - "grad_norm": 0.7470607646196792, - "learning_rate": 3.945753233882168e-06, - "loss": 0.6203, + "epoch": 1.8376383763837638, + "grad_norm": 0.753359036105102, + "learning_rate": 3.878394215973663e-06, + "loss": 0.6623, "step": 332 }, { - "epoch": 1.839779005524862, - "grad_norm": 0.6675969015071959, - "learning_rate": 3.9143104902901085e-06, - "loss": 0.6047, + "epoch": 1.8431734317343174, + "grad_norm": 0.6328329983343084, + "learning_rate": 3.8469206462878e-06, + "loss": 0.5646, "step": 333 }, { - "epoch": 1.8453038674033149, - "grad_norm": 0.631838443419436, - "learning_rate": 3.882912741696688e-06, - "loss": 0.5619, + "epoch": 1.848708487084871, + "grad_norm": 0.6522576574675761, + "learning_rate": 3.815495258609404e-06, + "loss": 0.5644, "step": 334 }, { - "epoch": 1.850828729281768, - "grad_norm": 0.6921273737693909, - "learning_rate": 3.851561289341023e-06, - "loss": 0.5852, + "epoch": 1.8542435424354244, + "grad_norm": 0.6478750849380562, + "learning_rate": 3.784119366064293e-06, + "loss": 0.5752, "step": 335 }, { - "epoch": 1.8563535911602211, - "grad_norm": 0.6212122428108089, - "learning_rate": 3.820257432543539e-06, - "loss": 0.5631, + "epoch": 1.859778597785978, + "grad_norm": 0.6468377900933637, + "learning_rate": 3.752794279710094e-06, + "loss": 0.6167, "step": 336 }, { - "epoch": 1.861878453038674, - "grad_norm": 0.7555498667849819, - "learning_rate": 3.789002468652121e-06, - "loss": 0.6727, + "epoch": 1.8653136531365315, + "grad_norm": 0.6738182100392553, + "learning_rate": 3.721521308481483e-06, + "loss": 0.6567, "step": 337 }, { - "epoch": 1.867403314917127, - "grad_norm": 0.6679491834699562, - "learning_rate": 3.7577976929883608e-06, - "loss": 0.5829, + "epoch": 1.8708487084870848, + "grad_norm": 0.6812077855923881, + "learning_rate": 3.690301759135471e-06, + "loss": 0.5609, "step": 338 }, { - "epoch": 1.87292817679558, - "grad_norm": 0.7841296013368897, - "learning_rate": 3.726644398793857e-06, - "loss": 0.6562, + "epoch": 1.8763837638376384, + "grad_norm": 0.739726141586659, + "learning_rate": 3.6591369361968127e-06, + "loss": 0.6721, "step": 339 }, { - "epoch": 1.8784530386740332, - "grad_norm": 0.6836490585487593, - "learning_rate": 3.695543877176626e-06, - "loss": 0.6354, + "epoch": 1.881918819188192, + "grad_norm": 0.6632323020436683, + "learning_rate": 3.6280281419034934e-06, + "loss": 0.6158, "step": 340 }, { - "epoch": 1.8839779005524862, - "grad_norm": 0.6394642182719062, - "learning_rate": 3.6644974170575907e-06, - "loss": 0.6078, + "epoch": 1.8874538745387452, + "grad_norm": 0.6554347656581475, + "learning_rate": 3.596976676152306e-06, + "loss": 0.6118, "step": 341 }, { - "epoch": 1.889502762430939, - "grad_norm": 0.7143267899941073, - "learning_rate": 3.6335063051171725e-06, - "loss": 0.6029, + "epoch": 1.8929889298892988, + "grad_norm": 0.6943502042805372, + "learning_rate": 3.5659838364445505e-06, + "loss": 0.6257, "step": 342 }, { - "epoch": 1.8950276243093924, - "grad_norm": 0.6909108022068383, - "learning_rate": 3.6025718257419532e-06, - "loss": 0.6233, + "epoch": 1.8985239852398523, + "grad_norm": 0.7344400676037952, + "learning_rate": 3.535050917831797e-06, + "loss": 0.6154, "step": 343 }, { - "epoch": 1.9005524861878453, - "grad_norm": 0.6329351448339191, - "learning_rate": 3.5716952609714517e-06, - "loss": 0.5878, + "epoch": 1.9040590405904059, + "grad_norm": 0.6960672172751073, + "learning_rate": 3.504179212861793e-06, + "loss": 0.6025, "step": 344 }, { - "epoch": 1.9060773480662982, - "grad_norm": 0.6478503333940434, - "learning_rate": 3.540877890444989e-06, - "loss": 0.5693, + "epoch": 1.9095940959409594, + "grad_norm": 0.6793261432569583, + "learning_rate": 3.473370011524435e-06, + "loss": 0.5547, "step": 345 }, { - "epoch": 1.9116022099447514, - "grad_norm": 0.676681543424856, - "learning_rate": 3.5101209913486655e-06, - "loss": 0.5788, + "epoch": 1.915129151291513, + "grad_norm": 0.6780588276365805, + "learning_rate": 3.442624601197877e-06, + "loss": 0.5933, "step": 346 }, { - "epoch": 1.9171270718232045, - "grad_norm": 0.6431181230720638, - "learning_rate": 3.4794258383624115e-06, - "loss": 0.6159, + "epoch": 1.9206642066420665, + "grad_norm": 0.6333360607546321, + "learning_rate": 3.4119442665947346e-06, + "loss": 0.6066, "step": 347 }, { - "epoch": 1.9226519337016574, - "grad_norm": 0.6037104641827318, - "learning_rate": 3.448793703607175e-06, - "loss": 0.5596, + "epoch": 1.92619926199262, + "grad_norm": 0.6297918625305783, + "learning_rate": 3.3813302897083955e-06, + "loss": 0.6139, "step": 348 }, { - "epoch": 1.9281767955801103, - "grad_norm": 0.6475020350302072, - "learning_rate": 3.4182258565921933e-06, - "loss": 0.6464, + "epoch": 1.9317343173431736, + "grad_norm": 0.65128547682773, + "learning_rate": 3.350783949759462e-06, + "loss": 0.6065, "step": 349 }, { - "epoch": 1.9337016574585635, - "grad_norm": 0.6509624192770844, - "learning_rate": 3.3877235641623797e-06, - "loss": 0.5798, + "epoch": 1.937269372693727, + "grad_norm": 0.663702410189263, + "learning_rate": 3.3203065231422904e-06, + "loss": 0.5595, "step": 350 }, { - "epoch": 1.9392265193370166, - "grad_norm": 0.73517544137987, - "learning_rate": 3.3572880904458267e-06, - "loss": 0.5513, + "epoch": 1.9428044280442804, + "grad_norm": 0.7456337908851745, + "learning_rate": 3.289899283371657e-06, + "loss": 0.5635, "step": 351 }, { - "epoch": 1.9447513812154695, - "grad_norm": 0.693346997717625, - "learning_rate": 3.32692069680141e-06, - "loss": 0.6178, + "epoch": 1.948339483394834, + "grad_norm": 0.6245107839051393, + "learning_rate": 3.259563501029548e-06, + "loss": 0.6149, "step": 352 }, { - "epoch": 1.9502762430939227, - "grad_norm": 0.6108811315736592, - "learning_rate": 3.2966226417665125e-06, - "loss": 0.6236, + "epoch": 1.9538745387453873, + "grad_norm": 0.6389202208791118, + "learning_rate": 3.2293004437120622e-06, + "loss": 0.6318, "step": 353 }, { - "epoch": 1.9558011049723758, - "grad_norm": 0.629537696210223, - "learning_rate": 3.2663951810048683e-06, - "loss": 0.6122, + "epoch": 1.9594095940959408, + "grad_norm": 0.6408635995037254, + "learning_rate": 3.1991113759764493e-06, + "loss": 0.6557, "step": 354 }, { - "epoch": 1.9613259668508287, - "grad_norm": 0.6789659150656049, - "learning_rate": 3.236239567254526e-06, - "loss": 0.6318, + "epoch": 1.9649446494464944, + "grad_norm": 0.6610364476788523, + "learning_rate": 3.1689975592882603e-06, + "loss": 0.6132, "step": 355 }, { - "epoch": 1.9668508287292816, - "grad_norm": 0.6281193326590523, - "learning_rate": 3.206157050275927e-06, - "loss": 0.5859, + "epoch": 1.970479704797048, + "grad_norm": 0.6162945914008929, + "learning_rate": 3.1389602519686515e-06, + "loss": 0.5645, "step": 356 }, { - "epoch": 1.9723756906077348, - "grad_norm": 0.6576491463900354, - "learning_rate": 3.176148876800109e-06, - "loss": 0.5849, + "epoch": 1.9760147601476015, + "grad_norm": 0.6343892270612764, + "learning_rate": 3.1090007091417884e-06, + "loss": 0.5662, "step": 357 }, { - "epoch": 1.977900552486188, - "grad_norm": 0.6853036340913029, - "learning_rate": 3.1462162904770376e-06, - "loss": 0.5779, + "epoch": 1.981549815498155, + "grad_norm": 0.6456644070716252, + "learning_rate": 3.0791201826824117e-06, + "loss": 0.5729, "step": 358 }, { - "epoch": 1.9834254143646408, - "grad_norm": 0.6363463477287988, - "learning_rate": 3.116360531824074e-06, - "loss": 0.5699, + "epoch": 1.9870848708487086, + "grad_norm": 0.6692278565767988, + "learning_rate": 3.049319921163526e-06, + "loss": 0.5692, "step": 359 }, { - "epoch": 1.988950276243094, - "grad_norm": 0.6709331445271008, - "learning_rate": 3.0865828381745515e-06, - "loss": 0.5765, + "epoch": 1.992619926199262, + "grad_norm": 0.6513390055411725, + "learning_rate": 3.019601169804216e-06, + "loss": 0.5702, "step": 360 }, { - "epoch": 1.994475138121547, - "grad_norm": 0.6288565179176786, - "learning_rate": 3.056884443626499e-06, - "loss": 0.5679, + "epoch": 1.9981549815498156, + "grad_norm": 0.8207020809923308, + "learning_rate": 2.9899651704176324e-06, + "loss": 0.712, "step": 361 }, { - "epoch": 2.0, - "grad_norm": 0.6979469101641466, - "learning_rate": 3.027266578991497e-06, - "loss": 0.6038, + "epoch": 2.003690036900369, + "grad_norm": 0.8316443587754938, + "learning_rate": 2.9604131613590825e-06, + "loss": 0.6622, "step": 362 }, { - "epoch": 2.005524861878453, - "grad_norm": 0.6857162137141414, - "learning_rate": 2.997730471743667e-06, - "loss": 0.6094, + "epoch": 2.0092250922509227, + "grad_norm": 0.6321179010589463, + "learning_rate": 2.9309463774743047e-06, + "loss": 0.5349, "step": 363 }, { - "epoch": 2.0110497237569063, - "grad_norm": 0.6086171132783301, - "learning_rate": 2.9682773459688087e-06, - "loss": 0.4948, + "epoch": 2.014760147601476, + "grad_norm": 0.6623408052994709, + "learning_rate": 2.901566050047855e-06, + "loss": 0.558, "step": 364 }, { - "epoch": 2.016574585635359, - "grad_norm": 0.6681480376436151, - "learning_rate": 2.9389084223136523e-06, - "loss": 0.5459, + "epoch": 2.0202952029520294, + "grad_norm": 0.6148409552359754, + "learning_rate": 2.8722734067516637e-06, + "loss": 0.5131, "step": 365 }, { - "epoch": 2.022099447513812, - "grad_norm": 0.593654083071827, - "learning_rate": 2.9096249179352833e-06, - "loss": 0.5109, + "epoch": 2.025830258302583, + "grad_norm": 0.628811924344138, + "learning_rate": 2.843069671593734e-06, + "loss": 0.5053, "step": 366 }, { - "epoch": 2.027624309392265, - "grad_norm": 0.7007646517763502, - "learning_rate": 2.880428046450697e-06, - "loss": 0.5265, + "epoch": 2.0313653136531364, + "grad_norm": 0.7178060640719511, + "learning_rate": 2.813956064866996e-06, + "loss": 0.5215, "step": 367 }, { - "epoch": 2.0331491712707184, - "grad_norm": 0.702650158812983, - "learning_rate": 2.8513190178865004e-06, - "loss": 0.5257, + "epoch": 2.03690036900369, + "grad_norm": 0.64237638492648, + "learning_rate": 2.784933803098326e-06, + "loss": 0.5344, "step": 368 }, { - "epoch": 2.0386740331491713, - "grad_norm": 0.6531178861723401, - "learning_rate": 2.822299038628762e-06, - "loss": 0.5403, + "epoch": 2.0424354243542435, + "grad_norm": 0.660701484636481, + "learning_rate": 2.7560040989976894e-06, + "loss": 0.5522, "step": 369 }, { - "epoch": 2.044198895027624, - "grad_norm": 0.6472401575778369, - "learning_rate": 2.793369311373021e-06, - "loss": 0.5365, + "epoch": 2.047970479704797, + "grad_norm": 0.6442004508033177, + "learning_rate": 2.7271681614074973e-06, + "loss": 0.5692, "step": 370 }, { - "epoch": 2.0497237569060776, - "grad_norm": 0.6641615455285143, - "learning_rate": 2.7645310350744296e-06, - "loss": 0.5952, + "epoch": 2.0535055350553506, + "grad_norm": 0.701530505525817, + "learning_rate": 2.6984271952520723e-06, + "loss": 0.558, "step": 371 }, { - "epoch": 2.0552486187845305, - "grad_norm": 0.7123204501299103, - "learning_rate": 2.7357854048980893e-06, - "loss": 0.5631, + "epoch": 2.059040590405904, + "grad_norm": 0.6677439032763148, + "learning_rate": 2.6697824014873076e-06, + "loss": 0.5507, "step": 372 }, { - "epoch": 2.0607734806629834, - "grad_norm": 0.6900539716574728, - "learning_rate": 2.7071336121694856e-06, - "loss": 0.5452, + "epoch": 2.0645756457564577, + "grad_norm": 0.750156552596835, + "learning_rate": 2.641234977050484e-06, + "loss": 0.5605, "step": 373 }, { - "epoch": 2.0662983425414363, - "grad_norm": 0.7021705227314888, - "learning_rate": 2.6785768443251437e-06, - "loss": 0.5436, + "epoch": 2.0701107011070112, + "grad_norm": 0.7047824557323877, + "learning_rate": 2.6127861148102552e-06, + "loss": 0.5489, "step": 374 }, { - "epoch": 2.0718232044198897, - "grad_norm": 0.7605695654764374, - "learning_rate": 2.6501162848634023e-06, - "loss": 0.5488, + "epoch": 2.0756457564575648, + "grad_norm": 0.711760947816998, + "learning_rate": 2.5844370035168077e-06, + "loss": 0.5525, "step": 375 }, { - "epoch": 2.0773480662983426, - "grad_norm": 0.683125655831491, - "learning_rate": 2.621753113295361e-06, - "loss": 0.5851, + "epoch": 2.081180811808118, + "grad_norm": 0.6970505989934466, + "learning_rate": 2.5561888277521797e-06, + "loss": 0.533, "step": 376 }, { - "epoch": 2.0828729281767955, - "grad_norm": 0.6551843848513452, - "learning_rate": 2.5934885050960183e-06, - "loss": 0.5096, + "epoch": 2.0867158671586714, + "grad_norm": 0.6679637897186145, + "learning_rate": 2.528042767880766e-06, + "loss": 0.5421, "step": 377 }, { - "epoch": 2.0883977900552484, - "grad_norm": 0.7056721518164418, - "learning_rate": 2.565323631655532e-06, - "loss": 0.5273, + "epoch": 2.092250922509225, + "grad_norm": 0.6516580625237066, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.5511, "step": 378 }, { - "epoch": 2.0939226519337018, - "grad_norm": 0.6611500614916435, - "learning_rate": 2.537259660230679e-06, - "loss": 0.5454, + "epoch": 2.0977859778597785, + "grad_norm": 0.6626636324231238, + "learning_rate": 2.4720616958912054e-06, + "loss": 0.5732, "step": 379 }, { - "epoch": 2.0994475138121547, - "grad_norm": 0.7019525646170253, - "learning_rate": 2.5092977538964887e-06, - "loss": 0.5911, + "epoch": 2.103321033210332, + "grad_norm": 0.6723428723615049, + "learning_rate": 2.4442290229706344e-06, + "loss": 0.5727, "step": 380 }, { - "epoch": 2.1049723756906076, - "grad_norm": 0.6472748593975306, - "learning_rate": 2.4814390714980325e-06, - "loss": 0.5211, + "epoch": 2.1088560885608856, + "grad_norm": 0.6599974503924196, + "learning_rate": 2.4165031442406857e-06, + "loss": 0.5238, "step": 381 }, { - "epoch": 2.110497237569061, - "grad_norm": 0.7243217306106386, - "learning_rate": 2.4536847676023995e-06, - "loss": 0.5731, + "epoch": 2.114391143911439, + "grad_norm": 0.7008041331642095, + "learning_rate": 2.3888852182413087e-06, + "loss": 0.4859, "step": 382 }, { - "epoch": 2.116022099447514, - "grad_norm": 0.6881568022419648, - "learning_rate": 2.426035992450848e-06, - "loss": 0.4937, + "epoch": 2.1199261992619927, + "grad_norm": 0.7500710516966634, + "learning_rate": 2.361376399001592e-06, + "loss": 0.545, "step": 383 }, { - "epoch": 2.1215469613259668, - "grad_norm": 0.675968032373875, - "learning_rate": 2.398493891911127e-06, - "loss": 0.5293, + "epoch": 2.125461254612546, + "grad_norm": 0.6693296360494665, + "learning_rate": 2.333977835991545e-06, + "loss": 0.5467, "step": 384 }, { - "epoch": 2.12707182320442, - "grad_norm": 0.6793280674446409, - "learning_rate": 2.3710596074300045e-06, - "loss": 0.5502, + "epoch": 2.1309963099630997, + "grad_norm": 0.6478267617206592, + "learning_rate": 2.3066906740740626e-06, + "loss": 0.5347, "step": 385 }, { - "epoch": 2.132596685082873, - "grad_norm": 0.6827900429979371, - "learning_rate": 2.3437342759859472e-06, - "loss": 0.5774, + "epoch": 2.1365313653136533, + "grad_norm": 0.7177671429812185, + "learning_rate": 2.2795160534570866e-06, + "loss": 0.623, "step": 386 }, { - "epoch": 2.138121546961326, - "grad_norm": 0.6299741302424408, - "learning_rate": 2.316519030041998e-06, - "loss": 0.5355, + "epoch": 2.142066420664207, + "grad_norm": 0.6423839307947838, + "learning_rate": 2.2524551096459703e-06, + "loss": 0.5646, "step": 387 }, { - "epoch": 2.143646408839779, - "grad_norm": 0.679310688963614, - "learning_rate": 2.289414997498856e-06, - "loss": 0.579, + "epoch": 2.14760147601476, + "grad_norm": 0.6793354954895883, + "learning_rate": 2.2255089733960162e-06, + "loss": 0.5316, "step": 388 }, { - "epoch": 2.149171270718232, - "grad_norm": 0.6255767903887023, - "learning_rate": 2.2624233016481224e-06, - "loss": 0.5316, + "epoch": 2.1531365313653135, + "grad_norm": 0.663196223424689, + "learning_rate": 2.1986787706652377e-06, + "loss": 0.5788, "step": 389 }, { - "epoch": 2.154696132596685, - "grad_norm": 0.6592300712497258, - "learning_rate": 2.235545061125748e-06, - "loss": 0.5532, + "epoch": 2.158671586715867, + "grad_norm": 0.613244135525591, + "learning_rate": 2.171965622567308e-06, + "loss": 0.4808, "step": 390 }, { - "epoch": 2.160220994475138, - "grad_norm": 0.6610713391367827, - "learning_rate": 2.2087813898656775e-06, - "loss": 0.507, + "epoch": 2.1642066420664205, + "grad_norm": 0.6636202632875322, + "learning_rate": 2.1453706453247088e-06, + "loss": 0.5427, "step": 391 }, { - "epoch": 2.165745856353591, - "grad_norm": 0.6350291188218904, - "learning_rate": 2.182133397053675e-06, - "loss": 0.5256, + "epoch": 2.169741697416974, + "grad_norm": 0.7361578196639839, + "learning_rate": 2.1188949502220987e-06, + "loss": 0.555, "step": 392 }, { - "epoch": 2.1712707182320443, - "grad_norm": 0.7851921217380164, - "learning_rate": 2.1556021870813653e-06, - "loss": 0.5521, + "epoch": 2.1752767527675276, + "grad_norm": 0.7185291233253194, + "learning_rate": 2.0925396435598665e-06, + "loss": 0.5743, "step": 393 }, { - "epoch": 2.1767955801104972, - "grad_norm": 0.6866503114131093, - "learning_rate": 2.129188859500459e-06, - "loss": 0.5784, + "epoch": 2.180811808118081, + "grad_norm": 0.6519229097148226, + "learning_rate": 2.066305826607911e-06, + "loss": 0.5818, "step": 394 }, { - "epoch": 2.18232044198895, - "grad_norm": 0.6494728733892837, - "learning_rate": 2.102894508977182e-06, - "loss": 0.5601, + "epoch": 2.1863468634686347, + "grad_norm": 0.6790978922388056, + "learning_rate": 2.0401945955596206e-06, + "loss": 0.517, "step": 395 }, { - "epoch": 2.1878453038674035, - "grad_norm": 0.6678705573550681, - "learning_rate": 2.0767202252469113e-06, - "loss": 0.4986, + "epoch": 2.1918819188191883, + "grad_norm": 0.6753695790228323, + "learning_rate": 2.0142070414860704e-06, + "loss": 0.5176, "step": 396 }, { - "epoch": 2.1933701657458564, - "grad_norm": 0.7160045418734877, - "learning_rate": 2.0506670930690074e-06, - "loss": 0.5683, + "epoch": 2.197416974169742, + "grad_norm": 0.7265806499649696, + "learning_rate": 1.9883442502904284e-06, + "loss": 0.6055, "step": 397 }, { - "epoch": 2.1988950276243093, - "grad_norm": 0.6856778789385509, - "learning_rate": 2.0247361921818638e-06, - "loss": 0.5593, + "epoch": 2.2029520295202953, + "grad_norm": 0.6457410100738195, + "learning_rate": 1.962607302662582e-06, + "loss": 0.5406, "step": 398 }, { - "epoch": 2.2044198895027622, - "grad_norm": 0.6486089048078473, - "learning_rate": 1.9989285972581595e-06, - "loss": 0.5482, + "epoch": 2.208487084870849, + "grad_norm": 0.625524661837684, + "learning_rate": 1.936997274033986e-06, + "loss": 0.5076, "step": 399 }, { - "epoch": 2.2099447513812156, - "grad_norm": 0.6894974735994067, - "learning_rate": 1.9732453778603104e-06, - "loss": 0.5486, + "epoch": 2.2140221402214024, + "grad_norm": 0.637857740685298, + "learning_rate": 1.9115152345327154e-06, + "loss": 0.5578, "step": 400 }, { - "epoch": 2.2154696132596685, - "grad_norm": 0.6135965995020478, - "learning_rate": 1.947687598396154e-06, - "loss": 0.523, + "epoch": 2.2195571955719555, + "grad_norm": 0.6101593685630197, + "learning_rate": 1.8861622489387555e-06, + "loss": 0.4875, "step": 401 }, { - "epoch": 2.2209944751381214, - "grad_norm": 0.5824847944758283, - "learning_rate": 1.92225631807483e-06, - "loss": 0.4781, + "epoch": 2.225092250922509, + "grad_norm": 0.6518718553669549, + "learning_rate": 1.8609393766395083e-06, + "loss": 0.57, "step": 402 }, { - "epoch": 2.226519337016575, - "grad_norm": 0.6880197469905236, - "learning_rate": 1.896952590862886e-06, - "loss": 0.5563, + "epoch": 2.2306273062730626, + "grad_norm": 0.6624958993518161, + "learning_rate": 1.8358476715855262e-06, + "loss": 0.5176, "step": 403 }, { - "epoch": 2.2320441988950277, - "grad_norm": 0.6446697768546276, - "learning_rate": 1.8717774654405962e-06, - "loss": 0.5182, + "epoch": 2.236162361623616, + "grad_norm": 0.6719691100692955, + "learning_rate": 1.8108881822464697e-06, + "loss": 0.5313, "step": 404 }, { - "epoch": 2.2375690607734806, - "grad_norm": 0.6867656203180093, - "learning_rate": 1.8467319851584952e-06, - "loss": 0.5602, + "epoch": 2.2416974169741697, + "grad_norm": 0.6736416728036624, + "learning_rate": 1.7860619515673034e-06, + "loss": 0.5625, "step": 405 }, { - "epoch": 2.2430939226519335, - "grad_norm": 0.6379130888342771, - "learning_rate": 1.8218171879941465e-06, - "loss": 0.5657, + "epoch": 2.2472324723247232, + "grad_norm": 0.6422294559583493, + "learning_rate": 1.7613700169247055e-06, + "loss": 0.5705, "step": 406 }, { - "epoch": 2.248618784530387, - "grad_norm": 0.6345085950204016, - "learning_rate": 1.7970341065091246e-06, - "loss": 0.5266, + "epoch": 2.2527675276752768, + "grad_norm": 0.6603028481218244, + "learning_rate": 1.7368134100837286e-06, + "loss": 0.589, "step": 407 }, { - "epoch": 2.25414364640884, - "grad_norm": 0.6637996504745941, - "learning_rate": 1.7723837678062083e-06, - "loss": 0.5885, + "epoch": 2.2583025830258303, + "grad_norm": 0.5987866874874727, + "learning_rate": 1.7123931571546826e-06, + "loss": 0.4857, "step": 408 }, { - "epoch": 2.2596685082872927, - "grad_norm": 0.6029850494243252, - "learning_rate": 1.7478671934868302e-06, - "loss": 0.4729, + "epoch": 2.263837638376384, + "grad_norm": 0.6594047301752323, + "learning_rate": 1.6881102785502618e-06, + "loss": 0.5477, "step": 409 }, { - "epoch": 2.265193370165746, - "grad_norm": 0.6459276687581147, - "learning_rate": 1.7234853996087304e-06, - "loss": 0.5752, + "epoch": 2.2693726937269374, + "grad_norm": 0.6196530000721581, + "learning_rate": 1.6639657889429017e-06, + "loss": 0.5506, "step": 410 }, { - "epoch": 2.270718232044199, - "grad_norm": 0.6226965789608703, - "learning_rate": 1.699239396643841e-06, - "loss": 0.5411, + "epoch": 2.274907749077491, + "grad_norm": 0.6722537304232283, + "learning_rate": 1.639960697222388e-06, + "loss": 0.5797, "step": 411 }, { - "epoch": 2.276243093922652, - "grad_norm": 0.6397669563858227, - "learning_rate": 1.6751301894364274e-06, - "loss": 0.5467, + "epoch": 2.280442804428044, + "grad_norm": 0.615523426294855, + "learning_rate": 1.6160960064536907e-06, + "loss": 0.4981, "step": 412 }, { - "epoch": 2.281767955801105, - "grad_norm": 0.6608828543106198, - "learning_rate": 1.6511587771614208e-06, - "loss": 0.5103, + "epoch": 2.2859778597785976, + "grad_norm": 0.6568878327953758, + "learning_rate": 1.5923727138350548e-06, + "loss": 0.5263, "step": 413 }, { - "epoch": 2.287292817679558, - "grad_norm": 0.6785181930874952, - "learning_rate": 1.6273261532830242e-06, - "loss": 0.5667, + "epoch": 2.291512915129151, + "grad_norm": 0.6557176714698197, + "learning_rate": 1.5687918106563326e-06, + "loss": 0.5977, "step": 414 }, { - "epoch": 2.292817679558011, - "grad_norm": 0.6387931680506953, - "learning_rate": 1.6036333055135345e-06, - "loss": 0.5469, + "epoch": 2.2970479704797047, + "grad_norm": 0.6343521214286307, + "learning_rate": 1.5453542822575624e-06, + "loss": 0.4672, "step": 415 }, { - "epoch": 2.298342541436464, - "grad_norm": 0.6325771066839061, - "learning_rate": 1.5800812157724084e-06, - "loss": 0.4923, + "epoch": 2.302583025830258, + "grad_norm": 0.6160998844576888, + "learning_rate": 1.52206110798779e-06, + "loss": 0.5385, "step": 416 }, { - "epoch": 2.303867403314917, - "grad_norm": 0.6382423549193763, - "learning_rate": 1.556670860145567e-06, - "loss": 0.536, + "epoch": 2.3081180811808117, + "grad_norm": 0.6465234486304856, + "learning_rate": 1.4989132611641576e-06, + "loss": 0.5165, "step": 417 }, { - "epoch": 2.3093922651933703, - "grad_norm": 0.655241299309122, - "learning_rate": 1.533403208844947e-06, - "loss": 0.5437, + "epoch": 2.3136531365313653, + "grad_norm": 0.6620294899666387, + "learning_rate": 1.4759117090312197e-06, + "loss": 0.5605, "step": 418 }, { - "epoch": 2.314917127071823, - "grad_norm": 0.6805728068716513, - "learning_rate": 1.5102792261682813e-06, - "loss": 0.5418, + "epoch": 2.319188191881919, + "grad_norm": 0.6822424027974909, + "learning_rate": 1.453057412720536e-06, + "loss": 0.5921, "step": 419 }, { - "epoch": 2.320441988950276, - "grad_norm": 0.6836442326534425, - "learning_rate": 1.487299870459155e-06, - "loss": 0.6096, + "epoch": 2.3247232472324724, + "grad_norm": 0.6384847627809093, + "learning_rate": 1.4303513272105057e-06, + "loss": 0.5403, "step": 420 }, { - "epoch": 2.3259668508287294, - "grad_norm": 0.6284514006381783, - "learning_rate": 1.4644660940672628e-06, - "loss": 0.534, + "epoch": 2.330258302583026, + "grad_norm": 0.6719145192710272, + "learning_rate": 1.4077944012864636e-06, + "loss": 0.5854, "step": 421 }, { - "epoch": 2.3314917127071824, - "grad_norm": 0.6642371366553435, - "learning_rate": 1.4417788433089596e-06, - "loss": 0.5647, + "epoch": 2.3357933579335795, + "grad_norm": 0.640451661360814, + "learning_rate": 1.3853875775010355e-06, + "loss": 0.517, "step": 422 }, { - "epoch": 2.3370165745856353, - "grad_norm": 0.6541492771250856, - "learning_rate": 1.4192390584280347e-06, - "loss": 0.5209, + "epoch": 2.341328413284133, + "grad_norm": 0.6398126379640671, + "learning_rate": 1.3631317921347564e-06, + "loss": 0.5791, "step": 423 }, { - "epoch": 2.3425414364640886, - "grad_norm": 0.67885347112545, - "learning_rate": 1.3968476735567392e-06, - "loss": 0.5806, + "epoch": 2.3468634686346865, + "grad_norm": 0.6487742664028805, + "learning_rate": 1.3410279751569399e-06, + "loss": 0.5157, "step": 424 }, { - "epoch": 2.3480662983425415, - "grad_norm": 0.6716876869006629, - "learning_rate": 1.3746056166770872e-06, - "loss": 0.5247, + "epoch": 2.35239852398524, + "grad_norm": 0.6671465550992883, + "learning_rate": 1.3190770501868243e-06, + "loss": 0.5588, "step": 425 }, { - "epoch": 2.3535911602209945, - "grad_norm": 0.6934129162480718, - "learning_rate": 1.352513809582377e-06, - "loss": 0.5466, + "epoch": 2.357933579335793, + "grad_norm": 0.6454481634549197, + "learning_rate": 1.297279934454978e-06, + "loss": 0.5026, "step": 426 }, { - "epoch": 2.3591160220994474, - "grad_norm": 0.6400098878071714, - "learning_rate": 1.330573167839005e-06, - "loss": 0.4947, + "epoch": 2.3634686346863467, + "grad_norm": 0.6127427807997947, + "learning_rate": 1.2756375387649717e-06, + "loss": 0.5454, "step": 427 }, { - "epoch": 2.3646408839779007, - "grad_norm": 0.6163300040150086, - "learning_rate": 1.3087846007485134e-06, - "loss": 0.5675, + "epoch": 2.3690036900369003, + "grad_norm": 0.6438945248930303, + "learning_rate": 1.25415076745532e-06, + "loss": 0.545, "step": 428 }, { - "epoch": 2.3701657458563536, - "grad_norm": 0.6148189692207581, - "learning_rate": 1.2871490113099066e-06, - "loss": 0.5162, + "epoch": 2.374538745387454, + "grad_norm": 0.6124407692016695, + "learning_rate": 1.2328205183616964e-06, + "loss": 0.5189, "step": 429 }, { - "epoch": 2.3756906077348066, - "grad_norm": 0.6296512007771748, - "learning_rate": 1.2656672961822285e-06, - "loss": 0.5354, + "epoch": 2.3800738007380073, + "grad_norm": 0.6639137528285265, + "learning_rate": 1.2116476827794104e-06, + "loss": 0.549, "step": 430 }, { - "epoch": 2.3812154696132595, - "grad_norm": 0.6659353506949467, - "learning_rate": 1.2443403456474017e-06, - "loss": 0.5558, + "epoch": 2.385608856088561, + "grad_norm": 0.6472668638709654, + "learning_rate": 1.1906331454261704e-06, + "loss": 0.5616, "step": 431 }, { - "epoch": 2.386740331491713, - "grad_norm": 0.6250696714712421, - "learning_rate": 1.223169043573325e-06, - "loss": 0.5161, + "epoch": 2.3911439114391144, + "grad_norm": 0.6382738061476565, + "learning_rate": 1.1697777844051105e-06, + "loss": 0.5069, "step": 432 }, { - "epoch": 2.3922651933701657, - "grad_norm": 0.628285669477399, - "learning_rate": 1.2021542673772584e-06, - "loss": 0.5309, + "epoch": 2.396678966789668, + "grad_norm": 0.6095694583232508, + "learning_rate": 1.1490824711681026e-06, + "loss": 0.5096, "step": 433 }, { - "epoch": 2.3977900552486187, - "grad_norm": 0.6470067569677168, - "learning_rate": 1.1812968879894387e-06, - "loss": 0.5133, + "epoch": 2.4022140221402215, + "grad_norm": 0.6389566278508614, + "learning_rate": 1.1285480704793378e-06, + "loss": 0.5554, "step": 434 }, { - "epoch": 2.403314917127072, - "grad_norm": 0.6690982254466323, - "learning_rate": 1.1605977698170001e-06, - "loss": 0.5498, + "epoch": 2.407749077490775, + "grad_norm": 0.6565575087635012, + "learning_rate": 1.1081754403792e-06, + "loss": 0.5289, "step": 435 }, { - "epoch": 2.408839779005525, - "grad_norm": 0.6773043112171515, - "learning_rate": 1.1400577707081467e-06, - "loss": 0.5279, + "epoch": 2.4132841328413286, + "grad_norm": 0.6298283487396668, + "learning_rate": 1.0879654321484012e-06, + "loss": 0.5732, "step": 436 }, { - "epoch": 2.414364640883978, - "grad_norm": 0.6465508208097143, - "learning_rate": 1.1196777419165927e-06, - "loss": 0.5645, + "epoch": 2.4188191881918817, + "grad_norm": 0.6546518202275479, + "learning_rate": 1.067918890272419e-06, + "loss": 0.5408, "step": 437 }, { - "epoch": 2.4198895027624308, - "grad_norm": 0.6754843790206556, - "learning_rate": 1.0994585280662978e-06, - "loss": 0.5709, + "epoch": 2.4243542435424352, + "grad_norm": 0.6656591924546393, + "learning_rate": 1.0480366524062041e-06, + "loss": 0.551, "step": 438 }, { - "epoch": 2.425414364640884, - "grad_norm": 0.6373867979221823, - "learning_rate": 1.0794009671164484e-06, - "loss": 0.5301, + "epoch": 2.4298892988929888, + "grad_norm": 0.6298524026967492, + "learning_rate": 1.0283195493391823e-06, + "loss": 0.5011, "step": 439 }, { - "epoch": 2.430939226519337, - "grad_norm": 0.6529002066290224, - "learning_rate": 1.0595058903267357e-06, - "loss": 0.5283, + "epoch": 2.4354243542435423, + "grad_norm": 0.6525468703564422, + "learning_rate": 1.008768404960535e-06, + "loss": 0.5737, "step": 440 }, { - "epoch": 2.43646408839779, - "grad_norm": 0.6785490644572046, - "learning_rate": 1.0397741222229057e-06, - "loss": 0.5368, + "epoch": 2.440959409594096, + "grad_norm": 0.6544594096244687, + "learning_rate": 9.893840362247809e-07, + "loss": 0.5503, "step": 441 }, { - "epoch": 2.441988950276243, - "grad_norm": 0.6906247289237696, - "learning_rate": 1.0202064805625883e-06, - "loss": 0.5811, + "epoch": 2.4464944649446494, + "grad_norm": 0.6318395119148414, + "learning_rate": 9.701672531176287e-07, + "loss": 0.5224, "step": 442 }, { - "epoch": 2.447513812154696, - "grad_norm": 0.6332990091280457, - "learning_rate": 1.0008037763014033e-06, - "loss": 0.5324, + "epoch": 2.452029520295203, + "grad_norm": 0.6196921977020754, + "learning_rate": 9.511188586221376e-07, + "loss": 0.545, "step": 443 }, { - "epoch": 2.453038674033149, - "grad_norm": 0.5981606081277429, - "learning_rate": 9.815668135593548e-07, - "loss": 0.4981, + "epoch": 2.4575645756457565, + "grad_norm": 0.6327900918490961, + "learning_rate": 9.322396486851626e-07, + "loss": 0.5424, "step": 444 }, { - "epoch": 2.458563535911602, - "grad_norm": 0.6658535935861339, - "learning_rate": 9.624963895874995e-07, - "loss": 0.5804, + "epoch": 2.46309963099631, + "grad_norm": 0.6363476446254773, + "learning_rate": 9.135304121840976e-07, + "loss": 0.5538, "step": 445 }, { - "epoch": 2.4640883977900554, - "grad_norm": 0.6173994326467493, - "learning_rate": 9.435932947349169e-07, - "loss": 0.5223, + "epoch": 2.4686346863468636, + "grad_norm": 0.6313076689214548, + "learning_rate": 8.949919308939081e-07, + "loss": 0.5218, "step": 446 }, { - "epoch": 2.4696132596685083, - "grad_norm": 0.6696683287460099, - "learning_rate": 9.248583124159438e-07, - "loss": 0.5488, + "epoch": 2.474169741697417, + "grad_norm": 0.657993153154449, + "learning_rate": 8.766249794544662e-07, + "loss": 0.5321, "step": 447 }, { - "epoch": 2.4751381215469612, - "grad_norm": 0.681352463917548, - "learning_rate": 9.062922190777079e-07, - "loss": 0.5377, + "epoch": 2.4797047970479706, + "grad_norm": 0.6418969902696497, + "learning_rate": 8.584303253381848e-07, + "loss": 0.5194, "step": 448 }, { - "epoch": 2.4806629834254146, - "grad_norm": 0.6413299506586427, - "learning_rate": 8.878957841679542e-07, - "loss": 0.538, + "epoch": 2.485239852398524, + "grad_norm": 0.6043587008301916, + "learning_rate": 8.404087288179425e-07, + "loss": 0.5134, "step": 449 }, { - "epoch": 2.4861878453038675, - "grad_norm": 0.5976825742538746, - "learning_rate": 8.696697701031543e-07, - "loss": 0.4698, + "epoch": 2.4907749077490777, + "grad_norm": 0.6431540180087669, + "learning_rate": 8.225609429353187e-07, + "loss": 0.5348, "step": 450 }, { - "epoch": 2.4917127071823204, - "grad_norm": 0.6586286738339465, - "learning_rate": 8.516149322369055e-07, - "loss": 0.5529, + "epoch": 2.496309963099631, + "grad_norm": 0.6268189919104177, + "learning_rate": 8.048877134691269e-07, + "loss": 0.5043, "step": 451 }, { - "epoch": 2.4972375690607733, - "grad_norm": 0.6374084552099051, - "learning_rate": 8.337320188286318e-07, - "loss": 0.4934, + "epoch": 2.5018450184501844, + "grad_norm": 0.6178939835577518, + "learning_rate": 7.873897789042523e-07, + "loss": 0.5165, "step": 452 }, { - "epoch": 2.5027624309392267, - "grad_norm": 0.6429007341483025, - "learning_rate": 8.160217710125661e-07, - "loss": 0.5544, + "epoch": 2.507380073800738, + "grad_norm": 0.6747581351563309, + "learning_rate": 7.700678704007947e-07, + "loss": 0.6184, "step": 453 }, { - "epoch": 2.5082872928176796, - "grad_norm": 0.6664059102374921, - "learning_rate": 7.984849227670421e-07, - "loss": 0.6073, + "epoch": 2.5129151291512914, + "grad_norm": 0.6379894631440596, + "learning_rate": 7.529227117635135e-07, + "loss": 0.5297, "step": 454 }, { - "epoch": 2.5138121546961325, - "grad_norm": 0.6392153790680253, - "learning_rate": 7.811222008840719e-07, - "loss": 0.5542, + "epoch": 2.518450184501845, + "grad_norm": 0.6345560431082808, + "learning_rate": 7.35955019411585e-07, + "loss": 0.5723, "step": 455 }, { - "epoch": 2.5193370165745854, - "grad_norm": 0.6018857165135849, - "learning_rate": 7.639343249392256e-07, - "loss": 0.5285, + "epoch": 2.5239852398523985, + "grad_norm": 0.7041260690229593, + "learning_rate": 7.191655023486682e-07, + "loss": 0.5518, "step": 456 }, { - "epoch": 2.5248618784530388, - "grad_norm": 0.7674240660472489, - "learning_rate": 7.469220072618094e-07, - "loss": 0.5761, + "epoch": 2.529520295202952, + "grad_norm": 0.6596174352194049, + "learning_rate": 7.02554862133275e-07, + "loss": 0.5507, "step": 457 }, { - "epoch": 2.5303867403314917, - "grad_norm": 0.6496289775570502, - "learning_rate": 7.300859529053422e-07, - "loss": 0.5525, + "epoch": 2.5350553505535056, + "grad_norm": 0.6491422253781083, + "learning_rate": 6.86123792849458e-07, + "loss": 0.5146, "step": 458 }, { - "epoch": 2.5359116022099446, - "grad_norm": 0.6547899116663247, - "learning_rate": 7.13426859618338e-07, - "loss": 0.4973, + "epoch": 2.540590405904059, + "grad_norm": 0.6702481216161904, + "learning_rate": 6.698729810778065e-07, + "loss": 0.5837, "step": 459 }, { - "epoch": 2.541436464088398, - "grad_norm": 0.6505467220043211, - "learning_rate": 6.969454178153923e-07, - "loss": 0.5752, + "epoch": 2.5461254612546127, + "grad_norm": 0.6189552978352973, + "learning_rate": 6.53803105866761e-07, + "loss": 0.5343, "step": 460 }, { - "epoch": 2.546961325966851, - "grad_norm": 0.6735594708393216, - "learning_rate": 6.806423105485576e-07, - "loss": 0.5679, + "epoch": 2.551660516605166, + "grad_norm": 0.6835144543692675, + "learning_rate": 6.379148387042317e-07, + "loss": 0.5462, "step": 461 }, { - "epoch": 2.552486187845304, - "grad_norm": 0.7191676631486709, - "learning_rate": 6.645182134790467e-07, - "loss": 0.5415, + "epoch": 2.5571955719557193, + "grad_norm": 0.6464350965970691, + "learning_rate": 6.222088434895462e-07, + "loss": 0.5381, "step": 462 }, { - "epoch": 2.558011049723757, - "grad_norm": 0.6766548740002727, - "learning_rate": 6.485737948492237e-07, - "loss": 0.5656, + "epoch": 2.562730627306273, + "grad_norm": 0.6957103842105568, + "learning_rate": 6.066857765057055e-07, + "loss": 0.6151, "step": 463 }, { - "epoch": 2.56353591160221, - "grad_norm": 0.6610159478158815, - "learning_rate": 6.328097154549146e-07, - "loss": 0.576, + "epoch": 2.5682656826568264, + "grad_norm": 0.6030789961609527, + "learning_rate": 5.9134628639196e-07, + "loss": 0.5061, "step": 464 }, { - "epoch": 2.569060773480663, - "grad_norm": 0.6227047055678046, - "learning_rate": 6.172266286180162e-07, - "loss": 0.5068, + "epoch": 2.57380073800738, + "grad_norm": 0.6317615119983371, + "learning_rate": 5.76191014116711e-07, + "loss": 0.507, "step": 465 }, { - "epoch": 2.574585635359116, - "grad_norm": 0.6389412833805688, - "learning_rate": 6.018251801594232e-07, - "loss": 0.5258, + "epoch": 2.5793357933579335, + "grad_norm": 0.6435467983289933, + "learning_rate": 5.612205929507209e-07, + "loss": 0.5501, "step": 466 }, { - "epoch": 2.580110497237569, - "grad_norm": 0.6232609013170282, - "learning_rate": 5.866060083722624e-07, - "loss": 0.5355, + "epoch": 2.584870848708487, + "grad_norm": 0.6358149902010037, + "learning_rate": 5.464356484406535e-07, + "loss": 0.5211, "step": 467 }, { - "epoch": 2.585635359116022, - "grad_norm": 0.6278311488371855, - "learning_rate": 5.715697439954432e-07, - "loss": 0.5126, + "epoch": 2.5904059040590406, + "grad_norm": 0.6195286275874121, + "learning_rate": 5.318367983829393e-07, + "loss": 0.5032, "step": 468 }, { - "epoch": 2.591160220994475, - "grad_norm": 0.6077201096302113, - "learning_rate": 5.567170101875074e-07, - "loss": 0.4949, + "epoch": 2.595940959409594, + "grad_norm": 0.6692400503094423, + "learning_rate": 5.174246527979532e-07, + "loss": 0.5574, "step": 469 }, { - "epoch": 2.596685082872928, - "grad_norm": 0.6686676733895326, - "learning_rate": 5.420484225008138e-07, - "loss": 0.5645, + "epoch": 2.6014760147601477, + "grad_norm": 0.6342700435947363, + "learning_rate": 5.031998139045352e-07, + "loss": 0.5523, "step": 470 }, { - "epoch": 2.6022099447513813, - "grad_norm": 0.6427639590105313, - "learning_rate": 5.275645888560233e-07, - "loss": 0.5939, + "epoch": 2.607011070110701, + "grad_norm": 0.6412069996618961, + "learning_rate": 4.891628760948114e-07, + "loss": 0.5423, "step": 471 }, { - "epoch": 2.6077348066298343, - "grad_norm": 0.5974938612500432, - "learning_rate": 5.132661095168994e-07, - "loss": 0.5061, + "epoch": 2.6125461254612548, + "grad_norm": 0.628500100341704, + "learning_rate": 4.753144259093734e-07, + "loss": 0.5451, "step": 472 }, { - "epoch": 2.613259668508287, - "grad_norm": 0.6408484309671157, - "learning_rate": 4.991535770654449e-07, - "loss": 0.5004, + "epoch": 2.6180811808118083, + "grad_norm": 0.5904702310704353, + "learning_rate": 4.6165504201275635e-07, + "loss": 0.4915, "step": 473 }, { - "epoch": 2.6187845303867405, - "grad_norm": 0.6464185272075329, - "learning_rate": 4.852275763773251e-07, - "loss": 0.5614, + "epoch": 2.623616236162362, + "grad_norm": 0.7363244318199392, + "learning_rate": 4.481852951692672e-07, + "loss": 0.6614, "step": 474 }, { - "epoch": 2.6243093922651934, - "grad_norm": 0.7272955176468657, - "learning_rate": 4.71488684597643e-07, - "loss": 0.6543, + "epoch": 2.6291512915129154, + "grad_norm": 0.6437098900248531, + "learning_rate": 4.349057482191299e-07, + "loss": 0.5644, "step": 475 }, { - "epoch": 2.6298342541436464, - "grad_norm": 0.6731985657132716, - "learning_rate": 4.57937471117012e-07, - "loss": 0.5776, + "epoch": 2.6346863468634685, + "grad_norm": 0.6735179848745367, + "learning_rate": 4.2181695605497066e-07, + "loss": 0.6371, "step": 476 }, { - "epoch": 2.6353591160220997, - "grad_norm": 0.6348396347822451, - "learning_rate": 4.445744975479627e-07, - "loss": 0.6063, + "epoch": 2.640221402214022, + "grad_norm": 0.6166395492738032, + "learning_rate": 4.089194655986306e-07, + "loss": 0.5439, "step": 477 }, { - "epoch": 2.6408839779005526, - "grad_norm": 0.635930709870673, - "learning_rate": 4.3140031770166457e-07, - "loss": 0.5416, + "epoch": 2.6457564575645756, + "grad_norm": 0.5978015771749204, + "learning_rate": 3.9621381577830855e-07, + "loss": 0.4831, "step": 478 }, { - "epoch": 2.6464088397790055, - "grad_norm": 0.6098205107229787, - "learning_rate": 4.184154775649768e-07, - "loss": 0.5063, + "epoch": 2.651291512915129, + "grad_norm": 0.6462673792890513, + "learning_rate": 3.837005375060482e-07, + "loss": 0.597, "step": 479 }, { - "epoch": 2.6519337016574585, - "grad_norm": 0.6607556626481141, - "learning_rate": 4.0562051527781534e-07, - "loss": 0.5875, + "epoch": 2.6568265682656826, + "grad_norm": 0.6130295271007709, + "learning_rate": 3.7138015365554834e-07, + "loss": 0.5106, "step": 480 }, { - "epoch": 2.6574585635359114, - "grad_norm": 0.6324042696562345, - "learning_rate": 3.930159611108603e-07, - "loss": 0.4783, + "epoch": 2.662361623616236, + "grad_norm": 0.6671222024328364, + "learning_rate": 3.592531790403159e-07, + "loss": 0.5363, "step": 481 }, { - "epoch": 2.6629834254143647, - "grad_norm": 0.6938336791108509, - "learning_rate": 3.8060233744356634e-07, - "loss": 0.5537, + "epoch": 2.6678966789667897, + "grad_norm": 0.6138396843380316, + "learning_rate": 3.473201203921578e-07, + "loss": 0.5469, "step": 482 }, { - "epoch": 2.6685082872928176, - "grad_norm": 0.6325704993286224, - "learning_rate": 3.683801587425251e-07, - "loss": 0.5502, + "epoch": 2.6734317343173433, + "grad_norm": 0.6660598208965025, + "learning_rate": 3.355814763399973e-07, + "loss": 0.581, "step": 483 }, { - "epoch": 2.6740331491712706, - "grad_norm": 0.6934512585984158, - "learning_rate": 3.563499315401386e-07, - "loss": 0.5962, + "epoch": 2.678966789667897, + "grad_norm": 0.6108073985826324, + "learning_rate": 3.2403773738905185e-07, + "loss": 0.5273, "step": 484 }, { - "epoch": 2.679558011049724, - "grad_norm": 0.6128926230516574, - "learning_rate": 3.4451215441362264e-07, - "loss": 0.5282, + "epoch": 2.6845018450184504, + "grad_norm": 0.7379112157721128, + "learning_rate": 3.1268938590032495e-07, + "loss": 0.5626, "step": 485 }, { - "epoch": 2.685082872928177, - "grad_norm": 0.6812281139244459, - "learning_rate": 3.328673179643555e-07, - "loss": 0.5655, + "epoch": 2.6900369003690034, + "grad_norm": 0.6732052466349552, + "learning_rate": 3.015368960704584e-07, + "loss": 0.5891, "step": 486 }, { - "epoch": 2.6906077348066297, - "grad_norm": 0.6773585472582779, - "learning_rate": 3.214159047975324e-07, - "loss": 0.5818, + "epoch": 2.695571955719557, + "grad_norm": 0.6659946641749191, + "learning_rate": 2.905807339119138e-07, + "loss": 0.5258, "step": 487 }, { - "epoch": 2.696132596685083, - "grad_norm": 0.6516795165171537, - "learning_rate": 3.101583895021731e-07, - "loss": 0.5326, + "epoch": 2.7011070110701105, + "grad_norm": 0.6952105961310837, + "learning_rate": 2.798213572335001e-07, + "loss": 0.5581, "step": 488 }, { - "epoch": 2.701657458563536, - "grad_norm": 0.6662378644824801, - "learning_rate": 2.990952386314505e-07, - "loss": 0.5417, + "epoch": 2.706642066420664, + "grad_norm": 0.6365348737573899, + "learning_rate": 2.6925921562124867e-07, + "loss": 0.5311, "step": 489 }, { - "epoch": 2.707182320441989, - "grad_norm": 0.6210703639589708, - "learning_rate": 2.8822691068335515e-07, - "loss": 0.4857, + "epoch": 2.7121771217712176, + "grad_norm": 0.5766266659794174, + "learning_rate": 2.5889475041961767e-07, + "loss": 0.4352, "step": 490 }, { - "epoch": 2.712707182320442, - "grad_norm": 0.6196505032930314, - "learning_rate": 2.7755385608169374e-07, - "loss": 0.4956, + "epoch": 2.717712177121771, + "grad_norm": 0.7319731694850389, + "learning_rate": 2.487283947130609e-07, + "loss": 0.5061, "step": 491 }, { - "epoch": 2.718232044198895, - "grad_norm": 0.713069198760707, - "learning_rate": 2.6707651715742075e-07, - "loss": 0.5266, + "epoch": 2.7232472324723247, + "grad_norm": 0.6859080435441426, + "learning_rate": 2.3876057330792344e-07, + "loss": 0.55, "step": 492 }, { - "epoch": 2.723756906077348, - "grad_norm": 0.6649253442646119, - "learning_rate": 2.567953281303059e-07, - "loss": 0.5354, + "epoch": 2.7287822878228782, + "grad_norm": 0.6487814667155516, + "learning_rate": 2.289917027146943e-07, + "loss": 0.5418, "step": 493 }, { - "epoch": 2.729281767955801, - "grad_norm": 0.6407239217065679, - "learning_rate": 2.4671071509094367e-07, - "loss": 0.5376, + "epoch": 2.734317343173432, + "grad_norm": 0.6761313635926454, + "learning_rate": 2.1942219113060215e-07, + "loss": 0.5819, "step": 494 }, { - "epoch": 2.734806629834254, - "grad_norm": 0.6804096600469481, - "learning_rate": 2.368230959830875e-07, - "loss": 0.6262, + "epoch": 2.7398523985239853, + "grad_norm": 0.6242690303129593, + "learning_rate": 2.1005243842255552e-07, + "loss": 0.5655, "step": 495 }, { - "epoch": 2.7403314917127073, - "grad_norm": 0.6396360467624123, - "learning_rate": 2.2713288058633321e-07, - "loss": 0.5281, + "epoch": 2.745387453874539, + "grad_norm": 0.6222359002332329, + "learning_rate": 2.0088283611044034e-07, + "loss": 0.5279, "step": 496 }, { - "epoch": 2.74585635359116, - "grad_norm": 0.6476227999392966, - "learning_rate": 2.1764047049913528e-07, - "loss": 0.5318, + "epoch": 2.7509225092250924, + "grad_norm": 0.6148480903080046, + "learning_rate": 1.919137673507543e-07, + "loss": 0.5385, "step": 497 }, { - "epoch": 2.751381215469613, - "grad_norm": 0.6122643116968306, - "learning_rate": 2.0834625912216133e-07, - "loss": 0.4926, + "epoch": 2.756457564575646, + "grad_norm": 0.6004801018266677, + "learning_rate": 1.8314560692059836e-07, + "loss": 0.4995, "step": 498 }, { - "epoch": 2.7569060773480665, - "grad_norm": 0.6528467839639731, - "learning_rate": 1.992506316419912e-07, - "loss": 0.546, + "epoch": 2.7619926199261995, + "grad_norm": 0.6362099075257641, + "learning_rate": 1.745787212020178e-07, + "loss": 0.5248, "step": 499 }, { - "epoch": 2.7624309392265194, - "grad_norm": 0.61759278073022, - "learning_rate": 1.9035396501515148e-07, - "loss": 0.4984, + "epoch": 2.767527675276753, + "grad_norm": 0.6405183086515418, + "learning_rate": 1.6621346816668993e-07, + "loss": 0.5456, "step": 500 }, { - "epoch": 2.7679558011049723, - "grad_norm": 0.6443700794544784, - "learning_rate": 1.8165662795249172e-07, - "loss": 0.5885, + "epoch": 2.773062730627306, + "grad_norm": 0.6383996319685834, + "learning_rate": 1.5805019736097105e-07, + "loss": 0.5484, "step": 501 }, { - "epoch": 2.7734806629834257, - "grad_norm": 0.6316205180357609, - "learning_rate": 1.7315898090390748e-07, - "loss": 0.5291, + "epoch": 2.7785977859778597, + "grad_norm": 0.6484780190882176, + "learning_rate": 1.500892498912826e-07, + "loss": 0.5614, "step": 502 }, { - "epoch": 2.7790055248618786, - "grad_norm": 0.6628524099587767, - "learning_rate": 1.6486137604339813e-07, - "loss": 0.569, + "epoch": 2.784132841328413, + "grad_norm": 0.6104614408300383, + "learning_rate": 1.4233095840986756e-07, + "loss": 0.53, "step": 503 }, { - "epoch": 2.7845303867403315, - "grad_norm": 0.6149733294537271, - "learning_rate": 1.5676415725447424e-07, - "loss": 0.4872, + "epoch": 2.7896678966789668, + "grad_norm": 0.602450823572722, + "learning_rate": 1.3477564710088097e-07, + "loss": 0.4869, "step": 504 }, { - "epoch": 2.7900552486187844, - "grad_norm": 0.6199230062873984, - "learning_rate": 1.4886766011590449e-07, - "loss": 0.5333, + "epoch": 2.7952029520295203, + "grad_norm": 0.6213278708400987, + "learning_rate": 1.2742363166685035e-07, + "loss": 0.5867, "step": 505 }, { - "epoch": 2.7955801104972373, - "grad_norm": 0.65667152629127, - "learning_rate": 1.4117221188780616e-07, - "loss": 0.597, + "epoch": 2.800738007380074, + "grad_norm": 0.6572722468206588, + "learning_rate": 1.2027521931548214e-07, + "loss": 0.5681, "step": 506 }, { - "epoch": 2.8011049723756907, - "grad_norm": 0.6818138783253694, - "learning_rate": 1.3367813149808728e-07, - "loss": 0.5724, + "epoch": 2.8062730627306274, + "grad_norm": 0.6458832457141599, + "learning_rate": 1.1333070874682217e-07, + "loss": 0.5781, "step": 507 }, { - "epoch": 2.8066298342541436, - "grad_norm": 0.6178280944589913, - "learning_rate": 1.2638572952922478e-07, - "loss": 0.5352, + "epoch": 2.811808118081181, + "grad_norm": 0.6173074533807742, + "learning_rate": 1.0659039014077943e-07, + "loss": 0.5485, "step": 508 }, { - "epoch": 2.8121546961325965, - "grad_norm": 0.6134027399269469, - "learning_rate": 1.192953082053927e-07, - "loss": 0.5361, + "epoch": 2.8173431734317345, + "grad_norm": 0.576088238404349, + "learning_rate": 1.0005454514499413e-07, + "loss": 0.4937, "step": 509 }, { - "epoch": 2.81767955801105, - "grad_norm": 0.5711150946348813, - "learning_rate": 1.1240716137994045e-07, - "loss": 0.4995, + "epoch": 2.8228782287822876, + "grad_norm": 0.6463368067791495, + "learning_rate": 9.372344686307655e-08, + "loss": 0.5686, "step": 510 }, { - "epoch": 2.8232044198895028, - "grad_norm": 0.6559442348032254, - "learning_rate": 1.0572157452321097e-07, - "loss": 0.5625, + "epoch": 2.828413284132841, + "grad_norm": 0.6380928953673762, + "learning_rate": 8.759735984318896e-08, + "loss": 0.4994, "step": 511 }, { - "epoch": 2.8287292817679557, - "grad_norm": 0.6552526778451414, - "learning_rate": 9.923882471071123e-08, - "loss": 0.5439, + "epoch": 2.8339483394833946, + "grad_norm": 0.6305602061857106, + "learning_rate": 8.167654006699444e-08, + "loss": 0.5506, "step": 512 }, { - "epoch": 2.834254143646409, - "grad_norm": 0.6605300263231985, - "learning_rate": 9.295918061163034e-08, - "loss": 0.5461, + "epoch": 2.839483394833948, + "grad_norm": 0.6225751887694984, + "learning_rate": 7.59612349389599e-08, + "loss": 0.5042, "step": 513 }, { - "epoch": 2.839779005524862, - "grad_norm": 0.6591460854201797, - "learning_rate": 8.688290247770071e-08, - "loss": 0.4975, + "epoch": 2.8450184501845017, + "grad_norm": 0.6644219206184678, + "learning_rate": 7.04516832760177e-08, + "loss": 0.5969, "step": 514 }, { - "epoch": 2.845303867403315, - "grad_norm": 0.6367498607019844, - "learning_rate": 8.101024213241826e-08, - "loss": 0.5556, + "epoch": 2.8505535055350553, + "grad_norm": 0.6453523828993342, + "learning_rate": 6.514811529758747e-08, + "loss": 0.5817, "step": 515 }, { - "epoch": 2.8508287292817682, - "grad_norm": 0.654902042221914, - "learning_rate": 7.534144296060142e-08, - "loss": 0.5925, + "epoch": 2.856088560885609, + "grad_norm": 0.6229940991961801, + "learning_rate": 6.005075261595495e-08, + "loss": 0.537, "step": 516 }, { - "epoch": 2.856353591160221, - "grad_norm": 0.6837595818853758, - "learning_rate": 6.987673989830523e-08, - "loss": 0.5811, + "epoch": 2.8616236162361623, + "grad_norm": 0.6614904718651109, + "learning_rate": 5.515980822701439e-08, + "loss": 0.5757, "step": 517 }, { - "epoch": 2.861878453038674, - "grad_norm": 0.6227316374587448, - "learning_rate": 6.461635942308641e-08, - "loss": 0.5371, + "epoch": 2.867158671586716, + "grad_norm": 0.6494054996388359, + "learning_rate": 5.047548650136513e-08, + "loss": 0.5035, "step": 518 }, { - "epoch": 2.867403314917127, - "grad_norm": 0.6780391903150306, - "learning_rate": 5.9560519544614725e-08, - "loss": 0.4954, + "epoch": 2.8726937269372694, + "grad_norm": 0.5994286550317681, + "learning_rate": 4.599798317577342e-08, + "loss": 0.4803, "step": 519 }, { - "epoch": 2.87292817679558, - "grad_norm": 0.6322452524518853, - "learning_rate": 5.47094297956402e-08, - "loss": 0.5109, + "epoch": 2.878228782287823, + "grad_norm": 0.6745239619202922, + "learning_rate": 4.172748534499449e-08, + "loss": 0.5271, "step": 520 }, { - "epoch": 2.8784530386740332, - "grad_norm": 0.6320105555650557, - "learning_rate": 5.0063291223308993e-08, - "loss": 0.4752, + "epoch": 2.8837638376383765, + "grad_norm": 0.626628102926196, + "learning_rate": 3.766417145395218e-08, + "loss": 0.5359, "step": 521 }, { - "epoch": 2.883977900552486, - "grad_norm": 0.6772494967739778, - "learning_rate": 4.5622296380828936e-08, - "loss": 0.5787, + "epoch": 2.88929889298893, + "grad_norm": 0.6508218671654497, + "learning_rate": 3.3808211290284886e-08, + "loss": 0.5114, "step": 522 }, { - "epoch": 2.889502762430939, - "grad_norm": 0.6123147921965099, - "learning_rate": 4.138662931949255e-08, - "loss": 0.4779, + "epoch": 2.8948339483394836, + "grad_norm": 0.5951817188696978, + "learning_rate": 3.015976597725068e-08, + "loss": 0.5294, "step": 523 }, { - "epoch": 2.8950276243093924, - "grad_norm": 0.6387696683129898, - "learning_rate": 3.7356465581047105e-08, - "loss": 0.5329, + "epoch": 2.900369003690037, + "grad_norm": 0.671303151438165, + "learning_rate": 2.6718987966992683e-08, + "loss": 0.5213, "step": 524 }, { - "epoch": 2.9005524861878453, - "grad_norm": 0.6541159566548991, - "learning_rate": 3.353197219041981e-08, - "loss": 0.5193, + "epoch": 2.9059040590405907, + "grad_norm": 0.676031871740552, + "learning_rate": 2.3486021034170857e-08, + "loss": 0.5459, "step": 525 }, { - "epoch": 2.9060773480662982, - "grad_norm": 0.6447424767896685, - "learning_rate": 2.9913307648797293e-08, - "loss": 0.5528, + "epoch": 2.911439114391144, + "grad_norm": 0.6208018052133741, + "learning_rate": 2.0461000269953457e-08, + "loss": 0.5358, "step": 526 }, { - "epoch": 2.9116022099447516, - "grad_norm": 0.6455122275019286, - "learning_rate": 2.6500621927054716e-08, - "loss": 0.5462, + "epoch": 2.9169741697416973, + "grad_norm": 0.6381166866678656, + "learning_rate": 1.7644052076371544e-08, + "loss": 0.5146, "step": 527 }, { - "epoch": 2.9171270718232045, - "grad_norm": 0.6329939876148144, - "learning_rate": 2.3294056459541302e-08, - "loss": 0.4936, + "epoch": 2.922509225092251, + "grad_norm": 0.6198869071038576, + "learning_rate": 1.5035294161039882e-08, + "loss": 0.5176, "step": 528 }, { - "epoch": 2.9226519337016574, - "grad_norm": 0.6557883836151688, - "learning_rate": 2.0293744138219495e-08, - "loss": 0.5273, + "epoch": 2.9280442804428044, + "grad_norm": 0.6701848507204047, + "learning_rate": 1.2634835532233658e-08, + "loss": 0.5598, "step": 529 }, { - "epoch": 2.9281767955801103, - "grad_norm": 0.6686955747117752, - "learning_rate": 1.7499809307154892e-08, - "loss": 0.5506, + "epoch": 2.933579335793358, + "grad_norm": 0.6207812803170829, + "learning_rate": 1.044277649433989e-08, + "loss": 0.4992, "step": 530 }, { - "epoch": 2.9337016574585633, - "grad_norm": 0.5955974502813394, - "learning_rate": 1.4912367757366485e-08, - "loss": 0.487, + "epoch": 2.9391143911439115, + "grad_norm": 0.6255257686553812, + "learning_rate": 8.459208643659122e-09, + "loss": 0.474, "step": 531 }, { - "epoch": 2.9392265193370166, - "grad_norm": 0.6114812296017272, - "learning_rate": 1.2531526722026067e-08, - "loss": 0.4649, + "epoch": 2.944649446494465, + "grad_norm": 0.6636870576719867, + "learning_rate": 6.6842148645840374e-09, + "loss": 0.5284, "step": 532 }, { - "epoch": 2.9447513812154695, - "grad_norm": 0.6612511110875763, - "learning_rate": 1.0357384872011767e-08, - "loss": 0.5328, + "epoch": 2.9501845018450186, + "grad_norm": 0.6474860377915387, + "learning_rate": 5.11786932613223e-09, + "loss": 0.5327, "step": 533 }, { - "epoch": 2.9502762430939224, - "grad_norm": 0.6280473844471933, - "learning_rate": 8.390032311824115e-09, - "loss": 0.5157, + "epoch": 2.955719557195572, + "grad_norm": 0.6112767808361741, + "learning_rate": 3.760237478849793e-09, + "loss": 0.5189, "step": 534 }, { - "epoch": 2.955801104972376, - "grad_norm": 0.6485700698849401, - "learning_rate": 6.629550575847355e-09, - "loss": 0.5375, + "epoch": 2.961254612546125, + "grad_norm": 0.680320025195111, + "learning_rate": 2.611376052073511e-09, + "loss": 0.55, "step": 535 }, { - "epoch": 2.9613259668508287, - "grad_norm": 0.6621455408098162, - "learning_rate": 5.0760126249715935e-09, - "loss": 0.6062, + "epoch": 2.9667896678966788, + "grad_norm": 0.6580418332010902, + "learning_rate": 1.6713330515627512e-09, + "loss": 0.5855, "step": 536 }, { - "epoch": 2.9668508287292816, - "grad_norm": 0.6340833044767035, - "learning_rate": 3.729482843569665e-09, - "loss": 0.5398, + "epoch": 2.9723247232472323, + "grad_norm": 0.6263030989787438, + "learning_rate": 9.401477574932927e-10, + "loss": 0.5423, "step": 537 }, { - "epoch": 2.972375690607735, - "grad_norm": 0.6235121401681177, - "learning_rate": 2.5900170368281517e-09, - "loss": 0.5235, + "epoch": 2.977859778597786, + "grad_norm": 0.6000757044114218, + "learning_rate": 4.178507228136397e-10, + "loss": 0.4987, "step": 538 }, { - "epoch": 2.977900552486188, - "grad_norm": 0.626406886227414, - "learning_rate": 1.657662428434792e-09, - "loss": 0.5245, + "epoch": 2.9833948339483394, + "grad_norm": 0.6436284868599454, + "learning_rate": 1.0446377197104174e-10, + "loss": 0.5854, "step": 539 }, { - "epoch": 2.983425414364641, - "grad_norm": 0.6517990473785358, - "learning_rate": 9.324576586211553e-10, - "loss": 0.5615, - "step": 540 - }, - { - "epoch": 2.988950276243094, - "grad_norm": 0.6588091394692884, - "learning_rate": 4.1443278256170227e-10, - "loss": 0.5629, - "step": 541 - }, - { - "epoch": 2.994475138121547, - "grad_norm": 0.6095291946882728, - "learning_rate": 1.0360926912866831e-10, - "loss": 0.4717, - "step": 542 - }, - { - "epoch": 3.0, - "grad_norm": 0.6878897999106169, + "epoch": 2.988929889298893, + "grad_norm": 0.6261144170087621, "learning_rate": 0.0, - "loss": 0.5853, - "step": 543 + "loss": 0.5354, + "step": 540 }, { - "epoch": 3.0, - "step": 543, - "total_flos": 136664820957184.0, - "train_loss": 0.613760003727444, - "train_runtime": 1147.7371, - "train_samples_per_second": 45.3, - "train_steps_per_second": 0.473 + "epoch": 2.988929889298893, + "step": 540, + "total_flos": 110732096700416.0, + "train_loss": 0.6148645067104587, + "train_runtime": 2111.1531, + "train_samples_per_second": 24.628, + "train_steps_per_second": 0.256 } ], "logging_steps": 1, - "max_steps": 543, + "max_steps": 540, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, @@ -3836,8 +3815,8 @@ "attributes": {} } }, - "total_flos": 136664820957184.0, - "train_batch_size": 3, + "total_flos": 110732096700416.0, + "train_batch_size": 1, "trial_name": null, "trial_params": null }