diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14526 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 2069, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00048344210780759005, + "grad_norm": 2.4757904153326913, + "learning_rate": 0.0, + "loss": 0.9924, + "step": 1 + }, + { + "epoch": 0.0009668842156151801, + "grad_norm": 2.3793618820940923, + "learning_rate": 9.615384615384617e-08, + "loss": 0.9738, + "step": 2 + }, + { + "epoch": 0.0014503263234227702, + "grad_norm": 2.375691201697703, + "learning_rate": 1.9230769230769234e-07, + "loss": 0.9588, + "step": 3 + }, + { + "epoch": 0.0019337684312303602, + "grad_norm": 2.3403619553808497, + "learning_rate": 2.884615384615385e-07, + "loss": 0.9862, + "step": 4 + }, + { + "epoch": 0.00241721053903795, + "grad_norm": 2.3613475552419394, + "learning_rate": 3.846153846153847e-07, + "loss": 0.9758, + "step": 5 + }, + { + "epoch": 0.0029006526468455403, + "grad_norm": 2.374422358129782, + "learning_rate": 4.807692307692308e-07, + "loss": 0.9716, + "step": 6 + }, + { + "epoch": 0.00338409475465313, + "grad_norm": 2.478706471115894, + "learning_rate": 5.76923076923077e-07, + "loss": 0.976, + "step": 7 + }, + { + "epoch": 0.0038675368624607204, + "grad_norm": 2.3811968693026198, + "learning_rate": 6.730769230769231e-07, + "loss": 0.9873, + "step": 8 + }, + { + "epoch": 0.00435097897026831, + "grad_norm": 2.2147341913024956, + "learning_rate": 7.692307692307694e-07, + "loss": 0.9286, + "step": 9 + }, + { + "epoch": 0.0048344210780759, + "grad_norm": 2.335255162349414, + "learning_rate": 8.653846153846154e-07, + "loss": 0.9845, + "step": 10 + }, + { + "epoch": 0.005317863185883491, + "grad_norm": 2.218894644037587, + "learning_rate": 9.615384615384617e-07, + "loss": 0.95, + "step": 11 + }, + { + "epoch": 0.005801305293691081, + "grad_norm": 2.2403773948516226, + "learning_rate": 1.0576923076923078e-06, + "loss": 0.9715, + "step": 12 + }, + { + "epoch": 0.0062847474014986705, + "grad_norm": 2.164785866254398, + "learning_rate": 1.153846153846154e-06, + "loss": 0.9422, + "step": 13 + }, + { + "epoch": 0.00676818950930626, + "grad_norm": 2.2075110374685947, + "learning_rate": 1.25e-06, + "loss": 0.965, + "step": 14 + }, + { + "epoch": 0.007251631617113851, + "grad_norm": 1.9308798071113116, + "learning_rate": 1.3461538461538462e-06, + "loss": 0.9372, + "step": 15 + }, + { + "epoch": 0.007735073724921441, + "grad_norm": 1.8705357350667309, + "learning_rate": 1.4423076923076922e-06, + "loss": 0.9443, + "step": 16 + }, + { + "epoch": 0.00821851583272903, + "grad_norm": 1.775691766227149, + "learning_rate": 1.5384615384615387e-06, + "loss": 0.9362, + "step": 17 + }, + { + "epoch": 0.00870195794053662, + "grad_norm": 1.7290053738093054, + "learning_rate": 1.6346153846153848e-06, + "loss": 0.9298, + "step": 18 + }, + { + "epoch": 0.00918540004834421, + "grad_norm": 1.6541389612298973, + "learning_rate": 1.7307692307692308e-06, + "loss": 0.9336, + "step": 19 + }, + { + "epoch": 0.0096688421561518, + "grad_norm": 1.2338607620225968, + "learning_rate": 1.826923076923077e-06, + "loss": 0.9055, + "step": 20 + }, + { + "epoch": 0.01015228426395939, + "grad_norm": 1.1808086522456918, + "learning_rate": 1.9230769230769234e-06, + "loss": 0.8962, + "step": 21 + }, + { + "epoch": 0.010635726371766982, + "grad_norm": 1.090531117286559, + "learning_rate": 2.0192307692307692e-06, + "loss": 0.8702, + "step": 22 + }, + { + "epoch": 0.011119168479574571, + "grad_norm": 1.095517820053717, + "learning_rate": 2.1153846153846155e-06, + "loss": 0.8816, + "step": 23 + }, + { + "epoch": 0.011602610587382161, + "grad_norm": 1.0208393908518454, + "learning_rate": 2.211538461538462e-06, + "loss": 0.8699, + "step": 24 + }, + { + "epoch": 0.012086052695189751, + "grad_norm": 1.004109121666044, + "learning_rate": 2.307692307692308e-06, + "loss": 0.8669, + "step": 25 + }, + { + "epoch": 0.012569494802997341, + "grad_norm": 0.98169412760157, + "learning_rate": 2.403846153846154e-06, + "loss": 0.8371, + "step": 26 + }, + { + "epoch": 0.01305293691080493, + "grad_norm": 0.9209444270757048, + "learning_rate": 2.5e-06, + "loss": 0.8388, + "step": 27 + }, + { + "epoch": 0.01353637901861252, + "grad_norm": 0.8619822284316448, + "learning_rate": 2.5961538461538465e-06, + "loss": 0.8041, + "step": 28 + }, + { + "epoch": 0.01401982112642011, + "grad_norm": 0.9241232197315488, + "learning_rate": 2.6923076923076923e-06, + "loss": 0.8091, + "step": 29 + }, + { + "epoch": 0.014503263234227702, + "grad_norm": 0.917429451582305, + "learning_rate": 2.7884615384615386e-06, + "loss": 0.7749, + "step": 30 + }, + { + "epoch": 0.014986705342035292, + "grad_norm": 0.9043786370452085, + "learning_rate": 2.8846153846153845e-06, + "loss": 0.8144, + "step": 31 + }, + { + "epoch": 0.015470147449842882, + "grad_norm": 0.820031258272968, + "learning_rate": 2.980769230769231e-06, + "loss": 0.7931, + "step": 32 + }, + { + "epoch": 0.01595358955765047, + "grad_norm": 0.7726701862119408, + "learning_rate": 3.0769230769230774e-06, + "loss": 0.7903, + "step": 33 + }, + { + "epoch": 0.01643703166545806, + "grad_norm": 0.6871638945331215, + "learning_rate": 3.1730769230769233e-06, + "loss": 0.7256, + "step": 34 + }, + { + "epoch": 0.01692047377326565, + "grad_norm": 0.7302580243312591, + "learning_rate": 3.2692307692307696e-06, + "loss": 0.7956, + "step": 35 + }, + { + "epoch": 0.01740391588107324, + "grad_norm": 0.6737498928543134, + "learning_rate": 3.365384615384616e-06, + "loss": 0.7478, + "step": 36 + }, + { + "epoch": 0.01788735798888083, + "grad_norm": 0.6914440787905148, + "learning_rate": 3.4615384615384617e-06, + "loss": 0.7621, + "step": 37 + }, + { + "epoch": 0.01837080009668842, + "grad_norm": 0.6869329802424697, + "learning_rate": 3.557692307692308e-06, + "loss": 0.7706, + "step": 38 + }, + { + "epoch": 0.01885424220449601, + "grad_norm": 0.6549740713679569, + "learning_rate": 3.653846153846154e-06, + "loss": 0.7237, + "step": 39 + }, + { + "epoch": 0.0193376843123036, + "grad_norm": 0.6922145753217636, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.7537, + "step": 40 + }, + { + "epoch": 0.01982112642011119, + "grad_norm": 0.665714225887781, + "learning_rate": 3.846153846153847e-06, + "loss": 0.7656, + "step": 41 + }, + { + "epoch": 0.02030456852791878, + "grad_norm": 0.6390200477155564, + "learning_rate": 3.942307692307692e-06, + "loss": 0.7558, + "step": 42 + }, + { + "epoch": 0.020788010635726373, + "grad_norm": 0.6558268717213803, + "learning_rate": 4.0384615384615385e-06, + "loss": 0.7408, + "step": 43 + }, + { + "epoch": 0.021271452743533963, + "grad_norm": 0.6191043915893901, + "learning_rate": 4.134615384615385e-06, + "loss": 0.7482, + "step": 44 + }, + { + "epoch": 0.021754894851341553, + "grad_norm": 0.6193196066166552, + "learning_rate": 4.230769230769231e-06, + "loss": 0.7424, + "step": 45 + }, + { + "epoch": 0.022238336959149143, + "grad_norm": 0.6141046550876093, + "learning_rate": 4.326923076923077e-06, + "loss": 0.7372, + "step": 46 + }, + { + "epoch": 0.022721779066956733, + "grad_norm": 0.6265315399192994, + "learning_rate": 4.423076923076924e-06, + "loss": 0.7362, + "step": 47 + }, + { + "epoch": 0.023205221174764323, + "grad_norm": 0.6704953048927751, + "learning_rate": 4.51923076923077e-06, + "loss": 0.7326, + "step": 48 + }, + { + "epoch": 0.023688663282571912, + "grad_norm": 0.6544522629648533, + "learning_rate": 4.615384615384616e-06, + "loss": 0.7275, + "step": 49 + }, + { + "epoch": 0.024172105390379502, + "grad_norm": 0.6277879949612973, + "learning_rate": 4.711538461538462e-06, + "loss": 0.7311, + "step": 50 + }, + { + "epoch": 0.024655547498187092, + "grad_norm": 0.5924725867824154, + "learning_rate": 4.807692307692308e-06, + "loss": 0.7261, + "step": 51 + }, + { + "epoch": 0.025138989605994682, + "grad_norm": 0.591545350722231, + "learning_rate": 4.903846153846154e-06, + "loss": 0.7092, + "step": 52 + }, + { + "epoch": 0.025622431713802272, + "grad_norm": 0.5698079528908845, + "learning_rate": 5e-06, + "loss": 0.7093, + "step": 53 + }, + { + "epoch": 0.02610587382160986, + "grad_norm": 0.6162650749995418, + "learning_rate": 5.096153846153846e-06, + "loss": 0.687, + "step": 54 + }, + { + "epoch": 0.02658931592941745, + "grad_norm": 0.5577930499697958, + "learning_rate": 5.192307692307693e-06, + "loss": 0.7143, + "step": 55 + }, + { + "epoch": 0.02707275803722504, + "grad_norm": 0.5640209708881836, + "learning_rate": 5.288461538461539e-06, + "loss": 0.7059, + "step": 56 + }, + { + "epoch": 0.02755620014503263, + "grad_norm": 0.5430126891143467, + "learning_rate": 5.384615384615385e-06, + "loss": 0.704, + "step": 57 + }, + { + "epoch": 0.02803964225284022, + "grad_norm": 0.6009517822786309, + "learning_rate": 5.480769230769232e-06, + "loss": 0.7169, + "step": 58 + }, + { + "epoch": 0.02852308436064781, + "grad_norm": 0.6167733540891279, + "learning_rate": 5.576923076923077e-06, + "loss": 0.7065, + "step": 59 + }, + { + "epoch": 0.029006526468455404, + "grad_norm": 0.5731458421774205, + "learning_rate": 5.6730769230769235e-06, + "loss": 0.6373, + "step": 60 + }, + { + "epoch": 0.029489968576262994, + "grad_norm": 0.5489403473955915, + "learning_rate": 5.769230769230769e-06, + "loss": 0.7018, + "step": 61 + }, + { + "epoch": 0.029973410684070584, + "grad_norm": 0.5325599545502842, + "learning_rate": 5.865384615384616e-06, + "loss": 0.6959, + "step": 62 + }, + { + "epoch": 0.030456852791878174, + "grad_norm": 0.5478537783639954, + "learning_rate": 5.961538461538462e-06, + "loss": 0.6896, + "step": 63 + }, + { + "epoch": 0.030940294899685764, + "grad_norm": 0.5363515063211778, + "learning_rate": 6.057692307692308e-06, + "loss": 0.7014, + "step": 64 + }, + { + "epoch": 0.03142373700749335, + "grad_norm": 0.5641946867306303, + "learning_rate": 6.153846153846155e-06, + "loss": 0.6903, + "step": 65 + }, + { + "epoch": 0.03190717911530094, + "grad_norm": 0.5481835775113026, + "learning_rate": 6.25e-06, + "loss": 0.6893, + "step": 66 + }, + { + "epoch": 0.03239062122310853, + "grad_norm": 0.5143476489389097, + "learning_rate": 6.3461538461538466e-06, + "loss": 0.6946, + "step": 67 + }, + { + "epoch": 0.03287406333091612, + "grad_norm": 0.590656354467126, + "learning_rate": 6.442307692307693e-06, + "loss": 0.6788, + "step": 68 + }, + { + "epoch": 0.03335750543872371, + "grad_norm": 0.5203563663337313, + "learning_rate": 6.538461538461539e-06, + "loss": 0.6847, + "step": 69 + }, + { + "epoch": 0.0338409475465313, + "grad_norm": 0.5084500426939229, + "learning_rate": 6.6346153846153846e-06, + "loss": 0.7086, + "step": 70 + }, + { + "epoch": 0.03432438965433889, + "grad_norm": 0.5032784264719405, + "learning_rate": 6.730769230769232e-06, + "loss": 0.6724, + "step": 71 + }, + { + "epoch": 0.03480783176214648, + "grad_norm": 0.5205048813341548, + "learning_rate": 6.826923076923078e-06, + "loss": 0.6592, + "step": 72 + }, + { + "epoch": 0.03529127386995407, + "grad_norm": 0.5066251849073853, + "learning_rate": 6.923076923076923e-06, + "loss": 0.6674, + "step": 73 + }, + { + "epoch": 0.03577471597776166, + "grad_norm": 0.5305760257061701, + "learning_rate": 7.01923076923077e-06, + "loss": 0.6665, + "step": 74 + }, + { + "epoch": 0.03625815808556925, + "grad_norm": 0.5583636863825877, + "learning_rate": 7.115384615384616e-06, + "loss": 0.6685, + "step": 75 + }, + { + "epoch": 0.03674160019337684, + "grad_norm": 0.6055857508188283, + "learning_rate": 7.211538461538462e-06, + "loss": 0.6826, + "step": 76 + }, + { + "epoch": 0.03722504230118443, + "grad_norm": 0.5576393446552599, + "learning_rate": 7.307692307692308e-06, + "loss": 0.6811, + "step": 77 + }, + { + "epoch": 0.03770848440899202, + "grad_norm": 0.5550469150359895, + "learning_rate": 7.403846153846155e-06, + "loss": 0.6752, + "step": 78 + }, + { + "epoch": 0.03819192651679961, + "grad_norm": 0.5095358853416947, + "learning_rate": 7.500000000000001e-06, + "loss": 0.657, + "step": 79 + }, + { + "epoch": 0.0386753686246072, + "grad_norm": 0.519449515803278, + "learning_rate": 7.5961538461538465e-06, + "loss": 0.6326, + "step": 80 + }, + { + "epoch": 0.03915881073241479, + "grad_norm": 0.5360371671954463, + "learning_rate": 7.692307692307694e-06, + "loss": 0.6577, + "step": 81 + }, + { + "epoch": 0.03964225284022238, + "grad_norm": 0.5220981103197152, + "learning_rate": 7.78846153846154e-06, + "loss": 0.6803, + "step": 82 + }, + { + "epoch": 0.04012569494802997, + "grad_norm": 0.5357442529489778, + "learning_rate": 7.884615384615384e-06, + "loss": 0.662, + "step": 83 + }, + { + "epoch": 0.04060913705583756, + "grad_norm": 0.5922567510802571, + "learning_rate": 7.980769230769232e-06, + "loss": 0.6784, + "step": 84 + }, + { + "epoch": 0.04109257916364515, + "grad_norm": 0.5471555288958341, + "learning_rate": 8.076923076923077e-06, + "loss": 0.663, + "step": 85 + }, + { + "epoch": 0.04157602127145275, + "grad_norm": 0.5234614832210157, + "learning_rate": 8.173076923076923e-06, + "loss": 0.6633, + "step": 86 + }, + { + "epoch": 0.04205946337926034, + "grad_norm": 0.5014680527453607, + "learning_rate": 8.26923076923077e-06, + "loss": 0.6345, + "step": 87 + }, + { + "epoch": 0.04254290548706793, + "grad_norm": 0.5541925854592269, + "learning_rate": 8.365384615384616e-06, + "loss": 0.661, + "step": 88 + }, + { + "epoch": 0.043026347594875516, + "grad_norm": 0.5556737957241218, + "learning_rate": 8.461538461538462e-06, + "loss": 0.6476, + "step": 89 + }, + { + "epoch": 0.043509789702683106, + "grad_norm": 0.5358812925229628, + "learning_rate": 8.557692307692308e-06, + "loss": 0.6667, + "step": 90 + }, + { + "epoch": 0.043993231810490696, + "grad_norm": 0.5285944473021625, + "learning_rate": 8.653846153846155e-06, + "loss": 0.6558, + "step": 91 + }, + { + "epoch": 0.044476673918298286, + "grad_norm": 0.6130129115794695, + "learning_rate": 8.750000000000001e-06, + "loss": 0.6662, + "step": 92 + }, + { + "epoch": 0.044960116026105876, + "grad_norm": 0.6086871477606206, + "learning_rate": 8.846153846153847e-06, + "loss": 0.6768, + "step": 93 + }, + { + "epoch": 0.045443558133913466, + "grad_norm": 0.534737794998822, + "learning_rate": 8.942307692307693e-06, + "loss": 0.633, + "step": 94 + }, + { + "epoch": 0.045927000241721055, + "grad_norm": 0.5048674854153722, + "learning_rate": 9.03846153846154e-06, + "loss": 0.6075, + "step": 95 + }, + { + "epoch": 0.046410442349528645, + "grad_norm": 0.5516912026027078, + "learning_rate": 9.134615384615384e-06, + "loss": 0.623, + "step": 96 + }, + { + "epoch": 0.046893884457336235, + "grad_norm": 0.6083291149980872, + "learning_rate": 9.230769230769232e-06, + "loss": 0.6556, + "step": 97 + }, + { + "epoch": 0.047377326565143825, + "grad_norm": 0.5460750932826393, + "learning_rate": 9.326923076923079e-06, + "loss": 0.6524, + "step": 98 + }, + { + "epoch": 0.047860768672951415, + "grad_norm": 0.5459534721301705, + "learning_rate": 9.423076923076923e-06, + "loss": 0.6449, + "step": 99 + }, + { + "epoch": 0.048344210780759005, + "grad_norm": 0.5622412415254093, + "learning_rate": 9.51923076923077e-06, + "loss": 0.6517, + "step": 100 + }, + { + "epoch": 0.048827652888566594, + "grad_norm": 0.6148179967646931, + "learning_rate": 9.615384615384616e-06, + "loss": 0.636, + "step": 101 + }, + { + "epoch": 0.049311094996374184, + "grad_norm": 0.5377477077942675, + "learning_rate": 9.711538461538462e-06, + "loss": 0.6569, + "step": 102 + }, + { + "epoch": 0.049794537104181774, + "grad_norm": 0.535881794576154, + "learning_rate": 9.807692307692308e-06, + "loss": 0.6515, + "step": 103 + }, + { + "epoch": 0.050277979211989364, + "grad_norm": 0.5554528998874018, + "learning_rate": 9.903846153846155e-06, + "loss": 0.6471, + "step": 104 + }, + { + "epoch": 0.050761421319796954, + "grad_norm": 0.5472055318440415, + "learning_rate": 1e-05, + "loss": 0.6212, + "step": 105 + }, + { + "epoch": 0.051244863427604544, + "grad_norm": 0.5562423079812571, + "learning_rate": 9.99999360979851e-06, + "loss": 0.6483, + "step": 106 + }, + { + "epoch": 0.051728305535412133, + "grad_norm": 0.6203972023036308, + "learning_rate": 9.999974439210376e-06, + "loss": 0.6474, + "step": 107 + }, + { + "epoch": 0.05221174764321972, + "grad_norm": 0.5879110259866966, + "learning_rate": 9.999942488284598e-06, + "loss": 0.6506, + "step": 108 + }, + { + "epoch": 0.05269518975102731, + "grad_norm": 0.5415023727684817, + "learning_rate": 9.999897757102843e-06, + "loss": 0.641, + "step": 109 + }, + { + "epoch": 0.0531786318588349, + "grad_norm": 0.5661080832571289, + "learning_rate": 9.99984024577945e-06, + "loss": 0.6561, + "step": 110 + }, + { + "epoch": 0.05366207396664249, + "grad_norm": 0.6384080684659277, + "learning_rate": 9.999769954461425e-06, + "loss": 0.6181, + "step": 111 + }, + { + "epoch": 0.05414551607445008, + "grad_norm": 0.5321086465207798, + "learning_rate": 9.999686883328433e-06, + "loss": 0.6269, + "step": 112 + }, + { + "epoch": 0.05462895818225767, + "grad_norm": 0.5658443861351884, + "learning_rate": 9.999591032592813e-06, + "loss": 0.6317, + "step": 113 + }, + { + "epoch": 0.05511240029006526, + "grad_norm": 0.5759014415066968, + "learning_rate": 9.999482402499569e-06, + "loss": 0.6468, + "step": 114 + }, + { + "epoch": 0.05559584239787285, + "grad_norm": 0.5480587710988183, + "learning_rate": 9.999360993326366e-06, + "loss": 0.6359, + "step": 115 + }, + { + "epoch": 0.05607928450568044, + "grad_norm": 0.6380718424826206, + "learning_rate": 9.999226805383534e-06, + "loss": 0.6349, + "step": 116 + }, + { + "epoch": 0.05656272661348803, + "grad_norm": 0.5246861209498886, + "learning_rate": 9.999079839014074e-06, + "loss": 0.6399, + "step": 117 + }, + { + "epoch": 0.05704616872129562, + "grad_norm": 0.515686613549135, + "learning_rate": 9.998920094593637e-06, + "loss": 0.5984, + "step": 118 + }, + { + "epoch": 0.05752961082910321, + "grad_norm": 0.5607127828178857, + "learning_rate": 9.998747572530548e-06, + "loss": 0.6398, + "step": 119 + }, + { + "epoch": 0.05801305293691081, + "grad_norm": 0.5459763409466101, + "learning_rate": 9.998562273265786e-06, + "loss": 0.626, + "step": 120 + }, + { + "epoch": 0.0584964950447184, + "grad_norm": 0.5525418327052581, + "learning_rate": 9.998364197272988e-06, + "loss": 0.6537, + "step": 121 + }, + { + "epoch": 0.05897993715252599, + "grad_norm": 6.74083445541264, + "learning_rate": 9.998153345058454e-06, + "loss": 0.9475, + "step": 122 + }, + { + "epoch": 0.05946337926033358, + "grad_norm": 0.6141628359508349, + "learning_rate": 9.997929717161142e-06, + "loss": 0.6473, + "step": 123 + }, + { + "epoch": 0.05994682136814117, + "grad_norm": 0.6652958169663876, + "learning_rate": 9.997693314152658e-06, + "loss": 0.6342, + "step": 124 + }, + { + "epoch": 0.06043026347594876, + "grad_norm": 0.606711498986106, + "learning_rate": 9.99744413663727e-06, + "loss": 0.623, + "step": 125 + }, + { + "epoch": 0.06091370558375635, + "grad_norm": 0.5175309840849823, + "learning_rate": 9.997182185251896e-06, + "loss": 0.6221, + "step": 126 + }, + { + "epoch": 0.06139714769156394, + "grad_norm": 0.5341566674562975, + "learning_rate": 9.996907460666104e-06, + "loss": 0.6357, + "step": 127 + }, + { + "epoch": 0.06188058979937153, + "grad_norm": 0.5922792510376619, + "learning_rate": 9.996619963582113e-06, + "loss": 0.6043, + "step": 128 + }, + { + "epoch": 0.06236403190717912, + "grad_norm": 0.5694036510960461, + "learning_rate": 9.996319694734787e-06, + "loss": 0.6311, + "step": 129 + }, + { + "epoch": 0.0628474740149867, + "grad_norm": 0.5540844850790518, + "learning_rate": 9.99600665489164e-06, + "loss": 0.6411, + "step": 130 + }, + { + "epoch": 0.06333091612279429, + "grad_norm": 0.5371960793753483, + "learning_rate": 9.995680844852824e-06, + "loss": 0.6403, + "step": 131 + }, + { + "epoch": 0.06381435823060189, + "grad_norm": 0.5225384791967033, + "learning_rate": 9.995342265451138e-06, + "loss": 0.6269, + "step": 132 + }, + { + "epoch": 0.06429780033840947, + "grad_norm": 0.6035451474536077, + "learning_rate": 9.994990917552017e-06, + "loss": 0.6321, + "step": 133 + }, + { + "epoch": 0.06478124244621707, + "grad_norm": 0.6507380493478006, + "learning_rate": 9.994626802053536e-06, + "loss": 0.6236, + "step": 134 + }, + { + "epoch": 0.06526468455402465, + "grad_norm": 0.5456651842881993, + "learning_rate": 9.994249919886402e-06, + "loss": 0.6258, + "step": 135 + }, + { + "epoch": 0.06574812666183225, + "grad_norm": 0.5172506944070536, + "learning_rate": 9.993860272013958e-06, + "loss": 0.6162, + "step": 136 + }, + { + "epoch": 0.06623156876963984, + "grad_norm": 0.6233262394445207, + "learning_rate": 9.993457859432172e-06, + "loss": 0.6261, + "step": 137 + }, + { + "epoch": 0.06671501087744743, + "grad_norm": 0.6073445562745826, + "learning_rate": 9.993042683169647e-06, + "loss": 0.6371, + "step": 138 + }, + { + "epoch": 0.06719845298525502, + "grad_norm": 0.5857241687958673, + "learning_rate": 9.992614744287605e-06, + "loss": 0.6275, + "step": 139 + }, + { + "epoch": 0.0676818950930626, + "grad_norm": 0.5304150460003405, + "learning_rate": 9.992174043879893e-06, + "loss": 0.6175, + "step": 140 + }, + { + "epoch": 0.0681653372008702, + "grad_norm": 0.5933722892089892, + "learning_rate": 9.991720583072975e-06, + "loss": 0.6255, + "step": 141 + }, + { + "epoch": 0.06864877930867778, + "grad_norm": 0.561723953482763, + "learning_rate": 9.991254363025935e-06, + "loss": 0.6257, + "step": 142 + }, + { + "epoch": 0.06913222141648538, + "grad_norm": 0.532228224452236, + "learning_rate": 9.99077538493047e-06, + "loss": 0.6301, + "step": 143 + }, + { + "epoch": 0.06961566352429296, + "grad_norm": 0.541783938730816, + "learning_rate": 9.990283650010883e-06, + "loss": 0.619, + "step": 144 + }, + { + "epoch": 0.07009910563210056, + "grad_norm": 0.5606995950440783, + "learning_rate": 9.989779159524091e-06, + "loss": 0.5818, + "step": 145 + }, + { + "epoch": 0.07058254773990814, + "grad_norm": 0.5286741282148979, + "learning_rate": 9.989261914759612e-06, + "loss": 0.6105, + "step": 146 + }, + { + "epoch": 0.07106598984771574, + "grad_norm": 0.525375741245272, + "learning_rate": 9.988731917039564e-06, + "loss": 0.6154, + "step": 147 + }, + { + "epoch": 0.07154943195552332, + "grad_norm": 0.5132546936158348, + "learning_rate": 9.988189167718665e-06, + "loss": 0.5533, + "step": 148 + }, + { + "epoch": 0.07203287406333092, + "grad_norm": 0.5330232205089095, + "learning_rate": 9.987633668184227e-06, + "loss": 0.6281, + "step": 149 + }, + { + "epoch": 0.0725163161711385, + "grad_norm": 0.5459730729112252, + "learning_rate": 9.98706541985615e-06, + "loss": 0.5836, + "step": 150 + }, + { + "epoch": 0.0729997582789461, + "grad_norm": 0.5818263727750432, + "learning_rate": 9.986484424186922e-06, + "loss": 0.6246, + "step": 151 + }, + { + "epoch": 0.07348320038675368, + "grad_norm": 0.5754133435232375, + "learning_rate": 9.985890682661616e-06, + "loss": 0.6038, + "step": 152 + }, + { + "epoch": 0.07396664249456128, + "grad_norm": 0.5528911744587542, + "learning_rate": 9.985284196797884e-06, + "loss": 0.6246, + "step": 153 + }, + { + "epoch": 0.07445008460236886, + "grad_norm": 0.5484687585797547, + "learning_rate": 9.984664968145953e-06, + "loss": 0.6318, + "step": 154 + }, + { + "epoch": 0.07493352671017646, + "grad_norm": 0.5351986552762329, + "learning_rate": 9.984032998288617e-06, + "loss": 0.6184, + "step": 155 + }, + { + "epoch": 0.07541696881798404, + "grad_norm": 0.519416066205614, + "learning_rate": 9.983388288841246e-06, + "loss": 0.6185, + "step": 156 + }, + { + "epoch": 0.07590041092579164, + "grad_norm": 0.5470449402548487, + "learning_rate": 9.982730841451768e-06, + "loss": 0.625, + "step": 157 + }, + { + "epoch": 0.07638385303359922, + "grad_norm": 0.5887016805140373, + "learning_rate": 9.982060657800672e-06, + "loss": 0.6183, + "step": 158 + }, + { + "epoch": 0.07686729514140682, + "grad_norm": 0.5522566946881194, + "learning_rate": 9.981377739601002e-06, + "loss": 0.6137, + "step": 159 + }, + { + "epoch": 0.0773507372492144, + "grad_norm": 0.5411997809451911, + "learning_rate": 9.980682088598349e-06, + "loss": 0.6229, + "step": 160 + }, + { + "epoch": 0.077834179357022, + "grad_norm": 0.52840707851752, + "learning_rate": 9.979973706570856e-06, + "loss": 0.614, + "step": 161 + }, + { + "epoch": 0.07831762146482958, + "grad_norm": 0.6047062373713257, + "learning_rate": 9.979252595329204e-06, + "loss": 0.6222, + "step": 162 + }, + { + "epoch": 0.07880106357263718, + "grad_norm": 0.5420471794760692, + "learning_rate": 9.978518756716611e-06, + "loss": 0.5856, + "step": 163 + }, + { + "epoch": 0.07928450568044476, + "grad_norm": 0.5857386315586672, + "learning_rate": 9.977772192608827e-06, + "loss": 0.6291, + "step": 164 + }, + { + "epoch": 0.07976794778825236, + "grad_norm": 0.5691356356316107, + "learning_rate": 9.977012904914133e-06, + "loss": 0.6149, + "step": 165 + }, + { + "epoch": 0.08025138989605994, + "grad_norm": 0.5823273363045892, + "learning_rate": 9.976240895573326e-06, + "loss": 0.6147, + "step": 166 + }, + { + "epoch": 0.08073483200386754, + "grad_norm": 0.538212010864403, + "learning_rate": 9.975456166559725e-06, + "loss": 0.6002, + "step": 167 + }, + { + "epoch": 0.08121827411167512, + "grad_norm": 0.601371610274862, + "learning_rate": 9.974658719879163e-06, + "loss": 0.606, + "step": 168 + }, + { + "epoch": 0.08170171621948272, + "grad_norm": 0.588104162701253, + "learning_rate": 9.973848557569974e-06, + "loss": 0.6226, + "step": 169 + }, + { + "epoch": 0.0821851583272903, + "grad_norm": 0.5316828963553285, + "learning_rate": 9.973025681703e-06, + "loss": 0.6144, + "step": 170 + }, + { + "epoch": 0.0826686004350979, + "grad_norm": 0.5405916050680715, + "learning_rate": 9.972190094381578e-06, + "loss": 0.6148, + "step": 171 + }, + { + "epoch": 0.0831520425429055, + "grad_norm": 0.5102891757426009, + "learning_rate": 9.971341797741538e-06, + "loss": 0.616, + "step": 172 + }, + { + "epoch": 0.08363548465071308, + "grad_norm": 0.5551757535954606, + "learning_rate": 9.970480793951194e-06, + "loss": 0.6196, + "step": 173 + }, + { + "epoch": 0.08411892675852067, + "grad_norm": 0.5349760515746151, + "learning_rate": 9.96960708521134e-06, + "loss": 0.5902, + "step": 174 + }, + { + "epoch": 0.08460236886632826, + "grad_norm": 0.5713299053870873, + "learning_rate": 9.968720673755246e-06, + "loss": 0.6039, + "step": 175 + }, + { + "epoch": 0.08508581097413585, + "grad_norm": 0.5886201187493544, + "learning_rate": 9.96782156184865e-06, + "loss": 0.6128, + "step": 176 + }, + { + "epoch": 0.08556925308194344, + "grad_norm": 0.52487297166769, + "learning_rate": 9.966909751789758e-06, + "loss": 0.6201, + "step": 177 + }, + { + "epoch": 0.08605269518975103, + "grad_norm": 0.47488673856360863, + "learning_rate": 9.965985245909226e-06, + "loss": 0.581, + "step": 178 + }, + { + "epoch": 0.08653613729755862, + "grad_norm": 0.5390345004627665, + "learning_rate": 9.96504804657017e-06, + "loss": 0.5748, + "step": 179 + }, + { + "epoch": 0.08701957940536621, + "grad_norm": 0.5030595297893009, + "learning_rate": 9.964098156168143e-06, + "loss": 0.6025, + "step": 180 + }, + { + "epoch": 0.0875030215131738, + "grad_norm": 0.5468598312459072, + "learning_rate": 9.963135577131144e-06, + "loss": 0.6086, + "step": 181 + }, + { + "epoch": 0.08798646362098139, + "grad_norm": 0.48113219800404783, + "learning_rate": 9.962160311919601e-06, + "loss": 0.5759, + "step": 182 + }, + { + "epoch": 0.08846990572878898, + "grad_norm": 0.5498772940672643, + "learning_rate": 9.96117236302637e-06, + "loss": 0.6009, + "step": 183 + }, + { + "epoch": 0.08895334783659657, + "grad_norm": 0.572150853367621, + "learning_rate": 9.960171732976731e-06, + "loss": 0.5891, + "step": 184 + }, + { + "epoch": 0.08943678994440415, + "grad_norm": 0.5440182913032069, + "learning_rate": 9.959158424328373e-06, + "loss": 0.6126, + "step": 185 + }, + { + "epoch": 0.08992023205221175, + "grad_norm": 0.5124606491120447, + "learning_rate": 9.958132439671392e-06, + "loss": 0.6113, + "step": 186 + }, + { + "epoch": 0.09040367416001933, + "grad_norm": 0.5122426086233111, + "learning_rate": 9.957093781628294e-06, + "loss": 0.5585, + "step": 187 + }, + { + "epoch": 0.09088711626782693, + "grad_norm": 0.5466339032920954, + "learning_rate": 9.956042452853967e-06, + "loss": 0.5829, + "step": 188 + }, + { + "epoch": 0.09137055837563451, + "grad_norm": 0.5319185267267565, + "learning_rate": 9.954978456035695e-06, + "loss": 0.6014, + "step": 189 + }, + { + "epoch": 0.09185400048344211, + "grad_norm": 0.5439360347029544, + "learning_rate": 9.953901793893137e-06, + "loss": 0.6135, + "step": 190 + }, + { + "epoch": 0.0923374425912497, + "grad_norm": 0.5572467498872743, + "learning_rate": 9.95281246917833e-06, + "loss": 0.6126, + "step": 191 + }, + { + "epoch": 0.09282088469905729, + "grad_norm": 0.5541110285684123, + "learning_rate": 9.951710484675677e-06, + "loss": 0.6077, + "step": 192 + }, + { + "epoch": 0.09330432680686487, + "grad_norm": 0.4850481807152515, + "learning_rate": 9.950595843201936e-06, + "loss": 0.6052, + "step": 193 + }, + { + "epoch": 0.09378776891467247, + "grad_norm": 0.4982494369774088, + "learning_rate": 9.949468547606222e-06, + "loss": 0.608, + "step": 194 + }, + { + "epoch": 0.09427121102248005, + "grad_norm": 0.5222210926075901, + "learning_rate": 9.948328600769996e-06, + "loss": 0.5725, + "step": 195 + }, + { + "epoch": 0.09475465313028765, + "grad_norm": 0.5156665548407187, + "learning_rate": 9.94717600560705e-06, + "loss": 0.5981, + "step": 196 + }, + { + "epoch": 0.09523809523809523, + "grad_norm": 0.4789398218595176, + "learning_rate": 9.946010765063512e-06, + "loss": 0.6163, + "step": 197 + }, + { + "epoch": 0.09572153734590283, + "grad_norm": 0.5066106303118647, + "learning_rate": 9.94483288211783e-06, + "loss": 0.6049, + "step": 198 + }, + { + "epoch": 0.09620497945371041, + "grad_norm": 0.519086410125638, + "learning_rate": 9.943642359780767e-06, + "loss": 0.6034, + "step": 199 + }, + { + "epoch": 0.09668842156151801, + "grad_norm": 0.5726309849663989, + "learning_rate": 9.942439201095398e-06, + "loss": 0.5977, + "step": 200 + }, + { + "epoch": 0.09717186366932559, + "grad_norm": 0.5149014744932526, + "learning_rate": 9.941223409137088e-06, + "loss": 0.6147, + "step": 201 + }, + { + "epoch": 0.09765530577713319, + "grad_norm": 0.5009166664227639, + "learning_rate": 9.939994987013505e-06, + "loss": 0.595, + "step": 202 + }, + { + "epoch": 0.09813874788494077, + "grad_norm": 0.5677576117209191, + "learning_rate": 9.93875393786459e-06, + "loss": 0.5825, + "step": 203 + }, + { + "epoch": 0.09862218999274837, + "grad_norm": 0.5705628060741978, + "learning_rate": 9.937500264862567e-06, + "loss": 0.6106, + "step": 204 + }, + { + "epoch": 0.09910563210055595, + "grad_norm": 0.5166084751955315, + "learning_rate": 9.936233971211926e-06, + "loss": 0.5724, + "step": 205 + }, + { + "epoch": 0.09958907420836355, + "grad_norm": 0.4998369485071646, + "learning_rate": 9.934955060149413e-06, + "loss": 0.5702, + "step": 206 + }, + { + "epoch": 0.10007251631617115, + "grad_norm": 0.4773861112208611, + "learning_rate": 9.933663534944029e-06, + "loss": 0.5976, + "step": 207 + }, + { + "epoch": 0.10055595842397873, + "grad_norm": 0.5142399648385931, + "learning_rate": 9.932359398897018e-06, + "loss": 0.5662, + "step": 208 + }, + { + "epoch": 0.10103940053178632, + "grad_norm": 0.5152331134346968, + "learning_rate": 9.931042655341856e-06, + "loss": 0.5987, + "step": 209 + }, + { + "epoch": 0.10152284263959391, + "grad_norm": 0.5697107336495173, + "learning_rate": 9.929713307644245e-06, + "loss": 0.5956, + "step": 210 + }, + { + "epoch": 0.1020062847474015, + "grad_norm": 0.5437386464507225, + "learning_rate": 9.928371359202103e-06, + "loss": 0.6023, + "step": 211 + }, + { + "epoch": 0.10248972685520909, + "grad_norm": 0.48930400532530816, + "learning_rate": 9.927016813445562e-06, + "loss": 0.5941, + "step": 212 + }, + { + "epoch": 0.10297316896301668, + "grad_norm": 0.5691895002113943, + "learning_rate": 9.925649673836949e-06, + "loss": 0.5977, + "step": 213 + }, + { + "epoch": 0.10345661107082427, + "grad_norm": 0.518358121778254, + "learning_rate": 9.924269943870781e-06, + "loss": 0.599, + "step": 214 + }, + { + "epoch": 0.10394005317863186, + "grad_norm": 0.5179203447080591, + "learning_rate": 9.922877627073763e-06, + "loss": 0.565, + "step": 215 + }, + { + "epoch": 0.10442349528643945, + "grad_norm": 0.5348210146349037, + "learning_rate": 9.921472727004765e-06, + "loss": 0.6038, + "step": 216 + }, + { + "epoch": 0.10490693739424704, + "grad_norm": 0.5011388091471438, + "learning_rate": 9.920055247254827e-06, + "loss": 0.5951, + "step": 217 + }, + { + "epoch": 0.10539037950205463, + "grad_norm": 0.5706178448892886, + "learning_rate": 9.91862519144714e-06, + "loss": 0.604, + "step": 218 + }, + { + "epoch": 0.10587382160986222, + "grad_norm": 0.5667257328777994, + "learning_rate": 9.917182563237045e-06, + "loss": 0.6006, + "step": 219 + }, + { + "epoch": 0.1063572637176698, + "grad_norm": 0.5402529870671051, + "learning_rate": 9.915727366312012e-06, + "loss": 0.591, + "step": 220 + }, + { + "epoch": 0.1068407058254774, + "grad_norm": 0.5123066262170495, + "learning_rate": 9.914259604391642e-06, + "loss": 0.5818, + "step": 221 + }, + { + "epoch": 0.10732414793328499, + "grad_norm": 0.5104812232878251, + "learning_rate": 9.912779281227656e-06, + "loss": 0.5991, + "step": 222 + }, + { + "epoch": 0.10780759004109258, + "grad_norm": 0.5073553912253322, + "learning_rate": 9.911286400603878e-06, + "loss": 0.5783, + "step": 223 + }, + { + "epoch": 0.10829103214890017, + "grad_norm": 0.5516103650201469, + "learning_rate": 9.90978096633623e-06, + "loss": 0.6007, + "step": 224 + }, + { + "epoch": 0.10877447425670776, + "grad_norm": 0.5241670992889956, + "learning_rate": 9.908262982272724e-06, + "loss": 0.5865, + "step": 225 + }, + { + "epoch": 0.10925791636451535, + "grad_norm": 0.4894067875331202, + "learning_rate": 9.906732452293448e-06, + "loss": 0.5635, + "step": 226 + }, + { + "epoch": 0.10974135847232294, + "grad_norm": 0.5079732216995924, + "learning_rate": 9.905189380310564e-06, + "loss": 0.5982, + "step": 227 + }, + { + "epoch": 0.11022480058013052, + "grad_norm": 0.47288266380376864, + "learning_rate": 9.903633770268286e-06, + "loss": 0.5734, + "step": 228 + }, + { + "epoch": 0.11070824268793812, + "grad_norm": 0.5195973051222883, + "learning_rate": 9.902065626142876e-06, + "loss": 0.6021, + "step": 229 + }, + { + "epoch": 0.1111916847957457, + "grad_norm": 0.5159734590151601, + "learning_rate": 9.900484951942642e-06, + "loss": 0.5847, + "step": 230 + }, + { + "epoch": 0.1116751269035533, + "grad_norm": 0.5157347113387764, + "learning_rate": 9.89889175170791e-06, + "loss": 0.5946, + "step": 231 + }, + { + "epoch": 0.11215856901136088, + "grad_norm": 0.49409523170190334, + "learning_rate": 9.89728602951103e-06, + "loss": 0.5941, + "step": 232 + }, + { + "epoch": 0.11264201111916848, + "grad_norm": 0.5699641967141135, + "learning_rate": 9.89566778945636e-06, + "loss": 0.5965, + "step": 233 + }, + { + "epoch": 0.11312545322697606, + "grad_norm": 0.5565932357020583, + "learning_rate": 9.894037035680246e-06, + "loss": 0.6076, + "step": 234 + }, + { + "epoch": 0.11360889533478366, + "grad_norm": 0.4762368359891958, + "learning_rate": 9.892393772351033e-06, + "loss": 0.5749, + "step": 235 + }, + { + "epoch": 0.11409233744259124, + "grad_norm": 0.5226269336653058, + "learning_rate": 9.890738003669029e-06, + "loss": 0.5882, + "step": 236 + }, + { + "epoch": 0.11457577955039884, + "grad_norm": 0.5893232226185929, + "learning_rate": 9.889069733866515e-06, + "loss": 0.5978, + "step": 237 + }, + { + "epoch": 0.11505922165820642, + "grad_norm": 0.5556325697280562, + "learning_rate": 9.887388967207722e-06, + "loss": 0.6, + "step": 238 + }, + { + "epoch": 0.11554266376601402, + "grad_norm": 0.48160661753964396, + "learning_rate": 9.885695707988825e-06, + "loss": 0.5977, + "step": 239 + }, + { + "epoch": 0.11602610587382162, + "grad_norm": 0.5122405505133801, + "learning_rate": 9.883989960537934e-06, + "loss": 0.6044, + "step": 240 + }, + { + "epoch": 0.1165095479816292, + "grad_norm": 0.5812889541684825, + "learning_rate": 9.882271729215071e-06, + "loss": 0.5849, + "step": 241 + }, + { + "epoch": 0.1169929900894368, + "grad_norm": 0.4906401332764143, + "learning_rate": 9.880541018412179e-06, + "loss": 0.5986, + "step": 242 + }, + { + "epoch": 0.11747643219724438, + "grad_norm": 0.48951055967126716, + "learning_rate": 9.878797832553093e-06, + "loss": 0.5646, + "step": 243 + }, + { + "epoch": 0.11795987430505198, + "grad_norm": 0.4836474446158179, + "learning_rate": 9.877042176093537e-06, + "loss": 0.5998, + "step": 244 + }, + { + "epoch": 0.11844331641285956, + "grad_norm": 0.4962973453940785, + "learning_rate": 9.875274053521107e-06, + "loss": 0.5846, + "step": 245 + }, + { + "epoch": 0.11892675852066716, + "grad_norm": 0.45261755838242107, + "learning_rate": 9.873493469355271e-06, + "loss": 0.5912, + "step": 246 + }, + { + "epoch": 0.11941020062847474, + "grad_norm": 0.49934013758424506, + "learning_rate": 9.871700428147342e-06, + "loss": 0.5836, + "step": 247 + }, + { + "epoch": 0.11989364273628234, + "grad_norm": 0.48318779237357384, + "learning_rate": 9.86989493448048e-06, + "loss": 0.5898, + "step": 248 + }, + { + "epoch": 0.12037708484408992, + "grad_norm": 0.4877998807669757, + "learning_rate": 9.868076992969672e-06, + "loss": 0.5933, + "step": 249 + }, + { + "epoch": 0.12086052695189752, + "grad_norm": 0.4697579805390032, + "learning_rate": 9.866246608261725e-06, + "loss": 0.5855, + "step": 250 + }, + { + "epoch": 0.1213439690597051, + "grad_norm": 0.512552573820198, + "learning_rate": 9.864403785035246e-06, + "loss": 0.5989, + "step": 251 + }, + { + "epoch": 0.1218274111675127, + "grad_norm": 0.4865753568683563, + "learning_rate": 9.862548528000644e-06, + "loss": 0.5722, + "step": 252 + }, + { + "epoch": 0.12231085327532028, + "grad_norm": 0.5276925045930954, + "learning_rate": 9.860680841900101e-06, + "loss": 0.5879, + "step": 253 + }, + { + "epoch": 0.12279429538312787, + "grad_norm": 0.5213718677505005, + "learning_rate": 9.858800731507575e-06, + "loss": 0.5999, + "step": 254 + }, + { + "epoch": 0.12327773749093546, + "grad_norm": 0.5180845494091726, + "learning_rate": 9.85690820162878e-06, + "loss": 0.586, + "step": 255 + }, + { + "epoch": 0.12376117959874305, + "grad_norm": 0.5698025401421347, + "learning_rate": 9.855003257101177e-06, + "loss": 0.6011, + "step": 256 + }, + { + "epoch": 0.12424462170655064, + "grad_norm": 0.562343589994959, + "learning_rate": 9.853085902793952e-06, + "loss": 0.5894, + "step": 257 + }, + { + "epoch": 0.12472806381435823, + "grad_norm": 0.5160827286882833, + "learning_rate": 9.851156143608025e-06, + "loss": 0.5897, + "step": 258 + }, + { + "epoch": 0.12521150592216582, + "grad_norm": 0.5407107287832078, + "learning_rate": 9.84921398447601e-06, + "loss": 0.59, + "step": 259 + }, + { + "epoch": 0.1256949480299734, + "grad_norm": 0.4828245059112851, + "learning_rate": 9.847259430362222e-06, + "loss": 0.5642, + "step": 260 + }, + { + "epoch": 0.126178390137781, + "grad_norm": 0.5766667340207283, + "learning_rate": 9.845292486262664e-06, + "loss": 0.6016, + "step": 261 + }, + { + "epoch": 0.12666183224558858, + "grad_norm": 0.5818866932241936, + "learning_rate": 9.843313157204999e-06, + "loss": 0.5807, + "step": 262 + }, + { + "epoch": 0.12714527435339618, + "grad_norm": 0.5140923007570054, + "learning_rate": 9.841321448248552e-06, + "loss": 0.5858, + "step": 263 + }, + { + "epoch": 0.12762871646120377, + "grad_norm": 0.513399510660716, + "learning_rate": 9.839317364484295e-06, + "loss": 0.5847, + "step": 264 + }, + { + "epoch": 0.12811215856901137, + "grad_norm": 0.5227642580781724, + "learning_rate": 9.837300911034824e-06, + "loss": 0.5888, + "step": 265 + }, + { + "epoch": 0.12859560067681894, + "grad_norm": 0.5579358896097371, + "learning_rate": 9.83527209305436e-06, + "loss": 0.5928, + "step": 266 + }, + { + "epoch": 0.12907904278462654, + "grad_norm": 0.5145348442577231, + "learning_rate": 9.83323091572872e-06, + "loss": 0.5872, + "step": 267 + }, + { + "epoch": 0.12956248489243413, + "grad_norm": 0.5112821410236051, + "learning_rate": 9.831177384275323e-06, + "loss": 0.5805, + "step": 268 + }, + { + "epoch": 0.13004592700024173, + "grad_norm": 0.5497912960403669, + "learning_rate": 9.829111503943159e-06, + "loss": 0.5837, + "step": 269 + }, + { + "epoch": 0.1305293691080493, + "grad_norm": 0.5226743950335115, + "learning_rate": 9.827033280012783e-06, + "loss": 0.5539, + "step": 270 + }, + { + "epoch": 0.1310128112158569, + "grad_norm": 0.5713921241049837, + "learning_rate": 9.824942717796304e-06, + "loss": 0.5881, + "step": 271 + }, + { + "epoch": 0.1314962533236645, + "grad_norm": 0.5241764388189555, + "learning_rate": 9.822839822637369e-06, + "loss": 0.6032, + "step": 272 + }, + { + "epoch": 0.1319796954314721, + "grad_norm": 0.5162440352522167, + "learning_rate": 9.820724599911147e-06, + "loss": 0.5842, + "step": 273 + }, + { + "epoch": 0.13246313753927969, + "grad_norm": 0.5431692492650363, + "learning_rate": 9.818597055024315e-06, + "loss": 0.585, + "step": 274 + }, + { + "epoch": 0.13294657964708725, + "grad_norm": 0.5124783198553914, + "learning_rate": 9.816457193415055e-06, + "loss": 0.5779, + "step": 275 + }, + { + "epoch": 0.13343002175489485, + "grad_norm": 0.5257695390265421, + "learning_rate": 9.81430502055302e-06, + "loss": 0.5798, + "step": 276 + }, + { + "epoch": 0.13391346386270245, + "grad_norm": 0.49781008962990064, + "learning_rate": 9.812140541939338e-06, + "loss": 0.5836, + "step": 277 + }, + { + "epoch": 0.13439690597051004, + "grad_norm": 0.5327804269781539, + "learning_rate": 9.809963763106593e-06, + "loss": 0.5733, + "step": 278 + }, + { + "epoch": 0.1348803480783176, + "grad_norm": 0.5167258655366103, + "learning_rate": 9.807774689618806e-06, + "loss": 0.58, + "step": 279 + }, + { + "epoch": 0.1353637901861252, + "grad_norm": 0.5321175943512093, + "learning_rate": 9.805573327071428e-06, + "loss": 0.5911, + "step": 280 + }, + { + "epoch": 0.1358472322939328, + "grad_norm": 0.49961117510050285, + "learning_rate": 9.803359681091313e-06, + "loss": 0.5737, + "step": 281 + }, + { + "epoch": 0.1363306744017404, + "grad_norm": 0.5314962622355859, + "learning_rate": 9.801133757336726e-06, + "loss": 0.593, + "step": 282 + }, + { + "epoch": 0.13681411650954797, + "grad_norm": 0.48173417582091976, + "learning_rate": 9.798895561497299e-06, + "loss": 0.5818, + "step": 283 + }, + { + "epoch": 0.13729755861735557, + "grad_norm": 0.5127693228983886, + "learning_rate": 9.796645099294049e-06, + "loss": 0.6024, + "step": 284 + }, + { + "epoch": 0.13778100072516317, + "grad_norm": 0.5128313174228813, + "learning_rate": 9.794382376479334e-06, + "loss": 0.5837, + "step": 285 + }, + { + "epoch": 0.13826444283297076, + "grad_norm": 0.502862882638082, + "learning_rate": 9.792107398836859e-06, + "loss": 0.5781, + "step": 286 + }, + { + "epoch": 0.13874788494077833, + "grad_norm": 0.5169656633134686, + "learning_rate": 9.789820172181648e-06, + "loss": 0.5821, + "step": 287 + }, + { + "epoch": 0.13923132704858593, + "grad_norm": 6.7246508188992, + "learning_rate": 9.787520702360035e-06, + "loss": 1.0972, + "step": 288 + }, + { + "epoch": 0.13971476915639353, + "grad_norm": 0.6005251051430991, + "learning_rate": 9.785208995249655e-06, + "loss": 0.5803, + "step": 289 + }, + { + "epoch": 0.14019821126420112, + "grad_norm": 0.5531574758650235, + "learning_rate": 9.782885056759413e-06, + "loss": 0.563, + "step": 290 + }, + { + "epoch": 0.1406816533720087, + "grad_norm": 0.5273779406180227, + "learning_rate": 9.780548892829486e-06, + "loss": 0.5872, + "step": 291 + }, + { + "epoch": 0.1411650954798163, + "grad_norm": 0.5063770192301159, + "learning_rate": 9.778200509431297e-06, + "loss": 0.5782, + "step": 292 + }, + { + "epoch": 0.14164853758762389, + "grad_norm": 0.5401099132225082, + "learning_rate": 9.775839912567502e-06, + "loss": 0.5804, + "step": 293 + }, + { + "epoch": 0.14213197969543148, + "grad_norm": 0.607784811294971, + "learning_rate": 9.773467108271978e-06, + "loss": 0.5831, + "step": 294 + }, + { + "epoch": 0.14261542180323905, + "grad_norm": 0.5051370116219928, + "learning_rate": 9.771082102609803e-06, + "loss": 0.5597, + "step": 295 + }, + { + "epoch": 0.14309886391104665, + "grad_norm": 0.5723810352863865, + "learning_rate": 9.768684901677245e-06, + "loss": 0.5779, + "step": 296 + }, + { + "epoch": 0.14358230601885424, + "grad_norm": 0.529491415132923, + "learning_rate": 9.766275511601742e-06, + "loss": 0.5849, + "step": 297 + }, + { + "epoch": 0.14406574812666184, + "grad_norm": 0.6275998382003428, + "learning_rate": 9.763853938541887e-06, + "loss": 0.5915, + "step": 298 + }, + { + "epoch": 0.1445491902344694, + "grad_norm": 0.5906428033404255, + "learning_rate": 9.76142018868742e-06, + "loss": 0.5816, + "step": 299 + }, + { + "epoch": 0.145032632342277, + "grad_norm": 0.597638837356143, + "learning_rate": 9.7589742682592e-06, + "loss": 0.5578, + "step": 300 + }, + { + "epoch": 0.1455160744500846, + "grad_norm": 0.5365546900890564, + "learning_rate": 9.756516183509198e-06, + "loss": 0.5833, + "step": 301 + }, + { + "epoch": 0.1459995165578922, + "grad_norm": 0.554155920273677, + "learning_rate": 9.754045940720471e-06, + "loss": 0.581, + "step": 302 + }, + { + "epoch": 0.14648295866569977, + "grad_norm": 0.5290449152773149, + "learning_rate": 9.751563546207167e-06, + "loss": 0.5879, + "step": 303 + }, + { + "epoch": 0.14696640077350737, + "grad_norm": 0.5303051981230842, + "learning_rate": 9.749069006314481e-06, + "loss": 0.557, + "step": 304 + }, + { + "epoch": 0.14744984288131496, + "grad_norm": 0.4750712434505446, + "learning_rate": 9.74656232741866e-06, + "loss": 0.5236, + "step": 305 + }, + { + "epoch": 0.14793328498912256, + "grad_norm": 0.515780571537496, + "learning_rate": 9.744043515926975e-06, + "loss": 0.5827, + "step": 306 + }, + { + "epoch": 0.14841672709693013, + "grad_norm": 0.5886066507830542, + "learning_rate": 9.741512578277715e-06, + "loss": 0.5741, + "step": 307 + }, + { + "epoch": 0.14890016920473773, + "grad_norm": 0.5712616310834069, + "learning_rate": 9.738969520940158e-06, + "loss": 0.587, + "step": 308 + }, + { + "epoch": 0.14938361131254532, + "grad_norm": 0.5883909446108012, + "learning_rate": 9.736414350414564e-06, + "loss": 0.5836, + "step": 309 + }, + { + "epoch": 0.14986705342035292, + "grad_norm": 0.49300111186175044, + "learning_rate": 9.733847073232156e-06, + "loss": 0.583, + "step": 310 + }, + { + "epoch": 0.15035049552816052, + "grad_norm": 0.47057695692490953, + "learning_rate": 9.7312676959551e-06, + "loss": 0.5433, + "step": 311 + }, + { + "epoch": 0.15083393763596809, + "grad_norm": 0.5647156070035382, + "learning_rate": 9.72867622517649e-06, + "loss": 0.5859, + "step": 312 + }, + { + "epoch": 0.15131737974377568, + "grad_norm": 0.5698749374107666, + "learning_rate": 9.726072667520338e-06, + "loss": 0.5759, + "step": 313 + }, + { + "epoch": 0.15180082185158328, + "grad_norm": 0.4935935341959304, + "learning_rate": 9.723457029641547e-06, + "loss": 0.5883, + "step": 314 + }, + { + "epoch": 0.15228426395939088, + "grad_norm": 0.5040267732247843, + "learning_rate": 9.720829318225897e-06, + "loss": 0.5723, + "step": 315 + }, + { + "epoch": 0.15276770606719844, + "grad_norm": 0.5390674583456238, + "learning_rate": 9.718189539990029e-06, + "loss": 0.5748, + "step": 316 + }, + { + "epoch": 0.15325114817500604, + "grad_norm": 0.5449958057788811, + "learning_rate": 9.715537701681431e-06, + "loss": 0.5831, + "step": 317 + }, + { + "epoch": 0.15373459028281364, + "grad_norm": 0.48895966772949706, + "learning_rate": 9.712873810078415e-06, + "loss": 0.5505, + "step": 318 + }, + { + "epoch": 0.15421803239062123, + "grad_norm": 0.5694877152526486, + "learning_rate": 9.710197871990101e-06, + "loss": 0.5789, + "step": 319 + }, + { + "epoch": 0.1547014744984288, + "grad_norm": 0.5390854150150773, + "learning_rate": 9.707509894256406e-06, + "loss": 0.5699, + "step": 320 + }, + { + "epoch": 0.1551849166062364, + "grad_norm": 0.5339825765060972, + "learning_rate": 9.704809883748012e-06, + "loss": 0.5841, + "step": 321 + }, + { + "epoch": 0.155668358714044, + "grad_norm": 0.5691147363910026, + "learning_rate": 9.70209784736637e-06, + "loss": 0.5791, + "step": 322 + }, + { + "epoch": 0.1561518008218516, + "grad_norm": 0.5098897525025804, + "learning_rate": 9.699373792043658e-06, + "loss": 0.5789, + "step": 323 + }, + { + "epoch": 0.15663524292965916, + "grad_norm": 0.5233093422091403, + "learning_rate": 9.696637724742785e-06, + "loss": 0.5791, + "step": 324 + }, + { + "epoch": 0.15711868503746676, + "grad_norm": 0.4951608627676522, + "learning_rate": 9.693889652457359e-06, + "loss": 0.5664, + "step": 325 + }, + { + "epoch": 0.15760212714527436, + "grad_norm": 0.5085606430384619, + "learning_rate": 9.691129582211671e-06, + "loss": 0.5777, + "step": 326 + }, + { + "epoch": 0.15808556925308195, + "grad_norm": 0.5137102450781047, + "learning_rate": 9.688357521060685e-06, + "loss": 0.5843, + "step": 327 + }, + { + "epoch": 0.15856901136088952, + "grad_norm": 0.4769071854330559, + "learning_rate": 9.685573476090015e-06, + "loss": 0.578, + "step": 328 + }, + { + "epoch": 0.15905245346869712, + "grad_norm": 0.542975418114207, + "learning_rate": 9.6827774544159e-06, + "loss": 0.5859, + "step": 329 + }, + { + "epoch": 0.15953589557650472, + "grad_norm": 0.4926718305346952, + "learning_rate": 9.6799694631852e-06, + "loss": 0.5871, + "step": 330 + }, + { + "epoch": 0.1600193376843123, + "grad_norm": 0.5010989320404932, + "learning_rate": 9.677149509575365e-06, + "loss": 0.5841, + "step": 331 + }, + { + "epoch": 0.16050277979211988, + "grad_norm": 0.5446382005351177, + "learning_rate": 9.674317600794426e-06, + "loss": 0.5762, + "step": 332 + }, + { + "epoch": 0.16098622189992748, + "grad_norm": 0.5406240370145704, + "learning_rate": 9.67147374408097e-06, + "loss": 0.5685, + "step": 333 + }, + { + "epoch": 0.16146966400773508, + "grad_norm": 0.5171074604025283, + "learning_rate": 9.66861794670412e-06, + "loss": 0.5856, + "step": 334 + }, + { + "epoch": 0.16195310611554267, + "grad_norm": 0.5545080974369176, + "learning_rate": 9.665750215963528e-06, + "loss": 0.5789, + "step": 335 + }, + { + "epoch": 0.16243654822335024, + "grad_norm": 0.49939805294647144, + "learning_rate": 9.662870559189344e-06, + "loss": 0.5702, + "step": 336 + }, + { + "epoch": 0.16291999033115784, + "grad_norm": 0.49295646596373777, + "learning_rate": 9.6599789837422e-06, + "loss": 0.5742, + "step": 337 + }, + { + "epoch": 0.16340343243896543, + "grad_norm": 0.5522231456414357, + "learning_rate": 9.657075497013202e-06, + "loss": 0.5752, + "step": 338 + }, + { + "epoch": 0.16388687454677303, + "grad_norm": 0.5606395929711875, + "learning_rate": 9.654160106423891e-06, + "loss": 0.5854, + "step": 339 + }, + { + "epoch": 0.1643703166545806, + "grad_norm": 0.5086990809592122, + "learning_rate": 9.651232819426242e-06, + "loss": 0.5764, + "step": 340 + }, + { + "epoch": 0.1648537587623882, + "grad_norm": 0.4984930367771814, + "learning_rate": 9.648293643502636e-06, + "loss": 0.5619, + "step": 341 + }, + { + "epoch": 0.1653372008701958, + "grad_norm": 0.5217470426797576, + "learning_rate": 9.645342586165845e-06, + "loss": 0.5833, + "step": 342 + }, + { + "epoch": 0.1658206429780034, + "grad_norm": 0.546389261380125, + "learning_rate": 9.642379654959006e-06, + "loss": 0.5381, + "step": 343 + }, + { + "epoch": 0.166304085085811, + "grad_norm": 0.5439151860872452, + "learning_rate": 9.639404857455614e-06, + "loss": 0.5674, + "step": 344 + }, + { + "epoch": 0.16678752719361856, + "grad_norm": 0.5469688158149608, + "learning_rate": 9.63641820125949e-06, + "loss": 0.5705, + "step": 345 + }, + { + "epoch": 0.16727096930142615, + "grad_norm": 0.4994352161741759, + "learning_rate": 9.633419694004767e-06, + "loss": 0.555, + "step": 346 + }, + { + "epoch": 0.16775441140923375, + "grad_norm": 0.5270157823994652, + "learning_rate": 9.63040934335587e-06, + "loss": 0.5741, + "step": 347 + }, + { + "epoch": 0.16823785351704135, + "grad_norm": 0.5302701119307424, + "learning_rate": 9.627387157007502e-06, + "loss": 0.5775, + "step": 348 + }, + { + "epoch": 0.16872129562484892, + "grad_norm": 0.5005904286760833, + "learning_rate": 9.624353142684611e-06, + "loss": 0.5724, + "step": 349 + }, + { + "epoch": 0.1692047377326565, + "grad_norm": 0.5035595085634601, + "learning_rate": 9.621307308142385e-06, + "loss": 0.5794, + "step": 350 + }, + { + "epoch": 0.1696881798404641, + "grad_norm": 0.521381746170865, + "learning_rate": 9.618249661166218e-06, + "loss": 0.5764, + "step": 351 + }, + { + "epoch": 0.1701716219482717, + "grad_norm": 0.48214165657815927, + "learning_rate": 9.615180209571709e-06, + "loss": 0.5804, + "step": 352 + }, + { + "epoch": 0.17065506405607928, + "grad_norm": 0.47552991671065514, + "learning_rate": 9.612098961204617e-06, + "loss": 0.5581, + "step": 353 + }, + { + "epoch": 0.17113850616388687, + "grad_norm": 0.46097880469562935, + "learning_rate": 9.609005923940865e-06, + "loss": 0.5618, + "step": 354 + }, + { + "epoch": 0.17162194827169447, + "grad_norm": 0.5629931104502605, + "learning_rate": 9.605901105686503e-06, + "loss": 0.5694, + "step": 355 + }, + { + "epoch": 0.17210539037950207, + "grad_norm": 0.5179757776717347, + "learning_rate": 9.602784514377701e-06, + "loss": 0.5897, + "step": 356 + }, + { + "epoch": 0.17258883248730963, + "grad_norm": 0.5355839686571028, + "learning_rate": 9.599656157980715e-06, + "loss": 0.5724, + "step": 357 + }, + { + "epoch": 0.17307227459511723, + "grad_norm": 0.6350286695754506, + "learning_rate": 9.596516044491873e-06, + "loss": 0.577, + "step": 358 + }, + { + "epoch": 0.17355571670292483, + "grad_norm": 0.5295601313068036, + "learning_rate": 9.593364181937563e-06, + "loss": 0.5834, + "step": 359 + }, + { + "epoch": 0.17403915881073242, + "grad_norm": 0.5016272467409, + "learning_rate": 9.590200578374198e-06, + "loss": 0.5848, + "step": 360 + }, + { + "epoch": 0.17452260091854, + "grad_norm": 0.4734403734457174, + "learning_rate": 9.587025241888202e-06, + "loss": 0.5629, + "step": 361 + }, + { + "epoch": 0.1750060430263476, + "grad_norm": 0.5345541955737336, + "learning_rate": 9.583838180595993e-06, + "loss": 0.5619, + "step": 362 + }, + { + "epoch": 0.1754894851341552, + "grad_norm": 0.5159159294276754, + "learning_rate": 9.580639402643957e-06, + "loss": 0.5788, + "step": 363 + }, + { + "epoch": 0.17597292724196278, + "grad_norm": 0.5475730953848408, + "learning_rate": 9.577428916208426e-06, + "loss": 0.5758, + "step": 364 + }, + { + "epoch": 0.17645636934977035, + "grad_norm": 0.5065491502971655, + "learning_rate": 9.574206729495662e-06, + "loss": 0.5739, + "step": 365 + }, + { + "epoch": 0.17693981145757795, + "grad_norm": 0.5385122338140608, + "learning_rate": 9.570972850741839e-06, + "loss": 0.5646, + "step": 366 + }, + { + "epoch": 0.17742325356538555, + "grad_norm": 0.5282114345918013, + "learning_rate": 9.567727288213005e-06, + "loss": 0.5809, + "step": 367 + }, + { + "epoch": 0.17790669567319314, + "grad_norm": 0.5183724179001736, + "learning_rate": 9.564470050205084e-06, + "loss": 0.5745, + "step": 368 + }, + { + "epoch": 0.1783901377810007, + "grad_norm": 0.501228022506401, + "learning_rate": 9.561201145043835e-06, + "loss": 0.5759, + "step": 369 + }, + { + "epoch": 0.1788735798888083, + "grad_norm": 0.5161478035704796, + "learning_rate": 9.557920581084848e-06, + "loss": 0.5716, + "step": 370 + }, + { + "epoch": 0.1793570219966159, + "grad_norm": 0.5508440640900468, + "learning_rate": 9.554628366713506e-06, + "loss": 0.5681, + "step": 371 + }, + { + "epoch": 0.1798404641044235, + "grad_norm": 0.4958022642187558, + "learning_rate": 9.551324510344972e-06, + "loss": 0.5674, + "step": 372 + }, + { + "epoch": 0.18032390621223107, + "grad_norm": 0.5211800045547449, + "learning_rate": 9.548009020424172e-06, + "loss": 0.5759, + "step": 373 + }, + { + "epoch": 0.18080734832003867, + "grad_norm": 0.5234346072417955, + "learning_rate": 9.544681905425767e-06, + "loss": 0.5761, + "step": 374 + }, + { + "epoch": 0.18129079042784627, + "grad_norm": 0.5277623761050696, + "learning_rate": 9.541343173854128e-06, + "loss": 0.5846, + "step": 375 + }, + { + "epoch": 0.18177423253565386, + "grad_norm": 0.5159488960453931, + "learning_rate": 9.537992834243323e-06, + "loss": 0.5655, + "step": 376 + }, + { + "epoch": 0.18225767464346146, + "grad_norm": 0.5036893425002033, + "learning_rate": 9.53463089515709e-06, + "loss": 0.578, + "step": 377 + }, + { + "epoch": 0.18274111675126903, + "grad_norm": 0.5500694186101432, + "learning_rate": 9.531257365188818e-06, + "loss": 0.5683, + "step": 378 + }, + { + "epoch": 0.18322455885907662, + "grad_norm": 0.4446123327167339, + "learning_rate": 9.527872252961518e-06, + "loss": 0.5112, + "step": 379 + }, + { + "epoch": 0.18370800096688422, + "grad_norm": 0.49646226307611685, + "learning_rate": 9.524475567127813e-06, + "loss": 0.5799, + "step": 380 + }, + { + "epoch": 0.18419144307469182, + "grad_norm": 0.5455620647014985, + "learning_rate": 9.521067316369903e-06, + "loss": 0.5601, + "step": 381 + }, + { + "epoch": 0.1846748851824994, + "grad_norm": 0.5073331374598753, + "learning_rate": 9.517647509399555e-06, + "loss": 0.5399, + "step": 382 + }, + { + "epoch": 0.18515832729030698, + "grad_norm": 0.5171824333562809, + "learning_rate": 9.514216154958067e-06, + "loss": 0.5754, + "step": 383 + }, + { + "epoch": 0.18564176939811458, + "grad_norm": 0.5085818096253197, + "learning_rate": 9.510773261816261e-06, + "loss": 0.5623, + "step": 384 + }, + { + "epoch": 0.18612521150592218, + "grad_norm": 0.50056273177622, + "learning_rate": 9.507318838774448e-06, + "loss": 0.5774, + "step": 385 + }, + { + "epoch": 0.18660865361372975, + "grad_norm": 0.5493241761943409, + "learning_rate": 9.50385289466241e-06, + "loss": 0.5698, + "step": 386 + }, + { + "epoch": 0.18709209572153734, + "grad_norm": 0.48083872272472233, + "learning_rate": 9.500375438339384e-06, + "loss": 0.5634, + "step": 387 + }, + { + "epoch": 0.18757553782934494, + "grad_norm": 0.48598643847981954, + "learning_rate": 9.496886478694025e-06, + "loss": 0.5642, + "step": 388 + }, + { + "epoch": 0.18805897993715254, + "grad_norm": 0.4945695421669264, + "learning_rate": 9.493386024644396e-06, + "loss": 0.5763, + "step": 389 + }, + { + "epoch": 0.1885424220449601, + "grad_norm": 0.4630609260733735, + "learning_rate": 9.48987408513794e-06, + "loss": 0.5667, + "step": 390 + }, + { + "epoch": 0.1890258641527677, + "grad_norm": 0.5178132025025237, + "learning_rate": 9.486350669151455e-06, + "loss": 0.5633, + "step": 391 + }, + { + "epoch": 0.1895093062605753, + "grad_norm": 0.4855261545618926, + "learning_rate": 9.482815785691082e-06, + "loss": 0.5705, + "step": 392 + }, + { + "epoch": 0.1899927483683829, + "grad_norm": 0.48580056178653924, + "learning_rate": 9.47926944379226e-06, + "loss": 0.5703, + "step": 393 + }, + { + "epoch": 0.19047619047619047, + "grad_norm": 0.5308237684959329, + "learning_rate": 9.475711652519732e-06, + "loss": 0.5583, + "step": 394 + }, + { + "epoch": 0.19095963258399806, + "grad_norm": 0.5127712618313278, + "learning_rate": 9.472142420967496e-06, + "loss": 0.5674, + "step": 395 + }, + { + "epoch": 0.19144307469180566, + "grad_norm": 0.4833488281294125, + "learning_rate": 9.468561758258795e-06, + "loss": 0.578, + "step": 396 + }, + { + "epoch": 0.19192651679961326, + "grad_norm": 0.5644191416840888, + "learning_rate": 9.464969673546092e-06, + "loss": 0.582, + "step": 397 + }, + { + "epoch": 0.19240995890742082, + "grad_norm": 0.5561428050479044, + "learning_rate": 9.461366176011047e-06, + "loss": 0.5762, + "step": 398 + }, + { + "epoch": 0.19289340101522842, + "grad_norm": 0.49800634280761286, + "learning_rate": 9.457751274864486e-06, + "loss": 0.5786, + "step": 399 + }, + { + "epoch": 0.19337684312303602, + "grad_norm": 0.464098426014889, + "learning_rate": 9.454124979346392e-06, + "loss": 0.531, + "step": 400 + }, + { + "epoch": 0.19386028523084362, + "grad_norm": 0.5317711530861378, + "learning_rate": 9.450487298725866e-06, + "loss": 0.5735, + "step": 401 + }, + { + "epoch": 0.19434372733865118, + "grad_norm": 0.5700860255634325, + "learning_rate": 9.446838242301113e-06, + "loss": 0.5736, + "step": 402 + }, + { + "epoch": 0.19482716944645878, + "grad_norm": 0.5415575586047788, + "learning_rate": 9.443177819399416e-06, + "loss": 0.5682, + "step": 403 + }, + { + "epoch": 0.19531061155426638, + "grad_norm": 0.45162964809703743, + "learning_rate": 9.439506039377111e-06, + "loss": 0.5457, + "step": 404 + }, + { + "epoch": 0.19579405366207397, + "grad_norm": 0.48073200361222107, + "learning_rate": 9.435822911619564e-06, + "loss": 0.5452, + "step": 405 + }, + { + "epoch": 0.19627749576988154, + "grad_norm": 0.5218011226870963, + "learning_rate": 9.432128445541147e-06, + "loss": 0.5569, + "step": 406 + }, + { + "epoch": 0.19676093787768914, + "grad_norm": 0.5241766492312198, + "learning_rate": 9.42842265058521e-06, + "loss": 0.5791, + "step": 407 + }, + { + "epoch": 0.19724437998549674, + "grad_norm": 0.4747479232641684, + "learning_rate": 9.424705536224065e-06, + "loss": 0.572, + "step": 408 + }, + { + "epoch": 0.19772782209330433, + "grad_norm": 0.4892195750767198, + "learning_rate": 9.420977111958957e-06, + "loss": 0.577, + "step": 409 + }, + { + "epoch": 0.1982112642011119, + "grad_norm": 0.49625147154018395, + "learning_rate": 9.41723738732004e-06, + "loss": 0.5673, + "step": 410 + }, + { + "epoch": 0.1986947063089195, + "grad_norm": 0.553969116933997, + "learning_rate": 9.41348637186635e-06, + "loss": 0.5805, + "step": 411 + }, + { + "epoch": 0.1991781484167271, + "grad_norm": 0.5271833056864474, + "learning_rate": 9.409724075185782e-06, + "loss": 0.5811, + "step": 412 + }, + { + "epoch": 0.1996615905245347, + "grad_norm": 0.541152410560869, + "learning_rate": 9.405950506895074e-06, + "loss": 0.5539, + "step": 413 + }, + { + "epoch": 0.2001450326323423, + "grad_norm": 0.4827367980584999, + "learning_rate": 9.40216567663977e-06, + "loss": 0.5754, + "step": 414 + }, + { + "epoch": 0.20062847474014986, + "grad_norm": 0.49177545628835745, + "learning_rate": 9.398369594094198e-06, + "loss": 0.508, + "step": 415 + }, + { + "epoch": 0.20111191684795746, + "grad_norm": 0.50467312755319, + "learning_rate": 9.394562268961454e-06, + "loss": 0.5681, + "step": 416 + }, + { + "epoch": 0.20159535895576505, + "grad_norm": 0.4916777572033636, + "learning_rate": 9.390743710973366e-06, + "loss": 0.575, + "step": 417 + }, + { + "epoch": 0.20207880106357265, + "grad_norm": 0.5183550927798377, + "learning_rate": 9.386913929890478e-06, + "loss": 0.57, + "step": 418 + }, + { + "epoch": 0.20256224317138022, + "grad_norm": 0.47362092706218123, + "learning_rate": 9.383072935502018e-06, + "loss": 0.5644, + "step": 419 + }, + { + "epoch": 0.20304568527918782, + "grad_norm": 0.49530019201729136, + "learning_rate": 9.379220737625877e-06, + "loss": 0.564, + "step": 420 + }, + { + "epoch": 0.2035291273869954, + "grad_norm": 0.5108751966700111, + "learning_rate": 9.375357346108583e-06, + "loss": 0.5602, + "step": 421 + }, + { + "epoch": 0.204012569494803, + "grad_norm": 0.5230318233484302, + "learning_rate": 9.371482770825277e-06, + "loss": 0.5695, + "step": 422 + }, + { + "epoch": 0.20449601160261058, + "grad_norm": 0.4802393361720882, + "learning_rate": 9.367597021679686e-06, + "loss": 0.5661, + "step": 423 + }, + { + "epoch": 0.20497945371041817, + "grad_norm": 0.541773712373739, + "learning_rate": 9.363700108604096e-06, + "loss": 0.5582, + "step": 424 + }, + { + "epoch": 0.20546289581822577, + "grad_norm": 0.5110126727655455, + "learning_rate": 9.359792041559334e-06, + "loss": 0.5645, + "step": 425 + }, + { + "epoch": 0.20594633792603337, + "grad_norm": 0.5486480496411716, + "learning_rate": 9.35587283053473e-06, + "loss": 0.5677, + "step": 426 + }, + { + "epoch": 0.20642978003384094, + "grad_norm": 0.5379779057549923, + "learning_rate": 9.351942485548109e-06, + "loss": 0.5435, + "step": 427 + }, + { + "epoch": 0.20691322214164853, + "grad_norm": 0.5341397558862222, + "learning_rate": 9.348001016645744e-06, + "loss": 0.5599, + "step": 428 + }, + { + "epoch": 0.20739666424945613, + "grad_norm": 0.44238086682442823, + "learning_rate": 9.344048433902351e-06, + "loss": 0.541, + "step": 429 + }, + { + "epoch": 0.20788010635726373, + "grad_norm": 0.5213851954927032, + "learning_rate": 9.340084747421048e-06, + "loss": 0.5366, + "step": 430 + }, + { + "epoch": 0.2083635484650713, + "grad_norm": 0.5349032988779688, + "learning_rate": 9.336109967333337e-06, + "loss": 0.5571, + "step": 431 + }, + { + "epoch": 0.2088469905728789, + "grad_norm": 0.4554230771685569, + "learning_rate": 9.332124103799075e-06, + "loss": 0.5516, + "step": 432 + }, + { + "epoch": 0.2093304326806865, + "grad_norm": 0.5021585721937876, + "learning_rate": 9.328127167006457e-06, + "loss": 0.5679, + "step": 433 + }, + { + "epoch": 0.2098138747884941, + "grad_norm": 0.5025134126056662, + "learning_rate": 9.324119167171967e-06, + "loss": 0.5659, + "step": 434 + }, + { + "epoch": 0.21029731689630166, + "grad_norm": 0.48977518403096176, + "learning_rate": 9.320100114540382e-06, + "loss": 0.5753, + "step": 435 + }, + { + "epoch": 0.21078075900410925, + "grad_norm": 0.4789181842167065, + "learning_rate": 9.316070019384722e-06, + "loss": 0.558, + "step": 436 + }, + { + "epoch": 0.21126420111191685, + "grad_norm": 0.48417362744631853, + "learning_rate": 9.312028892006233e-06, + "loss": 0.5637, + "step": 437 + }, + { + "epoch": 0.21174764321972445, + "grad_norm": 0.5040441298097904, + "learning_rate": 9.307976742734366e-06, + "loss": 0.5603, + "step": 438 + }, + { + "epoch": 0.21223108532753202, + "grad_norm": 0.5003182083782678, + "learning_rate": 9.30391358192674e-06, + "loss": 0.5583, + "step": 439 + }, + { + "epoch": 0.2127145274353396, + "grad_norm": 0.5188458903874932, + "learning_rate": 9.299839419969119e-06, + "loss": 0.5614, + "step": 440 + }, + { + "epoch": 0.2131979695431472, + "grad_norm": 0.4990120996823676, + "learning_rate": 9.295754267275393e-06, + "loss": 0.5732, + "step": 441 + }, + { + "epoch": 0.2136814116509548, + "grad_norm": 0.43407580533296863, + "learning_rate": 9.291658134287537e-06, + "loss": 0.5451, + "step": 442 + }, + { + "epoch": 0.21416485375876237, + "grad_norm": 0.487299832131986, + "learning_rate": 9.287551031475604e-06, + "loss": 0.5486, + "step": 443 + }, + { + "epoch": 0.21464829586656997, + "grad_norm": 0.4748601209022523, + "learning_rate": 9.283432969337672e-06, + "loss": 0.5568, + "step": 444 + }, + { + "epoch": 0.21513173797437757, + "grad_norm": 0.5116954397180901, + "learning_rate": 9.279303958399846e-06, + "loss": 0.5561, + "step": 445 + }, + { + "epoch": 0.21561518008218516, + "grad_norm": 0.5103832796562369, + "learning_rate": 9.275164009216205e-06, + "loss": 0.5653, + "step": 446 + }, + { + "epoch": 0.21609862218999276, + "grad_norm": 0.453674255766726, + "learning_rate": 9.271013132368799e-06, + "loss": 0.5359, + "step": 447 + }, + { + "epoch": 0.21658206429780033, + "grad_norm": 0.4865827031825044, + "learning_rate": 9.266851338467598e-06, + "loss": 0.5627, + "step": 448 + }, + { + "epoch": 0.21706550640560793, + "grad_norm": 0.4474998958247519, + "learning_rate": 9.262678638150486e-06, + "loss": 0.5372, + "step": 449 + }, + { + "epoch": 0.21754894851341552, + "grad_norm": 0.5312817145455567, + "learning_rate": 9.258495042083222e-06, + "loss": 0.583, + "step": 450 + }, + { + "epoch": 0.21803239062122312, + "grad_norm": 0.5326646088756841, + "learning_rate": 9.254300560959413e-06, + "loss": 0.5641, + "step": 451 + }, + { + "epoch": 0.2185158327290307, + "grad_norm": 0.47741110714076435, + "learning_rate": 9.25009520550049e-06, + "loss": 0.5692, + "step": 452 + }, + { + "epoch": 0.2189992748368383, + "grad_norm": 0.4992778758439529, + "learning_rate": 9.245878986455684e-06, + "loss": 0.5732, + "step": 453 + }, + { + "epoch": 0.21948271694464588, + "grad_norm": 0.5067531688765293, + "learning_rate": 9.241651914601986e-06, + "loss": 0.5684, + "step": 454 + }, + { + "epoch": 0.21996615905245348, + "grad_norm": 0.5259329600281596, + "learning_rate": 9.237414000744134e-06, + "loss": 0.5728, + "step": 455 + }, + { + "epoch": 0.22044960116026105, + "grad_norm": 0.4912112930780334, + "learning_rate": 9.23316525571458e-06, + "loss": 0.5543, + "step": 456 + }, + { + "epoch": 0.22093304326806865, + "grad_norm": 0.4325116439857764, + "learning_rate": 9.228905690373456e-06, + "loss": 0.5109, + "step": 457 + }, + { + "epoch": 0.22141648537587624, + "grad_norm": 0.5251969417490432, + "learning_rate": 9.224635315608554e-06, + "loss": 0.5613, + "step": 458 + }, + { + "epoch": 0.22189992748368384, + "grad_norm": 0.5371164613513753, + "learning_rate": 9.2203541423353e-06, + "loss": 0.5758, + "step": 459 + }, + { + "epoch": 0.2223833695914914, + "grad_norm": 0.49879877094748626, + "learning_rate": 9.216062181496712e-06, + "loss": 0.5656, + "step": 460 + }, + { + "epoch": 0.222866811699299, + "grad_norm": 0.4666739038962981, + "learning_rate": 9.211759444063392e-06, + "loss": 0.5643, + "step": 461 + }, + { + "epoch": 0.2233502538071066, + "grad_norm": 0.5019702713381807, + "learning_rate": 9.207445941033483e-06, + "loss": 0.5645, + "step": 462 + }, + { + "epoch": 0.2238336959149142, + "grad_norm": 0.560484985437826, + "learning_rate": 9.203121683432646e-06, + "loss": 0.5622, + "step": 463 + }, + { + "epoch": 0.22431713802272177, + "grad_norm": 0.501701537299382, + "learning_rate": 9.19878668231403e-06, + "loss": 0.5686, + "step": 464 + }, + { + "epoch": 0.22480058013052936, + "grad_norm": 0.48640275847390047, + "learning_rate": 9.19444094875825e-06, + "loss": 0.5617, + "step": 465 + }, + { + "epoch": 0.22528402223833696, + "grad_norm": 0.5066662929437282, + "learning_rate": 9.190084493873353e-06, + "loss": 0.5733, + "step": 466 + }, + { + "epoch": 0.22576746434614456, + "grad_norm": 0.5297511031777309, + "learning_rate": 9.185717328794784e-06, + "loss": 0.5632, + "step": 467 + }, + { + "epoch": 0.22625090645395213, + "grad_norm": 0.5778692323663056, + "learning_rate": 9.18133946468537e-06, + "loss": 0.5684, + "step": 468 + }, + { + "epoch": 0.22673434856175972, + "grad_norm": 0.5148715492097395, + "learning_rate": 9.176950912735287e-06, + "loss": 0.5559, + "step": 469 + }, + { + "epoch": 0.22721779066956732, + "grad_norm": 0.5157447753884506, + "learning_rate": 9.172551684162025e-06, + "loss": 0.5731, + "step": 470 + }, + { + "epoch": 0.22770123277737492, + "grad_norm": 0.4783319000473412, + "learning_rate": 9.16814179021037e-06, + "loss": 0.5671, + "step": 471 + }, + { + "epoch": 0.2281846748851825, + "grad_norm": 0.5017422895280137, + "learning_rate": 9.163721242152362e-06, + "loss": 0.5661, + "step": 472 + }, + { + "epoch": 0.22866811699299008, + "grad_norm": 0.49272124386072536, + "learning_rate": 9.159290051287282e-06, + "loss": 0.5627, + "step": 473 + }, + { + "epoch": 0.22915155910079768, + "grad_norm": 0.47471736533769476, + "learning_rate": 9.154848228941607e-06, + "loss": 0.5615, + "step": 474 + }, + { + "epoch": 0.22963500120860528, + "grad_norm": 0.5071884927272643, + "learning_rate": 9.150395786468998e-06, + "loss": 0.5645, + "step": 475 + }, + { + "epoch": 0.23011844331641285, + "grad_norm": 0.48690399925776484, + "learning_rate": 9.14593273525025e-06, + "loss": 0.5647, + "step": 476 + }, + { + "epoch": 0.23060188542422044, + "grad_norm": 0.5041235784595942, + "learning_rate": 9.14145908669329e-06, + "loss": 0.5729, + "step": 477 + }, + { + "epoch": 0.23108532753202804, + "grad_norm": 0.5265161224054821, + "learning_rate": 9.136974852233118e-06, + "loss": 0.5587, + "step": 478 + }, + { + "epoch": 0.23156876963983564, + "grad_norm": 0.4778337324840926, + "learning_rate": 9.132480043331801e-06, + "loss": 0.5646, + "step": 479 + }, + { + "epoch": 0.23205221174764323, + "grad_norm": 0.5036800160533508, + "learning_rate": 9.127974671478432e-06, + "loss": 0.5655, + "step": 480 + }, + { + "epoch": 0.2325356538554508, + "grad_norm": 0.4915164507750186, + "learning_rate": 9.123458748189105e-06, + "loss": 0.5608, + "step": 481 + }, + { + "epoch": 0.2330190959632584, + "grad_norm": 0.4447947403953834, + "learning_rate": 9.118932285006886e-06, + "loss": 0.5254, + "step": 482 + }, + { + "epoch": 0.233502538071066, + "grad_norm": 0.4936810479165672, + "learning_rate": 9.114395293501775e-06, + "loss": 0.5751, + "step": 483 + }, + { + "epoch": 0.2339859801788736, + "grad_norm": 0.4933009245810686, + "learning_rate": 9.10984778527069e-06, + "loss": 0.5603, + "step": 484 + }, + { + "epoch": 0.23446942228668116, + "grad_norm": 0.4720549987110232, + "learning_rate": 9.10528977193743e-06, + "loss": 0.5703, + "step": 485 + }, + { + "epoch": 0.23495286439448876, + "grad_norm": 0.5362136689894559, + "learning_rate": 9.100721265152644e-06, + "loss": 0.5635, + "step": 486 + }, + { + "epoch": 0.23543630650229636, + "grad_norm": 0.47602005538977166, + "learning_rate": 9.096142276593802e-06, + "loss": 0.5721, + "step": 487 + }, + { + "epoch": 0.23591974861010395, + "grad_norm": 0.48887012727323886, + "learning_rate": 9.09155281796517e-06, + "loss": 0.5502, + "step": 488 + }, + { + "epoch": 0.23640319071791152, + "grad_norm": 0.5468866437635687, + "learning_rate": 9.086952900997774e-06, + "loss": 0.5628, + "step": 489 + }, + { + "epoch": 0.23688663282571912, + "grad_norm": 0.468285091758703, + "learning_rate": 9.082342537449369e-06, + "loss": 0.5649, + "step": 490 + }, + { + "epoch": 0.23737007493352671, + "grad_norm": 0.49449575173177474, + "learning_rate": 9.07772173910442e-06, + "loss": 0.5363, + "step": 491 + }, + { + "epoch": 0.2378535170413343, + "grad_norm": 0.5665277859908898, + "learning_rate": 9.073090517774057e-06, + "loss": 0.5679, + "step": 492 + }, + { + "epoch": 0.23833695914914188, + "grad_norm": 0.559218042712036, + "learning_rate": 9.068448885296057e-06, + "loss": 0.5598, + "step": 493 + }, + { + "epoch": 0.23882040125694948, + "grad_norm": 0.5572180299965971, + "learning_rate": 9.063796853534808e-06, + "loss": 0.5606, + "step": 494 + }, + { + "epoch": 0.23930384336475707, + "grad_norm": 0.4852501650353095, + "learning_rate": 9.059134434381274e-06, + "loss": 0.5614, + "step": 495 + }, + { + "epoch": 0.23978728547256467, + "grad_norm": 0.5235782249928449, + "learning_rate": 9.054461639752976e-06, + "loss": 0.5637, + "step": 496 + }, + { + "epoch": 0.24027072758037224, + "grad_norm": 0.5028533022976227, + "learning_rate": 9.049778481593954e-06, + "loss": 0.5718, + "step": 497 + }, + { + "epoch": 0.24075416968817984, + "grad_norm": 0.508045864936268, + "learning_rate": 9.045084971874738e-06, + "loss": 0.5651, + "step": 498 + }, + { + "epoch": 0.24123761179598743, + "grad_norm": 0.5575870011120908, + "learning_rate": 9.040381122592317e-06, + "loss": 0.565, + "step": 499 + }, + { + "epoch": 0.24172105390379503, + "grad_norm": 0.5201685839473924, + "learning_rate": 9.035666945770107e-06, + "loss": 0.5593, + "step": 500 + }, + { + "epoch": 0.2422044960116026, + "grad_norm": 0.48179233555943923, + "learning_rate": 9.030942453457928e-06, + "loss": 0.5199, + "step": 501 + }, + { + "epoch": 0.2426879381194102, + "grad_norm": 0.4867208952029737, + "learning_rate": 9.02620765773196e-06, + "loss": 0.5548, + "step": 502 + }, + { + "epoch": 0.2431713802272178, + "grad_norm": 0.5240394440690106, + "learning_rate": 9.02146257069472e-06, + "loss": 0.5611, + "step": 503 + }, + { + "epoch": 0.2436548223350254, + "grad_norm": 0.48307750050965703, + "learning_rate": 9.01670720447504e-06, + "loss": 0.5577, + "step": 504 + }, + { + "epoch": 0.24413826444283296, + "grad_norm": 0.5034030614527921, + "learning_rate": 9.011941571228015e-06, + "loss": 0.5608, + "step": 505 + }, + { + "epoch": 0.24462170655064056, + "grad_norm": 0.46379490536223517, + "learning_rate": 9.007165683134986e-06, + "loss": 0.5315, + "step": 506 + }, + { + "epoch": 0.24510514865844815, + "grad_norm": 0.5103811282689319, + "learning_rate": 9.00237955240351e-06, + "loss": 0.5613, + "step": 507 + }, + { + "epoch": 0.24558859076625575, + "grad_norm": 0.47564392120255755, + "learning_rate": 8.997583191267326e-06, + "loss": 0.5764, + "step": 508 + }, + { + "epoch": 0.24607203287406332, + "grad_norm": 0.4811799201923712, + "learning_rate": 8.992776611986313e-06, + "loss": 0.5704, + "step": 509 + }, + { + "epoch": 0.24655547498187091, + "grad_norm": 0.4799439081762819, + "learning_rate": 8.987959826846479e-06, + "loss": 0.5573, + "step": 510 + }, + { + "epoch": 0.2470389170896785, + "grad_norm": 0.5219349618857427, + "learning_rate": 8.983132848159916e-06, + "loss": 0.5583, + "step": 511 + }, + { + "epoch": 0.2475223591974861, + "grad_norm": 0.5010818591918965, + "learning_rate": 8.978295688264768e-06, + "loss": 0.5699, + "step": 512 + }, + { + "epoch": 0.2480058013052937, + "grad_norm": 0.5282819201955711, + "learning_rate": 8.973448359525207e-06, + "loss": 0.5641, + "step": 513 + }, + { + "epoch": 0.24848924341310127, + "grad_norm": 0.5025819972323563, + "learning_rate": 8.968590874331395e-06, + "loss": 0.5649, + "step": 514 + }, + { + "epoch": 0.24897268552090887, + "grad_norm": 0.4880024154213522, + "learning_rate": 8.963723245099456e-06, + "loss": 0.5533, + "step": 515 + }, + { + "epoch": 0.24945612762871647, + "grad_norm": 0.4844265343558768, + "learning_rate": 8.958845484271443e-06, + "loss": 0.5571, + "step": 516 + }, + { + "epoch": 0.24993956973652406, + "grad_norm": 0.4918270286134992, + "learning_rate": 8.953957604315306e-06, + "loss": 0.5612, + "step": 517 + }, + { + "epoch": 0.25042301184433163, + "grad_norm": 0.4532098318099568, + "learning_rate": 8.949059617724859e-06, + "loss": 0.5532, + "step": 518 + }, + { + "epoch": 0.25090645395213923, + "grad_norm": 0.4784777680132966, + "learning_rate": 8.944151537019752e-06, + "loss": 0.5314, + "step": 519 + }, + { + "epoch": 0.2513898960599468, + "grad_norm": 0.49834032614411844, + "learning_rate": 8.939233374745432e-06, + "loss": 0.561, + "step": 520 + }, + { + "epoch": 0.2518733381677544, + "grad_norm": 0.43922831313439964, + "learning_rate": 8.934305143473123e-06, + "loss": 0.5229, + "step": 521 + }, + { + "epoch": 0.252356780275562, + "grad_norm": 0.4774051999235377, + "learning_rate": 8.929366855799777e-06, + "loss": 0.5584, + "step": 522 + }, + { + "epoch": 0.2528402223833696, + "grad_norm": 0.4860585540987837, + "learning_rate": 8.924418524348058e-06, + "loss": 0.5722, + "step": 523 + }, + { + "epoch": 0.25332366449117716, + "grad_norm": 0.476115105724116, + "learning_rate": 8.919460161766299e-06, + "loss": 0.5527, + "step": 524 + }, + { + "epoch": 0.25380710659898476, + "grad_norm": 0.49670836036646415, + "learning_rate": 8.914491780728471e-06, + "loss": 0.565, + "step": 525 + }, + { + "epoch": 0.25429054870679235, + "grad_norm": 0.49705890206049747, + "learning_rate": 8.909513393934162e-06, + "loss": 0.5562, + "step": 526 + }, + { + "epoch": 0.25477399081459995, + "grad_norm": 0.5118474736649574, + "learning_rate": 8.904525014108529e-06, + "loss": 0.5536, + "step": 527 + }, + { + "epoch": 0.25525743292240755, + "grad_norm": 0.5301718242423505, + "learning_rate": 8.899526654002268e-06, + "loss": 0.5612, + "step": 528 + }, + { + "epoch": 0.25574087503021514, + "grad_norm": 0.4796891269551852, + "learning_rate": 8.894518326391595e-06, + "loss": 0.5578, + "step": 529 + }, + { + "epoch": 0.25622431713802274, + "grad_norm": 0.4825310469483714, + "learning_rate": 8.889500044078199e-06, + "loss": 0.5554, + "step": 530 + }, + { + "epoch": 0.25670775924583034, + "grad_norm": 0.474416307358851, + "learning_rate": 8.88447181988921e-06, + "loss": 0.5466, + "step": 531 + }, + { + "epoch": 0.2571912013536379, + "grad_norm": 0.4754427571901456, + "learning_rate": 8.87943366667718e-06, + "loss": 0.5232, + "step": 532 + }, + { + "epoch": 0.2576746434614455, + "grad_norm": 0.5283380707149146, + "learning_rate": 8.87438559732003e-06, + "loss": 0.5575, + "step": 533 + }, + { + "epoch": 0.25815808556925307, + "grad_norm": 0.49022668890084664, + "learning_rate": 8.869327624721033e-06, + "loss": 0.5584, + "step": 534 + }, + { + "epoch": 0.25864152767706067, + "grad_norm": 0.45329648879294543, + "learning_rate": 8.864259761808778e-06, + "loss": 0.5557, + "step": 535 + }, + { + "epoch": 0.25912496978486826, + "grad_norm": 0.5150923796193744, + "learning_rate": 8.859182021537126e-06, + "loss": 0.5672, + "step": 536 + }, + { + "epoch": 0.25960841189267586, + "grad_norm": 0.49475203737919254, + "learning_rate": 8.854094416885192e-06, + "loss": 0.5513, + "step": 537 + }, + { + "epoch": 0.26009185400048346, + "grad_norm": 0.48640723658571816, + "learning_rate": 8.848996960857308e-06, + "loss": 0.5542, + "step": 538 + }, + { + "epoch": 0.26057529610829105, + "grad_norm": 0.5011403090647114, + "learning_rate": 8.843889666482977e-06, + "loss": 0.5503, + "step": 539 + }, + { + "epoch": 0.2610587382160986, + "grad_norm": 0.45868293065964316, + "learning_rate": 8.838772546816857e-06, + "loss": 0.5245, + "step": 540 + }, + { + "epoch": 0.2615421803239062, + "grad_norm": 0.5028670832415251, + "learning_rate": 8.833645614938716e-06, + "loss": 0.563, + "step": 541 + }, + { + "epoch": 0.2620256224317138, + "grad_norm": 0.4652958998559184, + "learning_rate": 8.82850888395341e-06, + "loss": 0.5214, + "step": 542 + }, + { + "epoch": 0.2625090645395214, + "grad_norm": 0.48814680090193757, + "learning_rate": 8.823362366990833e-06, + "loss": 0.5539, + "step": 543 + }, + { + "epoch": 0.262992506647329, + "grad_norm": 0.4686742850265713, + "learning_rate": 8.818206077205899e-06, + "loss": 0.5432, + "step": 544 + }, + { + "epoch": 0.2634759487551366, + "grad_norm": 0.44177435010013455, + "learning_rate": 8.8130400277785e-06, + "loss": 0.5432, + "step": 545 + }, + { + "epoch": 0.2639593908629442, + "grad_norm": 0.48745119596264225, + "learning_rate": 8.807864231913475e-06, + "loss": 0.5609, + "step": 546 + }, + { + "epoch": 0.2644428329707518, + "grad_norm": 0.5387031701921053, + "learning_rate": 8.802678702840575e-06, + "loss": 0.5608, + "step": 547 + }, + { + "epoch": 0.26492627507855937, + "grad_norm": 0.47706550642594997, + "learning_rate": 8.79748345381443e-06, + "loss": 0.5487, + "step": 548 + }, + { + "epoch": 0.2654097171863669, + "grad_norm": 0.4694250929319588, + "learning_rate": 8.792278498114517e-06, + "loss": 0.549, + "step": 549 + }, + { + "epoch": 0.2658931592941745, + "grad_norm": 0.4937111232536657, + "learning_rate": 8.78706384904512e-06, + "loss": 0.5564, + "step": 550 + }, + { + "epoch": 0.2663766014019821, + "grad_norm": 0.468357648344204, + "learning_rate": 8.7818395199353e-06, + "loss": 0.5546, + "step": 551 + }, + { + "epoch": 0.2668600435097897, + "grad_norm": 0.4713572915315673, + "learning_rate": 8.77660552413887e-06, + "loss": 0.5512, + "step": 552 + }, + { + "epoch": 0.2673434856175973, + "grad_norm": 0.4740909938486332, + "learning_rate": 8.77136187503434e-06, + "loss": 0.5631, + "step": 553 + }, + { + "epoch": 0.2678269277254049, + "grad_norm": 0.4472174307551216, + "learning_rate": 8.766108586024904e-06, + "loss": 0.5222, + "step": 554 + }, + { + "epoch": 0.2683103698332125, + "grad_norm": 0.4548110236983466, + "learning_rate": 8.760845670538387e-06, + "loss": 0.5485, + "step": 555 + }, + { + "epoch": 0.2687938119410201, + "grad_norm": 0.5173119662805489, + "learning_rate": 8.755573142027228e-06, + "loss": 0.5624, + "step": 556 + }, + { + "epoch": 0.26927725404882763, + "grad_norm": 0.4812632123799694, + "learning_rate": 8.750291013968432e-06, + "loss": 0.5562, + "step": 557 + }, + { + "epoch": 0.2697606961566352, + "grad_norm": 0.472663174890125, + "learning_rate": 8.744999299863549e-06, + "loss": 0.5669, + "step": 558 + }, + { + "epoch": 0.2702441382644428, + "grad_norm": 0.463122081686998, + "learning_rate": 8.739698013238625e-06, + "loss": 0.557, + "step": 559 + }, + { + "epoch": 0.2707275803722504, + "grad_norm": 0.5188284707009508, + "learning_rate": 8.734387167644171e-06, + "loss": 0.5202, + "step": 560 + }, + { + "epoch": 0.271211022480058, + "grad_norm": 0.49659334079030504, + "learning_rate": 8.729066776655144e-06, + "loss": 0.5605, + "step": 561 + }, + { + "epoch": 0.2716944645878656, + "grad_norm": 0.4433765304016552, + "learning_rate": 8.723736853870888e-06, + "loss": 0.5193, + "step": 562 + }, + { + "epoch": 0.2721779066956732, + "grad_norm": 0.46285084832800716, + "learning_rate": 8.718397412915114e-06, + "loss": 0.5583, + "step": 563 + }, + { + "epoch": 0.2726613488034808, + "grad_norm": 0.4406166472711255, + "learning_rate": 8.713048467435865e-06, + "loss": 0.5365, + "step": 564 + }, + { + "epoch": 0.27314479091128835, + "grad_norm": 0.49591339367367465, + "learning_rate": 8.707690031105478e-06, + "loss": 0.5638, + "step": 565 + }, + { + "epoch": 0.27362823301909595, + "grad_norm": 0.47717175741546425, + "learning_rate": 8.702322117620547e-06, + "loss": 0.5375, + "step": 566 + }, + { + "epoch": 0.27411167512690354, + "grad_norm": 0.49399204569955096, + "learning_rate": 8.696944740701891e-06, + "loss": 0.5502, + "step": 567 + }, + { + "epoch": 0.27459511723471114, + "grad_norm": 0.462084403002843, + "learning_rate": 8.69155791409452e-06, + "loss": 0.549, + "step": 568 + }, + { + "epoch": 0.27507855934251874, + "grad_norm": 0.4733870628371529, + "learning_rate": 8.686161651567596e-06, + "loss": 0.5479, + "step": 569 + }, + { + "epoch": 0.27556200145032633, + "grad_norm": 0.4586305030542931, + "learning_rate": 8.6807559669144e-06, + "loss": 0.517, + "step": 570 + }, + { + "epoch": 0.27604544355813393, + "grad_norm": 0.4786603573138254, + "learning_rate": 8.6753408739523e-06, + "loss": 0.5449, + "step": 571 + }, + { + "epoch": 0.2765288856659415, + "grad_norm": 0.4923356872720239, + "learning_rate": 8.669916386522708e-06, + "loss": 0.5516, + "step": 572 + }, + { + "epoch": 0.27701232777374907, + "grad_norm": 0.47497918747290174, + "learning_rate": 8.664482518491053e-06, + "loss": 0.5527, + "step": 573 + }, + { + "epoch": 0.27749576988155666, + "grad_norm": 0.5463551243922615, + "learning_rate": 8.659039283746738e-06, + "loss": 0.5528, + "step": 574 + }, + { + "epoch": 0.27797921198936426, + "grad_norm": 0.5125817786426824, + "learning_rate": 8.653586696203111e-06, + "loss": 0.5428, + "step": 575 + }, + { + "epoch": 0.27846265409717186, + "grad_norm": 0.46930359618316736, + "learning_rate": 8.648124769797424e-06, + "loss": 0.5566, + "step": 576 + }, + { + "epoch": 0.27894609620497945, + "grad_norm": 0.48871061545968875, + "learning_rate": 8.6426535184908e-06, + "loss": 0.5517, + "step": 577 + }, + { + "epoch": 0.27942953831278705, + "grad_norm": 0.49517469382405177, + "learning_rate": 8.637172956268203e-06, + "loss": 0.5537, + "step": 578 + }, + { + "epoch": 0.27991298042059465, + "grad_norm": 0.46885534036424203, + "learning_rate": 8.631683097138386e-06, + "loss": 0.5455, + "step": 579 + }, + { + "epoch": 0.28039642252840224, + "grad_norm": 0.4404595141316285, + "learning_rate": 8.626183955133876e-06, + "loss": 0.5216, + "step": 580 + }, + { + "epoch": 0.2808798646362098, + "grad_norm": 0.45805738086780906, + "learning_rate": 8.620675544310921e-06, + "loss": 0.5483, + "step": 581 + }, + { + "epoch": 0.2813633067440174, + "grad_norm": 0.43283074014496, + "learning_rate": 8.615157878749462e-06, + "loss": 0.546, + "step": 582 + }, + { + "epoch": 0.281846748851825, + "grad_norm": 0.45714306807295113, + "learning_rate": 8.609630972553098e-06, + "loss": 0.5521, + "step": 583 + }, + { + "epoch": 0.2823301909596326, + "grad_norm": 0.47311376331001226, + "learning_rate": 8.604094839849047e-06, + "loss": 0.5586, + "step": 584 + }, + { + "epoch": 0.2828136330674402, + "grad_norm": 0.4424955765808361, + "learning_rate": 8.598549494788111e-06, + "loss": 0.5384, + "step": 585 + }, + { + "epoch": 0.28329707517524777, + "grad_norm": 0.467505480407099, + "learning_rate": 8.592994951544637e-06, + "loss": 0.5368, + "step": 586 + }, + { + "epoch": 0.28378051728305537, + "grad_norm": 0.48553503600686004, + "learning_rate": 8.587431224316488e-06, + "loss": 0.5475, + "step": 587 + }, + { + "epoch": 0.28426395939086296, + "grad_norm": 0.46529868946828945, + "learning_rate": 8.581858327324996e-06, + "loss": 0.5212, + "step": 588 + }, + { + "epoch": 0.28474740149867056, + "grad_norm": 0.4898248932325677, + "learning_rate": 8.576276274814936e-06, + "loss": 0.553, + "step": 589 + }, + { + "epoch": 0.2852308436064781, + "grad_norm": 0.4455201034159363, + "learning_rate": 8.570685081054487e-06, + "loss": 0.5216, + "step": 590 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.46516198660507346, + "learning_rate": 8.565084760335188e-06, + "loss": 0.5505, + "step": 591 + }, + { + "epoch": 0.2861977278220933, + "grad_norm": 0.5006509760317717, + "learning_rate": 8.559475326971907e-06, + "loss": 0.551, + "step": 592 + }, + { + "epoch": 0.2866811699299009, + "grad_norm": 0.4994980078510237, + "learning_rate": 8.553856795302815e-06, + "loss": 0.5421, + "step": 593 + }, + { + "epoch": 0.2871646120377085, + "grad_norm": 0.5029150812228765, + "learning_rate": 8.548229179689325e-06, + "loss": 0.5519, + "step": 594 + }, + { + "epoch": 0.2876480541455161, + "grad_norm": 0.5073270485472724, + "learning_rate": 8.54259249451608e-06, + "loss": 0.5537, + "step": 595 + }, + { + "epoch": 0.2881314962533237, + "grad_norm": 0.45709172284548705, + "learning_rate": 8.536946754190903e-06, + "loss": 0.564, + "step": 596 + }, + { + "epoch": 0.2886149383611313, + "grad_norm": 0.47174309410425874, + "learning_rate": 8.531291973144755e-06, + "loss": 0.5452, + "step": 597 + }, + { + "epoch": 0.2890983804689388, + "grad_norm": 0.46639713589843634, + "learning_rate": 8.52562816583172e-06, + "loss": 0.5509, + "step": 598 + }, + { + "epoch": 0.2895818225767464, + "grad_norm": 0.4508036851803557, + "learning_rate": 8.519955346728939e-06, + "loss": 0.5428, + "step": 599 + }, + { + "epoch": 0.290065264684554, + "grad_norm": 0.44468353218524803, + "learning_rate": 8.5142735303366e-06, + "loss": 0.5205, + "step": 600 + }, + { + "epoch": 0.2905487067923616, + "grad_norm": 0.4850164433619974, + "learning_rate": 8.50858273117788e-06, + "loss": 0.5476, + "step": 601 + }, + { + "epoch": 0.2910321489001692, + "grad_norm": 0.49303870805000655, + "learning_rate": 8.502882963798923e-06, + "loss": 0.545, + "step": 602 + }, + { + "epoch": 0.2915155910079768, + "grad_norm": 0.47572858582093197, + "learning_rate": 8.497174242768792e-06, + "loss": 0.5515, + "step": 603 + }, + { + "epoch": 0.2919990331157844, + "grad_norm": 0.5284607359345597, + "learning_rate": 8.49145658267944e-06, + "loss": 0.5453, + "step": 604 + }, + { + "epoch": 0.292482475223592, + "grad_norm": 0.47829654266425203, + "learning_rate": 8.485729998145665e-06, + "loss": 0.5452, + "step": 605 + }, + { + "epoch": 0.29296591733139954, + "grad_norm": 0.4503645291799449, + "learning_rate": 8.479994503805079e-06, + "loss": 0.5536, + "step": 606 + }, + { + "epoch": 0.29344935943920714, + "grad_norm": 0.4693738299713831, + "learning_rate": 8.474250114318066e-06, + "loss": 0.5216, + "step": 607 + }, + { + "epoch": 0.29393280154701473, + "grad_norm": 0.4988674830387375, + "learning_rate": 8.468496844367752e-06, + "loss": 0.5582, + "step": 608 + }, + { + "epoch": 0.29441624365482233, + "grad_norm": 0.47627140431869974, + "learning_rate": 8.462734708659959e-06, + "loss": 0.5511, + "step": 609 + }, + { + "epoch": 0.2948996857626299, + "grad_norm": 0.43233992742433075, + "learning_rate": 8.456963721923166e-06, + "loss": 0.5279, + "step": 610 + }, + { + "epoch": 0.2953831278704375, + "grad_norm": 0.4908070311501362, + "learning_rate": 8.451183898908484e-06, + "loss": 0.5546, + "step": 611 + }, + { + "epoch": 0.2958665699782451, + "grad_norm": 0.4519643584485447, + "learning_rate": 8.445395254389605e-06, + "loss": 0.5221, + "step": 612 + }, + { + "epoch": 0.2963500120860527, + "grad_norm": 0.48396713453490725, + "learning_rate": 8.439597803162773e-06, + "loss": 0.5489, + "step": 613 + }, + { + "epoch": 0.29683345419386026, + "grad_norm": 0.4611763742603572, + "learning_rate": 8.433791560046737e-06, + "loss": 0.5457, + "step": 614 + }, + { + "epoch": 0.29731689630166785, + "grad_norm": 0.472544396347692, + "learning_rate": 8.427976539882725e-06, + "loss": 0.5553, + "step": 615 + }, + { + "epoch": 0.29780033840947545, + "grad_norm": 0.5058827141310254, + "learning_rate": 8.422152757534395e-06, + "loss": 0.5435, + "step": 616 + }, + { + "epoch": 0.29828378051728305, + "grad_norm": 0.4766589825937423, + "learning_rate": 8.416320227887805e-06, + "loss": 0.5526, + "step": 617 + }, + { + "epoch": 0.29876722262509064, + "grad_norm": 0.47223702801719897, + "learning_rate": 8.410478965851371e-06, + "loss": 0.5542, + "step": 618 + }, + { + "epoch": 0.29925066473289824, + "grad_norm": 0.4819039683875086, + "learning_rate": 8.404628986355832e-06, + "loss": 0.5546, + "step": 619 + }, + { + "epoch": 0.29973410684070584, + "grad_norm": 0.49462386708237827, + "learning_rate": 8.398770304354203e-06, + "loss": 0.5566, + "step": 620 + }, + { + "epoch": 0.30021754894851344, + "grad_norm": 0.4977376021667819, + "learning_rate": 8.39290293482175e-06, + "loss": 0.5508, + "step": 621 + }, + { + "epoch": 0.30070099105632103, + "grad_norm": 0.506465713525892, + "learning_rate": 8.387026892755942e-06, + "loss": 0.5568, + "step": 622 + }, + { + "epoch": 0.3011844331641286, + "grad_norm": 0.4712688161265267, + "learning_rate": 8.381142193176414e-06, + "loss": 0.5489, + "step": 623 + }, + { + "epoch": 0.30166787527193617, + "grad_norm": 0.4615215470431895, + "learning_rate": 8.375248851124937e-06, + "loss": 0.5554, + "step": 624 + }, + { + "epoch": 0.30215131737974377, + "grad_norm": 0.5110895222198079, + "learning_rate": 8.369346881665364e-06, + "loss": 0.5466, + "step": 625 + }, + { + "epoch": 0.30263475948755136, + "grad_norm": 0.47157470051165545, + "learning_rate": 8.363436299883604e-06, + "loss": 0.5644, + "step": 626 + }, + { + "epoch": 0.30311820159535896, + "grad_norm": 0.4789841431133952, + "learning_rate": 8.357517120887586e-06, + "loss": 0.5493, + "step": 627 + }, + { + "epoch": 0.30360164370316656, + "grad_norm": 0.47629409809645545, + "learning_rate": 8.351589359807204e-06, + "loss": 0.5523, + "step": 628 + }, + { + "epoch": 0.30408508581097415, + "grad_norm": 0.4618925314784255, + "learning_rate": 8.345653031794292e-06, + "loss": 0.5348, + "step": 629 + }, + { + "epoch": 0.30456852791878175, + "grad_norm": 0.500646417496574, + "learning_rate": 8.339708152022586e-06, + "loss": 0.554, + "step": 630 + }, + { + "epoch": 0.3050519700265893, + "grad_norm": 0.4553680483630167, + "learning_rate": 8.333754735687677e-06, + "loss": 0.5489, + "step": 631 + }, + { + "epoch": 0.3055354121343969, + "grad_norm": 0.4591000336987377, + "learning_rate": 8.327792798006977e-06, + "loss": 0.5508, + "step": 632 + }, + { + "epoch": 0.3060188542422045, + "grad_norm": 0.48696012413599493, + "learning_rate": 8.321822354219677e-06, + "loss": 0.5505, + "step": 633 + }, + { + "epoch": 0.3065022963500121, + "grad_norm": 0.4952413093498077, + "learning_rate": 8.315843419586717e-06, + "loss": 0.5574, + "step": 634 + }, + { + "epoch": 0.3069857384578197, + "grad_norm": 0.4716786308005616, + "learning_rate": 8.309856009390732e-06, + "loss": 0.5281, + "step": 635 + }, + { + "epoch": 0.3074691805656273, + "grad_norm": 0.5207738583309734, + "learning_rate": 8.303860138936027e-06, + "loss": 0.5607, + "step": 636 + }, + { + "epoch": 0.3079526226734349, + "grad_norm": 0.5226978234399785, + "learning_rate": 8.297855823548528e-06, + "loss": 0.5565, + "step": 637 + }, + { + "epoch": 0.30843606478124247, + "grad_norm": 0.49251100209183046, + "learning_rate": 8.291843078575752e-06, + "loss": 0.5485, + "step": 638 + }, + { + "epoch": 0.30891950688905, + "grad_norm": 0.4769824051475033, + "learning_rate": 8.285821919386758e-06, + "loss": 0.5456, + "step": 639 + }, + { + "epoch": 0.3094029489968576, + "grad_norm": 0.503019530780954, + "learning_rate": 8.279792361372114e-06, + "loss": 0.5602, + "step": 640 + }, + { + "epoch": 0.3098863911046652, + "grad_norm": 0.48405162661408385, + "learning_rate": 8.273754419943856e-06, + "loss": 0.5536, + "step": 641 + }, + { + "epoch": 0.3103698332124728, + "grad_norm": 0.4657304337869963, + "learning_rate": 8.267708110535449e-06, + "loss": 0.5477, + "step": 642 + }, + { + "epoch": 0.3108532753202804, + "grad_norm": 0.5106373655355231, + "learning_rate": 8.26165344860175e-06, + "loss": 0.571, + "step": 643 + }, + { + "epoch": 0.311336717428088, + "grad_norm": 0.4854760780132044, + "learning_rate": 8.255590449618958e-06, + "loss": 0.546, + "step": 644 + }, + { + "epoch": 0.3118201595358956, + "grad_norm": 0.4817908473273075, + "learning_rate": 8.24951912908459e-06, + "loss": 0.5446, + "step": 645 + }, + { + "epoch": 0.3123036016437032, + "grad_norm": 0.4792564537130554, + "learning_rate": 8.243439502517432e-06, + "loss": 0.5352, + "step": 646 + }, + { + "epoch": 0.31278704375151073, + "grad_norm": 0.5188490831185355, + "learning_rate": 8.237351585457499e-06, + "loss": 0.5298, + "step": 647 + }, + { + "epoch": 0.3132704858593183, + "grad_norm": 0.5232755336111542, + "learning_rate": 8.231255393465993e-06, + "loss": 0.5387, + "step": 648 + }, + { + "epoch": 0.3137539279671259, + "grad_norm": 0.48933101067554713, + "learning_rate": 8.225150942125278e-06, + "loss": 0.5156, + "step": 649 + }, + { + "epoch": 0.3142373700749335, + "grad_norm": 0.47579138598403903, + "learning_rate": 8.21903824703882e-06, + "loss": 0.552, + "step": 650 + }, + { + "epoch": 0.3147208121827411, + "grad_norm": 0.47742859766681844, + "learning_rate": 8.21291732383116e-06, + "loss": 0.5498, + "step": 651 + }, + { + "epoch": 0.3152042542905487, + "grad_norm": 0.5282098462854927, + "learning_rate": 8.206788188147874e-06, + "loss": 0.5327, + "step": 652 + }, + { + "epoch": 0.3156876963983563, + "grad_norm": 0.4655983753785802, + "learning_rate": 8.200650855655525e-06, + "loss": 0.5523, + "step": 653 + }, + { + "epoch": 0.3161711385061639, + "grad_norm": 0.46598993965098007, + "learning_rate": 8.19450534204163e-06, + "loss": 0.5428, + "step": 654 + }, + { + "epoch": 0.3166545806139715, + "grad_norm": 0.48320040727215685, + "learning_rate": 8.188351663014615e-06, + "loss": 0.5511, + "step": 655 + }, + { + "epoch": 0.31713802272177904, + "grad_norm": 0.4851268795547935, + "learning_rate": 8.182189834303783e-06, + "loss": 0.5515, + "step": 656 + }, + { + "epoch": 0.31762146482958664, + "grad_norm": 0.4829311813743368, + "learning_rate": 8.176019871659263e-06, + "loss": 0.5425, + "step": 657 + }, + { + "epoch": 0.31810490693739424, + "grad_norm": 0.4268110510337058, + "learning_rate": 8.169841790851976e-06, + "loss": 0.5192, + "step": 658 + }, + { + "epoch": 0.31858834904520184, + "grad_norm": 0.46970357309915234, + "learning_rate": 8.163655607673594e-06, + "loss": 0.5516, + "step": 659 + }, + { + "epoch": 0.31907179115300943, + "grad_norm": 0.4688205789040297, + "learning_rate": 8.157461337936506e-06, + "loss": 0.5398, + "step": 660 + }, + { + "epoch": 0.31955523326081703, + "grad_norm": 0.49966496418563966, + "learning_rate": 8.151258997473757e-06, + "loss": 0.5501, + "step": 661 + }, + { + "epoch": 0.3200386753686246, + "grad_norm": 0.4715831395525512, + "learning_rate": 8.145048602139031e-06, + "loss": 0.5473, + "step": 662 + }, + { + "epoch": 0.3205221174764322, + "grad_norm": 0.46025918766438206, + "learning_rate": 8.138830167806601e-06, + "loss": 0.5481, + "step": 663 + }, + { + "epoch": 0.32100555958423976, + "grad_norm": 0.49250849769551697, + "learning_rate": 8.132603710371287e-06, + "loss": 0.5563, + "step": 664 + }, + { + "epoch": 0.32148900169204736, + "grad_norm": 0.46277397720994495, + "learning_rate": 8.126369245748413e-06, + "loss": 0.5418, + "step": 665 + }, + { + "epoch": 0.32197244379985496, + "grad_norm": 0.44842320811529324, + "learning_rate": 8.120126789873775e-06, + "loss": 0.549, + "step": 666 + }, + { + "epoch": 0.32245588590766255, + "grad_norm": 0.4487718178782243, + "learning_rate": 8.113876358703593e-06, + "loss": 0.5515, + "step": 667 + }, + { + "epoch": 0.32293932801547015, + "grad_norm": 0.49737040438900676, + "learning_rate": 8.10761796821447e-06, + "loss": 0.5529, + "step": 668 + }, + { + "epoch": 0.32342277012327775, + "grad_norm": 0.5088088437400782, + "learning_rate": 8.10135163440336e-06, + "loss": 0.5507, + "step": 669 + }, + { + "epoch": 0.32390621223108534, + "grad_norm": 0.5221100660415426, + "learning_rate": 8.095077373287517e-06, + "loss": 0.5363, + "step": 670 + }, + { + "epoch": 0.32438965433889294, + "grad_norm": 0.5098038198929602, + "learning_rate": 8.088795200904457e-06, + "loss": 0.5443, + "step": 671 + }, + { + "epoch": 0.3248730964467005, + "grad_norm": 0.5299548080054053, + "learning_rate": 8.08250513331192e-06, + "loss": 0.5547, + "step": 672 + }, + { + "epoch": 0.3253565385545081, + "grad_norm": 0.47991648628747413, + "learning_rate": 8.076207186587826e-06, + "loss": 0.552, + "step": 673 + }, + { + "epoch": 0.3258399806623157, + "grad_norm": 0.4928995313967277, + "learning_rate": 8.069901376830232e-06, + "loss": 0.5449, + "step": 674 + }, + { + "epoch": 0.3263234227701233, + "grad_norm": 0.526245201002504, + "learning_rate": 8.063587720157298e-06, + "loss": 0.5544, + "step": 675 + }, + { + "epoch": 0.32680686487793087, + "grad_norm": 0.5169185895561939, + "learning_rate": 8.057266232707239e-06, + "loss": 0.5388, + "step": 676 + }, + { + "epoch": 0.32729030698573847, + "grad_norm": 0.45862190884382065, + "learning_rate": 8.050936930638285e-06, + "loss": 0.5523, + "step": 677 + }, + { + "epoch": 0.32777374909354606, + "grad_norm": 0.4791194354627634, + "learning_rate": 8.044599830128643e-06, + "loss": 0.5498, + "step": 678 + }, + { + "epoch": 0.32825719120135366, + "grad_norm": 0.5040011739287719, + "learning_rate": 8.038254947376454e-06, + "loss": 0.5378, + "step": 679 + }, + { + "epoch": 0.3287406333091612, + "grad_norm": 0.42346684737245893, + "learning_rate": 8.03190229859975e-06, + "loss": 0.5541, + "step": 680 + }, + { + "epoch": 0.3292240754169688, + "grad_norm": 0.48225697444636256, + "learning_rate": 8.02554190003641e-06, + "loss": 0.5505, + "step": 681 + }, + { + "epoch": 0.3297075175247764, + "grad_norm": 0.480200233217211, + "learning_rate": 8.019173767944128e-06, + "loss": 0.5563, + "step": 682 + }, + { + "epoch": 0.330190959632584, + "grad_norm": 0.4406037883552503, + "learning_rate": 8.012797918600363e-06, + "loss": 0.5241, + "step": 683 + }, + { + "epoch": 0.3306744017403916, + "grad_norm": 0.4838913486529156, + "learning_rate": 8.006414368302297e-06, + "loss": 0.5251, + "step": 684 + }, + { + "epoch": 0.3311578438481992, + "grad_norm": 0.45454190895682295, + "learning_rate": 8.000023133366804e-06, + "loss": 0.5449, + "step": 685 + }, + { + "epoch": 0.3316412859560068, + "grad_norm": 0.49869890620532237, + "learning_rate": 7.99362423013039e-06, + "loss": 0.5401, + "step": 686 + }, + { + "epoch": 0.3321247280638144, + "grad_norm": 0.4727231220514769, + "learning_rate": 7.98721767494917e-06, + "loss": 0.5381, + "step": 687 + }, + { + "epoch": 0.332608170171622, + "grad_norm": 0.46944667758244535, + "learning_rate": 7.980803484198817e-06, + "loss": 0.5542, + "step": 688 + }, + { + "epoch": 0.3330916122794295, + "grad_norm": 0.4643616722232514, + "learning_rate": 7.974381674274517e-06, + "loss": 0.5394, + "step": 689 + }, + { + "epoch": 0.3335750543872371, + "grad_norm": 0.4529493856728362, + "learning_rate": 7.967952261590936e-06, + "loss": 0.5478, + "step": 690 + }, + { + "epoch": 0.3340584964950447, + "grad_norm": 0.4497900124215144, + "learning_rate": 7.961515262582168e-06, + "loss": 0.5387, + "step": 691 + }, + { + "epoch": 0.3345419386028523, + "grad_norm": 0.4613195703294155, + "learning_rate": 7.955070693701704e-06, + "loss": 0.5488, + "step": 692 + }, + { + "epoch": 0.3350253807106599, + "grad_norm": 0.45208853687907335, + "learning_rate": 7.94861857142238e-06, + "loss": 0.5161, + "step": 693 + }, + { + "epoch": 0.3355088228184675, + "grad_norm": 0.45338462953665065, + "learning_rate": 7.942158912236339e-06, + "loss": 0.5504, + "step": 694 + }, + { + "epoch": 0.3359922649262751, + "grad_norm": 0.45784135957705213, + "learning_rate": 7.935691732654995e-06, + "loss": 0.5525, + "step": 695 + }, + { + "epoch": 0.3364757070340827, + "grad_norm": 0.4745455134248678, + "learning_rate": 7.929217049208977e-06, + "loss": 0.5549, + "step": 696 + }, + { + "epoch": 0.33695914914189024, + "grad_norm": 0.46788843343497605, + "learning_rate": 7.922734878448099e-06, + "loss": 0.5543, + "step": 697 + }, + { + "epoch": 0.33744259124969783, + "grad_norm": 0.4894111106267614, + "learning_rate": 7.916245236941311e-06, + "loss": 0.5456, + "step": 698 + }, + { + "epoch": 0.33792603335750543, + "grad_norm": 0.4818527781927651, + "learning_rate": 7.90974814127666e-06, + "loss": 0.5436, + "step": 699 + }, + { + "epoch": 0.338409475465313, + "grad_norm": 0.48230512049955104, + "learning_rate": 7.903243608061246e-06, + "loss": 0.5569, + "step": 700 + }, + { + "epoch": 0.3388929175731206, + "grad_norm": 0.4651013778967097, + "learning_rate": 7.89673165392118e-06, + "loss": 0.5497, + "step": 701 + }, + { + "epoch": 0.3393763596809282, + "grad_norm": 0.5263037891579944, + "learning_rate": 7.890212295501542e-06, + "loss": 0.5489, + "step": 702 + }, + { + "epoch": 0.3398598017887358, + "grad_norm": 0.47525750483933155, + "learning_rate": 7.883685549466337e-06, + "loss": 0.5438, + "step": 703 + }, + { + "epoch": 0.3403432438965434, + "grad_norm": 0.48435256135519467, + "learning_rate": 7.877151432498456e-06, + "loss": 0.5506, + "step": 704 + }, + { + "epoch": 0.34082668600435095, + "grad_norm": 0.49040296450298604, + "learning_rate": 7.870609961299627e-06, + "loss": 0.536, + "step": 705 + }, + { + "epoch": 0.34131012811215855, + "grad_norm": 0.4437135993163076, + "learning_rate": 7.864061152590376e-06, + "loss": 0.5539, + "step": 706 + }, + { + "epoch": 0.34179357021996615, + "grad_norm": 0.48585487486606105, + "learning_rate": 7.857505023109989e-06, + "loss": 0.5461, + "step": 707 + }, + { + "epoch": 0.34227701232777374, + "grad_norm": 0.48202275018795376, + "learning_rate": 7.850941589616458e-06, + "loss": 0.5371, + "step": 708 + }, + { + "epoch": 0.34276045443558134, + "grad_norm": 0.4716406712767161, + "learning_rate": 7.844370868886452e-06, + "loss": 0.5557, + "step": 709 + }, + { + "epoch": 0.34324389654338894, + "grad_norm": 0.49083867550017374, + "learning_rate": 7.83779287771526e-06, + "loss": 0.5459, + "step": 710 + }, + { + "epoch": 0.34372733865119653, + "grad_norm": 0.49576681886311147, + "learning_rate": 7.831207632916757e-06, + "loss": 0.5466, + "step": 711 + }, + { + "epoch": 0.34421078075900413, + "grad_norm": 0.4533195955521626, + "learning_rate": 7.824615151323363e-06, + "loss": 0.519, + "step": 712 + }, + { + "epoch": 0.3446942228668117, + "grad_norm": 0.4638295505748454, + "learning_rate": 7.818015449785987e-06, + "loss": 0.5485, + "step": 713 + }, + { + "epoch": 0.34517766497461927, + "grad_norm": 0.4802273717901249, + "learning_rate": 7.811408545174001e-06, + "loss": 0.5453, + "step": 714 + }, + { + "epoch": 0.34566110708242687, + "grad_norm": 0.4633815927205105, + "learning_rate": 7.804794454375189e-06, + "loss": 0.5504, + "step": 715 + }, + { + "epoch": 0.34614454919023446, + "grad_norm": 0.4455507327132057, + "learning_rate": 7.798173194295693e-06, + "loss": 0.5425, + "step": 716 + }, + { + "epoch": 0.34662799129804206, + "grad_norm": 0.4555461305882042, + "learning_rate": 7.791544781859993e-06, + "loss": 0.5402, + "step": 717 + }, + { + "epoch": 0.34711143340584966, + "grad_norm": 0.4459519768062681, + "learning_rate": 7.784909234010843e-06, + "loss": 0.5448, + "step": 718 + }, + { + "epoch": 0.34759487551365725, + "grad_norm": 0.43036102684437805, + "learning_rate": 7.778266567709239e-06, + "loss": 0.5532, + "step": 719 + }, + { + "epoch": 0.34807831762146485, + "grad_norm": 0.4640780423848208, + "learning_rate": 7.771616799934372e-06, + "loss": 0.5403, + "step": 720 + }, + { + "epoch": 0.34856175972927245, + "grad_norm": 0.4785048230046999, + "learning_rate": 7.764959947683581e-06, + "loss": 0.5484, + "step": 721 + }, + { + "epoch": 0.34904520183708, + "grad_norm": 0.5054523460781126, + "learning_rate": 7.758296027972324e-06, + "loss": 0.5367, + "step": 722 + }, + { + "epoch": 0.3495286439448876, + "grad_norm": 0.46292028395178175, + "learning_rate": 7.751625057834107e-06, + "loss": 0.5221, + "step": 723 + }, + { + "epoch": 0.3500120860526952, + "grad_norm": 0.5080355944834025, + "learning_rate": 7.744947054320475e-06, + "loss": 0.552, + "step": 724 + }, + { + "epoch": 0.3504955281605028, + "grad_norm": 0.4692605361826857, + "learning_rate": 7.73826203450094e-06, + "loss": 0.5516, + "step": 725 + }, + { + "epoch": 0.3509789702683104, + "grad_norm": 0.4452014990295793, + "learning_rate": 7.731570015462953e-06, + "loss": 0.5385, + "step": 726 + }, + { + "epoch": 0.35146241237611797, + "grad_norm": 0.4407238797603078, + "learning_rate": 7.724871014311853e-06, + "loss": 0.5512, + "step": 727 + }, + { + "epoch": 0.35194585448392557, + "grad_norm": 0.4578450461185458, + "learning_rate": 7.718165048170827e-06, + "loss": 0.5436, + "step": 728 + }, + { + "epoch": 0.35242929659173317, + "grad_norm": 0.47134089664050416, + "learning_rate": 7.711452134180865e-06, + "loss": 0.5439, + "step": 729 + }, + { + "epoch": 0.3529127386995407, + "grad_norm": 0.45807802586279717, + "learning_rate": 7.704732289500717e-06, + "loss": 0.535, + "step": 730 + }, + { + "epoch": 0.3533961808073483, + "grad_norm": 0.46923078006226726, + "learning_rate": 7.698005531306844e-06, + "loss": 0.5438, + "step": 731 + }, + { + "epoch": 0.3538796229151559, + "grad_norm": 0.47163216368627525, + "learning_rate": 7.691271876793387e-06, + "loss": 0.5412, + "step": 732 + }, + { + "epoch": 0.3543630650229635, + "grad_norm": 0.43982060964801745, + "learning_rate": 7.684531343172108e-06, + "loss": 0.5326, + "step": 733 + }, + { + "epoch": 0.3548465071307711, + "grad_norm": 0.49898422329655673, + "learning_rate": 7.677783947672352e-06, + "loss": 0.5352, + "step": 734 + }, + { + "epoch": 0.3553299492385787, + "grad_norm": 0.4426035951569431, + "learning_rate": 7.67102970754101e-06, + "loss": 0.5083, + "step": 735 + }, + { + "epoch": 0.3558133913463863, + "grad_norm": 0.473881609856312, + "learning_rate": 7.664268640042459e-06, + "loss": 0.5493, + "step": 736 + }, + { + "epoch": 0.3562968334541939, + "grad_norm": 0.48477702288906854, + "learning_rate": 7.657500762458536e-06, + "loss": 0.5415, + "step": 737 + }, + { + "epoch": 0.3567802755620014, + "grad_norm": 0.4673684560489235, + "learning_rate": 7.65072609208848e-06, + "loss": 0.5402, + "step": 738 + }, + { + "epoch": 0.357263717669809, + "grad_norm": 0.45922228645390506, + "learning_rate": 7.643944646248898e-06, + "loss": 0.5523, + "step": 739 + }, + { + "epoch": 0.3577471597776166, + "grad_norm": 0.5023203702238386, + "learning_rate": 7.637156442273705e-06, + "loss": 0.5472, + "step": 740 + }, + { + "epoch": 0.3582306018854242, + "grad_norm": 0.5158066743775931, + "learning_rate": 7.630361497514104e-06, + "loss": 0.5409, + "step": 741 + }, + { + "epoch": 0.3587140439932318, + "grad_norm": 0.4279389055361383, + "learning_rate": 7.6235598293385184e-06, + "loss": 0.548, + "step": 742 + }, + { + "epoch": 0.3591974861010394, + "grad_norm": 0.48124896416843527, + "learning_rate": 7.616751455132561e-06, + "loss": 0.5061, + "step": 743 + }, + { + "epoch": 0.359680928208847, + "grad_norm": 0.45130820986839676, + "learning_rate": 7.6099363922989845e-06, + "loss": 0.5408, + "step": 744 + }, + { + "epoch": 0.3601643703166546, + "grad_norm": 0.43968032097493187, + "learning_rate": 7.60311465825764e-06, + "loss": 0.5419, + "step": 745 + }, + { + "epoch": 0.36064781242446214, + "grad_norm": 0.4638052394642039, + "learning_rate": 7.596286270445429e-06, + "loss": 0.5474, + "step": 746 + }, + { + "epoch": 0.36113125453226974, + "grad_norm": 0.47215533812036253, + "learning_rate": 7.5894512463162595e-06, + "loss": 0.5481, + "step": 747 + }, + { + "epoch": 0.36161469664007734, + "grad_norm": 0.4910077404120728, + "learning_rate": 7.5826096033410056e-06, + "loss": 0.5483, + "step": 748 + }, + { + "epoch": 0.36209813874788493, + "grad_norm": 0.49435519282302404, + "learning_rate": 7.575761359007459e-06, + "loss": 0.5375, + "step": 749 + }, + { + "epoch": 0.36258158085569253, + "grad_norm": 0.4645080520487796, + "learning_rate": 7.568906530820281e-06, + "loss": 0.5406, + "step": 750 + }, + { + "epoch": 0.36306502296350013, + "grad_norm": 0.48056836362105476, + "learning_rate": 7.562045136300969e-06, + "loss": 0.547, + "step": 751 + }, + { + "epoch": 0.3635484650713077, + "grad_norm": 0.4648855493783244, + "learning_rate": 7.555177192987797e-06, + "loss": 0.5372, + "step": 752 + }, + { + "epoch": 0.3640319071791153, + "grad_norm": 0.43851001194612105, + "learning_rate": 7.5483027184357825e-06, + "loss": 0.5484, + "step": 753 + }, + { + "epoch": 0.3645153492869229, + "grad_norm": 0.4234049796935857, + "learning_rate": 7.541421730216638e-06, + "loss": 0.4914, + "step": 754 + }, + { + "epoch": 0.36499879139473046, + "grad_norm": 0.4886945785128111, + "learning_rate": 7.534534245918723e-06, + "loss": 0.5362, + "step": 755 + }, + { + "epoch": 0.36548223350253806, + "grad_norm": 0.47490196043064764, + "learning_rate": 7.527640283147003e-06, + "loss": 0.5387, + "step": 756 + }, + { + "epoch": 0.36596567561034565, + "grad_norm": 0.4422931000418374, + "learning_rate": 7.520739859523001e-06, + "loss": 0.5334, + "step": 757 + }, + { + "epoch": 0.36644911771815325, + "grad_norm": 0.45103949345201827, + "learning_rate": 7.513832992684758e-06, + "loss": 0.5423, + "step": 758 + }, + { + "epoch": 0.36693255982596085, + "grad_norm": 0.5023542886793314, + "learning_rate": 7.50691970028678e-06, + "loss": 0.5371, + "step": 759 + }, + { + "epoch": 0.36741600193376844, + "grad_norm": 0.48961990962706975, + "learning_rate": 7.500000000000001e-06, + "loss": 0.5602, + "step": 760 + }, + { + "epoch": 0.36789944404157604, + "grad_norm": 0.47244479352550756, + "learning_rate": 7.493073909511732e-06, + "loss": 0.5399, + "step": 761 + }, + { + "epoch": 0.36838288614938364, + "grad_norm": 0.47262373426445514, + "learning_rate": 7.486141446525619e-06, + "loss": 0.5465, + "step": 762 + }, + { + "epoch": 0.3688663282571912, + "grad_norm": 0.47837508137309714, + "learning_rate": 7.479202628761597e-06, + "loss": 0.5412, + "step": 763 + }, + { + "epoch": 0.3693497703649988, + "grad_norm": 0.47771254343171743, + "learning_rate": 7.472257473955841e-06, + "loss": 0.5429, + "step": 764 + }, + { + "epoch": 0.36983321247280637, + "grad_norm": 0.45892920422210776, + "learning_rate": 7.465305999860728e-06, + "loss": 0.5358, + "step": 765 + }, + { + "epoch": 0.37031665458061397, + "grad_norm": 0.4636269912834914, + "learning_rate": 7.4583482242447856e-06, + "loss": 0.528, + "step": 766 + }, + { + "epoch": 0.37080009668842157, + "grad_norm": 0.45196732778688614, + "learning_rate": 7.45138416489265e-06, + "loss": 0.5466, + "step": 767 + }, + { + "epoch": 0.37128353879622916, + "grad_norm": 0.44240214579051484, + "learning_rate": 7.444413839605017e-06, + "loss": 0.5315, + "step": 768 + }, + { + "epoch": 0.37176698090403676, + "grad_norm": 0.45295775865600874, + "learning_rate": 7.437437266198602e-06, + "loss": 0.5443, + "step": 769 + }, + { + "epoch": 0.37225042301184436, + "grad_norm": 0.44966146652009026, + "learning_rate": 7.430454462506085e-06, + "loss": 0.5417, + "step": 770 + }, + { + "epoch": 0.3727338651196519, + "grad_norm": 0.451144144721217, + "learning_rate": 7.423465446376079e-06, + "loss": 0.5389, + "step": 771 + }, + { + "epoch": 0.3732173072274595, + "grad_norm": 0.45620688667035586, + "learning_rate": 7.416470235673069e-06, + "loss": 0.538, + "step": 772 + }, + { + "epoch": 0.3737007493352671, + "grad_norm": 0.44604596469243557, + "learning_rate": 7.40946884827738e-06, + "loss": 0.5293, + "step": 773 + }, + { + "epoch": 0.3741841914430747, + "grad_norm": 0.4476638425696451, + "learning_rate": 7.402461302085121e-06, + "loss": 0.5402, + "step": 774 + }, + { + "epoch": 0.3746676335508823, + "grad_norm": 0.46401364076227924, + "learning_rate": 7.395447615008147e-06, + "loss": 0.5377, + "step": 775 + }, + { + "epoch": 0.3751510756586899, + "grad_norm": 0.46250351228418424, + "learning_rate": 7.388427804974003e-06, + "loss": 0.5455, + "step": 776 + }, + { + "epoch": 0.3756345177664975, + "grad_norm": 0.448294668881327, + "learning_rate": 7.381401889925894e-06, + "loss": 0.5311, + "step": 777 + }, + { + "epoch": 0.3761179598743051, + "grad_norm": 0.4490599419042619, + "learning_rate": 7.374369887822623e-06, + "loss": 0.5416, + "step": 778 + }, + { + "epoch": 0.3766014019821126, + "grad_norm": 0.44904272890455516, + "learning_rate": 7.367331816638554e-06, + "loss": 0.5464, + "step": 779 + }, + { + "epoch": 0.3770848440899202, + "grad_norm": 0.4731428544902919, + "learning_rate": 7.360287694363566e-06, + "loss": 0.5415, + "step": 780 + }, + { + "epoch": 0.3775682861977278, + "grad_norm": 0.4760085542862297, + "learning_rate": 7.353237539002999e-06, + "loss": 0.5388, + "step": 781 + }, + { + "epoch": 0.3780517283055354, + "grad_norm": 0.4817431722421546, + "learning_rate": 7.346181368577624e-06, + "loss": 0.5513, + "step": 782 + }, + { + "epoch": 0.378535170413343, + "grad_norm": 0.462287277146555, + "learning_rate": 7.3391192011235764e-06, + "loss": 0.5393, + "step": 783 + }, + { + "epoch": 0.3790186125211506, + "grad_norm": 0.44812435609118556, + "learning_rate": 7.3320510546923285e-06, + "loss": 0.5509, + "step": 784 + }, + { + "epoch": 0.3795020546289582, + "grad_norm": 0.48025260306275075, + "learning_rate": 7.324976947350631e-06, + "loss": 0.5387, + "step": 785 + }, + { + "epoch": 0.3799854967367658, + "grad_norm": 0.4639545538957294, + "learning_rate": 7.317896897180472e-06, + "loss": 0.5298, + "step": 786 + }, + { + "epoch": 0.3804689388445734, + "grad_norm": 0.46088677266135386, + "learning_rate": 7.31081092227903e-06, + "loss": 0.5371, + "step": 787 + }, + { + "epoch": 0.38095238095238093, + "grad_norm": 0.4446814202000039, + "learning_rate": 7.303719040758631e-06, + "loss": 0.5368, + "step": 788 + }, + { + "epoch": 0.38143582306018853, + "grad_norm": 0.4628164716114684, + "learning_rate": 7.296621270746691e-06, + "loss": 0.5439, + "step": 789 + }, + { + "epoch": 0.3819192651679961, + "grad_norm": 0.46916898249294825, + "learning_rate": 7.289517630385687e-06, + "loss": 0.5188, + "step": 790 + }, + { + "epoch": 0.3824027072758037, + "grad_norm": 0.44187034218765375, + "learning_rate": 7.282408137833093e-06, + "loss": 0.5404, + "step": 791 + }, + { + "epoch": 0.3828861493836113, + "grad_norm": 0.47877844839355055, + "learning_rate": 7.275292811261346e-06, + "loss": 0.5377, + "step": 792 + }, + { + "epoch": 0.3833695914914189, + "grad_norm": 0.46034254680546527, + "learning_rate": 7.268171668857794e-06, + "loss": 0.5489, + "step": 793 + }, + { + "epoch": 0.3838530335992265, + "grad_norm": 0.4557903207271804, + "learning_rate": 7.261044728824652e-06, + "loss": 0.5415, + "step": 794 + }, + { + "epoch": 0.3843364757070341, + "grad_norm": 0.477573699663043, + "learning_rate": 7.253912009378953e-06, + "loss": 0.5526, + "step": 795 + }, + { + "epoch": 0.38481991781484165, + "grad_norm": 0.5200587123977845, + "learning_rate": 7.246773528752501e-06, + "loss": 0.5452, + "step": 796 + }, + { + "epoch": 0.38530335992264925, + "grad_norm": 0.4504279257869106, + "learning_rate": 7.239629305191828e-06, + "loss": 0.528, + "step": 797 + }, + { + "epoch": 0.38578680203045684, + "grad_norm": 0.47332344951633437, + "learning_rate": 7.2324793569581474e-06, + "loss": 0.5413, + "step": 798 + }, + { + "epoch": 0.38627024413826444, + "grad_norm": 0.4658360771399747, + "learning_rate": 7.2253237023273e-06, + "loss": 0.5111, + "step": 799 + }, + { + "epoch": 0.38675368624607204, + "grad_norm": 0.4820288472202763, + "learning_rate": 7.21816235958972e-06, + "loss": 0.5472, + "step": 800 + }, + { + "epoch": 0.38723712835387963, + "grad_norm": 0.4713168204495061, + "learning_rate": 7.210995347050372e-06, + "loss": 0.5441, + "step": 801 + }, + { + "epoch": 0.38772057046168723, + "grad_norm": 0.45204543208032005, + "learning_rate": 7.203822683028721e-06, + "loss": 0.5403, + "step": 802 + }, + { + "epoch": 0.3882040125694948, + "grad_norm": 0.4577372851640406, + "learning_rate": 7.196644385858673e-06, + "loss": 0.5303, + "step": 803 + }, + { + "epoch": 0.38868745467730237, + "grad_norm": 0.4458565390228467, + "learning_rate": 7.189460473888535e-06, + "loss": 0.5453, + "step": 804 + }, + { + "epoch": 0.38917089678510997, + "grad_norm": 0.4735785239686837, + "learning_rate": 7.182270965480963e-06, + "loss": 0.5491, + "step": 805 + }, + { + "epoch": 0.38965433889291756, + "grad_norm": 0.44995709460533084, + "learning_rate": 7.17507587901292e-06, + "loss": 0.5328, + "step": 806 + }, + { + "epoch": 0.39013778100072516, + "grad_norm": 0.45714294309440695, + "learning_rate": 7.167875232875632e-06, + "loss": 0.5401, + "step": 807 + }, + { + "epoch": 0.39062122310853276, + "grad_norm": 0.45625350898261685, + "learning_rate": 7.160669045474524e-06, + "loss": 0.5198, + "step": 808 + }, + { + "epoch": 0.39110466521634035, + "grad_norm": 0.45109568818047574, + "learning_rate": 7.153457335229196e-06, + "loss": 0.5396, + "step": 809 + }, + { + "epoch": 0.39158810732414795, + "grad_norm": 0.4362283889114229, + "learning_rate": 7.146240120573358e-06, + "loss": 0.5421, + "step": 810 + }, + { + "epoch": 0.39207154943195555, + "grad_norm": 0.44454460051068084, + "learning_rate": 7.1390174199547945e-06, + "loss": 0.5495, + "step": 811 + }, + { + "epoch": 0.3925549915397631, + "grad_norm": 0.47028424443890243, + "learning_rate": 7.131789251835309e-06, + "loss": 0.5528, + "step": 812 + }, + { + "epoch": 0.3930384336475707, + "grad_norm": 0.44600311061331627, + "learning_rate": 7.124555634690684e-06, + "loss": 0.546, + "step": 813 + }, + { + "epoch": 0.3935218757553783, + "grad_norm": 0.42127806705345067, + "learning_rate": 7.117316587010625e-06, + "loss": 0.5164, + "step": 814 + }, + { + "epoch": 0.3940053178631859, + "grad_norm": 0.4919804442725305, + "learning_rate": 7.110072127298722e-06, + "loss": 0.5405, + "step": 815 + }, + { + "epoch": 0.3944887599709935, + "grad_norm": 0.433673881020081, + "learning_rate": 7.1028222740724e-06, + "loss": 0.5474, + "step": 816 + }, + { + "epoch": 0.39497220207880107, + "grad_norm": 0.44154650103792475, + "learning_rate": 7.095567045862867e-06, + "loss": 0.537, + "step": 817 + }, + { + "epoch": 0.39545564418660867, + "grad_norm": 0.4862239172154185, + "learning_rate": 7.0883064612150684e-06, + "loss": 0.5418, + "step": 818 + }, + { + "epoch": 0.39593908629441626, + "grad_norm": 0.44482243313717656, + "learning_rate": 7.081040538687649e-06, + "loss": 0.5421, + "step": 819 + }, + { + "epoch": 0.3964225284022238, + "grad_norm": 0.4674091065145529, + "learning_rate": 7.073769296852888e-06, + "loss": 0.5322, + "step": 820 + }, + { + "epoch": 0.3969059705100314, + "grad_norm": 0.42008927529357837, + "learning_rate": 7.066492754296668e-06, + "loss": 0.5021, + "step": 821 + }, + { + "epoch": 0.397389412617839, + "grad_norm": 0.4604995654095531, + "learning_rate": 7.059210929618416e-06, + "loss": 0.5449, + "step": 822 + }, + { + "epoch": 0.3978728547256466, + "grad_norm": 0.45094698267399413, + "learning_rate": 7.051923841431063e-06, + "loss": 0.5376, + "step": 823 + }, + { + "epoch": 0.3983562968334542, + "grad_norm": 0.46562198473083916, + "learning_rate": 7.044631508360996e-06, + "loss": 0.5449, + "step": 824 + }, + { + "epoch": 0.3988397389412618, + "grad_norm": 0.45370053407350325, + "learning_rate": 7.037333949048005e-06, + "loss": 0.5443, + "step": 825 + }, + { + "epoch": 0.3993231810490694, + "grad_norm": 0.45490375066885613, + "learning_rate": 7.03003118214524e-06, + "loss": 0.4994, + "step": 826 + }, + { + "epoch": 0.399806623156877, + "grad_norm": 0.4689025885486327, + "learning_rate": 7.022723226319159e-06, + "loss": 0.5249, + "step": 827 + }, + { + "epoch": 0.4002900652646846, + "grad_norm": 0.4587224558807211, + "learning_rate": 7.0154101002494914e-06, + "loss": 0.531, + "step": 828 + }, + { + "epoch": 0.4007735073724921, + "grad_norm": 0.43840918685603564, + "learning_rate": 7.008091822629172e-06, + "loss": 0.5331, + "step": 829 + }, + { + "epoch": 0.4012569494802997, + "grad_norm": 0.5046588696515812, + "learning_rate": 7.00076841216431e-06, + "loss": 0.5302, + "step": 830 + }, + { + "epoch": 0.4017403915881073, + "grad_norm": 0.485313377425035, + "learning_rate": 6.993439887574133e-06, + "loss": 0.5418, + "step": 831 + }, + { + "epoch": 0.4022238336959149, + "grad_norm": 0.47268076488419086, + "learning_rate": 6.986106267590942e-06, + "loss": 0.535, + "step": 832 + }, + { + "epoch": 0.4027072758037225, + "grad_norm": 0.4528644527398112, + "learning_rate": 6.978767570960057e-06, + "loss": 0.5362, + "step": 833 + }, + { + "epoch": 0.4031907179115301, + "grad_norm": 0.458241227473586, + "learning_rate": 6.971423816439782e-06, + "loss": 0.5346, + "step": 834 + }, + { + "epoch": 0.4036741600193377, + "grad_norm": 0.476721951726036, + "learning_rate": 6.964075022801341e-06, + "loss": 0.541, + "step": 835 + }, + { + "epoch": 0.4041576021271453, + "grad_norm": 0.4634216130492689, + "learning_rate": 6.956721208828847e-06, + "loss": 0.5441, + "step": 836 + }, + { + "epoch": 0.40464104423495284, + "grad_norm": 0.46202808494163927, + "learning_rate": 6.949362393319239e-06, + "loss": 0.5416, + "step": 837 + }, + { + "epoch": 0.40512448634276044, + "grad_norm": 0.44981029617918733, + "learning_rate": 6.941998595082243e-06, + "loss": 0.5438, + "step": 838 + }, + { + "epoch": 0.40560792845056803, + "grad_norm": 0.44835769630421096, + "learning_rate": 6.934629832940322e-06, + "loss": 0.5322, + "step": 839 + }, + { + "epoch": 0.40609137055837563, + "grad_norm": 0.4879526087044361, + "learning_rate": 6.927256125728624e-06, + "loss": 0.544, + "step": 840 + }, + { + "epoch": 0.4065748126661832, + "grad_norm": 0.47319742567660167, + "learning_rate": 6.91987749229494e-06, + "loss": 0.5401, + "step": 841 + }, + { + "epoch": 0.4070582547739908, + "grad_norm": 0.4492955794688954, + "learning_rate": 6.91249395149965e-06, + "loss": 0.5393, + "step": 842 + }, + { + "epoch": 0.4075416968817984, + "grad_norm": 0.4436463500280526, + "learning_rate": 6.905105522215684e-06, + "loss": 0.5384, + "step": 843 + }, + { + "epoch": 0.408025138989606, + "grad_norm": 0.43897712052796284, + "learning_rate": 6.897712223328457e-06, + "loss": 0.5297, + "step": 844 + }, + { + "epoch": 0.40850858109741356, + "grad_norm": 0.5141765277378779, + "learning_rate": 6.89031407373584e-06, + "loss": 0.5386, + "step": 845 + }, + { + "epoch": 0.40899202320522116, + "grad_norm": 0.48662407879743685, + "learning_rate": 6.8829110923481e-06, + "loss": 0.5429, + "step": 846 + }, + { + "epoch": 0.40947546531302875, + "grad_norm": 0.4671606204232888, + "learning_rate": 6.875503298087853e-06, + "loss": 0.5339, + "step": 847 + }, + { + "epoch": 0.40995890742083635, + "grad_norm": 0.47636170298906977, + "learning_rate": 6.868090709890016e-06, + "loss": 0.5392, + "step": 848 + }, + { + "epoch": 0.41044234952864395, + "grad_norm": 0.4249603079226182, + "learning_rate": 6.8606733467017675e-06, + "loss": 0.5046, + "step": 849 + }, + { + "epoch": 0.41092579163645154, + "grad_norm": 0.4669023850804772, + "learning_rate": 6.85325122748248e-06, + "loss": 0.5331, + "step": 850 + }, + { + "epoch": 0.41140923374425914, + "grad_norm": 0.43795275267260886, + "learning_rate": 6.845824371203691e-06, + "loss": 0.5409, + "step": 851 + }, + { + "epoch": 0.41189267585206674, + "grad_norm": 0.44363617551756607, + "learning_rate": 6.838392796849042e-06, + "loss": 0.5371, + "step": 852 + }, + { + "epoch": 0.4123761179598743, + "grad_norm": 0.4451816120752011, + "learning_rate": 6.830956523414239e-06, + "loss": 0.5304, + "step": 853 + }, + { + "epoch": 0.4128595600676819, + "grad_norm": 0.4684744852800341, + "learning_rate": 6.8235155699069944e-06, + "loss": 0.5316, + "step": 854 + }, + { + "epoch": 0.41334300217548947, + "grad_norm": 0.44092290998190986, + "learning_rate": 6.816069955346986e-06, + "loss": 0.5127, + "step": 855 + }, + { + "epoch": 0.41382644428329707, + "grad_norm": 0.49751628964469147, + "learning_rate": 6.808619698765804e-06, + "loss": 0.5459, + "step": 856 + }, + { + "epoch": 0.41430988639110466, + "grad_norm": 0.45953817226939175, + "learning_rate": 6.8011648192069045e-06, + "loss": 0.5316, + "step": 857 + }, + { + "epoch": 0.41479332849891226, + "grad_norm": 0.470694123027967, + "learning_rate": 6.7937053357255585e-06, + "loss": 0.5341, + "step": 858 + }, + { + "epoch": 0.41527677060671986, + "grad_norm": 0.482654546386677, + "learning_rate": 6.786241267388812e-06, + "loss": 0.5392, + "step": 859 + }, + { + "epoch": 0.41576021271452746, + "grad_norm": 0.49418422954918506, + "learning_rate": 6.778772633275421e-06, + "loss": 0.5259, + "step": 860 + }, + { + "epoch": 0.41624365482233505, + "grad_norm": 0.4508128718503279, + "learning_rate": 6.771299452475818e-06, + "loss": 0.5439, + "step": 861 + }, + { + "epoch": 0.4167270969301426, + "grad_norm": 0.483501145929134, + "learning_rate": 6.763821744092054e-06, + "loss": 0.521, + "step": 862 + }, + { + "epoch": 0.4172105390379502, + "grad_norm": 0.45484324344169746, + "learning_rate": 6.756339527237756e-06, + "loss": 0.5282, + "step": 863 + }, + { + "epoch": 0.4176939811457578, + "grad_norm": 0.47800071067084154, + "learning_rate": 6.748852821038075e-06, + "loss": 0.5362, + "step": 864 + }, + { + "epoch": 0.4181774232535654, + "grad_norm": 0.4561384481336352, + "learning_rate": 6.741361644629629e-06, + "loss": 0.5452, + "step": 865 + }, + { + "epoch": 0.418660865361373, + "grad_norm": 0.4974806458018085, + "learning_rate": 6.733866017160475e-06, + "loss": 0.5374, + "step": 866 + }, + { + "epoch": 0.4191443074691806, + "grad_norm": 0.4696802681175673, + "learning_rate": 6.7263659577900375e-06, + "loss": 0.5368, + "step": 867 + }, + { + "epoch": 0.4196277495769882, + "grad_norm": 0.46421987800289705, + "learning_rate": 6.718861485689077e-06, + "loss": 0.5361, + "step": 868 + }, + { + "epoch": 0.42011119168479577, + "grad_norm": 0.42817223127930704, + "learning_rate": 6.711352620039623e-06, + "loss": 0.5132, + "step": 869 + }, + { + "epoch": 0.4205946337926033, + "grad_norm": 0.4191221628297101, + "learning_rate": 6.703839380034945e-06, + "loss": 0.5282, + "step": 870 + }, + { + "epoch": 0.4210780759004109, + "grad_norm": 0.4151086659722137, + "learning_rate": 6.6963217848794895e-06, + "loss": 0.5046, + "step": 871 + }, + { + "epoch": 0.4215615180082185, + "grad_norm": 0.4638906293888461, + "learning_rate": 6.6887998537888354e-06, + "loss": 0.5365, + "step": 872 + }, + { + "epoch": 0.4220449601160261, + "grad_norm": 0.46061668652958593, + "learning_rate": 6.681273605989643e-06, + "loss": 0.5315, + "step": 873 + }, + { + "epoch": 0.4225284022238337, + "grad_norm": 0.5190170389663172, + "learning_rate": 6.673743060719613e-06, + "loss": 0.5328, + "step": 874 + }, + { + "epoch": 0.4230118443316413, + "grad_norm": 0.45460321890251315, + "learning_rate": 6.666208237227421e-06, + "loss": 0.5359, + "step": 875 + }, + { + "epoch": 0.4234952864394489, + "grad_norm": 0.4292973087733905, + "learning_rate": 6.6586691547726855e-06, + "loss": 0.5139, + "step": 876 + }, + { + "epoch": 0.4239787285472565, + "grad_norm": 0.47082103025419264, + "learning_rate": 6.651125832625908e-06, + "loss": 0.5455, + "step": 877 + }, + { + "epoch": 0.42446217065506403, + "grad_norm": 0.5113179606142492, + "learning_rate": 6.6435782900684284e-06, + "loss": 0.5528, + "step": 878 + }, + { + "epoch": 0.4249456127628716, + "grad_norm": 0.48001824284611705, + "learning_rate": 6.636026546392374e-06, + "loss": 0.5391, + "step": 879 + }, + { + "epoch": 0.4254290548706792, + "grad_norm": 0.4190847750909328, + "learning_rate": 6.628470620900611e-06, + "loss": 0.5309, + "step": 880 + }, + { + "epoch": 0.4259124969784868, + "grad_norm": 0.4429086362697282, + "learning_rate": 6.620910532906692e-06, + "loss": 0.5194, + "step": 881 + }, + { + "epoch": 0.4263959390862944, + "grad_norm": 0.4425962079317176, + "learning_rate": 6.613346301734813e-06, + "loss": 0.5132, + "step": 882 + }, + { + "epoch": 0.426879381194102, + "grad_norm": 0.4614396271141222, + "learning_rate": 6.605777946719757e-06, + "loss": 0.5358, + "step": 883 + }, + { + "epoch": 0.4273628233019096, + "grad_norm": 7.793396607210369, + "learning_rate": 6.59820548720685e-06, + "loss": 0.74, + "step": 884 + }, + { + "epoch": 0.4278462654097172, + "grad_norm": 0.47916638143499957, + "learning_rate": 6.590628942551909e-06, + "loss": 0.5401, + "step": 885 + }, + { + "epoch": 0.42832970751752475, + "grad_norm": 0.5010520173856451, + "learning_rate": 6.583048332121193e-06, + "loss": 0.5384, + "step": 886 + }, + { + "epoch": 0.42881314962533235, + "grad_norm": 0.43637840250511184, + "learning_rate": 6.5754636752913535e-06, + "loss": 0.5018, + "step": 887 + }, + { + "epoch": 0.42929659173313994, + "grad_norm": 0.4347273985958766, + "learning_rate": 6.567874991449383e-06, + "loss": 0.5303, + "step": 888 + }, + { + "epoch": 0.42978003384094754, + "grad_norm": 0.470696307591686, + "learning_rate": 6.560282299992571e-06, + "loss": 0.5454, + "step": 889 + }, + { + "epoch": 0.43026347594875514, + "grad_norm": 0.43949603254219816, + "learning_rate": 6.552685620328447e-06, + "loss": 0.5115, + "step": 890 + }, + { + "epoch": 0.43074691805656273, + "grad_norm": 0.5002179505262337, + "learning_rate": 6.545084971874738e-06, + "loss": 0.5386, + "step": 891 + }, + { + "epoch": 0.43123036016437033, + "grad_norm": 0.48898211514461637, + "learning_rate": 6.537480374059313e-06, + "loss": 0.5464, + "step": 892 + }, + { + "epoch": 0.4317138022721779, + "grad_norm": 0.47111181264654595, + "learning_rate": 6.529871846320138e-06, + "loss": 0.5225, + "step": 893 + }, + { + "epoch": 0.4321972443799855, + "grad_norm": 0.47111234243294525, + "learning_rate": 6.522259408105223e-06, + "loss": 0.5363, + "step": 894 + }, + { + "epoch": 0.43268068648779306, + "grad_norm": 0.4867092311804253, + "learning_rate": 6.514643078872571e-06, + "loss": 0.533, + "step": 895 + }, + { + "epoch": 0.43316412859560066, + "grad_norm": 0.4583905211154658, + "learning_rate": 6.507022878090137e-06, + "loss": 0.5428, + "step": 896 + }, + { + "epoch": 0.43364757070340826, + "grad_norm": 0.4950332559469317, + "learning_rate": 6.499398825235767e-06, + "loss": 0.5337, + "step": 897 + }, + { + "epoch": 0.43413101281121586, + "grad_norm": 0.46856791974797646, + "learning_rate": 6.491770939797152e-06, + "loss": 0.5323, + "step": 898 + }, + { + "epoch": 0.43461445491902345, + "grad_norm": 0.4646364739679311, + "learning_rate": 6.4841392412717864e-06, + "loss": 0.5407, + "step": 899 + }, + { + "epoch": 0.43509789702683105, + "grad_norm": 0.43558921933796657, + "learning_rate": 6.476503749166903e-06, + "loss": 0.5347, + "step": 900 + }, + { + "epoch": 0.43558133913463865, + "grad_norm": 0.4961569425990492, + "learning_rate": 6.4688644829994385e-06, + "loss": 0.5295, + "step": 901 + }, + { + "epoch": 0.43606478124244624, + "grad_norm": 0.4698493360586227, + "learning_rate": 6.4612214622959705e-06, + "loss": 0.5457, + "step": 902 + }, + { + "epoch": 0.4365482233502538, + "grad_norm": 0.435939008717009, + "learning_rate": 6.453574706592676e-06, + "loss": 0.521, + "step": 903 + }, + { + "epoch": 0.4370316654580614, + "grad_norm": 0.5347621942726917, + "learning_rate": 6.44592423543528e-06, + "loss": 0.5354, + "step": 904 + }, + { + "epoch": 0.437515107565869, + "grad_norm": 0.49607229965381217, + "learning_rate": 6.4382700683790025e-06, + "loss": 0.544, + "step": 905 + }, + { + "epoch": 0.4379985496736766, + "grad_norm": 0.4645864217211478, + "learning_rate": 6.4306122249885105e-06, + "loss": 0.5192, + "step": 906 + }, + { + "epoch": 0.43848199178148417, + "grad_norm": 0.46522668277788565, + "learning_rate": 6.422950724837872e-06, + "loss": 0.526, + "step": 907 + }, + { + "epoch": 0.43896543388929177, + "grad_norm": 0.46658657241667495, + "learning_rate": 6.415285587510495e-06, + "loss": 0.5088, + "step": 908 + }, + { + "epoch": 0.43944887599709936, + "grad_norm": 0.49352648435992785, + "learning_rate": 6.407616832599091e-06, + "loss": 0.5291, + "step": 909 + }, + { + "epoch": 0.43993231810490696, + "grad_norm": 0.5022475233173346, + "learning_rate": 6.399944479705615e-06, + "loss": 0.5349, + "step": 910 + }, + { + "epoch": 0.4404157602127145, + "grad_norm": 0.4645078975290342, + "learning_rate": 6.392268548441218e-06, + "loss": 0.5356, + "step": 911 + }, + { + "epoch": 0.4408992023205221, + "grad_norm": 0.4656283146754065, + "learning_rate": 6.384589058426201e-06, + "loss": 0.5297, + "step": 912 + }, + { + "epoch": 0.4413826444283297, + "grad_norm": 0.4778447743673107, + "learning_rate": 6.3769060292899585e-06, + "loss": 0.531, + "step": 913 + }, + { + "epoch": 0.4418660865361373, + "grad_norm": 0.5067362493290994, + "learning_rate": 6.3692194806709326e-06, + "loss": 0.5266, + "step": 914 + }, + { + "epoch": 0.4423495286439449, + "grad_norm": 0.4583050351500669, + "learning_rate": 6.36152943221656e-06, + "loss": 0.5068, + "step": 915 + }, + { + "epoch": 0.4428329707517525, + "grad_norm": 0.44674303828563183, + "learning_rate": 6.353835903583225e-06, + "loss": 0.5135, + "step": 916 + }, + { + "epoch": 0.4433164128595601, + "grad_norm": 0.4845057402223313, + "learning_rate": 6.346138914436207e-06, + "loss": 0.53, + "step": 917 + }, + { + "epoch": 0.4437998549673677, + "grad_norm": 0.46927160898270703, + "learning_rate": 6.338438484449632e-06, + "loss": 0.5282, + "step": 918 + }, + { + "epoch": 0.4442832970751752, + "grad_norm": 0.46844884470128584, + "learning_rate": 6.330734633306415e-06, + "loss": 0.5205, + "step": 919 + }, + { + "epoch": 0.4447667391829828, + "grad_norm": 0.46793830654029384, + "learning_rate": 6.3230273806982254e-06, + "loss": 0.5354, + "step": 920 + }, + { + "epoch": 0.4452501812907904, + "grad_norm": 0.46904655646766624, + "learning_rate": 6.31531674632542e-06, + "loss": 0.5403, + "step": 921 + }, + { + "epoch": 0.445733623398598, + "grad_norm": 0.4797829974325548, + "learning_rate": 6.307602749897001e-06, + "loss": 0.5285, + "step": 922 + }, + { + "epoch": 0.4462170655064056, + "grad_norm": 0.5126549879385887, + "learning_rate": 6.299885411130566e-06, + "loss": 0.5293, + "step": 923 + }, + { + "epoch": 0.4467005076142132, + "grad_norm": 0.47873452298166375, + "learning_rate": 6.292164749752256e-06, + "loss": 0.5358, + "step": 924 + }, + { + "epoch": 0.4471839497220208, + "grad_norm": 0.4697820734594197, + "learning_rate": 6.284440785496701e-06, + "loss": 0.5303, + "step": 925 + }, + { + "epoch": 0.4476673918298284, + "grad_norm": 0.48049048675766015, + "learning_rate": 6.27671353810698e-06, + "loss": 0.5384, + "step": 926 + }, + { + "epoch": 0.448150833937636, + "grad_norm": 0.44829305517954404, + "learning_rate": 6.268983027334557e-06, + "loss": 0.5349, + "step": 927 + }, + { + "epoch": 0.44863427604544354, + "grad_norm": 0.47046261605559686, + "learning_rate": 6.2612492729392396e-06, + "loss": 0.5445, + "step": 928 + }, + { + "epoch": 0.44911771815325113, + "grad_norm": 0.43943250727373434, + "learning_rate": 6.25351229468913e-06, + "loss": 0.5243, + "step": 929 + }, + { + "epoch": 0.44960116026105873, + "grad_norm": 0.457243800658822, + "learning_rate": 6.245772112360568e-06, + "loss": 0.5335, + "step": 930 + }, + { + "epoch": 0.4500846023688663, + "grad_norm": 0.4558342802943274, + "learning_rate": 6.2380287457380814e-06, + "loss": 0.5295, + "step": 931 + }, + { + "epoch": 0.4505680444766739, + "grad_norm": 0.46490304474994104, + "learning_rate": 6.230282214614342e-06, + "loss": 0.5277, + "step": 932 + }, + { + "epoch": 0.4510514865844815, + "grad_norm": 0.46808167357734604, + "learning_rate": 6.222532538790107e-06, + "loss": 0.5038, + "step": 933 + }, + { + "epoch": 0.4515349286922891, + "grad_norm": 0.44746189473984266, + "learning_rate": 6.214779738074169e-06, + "loss": 0.5482, + "step": 934 + }, + { + "epoch": 0.4520183708000967, + "grad_norm": 0.4749348946440431, + "learning_rate": 6.2070238322833165e-06, + "loss": 0.5408, + "step": 935 + }, + { + "epoch": 0.45250181290790426, + "grad_norm": 0.4105832023778957, + "learning_rate": 6.199264841242267e-06, + "loss": 0.507, + "step": 936 + }, + { + "epoch": 0.45298525501571185, + "grad_norm": 0.45569820188618954, + "learning_rate": 6.191502784783627e-06, + "loss": 0.5361, + "step": 937 + }, + { + "epoch": 0.45346869712351945, + "grad_norm": 0.4487391558411953, + "learning_rate": 6.183737682747839e-06, + "loss": 0.5404, + "step": 938 + }, + { + "epoch": 0.45395213923132705, + "grad_norm": 0.38508572249224443, + "learning_rate": 6.17596955498313e-06, + "loss": 0.475, + "step": 939 + }, + { + "epoch": 0.45443558133913464, + "grad_norm": 0.47287816273000344, + "learning_rate": 6.16819842134546e-06, + "loss": 0.5293, + "step": 940 + }, + { + "epoch": 0.45491902344694224, + "grad_norm": 0.4650031849319042, + "learning_rate": 6.160424301698472e-06, + "loss": 0.5315, + "step": 941 + }, + { + "epoch": 0.45540246555474984, + "grad_norm": 0.4613271364601288, + "learning_rate": 6.1526472159134454e-06, + "loss": 0.5398, + "step": 942 + }, + { + "epoch": 0.45588590766255743, + "grad_norm": 0.42626048007656836, + "learning_rate": 6.1448671838692365e-06, + "loss": 0.5246, + "step": 943 + }, + { + "epoch": 0.456369349770365, + "grad_norm": 0.45709658015590027, + "learning_rate": 6.1370842254522325e-06, + "loss": 0.5392, + "step": 944 + }, + { + "epoch": 0.45685279187817257, + "grad_norm": 0.4969316742066114, + "learning_rate": 6.129298360556304e-06, + "loss": 0.5216, + "step": 945 + }, + { + "epoch": 0.45733623398598017, + "grad_norm": 0.48874038272824616, + "learning_rate": 6.1215096090827485e-06, + "loss": 0.5341, + "step": 946 + }, + { + "epoch": 0.45781967609378776, + "grad_norm": 0.43416511670164404, + "learning_rate": 6.1137179909402445e-06, + "loss": 0.5357, + "step": 947 + }, + { + "epoch": 0.45830311820159536, + "grad_norm": 0.4398289245515299, + "learning_rate": 6.105923526044794e-06, + "loss": 0.5312, + "step": 948 + }, + { + "epoch": 0.45878656030940296, + "grad_norm": 0.506729531362168, + "learning_rate": 6.098126234319679e-06, + "loss": 0.5164, + "step": 949 + }, + { + "epoch": 0.45927000241721055, + "grad_norm": 0.49661020376021475, + "learning_rate": 6.0903261356954035e-06, + "loss": 0.5406, + "step": 950 + }, + { + "epoch": 0.45975344452501815, + "grad_norm": 0.49213602324131056, + "learning_rate": 6.08252325010965e-06, + "loss": 0.5369, + "step": 951 + }, + { + "epoch": 0.4602368866328257, + "grad_norm": 0.4522403269153531, + "learning_rate": 6.074717597507223e-06, + "loss": 0.5318, + "step": 952 + }, + { + "epoch": 0.4607203287406333, + "grad_norm": 0.46334490270574613, + "learning_rate": 6.066909197839996e-06, + "loss": 0.5053, + "step": 953 + }, + { + "epoch": 0.4612037708484409, + "grad_norm": 0.4555176211572043, + "learning_rate": 6.059098071066874e-06, + "loss": 0.5313, + "step": 954 + }, + { + "epoch": 0.4616872129562485, + "grad_norm": 0.4697386359927019, + "learning_rate": 6.051284237153723e-06, + "loss": 0.5304, + "step": 955 + }, + { + "epoch": 0.4621706550640561, + "grad_norm": 0.47606054578122203, + "learning_rate": 6.043467716073333e-06, + "loss": 0.5392, + "step": 956 + }, + { + "epoch": 0.4626540971718637, + "grad_norm": 0.4504863880612236, + "learning_rate": 6.035648527805359e-06, + "loss": 0.5333, + "step": 957 + }, + { + "epoch": 0.4631375392796713, + "grad_norm": 0.47075391826579777, + "learning_rate": 6.0278266923362805e-06, + "loss": 0.5331, + "step": 958 + }, + { + "epoch": 0.46362098138747887, + "grad_norm": 0.4403281566352786, + "learning_rate": 6.0200022296593375e-06, + "loss": 0.5432, + "step": 959 + }, + { + "epoch": 0.46410442349528647, + "grad_norm": 0.4572023392519702, + "learning_rate": 6.012175159774488e-06, + "loss": 0.5323, + "step": 960 + }, + { + "epoch": 0.464587865603094, + "grad_norm": 0.44773102967818523, + "learning_rate": 6.004345502688353e-06, + "loss": 0.5299, + "step": 961 + }, + { + "epoch": 0.4650713077109016, + "grad_norm": 0.4371461582728146, + "learning_rate": 5.996513278414166e-06, + "loss": 0.5385, + "step": 962 + }, + { + "epoch": 0.4655547498187092, + "grad_norm": 0.42182149099377253, + "learning_rate": 5.988678506971726e-06, + "loss": 0.5303, + "step": 963 + }, + { + "epoch": 0.4660381919265168, + "grad_norm": 0.4594942911793403, + "learning_rate": 5.980841208387338e-06, + "loss": 0.503, + "step": 964 + }, + { + "epoch": 0.4665216340343244, + "grad_norm": 0.4501324492723119, + "learning_rate": 5.973001402693769e-06, + "loss": 0.5253, + "step": 965 + }, + { + "epoch": 0.467005076142132, + "grad_norm": 0.4373460975849127, + "learning_rate": 5.965159109930196e-06, + "loss": 0.5386, + "step": 966 + }, + { + "epoch": 0.4674885182499396, + "grad_norm": 0.4418857621109752, + "learning_rate": 5.957314350142149e-06, + "loss": 0.529, + "step": 967 + }, + { + "epoch": 0.4679719603577472, + "grad_norm": 0.485651415733174, + "learning_rate": 5.94946714338147e-06, + "loss": 0.538, + "step": 968 + }, + { + "epoch": 0.4684554024655547, + "grad_norm": 0.44827050412474007, + "learning_rate": 5.941617509706247e-06, + "loss": 0.5333, + "step": 969 + }, + { + "epoch": 0.4689388445733623, + "grad_norm": 0.44674045160443804, + "learning_rate": 5.933765469180779e-06, + "loss": 0.5329, + "step": 970 + }, + { + "epoch": 0.4694222866811699, + "grad_norm": 0.47047904115025807, + "learning_rate": 5.925911041875514e-06, + "loss": 0.5304, + "step": 971 + }, + { + "epoch": 0.4699057287889775, + "grad_norm": 0.47618192657266745, + "learning_rate": 5.9180542478670025e-06, + "loss": 0.5339, + "step": 972 + }, + { + "epoch": 0.4703891708967851, + "grad_norm": 0.4372302817189476, + "learning_rate": 5.910195107237842e-06, + "loss": 0.5311, + "step": 973 + }, + { + "epoch": 0.4708726130045927, + "grad_norm": 0.4599705922186888, + "learning_rate": 5.902333640076627e-06, + "loss": 0.5294, + "step": 974 + }, + { + "epoch": 0.4713560551124003, + "grad_norm": 0.4668491954105507, + "learning_rate": 5.894469866477905e-06, + "loss": 0.5319, + "step": 975 + }, + { + "epoch": 0.4718394972202079, + "grad_norm": 0.4298713287991765, + "learning_rate": 5.886603806542114e-06, + "loss": 0.5308, + "step": 976 + }, + { + "epoch": 0.47232293932801545, + "grad_norm": 0.44120727064086296, + "learning_rate": 5.878735480375537e-06, + "loss": 0.5271, + "step": 977 + }, + { + "epoch": 0.47280638143582304, + "grad_norm": 0.45517085979612587, + "learning_rate": 5.87086490809025e-06, + "loss": 0.5363, + "step": 978 + }, + { + "epoch": 0.47328982354363064, + "grad_norm": 0.41718667843710106, + "learning_rate": 5.862992109804071e-06, + "loss": 0.5208, + "step": 979 + }, + { + "epoch": 0.47377326565143824, + "grad_norm": 0.4698208514148652, + "learning_rate": 5.855117105640503e-06, + "loss": 0.5045, + "step": 980 + }, + { + "epoch": 0.47425670775924583, + "grad_norm": 0.44330430106357593, + "learning_rate": 5.847239915728695e-06, + "loss": 0.5213, + "step": 981 + }, + { + "epoch": 0.47474014986705343, + "grad_norm": 0.46348151704515983, + "learning_rate": 5.839360560203379e-06, + "loss": 0.5416, + "step": 982 + }, + { + "epoch": 0.475223591974861, + "grad_norm": 0.4369134554030048, + "learning_rate": 5.831479059204822e-06, + "loss": 0.5397, + "step": 983 + }, + { + "epoch": 0.4757070340826686, + "grad_norm": 0.434332595795343, + "learning_rate": 5.823595432878775e-06, + "loss": 0.5253, + "step": 984 + }, + { + "epoch": 0.47619047619047616, + "grad_norm": 0.45066877191979876, + "learning_rate": 5.815709701376424e-06, + "loss": 0.501, + "step": 985 + }, + { + "epoch": 0.47667391829828376, + "grad_norm": 0.43680722385313575, + "learning_rate": 5.8078218848543326e-06, + "loss": 0.5425, + "step": 986 + }, + { + "epoch": 0.47715736040609136, + "grad_norm": 0.462367177083934, + "learning_rate": 5.799932003474398e-06, + "loss": 0.5293, + "step": 987 + }, + { + "epoch": 0.47764080251389895, + "grad_norm": 0.46046005650848004, + "learning_rate": 5.7920400774037884e-06, + "loss": 0.5273, + "step": 988 + }, + { + "epoch": 0.47812424462170655, + "grad_norm": 0.4325701614014005, + "learning_rate": 5.784146126814909e-06, + "loss": 0.523, + "step": 989 + }, + { + "epoch": 0.47860768672951415, + "grad_norm": 0.42688145845821907, + "learning_rate": 5.776250171885329e-06, + "loss": 0.5289, + "step": 990 + }, + { + "epoch": 0.47909112883732174, + "grad_norm": 0.4553909518433745, + "learning_rate": 5.768352232797748e-06, + "loss": 0.5354, + "step": 991 + }, + { + "epoch": 0.47957457094512934, + "grad_norm": 0.4612709888611227, + "learning_rate": 5.760452329739933e-06, + "loss": 0.5346, + "step": 992 + }, + { + "epoch": 0.48005801305293694, + "grad_norm": 0.4671180697912862, + "learning_rate": 5.752550482904674e-06, + "loss": 0.5381, + "step": 993 + }, + { + "epoch": 0.4805414551607445, + "grad_norm": 0.44306247126150655, + "learning_rate": 5.744646712489729e-06, + "loss": 0.5347, + "step": 994 + }, + { + "epoch": 0.4810248972685521, + "grad_norm": 0.47845590337101257, + "learning_rate": 5.736741038697771e-06, + "loss": 0.546, + "step": 995 + }, + { + "epoch": 0.4815083393763597, + "grad_norm": 0.42271596693767594, + "learning_rate": 5.728833481736339e-06, + "loss": 0.5189, + "step": 996 + }, + { + "epoch": 0.48199178148416727, + "grad_norm": 0.4538365308944498, + "learning_rate": 5.720924061817786e-06, + "loss": 0.5405, + "step": 997 + }, + { + "epoch": 0.48247522359197487, + "grad_norm": 0.4492861025701022, + "learning_rate": 5.71301279915923e-06, + "loss": 0.5317, + "step": 998 + }, + { + "epoch": 0.48295866569978246, + "grad_norm": 0.4745326397538611, + "learning_rate": 5.705099713982491e-06, + "loss": 0.532, + "step": 999 + }, + { + "epoch": 0.48344210780759006, + "grad_norm": 0.44214626511814337, + "learning_rate": 5.697184826514058e-06, + "loss": 0.5305, + "step": 1000 + }, + { + "epoch": 0.48392554991539766, + "grad_norm": 0.46953984488621225, + "learning_rate": 5.689268156985015e-06, + "loss": 0.5385, + "step": 1001 + }, + { + "epoch": 0.4844089920232052, + "grad_norm": 0.421787366978404, + "learning_rate": 5.6813497256310124e-06, + "loss": 0.5468, + "step": 1002 + }, + { + "epoch": 0.4848924341310128, + "grad_norm": 0.44994209423666665, + "learning_rate": 5.673429552692196e-06, + "loss": 0.5259, + "step": 1003 + }, + { + "epoch": 0.4853758762388204, + "grad_norm": 0.4777649344533823, + "learning_rate": 5.66550765841317e-06, + "loss": 0.5312, + "step": 1004 + }, + { + "epoch": 0.485859318346628, + "grad_norm": 0.4450135880363142, + "learning_rate": 5.6575840630429295e-06, + "loss": 0.5234, + "step": 1005 + }, + { + "epoch": 0.4863427604544356, + "grad_norm": 0.48478431421225965, + "learning_rate": 5.649658786834825e-06, + "loss": 0.5337, + "step": 1006 + }, + { + "epoch": 0.4868262025622432, + "grad_norm": 0.4600376201705413, + "learning_rate": 5.641731850046503e-06, + "loss": 0.5292, + "step": 1007 + }, + { + "epoch": 0.4873096446700508, + "grad_norm": 0.46358685580492737, + "learning_rate": 5.633803272939851e-06, + "loss": 0.5033, + "step": 1008 + }, + { + "epoch": 0.4877930867778584, + "grad_norm": 0.4424655348417247, + "learning_rate": 5.62587307578095e-06, + "loss": 0.5199, + "step": 1009 + }, + { + "epoch": 0.4882765288856659, + "grad_norm": 0.44958190643210255, + "learning_rate": 5.6179412788400255e-06, + "loss": 0.5285, + "step": 1010 + }, + { + "epoch": 0.4887599709934735, + "grad_norm": 0.46154354908053036, + "learning_rate": 5.610007902391387e-06, + "loss": 0.5302, + "step": 1011 + }, + { + "epoch": 0.4892434131012811, + "grad_norm": 0.4787830389554928, + "learning_rate": 5.602072966713389e-06, + "loss": 0.5319, + "step": 1012 + }, + { + "epoch": 0.4897268552090887, + "grad_norm": 0.46247126201641375, + "learning_rate": 5.594136492088363e-06, + "loss": 0.533, + "step": 1013 + }, + { + "epoch": 0.4902102973168963, + "grad_norm": 0.4497969101747785, + "learning_rate": 5.586198498802577e-06, + "loss": 0.5207, + "step": 1014 + }, + { + "epoch": 0.4906937394247039, + "grad_norm": 0.4711750674532031, + "learning_rate": 5.578259007146183e-06, + "loss": 0.5182, + "step": 1015 + }, + { + "epoch": 0.4911771815325115, + "grad_norm": 0.44280948751548693, + "learning_rate": 5.570318037413162e-06, + "loss": 0.5335, + "step": 1016 + }, + { + "epoch": 0.4916606236403191, + "grad_norm": 0.4140593593091692, + "learning_rate": 5.562375609901273e-06, + "loss": 0.498, + "step": 1017 + }, + { + "epoch": 0.49214406574812664, + "grad_norm": 0.5035482968527856, + "learning_rate": 5.5544317449119975e-06, + "loss": 0.535, + "step": 1018 + }, + { + "epoch": 0.49262750785593423, + "grad_norm": 0.470430802340302, + "learning_rate": 5.546486462750499e-06, + "loss": 0.529, + "step": 1019 + }, + { + "epoch": 0.49311094996374183, + "grad_norm": 0.4614608636774156, + "learning_rate": 5.538539783725556e-06, + "loss": 0.5415, + "step": 1020 + }, + { + "epoch": 0.4935943920715494, + "grad_norm": 0.43439106514800707, + "learning_rate": 5.530591728149522e-06, + "loss": 0.5237, + "step": 1021 + }, + { + "epoch": 0.494077834179357, + "grad_norm": 0.4532581332185661, + "learning_rate": 5.522642316338268e-06, + "loss": 0.5275, + "step": 1022 + }, + { + "epoch": 0.4945612762871646, + "grad_norm": 0.44400011265861933, + "learning_rate": 5.51469156861113e-06, + "loss": 0.5279, + "step": 1023 + }, + { + "epoch": 0.4950447183949722, + "grad_norm": 0.464713276215896, + "learning_rate": 5.50673950529086e-06, + "loss": 0.5261, + "step": 1024 + }, + { + "epoch": 0.4955281605027798, + "grad_norm": 0.46801515065857147, + "learning_rate": 5.498786146703575e-06, + "loss": 0.5392, + "step": 1025 + }, + { + "epoch": 0.4960116026105874, + "grad_norm": 0.43826397020984875, + "learning_rate": 5.490831513178698e-06, + "loss": 0.513, + "step": 1026 + }, + { + "epoch": 0.49649504471839495, + "grad_norm": 0.4545609138302334, + "learning_rate": 5.482875625048916e-06, + "loss": 0.5342, + "step": 1027 + }, + { + "epoch": 0.49697848682620255, + "grad_norm": 0.45528220157554766, + "learning_rate": 5.474918502650116e-06, + "loss": 0.5371, + "step": 1028 + }, + { + "epoch": 0.49746192893401014, + "grad_norm": 0.4918245291291218, + "learning_rate": 5.466960166321348e-06, + "loss": 0.5248, + "step": 1029 + }, + { + "epoch": 0.49794537104181774, + "grad_norm": 0.43603238272665606, + "learning_rate": 5.459000636404759e-06, + "loss": 0.5236, + "step": 1030 + }, + { + "epoch": 0.49842881314962534, + "grad_norm": 0.43802907063700564, + "learning_rate": 5.451039933245551e-06, + "loss": 0.5342, + "step": 1031 + }, + { + "epoch": 0.49891225525743294, + "grad_norm": 0.4660487433831574, + "learning_rate": 5.44307807719192e-06, + "loss": 0.5249, + "step": 1032 + }, + { + "epoch": 0.49939569736524053, + "grad_norm": 0.4349479566561577, + "learning_rate": 5.435115088595016e-06, + "loss": 0.4997, + "step": 1033 + }, + { + "epoch": 0.49987913947304813, + "grad_norm": 0.43592525172206065, + "learning_rate": 5.4271509878088755e-06, + "loss": 0.5263, + "step": 1034 + }, + { + "epoch": 0.5003625815808557, + "grad_norm": 0.42380458268711035, + "learning_rate": 5.4191857951903825e-06, + "loss": 0.503, + "step": 1035 + }, + { + "epoch": 0.5008460236886633, + "grad_norm": 0.47347721204466, + "learning_rate": 5.4112195310992144e-06, + "loss": 0.5228, + "step": 1036 + }, + { + "epoch": 0.5013294657964709, + "grad_norm": 0.4523581952116975, + "learning_rate": 5.403252215897781e-06, + "loss": 0.5295, + "step": 1037 + }, + { + "epoch": 0.5018129079042785, + "grad_norm": 0.44718772307460525, + "learning_rate": 5.395283869951184e-06, + "loss": 0.5402, + "step": 1038 + }, + { + "epoch": 0.5022963500120861, + "grad_norm": 0.42574725470561453, + "learning_rate": 5.387314513627156e-06, + "loss": 0.5228, + "step": 1039 + }, + { + "epoch": 0.5027797921198937, + "grad_norm": 0.4816073830916942, + "learning_rate": 5.379344167296017e-06, + "loss": 0.5302, + "step": 1040 + }, + { + "epoch": 0.5032632342277013, + "grad_norm": 0.454535172796951, + "learning_rate": 5.371372851330612e-06, + "loss": 0.5337, + "step": 1041 + }, + { + "epoch": 0.5037466763355088, + "grad_norm": 0.44304402033518747, + "learning_rate": 5.3634005861062675e-06, + "loss": 0.5348, + "step": 1042 + }, + { + "epoch": 0.5042301184433164, + "grad_norm": 0.4647077624164304, + "learning_rate": 5.355427392000736e-06, + "loss": 0.5367, + "step": 1043 + }, + { + "epoch": 0.504713560551124, + "grad_norm": 0.47204921033701974, + "learning_rate": 5.347453289394146e-06, + "loss": 0.5236, + "step": 1044 + }, + { + "epoch": 0.5051970026589316, + "grad_norm": 0.444269913904137, + "learning_rate": 5.339478298668943e-06, + "loss": 0.5374, + "step": 1045 + }, + { + "epoch": 0.5056804447667392, + "grad_norm": 0.437749383717649, + "learning_rate": 5.331502440209849e-06, + "loss": 0.529, + "step": 1046 + }, + { + "epoch": 0.5061638868745467, + "grad_norm": 0.44850371734571, + "learning_rate": 5.3235257344037996e-06, + "loss": 0.5363, + "step": 1047 + }, + { + "epoch": 0.5066473289823543, + "grad_norm": 0.4687290830806752, + "learning_rate": 5.3155482016398995e-06, + "loss": 0.5335, + "step": 1048 + }, + { + "epoch": 0.5071307710901619, + "grad_norm": 0.43863314490483407, + "learning_rate": 5.307569862309363e-06, + "loss": 0.5269, + "step": 1049 + }, + { + "epoch": 0.5076142131979695, + "grad_norm": 0.44499358455418236, + "learning_rate": 5.29959073680547e-06, + "loss": 0.5269, + "step": 1050 + }, + { + "epoch": 0.5080976553057771, + "grad_norm": 0.4443034689772289, + "learning_rate": 5.2916108455235084e-06, + "loss": 0.5247, + "step": 1051 + }, + { + "epoch": 0.5085810974135847, + "grad_norm": 0.41196428985814043, + "learning_rate": 5.2836302088607235e-06, + "loss": 0.5257, + "step": 1052 + }, + { + "epoch": 0.5090645395213923, + "grad_norm": 0.4196111969098687, + "learning_rate": 5.275648847216263e-06, + "loss": 0.5326, + "step": 1053 + }, + { + "epoch": 0.5095479816291999, + "grad_norm": 0.4396027925039716, + "learning_rate": 5.267666780991135e-06, + "loss": 0.5384, + "step": 1054 + }, + { + "epoch": 0.5100314237370075, + "grad_norm": 0.4918850008899013, + "learning_rate": 5.259684030588141e-06, + "loss": 0.5217, + "step": 1055 + }, + { + "epoch": 0.5105148658448151, + "grad_norm": 0.4636011028874602, + "learning_rate": 5.251700616411836e-06, + "loss": 0.5292, + "step": 1056 + }, + { + "epoch": 0.5109983079526227, + "grad_norm": 0.45080015588791633, + "learning_rate": 5.243716558868469e-06, + "loss": 0.5335, + "step": 1057 + }, + { + "epoch": 0.5114817500604303, + "grad_norm": 0.4430606358386943, + "learning_rate": 5.235731878365935e-06, + "loss": 0.5366, + "step": 1058 + }, + { + "epoch": 0.5119651921682379, + "grad_norm": 0.4765633831252005, + "learning_rate": 5.22774659531372e-06, + "loss": 0.5343, + "step": 1059 + }, + { + "epoch": 0.5124486342760455, + "grad_norm": 0.47759199812886083, + "learning_rate": 5.219760730122854e-06, + "loss": 0.5318, + "step": 1060 + }, + { + "epoch": 0.5129320763838531, + "grad_norm": 0.4277450357502153, + "learning_rate": 5.211774303205849e-06, + "loss": 0.5055, + "step": 1061 + }, + { + "epoch": 0.5134155184916607, + "grad_norm": 0.42824551582926373, + "learning_rate": 5.203787334976655e-06, + "loss": 0.5015, + "step": 1062 + }, + { + "epoch": 0.5138989605994683, + "grad_norm": 0.5041405711353805, + "learning_rate": 5.195799845850611e-06, + "loss": 0.525, + "step": 1063 + }, + { + "epoch": 0.5143824027072758, + "grad_norm": 0.4306413852615145, + "learning_rate": 5.18781185624438e-06, + "loss": 0.5265, + "step": 1064 + }, + { + "epoch": 0.5148658448150834, + "grad_norm": 0.4361432944536921, + "learning_rate": 5.179823386575908e-06, + "loss": 0.5311, + "step": 1065 + }, + { + "epoch": 0.515349286922891, + "grad_norm": 0.4240690760700838, + "learning_rate": 5.171834457264364e-06, + "loss": 0.5286, + "step": 1066 + }, + { + "epoch": 0.5158327290306985, + "grad_norm": 0.47256746355593565, + "learning_rate": 5.1638450887301006e-06, + "loss": 0.5282, + "step": 1067 + }, + { + "epoch": 0.5163161711385061, + "grad_norm": 0.4445988854998036, + "learning_rate": 5.155855301394585e-06, + "loss": 0.527, + "step": 1068 + }, + { + "epoch": 0.5167996132463137, + "grad_norm": 0.46405127876443125, + "learning_rate": 5.147865115680357e-06, + "loss": 0.5289, + "step": 1069 + }, + { + "epoch": 0.5172830553541213, + "grad_norm": 0.45801111107179227, + "learning_rate": 5.139874552010975e-06, + "loss": 0.531, + "step": 1070 + }, + { + "epoch": 0.5177664974619289, + "grad_norm": 0.454345233725776, + "learning_rate": 5.131883630810966e-06, + "loss": 0.5428, + "step": 1071 + }, + { + "epoch": 0.5182499395697365, + "grad_norm": 0.4649418568564353, + "learning_rate": 5.123892372505768e-06, + "loss": 0.524, + "step": 1072 + }, + { + "epoch": 0.5187333816775441, + "grad_norm": 0.4609921972037312, + "learning_rate": 5.11590079752168e-06, + "loss": 0.5337, + "step": 1073 + }, + { + "epoch": 0.5192168237853517, + "grad_norm": 0.4257268056499296, + "learning_rate": 5.107908926285813e-06, + "loss": 0.5247, + "step": 1074 + }, + { + "epoch": 0.5197002658931593, + "grad_norm": 0.4277925159892485, + "learning_rate": 5.099916779226032e-06, + "loss": 0.5314, + "step": 1075 + }, + { + "epoch": 0.5201837080009669, + "grad_norm": 0.46374173796570095, + "learning_rate": 5.091924376770912e-06, + "loss": 0.5267, + "step": 1076 + }, + { + "epoch": 0.5206671501087745, + "grad_norm": 0.46413399868377414, + "learning_rate": 5.083931739349675e-06, + "loss": 0.5227, + "step": 1077 + }, + { + "epoch": 0.5211505922165821, + "grad_norm": 0.42805047816232233, + "learning_rate": 5.075938887392149e-06, + "loss": 0.5148, + "step": 1078 + }, + { + "epoch": 0.5216340343243897, + "grad_norm": 0.42348168283938675, + "learning_rate": 5.0679458413287055e-06, + "loss": 0.5168, + "step": 1079 + }, + { + "epoch": 0.5221174764321972, + "grad_norm": 0.48349127354280697, + "learning_rate": 5.059952621590216e-06, + "loss": 0.5274, + "step": 1080 + }, + { + "epoch": 0.5226009185400048, + "grad_norm": 0.447828940551305, + "learning_rate": 5.051959248607993e-06, + "loss": 0.5251, + "step": 1081 + }, + { + "epoch": 0.5230843606478124, + "grad_norm": 0.45995219297674483, + "learning_rate": 5.043965742813744e-06, + "loss": 0.5246, + "step": 1082 + }, + { + "epoch": 0.52356780275562, + "grad_norm": 0.4744443087812869, + "learning_rate": 5.035972124639511e-06, + "loss": 0.5299, + "step": 1083 + }, + { + "epoch": 0.5240512448634276, + "grad_norm": 0.4140590259366713, + "learning_rate": 5.02797841451763e-06, + "loss": 0.5273, + "step": 1084 + }, + { + "epoch": 0.5245346869712352, + "grad_norm": 0.4421687910150297, + "learning_rate": 5.019984632880665e-06, + "loss": 0.5342, + "step": 1085 + }, + { + "epoch": 0.5250181290790428, + "grad_norm": 0.4665180180836343, + "learning_rate": 5.011990800161369e-06, + "loss": 0.5314, + "step": 1086 + }, + { + "epoch": 0.5255015711868504, + "grad_norm": 0.45569707259235365, + "learning_rate": 5.00399693679262e-06, + "loss": 0.5291, + "step": 1087 + }, + { + "epoch": 0.525985013294658, + "grad_norm": 0.3966652463954235, + "learning_rate": 4.9960030632073815e-06, + "loss": 0.4852, + "step": 1088 + }, + { + "epoch": 0.5264684554024656, + "grad_norm": 0.4156779611396039, + "learning_rate": 4.988009199838632e-06, + "loss": 0.5266, + "step": 1089 + }, + { + "epoch": 0.5269518975102732, + "grad_norm": 0.4459927015276638, + "learning_rate": 4.980015367119336e-06, + "loss": 0.5128, + "step": 1090 + }, + { + "epoch": 0.5274353396180808, + "grad_norm": 0.4470940459251613, + "learning_rate": 4.9720215854823716e-06, + "loss": 0.5215, + "step": 1091 + }, + { + "epoch": 0.5279187817258884, + "grad_norm": 0.4379040861596386, + "learning_rate": 4.96402787536049e-06, + "loss": 0.529, + "step": 1092 + }, + { + "epoch": 0.528402223833696, + "grad_norm": 0.42846117019918506, + "learning_rate": 4.956034257186258e-06, + "loss": 0.5196, + "step": 1093 + }, + { + "epoch": 0.5288856659415035, + "grad_norm": 0.45571668646782787, + "learning_rate": 4.9480407513920086e-06, + "loss": 0.527, + "step": 1094 + }, + { + "epoch": 0.5293691080493111, + "grad_norm": 0.4680208745726276, + "learning_rate": 4.940047378409786e-06, + "loss": 0.523, + "step": 1095 + }, + { + "epoch": 0.5298525501571187, + "grad_norm": 0.44029959009092084, + "learning_rate": 4.932054158671295e-06, + "loss": 0.5244, + "step": 1096 + }, + { + "epoch": 0.5303359922649262, + "grad_norm": 0.45471058687459337, + "learning_rate": 4.924061112607853e-06, + "loss": 0.532, + "step": 1097 + }, + { + "epoch": 0.5308194343727338, + "grad_norm": 0.43297324457453135, + "learning_rate": 4.9160682606503255e-06, + "loss": 0.5226, + "step": 1098 + }, + { + "epoch": 0.5313028764805414, + "grad_norm": 0.4269816974887771, + "learning_rate": 4.908075623229089e-06, + "loss": 0.5242, + "step": 1099 + }, + { + "epoch": 0.531786318588349, + "grad_norm": 0.4280455143246443, + "learning_rate": 4.900083220773968e-06, + "loss": 0.5082, + "step": 1100 + }, + { + "epoch": 0.5322697606961566, + "grad_norm": 0.4422972364197889, + "learning_rate": 4.892091073714189e-06, + "loss": 0.5162, + "step": 1101 + }, + { + "epoch": 0.5327532028039642, + "grad_norm": 0.44177794728378245, + "learning_rate": 4.88409920247832e-06, + "loss": 0.5287, + "step": 1102 + }, + { + "epoch": 0.5332366449117718, + "grad_norm": 0.4384466281073893, + "learning_rate": 4.876107627494234e-06, + "loss": 0.5224, + "step": 1103 + }, + { + "epoch": 0.5337200870195794, + "grad_norm": 0.42791066381831555, + "learning_rate": 4.868116369189033e-06, + "loss": 0.519, + "step": 1104 + }, + { + "epoch": 0.534203529127387, + "grad_norm": 0.43287603203122865, + "learning_rate": 4.860125447989026e-06, + "loss": 0.5315, + "step": 1105 + }, + { + "epoch": 0.5346869712351946, + "grad_norm": 0.40835340581900775, + "learning_rate": 4.852134884319646e-06, + "loss": 0.5013, + "step": 1106 + }, + { + "epoch": 0.5351704133430022, + "grad_norm": 0.4564877621647975, + "learning_rate": 4.844144698605418e-06, + "loss": 0.5163, + "step": 1107 + }, + { + "epoch": 0.5356538554508098, + "grad_norm": 0.4350849446506223, + "learning_rate": 4.836154911269902e-06, + "loss": 0.5216, + "step": 1108 + }, + { + "epoch": 0.5361372975586174, + "grad_norm": 0.45777989132001196, + "learning_rate": 4.8281655427356375e-06, + "loss": 0.5248, + "step": 1109 + }, + { + "epoch": 0.536620739666425, + "grad_norm": 0.463821530722432, + "learning_rate": 4.820176613424095e-06, + "loss": 0.5302, + "step": 1110 + }, + { + "epoch": 0.5371041817742326, + "grad_norm": 0.44517887708797427, + "learning_rate": 4.812188143755621e-06, + "loss": 0.5214, + "step": 1111 + }, + { + "epoch": 0.5375876238820402, + "grad_norm": 0.5017058167871686, + "learning_rate": 4.80420015414939e-06, + "loss": 0.5305, + "step": 1112 + }, + { + "epoch": 0.5380710659898477, + "grad_norm": 0.4672768776486219, + "learning_rate": 4.796212665023345e-06, + "loss": 0.5237, + "step": 1113 + }, + { + "epoch": 0.5385545080976553, + "grad_norm": 0.4419995316595335, + "learning_rate": 4.788225696794153e-06, + "loss": 0.5277, + "step": 1114 + }, + { + "epoch": 0.5390379502054629, + "grad_norm": 0.44730511943868706, + "learning_rate": 4.780239269877147e-06, + "loss": 0.5313, + "step": 1115 + }, + { + "epoch": 0.5395213923132705, + "grad_norm": 0.45204850110236905, + "learning_rate": 4.7722534046862805e-06, + "loss": 0.5231, + "step": 1116 + }, + { + "epoch": 0.540004834421078, + "grad_norm": 0.45779960123303604, + "learning_rate": 4.764268121634066e-06, + "loss": 0.507, + "step": 1117 + }, + { + "epoch": 0.5404882765288856, + "grad_norm": 0.41695878051285573, + "learning_rate": 4.7562834411315324e-06, + "loss": 0.5042, + "step": 1118 + }, + { + "epoch": 0.5409717186366932, + "grad_norm": 0.4513235919318933, + "learning_rate": 4.748299383588167e-06, + "loss": 0.5258, + "step": 1119 + }, + { + "epoch": 0.5414551607445008, + "grad_norm": 0.467590473541893, + "learning_rate": 4.74031596941186e-06, + "loss": 0.5375, + "step": 1120 + }, + { + "epoch": 0.5419386028523084, + "grad_norm": 0.4524946558422428, + "learning_rate": 4.7323332190088675e-06, + "loss": 0.5199, + "step": 1121 + }, + { + "epoch": 0.542422044960116, + "grad_norm": 0.42113263739650647, + "learning_rate": 4.7243511527837374e-06, + "loss": 0.5251, + "step": 1122 + }, + { + "epoch": 0.5429054870679236, + "grad_norm": 0.4636225245244688, + "learning_rate": 4.716369791139279e-06, + "loss": 0.5308, + "step": 1123 + }, + { + "epoch": 0.5433889291757312, + "grad_norm": 0.4398566927754983, + "learning_rate": 4.708389154476492e-06, + "loss": 0.5201, + "step": 1124 + }, + { + "epoch": 0.5438723712835388, + "grad_norm": 0.43330211663967066, + "learning_rate": 4.7004092631945315e-06, + "loss": 0.5258, + "step": 1125 + }, + { + "epoch": 0.5443558133913464, + "grad_norm": 0.44482177202458967, + "learning_rate": 4.692430137690638e-06, + "loss": 0.5222, + "step": 1126 + }, + { + "epoch": 0.544839255499154, + "grad_norm": 0.4318024796342877, + "learning_rate": 4.684451798360102e-06, + "loss": 0.5204, + "step": 1127 + }, + { + "epoch": 0.5453226976069616, + "grad_norm": 0.45444414518744425, + "learning_rate": 4.6764742655962e-06, + "loss": 0.5255, + "step": 1128 + }, + { + "epoch": 0.5458061397147692, + "grad_norm": 0.4372839038666406, + "learning_rate": 4.6684975597901526e-06, + "loss": 0.5275, + "step": 1129 + }, + { + "epoch": 0.5462895818225767, + "grad_norm": 0.43767287064021165, + "learning_rate": 4.660521701331058e-06, + "loss": 0.5046, + "step": 1130 + }, + { + "epoch": 0.5467730239303843, + "grad_norm": 0.42146624184063447, + "learning_rate": 4.652546710605857e-06, + "loss": 0.5284, + "step": 1131 + }, + { + "epoch": 0.5472564660381919, + "grad_norm": 0.4528806888549099, + "learning_rate": 4.644572607999267e-06, + "loss": 0.5234, + "step": 1132 + }, + { + "epoch": 0.5477399081459995, + "grad_norm": 0.41615032339392954, + "learning_rate": 4.636599413893734e-06, + "loss": 0.5149, + "step": 1133 + }, + { + "epoch": 0.5482233502538071, + "grad_norm": 0.4080780400252472, + "learning_rate": 4.628627148669391e-06, + "loss": 0.5069, + "step": 1134 + }, + { + "epoch": 0.5487067923616147, + "grad_norm": 0.4376373218589361, + "learning_rate": 4.620655832703984e-06, + "loss": 0.5232, + "step": 1135 + }, + { + "epoch": 0.5491902344694223, + "grad_norm": 0.44817086720050736, + "learning_rate": 4.612685486372846e-06, + "loss": 0.5284, + "step": 1136 + }, + { + "epoch": 0.5496736765772299, + "grad_norm": 0.4269717422470433, + "learning_rate": 4.604716130048818e-06, + "loss": 0.5292, + "step": 1137 + }, + { + "epoch": 0.5501571186850375, + "grad_norm": 0.4463889311004707, + "learning_rate": 4.596747784102221e-06, + "loss": 0.5296, + "step": 1138 + }, + { + "epoch": 0.5506405607928451, + "grad_norm": 0.4544805899306494, + "learning_rate": 4.588780468900787e-06, + "loss": 0.5342, + "step": 1139 + }, + { + "epoch": 0.5511240029006527, + "grad_norm": 0.43223974279832084, + "learning_rate": 4.580814204809618e-06, + "loss": 0.5278, + "step": 1140 + }, + { + "epoch": 0.5516074450084603, + "grad_norm": 0.4526101073084036, + "learning_rate": 4.572849012191126e-06, + "loss": 0.5274, + "step": 1141 + }, + { + "epoch": 0.5520908871162679, + "grad_norm": 0.46135315991278786, + "learning_rate": 4.564884911404986e-06, + "loss": 0.5308, + "step": 1142 + }, + { + "epoch": 0.5525743292240755, + "grad_norm": 0.451744932774668, + "learning_rate": 4.5569219228080805e-06, + "loss": 0.5228, + "step": 1143 + }, + { + "epoch": 0.553057771331883, + "grad_norm": 0.4431407162306295, + "learning_rate": 4.54896006675445e-06, + "loss": 0.5001, + "step": 1144 + }, + { + "epoch": 0.5535412134396906, + "grad_norm": 0.4115173802208087, + "learning_rate": 4.540999363595242e-06, + "loss": 0.4963, + "step": 1145 + }, + { + "epoch": 0.5540246555474981, + "grad_norm": 0.418962800898634, + "learning_rate": 4.5330398336786526e-06, + "loss": 0.5277, + "step": 1146 + }, + { + "epoch": 0.5545080976553057, + "grad_norm": 0.46372386433812574, + "learning_rate": 4.525081497349887e-06, + "loss": 0.5427, + "step": 1147 + }, + { + "epoch": 0.5549915397631133, + "grad_norm": 0.4530031866173555, + "learning_rate": 4.517124374951086e-06, + "loss": 0.5178, + "step": 1148 + }, + { + "epoch": 0.5554749818709209, + "grad_norm": 0.440822304311767, + "learning_rate": 4.509168486821304e-06, + "loss": 0.5225, + "step": 1149 + }, + { + "epoch": 0.5559584239787285, + "grad_norm": 0.4127961848352273, + "learning_rate": 4.501213853296425e-06, + "loss": 0.523, + "step": 1150 + }, + { + "epoch": 0.5564418660865361, + "grad_norm": 0.461288293810827, + "learning_rate": 4.493260494709141e-06, + "loss": 0.5251, + "step": 1151 + }, + { + "epoch": 0.5569253081943437, + "grad_norm": 0.43458067348060225, + "learning_rate": 4.48530843138887e-06, + "loss": 0.5419, + "step": 1152 + }, + { + "epoch": 0.5574087503021513, + "grad_norm": 0.42472609884541546, + "learning_rate": 4.477357683661734e-06, + "loss": 0.5318, + "step": 1153 + }, + { + "epoch": 0.5578921924099589, + "grad_norm": 0.4256107384490566, + "learning_rate": 4.469408271850479e-06, + "loss": 0.506, + "step": 1154 + }, + { + "epoch": 0.5583756345177665, + "grad_norm": 0.4098807197744025, + "learning_rate": 4.4614602162744455e-06, + "loss": 0.5152, + "step": 1155 + }, + { + "epoch": 0.5588590766255741, + "grad_norm": 0.4390148331821114, + "learning_rate": 4.453513537249503e-06, + "loss": 0.527, + "step": 1156 + }, + { + "epoch": 0.5593425187333817, + "grad_norm": 0.4711204869094555, + "learning_rate": 4.445568255088003e-06, + "loss": 0.5247, + "step": 1157 + }, + { + "epoch": 0.5598259608411893, + "grad_norm": 0.41653734784347574, + "learning_rate": 4.4376243900987296e-06, + "loss": 0.5232, + "step": 1158 + }, + { + "epoch": 0.5603094029489969, + "grad_norm": 0.44411194102788987, + "learning_rate": 4.429681962586839e-06, + "loss": 0.5365, + "step": 1159 + }, + { + "epoch": 0.5607928450568045, + "grad_norm": 0.44003715852304276, + "learning_rate": 4.421740992853818e-06, + "loss": 0.5311, + "step": 1160 + }, + { + "epoch": 0.5612762871646121, + "grad_norm": 0.43445702749764165, + "learning_rate": 4.413801501197424e-06, + "loss": 0.5192, + "step": 1161 + }, + { + "epoch": 0.5617597292724196, + "grad_norm": 0.4315566299087621, + "learning_rate": 4.405863507911638e-06, + "loss": 0.5007, + "step": 1162 + }, + { + "epoch": 0.5622431713802272, + "grad_norm": 0.3786991552102392, + "learning_rate": 4.3979270332866105e-06, + "loss": 0.4867, + "step": 1163 + }, + { + "epoch": 0.5627266134880348, + "grad_norm": 0.4244074214223604, + "learning_rate": 4.389992097608613e-06, + "loss": 0.5271, + "step": 1164 + }, + { + "epoch": 0.5632100555958424, + "grad_norm": 0.430673286759424, + "learning_rate": 4.3820587211599745e-06, + "loss": 0.5292, + "step": 1165 + }, + { + "epoch": 0.56369349770365, + "grad_norm": 0.46109079230422667, + "learning_rate": 4.374126924219052e-06, + "loss": 0.5197, + "step": 1166 + }, + { + "epoch": 0.5641769398114576, + "grad_norm": 0.4275407630254517, + "learning_rate": 4.366196727060152e-06, + "loss": 0.5196, + "step": 1167 + }, + { + "epoch": 0.5646603819192652, + "grad_norm": 0.4407289737300367, + "learning_rate": 4.3582681499535e-06, + "loss": 0.5206, + "step": 1168 + }, + { + "epoch": 0.5651438240270727, + "grad_norm": 0.4119248969357382, + "learning_rate": 4.3503412131651765e-06, + "loss": 0.5341, + "step": 1169 + }, + { + "epoch": 0.5656272661348803, + "grad_norm": 0.4621359921349015, + "learning_rate": 4.342415936957073e-06, + "loss": 0.5225, + "step": 1170 + }, + { + "epoch": 0.5661107082426879, + "grad_norm": 0.45885127710831636, + "learning_rate": 4.334492341586833e-06, + "loss": 0.5328, + "step": 1171 + }, + { + "epoch": 0.5665941503504955, + "grad_norm": 0.4217251248929531, + "learning_rate": 4.326570447307804e-06, + "loss": 0.5024, + "step": 1172 + }, + { + "epoch": 0.5670775924583031, + "grad_norm": 0.43349297844561585, + "learning_rate": 4.318650274368989e-06, + "loss": 0.5302, + "step": 1173 + }, + { + "epoch": 0.5675610345661107, + "grad_norm": 0.44173213520395654, + "learning_rate": 4.310731843014985e-06, + "loss": 0.518, + "step": 1174 + }, + { + "epoch": 0.5680444766739183, + "grad_norm": 0.44550331806578247, + "learning_rate": 4.302815173485944e-06, + "loss": 0.5262, + "step": 1175 + }, + { + "epoch": 0.5685279187817259, + "grad_norm": 0.41790214183276025, + "learning_rate": 4.294900286017509e-06, + "loss": 0.5249, + "step": 1176 + }, + { + "epoch": 0.5690113608895335, + "grad_norm": 0.4406462454965783, + "learning_rate": 4.286987200840772e-06, + "loss": 0.5399, + "step": 1177 + }, + { + "epoch": 0.5694948029973411, + "grad_norm": 0.42899916006281247, + "learning_rate": 4.279075938182214e-06, + "loss": 0.522, + "step": 1178 + }, + { + "epoch": 0.5699782451051486, + "grad_norm": 0.39425395091405524, + "learning_rate": 4.271166518263662e-06, + "loss": 0.4916, + "step": 1179 + }, + { + "epoch": 0.5704616872129562, + "grad_norm": 0.4325306883401716, + "learning_rate": 4.263258961302232e-06, + "loss": 0.5297, + "step": 1180 + }, + { + "epoch": 0.5709451293207638, + "grad_norm": 0.42524338171666753, + "learning_rate": 4.255353287510272e-06, + "loss": 0.524, + "step": 1181 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.44279478264935357, + "learning_rate": 4.247449517095329e-06, + "loss": 0.5215, + "step": 1182 + }, + { + "epoch": 0.571912013536379, + "grad_norm": 0.4197931530174695, + "learning_rate": 4.239547670260069e-06, + "loss": 0.5099, + "step": 1183 + }, + { + "epoch": 0.5723954556441866, + "grad_norm": 0.4310563818067479, + "learning_rate": 4.231647767202254e-06, + "loss": 0.5191, + "step": 1184 + }, + { + "epoch": 0.5728788977519942, + "grad_norm": 0.42356166524796646, + "learning_rate": 4.223749828114672e-06, + "loss": 0.528, + "step": 1185 + }, + { + "epoch": 0.5733623398598018, + "grad_norm": 0.4095445643557658, + "learning_rate": 4.215853873185093e-06, + "loss": 0.4862, + "step": 1186 + }, + { + "epoch": 0.5738457819676094, + "grad_norm": 0.47015761976525233, + "learning_rate": 4.2079599225962115e-06, + "loss": 0.5183, + "step": 1187 + }, + { + "epoch": 0.574329224075417, + "grad_norm": 0.4175313339147143, + "learning_rate": 4.2000679965256045e-06, + "loss": 0.5045, + "step": 1188 + }, + { + "epoch": 0.5748126661832246, + "grad_norm": 0.43952093100735207, + "learning_rate": 4.192178115145668e-06, + "loss": 0.4942, + "step": 1189 + }, + { + "epoch": 0.5752961082910322, + "grad_norm": 0.41782905847478197, + "learning_rate": 4.184290298623578e-06, + "loss": 0.4962, + "step": 1190 + }, + { + "epoch": 0.5757795503988398, + "grad_norm": 0.4321674737064156, + "learning_rate": 4.176404567121225e-06, + "loss": 0.5397, + "step": 1191 + }, + { + "epoch": 0.5762629925066474, + "grad_norm": 0.4802929727845466, + "learning_rate": 4.16852094079518e-06, + "loss": 0.5249, + "step": 1192 + }, + { + "epoch": 0.576746434614455, + "grad_norm": 0.4829071925445664, + "learning_rate": 4.160639439796624e-06, + "loss": 0.5192, + "step": 1193 + }, + { + "epoch": 0.5772298767222626, + "grad_norm": 0.41462388567526964, + "learning_rate": 4.152760084271305e-06, + "loss": 0.5224, + "step": 1194 + }, + { + "epoch": 0.57771331883007, + "grad_norm": 0.41336785715866026, + "learning_rate": 4.1448828943595e-06, + "loss": 0.531, + "step": 1195 + }, + { + "epoch": 0.5781967609378776, + "grad_norm": 0.4424735299028098, + "learning_rate": 4.1370078901959306e-06, + "loss": 0.5309, + "step": 1196 + }, + { + "epoch": 0.5786802030456852, + "grad_norm": 0.43755728178912967, + "learning_rate": 4.129135091909752e-06, + "loss": 0.5314, + "step": 1197 + }, + { + "epoch": 0.5791636451534928, + "grad_norm": 0.4234445834897067, + "learning_rate": 4.121264519624463e-06, + "loss": 0.4971, + "step": 1198 + }, + { + "epoch": 0.5796470872613004, + "grad_norm": 0.4465261505400547, + "learning_rate": 4.113396193457887e-06, + "loss": 0.5421, + "step": 1199 + }, + { + "epoch": 0.580130529369108, + "grad_norm": 0.43864096822609294, + "learning_rate": 4.105530133522096e-06, + "loss": 0.5286, + "step": 1200 + }, + { + "epoch": 0.5806139714769156, + "grad_norm": 0.44812939972631666, + "learning_rate": 4.0976663599233745e-06, + "loss": 0.5283, + "step": 1201 + }, + { + "epoch": 0.5810974135847232, + "grad_norm": 0.41015688502886277, + "learning_rate": 4.08980489276216e-06, + "loss": 0.5031, + "step": 1202 + }, + { + "epoch": 0.5815808556925308, + "grad_norm": 0.4371592707730246, + "learning_rate": 4.081945752133e-06, + "loss": 0.5247, + "step": 1203 + }, + { + "epoch": 0.5820642978003384, + "grad_norm": 0.4545452075633195, + "learning_rate": 4.074088958124488e-06, + "loss": 0.5233, + "step": 1204 + }, + { + "epoch": 0.582547739908146, + "grad_norm": 0.4034341944937049, + "learning_rate": 4.066234530819222e-06, + "loss": 0.4997, + "step": 1205 + }, + { + "epoch": 0.5830311820159536, + "grad_norm": 0.44158682076629885, + "learning_rate": 4.058382490293755e-06, + "loss": 0.5327, + "step": 1206 + }, + { + "epoch": 0.5835146241237612, + "grad_norm": 0.44126859021633347, + "learning_rate": 4.050532856618532e-06, + "loss": 0.5172, + "step": 1207 + }, + { + "epoch": 0.5839980662315688, + "grad_norm": 0.44983862546632325, + "learning_rate": 4.0426856498578515e-06, + "loss": 0.5321, + "step": 1208 + }, + { + "epoch": 0.5844815083393764, + "grad_norm": 0.4306823960936312, + "learning_rate": 4.034840890069805e-06, + "loss": 0.5286, + "step": 1209 + }, + { + "epoch": 0.584964950447184, + "grad_norm": 0.46647403495000134, + "learning_rate": 4.0269985973062325e-06, + "loss": 0.5205, + "step": 1210 + }, + { + "epoch": 0.5854483925549916, + "grad_norm": 0.43512362259260234, + "learning_rate": 4.019158791612662e-06, + "loss": 0.5186, + "step": 1211 + }, + { + "epoch": 0.5859318346627991, + "grad_norm": 0.42474616162914414, + "learning_rate": 4.0113214930282765e-06, + "loss": 0.5019, + "step": 1212 + }, + { + "epoch": 0.5864152767706067, + "grad_norm": 0.4496255266024326, + "learning_rate": 4.003486721585834e-06, + "loss": 0.5292, + "step": 1213 + }, + { + "epoch": 0.5868987188784143, + "grad_norm": 0.4692064186214037, + "learning_rate": 3.995654497311649e-06, + "loss": 0.524, + "step": 1214 + }, + { + "epoch": 0.5873821609862219, + "grad_norm": 0.47446797627391324, + "learning_rate": 3.987824840225512e-06, + "loss": 0.5296, + "step": 1215 + }, + { + "epoch": 0.5878656030940295, + "grad_norm": 0.45351628397562704, + "learning_rate": 3.979997770340664e-06, + "loss": 0.5191, + "step": 1216 + }, + { + "epoch": 0.5883490452018371, + "grad_norm": 0.4469658212338775, + "learning_rate": 3.972173307663721e-06, + "loss": 0.5259, + "step": 1217 + }, + { + "epoch": 0.5888324873096447, + "grad_norm": 0.42106841746243373, + "learning_rate": 3.964351472194642e-06, + "loss": 0.5169, + "step": 1218 + }, + { + "epoch": 0.5893159294174523, + "grad_norm": 0.46081869250650287, + "learning_rate": 3.95653228392667e-06, + "loss": 0.5324, + "step": 1219 + }, + { + "epoch": 0.5897993715252599, + "grad_norm": 0.43787433991965447, + "learning_rate": 3.9487157628462784e-06, + "loss": 0.5253, + "step": 1220 + }, + { + "epoch": 0.5902828136330674, + "grad_norm": 0.4446710684339124, + "learning_rate": 3.940901928933127e-06, + "loss": 0.5207, + "step": 1221 + }, + { + "epoch": 0.590766255740875, + "grad_norm": 0.4878921355151572, + "learning_rate": 3.933090802160004e-06, + "loss": 0.5216, + "step": 1222 + }, + { + "epoch": 0.5912496978486826, + "grad_norm": 0.43014793362728476, + "learning_rate": 3.925282402492779e-06, + "loss": 0.5158, + "step": 1223 + }, + { + "epoch": 0.5917331399564902, + "grad_norm": 0.4785437280134671, + "learning_rate": 3.917476749890351e-06, + "loss": 0.5337, + "step": 1224 + }, + { + "epoch": 0.5922165820642978, + "grad_norm": 0.4374190045344671, + "learning_rate": 3.909673864304597e-06, + "loss": 0.528, + "step": 1225 + }, + { + "epoch": 0.5927000241721054, + "grad_norm": 0.4547685840805692, + "learning_rate": 3.901873765680322e-06, + "loss": 0.5366, + "step": 1226 + }, + { + "epoch": 0.593183466279913, + "grad_norm": 0.4594395711536057, + "learning_rate": 3.894076473955207e-06, + "loss": 0.5202, + "step": 1227 + }, + { + "epoch": 0.5936669083877205, + "grad_norm": 0.5197830445285274, + "learning_rate": 3.886282009059757e-06, + "loss": 0.5293, + "step": 1228 + }, + { + "epoch": 0.5941503504955281, + "grad_norm": 0.438870970435199, + "learning_rate": 3.878490390917253e-06, + "loss": 0.521, + "step": 1229 + }, + { + "epoch": 0.5946337926033357, + "grad_norm": 0.43955571970066076, + "learning_rate": 3.8707016394436985e-06, + "loss": 0.5219, + "step": 1230 + }, + { + "epoch": 0.5951172347111433, + "grad_norm": 0.4319400896121196, + "learning_rate": 3.86291577454777e-06, + "loss": 0.5274, + "step": 1231 + }, + { + "epoch": 0.5956006768189509, + "grad_norm": 0.42866945423374303, + "learning_rate": 3.855132816130767e-06, + "loss": 0.5287, + "step": 1232 + }, + { + "epoch": 0.5960841189267585, + "grad_norm": 0.4905566924544953, + "learning_rate": 3.847352784086556e-06, + "loss": 0.5214, + "step": 1233 + }, + { + "epoch": 0.5965675610345661, + "grad_norm": 0.4815238783873125, + "learning_rate": 3.839575698301529e-06, + "loss": 0.5348, + "step": 1234 + }, + { + "epoch": 0.5970510031423737, + "grad_norm": 0.4611545386580029, + "learning_rate": 3.831801578654541e-06, + "loss": 0.521, + "step": 1235 + }, + { + "epoch": 0.5975344452501813, + "grad_norm": 0.44138236799554953, + "learning_rate": 3.8240304450168716e-06, + "loss": 0.5012, + "step": 1236 + }, + { + "epoch": 0.5980178873579889, + "grad_norm": 0.4729557809805851, + "learning_rate": 3.8162623172521615e-06, + "loss": 0.5239, + "step": 1237 + }, + { + "epoch": 0.5985013294657965, + "grad_norm": 0.47751713396917145, + "learning_rate": 3.808497215216374e-06, + "loss": 0.518, + "step": 1238 + }, + { + "epoch": 0.5989847715736041, + "grad_norm": 0.4632727328684821, + "learning_rate": 3.8007351587577342e-06, + "loss": 0.5212, + "step": 1239 + }, + { + "epoch": 0.5994682136814117, + "grad_norm": 0.43452448731632676, + "learning_rate": 3.7929761677166847e-06, + "loss": 0.5256, + "step": 1240 + }, + { + "epoch": 0.5999516557892193, + "grad_norm": 0.41707858133064807, + "learning_rate": 3.7852202619258327e-06, + "loss": 0.5258, + "step": 1241 + }, + { + "epoch": 0.6004350978970269, + "grad_norm": 0.42484738876263983, + "learning_rate": 3.777467461209895e-06, + "loss": 0.5226, + "step": 1242 + }, + { + "epoch": 0.6009185400048345, + "grad_norm": 0.46278022630826876, + "learning_rate": 3.76971778538566e-06, + "loss": 0.5265, + "step": 1243 + }, + { + "epoch": 0.6014019821126421, + "grad_norm": 0.4136178806145892, + "learning_rate": 3.76197125426192e-06, + "loss": 0.521, + "step": 1244 + }, + { + "epoch": 0.6018854242204495, + "grad_norm": 0.42100500706131366, + "learning_rate": 3.754227887639434e-06, + "loss": 0.5119, + "step": 1245 + }, + { + "epoch": 0.6023688663282571, + "grad_norm": 0.4123940096542578, + "learning_rate": 3.7464877053108706e-06, + "loss": 0.5258, + "step": 1246 + }, + { + "epoch": 0.6028523084360647, + "grad_norm": 0.4195138029502561, + "learning_rate": 3.7387507270607617e-06, + "loss": 0.529, + "step": 1247 + }, + { + "epoch": 0.6033357505438723, + "grad_norm": 0.45482061749805036, + "learning_rate": 3.7310169726654444e-06, + "loss": 0.528, + "step": 1248 + }, + { + "epoch": 0.6038191926516799, + "grad_norm": 0.4371974850319641, + "learning_rate": 3.7232864618930217e-06, + "loss": 0.5182, + "step": 1249 + }, + { + "epoch": 0.6043026347594875, + "grad_norm": 0.43099991632770085, + "learning_rate": 3.715559214503298e-06, + "loss": 0.5133, + "step": 1250 + }, + { + "epoch": 0.6047860768672951, + "grad_norm": 0.4048318396541149, + "learning_rate": 3.707835250247745e-06, + "loss": 0.4877, + "step": 1251 + }, + { + "epoch": 0.6052695189751027, + "grad_norm": 0.43040118158566426, + "learning_rate": 3.7001145888694335e-06, + "loss": 0.5256, + "step": 1252 + }, + { + "epoch": 0.6057529610829103, + "grad_norm": 0.43151561170744174, + "learning_rate": 3.6923972501029996e-06, + "loss": 0.5028, + "step": 1253 + }, + { + "epoch": 0.6062364031907179, + "grad_norm": 0.4168861988285146, + "learning_rate": 3.684683253674583e-06, + "loss": 0.5249, + "step": 1254 + }, + { + "epoch": 0.6067198452985255, + "grad_norm": 0.4341349343804055, + "learning_rate": 3.676972619301776e-06, + "loss": 0.5119, + "step": 1255 + }, + { + "epoch": 0.6072032874063331, + "grad_norm": 0.45716952276328937, + "learning_rate": 3.6692653666935875e-06, + "loss": 0.5262, + "step": 1256 + }, + { + "epoch": 0.6076867295141407, + "grad_norm": 0.4304219021067875, + "learning_rate": 3.6615615155503703e-06, + "loss": 0.5168, + "step": 1257 + }, + { + "epoch": 0.6081701716219483, + "grad_norm": 0.4098861167543977, + "learning_rate": 3.6538610855637953e-06, + "loss": 0.5193, + "step": 1258 + }, + { + "epoch": 0.6086536137297559, + "grad_norm": 0.45344600794993284, + "learning_rate": 3.6461640964167755e-06, + "loss": 0.5213, + "step": 1259 + }, + { + "epoch": 0.6091370558375635, + "grad_norm": 0.408785994340013, + "learning_rate": 3.638470567783442e-06, + "loss": 0.4982, + "step": 1260 + }, + { + "epoch": 0.609620497945371, + "grad_norm": 0.4481815917202834, + "learning_rate": 3.630780519329069e-06, + "loss": 0.5329, + "step": 1261 + }, + { + "epoch": 0.6101039400531786, + "grad_norm": 0.4271346926738626, + "learning_rate": 3.623093970710043e-06, + "loss": 0.5278, + "step": 1262 + }, + { + "epoch": 0.6105873821609862, + "grad_norm": 0.4443519643294819, + "learning_rate": 3.615410941573799e-06, + "loss": 0.5358, + "step": 1263 + }, + { + "epoch": 0.6110708242687938, + "grad_norm": 0.41093410035963396, + "learning_rate": 3.607731451558783e-06, + "loss": 0.4973, + "step": 1264 + }, + { + "epoch": 0.6115542663766014, + "grad_norm": 0.4526917508007144, + "learning_rate": 3.6000555202943872e-06, + "loss": 0.5223, + "step": 1265 + }, + { + "epoch": 0.612037708484409, + "grad_norm": 0.458600770079082, + "learning_rate": 3.59238316740091e-06, + "loss": 0.5206, + "step": 1266 + }, + { + "epoch": 0.6125211505922166, + "grad_norm": 0.4390522720994981, + "learning_rate": 3.584714412489506e-06, + "loss": 0.5306, + "step": 1267 + }, + { + "epoch": 0.6130045927000242, + "grad_norm": 0.41799452227515504, + "learning_rate": 3.5770492751621292e-06, + "loss": 0.5029, + "step": 1268 + }, + { + "epoch": 0.6134880348078318, + "grad_norm": 0.43636835439611227, + "learning_rate": 3.5693877750114903e-06, + "loss": 0.5167, + "step": 1269 + }, + { + "epoch": 0.6139714769156394, + "grad_norm": 0.43211619164840076, + "learning_rate": 3.5617299316209984e-06, + "loss": 0.5049, + "step": 1270 + }, + { + "epoch": 0.614454919023447, + "grad_norm": 0.42547953690955836, + "learning_rate": 3.5540757645647217e-06, + "loss": 0.4939, + "step": 1271 + }, + { + "epoch": 0.6149383611312546, + "grad_norm": 0.4170182860561763, + "learning_rate": 3.546425293407324e-06, + "loss": 0.5199, + "step": 1272 + }, + { + "epoch": 0.6154218032390621, + "grad_norm": 0.4222424350681242, + "learning_rate": 3.5387785377040316e-06, + "loss": 0.5132, + "step": 1273 + }, + { + "epoch": 0.6159052453468697, + "grad_norm": 0.4552864030500758, + "learning_rate": 3.531135517000561e-06, + "loss": 0.5269, + "step": 1274 + }, + { + "epoch": 0.6163886874546773, + "grad_norm": 0.43855551812482985, + "learning_rate": 3.523496250833098e-06, + "loss": 0.5122, + "step": 1275 + }, + { + "epoch": 0.6168721295624849, + "grad_norm": 0.46449768544610603, + "learning_rate": 3.515860758728214e-06, + "loss": 0.5234, + "step": 1276 + }, + { + "epoch": 0.6173555716702925, + "grad_norm": 0.46363731713711515, + "learning_rate": 3.5082290602028492e-06, + "loss": 0.5269, + "step": 1277 + }, + { + "epoch": 0.6178390137781, + "grad_norm": 0.42830833200680596, + "learning_rate": 3.5006011747642366e-06, + "loss": 0.5177, + "step": 1278 + }, + { + "epoch": 0.6183224558859076, + "grad_norm": 0.45403297568672957, + "learning_rate": 3.492977121909865e-06, + "loss": 0.5329, + "step": 1279 + }, + { + "epoch": 0.6188058979937152, + "grad_norm": 0.4707193517755616, + "learning_rate": 3.4853569211274306e-06, + "loss": 0.5275, + "step": 1280 + }, + { + "epoch": 0.6192893401015228, + "grad_norm": 0.4382585499371175, + "learning_rate": 3.4777405918947795e-06, + "loss": 0.5117, + "step": 1281 + }, + { + "epoch": 0.6197727822093304, + "grad_norm": 0.43247243116721396, + "learning_rate": 3.4701281536798638e-06, + "loss": 0.5274, + "step": 1282 + }, + { + "epoch": 0.620256224317138, + "grad_norm": 0.4457194202455219, + "learning_rate": 3.462519625940688e-06, + "loss": 0.5282, + "step": 1283 + }, + { + "epoch": 0.6207396664249456, + "grad_norm": 0.45208181214242377, + "learning_rate": 3.4549150281252635e-06, + "loss": 0.5224, + "step": 1284 + }, + { + "epoch": 0.6212231085327532, + "grad_norm": 0.43419084686544124, + "learning_rate": 3.4473143796715537e-06, + "loss": 0.5221, + "step": 1285 + }, + { + "epoch": 0.6217065506405608, + "grad_norm": 0.45834613229050314, + "learning_rate": 3.4397177000074307e-06, + "loss": 0.5286, + "step": 1286 + }, + { + "epoch": 0.6221899927483684, + "grad_norm": 0.4551713196802731, + "learning_rate": 3.4321250085506174e-06, + "loss": 0.519, + "step": 1287 + }, + { + "epoch": 0.622673434856176, + "grad_norm": 0.45716705644935435, + "learning_rate": 3.4245363247086477e-06, + "loss": 0.5291, + "step": 1288 + }, + { + "epoch": 0.6231568769639836, + "grad_norm": 0.4214582611290155, + "learning_rate": 3.4169516678788096e-06, + "loss": 0.5084, + "step": 1289 + }, + { + "epoch": 0.6236403190717912, + "grad_norm": 0.43304150590207136, + "learning_rate": 3.4093710574480926e-06, + "loss": 0.5181, + "step": 1290 + }, + { + "epoch": 0.6241237611795988, + "grad_norm": 0.4201723521255349, + "learning_rate": 3.4017945127931517e-06, + "loss": 0.5215, + "step": 1291 + }, + { + "epoch": 0.6246072032874064, + "grad_norm": 0.43513341637312203, + "learning_rate": 3.394222053280245e-06, + "loss": 0.5219, + "step": 1292 + }, + { + "epoch": 0.625090645395214, + "grad_norm": 0.41778354176218346, + "learning_rate": 3.386653698265189e-06, + "loss": 0.5295, + "step": 1293 + }, + { + "epoch": 0.6255740875030215, + "grad_norm": 0.4289158469228602, + "learning_rate": 3.3790894670933096e-06, + "loss": 0.4993, + "step": 1294 + }, + { + "epoch": 0.626057529610829, + "grad_norm": 0.4362452363722638, + "learning_rate": 3.3715293790993906e-06, + "loss": 0.5212, + "step": 1295 + }, + { + "epoch": 0.6265409717186367, + "grad_norm": 0.4287020140556269, + "learning_rate": 3.3639734536076263e-06, + "loss": 0.5145, + "step": 1296 + }, + { + "epoch": 0.6270244138264442, + "grad_norm": 0.4371619386426176, + "learning_rate": 3.356421709931573e-06, + "loss": 0.5263, + "step": 1297 + }, + { + "epoch": 0.6275078559342518, + "grad_norm": 0.4558196176944881, + "learning_rate": 3.348874167374093e-06, + "loss": 0.5193, + "step": 1298 + }, + { + "epoch": 0.6279912980420594, + "grad_norm": 0.433400379634257, + "learning_rate": 3.341330845227316e-06, + "loss": 0.5342, + "step": 1299 + }, + { + "epoch": 0.628474740149867, + "grad_norm": 0.44874534866829735, + "learning_rate": 3.33379176277258e-06, + "loss": 0.5192, + "step": 1300 + }, + { + "epoch": 0.6289581822576746, + "grad_norm": 0.42814892447345076, + "learning_rate": 3.326256939280389e-06, + "loss": 0.5222, + "step": 1301 + }, + { + "epoch": 0.6294416243654822, + "grad_norm": 0.4175511142456795, + "learning_rate": 3.3187263940103587e-06, + "loss": 0.5008, + "step": 1302 + }, + { + "epoch": 0.6299250664732898, + "grad_norm": 0.43012465279292367, + "learning_rate": 3.3112001462111666e-06, + "loss": 0.5252, + "step": 1303 + }, + { + "epoch": 0.6304085085810974, + "grad_norm": 0.4343943982087317, + "learning_rate": 3.3036782151205134e-06, + "loss": 0.5269, + "step": 1304 + }, + { + "epoch": 0.630891950688905, + "grad_norm": 0.4399879585469454, + "learning_rate": 3.296160619965056e-06, + "loss": 0.5348, + "step": 1305 + }, + { + "epoch": 0.6313753927967126, + "grad_norm": 0.41469773355380823, + "learning_rate": 3.2886473799603793e-06, + "loss": 0.5315, + "step": 1306 + }, + { + "epoch": 0.6318588349045202, + "grad_norm": 0.4118723735223055, + "learning_rate": 3.2811385143109254e-06, + "loss": 0.5199, + "step": 1307 + }, + { + "epoch": 0.6323422770123278, + "grad_norm": 0.42911493217824326, + "learning_rate": 3.2736340422099633e-06, + "loss": 0.5287, + "step": 1308 + }, + { + "epoch": 0.6328257191201354, + "grad_norm": 0.42229266051589764, + "learning_rate": 3.2661339828395263e-06, + "loss": 0.5034, + "step": 1309 + }, + { + "epoch": 0.633309161227943, + "grad_norm": 0.4171629681078508, + "learning_rate": 3.2586383553703723e-06, + "loss": 0.5169, + "step": 1310 + }, + { + "epoch": 0.6337926033357505, + "grad_norm": 0.4335972937122555, + "learning_rate": 3.2511471789619274e-06, + "loss": 0.5084, + "step": 1311 + }, + { + "epoch": 0.6342760454435581, + "grad_norm": 0.42465524204681926, + "learning_rate": 3.2436604727622447e-06, + "loss": 0.5126, + "step": 1312 + }, + { + "epoch": 0.6347594875513657, + "grad_norm": 0.4075010185296818, + "learning_rate": 3.2361782559079465e-06, + "loss": 0.5158, + "step": 1313 + }, + { + "epoch": 0.6352429296591733, + "grad_norm": 0.4308541178278502, + "learning_rate": 3.228700547524184e-06, + "loss": 0.5145, + "step": 1314 + }, + { + "epoch": 0.6357263717669809, + "grad_norm": 0.428226473042128, + "learning_rate": 3.221227366724581e-06, + "loss": 0.5146, + "step": 1315 + }, + { + "epoch": 0.6362098138747885, + "grad_norm": 0.4299253923140349, + "learning_rate": 3.2137587326111896e-06, + "loss": 0.5207, + "step": 1316 + }, + { + "epoch": 0.6366932559825961, + "grad_norm": 0.4181473728080887, + "learning_rate": 3.206294664274443e-06, + "loss": 0.5268, + "step": 1317 + }, + { + "epoch": 0.6371766980904037, + "grad_norm": 0.4474913351660176, + "learning_rate": 3.198835180793097e-06, + "loss": 0.5277, + "step": 1318 + }, + { + "epoch": 0.6376601401982113, + "grad_norm": 0.432676302059824, + "learning_rate": 3.1913803012341987e-06, + "loss": 0.5195, + "step": 1319 + }, + { + "epoch": 0.6381435823060189, + "grad_norm": 0.44131963361999216, + "learning_rate": 3.183930044653014e-06, + "loss": 0.5157, + "step": 1320 + }, + { + "epoch": 0.6386270244138265, + "grad_norm": 0.4300971230394046, + "learning_rate": 3.176484430093007e-06, + "loss": 0.5312, + "step": 1321 + }, + { + "epoch": 0.6391104665216341, + "grad_norm": 0.44998544162450493, + "learning_rate": 3.1690434765857604e-06, + "loss": 0.523, + "step": 1322 + }, + { + "epoch": 0.6395939086294417, + "grad_norm": 0.4472778564601584, + "learning_rate": 3.1616072031509594e-06, + "loss": 0.5155, + "step": 1323 + }, + { + "epoch": 0.6400773507372493, + "grad_norm": 0.4278886331070072, + "learning_rate": 3.154175628796311e-06, + "loss": 0.5214, + "step": 1324 + }, + { + "epoch": 0.6405607928450568, + "grad_norm": 0.4354238172365612, + "learning_rate": 3.146748772517523e-06, + "loss": 0.5202, + "step": 1325 + }, + { + "epoch": 0.6410442349528644, + "grad_norm": 0.48506314375368076, + "learning_rate": 3.139326653298236e-06, + "loss": 0.5226, + "step": 1326 + }, + { + "epoch": 0.6415276770606719, + "grad_norm": 0.42689588677759766, + "learning_rate": 3.1319092901099847e-06, + "loss": 0.5192, + "step": 1327 + }, + { + "epoch": 0.6420111191684795, + "grad_norm": 0.44087925863432936, + "learning_rate": 3.1244967019121496e-06, + "loss": 0.5127, + "step": 1328 + }, + { + "epoch": 0.6424945612762871, + "grad_norm": 0.42605880529211515, + "learning_rate": 3.117088907651902e-06, + "loss": 0.5176, + "step": 1329 + }, + { + "epoch": 0.6429780033840947, + "grad_norm": 0.42151112378627853, + "learning_rate": 3.109685926264161e-06, + "loss": 0.5165, + "step": 1330 + }, + { + "epoch": 0.6434614454919023, + "grad_norm": 0.4268979096606091, + "learning_rate": 3.102287776671544e-06, + "loss": 0.5339, + "step": 1331 + }, + { + "epoch": 0.6439448875997099, + "grad_norm": 0.4392831430463705, + "learning_rate": 3.094894477784318e-06, + "loss": 0.5166, + "step": 1332 + }, + { + "epoch": 0.6444283297075175, + "grad_norm": 0.4146076328411298, + "learning_rate": 3.0875060485003496e-06, + "loss": 0.5274, + "step": 1333 + }, + { + "epoch": 0.6449117718153251, + "grad_norm": 0.42157017978455763, + "learning_rate": 3.080122507705062e-06, + "loss": 0.5243, + "step": 1334 + }, + { + "epoch": 0.6453952139231327, + "grad_norm": 0.41448886842899935, + "learning_rate": 3.0727438742713766e-06, + "loss": 0.4982, + "step": 1335 + }, + { + "epoch": 0.6458786560309403, + "grad_norm": 0.4311221980804021, + "learning_rate": 3.0653701670596805e-06, + "loss": 0.5222, + "step": 1336 + }, + { + "epoch": 0.6463620981387479, + "grad_norm": 0.42860321112689415, + "learning_rate": 3.0580014049177566e-06, + "loss": 0.5203, + "step": 1337 + }, + { + "epoch": 0.6468455402465555, + "grad_norm": 0.42363404336164073, + "learning_rate": 3.0506376066807632e-06, + "loss": 0.5131, + "step": 1338 + }, + { + "epoch": 0.6473289823543631, + "grad_norm": 0.4352925336415172, + "learning_rate": 3.0432787911711553e-06, + "loss": 0.5244, + "step": 1339 + }, + { + "epoch": 0.6478124244621707, + "grad_norm": 0.4213888881156135, + "learning_rate": 3.0359249771986605e-06, + "loss": 0.5114, + "step": 1340 + }, + { + "epoch": 0.6482958665699783, + "grad_norm": 0.44310700039649703, + "learning_rate": 3.028576183560221e-06, + "loss": 0.5345, + "step": 1341 + }, + { + "epoch": 0.6487793086777859, + "grad_norm": 0.46384351518694394, + "learning_rate": 3.021232429039944e-06, + "loss": 0.5103, + "step": 1342 + }, + { + "epoch": 0.6492627507855935, + "grad_norm": 0.420074285156663, + "learning_rate": 3.01389373240906e-06, + "loss": 0.5307, + "step": 1343 + }, + { + "epoch": 0.649746192893401, + "grad_norm": 0.46717936599561755, + "learning_rate": 3.006560112425867e-06, + "loss": 0.5146, + "step": 1344 + }, + { + "epoch": 0.6502296350012086, + "grad_norm": 0.4411181920960548, + "learning_rate": 2.999231587835691e-06, + "loss": 0.5113, + "step": 1345 + }, + { + "epoch": 0.6507130771090162, + "grad_norm": 0.4353633086273451, + "learning_rate": 2.9919081773708293e-06, + "loss": 0.5195, + "step": 1346 + }, + { + "epoch": 0.6511965192168238, + "grad_norm": 0.4555577360086876, + "learning_rate": 2.9845898997505102e-06, + "loss": 0.5201, + "step": 1347 + }, + { + "epoch": 0.6516799613246314, + "grad_norm": 0.43234112941806857, + "learning_rate": 2.9772767736808406e-06, + "loss": 0.5194, + "step": 1348 + }, + { + "epoch": 0.652163403432439, + "grad_norm": 0.43499867909496204, + "learning_rate": 2.9699688178547615e-06, + "loss": 0.5252, + "step": 1349 + }, + { + "epoch": 0.6526468455402465, + "grad_norm": 0.4243771527145887, + "learning_rate": 2.962666050951997e-06, + "loss": 0.5122, + "step": 1350 + }, + { + "epoch": 0.6531302876480541, + "grad_norm": 0.4158873835022681, + "learning_rate": 2.9553684916390053e-06, + "loss": 0.5092, + "step": 1351 + }, + { + "epoch": 0.6536137297558617, + "grad_norm": 0.4494940146550669, + "learning_rate": 2.948076158568939e-06, + "loss": 0.5256, + "step": 1352 + }, + { + "epoch": 0.6540971718636693, + "grad_norm": 0.39860657725442444, + "learning_rate": 2.940789070381587e-06, + "loss": 0.5001, + "step": 1353 + }, + { + "epoch": 0.6545806139714769, + "grad_norm": 0.41241371956595596, + "learning_rate": 2.933507245703335e-06, + "loss": 0.5174, + "step": 1354 + }, + { + "epoch": 0.6550640560792845, + "grad_norm": 0.41370768103275585, + "learning_rate": 2.9262307031471132e-06, + "loss": 0.5142, + "step": 1355 + }, + { + "epoch": 0.6555474981870921, + "grad_norm": 0.4457235721009106, + "learning_rate": 2.918959461312353e-06, + "loss": 0.5212, + "step": 1356 + }, + { + "epoch": 0.6560309402948997, + "grad_norm": 0.44189033261031596, + "learning_rate": 2.911693538784931e-06, + "loss": 0.5315, + "step": 1357 + }, + { + "epoch": 0.6565143824027073, + "grad_norm": 0.417441027459776, + "learning_rate": 2.904432954137136e-06, + "loss": 0.5197, + "step": 1358 + }, + { + "epoch": 0.6569978245105149, + "grad_norm": 0.41446617019595194, + "learning_rate": 2.897177725927599e-06, + "loss": 0.4977, + "step": 1359 + }, + { + "epoch": 0.6574812666183224, + "grad_norm": 0.424230216094722, + "learning_rate": 2.889927872701278e-06, + "loss": 0.5319, + "step": 1360 + }, + { + "epoch": 0.65796470872613, + "grad_norm": 0.42985260891150956, + "learning_rate": 2.8826834129893755e-06, + "loss": 0.5166, + "step": 1361 + }, + { + "epoch": 0.6584481508339376, + "grad_norm": 0.38447115699767576, + "learning_rate": 2.8754443653093186e-06, + "loss": 0.4786, + "step": 1362 + }, + { + "epoch": 0.6589315929417452, + "grad_norm": 0.4044201835297723, + "learning_rate": 2.8682107481646915e-06, + "loss": 0.5216, + "step": 1363 + }, + { + "epoch": 0.6594150350495528, + "grad_norm": 0.4022645350617216, + "learning_rate": 2.8609825800452063e-06, + "loss": 0.4988, + "step": 1364 + }, + { + "epoch": 0.6598984771573604, + "grad_norm": 0.4401229027571195, + "learning_rate": 2.853759879426644e-06, + "loss": 0.5181, + "step": 1365 + }, + { + "epoch": 0.660381919265168, + "grad_norm": 0.42349548120906483, + "learning_rate": 2.8465426647708067e-06, + "loss": 0.5163, + "step": 1366 + }, + { + "epoch": 0.6608653613729756, + "grad_norm": 0.40013997451662586, + "learning_rate": 2.8393309545254776e-06, + "loss": 0.5214, + "step": 1367 + }, + { + "epoch": 0.6613488034807832, + "grad_norm": 0.4359244125864156, + "learning_rate": 2.8321247671243695e-06, + "loss": 0.5179, + "step": 1368 + }, + { + "epoch": 0.6618322455885908, + "grad_norm": 0.41425895295471055, + "learning_rate": 2.82492412098708e-06, + "loss": 0.5081, + "step": 1369 + }, + { + "epoch": 0.6623156876963984, + "grad_norm": 0.4210065663342879, + "learning_rate": 2.8177290345190387e-06, + "loss": 0.5194, + "step": 1370 + }, + { + "epoch": 0.662799129804206, + "grad_norm": 0.4028980901393777, + "learning_rate": 2.8105395261114666e-06, + "loss": 0.5234, + "step": 1371 + }, + { + "epoch": 0.6632825719120136, + "grad_norm": 0.4325922757476261, + "learning_rate": 2.803355614141327e-06, + "loss": 0.5188, + "step": 1372 + }, + { + "epoch": 0.6637660140198212, + "grad_norm": 0.4308186918740408, + "learning_rate": 2.7961773169712803e-06, + "loss": 0.5125, + "step": 1373 + }, + { + "epoch": 0.6642494561276288, + "grad_norm": 0.4211885259856405, + "learning_rate": 2.7890046529496284e-06, + "loss": 0.5233, + "step": 1374 + }, + { + "epoch": 0.6647328982354364, + "grad_norm": 0.4304676159038956, + "learning_rate": 2.7818376404102832e-06, + "loss": 0.5188, + "step": 1375 + }, + { + "epoch": 0.665216340343244, + "grad_norm": 0.4137521174014562, + "learning_rate": 2.774676297672701e-06, + "loss": 0.5248, + "step": 1376 + }, + { + "epoch": 0.6656997824510514, + "grad_norm": 0.4389331875357886, + "learning_rate": 2.7675206430418542e-06, + "loss": 0.5265, + "step": 1377 + }, + { + "epoch": 0.666183224558859, + "grad_norm": 0.46429330512304384, + "learning_rate": 2.7603706948081745e-06, + "loss": 0.5211, + "step": 1378 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.4260734411731187, + "learning_rate": 2.753226471247501e-06, + "loss": 0.517, + "step": 1379 + }, + { + "epoch": 0.6671501087744742, + "grad_norm": 0.4189810127916622, + "learning_rate": 2.7460879906210485e-06, + "loss": 0.5107, + "step": 1380 + }, + { + "epoch": 0.6676335508822818, + "grad_norm": 0.41204910620329505, + "learning_rate": 2.7389552711753477e-06, + "loss": 0.5191, + "step": 1381 + }, + { + "epoch": 0.6681169929900894, + "grad_norm": 0.4267680612975131, + "learning_rate": 2.731828331142207e-06, + "loss": 0.5128, + "step": 1382 + }, + { + "epoch": 0.668600435097897, + "grad_norm": 0.42901984315752384, + "learning_rate": 2.7247071887386544e-06, + "loss": 0.5257, + "step": 1383 + }, + { + "epoch": 0.6690838772057046, + "grad_norm": 0.4146728225846163, + "learning_rate": 2.7175918621669074e-06, + "loss": 0.5184, + "step": 1384 + }, + { + "epoch": 0.6695673193135122, + "grad_norm": 0.4782489091382579, + "learning_rate": 2.7104823696143136e-06, + "loss": 0.5298, + "step": 1385 + }, + { + "epoch": 0.6700507614213198, + "grad_norm": 0.40703765978893935, + "learning_rate": 2.70337872925331e-06, + "loss": 0.5111, + "step": 1386 + }, + { + "epoch": 0.6705342035291274, + "grad_norm": 0.4109547447766556, + "learning_rate": 2.6962809592413726e-06, + "loss": 0.5002, + "step": 1387 + }, + { + "epoch": 0.671017645636935, + "grad_norm": 0.4498968198632276, + "learning_rate": 2.6891890777209696e-06, + "loss": 0.5256, + "step": 1388 + }, + { + "epoch": 0.6715010877447426, + "grad_norm": 0.42208190857564254, + "learning_rate": 2.68210310281953e-06, + "loss": 0.5193, + "step": 1389 + }, + { + "epoch": 0.6719845298525502, + "grad_norm": 0.41822528698390377, + "learning_rate": 2.67502305264937e-06, + "loss": 0.5163, + "step": 1390 + }, + { + "epoch": 0.6724679719603578, + "grad_norm": 0.4218034674050614, + "learning_rate": 2.667948945307674e-06, + "loss": 0.5174, + "step": 1391 + }, + { + "epoch": 0.6729514140681654, + "grad_norm": 0.4206471334382422, + "learning_rate": 2.6608807988764252e-06, + "loss": 0.4936, + "step": 1392 + }, + { + "epoch": 0.6734348561759729, + "grad_norm": 0.42181885072694014, + "learning_rate": 2.653818631422378e-06, + "loss": 0.5138, + "step": 1393 + }, + { + "epoch": 0.6739182982837805, + "grad_norm": 0.4261589725068296, + "learning_rate": 2.6467624609970005e-06, + "loss": 0.5145, + "step": 1394 + }, + { + "epoch": 0.6744017403915881, + "grad_norm": 0.40519700853309554, + "learning_rate": 2.6397123056364364e-06, + "loss": 0.5013, + "step": 1395 + }, + { + "epoch": 0.6748851824993957, + "grad_norm": 0.4510436140721377, + "learning_rate": 2.6326681833614464e-06, + "loss": 0.5184, + "step": 1396 + }, + { + "epoch": 0.6753686246072033, + "grad_norm": 0.45003681113297744, + "learning_rate": 2.6256301121773775e-06, + "loss": 0.5149, + "step": 1397 + }, + { + "epoch": 0.6758520667150109, + "grad_norm": 0.4263810181960221, + "learning_rate": 2.618598110074105e-06, + "loss": 0.5115, + "step": 1398 + }, + { + "epoch": 0.6763355088228185, + "grad_norm": 0.4263005297393967, + "learning_rate": 2.6115721950259977e-06, + "loss": 0.5243, + "step": 1399 + }, + { + "epoch": 0.676818950930626, + "grad_norm": 0.4563967376255983, + "learning_rate": 2.6045523849918553e-06, + "loss": 0.5314, + "step": 1400 + }, + { + "epoch": 0.6773023930384336, + "grad_norm": 0.4478593986013541, + "learning_rate": 2.5975386979148792e-06, + "loss": 0.5179, + "step": 1401 + }, + { + "epoch": 0.6777858351462412, + "grad_norm": 0.40707261007936574, + "learning_rate": 2.590531151722622e-06, + "loss": 0.5165, + "step": 1402 + }, + { + "epoch": 0.6782692772540488, + "grad_norm": 0.45689999209163507, + "learning_rate": 2.5835297643269326e-06, + "loss": 0.5212, + "step": 1403 + }, + { + "epoch": 0.6787527193618564, + "grad_norm": 0.41521250872284, + "learning_rate": 2.576534553623925e-06, + "loss": 0.5197, + "step": 1404 + }, + { + "epoch": 0.679236161469664, + "grad_norm": 0.3969654860159799, + "learning_rate": 2.5695455374939147e-06, + "loss": 0.4939, + "step": 1405 + }, + { + "epoch": 0.6797196035774716, + "grad_norm": 0.4115250925249713, + "learning_rate": 2.5625627338014004e-06, + "loss": 0.5242, + "step": 1406 + }, + { + "epoch": 0.6802030456852792, + "grad_norm": 0.4253454941567133, + "learning_rate": 2.5555861603949832e-06, + "loss": 0.513, + "step": 1407 + }, + { + "epoch": 0.6806864877930868, + "grad_norm": 0.4448844424181978, + "learning_rate": 2.548615835107352e-06, + "loss": 0.5047, + "step": 1408 + }, + { + "epoch": 0.6811699299008944, + "grad_norm": 0.41222858577096244, + "learning_rate": 2.5416517757552157e-06, + "loss": 0.5286, + "step": 1409 + }, + { + "epoch": 0.6816533720087019, + "grad_norm": 0.42285086542458045, + "learning_rate": 2.534694000139273e-06, + "loss": 0.5169, + "step": 1410 + }, + { + "epoch": 0.6821368141165095, + "grad_norm": 0.4122433378845125, + "learning_rate": 2.5277425260441616e-06, + "loss": 0.515, + "step": 1411 + }, + { + "epoch": 0.6826202562243171, + "grad_norm": 0.4362061175188878, + "learning_rate": 2.520797371238406e-06, + "loss": 0.5225, + "step": 1412 + }, + { + "epoch": 0.6831036983321247, + "grad_norm": 0.4411789430289944, + "learning_rate": 2.513858553474382e-06, + "loss": 0.5191, + "step": 1413 + }, + { + "epoch": 0.6835871404399323, + "grad_norm": 0.4415744443134195, + "learning_rate": 2.506926090488269e-06, + "loss": 0.5306, + "step": 1414 + }, + { + "epoch": 0.6840705825477399, + "grad_norm": 0.4477316137829116, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.5248, + "step": 1415 + }, + { + "epoch": 0.6845540246555475, + "grad_norm": 0.41099572818531255, + "learning_rate": 2.4930802997132213e-06, + "loss": 0.5218, + "step": 1416 + }, + { + "epoch": 0.6850374667633551, + "grad_norm": 0.4009913145578469, + "learning_rate": 2.486167007315243e-06, + "loss": 0.5189, + "step": 1417 + }, + { + "epoch": 0.6855209088711627, + "grad_norm": 0.43880257019064667, + "learning_rate": 2.479260140476999e-06, + "loss": 0.5114, + "step": 1418 + }, + { + "epoch": 0.6860043509789703, + "grad_norm": 0.4322007294880164, + "learning_rate": 2.4723597168529984e-06, + "loss": 0.5066, + "step": 1419 + }, + { + "epoch": 0.6864877930867779, + "grad_norm": 0.4106120224272021, + "learning_rate": 2.465465754081277e-06, + "loss": 0.4888, + "step": 1420 + }, + { + "epoch": 0.6869712351945855, + "grad_norm": 0.422067985874925, + "learning_rate": 2.458578269783364e-06, + "loss": 0.5155, + "step": 1421 + }, + { + "epoch": 0.6874546773023931, + "grad_norm": 0.4136266956566046, + "learning_rate": 2.4516972815642166e-06, + "loss": 0.5143, + "step": 1422 + }, + { + "epoch": 0.6879381194102007, + "grad_norm": 0.4335536983962682, + "learning_rate": 2.444822807012204e-06, + "loss": 0.5196, + "step": 1423 + }, + { + "epoch": 0.6884215615180083, + "grad_norm": 0.42723749184962806, + "learning_rate": 2.4379548636990343e-06, + "loss": 0.5136, + "step": 1424 + }, + { + "epoch": 0.6889050036258159, + "grad_norm": 0.4307011628135296, + "learning_rate": 2.4310934691797207e-06, + "loss": 0.5305, + "step": 1425 + }, + { + "epoch": 0.6893884457336233, + "grad_norm": 0.45161428649005025, + "learning_rate": 2.4242386409925435e-06, + "loss": 0.5048, + "step": 1426 + }, + { + "epoch": 0.6898718878414309, + "grad_norm": 0.4351186095813856, + "learning_rate": 2.4173903966589957e-06, + "loss": 0.5216, + "step": 1427 + }, + { + "epoch": 0.6903553299492385, + "grad_norm": 0.4128958039987362, + "learning_rate": 2.410548753683743e-06, + "loss": 0.5206, + "step": 1428 + }, + { + "epoch": 0.6908387720570461, + "grad_norm": 0.4185374425485222, + "learning_rate": 2.4037137295545737e-06, + "loss": 0.5205, + "step": 1429 + }, + { + "epoch": 0.6913222141648537, + "grad_norm": 0.4256083734187945, + "learning_rate": 2.396885341742361e-06, + "loss": 0.4804, + "step": 1430 + }, + { + "epoch": 0.6918056562726613, + "grad_norm": 0.411514639053229, + "learning_rate": 2.390063607701016e-06, + "loss": 0.5194, + "step": 1431 + }, + { + "epoch": 0.6922890983804689, + "grad_norm": 0.43241514860902464, + "learning_rate": 2.3832485448674407e-06, + "loss": 0.53, + "step": 1432 + }, + { + "epoch": 0.6927725404882765, + "grad_norm": 0.4291596725507727, + "learning_rate": 2.3764401706614832e-06, + "loss": 0.5144, + "step": 1433 + }, + { + "epoch": 0.6932559825960841, + "grad_norm": 0.42041788788695633, + "learning_rate": 2.369638502485897e-06, + "loss": 0.5148, + "step": 1434 + }, + { + "epoch": 0.6937394247038917, + "grad_norm": 0.4482987713314786, + "learning_rate": 2.3628435577262947e-06, + "loss": 0.5191, + "step": 1435 + }, + { + "epoch": 0.6942228668116993, + "grad_norm": 0.42573448798758273, + "learning_rate": 2.3560553537511043e-06, + "loss": 0.5021, + "step": 1436 + }, + { + "epoch": 0.6947063089195069, + "grad_norm": 0.41739963072931596, + "learning_rate": 2.3492739079115214e-06, + "loss": 0.5061, + "step": 1437 + }, + { + "epoch": 0.6951897510273145, + "grad_norm": 0.4366261411331466, + "learning_rate": 2.3424992375414655e-06, + "loss": 0.5133, + "step": 1438 + }, + { + "epoch": 0.6956731931351221, + "grad_norm": 0.42225675860612266, + "learning_rate": 2.3357313599575422e-06, + "loss": 0.5254, + "step": 1439 + }, + { + "epoch": 0.6961566352429297, + "grad_norm": 0.4347650420428982, + "learning_rate": 2.3289702924589914e-06, + "loss": 0.5143, + "step": 1440 + }, + { + "epoch": 0.6966400773507373, + "grad_norm": 0.4220266027824235, + "learning_rate": 2.3222160523276486e-06, + "loss": 0.5194, + "step": 1441 + }, + { + "epoch": 0.6971235194585449, + "grad_norm": 0.400495176856287, + "learning_rate": 2.3154686568278933e-06, + "loss": 0.5315, + "step": 1442 + }, + { + "epoch": 0.6976069615663524, + "grad_norm": 0.4149083634198192, + "learning_rate": 2.3087281232066134e-06, + "loss": 0.5109, + "step": 1443 + }, + { + "epoch": 0.69809040367416, + "grad_norm": 0.43831779922906355, + "learning_rate": 2.3019944686931554e-06, + "loss": 0.5256, + "step": 1444 + }, + { + "epoch": 0.6985738457819676, + "grad_norm": 0.4379300687242213, + "learning_rate": 2.2952677104992855e-06, + "loss": 0.5287, + "step": 1445 + }, + { + "epoch": 0.6990572878897752, + "grad_norm": 0.43973213205463885, + "learning_rate": 2.2885478658191364e-06, + "loss": 0.5192, + "step": 1446 + }, + { + "epoch": 0.6995407299975828, + "grad_norm": 0.42002084857343974, + "learning_rate": 2.281834951829174e-06, + "loss": 0.521, + "step": 1447 + }, + { + "epoch": 0.7000241721053904, + "grad_norm": 0.38595076036167364, + "learning_rate": 2.2751289856881487e-06, + "loss": 0.4869, + "step": 1448 + }, + { + "epoch": 0.700507614213198, + "grad_norm": 0.436647846778714, + "learning_rate": 2.268429984537048e-06, + "loss": 0.5216, + "step": 1449 + }, + { + "epoch": 0.7009910563210056, + "grad_norm": 0.4140253730185284, + "learning_rate": 2.2617379654990623e-06, + "loss": 0.5165, + "step": 1450 + }, + { + "epoch": 0.7014744984288132, + "grad_norm": 0.4644944125638521, + "learning_rate": 2.255052945679525e-06, + "loss": 0.5183, + "step": 1451 + }, + { + "epoch": 0.7019579405366208, + "grad_norm": 0.41536119938345195, + "learning_rate": 2.248374942165894e-06, + "loss": 0.5231, + "step": 1452 + }, + { + "epoch": 0.7024413826444283, + "grad_norm": 0.4012349549582878, + "learning_rate": 2.241703972027679e-06, + "loss": 0.5168, + "step": 1453 + }, + { + "epoch": 0.7029248247522359, + "grad_norm": 0.4521292215779327, + "learning_rate": 2.23504005231642e-06, + "loss": 0.5158, + "step": 1454 + }, + { + "epoch": 0.7034082668600435, + "grad_norm": 0.4172271643387044, + "learning_rate": 2.2283832000656304e-06, + "loss": 0.4941, + "step": 1455 + }, + { + "epoch": 0.7038917089678511, + "grad_norm": 0.421958406666486, + "learning_rate": 2.221733432290762e-06, + "loss": 0.5209, + "step": 1456 + }, + { + "epoch": 0.7043751510756587, + "grad_norm": 0.42224698163781604, + "learning_rate": 2.2150907659891566e-06, + "loss": 0.5173, + "step": 1457 + }, + { + "epoch": 0.7048585931834663, + "grad_norm": 0.43523243642666853, + "learning_rate": 2.2084552181400087e-06, + "loss": 0.5186, + "step": 1458 + }, + { + "epoch": 0.7053420352912738, + "grad_norm": 0.4437233504227722, + "learning_rate": 2.201826805704308e-06, + "loss": 0.5125, + "step": 1459 + }, + { + "epoch": 0.7058254773990814, + "grad_norm": 0.42532048824174346, + "learning_rate": 2.195205545624813e-06, + "loss": 0.5243, + "step": 1460 + }, + { + "epoch": 0.706308919506889, + "grad_norm": 0.4322950043512432, + "learning_rate": 2.188591454826e-06, + "loss": 0.5135, + "step": 1461 + }, + { + "epoch": 0.7067923616146966, + "grad_norm": 0.4272575345234204, + "learning_rate": 2.181984550214015e-06, + "loss": 0.5116, + "step": 1462 + }, + { + "epoch": 0.7072758037225042, + "grad_norm": 0.41921770884395154, + "learning_rate": 2.175384848676639e-06, + "loss": 0.5165, + "step": 1463 + }, + { + "epoch": 0.7077592458303118, + "grad_norm": 0.43176187181049736, + "learning_rate": 2.168792367083243e-06, + "loss": 0.5138, + "step": 1464 + }, + { + "epoch": 0.7082426879381194, + "grad_norm": 0.41695232513283254, + "learning_rate": 2.162207122284742e-06, + "loss": 0.5091, + "step": 1465 + }, + { + "epoch": 0.708726130045927, + "grad_norm": 0.41339935320490057, + "learning_rate": 2.155629131113549e-06, + "loss": 0.5158, + "step": 1466 + }, + { + "epoch": 0.7092095721537346, + "grad_norm": 0.40689486411834114, + "learning_rate": 2.1490584103835433e-06, + "loss": 0.4847, + "step": 1467 + }, + { + "epoch": 0.7096930142615422, + "grad_norm": 0.417060588337446, + "learning_rate": 2.142494976890011e-06, + "loss": 0.5241, + "step": 1468 + }, + { + "epoch": 0.7101764563693498, + "grad_norm": 0.4289677663647557, + "learning_rate": 2.135938847409625e-06, + "loss": 0.5206, + "step": 1469 + }, + { + "epoch": 0.7106598984771574, + "grad_norm": 0.43410470718447147, + "learning_rate": 2.1293900387003742e-06, + "loss": 0.4931, + "step": 1470 + }, + { + "epoch": 0.711143340584965, + "grad_norm": 0.42958196993128944, + "learning_rate": 2.1228485675015455e-06, + "loss": 0.5204, + "step": 1471 + }, + { + "epoch": 0.7116267826927726, + "grad_norm": 0.4311771692424152, + "learning_rate": 2.1163144505336634e-06, + "loss": 0.5219, + "step": 1472 + }, + { + "epoch": 0.7121102248005802, + "grad_norm": 0.4150104118521869, + "learning_rate": 2.109787704498459e-06, + "loss": 0.519, + "step": 1473 + }, + { + "epoch": 0.7125936669083878, + "grad_norm": 0.43013467795196153, + "learning_rate": 2.1032683460788223e-06, + "loss": 0.4979, + "step": 1474 + }, + { + "epoch": 0.7130771090161954, + "grad_norm": 0.4303795815833922, + "learning_rate": 2.0967563919387563e-06, + "loss": 0.5256, + "step": 1475 + }, + { + "epoch": 0.7135605511240029, + "grad_norm": 0.4386538663824397, + "learning_rate": 2.0902518587233418e-06, + "loss": 0.5195, + "step": 1476 + }, + { + "epoch": 0.7140439932318104, + "grad_norm": 0.41141211228553354, + "learning_rate": 2.08375476305869e-06, + "loss": 0.5238, + "step": 1477 + }, + { + "epoch": 0.714527435339618, + "grad_norm": 0.3832973623968104, + "learning_rate": 2.077265121551903e-06, + "loss": 0.4914, + "step": 1478 + }, + { + "epoch": 0.7150108774474256, + "grad_norm": 0.4396380345403612, + "learning_rate": 2.0707829507910237e-06, + "loss": 0.5224, + "step": 1479 + }, + { + "epoch": 0.7154943195552332, + "grad_norm": 0.4084969868928133, + "learning_rate": 2.0643082673450053e-06, + "loss": 0.5214, + "step": 1480 + }, + { + "epoch": 0.7159777616630408, + "grad_norm": 0.41940449704789057, + "learning_rate": 2.05784108776366e-06, + "loss": 0.5098, + "step": 1481 + }, + { + "epoch": 0.7164612037708484, + "grad_norm": 0.4368606150106444, + "learning_rate": 2.051381428577622e-06, + "loss": 0.5213, + "step": 1482 + }, + { + "epoch": 0.716944645878656, + "grad_norm": 0.4475169176125263, + "learning_rate": 2.044929306298298e-06, + "loss": 0.5169, + "step": 1483 + }, + { + "epoch": 0.7174280879864636, + "grad_norm": 0.4192404761939798, + "learning_rate": 2.0384847374178346e-06, + "loss": 0.5214, + "step": 1484 + }, + { + "epoch": 0.7179115300942712, + "grad_norm": 0.4000794067095613, + "learning_rate": 2.0320477384090665e-06, + "loss": 0.5002, + "step": 1485 + }, + { + "epoch": 0.7183949722020788, + "grad_norm": 0.4083964682274076, + "learning_rate": 2.0256183257254837e-06, + "loss": 0.5057, + "step": 1486 + }, + { + "epoch": 0.7188784143098864, + "grad_norm": 0.4286205023949667, + "learning_rate": 2.0191965158011854e-06, + "loss": 0.4815, + "step": 1487 + }, + { + "epoch": 0.719361856417694, + "grad_norm": 0.40907099979637535, + "learning_rate": 2.012782325050831e-06, + "loss": 0.5283, + "step": 1488 + }, + { + "epoch": 0.7198452985255016, + "grad_norm": 0.41946463733283473, + "learning_rate": 2.006375769869611e-06, + "loss": 0.522, + "step": 1489 + }, + { + "epoch": 0.7203287406333092, + "grad_norm": 0.4222854300641897, + "learning_rate": 1.9999768666331974e-06, + "loss": 0.5132, + "step": 1490 + }, + { + "epoch": 0.7208121827411168, + "grad_norm": 0.3830302288103666, + "learning_rate": 1.9935856316977044e-06, + "loss": 0.4938, + "step": 1491 + }, + { + "epoch": 0.7212956248489243, + "grad_norm": 0.47757660690611003, + "learning_rate": 1.987202081399639e-06, + "loss": 0.5251, + "step": 1492 + }, + { + "epoch": 0.7217790669567319, + "grad_norm": 0.3992903621119011, + "learning_rate": 1.9808262320558724e-06, + "loss": 0.506, + "step": 1493 + }, + { + "epoch": 0.7222625090645395, + "grad_norm": 0.41142424465140587, + "learning_rate": 1.9744580999635902e-06, + "loss": 0.5143, + "step": 1494 + }, + { + "epoch": 0.7227459511723471, + "grad_norm": 0.4124129943865437, + "learning_rate": 1.968097701400252e-06, + "loss": 0.5245, + "step": 1495 + }, + { + "epoch": 0.7232293932801547, + "grad_norm": 0.4312737875038871, + "learning_rate": 1.9617450526235464e-06, + "loss": 0.5178, + "step": 1496 + }, + { + "epoch": 0.7237128353879623, + "grad_norm": 0.43509903197162936, + "learning_rate": 1.9554001698713572e-06, + "loss": 0.5131, + "step": 1497 + }, + { + "epoch": 0.7241962774957699, + "grad_norm": 0.4260008705271214, + "learning_rate": 1.949063069361717e-06, + "loss": 0.5136, + "step": 1498 + }, + { + "epoch": 0.7246797196035775, + "grad_norm": 0.42356802738060345, + "learning_rate": 1.9427337672927632e-06, + "loss": 0.5146, + "step": 1499 + }, + { + "epoch": 0.7251631617113851, + "grad_norm": 0.4027997963462275, + "learning_rate": 1.936412279842705e-06, + "loss": 0.4913, + "step": 1500 + }, + { + "epoch": 0.7256466038191927, + "grad_norm": 0.4124397793510055, + "learning_rate": 1.9300986231697705e-06, + "loss": 0.5175, + "step": 1501 + }, + { + "epoch": 0.7261300459270003, + "grad_norm": 0.4442811918906246, + "learning_rate": 1.9237928134121757e-06, + "loss": 0.516, + "step": 1502 + }, + { + "epoch": 0.7266134880348079, + "grad_norm": 0.4393627100062481, + "learning_rate": 1.9174948666880805e-06, + "loss": 0.5155, + "step": 1503 + }, + { + "epoch": 0.7270969301426154, + "grad_norm": 0.43133527501756386, + "learning_rate": 1.9112047990955446e-06, + "loss": 0.5136, + "step": 1504 + }, + { + "epoch": 0.727580372250423, + "grad_norm": 0.45322135855021595, + "learning_rate": 1.9049226267124844e-06, + "loss": 0.5172, + "step": 1505 + }, + { + "epoch": 0.7280638143582306, + "grad_norm": 0.41078461158260915, + "learning_rate": 1.8986483655966408e-06, + "loss": 0.5179, + "step": 1506 + }, + { + "epoch": 0.7285472564660382, + "grad_norm": 0.4178604053793329, + "learning_rate": 1.8923820317855307e-06, + "loss": 0.5076, + "step": 1507 + }, + { + "epoch": 0.7290306985738458, + "grad_norm": 0.42623268157040256, + "learning_rate": 1.8861236412964106e-06, + "loss": 0.5172, + "step": 1508 + }, + { + "epoch": 0.7295141406816533, + "grad_norm": 0.42835046843347674, + "learning_rate": 1.879873210126229e-06, + "loss": 0.5259, + "step": 1509 + }, + { + "epoch": 0.7299975827894609, + "grad_norm": 0.4196504177616674, + "learning_rate": 1.873630754251588e-06, + "loss": 0.5177, + "step": 1510 + }, + { + "epoch": 0.7304810248972685, + "grad_norm": 0.4079644120305993, + "learning_rate": 1.8673962896287152e-06, + "loss": 0.5201, + "step": 1511 + }, + { + "epoch": 0.7309644670050761, + "grad_norm": 0.43892341061011425, + "learning_rate": 1.8611698321933991e-06, + "loss": 0.5186, + "step": 1512 + }, + { + "epoch": 0.7314479091128837, + "grad_norm": 0.42683430911112086, + "learning_rate": 1.8549513978609707e-06, + "loss": 0.5111, + "step": 1513 + }, + { + "epoch": 0.7319313512206913, + "grad_norm": 0.41062878136002484, + "learning_rate": 1.8487410025262436e-06, + "loss": 0.5103, + "step": 1514 + }, + { + "epoch": 0.7324147933284989, + "grad_norm": 0.4256013874707191, + "learning_rate": 1.8425386620634961e-06, + "loss": 0.5167, + "step": 1515 + }, + { + "epoch": 0.7328982354363065, + "grad_norm": 0.4388797350675763, + "learning_rate": 1.8363443923264046e-06, + "loss": 0.5125, + "step": 1516 + }, + { + "epoch": 0.7333816775441141, + "grad_norm": 0.4394233254146738, + "learning_rate": 1.8301582091480264e-06, + "loss": 0.5217, + "step": 1517 + }, + { + "epoch": 0.7338651196519217, + "grad_norm": 0.41564422037394944, + "learning_rate": 1.8239801283407393e-06, + "loss": 0.5164, + "step": 1518 + }, + { + "epoch": 0.7343485617597293, + "grad_norm": 0.4173422643681329, + "learning_rate": 1.8178101656962188e-06, + "loss": 0.5205, + "step": 1519 + }, + { + "epoch": 0.7348320038675369, + "grad_norm": 0.39698118648442665, + "learning_rate": 1.8116483369853853e-06, + "loss": 0.4835, + "step": 1520 + }, + { + "epoch": 0.7353154459753445, + "grad_norm": 0.42300362992419904, + "learning_rate": 1.8054946579583732e-06, + "loss": 0.5143, + "step": 1521 + }, + { + "epoch": 0.7357988880831521, + "grad_norm": 0.42464469919772974, + "learning_rate": 1.7993491443444771e-06, + "loss": 0.5129, + "step": 1522 + }, + { + "epoch": 0.7362823301909597, + "grad_norm": 0.4501988280108448, + "learning_rate": 1.7932118118521274e-06, + "loss": 0.5131, + "step": 1523 + }, + { + "epoch": 0.7367657722987673, + "grad_norm": 0.41493548901611477, + "learning_rate": 1.787082676168842e-06, + "loss": 0.5268, + "step": 1524 + }, + { + "epoch": 0.7372492144065748, + "grad_norm": 0.4436917707906808, + "learning_rate": 1.7809617529611828e-06, + "loss": 0.5126, + "step": 1525 + }, + { + "epoch": 0.7377326565143824, + "grad_norm": 0.39767655781448813, + "learning_rate": 1.7748490578747257e-06, + "loss": 0.4945, + "step": 1526 + }, + { + "epoch": 0.73821609862219, + "grad_norm": 0.4281607415979641, + "learning_rate": 1.7687446065340074e-06, + "loss": 0.5189, + "step": 1527 + }, + { + "epoch": 0.7386995407299976, + "grad_norm": 0.4123906023331037, + "learning_rate": 1.7626484145425038e-06, + "loss": 0.5117, + "step": 1528 + }, + { + "epoch": 0.7391829828378051, + "grad_norm": 0.39861909677156787, + "learning_rate": 1.7565604974825678e-06, + "loss": 0.4917, + "step": 1529 + }, + { + "epoch": 0.7396664249456127, + "grad_norm": 0.4164290248459804, + "learning_rate": 1.7504808709154104e-06, + "loss": 0.5187, + "step": 1530 + }, + { + "epoch": 0.7401498670534203, + "grad_norm": 0.4149617264710624, + "learning_rate": 1.744409550381041e-06, + "loss": 0.529, + "step": 1531 + }, + { + "epoch": 0.7406333091612279, + "grad_norm": 0.402995768205116, + "learning_rate": 1.7383465513982517e-06, + "loss": 0.4906, + "step": 1532 + }, + { + "epoch": 0.7411167512690355, + "grad_norm": 0.4357911248878148, + "learning_rate": 1.7322918894645525e-06, + "loss": 0.5209, + "step": 1533 + }, + { + "epoch": 0.7416001933768431, + "grad_norm": 0.4310636351470309, + "learning_rate": 1.7262455800561456e-06, + "loss": 0.529, + "step": 1534 + }, + { + "epoch": 0.7420836354846507, + "grad_norm": 0.40110062198063573, + "learning_rate": 1.7202076386278876e-06, + "loss": 0.5218, + "step": 1535 + }, + { + "epoch": 0.7425670775924583, + "grad_norm": 0.4044655145984996, + "learning_rate": 1.7141780806132429e-06, + "loss": 0.5038, + "step": 1536 + }, + { + "epoch": 0.7430505197002659, + "grad_norm": 0.4169687562172726, + "learning_rate": 1.70815692142425e-06, + "loss": 0.5094, + "step": 1537 + }, + { + "epoch": 0.7435339618080735, + "grad_norm": 0.3892005945860465, + "learning_rate": 1.702144176451473e-06, + "loss": 0.4909, + "step": 1538 + }, + { + "epoch": 0.7440174039158811, + "grad_norm": 0.4059894671987348, + "learning_rate": 1.696139861063974e-06, + "loss": 0.5231, + "step": 1539 + }, + { + "epoch": 0.7445008460236887, + "grad_norm": 0.4235285224343199, + "learning_rate": 1.690143990609268e-06, + "loss": 0.5116, + "step": 1540 + }, + { + "epoch": 0.7449842881314963, + "grad_norm": 0.4066059462995061, + "learning_rate": 1.6841565804132843e-06, + "loss": 0.5159, + "step": 1541 + }, + { + "epoch": 0.7454677302393038, + "grad_norm": 0.41374792014057904, + "learning_rate": 1.6781776457803227e-06, + "loss": 0.5146, + "step": 1542 + }, + { + "epoch": 0.7459511723471114, + "grad_norm": 0.41330516594974576, + "learning_rate": 1.6722072019930242e-06, + "loss": 0.4841, + "step": 1543 + }, + { + "epoch": 0.746434614454919, + "grad_norm": 0.4342078760633199, + "learning_rate": 1.6662452643123234e-06, + "loss": 0.5181, + "step": 1544 + }, + { + "epoch": 0.7469180565627266, + "grad_norm": 0.4366803318877013, + "learning_rate": 1.660291847977415e-06, + "loss": 0.5056, + "step": 1545 + }, + { + "epoch": 0.7474014986705342, + "grad_norm": 0.4107968782550443, + "learning_rate": 1.6543469682057105e-06, + "loss": 0.5102, + "step": 1546 + }, + { + "epoch": 0.7478849407783418, + "grad_norm": 0.43703346533243426, + "learning_rate": 1.6484106401927991e-06, + "loss": 0.517, + "step": 1547 + }, + { + "epoch": 0.7483683828861494, + "grad_norm": 0.4185149815126949, + "learning_rate": 1.6424828791124159e-06, + "loss": 0.5162, + "step": 1548 + }, + { + "epoch": 0.748851824993957, + "grad_norm": 0.3941815905233016, + "learning_rate": 1.6365637001163958e-06, + "loss": 0.4694, + "step": 1549 + }, + { + "epoch": 0.7493352671017646, + "grad_norm": 0.4069386532862478, + "learning_rate": 1.6306531183346387e-06, + "loss": 0.5172, + "step": 1550 + }, + { + "epoch": 0.7498187092095722, + "grad_norm": 0.44449597102378385, + "learning_rate": 1.624751148875065e-06, + "loss": 0.5227, + "step": 1551 + }, + { + "epoch": 0.7503021513173798, + "grad_norm": 0.4200070436877298, + "learning_rate": 1.6188578068235855e-06, + "loss": 0.5227, + "step": 1552 + }, + { + "epoch": 0.7507855934251874, + "grad_norm": 0.4134676341568954, + "learning_rate": 1.6129731072440586e-06, + "loss": 0.5197, + "step": 1553 + }, + { + "epoch": 0.751269035532995, + "grad_norm": 0.4342416540931307, + "learning_rate": 1.6070970651782514e-06, + "loss": 0.5234, + "step": 1554 + }, + { + "epoch": 0.7517524776408026, + "grad_norm": 0.4621699665968105, + "learning_rate": 1.6012296956457972e-06, + "loss": 0.5224, + "step": 1555 + }, + { + "epoch": 0.7522359197486101, + "grad_norm": 0.39794619123328484, + "learning_rate": 1.5953710136441685e-06, + "loss": 0.5222, + "step": 1556 + }, + { + "epoch": 0.7527193618564177, + "grad_norm": 0.39795969856270086, + "learning_rate": 1.5895210341486279e-06, + "loss": 0.4697, + "step": 1557 + }, + { + "epoch": 0.7532028039642252, + "grad_norm": 0.4348573897259895, + "learning_rate": 1.583679772112196e-06, + "loss": 0.5256, + "step": 1558 + }, + { + "epoch": 0.7536862460720328, + "grad_norm": 0.4108494121358044, + "learning_rate": 1.5778472424656083e-06, + "loss": 0.5185, + "step": 1559 + }, + { + "epoch": 0.7541696881798404, + "grad_norm": 0.41224584403564757, + "learning_rate": 1.5720234601172767e-06, + "loss": 0.5203, + "step": 1560 + }, + { + "epoch": 0.754653130287648, + "grad_norm": 0.4348874788487397, + "learning_rate": 1.566208439953265e-06, + "loss": 0.5189, + "step": 1561 + }, + { + "epoch": 0.7551365723954556, + "grad_norm": 0.42842919833727694, + "learning_rate": 1.5604021968372286e-06, + "loss": 0.5111, + "step": 1562 + }, + { + "epoch": 0.7556200145032632, + "grad_norm": 0.43772492324957596, + "learning_rate": 1.5546047456103964e-06, + "loss": 0.5147, + "step": 1563 + }, + { + "epoch": 0.7561034566110708, + "grad_norm": 0.41431446343362865, + "learning_rate": 1.548816101091517e-06, + "loss": 0.5149, + "step": 1564 + }, + { + "epoch": 0.7565868987188784, + "grad_norm": 0.40777837421338714, + "learning_rate": 1.5430362780768343e-06, + "loss": 0.5117, + "step": 1565 + }, + { + "epoch": 0.757070340826686, + "grad_norm": 0.4454487846070906, + "learning_rate": 1.537265291340042e-06, + "loss": 0.5074, + "step": 1566 + }, + { + "epoch": 0.7575537829344936, + "grad_norm": 0.46396843002779686, + "learning_rate": 1.531503155632249e-06, + "loss": 0.5223, + "step": 1567 + }, + { + "epoch": 0.7580372250423012, + "grad_norm": 0.41741600165011983, + "learning_rate": 1.5257498856819353e-06, + "loss": 0.5158, + "step": 1568 + }, + { + "epoch": 0.7585206671501088, + "grad_norm": 0.4059061868499258, + "learning_rate": 1.5200054961949233e-06, + "loss": 0.5049, + "step": 1569 + }, + { + "epoch": 0.7590041092579164, + "grad_norm": 0.41330390270516437, + "learning_rate": 1.5142700018543382e-06, + "loss": 0.5305, + "step": 1570 + }, + { + "epoch": 0.759487551365724, + "grad_norm": 0.43099056056318497, + "learning_rate": 1.508543417320562e-06, + "loss": 0.5212, + "step": 1571 + }, + { + "epoch": 0.7599709934735316, + "grad_norm": 0.39882553101049034, + "learning_rate": 1.5028257572312105e-06, + "loss": 0.4883, + "step": 1572 + }, + { + "epoch": 0.7604544355813392, + "grad_norm": 0.4581685557000849, + "learning_rate": 1.4971170362010774e-06, + "loss": 0.5225, + "step": 1573 + }, + { + "epoch": 0.7609378776891468, + "grad_norm": 0.4428964310587446, + "learning_rate": 1.4914172688221213e-06, + "loss": 0.5195, + "step": 1574 + }, + { + "epoch": 0.7614213197969543, + "grad_norm": 0.4170791170307987, + "learning_rate": 1.485726469663401e-06, + "loss": 0.5294, + "step": 1575 + }, + { + "epoch": 0.7619047619047619, + "grad_norm": 0.4212168944035229, + "learning_rate": 1.4800446532710627e-06, + "loss": 0.5143, + "step": 1576 + }, + { + "epoch": 0.7623882040125695, + "grad_norm": 0.4317778496296824, + "learning_rate": 1.4743718341682806e-06, + "loss": 0.5242, + "step": 1577 + }, + { + "epoch": 0.7628716461203771, + "grad_norm": 0.3887549768642727, + "learning_rate": 1.468708026855245e-06, + "loss": 0.4927, + "step": 1578 + }, + { + "epoch": 0.7633550882281847, + "grad_norm": 0.41991973562573803, + "learning_rate": 1.463053245809099e-06, + "loss": 0.5248, + "step": 1579 + }, + { + "epoch": 0.7638385303359922, + "grad_norm": 0.41267795471721197, + "learning_rate": 1.457407505483921e-06, + "loss": 0.5187, + "step": 1580 + }, + { + "epoch": 0.7643219724437998, + "grad_norm": 0.44716407911896383, + "learning_rate": 1.4517708203106763e-06, + "loss": 0.523, + "step": 1581 + }, + { + "epoch": 0.7648054145516074, + "grad_norm": 0.4254440302923612, + "learning_rate": 1.446143204697187e-06, + "loss": 0.5233, + "step": 1582 + }, + { + "epoch": 0.765288856659415, + "grad_norm": 0.39996785018921494, + "learning_rate": 1.4405246730280946e-06, + "loss": 0.5172, + "step": 1583 + }, + { + "epoch": 0.7657722987672226, + "grad_norm": 0.443369622770567, + "learning_rate": 1.4349152396648153e-06, + "loss": 0.5183, + "step": 1584 + }, + { + "epoch": 0.7662557408750302, + "grad_norm": 0.40505843584897416, + "learning_rate": 1.4293149189455146e-06, + "loss": 0.5161, + "step": 1585 + }, + { + "epoch": 0.7667391829828378, + "grad_norm": 0.4077704595280849, + "learning_rate": 1.4237237251850634e-06, + "loss": 0.5107, + "step": 1586 + }, + { + "epoch": 0.7672226250906454, + "grad_norm": 0.40791039312028615, + "learning_rate": 1.4181416726750052e-06, + "loss": 0.5146, + "step": 1587 + }, + { + "epoch": 0.767706067198453, + "grad_norm": 0.41705043398231784, + "learning_rate": 1.4125687756835132e-06, + "loss": 0.4812, + "step": 1588 + }, + { + "epoch": 0.7681895093062606, + "grad_norm": 0.4235182346193989, + "learning_rate": 1.4070050484553644e-06, + "loss": 0.5129, + "step": 1589 + }, + { + "epoch": 0.7686729514140682, + "grad_norm": 0.414137655909364, + "learning_rate": 1.4014505052118893e-06, + "loss": 0.5236, + "step": 1590 + }, + { + "epoch": 0.7691563935218757, + "grad_norm": 0.43611300077847176, + "learning_rate": 1.3959051601509537e-06, + "loss": 0.5345, + "step": 1591 + }, + { + "epoch": 0.7696398356296833, + "grad_norm": 0.410845648388898, + "learning_rate": 1.3903690274469029e-06, + "loss": 0.5115, + "step": 1592 + }, + { + "epoch": 0.7701232777374909, + "grad_norm": 0.3961083948871449, + "learning_rate": 1.3848421212505404e-06, + "loss": 0.5168, + "step": 1593 + }, + { + "epoch": 0.7706067198452985, + "grad_norm": 0.42179325369386034, + "learning_rate": 1.37932445568908e-06, + "loss": 0.5125, + "step": 1594 + }, + { + "epoch": 0.7710901619531061, + "grad_norm": 0.4213217250215216, + "learning_rate": 1.3738160448661253e-06, + "loss": 0.5267, + "step": 1595 + }, + { + "epoch": 0.7715736040609137, + "grad_norm": 0.4143253090473424, + "learning_rate": 1.3683169028616155e-06, + "loss": 0.5178, + "step": 1596 + }, + { + "epoch": 0.7720570461687213, + "grad_norm": 0.4171850827541685, + "learning_rate": 1.3628270437317993e-06, + "loss": 0.5211, + "step": 1597 + }, + { + "epoch": 0.7725404882765289, + "grad_norm": 0.39565458081679644, + "learning_rate": 1.3573464815092003e-06, + "loss": 0.5055, + "step": 1598 + }, + { + "epoch": 0.7730239303843365, + "grad_norm": 0.4271922188091497, + "learning_rate": 1.3518752302025773e-06, + "loss": 0.5279, + "step": 1599 + }, + { + "epoch": 0.7735073724921441, + "grad_norm": 0.4151739224827406, + "learning_rate": 1.3464133037968914e-06, + "loss": 0.5239, + "step": 1600 + }, + { + "epoch": 0.7739908145999517, + "grad_norm": 0.3960683162461613, + "learning_rate": 1.3409607162532628e-06, + "loss": 0.4987, + "step": 1601 + }, + { + "epoch": 0.7744742567077593, + "grad_norm": 0.43044333694614223, + "learning_rate": 1.3355174815089477e-06, + "loss": 0.5273, + "step": 1602 + }, + { + "epoch": 0.7749576988155669, + "grad_norm": 0.4121649380386113, + "learning_rate": 1.3300836134772916e-06, + "loss": 0.5162, + "step": 1603 + }, + { + "epoch": 0.7754411409233745, + "grad_norm": 0.4005354058641754, + "learning_rate": 1.3246591260477015e-06, + "loss": 0.5167, + "step": 1604 + }, + { + "epoch": 0.7759245830311821, + "grad_norm": 0.3951020817933521, + "learning_rate": 1.3192440330856005e-06, + "loss": 0.5251, + "step": 1605 + }, + { + "epoch": 0.7764080251389897, + "grad_norm": 0.42611917105831465, + "learning_rate": 1.3138383484324063e-06, + "loss": 0.5252, + "step": 1606 + }, + { + "epoch": 0.7768914672467973, + "grad_norm": 0.40098636118444037, + "learning_rate": 1.308442085905482e-06, + "loss": 0.5101, + "step": 1607 + }, + { + "epoch": 0.7773749093546047, + "grad_norm": 0.4404415072756006, + "learning_rate": 1.30305525929811e-06, + "loss": 0.5224, + "step": 1608 + }, + { + "epoch": 0.7778583514624123, + "grad_norm": 0.40390400609014704, + "learning_rate": 1.297677882379455e-06, + "loss": 0.5191, + "step": 1609 + }, + { + "epoch": 0.7783417935702199, + "grad_norm": 0.43645719023114843, + "learning_rate": 1.2923099688945234e-06, + "loss": 0.5096, + "step": 1610 + }, + { + "epoch": 0.7788252356780275, + "grad_norm": 0.401799031041578, + "learning_rate": 1.2869515325641357e-06, + "loss": 0.4812, + "step": 1611 + }, + { + "epoch": 0.7793086777858351, + "grad_norm": 0.40544675897829047, + "learning_rate": 1.281602587084887e-06, + "loss": 0.5211, + "step": 1612 + }, + { + "epoch": 0.7797921198936427, + "grad_norm": 0.4166351291750946, + "learning_rate": 1.2762631461291148e-06, + "loss": 0.5294, + "step": 1613 + }, + { + "epoch": 0.7802755620014503, + "grad_norm": 0.4334981607396633, + "learning_rate": 1.2709332233448573e-06, + "loss": 0.5096, + "step": 1614 + }, + { + "epoch": 0.7807590041092579, + "grad_norm": 0.437984950036233, + "learning_rate": 1.2656128323558286e-06, + "loss": 0.5135, + "step": 1615 + }, + { + "epoch": 0.7812424462170655, + "grad_norm": 0.41467240914944964, + "learning_rate": 1.2603019867613764e-06, + "loss": 0.5162, + "step": 1616 + }, + { + "epoch": 0.7817258883248731, + "grad_norm": 0.40797210573439474, + "learning_rate": 1.2550007001364518e-06, + "loss": 0.5064, + "step": 1617 + }, + { + "epoch": 0.7822093304326807, + "grad_norm": 0.40625079236189654, + "learning_rate": 1.2497089860315675e-06, + "loss": 0.5057, + "step": 1618 + }, + { + "epoch": 0.7826927725404883, + "grad_norm": 0.3973135238618207, + "learning_rate": 1.244426857972773e-06, + "loss": 0.5125, + "step": 1619 + }, + { + "epoch": 0.7831762146482959, + "grad_norm": 0.41758654400468537, + "learning_rate": 1.239154329461615e-06, + "loss": 0.5146, + "step": 1620 + }, + { + "epoch": 0.7836596567561035, + "grad_norm": 0.4546571879884002, + "learning_rate": 1.233891413975098e-06, + "loss": 0.5138, + "step": 1621 + }, + { + "epoch": 0.7841430988639111, + "grad_norm": 0.4501304501527847, + "learning_rate": 1.228638124965661e-06, + "loss": 0.5111, + "step": 1622 + }, + { + "epoch": 0.7846265409717187, + "grad_norm": 0.40173574952002505, + "learning_rate": 1.223394475861131e-06, + "loss": 0.5134, + "step": 1623 + }, + { + "epoch": 0.7851099830795262, + "grad_norm": 0.4105768174048188, + "learning_rate": 1.2181604800646996e-06, + "loss": 0.5092, + "step": 1624 + }, + { + "epoch": 0.7855934251873338, + "grad_norm": 0.39390517153871624, + "learning_rate": 1.212936150954882e-06, + "loss": 0.498, + "step": 1625 + }, + { + "epoch": 0.7860768672951414, + "grad_norm": 0.41453725871465896, + "learning_rate": 1.207721501885486e-06, + "loss": 0.5063, + "step": 1626 + }, + { + "epoch": 0.786560309402949, + "grad_norm": 0.44249465126635484, + "learning_rate": 1.2025165461855714e-06, + "loss": 0.5212, + "step": 1627 + }, + { + "epoch": 0.7870437515107566, + "grad_norm": 0.4079816768267276, + "learning_rate": 1.1973212971594262e-06, + "loss": 0.5155, + "step": 1628 + }, + { + "epoch": 0.7875271936185642, + "grad_norm": 0.4318458945961838, + "learning_rate": 1.1921357680865258e-06, + "loss": 0.5183, + "step": 1629 + }, + { + "epoch": 0.7880106357263718, + "grad_norm": 0.40656198305401237, + "learning_rate": 1.1869599722215013e-06, + "loss": 0.4949, + "step": 1630 + }, + { + "epoch": 0.7884940778341794, + "grad_norm": 0.4056814293942294, + "learning_rate": 1.181793922794102e-06, + "loss": 0.5206, + "step": 1631 + }, + { + "epoch": 0.788977519941987, + "grad_norm": 0.42895763169120843, + "learning_rate": 1.1766376330091684e-06, + "loss": 0.503, + "step": 1632 + }, + { + "epoch": 0.7894609620497945, + "grad_norm": 0.4165970675717556, + "learning_rate": 1.1714911160465924e-06, + "loss": 0.5255, + "step": 1633 + }, + { + "epoch": 0.7899444041576021, + "grad_norm": 0.4123917311937627, + "learning_rate": 1.1663543850612847e-06, + "loss": 0.5169, + "step": 1634 + }, + { + "epoch": 0.7904278462654097, + "grad_norm": 0.41612583641837364, + "learning_rate": 1.1612274531831463e-06, + "loss": 0.4938, + "step": 1635 + }, + { + "epoch": 0.7909112883732173, + "grad_norm": 0.40728900719245686, + "learning_rate": 1.1561103335170242e-06, + "loss": 0.5222, + "step": 1636 + }, + { + "epoch": 0.7913947304810249, + "grad_norm": 0.4348645075910405, + "learning_rate": 1.1510030391426941e-06, + "loss": 0.5192, + "step": 1637 + }, + { + "epoch": 0.7918781725888325, + "grad_norm": 0.4086546804175218, + "learning_rate": 1.1459055831148074e-06, + "loss": 0.5232, + "step": 1638 + }, + { + "epoch": 0.7923616146966401, + "grad_norm": 0.40880965205946446, + "learning_rate": 1.140817978462876e-06, + "loss": 0.5212, + "step": 1639 + }, + { + "epoch": 0.7928450568044476, + "grad_norm": 0.3893016631161895, + "learning_rate": 1.1357402381912224e-06, + "loss": 0.4873, + "step": 1640 + }, + { + "epoch": 0.7933284989122552, + "grad_norm": 0.4215992969510908, + "learning_rate": 1.1306723752789672e-06, + "loss": 0.5211, + "step": 1641 + }, + { + "epoch": 0.7938119410200628, + "grad_norm": 0.420615559845491, + "learning_rate": 1.1256144026799703e-06, + "loss": 0.5179, + "step": 1642 + }, + { + "epoch": 0.7942953831278704, + "grad_norm": 0.39236133338098145, + "learning_rate": 1.1205663333228217e-06, + "loss": 0.4911, + "step": 1643 + }, + { + "epoch": 0.794778825235678, + "grad_norm": 0.4158254754636244, + "learning_rate": 1.1155281801107897e-06, + "loss": 0.5146, + "step": 1644 + }, + { + "epoch": 0.7952622673434856, + "grad_norm": 0.4092049660763265, + "learning_rate": 1.1104999559218022e-06, + "loss": 0.5063, + "step": 1645 + }, + { + "epoch": 0.7957457094512932, + "grad_norm": 0.43121118572534733, + "learning_rate": 1.1054816736084057e-06, + "loss": 0.5122, + "step": 1646 + }, + { + "epoch": 0.7962291515591008, + "grad_norm": 0.40574034047521074, + "learning_rate": 1.1004733459977325e-06, + "loss": 0.5089, + "step": 1647 + }, + { + "epoch": 0.7967125936669084, + "grad_norm": 0.4343773778355907, + "learning_rate": 1.0954749858914727e-06, + "loss": 0.5177, + "step": 1648 + }, + { + "epoch": 0.797196035774716, + "grad_norm": 0.429877165339691, + "learning_rate": 1.0904866060658376e-06, + "loss": 0.5211, + "step": 1649 + }, + { + "epoch": 0.7976794778825236, + "grad_norm": 0.4108995062804379, + "learning_rate": 1.0855082192715294e-06, + "loss": 0.5174, + "step": 1650 + }, + { + "epoch": 0.7981629199903312, + "grad_norm": 0.4018616150052113, + "learning_rate": 1.0805398382337035e-06, + "loss": 0.5049, + "step": 1651 + }, + { + "epoch": 0.7986463620981388, + "grad_norm": 0.4089174910335269, + "learning_rate": 1.0755814756519445e-06, + "loss": 0.5226, + "step": 1652 + }, + { + "epoch": 0.7991298042059464, + "grad_norm": 0.3964537076582955, + "learning_rate": 1.0706331442002226e-06, + "loss": 0.5095, + "step": 1653 + }, + { + "epoch": 0.799613246313754, + "grad_norm": 0.4267767025207229, + "learning_rate": 1.0656948565268782e-06, + "loss": 0.5168, + "step": 1654 + }, + { + "epoch": 0.8000966884215616, + "grad_norm": 0.41363796984886936, + "learning_rate": 1.0607666252545673e-06, + "loss": 0.5128, + "step": 1655 + }, + { + "epoch": 0.8005801305293692, + "grad_norm": 0.43264146945425214, + "learning_rate": 1.0558484629802502e-06, + "loss": 0.514, + "step": 1656 + }, + { + "epoch": 0.8010635726371766, + "grad_norm": 0.42544390140386235, + "learning_rate": 1.0509403822751425e-06, + "loss": 0.512, + "step": 1657 + }, + { + "epoch": 0.8015470147449842, + "grad_norm": 0.3932679351449648, + "learning_rate": 1.0460423956846955e-06, + "loss": 0.4941, + "step": 1658 + }, + { + "epoch": 0.8020304568527918, + "grad_norm": 0.4096876585407803, + "learning_rate": 1.041154515728559e-06, + "loss": 0.5088, + "step": 1659 + }, + { + "epoch": 0.8025138989605994, + "grad_norm": 0.4173497731763413, + "learning_rate": 1.0362767549005454e-06, + "loss": 0.5119, + "step": 1660 + }, + { + "epoch": 0.802997341068407, + "grad_norm": 0.41012015779324845, + "learning_rate": 1.0314091256686065e-06, + "loss": 0.5212, + "step": 1661 + }, + { + "epoch": 0.8034807831762146, + "grad_norm": 0.38447439239259856, + "learning_rate": 1.0265516404747943e-06, + "loss": 0.5052, + "step": 1662 + }, + { + "epoch": 0.8039642252840222, + "grad_norm": 0.40948392634706504, + "learning_rate": 1.0217043117352337e-06, + "loss": 0.5109, + "step": 1663 + }, + { + "epoch": 0.8044476673918298, + "grad_norm": 0.40148827230751766, + "learning_rate": 1.0168671518400853e-06, + "loss": 0.5118, + "step": 1664 + }, + { + "epoch": 0.8049311094996374, + "grad_norm": 0.3939565441232479, + "learning_rate": 1.0120401731535213e-06, + "loss": 0.4879, + "step": 1665 + }, + { + "epoch": 0.805414551607445, + "grad_norm": 0.4394864393242481, + "learning_rate": 1.0072233880136872e-06, + "loss": 0.5104, + "step": 1666 + }, + { + "epoch": 0.8058979937152526, + "grad_norm": 0.4318770671908104, + "learning_rate": 1.0024168087326764e-06, + "loss": 0.5235, + "step": 1667 + }, + { + "epoch": 0.8063814358230602, + "grad_norm": 0.4281259140520081, + "learning_rate": 9.976204475964907e-07, + "loss": 0.5149, + "step": 1668 + }, + { + "epoch": 0.8068648779308678, + "grad_norm": 0.43979946361695016, + "learning_rate": 9.92834316865015e-07, + "loss": 0.5191, + "step": 1669 + }, + { + "epoch": 0.8073483200386754, + "grad_norm": 0.4312412015437643, + "learning_rate": 9.88058428771987e-07, + "loss": 0.5188, + "step": 1670 + }, + { + "epoch": 0.807831762146483, + "grad_norm": 0.4461824252192259, + "learning_rate": 9.832927955249605e-07, + "loss": 0.518, + "step": 1671 + }, + { + "epoch": 0.8083152042542906, + "grad_norm": 0.40455517199845253, + "learning_rate": 9.785374293052802e-07, + "loss": 0.5279, + "step": 1672 + }, + { + "epoch": 0.8087986463620981, + "grad_norm": 0.40500700400967726, + "learning_rate": 9.737923422680424e-07, + "loss": 0.5267, + "step": 1673 + }, + { + "epoch": 0.8092820884699057, + "grad_norm": 0.4053422468834684, + "learning_rate": 9.690575465420733e-07, + "loss": 0.5098, + "step": 1674 + }, + { + "epoch": 0.8097655305777133, + "grad_norm": 0.41221923071964073, + "learning_rate": 9.643330542298929e-07, + "loss": 0.5171, + "step": 1675 + }, + { + "epoch": 0.8102489726855209, + "grad_norm": 0.4289210188727792, + "learning_rate": 9.596188774076849e-07, + "loss": 0.5164, + "step": 1676 + }, + { + "epoch": 0.8107324147933285, + "grad_norm": 0.4119920227929362, + "learning_rate": 9.549150281252633e-07, + "loss": 0.5167, + "step": 1677 + }, + { + "epoch": 0.8112158569011361, + "grad_norm": 0.43146374267443927, + "learning_rate": 9.50221518406047e-07, + "loss": 0.5198, + "step": 1678 + }, + { + "epoch": 0.8116992990089437, + "grad_norm": 0.3915995001014536, + "learning_rate": 9.455383602470247e-07, + "loss": 0.5194, + "step": 1679 + }, + { + "epoch": 0.8121827411167513, + "grad_norm": 0.42092897815810126, + "learning_rate": 9.408655656187282e-07, + "loss": 0.5154, + "step": 1680 + }, + { + "epoch": 0.8126661832245589, + "grad_norm": 0.43929014126287974, + "learning_rate": 9.362031464651955e-07, + "loss": 0.5111, + "step": 1681 + }, + { + "epoch": 0.8131496253323665, + "grad_norm": 0.419403258433708, + "learning_rate": 9.31551114703943e-07, + "loss": 0.5175, + "step": 1682 + }, + { + "epoch": 0.813633067440174, + "grad_norm": 0.4235039718034734, + "learning_rate": 9.269094822259439e-07, + "loss": 0.5219, + "step": 1683 + }, + { + "epoch": 0.8141165095479816, + "grad_norm": 0.403949404981181, + "learning_rate": 9.22278260895581e-07, + "loss": 0.5257, + "step": 1684 + }, + { + "epoch": 0.8145999516557892, + "grad_norm": 0.40201626032689436, + "learning_rate": 9.176574625506324e-07, + "loss": 0.5065, + "step": 1685 + }, + { + "epoch": 0.8150833937635968, + "grad_norm": 0.42029809516611727, + "learning_rate": 9.130470990022283e-07, + "loss": 0.5198, + "step": 1686 + }, + { + "epoch": 0.8155668358714044, + "grad_norm": 0.4443584968330059, + "learning_rate": 9.084471820348306e-07, + "loss": 0.5054, + "step": 1687 + }, + { + "epoch": 0.816050277979212, + "grad_norm": 0.4011266291605723, + "learning_rate": 9.038577234061979e-07, + "loss": 0.481, + "step": 1688 + }, + { + "epoch": 0.8165337200870196, + "grad_norm": 0.4116565403445696, + "learning_rate": 8.992787348473575e-07, + "loss": 0.512, + "step": 1689 + }, + { + "epoch": 0.8170171621948271, + "grad_norm": 0.3855753519601646, + "learning_rate": 8.947102280625708e-07, + "loss": 0.4919, + "step": 1690 + }, + { + "epoch": 0.8175006043026347, + "grad_norm": 0.3998193393341577, + "learning_rate": 8.901522147293107e-07, + "loss": 0.5063, + "step": 1691 + }, + { + "epoch": 0.8179840464104423, + "grad_norm": 0.40465428030335077, + "learning_rate": 8.856047064982276e-07, + "loss": 0.4969, + "step": 1692 + }, + { + "epoch": 0.8184674885182499, + "grad_norm": 0.3993077607842942, + "learning_rate": 8.810677149931168e-07, + "loss": 0.5123, + "step": 1693 + }, + { + "epoch": 0.8189509306260575, + "grad_norm": 0.41845032917424874, + "learning_rate": 8.765412518108957e-07, + "loss": 0.5222, + "step": 1694 + }, + { + "epoch": 0.8194343727338651, + "grad_norm": 0.4482989172909152, + "learning_rate": 8.720253285215685e-07, + "loss": 0.5245, + "step": 1695 + }, + { + "epoch": 0.8199178148416727, + "grad_norm": 0.4096945568958353, + "learning_rate": 8.675199566682002e-07, + "loss": 0.4987, + "step": 1696 + }, + { + "epoch": 0.8204012569494803, + "grad_norm": 0.42715377043083036, + "learning_rate": 8.630251477668828e-07, + "loss": 0.4956, + "step": 1697 + }, + { + "epoch": 0.8208846990572879, + "grad_norm": 0.42586545844645524, + "learning_rate": 8.585409133067119e-07, + "loss": 0.5096, + "step": 1698 + }, + { + "epoch": 0.8213681411650955, + "grad_norm": 0.43766586659276707, + "learning_rate": 8.540672647497483e-07, + "loss": 0.5136, + "step": 1699 + }, + { + "epoch": 0.8218515832729031, + "grad_norm": 0.4371618341766256, + "learning_rate": 8.49604213531004e-07, + "loss": 0.5213, + "step": 1700 + }, + { + "epoch": 0.8223350253807107, + "grad_norm": 0.4375571316772861, + "learning_rate": 8.451517710583934e-07, + "loss": 0.5051, + "step": 1701 + }, + { + "epoch": 0.8228184674885183, + "grad_norm": 0.4132441919616583, + "learning_rate": 8.407099487127207e-07, + "loss": 0.5257, + "step": 1702 + }, + { + "epoch": 0.8233019095963259, + "grad_norm": 0.42607745465695845, + "learning_rate": 8.362787578476395e-07, + "loss": 0.5249, + "step": 1703 + }, + { + "epoch": 0.8237853517041335, + "grad_norm": 0.4075673839523143, + "learning_rate": 8.318582097896316e-07, + "loss": 0.5058, + "step": 1704 + }, + { + "epoch": 0.8242687938119411, + "grad_norm": 0.42693741052199397, + "learning_rate": 8.274483158379759e-07, + "loss": 0.5111, + "step": 1705 + }, + { + "epoch": 0.8247522359197486, + "grad_norm": 0.39832416179935565, + "learning_rate": 8.230490872647146e-07, + "loss": 0.4938, + "step": 1706 + }, + { + "epoch": 0.8252356780275562, + "grad_norm": 0.422151557962671, + "learning_rate": 8.18660535314631e-07, + "loss": 0.5183, + "step": 1707 + }, + { + "epoch": 0.8257191201353637, + "grad_norm": 0.4003210551929738, + "learning_rate": 8.142826712052177e-07, + "loss": 0.5131, + "step": 1708 + }, + { + "epoch": 0.8262025622431713, + "grad_norm": 0.41552515229148246, + "learning_rate": 8.099155061266495e-07, + "loss": 0.5104, + "step": 1709 + }, + { + "epoch": 0.8266860043509789, + "grad_norm": 0.4199192751255081, + "learning_rate": 8.055590512417499e-07, + "loss": 0.504, + "step": 1710 + }, + { + "epoch": 0.8271694464587865, + "grad_norm": 0.4183052253157522, + "learning_rate": 8.012133176859705e-07, + "loss": 0.5183, + "step": 1711 + }, + { + "epoch": 0.8276528885665941, + "grad_norm": 0.40771463289221466, + "learning_rate": 7.968783165673554e-07, + "loss": 0.5134, + "step": 1712 + }, + { + "epoch": 0.8281363306744017, + "grad_norm": 0.4201027836512912, + "learning_rate": 7.925540589665187e-07, + "loss": 0.5074, + "step": 1713 + }, + { + "epoch": 0.8286197727822093, + "grad_norm": 0.395143526726159, + "learning_rate": 7.882405559366091e-07, + "loss": 0.4907, + "step": 1714 + }, + { + "epoch": 0.8291032148900169, + "grad_norm": 0.39924930985003787, + "learning_rate": 7.839378185032897e-07, + "loss": 0.5107, + "step": 1715 + }, + { + "epoch": 0.8295866569978245, + "grad_norm": 0.4132095601626946, + "learning_rate": 7.796458576647015e-07, + "loss": 0.5185, + "step": 1716 + }, + { + "epoch": 0.8300700991056321, + "grad_norm": 0.40587201306044, + "learning_rate": 7.753646843914465e-07, + "loss": 0.5182, + "step": 1717 + }, + { + "epoch": 0.8305535412134397, + "grad_norm": 0.4094094956774689, + "learning_rate": 7.710943096265461e-07, + "loss": 0.5029, + "step": 1718 + }, + { + "epoch": 0.8310369833212473, + "grad_norm": 0.41067812349491495, + "learning_rate": 7.668347442854218e-07, + "loss": 0.5021, + "step": 1719 + }, + { + "epoch": 0.8315204254290549, + "grad_norm": 0.39956787890532264, + "learning_rate": 7.625859992558665e-07, + "loss": 0.5206, + "step": 1720 + }, + { + "epoch": 0.8320038675368625, + "grad_norm": 0.43928086956712875, + "learning_rate": 7.583480853980158e-07, + "loss": 0.5134, + "step": 1721 + }, + { + "epoch": 0.8324873096446701, + "grad_norm": 0.4100632271699525, + "learning_rate": 7.541210135443188e-07, + "loss": 0.5184, + "step": 1722 + }, + { + "epoch": 0.8329707517524776, + "grad_norm": 0.3961555211112688, + "learning_rate": 7.499047944995108e-07, + "loss": 0.5222, + "step": 1723 + }, + { + "epoch": 0.8334541938602852, + "grad_norm": 0.41401758140390904, + "learning_rate": 7.45699439040588e-07, + "loss": 0.5149, + "step": 1724 + }, + { + "epoch": 0.8339376359680928, + "grad_norm": 0.41725576477900833, + "learning_rate": 7.415049579167783e-07, + "loss": 0.5086, + "step": 1725 + }, + { + "epoch": 0.8344210780759004, + "grad_norm": 0.40808361223845036, + "learning_rate": 7.37321361849514e-07, + "loss": 0.5171, + "step": 1726 + }, + { + "epoch": 0.834904520183708, + "grad_norm": 0.4044441513281848, + "learning_rate": 7.331486615324024e-07, + "loss": 0.4931, + "step": 1727 + }, + { + "epoch": 0.8353879622915156, + "grad_norm": 0.39255016483428246, + "learning_rate": 7.289868676312023e-07, + "loss": 0.4895, + "step": 1728 + }, + { + "epoch": 0.8358714043993232, + "grad_norm": 0.4273894357037594, + "learning_rate": 7.248359907837959e-07, + "loss": 0.5141, + "step": 1729 + }, + { + "epoch": 0.8363548465071308, + "grad_norm": 0.41270523260835523, + "learning_rate": 7.206960416001563e-07, + "loss": 0.5053, + "step": 1730 + }, + { + "epoch": 0.8368382886149384, + "grad_norm": 0.42210989792552517, + "learning_rate": 7.165670306623296e-07, + "loss": 0.515, + "step": 1731 + }, + { + "epoch": 0.837321730722746, + "grad_norm": 0.4005116526979819, + "learning_rate": 7.124489685243985e-07, + "loss": 0.5084, + "step": 1732 + }, + { + "epoch": 0.8378051728305536, + "grad_norm": 0.42730888005294004, + "learning_rate": 7.08341865712463e-07, + "loss": 0.5149, + "step": 1733 + }, + { + "epoch": 0.8382886149383612, + "grad_norm": 0.3946117211995092, + "learning_rate": 7.042457327246088e-07, + "loss": 0.5272, + "step": 1734 + }, + { + "epoch": 0.8387720570461688, + "grad_norm": 0.40058125990145727, + "learning_rate": 7.001605800308825e-07, + "loss": 0.5173, + "step": 1735 + }, + { + "epoch": 0.8392554991539763, + "grad_norm": 0.39419621537510763, + "learning_rate": 6.960864180732618e-07, + "loss": 0.5182, + "step": 1736 + }, + { + "epoch": 0.8397389412617839, + "grad_norm": 0.4302451888948554, + "learning_rate": 6.920232572656349e-07, + "loss": 0.5145, + "step": 1737 + }, + { + "epoch": 0.8402223833695915, + "grad_norm": 0.39221396906385003, + "learning_rate": 6.879711079937667e-07, + "loss": 0.5079, + "step": 1738 + }, + { + "epoch": 0.840705825477399, + "grad_norm": 0.4210023704512398, + "learning_rate": 6.839299806152799e-07, + "loss": 0.5061, + "step": 1739 + }, + { + "epoch": 0.8411892675852066, + "grad_norm": 0.4031707044630559, + "learning_rate": 6.79899885459619e-07, + "loss": 0.5174, + "step": 1740 + }, + { + "epoch": 0.8416727096930142, + "grad_norm": 0.40104705743190977, + "learning_rate": 6.758808328280325e-07, + "loss": 0.4981, + "step": 1741 + }, + { + "epoch": 0.8421561518008218, + "grad_norm": 0.4158859718137932, + "learning_rate": 6.718728329935448e-07, + "loss": 0.5216, + "step": 1742 + }, + { + "epoch": 0.8426395939086294, + "grad_norm": 0.4140963838597211, + "learning_rate": 6.678758962009241e-07, + "loss": 0.5154, + "step": 1743 + }, + { + "epoch": 0.843123036016437, + "grad_norm": 0.41926365963573253, + "learning_rate": 6.638900326666653e-07, + "loss": 0.5181, + "step": 1744 + }, + { + "epoch": 0.8436064781242446, + "grad_norm": 0.4007033614343704, + "learning_rate": 6.599152525789531e-07, + "loss": 0.4772, + "step": 1745 + }, + { + "epoch": 0.8440899202320522, + "grad_norm": 0.4266694328755557, + "learning_rate": 6.559515660976506e-07, + "loss": 0.5153, + "step": 1746 + }, + { + "epoch": 0.8445733623398598, + "grad_norm": 0.42158713984389296, + "learning_rate": 6.519989833542567e-07, + "loss": 0.5218, + "step": 1747 + }, + { + "epoch": 0.8450568044476674, + "grad_norm": 0.4190422236566301, + "learning_rate": 6.480575144518931e-07, + "loss": 0.5267, + "step": 1748 + }, + { + "epoch": 0.845540246555475, + "grad_norm": 0.42322451653416415, + "learning_rate": 6.441271694652701e-07, + "loss": 0.517, + "step": 1749 + }, + { + "epoch": 0.8460236886632826, + "grad_norm": 0.4050974433698499, + "learning_rate": 6.402079584406673e-07, + "loss": 0.523, + "step": 1750 + }, + { + "epoch": 0.8465071307710902, + "grad_norm": 0.39927068510798064, + "learning_rate": 6.36299891395904e-07, + "loss": 0.4943, + "step": 1751 + }, + { + "epoch": 0.8469905728788978, + "grad_norm": 0.40520913199613756, + "learning_rate": 6.32402978320315e-07, + "loss": 0.519, + "step": 1752 + }, + { + "epoch": 0.8474740149867054, + "grad_norm": 0.41524557234436116, + "learning_rate": 6.285172291747232e-07, + "loss": 0.5087, + "step": 1753 + }, + { + "epoch": 0.847957457094513, + "grad_norm": 0.39348055940589066, + "learning_rate": 6.246426538914174e-07, + "loss": 0.5135, + "step": 1754 + }, + { + "epoch": 0.8484408992023206, + "grad_norm": 0.40472211918575973, + "learning_rate": 6.207792623741249e-07, + "loss": 0.5181, + "step": 1755 + }, + { + "epoch": 0.8489243413101281, + "grad_norm": 0.3797781522780497, + "learning_rate": 6.169270644979836e-07, + "loss": 0.4718, + "step": 1756 + }, + { + "epoch": 0.8494077834179357, + "grad_norm": 0.4172564454240539, + "learning_rate": 6.130860701095226e-07, + "loss": 0.5093, + "step": 1757 + }, + { + "epoch": 0.8498912255257433, + "grad_norm": 0.460481903524328, + "learning_rate": 6.092562890266341e-07, + "loss": 0.5245, + "step": 1758 + }, + { + "epoch": 0.8503746676335509, + "grad_norm": 0.40544203017797725, + "learning_rate": 6.054377310385479e-07, + "loss": 0.5067, + "step": 1759 + }, + { + "epoch": 0.8508581097413584, + "grad_norm": 0.4225253280006634, + "learning_rate": 6.016304059058031e-07, + "loss": 0.5169, + "step": 1760 + }, + { + "epoch": 0.851341551849166, + "grad_norm": 0.40769266639259943, + "learning_rate": 5.97834323360233e-07, + "loss": 0.5243, + "step": 1761 + }, + { + "epoch": 0.8518249939569736, + "grad_norm": 0.42284940262412657, + "learning_rate": 5.940494931049262e-07, + "loss": 0.5194, + "step": 1762 + }, + { + "epoch": 0.8523084360647812, + "grad_norm": 0.3916025337851957, + "learning_rate": 5.902759248142187e-07, + "loss": 0.4975, + "step": 1763 + }, + { + "epoch": 0.8527918781725888, + "grad_norm": 0.41326270414280697, + "learning_rate": 5.86513628133652e-07, + "loss": 0.5154, + "step": 1764 + }, + { + "epoch": 0.8532753202803964, + "grad_norm": 0.40856588365868324, + "learning_rate": 5.827626126799613e-07, + "loss": 0.5154, + "step": 1765 + }, + { + "epoch": 0.853758762388204, + "grad_norm": 0.42831173680710594, + "learning_rate": 5.790228880410426e-07, + "loss": 0.5163, + "step": 1766 + }, + { + "epoch": 0.8542422044960116, + "grad_norm": 0.4218590594382107, + "learning_rate": 5.75294463775935e-07, + "loss": 0.517, + "step": 1767 + }, + { + "epoch": 0.8547256466038192, + "grad_norm": 0.38253864809006055, + "learning_rate": 5.715773494147919e-07, + "loss": 0.4929, + "step": 1768 + }, + { + "epoch": 0.8552090887116268, + "grad_norm": 0.40270548702028475, + "learning_rate": 5.678715544588547e-07, + "loss": 0.5088, + "step": 1769 + }, + { + "epoch": 0.8556925308194344, + "grad_norm": 0.4229953125269584, + "learning_rate": 5.641770883804365e-07, + "loss": 0.5258, + "step": 1770 + }, + { + "epoch": 0.856175972927242, + "grad_norm": 0.4037677845049078, + "learning_rate": 5.604939606228887e-07, + "loss": 0.5095, + "step": 1771 + }, + { + "epoch": 0.8566594150350495, + "grad_norm": 0.39977977942883575, + "learning_rate": 5.568221806005847e-07, + "loss": 0.5128, + "step": 1772 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.4175904938844971, + "learning_rate": 5.531617576988879e-07, + "loss": 0.5114, + "step": 1773 + }, + { + "epoch": 0.8576262992506647, + "grad_norm": 0.41812393010867166, + "learning_rate": 5.495127012741352e-07, + "loss": 0.5188, + "step": 1774 + }, + { + "epoch": 0.8581097413584723, + "grad_norm": 0.38871202154348194, + "learning_rate": 5.45875020653609e-07, + "loss": 0.4882, + "step": 1775 + }, + { + "epoch": 0.8585931834662799, + "grad_norm": 0.40175664384357557, + "learning_rate": 5.422487251355146e-07, + "loss": 0.5088, + "step": 1776 + }, + { + "epoch": 0.8590766255740875, + "grad_norm": 0.4258611448475652, + "learning_rate": 5.386338239889549e-07, + "loss": 0.5136, + "step": 1777 + }, + { + "epoch": 0.8595600676818951, + "grad_norm": 0.38040545155326977, + "learning_rate": 5.350303264539091e-07, + "loss": 0.4692, + "step": 1778 + }, + { + "epoch": 0.8600435097897027, + "grad_norm": 0.4185751036827134, + "learning_rate": 5.314382417412062e-07, + "loss": 0.516, + "step": 1779 + }, + { + "epoch": 0.8605269518975103, + "grad_norm": 0.4237092619379993, + "learning_rate": 5.278575790325052e-07, + "loss": 0.5146, + "step": 1780 + }, + { + "epoch": 0.8610103940053179, + "grad_norm": 0.4173802982789206, + "learning_rate": 5.242883474802696e-07, + "loss": 0.5125, + "step": 1781 + }, + { + "epoch": 0.8614938361131255, + "grad_norm": 0.41838440801291993, + "learning_rate": 5.207305562077403e-07, + "loss": 0.5177, + "step": 1782 + }, + { + "epoch": 0.8619772782209331, + "grad_norm": 0.4779855097218796, + "learning_rate": 5.1718421430892e-07, + "loss": 0.5304, + "step": 1783 + }, + { + "epoch": 0.8624607203287407, + "grad_norm": 0.37738685143261025, + "learning_rate": 5.136493308485446e-07, + "loss": 0.486, + "step": 1784 + }, + { + "epoch": 0.8629441624365483, + "grad_norm": 0.39963258309250466, + "learning_rate": 5.101259148620618e-07, + "loss": 0.4959, + "step": 1785 + }, + { + "epoch": 0.8634276045443559, + "grad_norm": 0.39604391770722097, + "learning_rate": 5.066139753556049e-07, + "loss": 0.4993, + "step": 1786 + }, + { + "epoch": 0.8639110466521635, + "grad_norm": 0.40732958269577874, + "learning_rate": 5.031135213059756e-07, + "loss": 0.5153, + "step": 1787 + }, + { + "epoch": 0.864394488759971, + "grad_norm": 0.3897806967927546, + "learning_rate": 4.99624561660616e-07, + "loss": 0.4871, + "step": 1788 + }, + { + "epoch": 0.8648779308677785, + "grad_norm": 0.4048723969181331, + "learning_rate": 4.961471053375899e-07, + "loss": 0.512, + "step": 1789 + }, + { + "epoch": 0.8653613729755861, + "grad_norm": 0.4203351282800037, + "learning_rate": 4.926811612255539e-07, + "loss": 0.5121, + "step": 1790 + }, + { + "epoch": 0.8658448150833937, + "grad_norm": 0.39858565202586066, + "learning_rate": 4.892267381837396e-07, + "loss": 0.5011, + "step": 1791 + }, + { + "epoch": 0.8663282571912013, + "grad_norm": 0.4344627773200746, + "learning_rate": 4.857838450419339e-07, + "loss": 0.5103, + "step": 1792 + }, + { + "epoch": 0.8668116992990089, + "grad_norm": 0.40293448022650774, + "learning_rate": 4.823524906004468e-07, + "loss": 0.5138, + "step": 1793 + }, + { + "epoch": 0.8672951414068165, + "grad_norm": 0.41801337173969716, + "learning_rate": 4.789326836300983e-07, + "loss": 0.5151, + "step": 1794 + }, + { + "epoch": 0.8677785835146241, + "grad_norm": 0.4058943681689954, + "learning_rate": 4.7552443287218866e-07, + "loss": 0.5098, + "step": 1795 + }, + { + "epoch": 0.8682620256224317, + "grad_norm": 0.42652856984845416, + "learning_rate": 4.7212774703848273e-07, + "loss": 0.508, + "step": 1796 + }, + { + "epoch": 0.8687454677302393, + "grad_norm": 0.4211824745719729, + "learning_rate": 4.687426348111834e-07, + "loss": 0.5122, + "step": 1797 + }, + { + "epoch": 0.8692289098380469, + "grad_norm": 0.4022753726796167, + "learning_rate": 4.65369104842911e-07, + "loss": 0.52, + "step": 1798 + }, + { + "epoch": 0.8697123519458545, + "grad_norm": 0.40283890754002527, + "learning_rate": 4.620071657566777e-07, + "loss": 0.5072, + "step": 1799 + }, + { + "epoch": 0.8701957940536621, + "grad_norm": 0.4283203699114763, + "learning_rate": 4.586568261458729e-07, + "loss": 0.5096, + "step": 1800 + }, + { + "epoch": 0.8706792361614697, + "grad_norm": 0.38328034805235095, + "learning_rate": 4.553180945742336e-07, + "loss": 0.4861, + "step": 1801 + }, + { + "epoch": 0.8711626782692773, + "grad_norm": 0.4122501087059972, + "learning_rate": 4.5199097957582816e-07, + "loss": 0.5136, + "step": 1802 + }, + { + "epoch": 0.8716461203770849, + "grad_norm": 0.41264272327652995, + "learning_rate": 4.486754896550288e-07, + "loss": 0.5012, + "step": 1803 + }, + { + "epoch": 0.8721295624848925, + "grad_norm": 0.41725003600600513, + "learning_rate": 4.45371633286496e-07, + "loss": 0.5185, + "step": 1804 + }, + { + "epoch": 0.8726130045927, + "grad_norm": 0.4078148663174146, + "learning_rate": 4.4207941891515335e-07, + "loss": 0.5135, + "step": 1805 + }, + { + "epoch": 0.8730964467005076, + "grad_norm": 0.4209684818924423, + "learning_rate": 4.3879885495616505e-07, + "loss": 0.512, + "step": 1806 + }, + { + "epoch": 0.8735798888083152, + "grad_norm": 0.3832255061477332, + "learning_rate": 4.3552994979491836e-07, + "loss": 0.5131, + "step": 1807 + }, + { + "epoch": 0.8740633309161228, + "grad_norm": 0.415646535369065, + "learning_rate": 4.322727117869951e-07, + "loss": 0.5156, + "step": 1808 + }, + { + "epoch": 0.8745467730239304, + "grad_norm": 0.403529021224522, + "learning_rate": 4.290271492581627e-07, + "loss": 0.5225, + "step": 1809 + }, + { + "epoch": 0.875030215131738, + "grad_norm": 0.4248226663595473, + "learning_rate": 4.257932705043372e-07, + "loss": 0.5276, + "step": 1810 + }, + { + "epoch": 0.8755136572395456, + "grad_norm": 0.42279657022545747, + "learning_rate": 4.2257108379157586e-07, + "loss": 0.5224, + "step": 1811 + }, + { + "epoch": 0.8759970993473531, + "grad_norm": 0.4140176038814713, + "learning_rate": 4.1936059735604497e-07, + "loss": 0.5161, + "step": 1812 + }, + { + "epoch": 0.8764805414551607, + "grad_norm": 0.39792458444383394, + "learning_rate": 4.161618194040079e-07, + "loss": 0.5277, + "step": 1813 + }, + { + "epoch": 0.8769639835629683, + "grad_norm": 0.39549573015495143, + "learning_rate": 4.129747581117993e-07, + "loss": 0.5053, + "step": 1814 + }, + { + "epoch": 0.8774474256707759, + "grad_norm": 0.4160802633412061, + "learning_rate": 4.0979942162580387e-07, + "loss": 0.516, + "step": 1815 + }, + { + "epoch": 0.8779308677785835, + "grad_norm": 0.41239251805984983, + "learning_rate": 4.06635818062438e-07, + "loss": 0.5278, + "step": 1816 + }, + { + "epoch": 0.8784143098863911, + "grad_norm": 0.40871873580107365, + "learning_rate": 4.0348395550812713e-07, + "loss": 0.5294, + "step": 1817 + }, + { + "epoch": 0.8788977519941987, + "grad_norm": 0.40365670038657436, + "learning_rate": 4.003438420192873e-07, + "loss": 0.5158, + "step": 1818 + }, + { + "epoch": 0.8793811941020063, + "grad_norm": 0.4255428234546921, + "learning_rate": 3.9721548562229985e-07, + "loss": 0.5114, + "step": 1819 + }, + { + "epoch": 0.8798646362098139, + "grad_norm": 0.41203315649756733, + "learning_rate": 3.9409889431349656e-07, + "loss": 0.5116, + "step": 1820 + }, + { + "epoch": 0.8803480783176215, + "grad_norm": 0.4149872650348109, + "learning_rate": 3.9099407605913576e-07, + "loss": 0.5099, + "step": 1821 + }, + { + "epoch": 0.880831520425429, + "grad_norm": 0.4258100076362105, + "learning_rate": 3.879010387953841e-07, + "loss": 0.5175, + "step": 1822 + }, + { + "epoch": 0.8813149625332366, + "grad_norm": 0.3902355927247227, + "learning_rate": 3.84819790428293e-07, + "loss": 0.498, + "step": 1823 + }, + { + "epoch": 0.8817984046410442, + "grad_norm": 0.40842472365457144, + "learning_rate": 3.8175033883378233e-07, + "loss": 0.518, + "step": 1824 + }, + { + "epoch": 0.8822818467488518, + "grad_norm": 0.4221970543634826, + "learning_rate": 3.7869269185761613e-07, + "loss": 0.5216, + "step": 1825 + }, + { + "epoch": 0.8827652888566594, + "grad_norm": 0.40616883661281006, + "learning_rate": 3.7564685731538985e-07, + "loss": 0.5066, + "step": 1826 + }, + { + "epoch": 0.883248730964467, + "grad_norm": 0.4061562407072031, + "learning_rate": 3.7261284299249967e-07, + "loss": 0.517, + "step": 1827 + }, + { + "epoch": 0.8837321730722746, + "grad_norm": 0.4079225433423233, + "learning_rate": 3.695906566441304e-07, + "loss": 0.4959, + "step": 1828 + }, + { + "epoch": 0.8842156151800822, + "grad_norm": 0.38197368709112006, + "learning_rate": 3.665803059952344e-07, + "loss": 0.4871, + "step": 1829 + }, + { + "epoch": 0.8846990572878898, + "grad_norm": 0.411849076052872, + "learning_rate": 3.63581798740511e-07, + "loss": 0.5143, + "step": 1830 + }, + { + "epoch": 0.8851824993956974, + "grad_norm": 0.393276210273132, + "learning_rate": 3.605951425443871e-07, + "loss": 0.4936, + "step": 1831 + }, + { + "epoch": 0.885665941503505, + "grad_norm": 0.369604359657528, + "learning_rate": 3.576203450409943e-07, + "loss": 0.4684, + "step": 1832 + }, + { + "epoch": 0.8861493836113126, + "grad_norm": 0.43326466002005165, + "learning_rate": 3.5465741383415684e-07, + "loss": 0.5104, + "step": 1833 + }, + { + "epoch": 0.8866328257191202, + "grad_norm": 0.41527359664646213, + "learning_rate": 3.5170635649736497e-07, + "loss": 0.519, + "step": 1834 + }, + { + "epoch": 0.8871162678269278, + "grad_norm": 0.41356740894281485, + "learning_rate": 3.487671805737597e-07, + "loss": 0.508, + "step": 1835 + }, + { + "epoch": 0.8875997099347354, + "grad_norm": 0.4050751048123327, + "learning_rate": 3.4583989357611037e-07, + "loss": 0.5135, + "step": 1836 + }, + { + "epoch": 0.888083152042543, + "grad_norm": 0.3923610722591795, + "learning_rate": 3.4292450298679945e-07, + "loss": 0.5075, + "step": 1837 + }, + { + "epoch": 0.8885665941503504, + "grad_norm": 0.41919225013002887, + "learning_rate": 3.400210162577999e-07, + "loss": 0.5166, + "step": 1838 + }, + { + "epoch": 0.889050036258158, + "grad_norm": 0.42118222715491443, + "learning_rate": 3.371294408106585e-07, + "loss": 0.523, + "step": 1839 + }, + { + "epoch": 0.8895334783659656, + "grad_norm": 0.3952238335142466, + "learning_rate": 3.3424978403647443e-07, + "loss": 0.5138, + "step": 1840 + }, + { + "epoch": 0.8900169204737732, + "grad_norm": 0.4163195177412695, + "learning_rate": 3.313820532958817e-07, + "loss": 0.5274, + "step": 1841 + }, + { + "epoch": 0.8905003625815808, + "grad_norm": 0.3930314520659748, + "learning_rate": 3.285262559190322e-07, + "loss": 0.4991, + "step": 1842 + }, + { + "epoch": 0.8909838046893884, + "grad_norm": 0.4336804309313973, + "learning_rate": 3.256823992055741e-07, + "loss": 0.5009, + "step": 1843 + }, + { + "epoch": 0.891467246797196, + "grad_norm": 0.41714068524986875, + "learning_rate": 3.228504904246349e-07, + "loss": 0.5238, + "step": 1844 + }, + { + "epoch": 0.8919506889050036, + "grad_norm": 0.41848606366751967, + "learning_rate": 3.20030536814801e-07, + "loss": 0.5202, + "step": 1845 + }, + { + "epoch": 0.8924341310128112, + "grad_norm": 0.422964314144621, + "learning_rate": 3.1722254558410047e-07, + "loss": 0.5104, + "step": 1846 + }, + { + "epoch": 0.8929175731206188, + "grad_norm": 0.41539348703446205, + "learning_rate": 3.144265239099864e-07, + "loss": 0.5152, + "step": 1847 + }, + { + "epoch": 0.8934010152284264, + "grad_norm": 0.3936271006898258, + "learning_rate": 3.1164247893931575e-07, + "loss": 0.5071, + "step": 1848 + }, + { + "epoch": 0.893884457336234, + "grad_norm": 0.4152031331913687, + "learning_rate": 3.088704177883306e-07, + "loss": 0.5181, + "step": 1849 + }, + { + "epoch": 0.8943678994440416, + "grad_norm": 0.4176432021270733, + "learning_rate": 3.06110347542643e-07, + "loss": 0.5235, + "step": 1850 + }, + { + "epoch": 0.8948513415518492, + "grad_norm": 0.3954219378639727, + "learning_rate": 3.033622752572157e-07, + "loss": 0.5019, + "step": 1851 + }, + { + "epoch": 0.8953347836596568, + "grad_norm": 0.45830856560980365, + "learning_rate": 3.0062620795634214e-07, + "loss": 0.5263, + "step": 1852 + }, + { + "epoch": 0.8958182257674644, + "grad_norm": 0.4009466020951186, + "learning_rate": 2.9790215263363174e-07, + "loss": 0.5222, + "step": 1853 + }, + { + "epoch": 0.896301667875272, + "grad_norm": 0.3933495297633584, + "learning_rate": 2.951901162519877e-07, + "loss": 0.5233, + "step": 1854 + }, + { + "epoch": 0.8967851099830795, + "grad_norm": 0.39895160904445, + "learning_rate": 2.9249010574359636e-07, + "loss": 0.5212, + "step": 1855 + }, + { + "epoch": 0.8972685520908871, + "grad_norm": 0.42068899596041226, + "learning_rate": 2.898021280098995e-07, + "loss": 0.5168, + "step": 1856 + }, + { + "epoch": 0.8977519941986947, + "grad_norm": 0.39971963228555085, + "learning_rate": 2.8712618992158656e-07, + "loss": 0.5084, + "step": 1857 + }, + { + "epoch": 0.8982354363065023, + "grad_norm": 0.3999616227972635, + "learning_rate": 2.8446229831856964e-07, + "loss": 0.5088, + "step": 1858 + }, + { + "epoch": 0.8987188784143099, + "grad_norm": 0.4001447692276326, + "learning_rate": 2.8181046000997136e-07, + "loss": 0.521, + "step": 1859 + }, + { + "epoch": 0.8992023205221175, + "grad_norm": 0.41592034251039167, + "learning_rate": 2.791706817741041e-07, + "loss": 0.5072, + "step": 1860 + }, + { + "epoch": 0.8996857626299251, + "grad_norm": 0.4445686187455443, + "learning_rate": 2.765429703584538e-07, + "loss": 0.5148, + "step": 1861 + }, + { + "epoch": 0.9001692047377327, + "grad_norm": 0.40228802491920107, + "learning_rate": 2.739273324796621e-07, + "loss": 0.5262, + "step": 1862 + }, + { + "epoch": 0.9006526468455403, + "grad_norm": 0.40404504261863744, + "learning_rate": 2.7132377482351037e-07, + "loss": 0.5147, + "step": 1863 + }, + { + "epoch": 0.9011360889533478, + "grad_norm": 0.3986359660989621, + "learning_rate": 2.687323040449025e-07, + "loss": 0.5172, + "step": 1864 + }, + { + "epoch": 0.9016195310611554, + "grad_norm": 0.42039178580411435, + "learning_rate": 2.6615292676784533e-07, + "loss": 0.5191, + "step": 1865 + }, + { + "epoch": 0.902102973168963, + "grad_norm": 0.4168785648766661, + "learning_rate": 2.635856495854372e-07, + "loss": 0.5116, + "step": 1866 + }, + { + "epoch": 0.9025864152767706, + "grad_norm": 0.4006359687639295, + "learning_rate": 2.6103047905984224e-07, + "loss": 0.5243, + "step": 1867 + }, + { + "epoch": 0.9030698573845782, + "grad_norm": 0.4136741219117099, + "learning_rate": 2.584874217222855e-07, + "loss": 0.516, + "step": 1868 + }, + { + "epoch": 0.9035532994923858, + "grad_norm": 0.41454758895188654, + "learning_rate": 2.5595648407302496e-07, + "loss": 0.5299, + "step": 1869 + }, + { + "epoch": 0.9040367416001934, + "grad_norm": 0.43072596167116733, + "learning_rate": 2.53437672581342e-07, + "loss": 0.5192, + "step": 1870 + }, + { + "epoch": 0.9045201837080009, + "grad_norm": 0.413346134850188, + "learning_rate": 2.5093099368551974e-07, + "loss": 0.5135, + "step": 1871 + }, + { + "epoch": 0.9050036258158085, + "grad_norm": 0.44414111234791465, + "learning_rate": 2.484364537928341e-07, + "loss": 0.5248, + "step": 1872 + }, + { + "epoch": 0.9054870679236161, + "grad_norm": 0.41031454686253116, + "learning_rate": 2.45954059279529e-07, + "loss": 0.5198, + "step": 1873 + }, + { + "epoch": 0.9059705100314237, + "grad_norm": 0.3982976345229948, + "learning_rate": 2.4348381649080486e-07, + "loss": 0.5163, + "step": 1874 + }, + { + "epoch": 0.9064539521392313, + "grad_norm": 0.4007617837820295, + "learning_rate": 2.41025731740801e-07, + "loss": 0.511, + "step": 1875 + }, + { + "epoch": 0.9069373942470389, + "grad_norm": 0.40168617787804406, + "learning_rate": 2.3857981131258037e-07, + "loss": 0.5114, + "step": 1876 + }, + { + "epoch": 0.9074208363548465, + "grad_norm": 0.38110421429609603, + "learning_rate": 2.3614606145811347e-07, + "loss": 0.4992, + "step": 1877 + }, + { + "epoch": 0.9079042784626541, + "grad_norm": 0.3870732423514054, + "learning_rate": 2.3372448839825978e-07, + "loss": 0.4887, + "step": 1878 + }, + { + "epoch": 0.9083877205704617, + "grad_norm": 0.39979584331802676, + "learning_rate": 2.3131509832275633e-07, + "loss": 0.5122, + "step": 1879 + }, + { + "epoch": 0.9088711626782693, + "grad_norm": 0.3996732608438804, + "learning_rate": 2.2891789739019733e-07, + "loss": 0.5102, + "step": 1880 + }, + { + "epoch": 0.9093546047860769, + "grad_norm": 0.40968516048558534, + "learning_rate": 2.2653289172802295e-07, + "loss": 0.5049, + "step": 1881 + }, + { + "epoch": 0.9098380468938845, + "grad_norm": 0.4006751726323446, + "learning_rate": 2.241600874324984e-07, + "loss": 0.5144, + "step": 1882 + }, + { + "epoch": 0.9103214890016921, + "grad_norm": 0.4066456668668, + "learning_rate": 2.2179949056870432e-07, + "loss": 0.5184, + "step": 1883 + }, + { + "epoch": 0.9108049311094997, + "grad_norm": 0.4179374057794063, + "learning_rate": 2.194511071705141e-07, + "loss": 0.5131, + "step": 1884 + }, + { + "epoch": 0.9112883732173073, + "grad_norm": 0.419480536858942, + "learning_rate": 2.1711494324058724e-07, + "loss": 0.5147, + "step": 1885 + }, + { + "epoch": 0.9117718153251149, + "grad_norm": 0.40624640146953556, + "learning_rate": 2.1479100475034598e-07, + "loss": 0.5084, + "step": 1886 + }, + { + "epoch": 0.9122552574329225, + "grad_norm": 0.40367583928635464, + "learning_rate": 2.1247929763996534e-07, + "loss": 0.4832, + "step": 1887 + }, + { + "epoch": 0.91273869954073, + "grad_norm": 0.3989060344990105, + "learning_rate": 2.101798278183542e-07, + "loss": 0.5144, + "step": 1888 + }, + { + "epoch": 0.9132221416485375, + "grad_norm": 0.3998308893808953, + "learning_rate": 2.0789260116314215e-07, + "loss": 0.5081, + "step": 1889 + }, + { + "epoch": 0.9137055837563451, + "grad_norm": 0.4063990008087812, + "learning_rate": 2.0561762352066638e-07, + "loss": 0.5109, + "step": 1890 + }, + { + "epoch": 0.9141890258641527, + "grad_norm": 0.4167108480628528, + "learning_rate": 2.0335490070595208e-07, + "loss": 0.5186, + "step": 1891 + }, + { + "epoch": 0.9146724679719603, + "grad_norm": 0.39430080435851855, + "learning_rate": 2.011044385027011e-07, + "loss": 0.5101, + "step": 1892 + }, + { + "epoch": 0.9151559100797679, + "grad_norm": 0.42096559238441866, + "learning_rate": 1.988662426632765e-07, + "loss": 0.5078, + "step": 1893 + }, + { + "epoch": 0.9156393521875755, + "grad_norm": 0.39723951707790667, + "learning_rate": 1.9664031890868795e-07, + "loss": 0.5223, + "step": 1894 + }, + { + "epoch": 0.9161227942953831, + "grad_norm": 0.3912147208179025, + "learning_rate": 1.9442667292857432e-07, + "loss": 0.509, + "step": 1895 + }, + { + "epoch": 0.9166062364031907, + "grad_norm": 0.4054442997347736, + "learning_rate": 1.922253103811944e-07, + "loss": 0.4972, + "step": 1896 + }, + { + "epoch": 0.9170896785109983, + "grad_norm": 0.4117401816100168, + "learning_rate": 1.9003623689340777e-07, + "loss": 0.5143, + "step": 1897 + }, + { + "epoch": 0.9175731206188059, + "grad_norm": 0.40528953423093284, + "learning_rate": 1.8785945806066297e-07, + "loss": 0.5186, + "step": 1898 + }, + { + "epoch": 0.9180565627266135, + "grad_norm": 0.4027696401480633, + "learning_rate": 1.85694979446982e-07, + "loss": 0.5167, + "step": 1899 + }, + { + "epoch": 0.9185400048344211, + "grad_norm": 0.38938110778215645, + "learning_rate": 1.835428065849465e-07, + "loss": 0.5141, + "step": 1900 + }, + { + "epoch": 0.9190234469422287, + "grad_norm": 0.3958049685314876, + "learning_rate": 1.814029449756849e-07, + "loss": 0.5231, + "step": 1901 + }, + { + "epoch": 0.9195068890500363, + "grad_norm": 0.4039199277502588, + "learning_rate": 1.7927540008885414e-07, + "loss": 0.5088, + "step": 1902 + }, + { + "epoch": 0.9199903311578439, + "grad_norm": 0.40426884197944674, + "learning_rate": 1.7716017736263192e-07, + "loss": 0.5129, + "step": 1903 + }, + { + "epoch": 0.9204737732656514, + "grad_norm": 0.41358470698939953, + "learning_rate": 1.7505728220369667e-07, + "loss": 0.5203, + "step": 1904 + }, + { + "epoch": 0.920957215373459, + "grad_norm": 0.4250820090378729, + "learning_rate": 1.729667199872187e-07, + "loss": 0.5223, + "step": 1905 + }, + { + "epoch": 0.9214406574812666, + "grad_norm": 0.40899977989644076, + "learning_rate": 1.70888496056843e-07, + "loss": 0.5107, + "step": 1906 + }, + { + "epoch": 0.9219240995890742, + "grad_norm": 0.4187760713922149, + "learning_rate": 1.6882261572467862e-07, + "loss": 0.5142, + "step": 1907 + }, + { + "epoch": 0.9224075416968818, + "grad_norm": 0.39684261118945696, + "learning_rate": 1.6676908427128103e-07, + "loss": 0.4847, + "step": 1908 + }, + { + "epoch": 0.9228909838046894, + "grad_norm": 0.4124141033869449, + "learning_rate": 1.64727906945642e-07, + "loss": 0.5063, + "step": 1909 + }, + { + "epoch": 0.923374425912497, + "grad_norm": 0.4104731721152495, + "learning_rate": 1.6269908896517638e-07, + "loss": 0.5035, + "step": 1910 + }, + { + "epoch": 0.9238578680203046, + "grad_norm": 0.38208183163995635, + "learning_rate": 1.6068263551570596e-07, + "loss": 0.4855, + "step": 1911 + }, + { + "epoch": 0.9243413101281122, + "grad_norm": 0.37943822460943005, + "learning_rate": 1.5867855175144885e-07, + "loss": 0.4863, + "step": 1912 + }, + { + "epoch": 0.9248247522359198, + "grad_norm": 0.4169103989292416, + "learning_rate": 1.5668684279500245e-07, + "loss": 0.5077, + "step": 1913 + }, + { + "epoch": 0.9253081943437274, + "grad_norm": 0.41157707540822663, + "learning_rate": 1.5470751373733773e-07, + "loss": 0.5184, + "step": 1914 + }, + { + "epoch": 0.925791636451535, + "grad_norm": 0.39771451862665147, + "learning_rate": 1.5274056963777817e-07, + "loss": 0.5094, + "step": 1915 + }, + { + "epoch": 0.9262750785593425, + "grad_norm": 0.4092987974762817, + "learning_rate": 1.507860155239921e-07, + "loss": 0.5154, + "step": 1916 + }, + { + "epoch": 0.9267585206671501, + "grad_norm": 0.3854503813446518, + "learning_rate": 1.488438563919764e-07, + "loss": 0.4938, + "step": 1917 + }, + { + "epoch": 0.9272419627749577, + "grad_norm": 0.3900052964813903, + "learning_rate": 1.4691409720604732e-07, + "loss": 0.5077, + "step": 1918 + }, + { + "epoch": 0.9277254048827653, + "grad_norm": 0.40750712678387396, + "learning_rate": 1.449967428988247e-07, + "loss": 0.5145, + "step": 1919 + }, + { + "epoch": 0.9282088469905729, + "grad_norm": 0.4023813113333878, + "learning_rate": 1.4309179837122045e-07, + "loss": 0.5291, + "step": 1920 + }, + { + "epoch": 0.9286922890983804, + "grad_norm": 0.38502235475455626, + "learning_rate": 1.411992684924257e-07, + "loss": 0.5119, + "step": 1921 + }, + { + "epoch": 0.929175731206188, + "grad_norm": 0.40862887218787325, + "learning_rate": 1.3931915809990039e-07, + "loss": 0.5106, + "step": 1922 + }, + { + "epoch": 0.9296591733139956, + "grad_norm": 0.4123756674563694, + "learning_rate": 1.374514719993575e-07, + "loss": 0.5126, + "step": 1923 + }, + { + "epoch": 0.9301426154218032, + "grad_norm": 0.41456641529199556, + "learning_rate": 1.3559621496475438e-07, + "loss": 0.5145, + "step": 1924 + }, + { + "epoch": 0.9306260575296108, + "grad_norm": 0.4049152537963314, + "learning_rate": 1.3375339173827551e-07, + "loss": 0.5261, + "step": 1925 + }, + { + "epoch": 0.9311094996374184, + "grad_norm": 0.37450439680837744, + "learning_rate": 1.3192300703032733e-07, + "loss": 0.474, + "step": 1926 + }, + { + "epoch": 0.931592941745226, + "grad_norm": 0.41100475742292075, + "learning_rate": 1.3010506551952018e-07, + "loss": 0.5134, + "step": 1927 + }, + { + "epoch": 0.9320763838530336, + "grad_norm": 0.41369315234307685, + "learning_rate": 1.2829957185265863e-07, + "loss": 0.52, + "step": 1928 + }, + { + "epoch": 0.9325598259608412, + "grad_norm": 0.3885589982730842, + "learning_rate": 1.2650653064473106e-07, + "loss": 0.5031, + "step": 1929 + }, + { + "epoch": 0.9330432680686488, + "grad_norm": 0.3951920703691663, + "learning_rate": 1.2472594647889357e-07, + "loss": 0.5092, + "step": 1930 + }, + { + "epoch": 0.9335267101764564, + "grad_norm": 0.40947647060207415, + "learning_rate": 1.2295782390646494e-07, + "loss": 0.5177, + "step": 1931 + }, + { + "epoch": 0.934010152284264, + "grad_norm": 0.390574491653679, + "learning_rate": 1.2120216744690716e-07, + "loss": 0.5133, + "step": 1932 + }, + { + "epoch": 0.9344935943920716, + "grad_norm": 0.4045498383765011, + "learning_rate": 1.194589815878211e-07, + "loss": 0.5163, + "step": 1933 + }, + { + "epoch": 0.9349770364998792, + "grad_norm": 0.40440549648310886, + "learning_rate": 1.177282707849281e-07, + "loss": 0.5181, + "step": 1934 + }, + { + "epoch": 0.9354604786076868, + "grad_norm": 0.4024689599876574, + "learning_rate": 1.1601003946206723e-07, + "loss": 0.5181, + "step": 1935 + }, + { + "epoch": 0.9359439207154944, + "grad_norm": 0.3986512567562451, + "learning_rate": 1.1430429201117476e-07, + "loss": 0.5032, + "step": 1936 + }, + { + "epoch": 0.9364273628233019, + "grad_norm": 0.39397430112101045, + "learning_rate": 1.1261103279227858e-07, + "loss": 0.5178, + "step": 1937 + }, + { + "epoch": 0.9369108049311095, + "grad_norm": 0.4291769455926264, + "learning_rate": 1.1093026613348601e-07, + "loss": 0.5196, + "step": 1938 + }, + { + "epoch": 0.937394247038917, + "grad_norm": 0.3917679927009391, + "learning_rate": 1.0926199633097156e-07, + "loss": 0.4919, + "step": 1939 + }, + { + "epoch": 0.9378776891467246, + "grad_norm": 0.42599062790587783, + "learning_rate": 1.0760622764896866e-07, + "loss": 0.5147, + "step": 1940 + }, + { + "epoch": 0.9383611312545322, + "grad_norm": 0.4023777838627757, + "learning_rate": 1.0596296431975406e-07, + "loss": 0.5156, + "step": 1941 + }, + { + "epoch": 0.9388445733623398, + "grad_norm": 0.3966354448847634, + "learning_rate": 1.0433221054364174e-07, + "loss": 0.5065, + "step": 1942 + }, + { + "epoch": 0.9393280154701474, + "grad_norm": 0.4013413460541232, + "learning_rate": 1.0271397048897014e-07, + "loss": 0.5053, + "step": 1943 + }, + { + "epoch": 0.939811457577955, + "grad_norm": 0.37653088174864213, + "learning_rate": 1.0110824829209164e-07, + "loss": 0.4939, + "step": 1944 + }, + { + "epoch": 0.9402948996857626, + "grad_norm": 0.399035469753345, + "learning_rate": 9.951504805735979e-08, + "loss": 0.5106, + "step": 1945 + }, + { + "epoch": 0.9407783417935702, + "grad_norm": 0.3991989592342914, + "learning_rate": 9.793437385712479e-08, + "loss": 0.5153, + "step": 1946 + }, + { + "epoch": 0.9412617839013778, + "grad_norm": 0.4057585743893453, + "learning_rate": 9.636622973171583e-08, + "loss": 0.51, + "step": 1947 + }, + { + "epoch": 0.9417452260091854, + "grad_norm": 0.4054528739627977, + "learning_rate": 9.481061968943717e-08, + "loss": 0.516, + "step": 1948 + }, + { + "epoch": 0.942228668116993, + "grad_norm": 0.3921980636127477, + "learning_rate": 9.3267547706552e-08, + "loss": 0.5051, + "step": 1949 + }, + { + "epoch": 0.9427121102248006, + "grad_norm": 0.39913144156030567, + "learning_rate": 9.17370177272775e-08, + "loss": 0.5055, + "step": 1950 + }, + { + "epoch": 0.9431955523326082, + "grad_norm": 0.4004586843938766, + "learning_rate": 9.021903366377093e-08, + "loss": 0.5164, + "step": 1951 + }, + { + "epoch": 0.9436789944404158, + "grad_norm": 0.4037223050343566, + "learning_rate": 8.8713599396123e-08, + "loss": 0.5098, + "step": 1952 + }, + { + "epoch": 0.9441624365482234, + "grad_norm": 0.39850858877215634, + "learning_rate": 8.72207187723445e-08, + "loss": 0.5211, + "step": 1953 + }, + { + "epoch": 0.9446458786560309, + "grad_norm": 0.41059877409881057, + "learning_rate": 8.5740395608358e-08, + "loss": 0.5121, + "step": 1954 + }, + { + "epoch": 0.9451293207638385, + "grad_norm": 0.40573184845060545, + "learning_rate": 8.427263368798955e-08, + "loss": 0.5256, + "step": 1955 + }, + { + "epoch": 0.9456127628716461, + "grad_norm": 0.3966583772201167, + "learning_rate": 8.281743676295639e-08, + "loss": 0.5183, + "step": 1956 + }, + { + "epoch": 0.9460962049794537, + "grad_norm": 0.40701943797191764, + "learning_rate": 8.13748085528604e-08, + "loss": 0.5135, + "step": 1957 + }, + { + "epoch": 0.9465796470872613, + "grad_norm": 0.37606341196980025, + "learning_rate": 7.99447527451741e-08, + "loss": 0.4903, + "step": 1958 + }, + { + "epoch": 0.9470630891950689, + "grad_norm": 0.4114856897492863, + "learning_rate": 7.852727299523577e-08, + "loss": 0.5068, + "step": 1959 + }, + { + "epoch": 0.9475465313028765, + "grad_norm": 0.4093526523044555, + "learning_rate": 7.71223729262377e-08, + "loss": 0.5127, + "step": 1960 + }, + { + "epoch": 0.9480299734106841, + "grad_norm": 0.4130076310229578, + "learning_rate": 7.573005612921903e-08, + "loss": 0.5121, + "step": 1961 + }, + { + "epoch": 0.9485134155184917, + "grad_norm": 0.40254945616875554, + "learning_rate": 7.435032616305238e-08, + "loss": 0.5178, + "step": 1962 + }, + { + "epoch": 0.9489968576262993, + "grad_norm": 0.4108181664423654, + "learning_rate": 7.298318655443893e-08, + "loss": 0.5078, + "step": 1963 + }, + { + "epoch": 0.9494802997341069, + "grad_norm": 0.3954161759006289, + "learning_rate": 7.162864079789777e-08, + "loss": 0.5137, + "step": 1964 + }, + { + "epoch": 0.9499637418419145, + "grad_norm": 0.3993428213266096, + "learning_rate": 7.028669235575714e-08, + "loss": 0.496, + "step": 1965 + }, + { + "epoch": 0.950447183949722, + "grad_norm": 0.41493027982851327, + "learning_rate": 6.895734465814597e-08, + "loss": 0.5257, + "step": 1966 + }, + { + "epoch": 0.9509306260575296, + "grad_norm": 0.38537633628397905, + "learning_rate": 6.764060110298287e-08, + "loss": 0.5208, + "step": 1967 + }, + { + "epoch": 0.9514140681653372, + "grad_norm": 0.41057398606285567, + "learning_rate": 6.633646505597113e-08, + "loss": 0.5224, + "step": 1968 + }, + { + "epoch": 0.9518975102731448, + "grad_norm": 0.4420797620121168, + "learning_rate": 6.504493985058813e-08, + "loss": 0.5108, + "step": 1969 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.39854773939873966, + "learning_rate": 6.376602878807592e-08, + "loss": 0.5134, + "step": 1970 + }, + { + "epoch": 0.9528643944887599, + "grad_norm": 0.4104047856111181, + "learning_rate": 6.249973513743345e-08, + "loss": 0.5079, + "step": 1971 + }, + { + "epoch": 0.9533478365965675, + "grad_norm": 0.40077931999667527, + "learning_rate": 6.124606213541052e-08, + "loss": 0.5196, + "step": 1972 + }, + { + "epoch": 0.9538312787043751, + "grad_norm": 0.43500257686302385, + "learning_rate": 6.000501298649653e-08, + "loss": 0.5197, + "step": 1973 + }, + { + "epoch": 0.9543147208121827, + "grad_norm": 0.4186094433656202, + "learning_rate": 5.8776590862911764e-08, + "loss": 0.5135, + "step": 1974 + }, + { + "epoch": 0.9547981629199903, + "grad_norm": 0.4119358911199865, + "learning_rate": 5.756079890460342e-08, + "loss": 0.5137, + "step": 1975 + }, + { + "epoch": 0.9552816050277979, + "grad_norm": 0.39694645564275877, + "learning_rate": 5.635764021923229e-08, + "loss": 0.5121, + "step": 1976 + }, + { + "epoch": 0.9557650471356055, + "grad_norm": 0.4154887872586203, + "learning_rate": 5.5167117882171104e-08, + "loss": 0.516, + "step": 1977 + }, + { + "epoch": 0.9562484892434131, + "grad_norm": 0.7692472130509296, + "learning_rate": 5.3989234936489556e-08, + "loss": 0.5055, + "step": 1978 + }, + { + "epoch": 0.9567319313512207, + "grad_norm": 0.4198618304821996, + "learning_rate": 5.2823994392951497e-08, + "loss": 0.5094, + "step": 1979 + }, + { + "epoch": 0.9572153734590283, + "grad_norm": 0.39385026351820934, + "learning_rate": 5.167139923000553e-08, + "loss": 0.4933, + "step": 1980 + }, + { + "epoch": 0.9576988155668359, + "grad_norm": 0.4159053427086944, + "learning_rate": 5.053145239377777e-08, + "loss": 0.4936, + "step": 1981 + }, + { + "epoch": 0.9581822576746435, + "grad_norm": 0.3990167973444839, + "learning_rate": 4.940415679806465e-08, + "loss": 0.5124, + "step": 1982 + }, + { + "epoch": 0.9586656997824511, + "grad_norm": 0.4012277528608715, + "learning_rate": 4.828951532432457e-08, + "loss": 0.5151, + "step": 1983 + }, + { + "epoch": 0.9591491418902587, + "grad_norm": 0.4099731484035176, + "learning_rate": 4.718753082167071e-08, + "loss": 0.5191, + "step": 1984 + }, + { + "epoch": 0.9596325839980663, + "grad_norm": 0.41474696363438857, + "learning_rate": 4.6098206106863774e-08, + "loss": 0.515, + "step": 1985 + }, + { + "epoch": 0.9601160261058739, + "grad_norm": 0.4044716506352786, + "learning_rate": 4.5021543964306466e-08, + "loss": 0.5123, + "step": 1986 + }, + { + "epoch": 0.9605994682136814, + "grad_norm": 0.40133573312591214, + "learning_rate": 4.395754714603351e-08, + "loss": 0.5133, + "step": 1987 + }, + { + "epoch": 0.961082910321489, + "grad_norm": 0.4089192998561785, + "learning_rate": 4.290621837170661e-08, + "loss": 0.5236, + "step": 1988 + }, + { + "epoch": 0.9615663524292966, + "grad_norm": 0.39452360352891674, + "learning_rate": 4.186756032860728e-08, + "loss": 0.5137, + "step": 1989 + }, + { + "epoch": 0.9620497945371042, + "grad_norm": 0.39867371724056727, + "learning_rate": 4.08415756716285e-08, + "loss": 0.5093, + "step": 1990 + }, + { + "epoch": 0.9625332366449117, + "grad_norm": 0.357065447847406, + "learning_rate": 3.9828267023269696e-08, + "loss": 0.4505, + "step": 1991 + }, + { + "epoch": 0.9630166787527193, + "grad_norm": 0.427089982663271, + "learning_rate": 3.8827636973630126e-08, + "loss": 0.5101, + "step": 1992 + }, + { + "epoch": 0.9635001208605269, + "grad_norm": 0.4025101063369687, + "learning_rate": 3.783968808039995e-08, + "loss": 0.5245, + "step": 1993 + }, + { + "epoch": 0.9639835629683345, + "grad_norm": 0.4012223737061637, + "learning_rate": 3.68644228688575e-08, + "loss": 0.514, + "step": 1994 + }, + { + "epoch": 0.9644670050761421, + "grad_norm": 0.39715847085154765, + "learning_rate": 3.590184383185758e-08, + "loss": 0.507, + "step": 1995 + }, + { + "epoch": 0.9649504471839497, + "grad_norm": 0.4019019064729592, + "learning_rate": 3.4951953429831484e-08, + "loss": 0.5093, + "step": 1996 + }, + { + "epoch": 0.9654338892917573, + "grad_norm": 0.3997438820964838, + "learning_rate": 3.401475409077426e-08, + "loss": 0.4987, + "step": 1997 + }, + { + "epoch": 0.9659173313995649, + "grad_norm": 0.42247021949710184, + "learning_rate": 3.309024821024354e-08, + "loss": 0.5099, + "step": 1998 + }, + { + "epoch": 0.9664007735073725, + "grad_norm": 0.4228197536210846, + "learning_rate": 3.2178438151350685e-08, + "loss": 0.5181, + "step": 1999 + }, + { + "epoch": 0.9668842156151801, + "grad_norm": 0.40195549014330173, + "learning_rate": 3.127932624475638e-08, + "loss": 0.5118, + "step": 2000 + }, + { + "epoch": 0.9673676577229877, + "grad_norm": 0.40083823310970984, + "learning_rate": 3.039291478866169e-08, + "loss": 0.5265, + "step": 2001 + }, + { + "epoch": 0.9678510998307953, + "grad_norm": 0.4054162095867977, + "learning_rate": 2.9519206048807535e-08, + "loss": 0.5173, + "step": 2002 + }, + { + "epoch": 0.9683345419386028, + "grad_norm": 0.4091589042260666, + "learning_rate": 2.8658202258462498e-08, + "loss": 0.5199, + "step": 2003 + }, + { + "epoch": 0.9688179840464104, + "grad_norm": 0.37360554914951866, + "learning_rate": 2.7809905618422227e-08, + "loss": 0.4667, + "step": 2004 + }, + { + "epoch": 0.969301426154218, + "grad_norm": 0.4264262470861418, + "learning_rate": 2.6974318297001144e-08, + "loss": 0.5208, + "step": 2005 + }, + { + "epoch": 0.9697848682620256, + "grad_norm": 0.4133603239690626, + "learning_rate": 2.615144243002743e-08, + "loss": 0.5049, + "step": 2006 + }, + { + "epoch": 0.9702683103698332, + "grad_norm": 0.41234915425778607, + "learning_rate": 2.534128012083914e-08, + "loss": 0.5215, + "step": 2007 + }, + { + "epoch": 0.9707517524776408, + "grad_norm": 0.39530605693418713, + "learning_rate": 2.4543833440275332e-08, + "loss": 0.5096, + "step": 2008 + }, + { + "epoch": 0.9712351945854484, + "grad_norm": 0.42034129099753553, + "learning_rate": 2.375910442667495e-08, + "loss": 0.5111, + "step": 2009 + }, + { + "epoch": 0.971718636693256, + "grad_norm": 0.4128961831040994, + "learning_rate": 2.298709508586794e-08, + "loss": 0.5136, + "step": 2010 + }, + { + "epoch": 0.9722020788010636, + "grad_norm": 0.40946352601157776, + "learning_rate": 2.2227807391172474e-08, + "loss": 0.5239, + "step": 2011 + }, + { + "epoch": 0.9726855209088712, + "grad_norm": 0.3999251664775986, + "learning_rate": 2.1481243283389408e-08, + "loss": 0.514, + "step": 2012 + }, + { + "epoch": 0.9731689630166788, + "grad_norm": 0.412440530125608, + "learning_rate": 2.074740467079672e-08, + "loss": 0.5174, + "step": 2013 + }, + { + "epoch": 0.9736524051244864, + "grad_norm": 0.40416741933458183, + "learning_rate": 2.002629342914453e-08, + "loss": 0.5173, + "step": 2014 + }, + { + "epoch": 0.974135847232294, + "grad_norm": 0.4091688587167212, + "learning_rate": 1.9317911401651734e-08, + "loss": 0.5035, + "step": 2015 + }, + { + "epoch": 0.9746192893401016, + "grad_norm": 0.41181207482580323, + "learning_rate": 1.862226039899995e-08, + "loss": 0.5194, + "step": 2016 + }, + { + "epoch": 0.9751027314479092, + "grad_norm": 0.38917062513507206, + "learning_rate": 1.7939342199329023e-08, + "loss": 0.5081, + "step": 2017 + }, + { + "epoch": 0.9755861735557168, + "grad_norm": 0.4058095413062891, + "learning_rate": 1.7269158548232633e-08, + "loss": 0.514, + "step": 2018 + }, + { + "epoch": 0.9760696156635243, + "grad_norm": 0.3909310997249257, + "learning_rate": 1.661171115875493e-08, + "loss": 0.5086, + "step": 2019 + }, + { + "epoch": 0.9765530577713318, + "grad_norm": 0.3924463631554743, + "learning_rate": 1.5967001711383877e-08, + "loss": 0.5074, + "step": 2020 + }, + { + "epoch": 0.9770364998791394, + "grad_norm": 0.3897349184690982, + "learning_rate": 1.5335031854049055e-08, + "loss": 0.5164, + "step": 2021 + }, + { + "epoch": 0.977519941986947, + "grad_norm": 0.4006696563415638, + "learning_rate": 1.4715803202116075e-08, + "loss": 0.516, + "step": 2022 + }, + { + "epoch": 0.9780033840947546, + "grad_norm": 0.39127207135897235, + "learning_rate": 1.4109317338383832e-08, + "loss": 0.4864, + "step": 2023 + }, + { + "epoch": 0.9784868262025622, + "grad_norm": 0.41074499955315413, + "learning_rate": 1.3515575813078386e-08, + "loss": 0.5276, + "step": 2024 + }, + { + "epoch": 0.9789702683103698, + "grad_norm": 0.3989638057067789, + "learning_rate": 1.2934580143851294e-08, + "loss": 0.5116, + "step": 2025 + }, + { + "epoch": 0.9794537104181774, + "grad_norm": 0.4179669163774858, + "learning_rate": 1.2366331815774069e-08, + "loss": 0.5169, + "step": 2026 + }, + { + "epoch": 0.979937152525985, + "grad_norm": 0.420952308284563, + "learning_rate": 1.1810832281335394e-08, + "loss": 0.5221, + "step": 2027 + }, + { + "epoch": 0.9804205946337926, + "grad_norm": 0.40444089801366945, + "learning_rate": 1.1268082960436688e-08, + "loss": 0.526, + "step": 2028 + }, + { + "epoch": 0.9809040367416002, + "grad_norm": 0.39774599938725236, + "learning_rate": 1.0738085240389883e-08, + "loss": 0.5158, + "step": 2029 + }, + { + "epoch": 0.9813874788494078, + "grad_norm": 0.40528876629152616, + "learning_rate": 1.0220840475910765e-08, + "loss": 0.5148, + "step": 2030 + }, + { + "epoch": 0.9818709209572154, + "grad_norm": 0.3880426443734388, + "learning_rate": 9.716349989118412e-09, + "loss": 0.4977, + "step": 2031 + }, + { + "epoch": 0.982354363065023, + "grad_norm": 0.397796748759872, + "learning_rate": 9.224615069532428e-09, + "loss": 0.5183, + "step": 2032 + }, + { + "epoch": 0.9828378051728306, + "grad_norm": 0.41455233095701044, + "learning_rate": 8.745636974066274e-09, + "loss": 0.5151, + "step": 2033 + }, + { + "epoch": 0.9833212472806382, + "grad_norm": 0.39149878382311915, + "learning_rate": 8.279416927026163e-09, + "loss": 0.4852, + "step": 2034 + }, + { + "epoch": 0.9838046893884458, + "grad_norm": 0.428044296219464, + "learning_rate": 7.82595612010828e-09, + "loss": 0.5088, + "step": 2035 + }, + { + "epoch": 0.9842881314962533, + "grad_norm": 0.39066146033771326, + "learning_rate": 7.385255712395456e-09, + "loss": 0.5092, + "step": 2036 + }, + { + "epoch": 0.9847715736040609, + "grad_norm": 0.3944546565780817, + "learning_rate": 6.9573168303532775e-09, + "loss": 0.5048, + "step": 2037 + }, + { + "epoch": 0.9852550157118685, + "grad_norm": 0.3875137169857006, + "learning_rate": 6.542140567827871e-09, + "loss": 0.5166, + "step": 2038 + }, + { + "epoch": 0.9857384578196761, + "grad_norm": 0.3942008356766705, + "learning_rate": 6.1397279860431205e-09, + "loss": 0.4846, + "step": 2039 + }, + { + "epoch": 0.9862218999274837, + "grad_norm": 0.4122295335735729, + "learning_rate": 5.750080113598455e-09, + "loss": 0.5191, + "step": 2040 + }, + { + "epoch": 0.9867053420352913, + "grad_norm": 0.4027922091033447, + "learning_rate": 5.373197946464403e-09, + "loss": 0.509, + "step": 2041 + }, + { + "epoch": 0.9871887841430989, + "grad_norm": 0.41834346896126373, + "learning_rate": 5.009082447983149e-09, + "loss": 0.52, + "step": 2042 + }, + { + "epoch": 0.9876722262509064, + "grad_norm": 0.4293813449158595, + "learning_rate": 4.65773454886298e-09, + "loss": 0.5131, + "step": 2043 + }, + { + "epoch": 0.988155668358714, + "grad_norm": 0.3928243234499177, + "learning_rate": 4.319155147176624e-09, + "loss": 0.515, + "step": 2044 + }, + { + "epoch": 0.9886391104665216, + "grad_norm": 0.4041253868270013, + "learning_rate": 3.9933451083612464e-09, + "loss": 0.5001, + "step": 2045 + }, + { + "epoch": 0.9891225525743292, + "grad_norm": 0.4010404057661429, + "learning_rate": 3.6803052652134572e-09, + "loss": 0.5077, + "step": 2046 + }, + { + "epoch": 0.9896059946821368, + "grad_norm": 0.4298866341542551, + "learning_rate": 3.3800364178881996e-09, + "loss": 0.5112, + "step": 2047 + }, + { + "epoch": 0.9900894367899444, + "grad_norm": 0.39490801058055036, + "learning_rate": 3.092539333896527e-09, + "loss": 0.5087, + "step": 2048 + }, + { + "epoch": 0.990572878897752, + "grad_norm": 0.4110997944280951, + "learning_rate": 2.817814748104497e-09, + "loss": 0.5044, + "step": 2049 + }, + { + "epoch": 0.9910563210055596, + "grad_norm": 0.4386800273412446, + "learning_rate": 2.555863362730393e-09, + "loss": 0.5217, + "step": 2050 + }, + { + "epoch": 0.9915397631133672, + "grad_norm": 0.411463570514358, + "learning_rate": 2.30668584734306e-09, + "loss": 0.5117, + "step": 2051 + }, + { + "epoch": 0.9920232052211748, + "grad_norm": 0.40419424446848334, + "learning_rate": 2.070282838859683e-09, + "loss": 0.5056, + "step": 2052 + }, + { + "epoch": 0.9925066473289823, + "grad_norm": 0.40703300041841234, + "learning_rate": 1.8466549415463442e-09, + "loss": 0.5319, + "step": 2053 + }, + { + "epoch": 0.9929900894367899, + "grad_norm": 0.42530845899182246, + "learning_rate": 1.635802727013025e-09, + "loss": 0.5138, + "step": 2054 + }, + { + "epoch": 0.9934735315445975, + "grad_norm": 0.4058607106461754, + "learning_rate": 1.4377267342158274e-09, + "loss": 0.4883, + "step": 2055 + }, + { + "epoch": 0.9939569736524051, + "grad_norm": 0.4062782037922318, + "learning_rate": 1.2524274694525329e-09, + "loss": 0.5225, + "step": 2056 + }, + { + "epoch": 0.9944404157602127, + "grad_norm": 4.724922277274833, + "learning_rate": 1.0799054063626024e-09, + "loss": 0.5232, + "step": 2057 + }, + { + "epoch": 0.9949238578680203, + "grad_norm": 0.3906054598062824, + "learning_rate": 9.201609859271765e-10, + "loss": 0.5132, + "step": 2058 + }, + { + "epoch": 0.9954072999758279, + "grad_norm": 0.3973857264750407, + "learning_rate": 7.731946164657445e-10, + "loss": 0.5083, + "step": 2059 + }, + { + "epoch": 0.9958907420836355, + "grad_norm": 0.39787099969142303, + "learning_rate": 6.390066736355893e-10, + "loss": 0.5138, + "step": 2060 + }, + { + "epoch": 0.9963741841914431, + "grad_norm": 0.4073985533388715, + "learning_rate": 5.17597500432343e-10, + "loss": 0.5134, + "step": 2061 + }, + { + "epoch": 0.9968576262992507, + "grad_norm": 0.4217388952314898, + "learning_rate": 4.089674071872107e-10, + "loss": 0.5204, + "step": 2062 + }, + { + "epoch": 0.9973410684070583, + "grad_norm": 0.3938957473860418, + "learning_rate": 3.131166715680811e-10, + "loss": 0.5134, + "step": 2063 + }, + { + "epoch": 0.9978245105148659, + "grad_norm": 0.4133454970429142, + "learning_rate": 2.3004553857675082e-10, + "loss": 0.5136, + "step": 2064 + }, + { + "epoch": 0.9983079526226735, + "grad_norm": 3.9528345451843885, + "learning_rate": 1.5975422055003465e-10, + "loss": 0.5088, + "step": 2065 + }, + { + "epoch": 0.9987913947304811, + "grad_norm": 0.4124191883874225, + "learning_rate": 1.022428971581002e-10, + "loss": 0.5106, + "step": 2066 + }, + { + "epoch": 0.9992748368382887, + "grad_norm": 0.42214447971757757, + "learning_rate": 5.751171540391287e-11, + "loss": 0.513, + "step": 2067 + }, + { + "epoch": 0.9997582789460963, + "grad_norm": 0.4373529560530007, + "learning_rate": 2.556078962490105e-11, + "loss": 0.5278, + "step": 2068 + }, + { + "epoch": 1.0, + "grad_norm": 0.4373529560530007, + "learning_rate": 6.390201489625547e-12, + "loss": 0.4723, + "step": 2069 + }, + { + "epoch": 1.0, + "step": 2069, + "total_flos": 2898754626256896.0, + "train_loss": 0.5475362745847836, + "train_runtime": 128435.5344, + "train_samples_per_second": 2.061, + "train_steps_per_second": 0.016 + } + ], + "logging_steps": 1, + "max_steps": 2069, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 208, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2898754626256896.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}